aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-01-04 03:43:42 -0500
committerIngo Molnar <mingo@elte.hu>2011-01-04 03:43:42 -0500
commitbc030d6cb9532877c1c5a3f5e7123344fa24a285 (patch)
treed223d410b868b80d4c0deec192d354a5d06b201a /kernel
parentd3bd058826aa8b79590cca6c8e6d1557bf576ada (diff)
parent387c31c7e5c9805b0aef8833d1731a5fe7bdea14 (diff)
Merge commit 'v2.6.37-rc8' into x86/apic
Conflicts: arch/x86/include/asm/io_apic.h Merge reason: move to a fresh -rc, resolve the conflict. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c67
-rw-r--r--kernel/audit.h5
-rw-r--r--kernel/audit_tree.c9
-rw-r--r--kernel/audit_watch.c4
-rw-r--r--kernel/auditfilter.c12
-rw-r--r--kernel/auditsc.c16
-rw-r--r--kernel/cgroup.c145
-rw-r--r--kernel/cgroup_freezer.c72
-rw-r--r--kernel/configs.c1
-rw-r--r--kernel/cpuset.c13
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/debug/debug_core.c153
-rw-r--r--kernel/debug/debug_core.h1
-rw-r--r--kernel/debug/kdb/kdb_debugger.c3
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c87
-rw-r--r--kernel/debug/kdb/kdb_private.h48
-rw-r--r--kernel/exit.c22
-rw-r--r--kernel/fork.c18
-rw-r--r--kernel/futex.c5
-rw-r--r--kernel/futex_compat.c3
-rw-r--r--kernel/gcov/fs.c1
-rw-r--r--kernel/hw_breakpoint.c3
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq_work.c4
-rw-r--r--kernel/jump_label.c77
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c34
-rw-r--r--kernel/kthread.c11
-rw-r--r--kernel/latencytop.c17
-rw-r--r--kernel/module.c14
-rw-r--r--kernel/ns_cgroup.c8
-rw-r--r--kernel/perf_event.c266
-rw-r--r--kernel/pm_qos_params.c7
-rw-r--r--kernel/posix-cpu-timers.c12
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/hibernate.c22
-rw-r--r--kernel/power/snapshot.c18
-rw-r--r--kernel/power/suspend.c5
-rw-r--r--kernel/power/swap.c61
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk.c30
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c36
-rw-r--r--kernel/range.c2
-rw-r--r--kernel/relay.c15
-rw-r--r--kernel/resource.c61
-rw-r--r--kernel/rtmutex-tester.c6
-rw-r--r--kernel/sched.c332
-rw-r--r--kernel/sched_fair.c73
-rw-r--r--kernel/sched_stats.h20
-rw-r--r--kernel/sched_stoptask.c4
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/smp.c8
-rw-r--r--kernel/softirq.c18
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sysctl.c23
-rw-r--r--kernel/taskstats.c205
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/blktrace.c20
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/ring_buffer.c345
-rw-r--r--kernel/trace/trace.c38
-rw-r--r--kernel/trace/trace_events.c6
-rw-r--r--kernel/trace/trace_kdb.c1
-rw-r--r--kernel/trace/trace_kprobe.c3
-rw-r--r--kernel/trace/trace_stack.c1
-rw-r--r--kernel/tsacct.c10
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/wait.c6
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c323
75 files changed, 1823 insertions, 1071 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..77770a034d59 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
467 struct task_struct *tsk; 467 struct task_struct *tsk;
468 int err; 468 int err;
469 469
470 read_lock(&tasklist_lock); 470 rcu_read_lock();
471 tsk = find_task_by_vpid(pid); 471 tsk = find_task_by_vpid(pid);
472 err = -ESRCH; 472 if (!tsk) {
473 if (!tsk) 473 rcu_read_unlock();
474 goto out; 474 return -ESRCH;
475 err = 0; 475 }
476 476 get_task_struct(tsk);
477 spin_lock_irq(&tsk->sighand->siglock); 477 rcu_read_unlock();
478 if (!tsk->signal->audit_tty) 478 err = tty_audit_push_task(tsk, loginuid, sessionid);
479 err = -EPERM; 479 put_task_struct(tsk);
480 spin_unlock_irq(&tsk->sighand->siglock);
481 if (err)
482 goto out;
483
484 tty_audit_push_task(tsk, loginuid, sessionid);
485out:
486 read_unlock(&tasklist_lock);
487 return err; 480 return err;
488} 481}
489 482
@@ -506,7 +499,7 @@ int audit_send_list(void *_dest)
506} 499}
507 500
508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 501struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
509 int multi, void *payload, int size) 502 int multi, const void *payload, int size)
510{ 503{
511 struct sk_buff *skb; 504 struct sk_buff *skb;
512 struct nlmsghdr *nlh; 505 struct nlmsghdr *nlh;
@@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)
555 * Allocates an skb, builds the netlink message, and sends it to the pid. 548 * Allocates an skb, builds the netlink message, and sends it to the pid.
556 * No failure notifications. 549 * No failure notifications.
557 */ 550 */
558void audit_send_reply(int pid, int seq, int type, int done, int multi, 551static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 void *payload, int size) 552 const void *payload, int size)
560{ 553{
561 struct sk_buff *skb; 554 struct sk_buff *skb;
562 struct task_struct *tsk; 555 struct task_struct *tsk;
@@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
880 case AUDIT_TTY_GET: { 873 case AUDIT_TTY_GET: {
881 struct audit_tty_status s; 874 struct audit_tty_status s;
882 struct task_struct *tsk; 875 struct task_struct *tsk;
876 unsigned long flags;
883 877
884 read_lock(&tasklist_lock); 878 rcu_read_lock();
885 tsk = find_task_by_vpid(pid); 879 tsk = find_task_by_vpid(pid);
886 if (!tsk) 880 if (tsk && lock_task_sighand(tsk, &flags)) {
887 err = -ESRCH;
888 else {
889 spin_lock_irq(&tsk->sighand->siglock);
890 s.enabled = tsk->signal->audit_tty != 0; 881 s.enabled = tsk->signal->audit_tty != 0;
891 spin_unlock_irq(&tsk->sighand->siglock); 882 unlock_task_sighand(tsk, &flags);
892 } 883 } else
893 read_unlock(&tasklist_lock); 884 err = -ESRCH;
894 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, 885 rcu_read_unlock();
895 &s, sizeof(s)); 886
887 if (!err)
888 audit_send_reply(NETLINK_CB(skb).pid, seq,
889 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 890 break;
897 } 891 }
898 case AUDIT_TTY_SET: { 892 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 893 struct audit_tty_status *s;
900 struct task_struct *tsk; 894 struct task_struct *tsk;
895 unsigned long flags;
901 896
902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 897 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
903 return -EINVAL; 898 return -EINVAL;
904 s = data; 899 s = data;
905 if (s->enabled != 0 && s->enabled != 1) 900 if (s->enabled != 0 && s->enabled != 1)
906 return -EINVAL; 901 return -EINVAL;
907 read_lock(&tasklist_lock); 902 rcu_read_lock();
908 tsk = find_task_by_vpid(pid); 903 tsk = find_task_by_vpid(pid);
909 if (!tsk) 904 if (tsk && lock_task_sighand(tsk, &flags)) {
910 err = -ESRCH;
911 else {
912 spin_lock_irq(&tsk->sighand->siglock);
913 tsk->signal->audit_tty = s->enabled != 0; 905 tsk->signal->audit_tty = s->enabled != 0;
914 spin_unlock_irq(&tsk->sighand->siglock); 906 unlock_task_sighand(tsk, &flags);
915 } 907 } else
916 read_unlock(&tasklist_lock); 908 err = -ESRCH;
909 rcu_read_unlock();
917 break; 910 break;
918 } 911 }
919 default: 912 default:
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
84 int *dirlen); 84 int *dirlen);
85extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
86 int done, int multi, 86 int done, int multi,
87 void *payload, int size); 87 const void *payload, int size);
88extern void audit_send_reply(int pid, int seq, int type,
89 int done, int multi,
90 void *payload, int size);
91extern void audit_panic(const char *message); 88extern void audit_panic(const char *message);
92 89
93struct audit_netlink_list { 90struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..37b2bea170c8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
223{ 223{
224 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark; 225 struct fsnotify_mark *entry = &chunk->mark;
226 struct audit_chunk *new; 226 struct audit_chunk *new = NULL;
227 struct audit_tree *owner; 227 struct audit_tree *owner;
228 int size = chunk->count - 1; 228 int size = chunk->count - 1;
229 int i, j; 229 int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
232 232
233 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
234 234
235 if (size)
236 new = alloc_chunk(size);
237
235 spin_lock(&entry->lock); 238 spin_lock(&entry->lock);
236 if (chunk->dead || !entry->i.inode) { 239 if (chunk->dead || !entry->i.inode) {
237 spin_unlock(&entry->lock); 240 spin_unlock(&entry->lock);
241 if (new)
242 free_chunk(new);
238 goto out; 243 goto out;
239 } 244 }
240 245
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
255 goto out; 260 goto out;
256 } 261 }
257 262
258 new = alloc_chunk(size);
259 if (!new) 263 if (!new)
260 goto Fallback; 264 goto Fallback;
265
261 fsnotify_duplicate_mark(&new->mark, entry); 266 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 267 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
263 free_chunk(new); 268 free_chunk(new);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..d2e3c7866460 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
60}; 60};
61 61
62/* fsnotify handle. */ 62/* fsnotify handle. */
63struct fsnotify_group *audit_watch_group; 63static struct fsnotify_group *audit_watch_group;
64 64
65/* fsnotify events we care about. */ 65/* fsnotify events we care about. */
66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
123 } 123 }
124} 124}
125 125
126void audit_remove_watch(struct audit_watch *watch) 126static void audit_remove_watch(struct audit_watch *watch)
127{ 127{
128 list_del(&watch->wlist); 128 list_del(&watch->wlist);
129 audit_put_parent(watch->parent); 129 audit_put_parent(watch->parent);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..add2819af71b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1252 case AUDIT_LOGINUID: 1252 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1253 result = audit_comparator(cb->loginuid, f->op, f->val);
1254 break; 1254 break;
1255 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule)
1261 result = security_audit_rule_match(cb->sid,
1262 f->type,
1263 f->op,
1264 f->lsm_rule,
1265 NULL);
1266 break;
1255 } 1267 }
1256 1268
1257 if (!result) 1269 if (!result)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..f49a0318c2ed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
241 pid_t pid; 241 pid_t pid;
242 struct audit_cap_data cap; 242 struct audit_cap_data cap;
243 } capset; 243 } capset;
244 struct {
245 int fd;
246 int flags;
247 } mmap;
244 }; 248 };
245 int fds[2]; 249 int fds[2];
246 250
@@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1309 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1310 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; } 1311 break; }
1312 case AUDIT_MMAP: {
1313 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1314 context->mmap.flags);
1315 break; }
1308 } 1316 }
1309 audit_log_end(ab); 1317 audit_log_end(ab);
1310} 1318}
@@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,
2476 context->type = AUDIT_CAPSET; 2484 context->type = AUDIT_CAPSET;
2477} 2485}
2478 2486
2487void __audit_mmap_fd(int fd, int flags)
2488{
2489 struct audit_context *context = current->audit_context;
2490 context->mmap.fd = fd;
2491 context->mmap.flags = flags;
2492 context->type = AUDIT_MMAP;
2493}
2494
2479/** 2495/**
2480 * audit_core_dumps - record information about processes that end abnormally 2496 * audit_core_dumps - record information about processes that end abnormally
2481 * @signr: signal value 2497 * @signr: signal value
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 291ba3d04bea..66a416b42c18 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
52#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
53#include <linux/hash.h> 53#include <linux/hash.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/smp_lock.h>
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245} 244}
246 245
246static int clone_children(const struct cgroup *cgrp)
247{
248 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
249}
250
247/* 251/*
248 * for_each_subsys() allows you to iterate on each subsystem attached to 252 * for_each_subsys() allows you to iterate on each subsystem attached to
249 * an active hierarchy 253 * an active hierarchy
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
778 struct inode *inode = new_inode(sb); 782 struct inode *inode = new_inode(sb);
779 783
780 if (inode) { 784 if (inode) {
785 inode->i_ino = get_next_ino();
781 inode->i_mode = mode; 786 inode->i_mode = mode;
782 inode->i_uid = current_fsuid(); 787 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid(); 788 inode->i_gid = current_fsgid();
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 seq_puts(seq, ",noprefix"); 1045 seq_puts(seq, ",noprefix");
1041 if (strlen(root->release_agent_path)) 1046 if (strlen(root->release_agent_path))
1042 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1047 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1048 if (clone_children(&root->top_cgroup))
1049 seq_puts(seq, ",clone_children");
1043 if (strlen(root->name)) 1050 if (strlen(root->name))
1044 seq_printf(seq, ",name=%s", root->name); 1051 seq_printf(seq, ",name=%s", root->name);
1045 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {
1050 unsigned long subsys_bits; 1057 unsigned long subsys_bits;
1051 unsigned long flags; 1058 unsigned long flags;
1052 char *release_agent; 1059 char *release_agent;
1060 bool clone_children;
1053 char *name; 1061 char *name;
1054 /* User explicitly requested empty subsystem */ 1062 /* User explicitly requested empty subsystem */
1055 bool none; 1063 bool none;
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {
1066 */ 1074 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1075static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1068{ 1076{
1069 char *token, *o = data ?: "all"; 1077 char *token, *o = data;
1078 bool all_ss = false, one_ss = false;
1070 unsigned long mask = (unsigned long)-1; 1079 unsigned long mask = (unsigned long)-1;
1071 int i; 1080 int i;
1072 bool module_pin_failed = false; 1081 bool module_pin_failed = false;
@@ -1082,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1082 while ((token = strsep(&o, ",")) != NULL) { 1091 while ((token = strsep(&o, ",")) != NULL) {
1083 if (!*token) 1092 if (!*token)
1084 return -EINVAL; 1093 return -EINVAL;
1085 if (!strcmp(token, "all")) { 1094 if (!strcmp(token, "none")) {
1086 /* Add all non-disabled subsystems */
1087 opts->subsys_bits = 0;
1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1092 if (!ss->disabled)
1093 opts->subsys_bits |= 1ul << i;
1094 }
1095 } else if (!strcmp(token, "none")) {
1096 /* Explicitly have no subsystems */ 1095 /* Explicitly have no subsystems */
1097 opts->none = true; 1096 opts->none = true;
1098 } else if (!strcmp(token, "noprefix")) { 1097 continue;
1098 }
1099 if (!strcmp(token, "all")) {
1100 /* Mutually exclusive option 'all' + subsystem name */
1101 if (one_ss)
1102 return -EINVAL;
1103 all_ss = true;
1104 continue;
1105 }
1106 if (!strcmp(token, "noprefix")) {
1099 set_bit(ROOT_NOPREFIX, &opts->flags); 1107 set_bit(ROOT_NOPREFIX, &opts->flags);
1100 } else if (!strncmp(token, "release_agent=", 14)) { 1108 continue;
1109 }
1110 if (!strcmp(token, "clone_children")) {
1111 opts->clone_children = true;
1112 continue;
1113 }
1114 if (!strncmp(token, "release_agent=", 14)) {
1101 /* Specifying two release agents is forbidden */ 1115 /* Specifying two release agents is forbidden */
1102 if (opts->release_agent) 1116 if (opts->release_agent)
1103 return -EINVAL; 1117 return -EINVAL;
@@ -1105,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1119 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1120 if (!opts->release_agent)
1107 return -ENOMEM; 1121 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1122 continue;
1123 }
1124 if (!strncmp(token, "name=", 5)) {
1109 const char *name = token + 5; 1125 const char *name = token + 5;
1110 /* Can't specify an empty name */ 1126 /* Can't specify an empty name */
1111 if (!strlen(name)) 1127 if (!strlen(name))
@@ -1127,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1127 GFP_KERNEL); 1143 GFP_KERNEL);
1128 if (!opts->name) 1144 if (!opts->name)
1129 return -ENOMEM; 1145 return -ENOMEM;
1130 } else { 1146
1131 struct cgroup_subsys *ss; 1147 continue;
1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1148 }
1133 ss = subsys[i]; 1149
1134 if (ss == NULL) 1150 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1135 continue; 1151 struct cgroup_subsys *ss = subsys[i];
1136 if (!strcmp(token, ss->name)) { 1152 if (ss == NULL)
1137 if (!ss->disabled) 1153 continue;
1138 set_bit(i, &opts->subsys_bits); 1154 if (strcmp(token, ss->name))
1139 break; 1155 continue;
1140 } 1156 if (ss->disabled)
1141 } 1157 continue;
1142 if (i == CGROUP_SUBSYS_COUNT) 1158
1143 return -ENOENT; 1159 /* Mutually exclusive option 'all' + subsystem name */
1160 if (all_ss)
1161 return -EINVAL;
1162 set_bit(i, &opts->subsys_bits);
1163 one_ss = true;
1164
1165 break;
1166 }
1167 if (i == CGROUP_SUBSYS_COUNT)
1168 return -ENOENT;
1169 }
1170
1171 /*
1172 * If the 'all' option was specified select all the subsystems,
1173 * otherwise 'all, 'none' and a subsystem name options were not
1174 * specified, let's default to 'all'
1175 */
1176 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1178 struct cgroup_subsys *ss = subsys[i];
1179 if (ss == NULL)
1180 continue;
1181 if (ss->disabled)
1182 continue;
1183 set_bit(i, &opts->subsys_bits);
1144 } 1184 }
1145 } 1185 }
1146 1186
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1222 struct cgroup *cgrp = &root->top_cgroup; 1262 struct cgroup *cgrp = &root->top_cgroup;
1223 struct cgroup_sb_opts opts; 1263 struct cgroup_sb_opts opts;
1224 1264
1225 lock_kernel();
1226 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1265 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1227 mutex_lock(&cgroup_mutex); 1266 mutex_lock(&cgroup_mutex);
1228 1267
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1255 kfree(opts.name); 1294 kfree(opts.name);
1256 mutex_unlock(&cgroup_mutex); 1295 mutex_unlock(&cgroup_mutex);
1257 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1296 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1258 unlock_kernel();
1259 return ret; 1297 return ret;
1260} 1298}
1261 1299
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1357 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1358 if (opts->name) 1396 if (opts->name)
1359 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1398 if (opts->clone_children)
1399 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1360 return root; 1400 return root;
1361} 1401}
1362 1402
@@ -1420,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1420 return 0; 1460 return 0;
1421} 1461}
1422 1462
1423static int cgroup_get_sb(struct file_system_type *fs_type, 1463static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1424 int flags, const char *unused_dev_name, 1464 int flags, const char *unused_dev_name,
1425 void *data, struct vfsmount *mnt) 1465 void *data)
1426{ 1466{
1427 struct cgroup_sb_opts opts; 1467 struct cgroup_sb_opts opts;
1428 struct cgroupfs_root *root; 1468 struct cgroupfs_root *root;
@@ -1556,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1556 drop_parsed_module_refcounts(opts.subsys_bits); 1596 drop_parsed_module_refcounts(opts.subsys_bits);
1557 } 1597 }
1558 1598
1559 simple_set_mnt(mnt, sb);
1560 kfree(opts.release_agent); 1599 kfree(opts.release_agent);
1561 kfree(opts.name); 1600 kfree(opts.name);
1562 return 0; 1601 return dget(sb->s_root);
1563 1602
1564 drop_new_super: 1603 drop_new_super:
1565 deactivate_locked_super(sb); 1604 deactivate_locked_super(sb);
@@ -1568,8 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1568 out_err: 1607 out_err:
1569 kfree(opts.release_agent); 1608 kfree(opts.release_agent);
1570 kfree(opts.name); 1609 kfree(opts.name);
1571 1610 return ERR_PTR(ret);
1572 return ret;
1573} 1611}
1574 1612
1575static void cgroup_kill_sb(struct super_block *sb) { 1613static void cgroup_kill_sb(struct super_block *sb) {
@@ -1619,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1619 1657
1620static struct file_system_type cgroup_fs_type = { 1658static struct file_system_type cgroup_fs_type = {
1621 .name = "cgroup", 1659 .name = "cgroup",
1622 .get_sb = cgroup_get_sb, 1660 .mount = cgroup_mount,
1623 .kill_sb = cgroup_kill_sb, 1661 .kill_sb = cgroup_kill_sb,
1624}; 1662};
1625 1663
@@ -1883,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1883 const char *buffer) 1921 const char *buffer)
1884{ 1922{
1885 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1923 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1924 if (strlen(buffer) >= PATH_MAX)
1925 return -EINVAL;
1886 if (!cgroup_lock_live_group(cgrp)) 1926 if (!cgroup_lock_live_group(cgrp))
1887 return -ENODEV; 1927 return -ENODEV;
1888 strcpy(cgrp->root->release_agent_path, buffer); 1928 strcpy(cgrp->root->release_agent_path, buffer);
@@ -3176,6 +3216,23 @@ fail:
3176 return ret; 3216 return ret;
3177} 3217}
3178 3218
3219static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3220 struct cftype *cft)
3221{
3222 return clone_children(cgrp);
3223}
3224
3225static int cgroup_clone_children_write(struct cgroup *cgrp,
3226 struct cftype *cft,
3227 u64 val)
3228{
3229 if (val)
3230 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3231 else
3232 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3233 return 0;
3234}
3235
3179/* 3236/*
3180 * for the common functions, 'private' gives the type of file 3237 * for the common functions, 'private' gives the type of file
3181 */ 3238 */
@@ -3206,6 +3263,11 @@ static struct cftype files[] = {
3206 .write_string = cgroup_write_event_control, 3263 .write_string = cgroup_write_event_control,
3207 .mode = S_IWUGO, 3264 .mode = S_IWUGO,
3208 }, 3265 },
3266 {
3267 .name = "cgroup.clone_children",
3268 .read_u64 = cgroup_clone_children_read,
3269 .write_u64 = cgroup_clone_children_write,
3270 },
3209}; 3271};
3210 3272
3211static struct cftype cft_release_agent = { 3273static struct cftype cft_release_agent = {
@@ -3335,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3335 if (notify_on_release(parent)) 3397 if (notify_on_release(parent))
3336 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3398 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3337 3399
3400 if (clone_children(parent))
3401 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3402
3338 for_each_subsys(root, ss) { 3403 for_each_subsys(root, ss) {
3339 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3404 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3340 3405
@@ -3349,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3349 goto err_destroy; 3414 goto err_destroy;
3350 } 3415 }
3351 /* At error, ->destroy() callback has to free assigned ID. */ 3416 /* At error, ->destroy() callback has to free assigned ID. */
3417 if (clone_children(parent) && ss->post_clone)
3418 ss->post_clone(ss, cgrp);
3352 } 3419 }
3353 3420
3354 cgroup_lock_hierarchy(root); 3421 cgroup_lock_hierarchy(root);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task)) 171 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED)
183 return -EBUSY; 173 return -EBUSY;
184 174
185 freezer = cgroup_freezer(new_cgroup); 175 rcu_read_lock();
186 if (freezer->state == CGROUP_FROZEN) 176 if (__cgroup_freezing_or_frozen(task)) {
177 rcu_read_unlock();
187 return -EBUSY; 178 return -EBUSY;
179 }
180 rcu_read_unlock();
188 181
189 if (threadgroup) { 182 if (threadgroup) {
190 struct task_struct *c; 183 struct task_struct *c;
191 184
192 rcu_read_lock(); 185 rcu_read_lock();
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
194 if (is_task_frozen_enough(c)) { 187 if (__cgroup_freezing_or_frozen(c)) {
195 rcu_read_unlock(); 188 rcu_read_unlock();
196 return -EBUSY; 189 return -EBUSY;
197 } 190 }
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 229/*
237 * caller must hold freezer->lock 230 * caller must hold freezer->lock
238 */ 231 */
239static void update_freezer_state(struct cgroup *cgroup, 232static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 233 struct freezer *freezer)
241{ 234{
242 struct cgroup_iter it; 235 struct cgroup_iter it;
243 struct task_struct *task; 236 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 237 unsigned int nfrozen = 0, ntotal = 0;
238 enum freezer_state old_state = freezer->state;
245 239
246 cgroup_iter_start(cgroup, &it); 240 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 241 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 242 ntotal++;
249 if (is_task_frozen_enough(task)) 243 if (frozen(task))
250 nfrozen++; 244 nfrozen++;
251 } 245 }
252 246
253 /* 247 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 248 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 249 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 250 if (nfrozen == ntotal)
257 */ 251 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 252 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 253 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 254 }
261 freezer->state = CGROUP_FREEZING; 255
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 256 cgroup_iter_end(cgroup, &it);
265} 257}
266 258
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 271 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 272 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exitted write. */ 273 * only partially frozen when we exitted write. */
282 update_freezer_state(cgroup, freezer); 274 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 275 state = freezer->state;
284 } 276 }
285 spin_unlock_irq(&freezer->lock); 277 spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 293 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 294 if (!freeze_task(task, true))
303 continue; 295 continue;
304 if (is_task_frozen_enough(task)) 296 if (frozen(task))
305 continue; 297 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 298 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 299 num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 327
336 spin_lock_irq(&freezer->lock); 328 spin_lock_irq(&freezer->lock);
337 329
338 update_freezer_state(cgroup, freezer); 330 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 331 if (goal_state == freezer->state)
340 goto out; 332 goto out;
341 333
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
66static const struct file_operations ikconfig_file_ops = { 66static const struct file_operations ikconfig_file_ops = {
67 .owner = THIS_MODULE, 67 .owner = THIS_MODULE,
68 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
69 .llseek = default_llseek,
69}; 70};
70 71
71static int __init ikconfig_init(void) 72static int __init ikconfig_init(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 51b143e2a07a..4349935c2ad8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
231 * users. If someone tries to mount the "cpuset" filesystem, we 231 * users. If someone tries to mount the "cpuset" filesystem, we
232 * silently switch it to mount "cgroup" instead 232 * silently switch it to mount "cgroup" instead
233 */ 233 */
234static int cpuset_get_sb(struct file_system_type *fs_type, 234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, 235 int flags, const char *unused_dev_name, void *data)
236 void *data, struct vfsmount *mnt)
237{ 236{
238 struct file_system_type *cgroup_fs = get_fs_type("cgroup"); 237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
239 int ret = -ENODEV; 238 struct dentry *ret = ERR_PTR(-ENODEV);
240 if (cgroup_fs) { 239 if (cgroup_fs) {
241 char mountopts[] = 240 char mountopts[] =
242 "cpuset,noprefix," 241 "cpuset,noprefix,"
243 "release_agent=/sbin/cpuset_release_agent"; 242 "release_agent=/sbin/cpuset_release_agent";
244 ret = cgroup_fs->get_sb(cgroup_fs, flags, 243 ret = cgroup_fs->mount(cgroup_fs, flags,
245 unused_dev_name, mountopts, mnt); 244 unused_dev_name, mountopts);
246 put_filesystem(cgroup_fs); 245 put_filesystem(cgroup_fs);
247 } 246 }
248 return ret; 247 return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
250 249
251static struct file_system_type cpuset_fs_type = { 250static struct file_system_type cpuset_fs_type = {
252 .name = "cpuset", 251 .name = "cpuset",
253 .get_sb = cpuset_get_sb, 252 .mount = cpuset_mount,
254}; 253};
255 254
256/* 255/*
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
325 325
326/* 326/*
327 * Prepare credentials for current to perform an execve() 327 * Prepare credentials for current to perform an execve()
328 * - The caller must hold current->cred_guard_mutex 328 * - The caller must hold ->cred_guard_mutex
329 */ 329 */
330struct cred *prepare_exec_creds(void) 330struct cred *prepare_exec_creds(void)
331{ 331{
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
384 struct cred *new; 384 struct cred *new;
385 int ret; 385 int ret;
386 386
387 mutex_init(&p->cred_guard_mutex);
388
389 if ( 387 if (
390#ifdef CONFIG_KEYS 388#ifdef CONFIG_KEYS
391 !p->cred->thread_keyring && 389 !p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index de407c78178d..cefd4a11f6d9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -47,6 +47,7 @@
47#include <linux/pid.h> 47#include <linux/pid.h>
48#include <linux/smp.h> 48#include <linux/smp.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/rcupdate.h>
50 51
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/byteorder.h> 53#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 */ 110 */
110atomic_t kgdb_active = ATOMIC_INIT(-1); 111atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active); 112EXPORT_SYMBOL_GPL(kgdb_active);
113static DEFINE_RAW_SPINLOCK(dbg_master_lock);
114static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
112 115
113/* 116/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early 117 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet): 118 * bootup code (which might not have percpu set up yet):
116 */ 119 */
117static atomic_t passive_cpu_wait[NR_CPUS]; 120static atomic_t masters_in_kgdb;
118static atomic_t cpu_in_kgdb[NR_CPUS]; 121static atomic_t slaves_in_kgdb;
119static atomic_t kgdb_break_tasklet_var; 122static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint; 123atomic_t kgdb_setting_breakpoint;
121 124
@@ -206,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
206 return 0; 209 return 0;
207} 210}
208 211
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/* 212/*
222 * Some architectures need cache flushes when we set/clear a 213 * Some architectures need cache flushes when we set/clear a
223 * breakpoint: 214 * breakpoint:
@@ -457,26 +448,34 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
457 return 1; 448 return 1;
458} 449}
459 450
460static void dbg_cpu_switch(int cpu, int next_cpu) 451static void dbg_touch_watchdogs(void)
461{ 452{
462 /* Mark the cpu we are switching away from as a slave when it 453 touch_softlockup_watchdog_sync();
463 * holds the kgdb_active token. This must be done so that the 454 clocksource_touch_watchdog();
464 * that all the cpus wait in for the debug core will not enter 455 rcu_cpu_stall_reset();
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471} 456}
472 457
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) 458static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
459 int exception_state)
474{ 460{
475 unsigned long flags; 461 unsigned long flags;
476 int sstep_tries = 100; 462 int sstep_tries = 100;
477 int error; 463 int error;
478 int i, cpu; 464 int cpu;
479 int trace_on = 0; 465 int trace_on = 0;
466 int online_cpus = num_online_cpus();
467
468 kgdb_info[ks->cpu].enter_kgdb++;
469 kgdb_info[ks->cpu].exception_state |= exception_state;
470
471 if (exception_state == DCPU_WANT_MASTER)
472 atomic_inc(&masters_in_kgdb);
473 else
474 atomic_inc(&slaves_in_kgdb);
475
476 if (arch_kgdb_ops.disable_hw_break)
477 arch_kgdb_ops.disable_hw_break(regs);
478
480acquirelock: 479acquirelock:
481 /* 480 /*
482 * Interrupts will be restored by the 'trap return' code, except when 481 * Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +488,15 @@ acquirelock:
489 kgdb_info[cpu].task = current; 488 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0; 489 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; 490 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497 491
498 if (exception_level == 1) 492 /* Make sure the above info reaches the primary CPU */
493 smp_mb();
494
495 if (exception_level == 1) {
496 if (raw_spin_trylock(&dbg_master_lock))
497 atomic_xchg(&kgdb_active, cpu);
499 goto cpu_master_loop; 498 goto cpu_master_loop;
499 }
500 500
501 /* 501 /*
502 * CPU will loop if it is a slave or request to become a kgdb 502 * CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +508,12 @@ cpu_loop:
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; 508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop; 509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { 510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) 511 if (raw_spin_trylock(&dbg_master_lock)) {
512 atomic_xchg(&kgdb_active, cpu);
512 break; 513 break;
514 }
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { 515 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu])) 516 if (!raw_spin_is_locked(&dbg_slave_lock))
515 goto return_normal; 517 goto return_normal;
516 } else { 518 } else {
517return_normal: 519return_normal:
@@ -522,9 +524,12 @@ return_normal:
522 arch_kgdb_ops.correct_hw_break(); 524 arch_kgdb_ops.correct_hw_break();
523 if (trace_on) 525 if (trace_on)
524 tracing_on(); 526 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]); 527 kgdb_info[cpu].exception_state &=
526 touch_softlockup_watchdog_sync(); 528 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
527 clocksource_touch_watchdog(); 529 kgdb_info[cpu].enter_kgdb--;
530 smp_mb__before_atomic_dec();
531 atomic_dec(&slaves_in_kgdb);
532 dbg_touch_watchdogs();
528 local_irq_restore(flags); 533 local_irq_restore(flags);
529 return 0; 534 return 0;
530 } 535 }
@@ -541,8 +546,8 @@ return_normal:
541 (kgdb_info[cpu].task && 546 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 547 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1); 548 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync(); 549 raw_spin_unlock(&dbg_master_lock);
545 clocksource_touch_watchdog(); 550 dbg_touch_watchdogs();
546 local_irq_restore(flags); 551 local_irq_restore(flags);
547 552
548 goto acquirelock; 553 goto acquirelock;
@@ -563,16 +568,12 @@ return_normal:
563 if (dbg_io_ops->pre_exception) 568 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception(); 569 dbg_io_ops->pre_exception();
565 570
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /* 571 /*
569 * Get the passive CPU lock which will hold all the non-primary 572 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active 573 * CPU in a spin state while the debugger is active
571 */ 574 */
572 if (!kgdb_single_step) { 575 if (!kgdb_single_step)
573 for (i = 0; i < NR_CPUS; i++) 576 raw_spin_lock(&dbg_slave_lock);
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576 577
577#ifdef CONFIG_SMP 578#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */ 579 /* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +584,9 @@ return_normal:
583 /* 584 /*
584 * Wait for the other CPUs to be notified and be waiting for us: 585 * Wait for the other CPUs to be notified and be waiting for us:
585 */ 586 */
586 for_each_online_cpu(i) { 587 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) 588 atomic_read(&slaves_in_kgdb)) != online_cpus)
588 cpu_relax(); 589 cpu_relax();
589 }
590 590
591 /* 591 /*
592 * At this point the primary processor is completely 592 * At this point the primary processor is completely
@@ -615,7 +615,8 @@ cpu_master_loop:
615 if (error == DBG_PASS_EVENT) { 615 if (error == DBG_PASS_EVENT) {
616 dbg_kdb_mode = !dbg_kdb_mode; 616 dbg_kdb_mode = !dbg_kdb_mode;
617 } else if (error == DBG_SWITCH_CPU_EVENT) { 617 } else if (error == DBG_SWITCH_CPU_EVENT) {
618 dbg_cpu_switch(cpu, dbg_switch_cpu); 618 kgdb_info[dbg_switch_cpu].exception_state |=
619 DCPU_NEXT_MASTER;
619 goto cpu_loop; 620 goto cpu_loop;
620 } else { 621 } else {
621 kgdb_info[cpu].ret_state = error; 622 kgdb_info[cpu].ret_state = error;
@@ -627,24 +628,11 @@ cpu_master_loop:
627 if (dbg_io_ops->post_exception) 628 if (dbg_io_ops->post_exception)
628 dbg_io_ops->post_exception(); 629 dbg_io_ops->post_exception();
629 630
630 atomic_dec(&cpu_in_kgdb[ks->cpu]);
631
632 if (!kgdb_single_step) { 631 if (!kgdb_single_step) {
633 for (i = NR_CPUS-1; i >= 0; i--) 632 raw_spin_unlock(&dbg_slave_lock);
634 atomic_dec(&passive_cpu_wait[i]); 633 /* Wait till all the CPUs have quit from the debugger. */
635 /* 634 while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
636 * Wait till all the CPUs have quit from the debugger, 635 cpu_relax();
637 * but allow a CPU that hit an exception and is
638 * waiting to become the master to remain in the debug
639 * core.
640 */
641 for_each_online_cpu(i) {
642 while (kgdb_do_roundup &&
643 atomic_read(&cpu_in_kgdb[i]) &&
644 !(kgdb_info[i].exception_state &
645 DCPU_WANT_MASTER))
646 cpu_relax();
647 }
648 } 636 }
649 637
650kgdb_restore: 638kgdb_restore:
@@ -655,12 +643,20 @@ kgdb_restore:
655 else 643 else
656 kgdb_sstep_pid = 0; 644 kgdb_sstep_pid = 0;
657 } 645 }
646 if (arch_kgdb_ops.correct_hw_break)
647 arch_kgdb_ops.correct_hw_break();
658 if (trace_on) 648 if (trace_on)
659 tracing_on(); 649 tracing_on();
650
651 kgdb_info[cpu].exception_state &=
652 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
653 kgdb_info[cpu].enter_kgdb--;
654 smp_mb__before_atomic_dec();
655 atomic_dec(&masters_in_kgdb);
660 /* Free kgdb_active */ 656 /* Free kgdb_active */
661 atomic_set(&kgdb_active, -1); 657 atomic_set(&kgdb_active, -1);
662 touch_softlockup_watchdog_sync(); 658 raw_spin_unlock(&dbg_master_lock);
663 clocksource_touch_watchdog(); 659 dbg_touch_watchdogs();
664 local_irq_restore(flags); 660 local_irq_restore(flags);
665 661
666 return kgdb_info[cpu].ret_state; 662 return kgdb_info[cpu].ret_state;
@@ -678,7 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678{ 674{
679 struct kgdb_state kgdb_var; 675 struct kgdb_state kgdb_var;
680 struct kgdb_state *ks = &kgdb_var; 676 struct kgdb_state *ks = &kgdb_var;
681 int ret;
682 677
683 ks->cpu = raw_smp_processor_id(); 678 ks->cpu = raw_smp_processor_id();
684 ks->ex_vector = evector; 679 ks->ex_vector = evector;
@@ -689,11 +684,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
689 684
690 if (kgdb_reenter_check(ks)) 685 if (kgdb_reenter_check(ks))
691 return 0; /* Ouch, double exception ! */ 686 return 0; /* Ouch, double exception ! */
692 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; 687 if (kgdb_info[ks->cpu].enter_kgdb != 0)
693 ret = kgdb_cpu_enter(ks, regs); 688 return 0;
694 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | 689
695 DCPU_IS_SLAVE); 690 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
696 return ret;
697} 691}
698 692
699int kgdb_nmicallback(int cpu, void *regs) 693int kgdb_nmicallback(int cpu, void *regs)
@@ -706,12 +700,9 @@ int kgdb_nmicallback(int cpu, void *regs)
706 ks->cpu = cpu; 700 ks->cpu = cpu;
707 ks->linux_regs = regs; 701 ks->linux_regs = regs;
708 702
709 if (!atomic_read(&cpu_in_kgdb[cpu]) && 703 if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
710 atomic_read(&kgdb_active) != -1 && 704 raw_spin_is_locked(&dbg_master_lock)) {
711 atomic_read(&kgdb_active) != cpu) { 705 kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
712 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
713 kgdb_cpu_enter(ks, regs);
714 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
715 return 0; 706 return 0;
716 } 707 }
717#endif 708#endif
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
40 int exception_state; 40 int exception_state;
41 int ret_state; 41 int ret_state;
42 int irq_depth; 42 int irq_depth;
43 int enter_kgdb;
43}; 44};
44 45
45extern struct debuggerinfo_struct kgdb_info[]; 46extern struct debuggerinfo_struct kgdb_info[];
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
86 } 86 }
87 /* Set initial kdb state variables */ 87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS); 88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu; 89 kdb_initial_cpu = atomic_read(&kgdb_active);
90 kdb_current_task = kgdb_info[ks->cpu].task; 90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; 91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */ 92 /* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
105 ks->pass_exception = 1; 105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC); 106 KDB_FLAG_SET(CATASTROPHIC);
107 } 107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 108 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT); 109 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS); 110 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
823 823
824 return r; 824 return r;
825} 825}
826 826EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index caf057a3de0e..a6e729766821 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1127 /* special case below */ 1127 /* special case below */
1128 } else { 1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", 1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid); 1130 kdb_current, kdb_current ? kdb_current->pid : 0);
1131#if defined(CONFIG_SMP) 1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id()); 1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif 1133#endif
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
1749 int nextarg; 1749 int nextarg;
1750 long offset; 1750 long offset;
1751 1751
1752 if (raw_smp_processor_id() != kdb_initial_cpu) {
1753 kdb_printf("go must execute on the entry cpu, "
1754 "please use \"cpu %d\" and then execute go\n",
1755 kdb_initial_cpu);
1756 return KDB_BADCPUNUM;
1757 }
1752 if (argc == 1) { 1758 if (argc == 1) {
1753 if (raw_smp_processor_id() != kdb_initial_cpu) {
1754 kdb_printf("go <address> must be issued from the "
1755 "initial cpu, do cpu %d first\n",
1756 kdb_initial_cpu);
1757 return KDB_ARGCOUNT;
1758 }
1759 nextarg = 1; 1759 nextarg = 1;
1760 diag = kdbgetaddrarg(argc, argv, &nextarg, 1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1761 &addr, &offset, NULL); 1761 &addr, &offset, NULL);
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2603,20 +2604,17 @@ static int kdb_summary(int argc, const char **argv)
2603 */ 2604 */
2604static int kdb_per_cpu(int argc, const char **argv) 2605static int kdb_per_cpu(int argc, const char **argv)
2605{ 2606{
2606 char buf[256], fmtstr[64]; 2607 char fmtstr[64];
2607 kdb_symtab_t symtab; 2608 int cpu, diag, nextarg = 1;
2608 cpumask_t suppress = CPU_MASK_NONE; 2609 unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611 2610
2612 if (argc < 1 || argc > 3) 2611 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT; 2612 return KDB_ARGCOUNT;
2614 2613
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); 2614 diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
2616 if (!kdbgetsymval(buf, &symtab)) { 2615 if (diag)
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]); 2616 return diag;
2618 return KDB_BADADDR; 2617
2619 }
2620 if (argc >= 2) { 2618 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword); 2619 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag) 2620 if (diag)
@@ -2649,46 +2647,25 @@ static int kdb_per_cpu(int argc, const char **argv)
2649#define KDB_PCU(cpu) 0 2647#define KDB_PCU(cpu) 0
2650#endif 2648#endif
2651#endif 2649#endif
2652
2653 for_each_online_cpu(cpu) { 2650 for_each_online_cpu(cpu) {
2651 if (KDB_FLAG(CMD_INTERRUPT))
2652 return 0;
2653
2654 if (whichcpu != ~0UL && whichcpu != cpu) 2654 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue; 2655 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu); 2656 addr = symaddr + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword); 2657 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) { 2658 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " 2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag); 2660 "read, diag=%d\n", cpu, addr, diag);
2661 continue; 2661 continue;
2662 } 2662 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu); 2663 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr, 2664 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE, 2665 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0); 2666 1, bytesperword, 1, 1, 0);
2673 } 2667 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU 2668#undef KDB_PCU
2691
2692 return 0; 2669 return 0;
2693} 2670}
2694 2671
@@ -2763,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2763 } 2740 }
2764 if (kdb_commands) { 2741 if (kdb_commands) {
2765 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2766 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2767 kfree(kdb_commands); 2744 kfree(kdb_commands);
2768 } 2745 }
2769 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2770 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2771 kdb_commands = new; 2748 kdb_commands = new;
2772 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2773 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2774 } 2751 }
2775 2752
@@ -2783,6 +2760,8 @@ int kdb_register_repeat(char *cmd,
2783 2760
2784 return 0; 2761 return 0;
2785} 2762}
2763EXPORT_SYMBOL_GPL(kdb_register_repeat);
2764
2786 2765
2787/* 2766/*
2788 * kdb_register - Compatibility register function for commands that do 2767 * kdb_register - Compatibility register function for commands that do
@@ -2805,6 +2784,7 @@ int kdb_register(char *cmd,
2805 return kdb_register_repeat(cmd, func, usage, help, minlen, 2784 return kdb_register_repeat(cmd, func, usage, help, minlen,
2806 KDB_REPEAT_NONE); 2785 KDB_REPEAT_NONE);
2807} 2786}
2787EXPORT_SYMBOL_GPL(kdb_register);
2808 2788
2809/* 2789/*
2810 * kdb_unregister - This function is used to unregister a kernel 2790 * kdb_unregister - This function is used to unregister a kernel
@@ -2823,7 +2803,7 @@ int kdb_unregister(char *cmd)
2823 /* 2803 /*
2824 * find the command. 2804 * find the command.
2825 */ 2805 */
2826 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { 2806 for_each_kdbcmd(kp, i) {
2827 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { 2807 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2828 kp->cmd_name = NULL; 2808 kp->cmd_name = NULL;
2829 return 0; 2809 return 0;
@@ -2833,6 +2813,7 @@ int kdb_unregister(char *cmd)
2833 /* Couldn't find it. */ 2813 /* Couldn't find it. */
2834 return 1; 2814 return 1;
2835} 2815}
2816EXPORT_SYMBOL_GPL(kdb_unregister);
2836 2817
2837/* Initialize the kdb command table. */ 2818/* Initialize the kdb command table. */
2838static void __init kdb_inittab(void) 2819static void __init kdb_inittab(void)
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index be775f7e81e0..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
15#include <linux/kgdb.h> 15#include <linux/kgdb.h>
16#include "../debug_core.h" 16#include "../debug_core.h"
17 17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */ 18/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
93 */ 70 */
94#define KDB_MAXBPT 16 71#define KDB_MAXBPT 16
95 72
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */ 73/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab { 74typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */ 75 unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len); 89extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124 90
125/* Exported Symbols for kernel loadable modules to use. */ 91/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t); 92extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t); 93extern int kdb_putarea_size(unsigned long, void *, size_t);
133 94
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 105extern int kdb_putword(unsigned long, unsigned long, size_t);
145 106
146extern int kdbgetularg(const char *, unsigned long *); 107extern int kdbgetularg(const char *, unsigned long *);
108extern int kdbgetu64arg(const char *, u64 *);
147extern char *kdbgetenv(const char *); 109extern char *kdbgetenv(const char *);
148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 110extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
149 long *, char **); 111 long *, char **);
@@ -255,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
255extern void kdb_print_nameval(const char *name, unsigned long val); 217extern void kdb_print_nameval(const char *name, unsigned long val);
256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
257extern void kdb_meminfo_proc_show(void); 219extern void kdb_meminfo_proc_show(void);
258#ifdef CONFIG_KALLSYMS
259extern const char *kdb_walk_kallsyms(loff_t *pos);
260#else /* ! CONFIG_KALLSYMS */
261static inline const char *kdb_walk_kallsyms(loff_t *pos)
262{
263 return NULL;
264}
265#endif /* ! CONFIG_KALLSYMS */
266extern char *kdb_getstr(char *, size_t, char *); 220extern char *kdb_getstr(char *, size_t, char *);
267 221
268/* Defines for kdb_symbol_print */ 222/* Defines for kdb_symbol_print */
diff --git a/kernel/exit.c b/kernel/exit.c
index e2bdf37f9fde..676149a4ac5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -95,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk)
95 sig->tty = NULL; 96 sig->tty = NULL;
96 } else { 97 } else {
97 /* 98 /*
99 * This can only happen if the caller is de_thread().
100 * FIXME: this is the temporary hack, we should teach
101 * posix-cpu-timers to handle this case correctly.
102 */
103 if (unlikely(has_group_leader_pid(tsk)))
104 posix_cpu_timers_exit_group(tsk);
105
106 /*
98 * If there is any task waiting for the group exit 107 * If there is any task waiting for the group exit
99 * then notify it: 108 * then notify it:
100 */ 109 */
@@ -687,6 +696,8 @@ static void exit_mm(struct task_struct * tsk)
687 enter_lazy_tlb(mm, current); 696 enter_lazy_tlb(mm, current);
688 /* We don't want this task to be frozen prematurely */ 697 /* We don't want this task to be frozen prematurely */
689 clear_freeze_flag(tsk); 698 clear_freeze_flag(tsk);
699 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
700 atomic_dec(&mm->oom_disable_count);
690 task_unlock(tsk); 701 task_unlock(tsk);
691 mm_update_next_owner(mm); 702 mm_update_next_owner(mm);
692 mmput(mm); 703 mmput(mm);
@@ -700,6 +711,8 @@ static void exit_mm(struct task_struct * tsk)
700 * space. 711 * space.
701 */ 712 */
702static struct task_struct *find_new_reaper(struct task_struct *father) 713static struct task_struct *find_new_reaper(struct task_struct *father)
714 __releases(&tasklist_lock)
715 __acquires(&tasklist_lock)
703{ 716{
704 struct pid_namespace *pid_ns = task_active_pid_ns(father); 717 struct pid_namespace *pid_ns = task_active_pid_ns(father);
705 struct task_struct *thread; 718 struct task_struct *thread;
@@ -901,6 +914,15 @@ NORET_TYPE void do_exit(long code)
901 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
902 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
903 916
917 /*
918 * If do_exit is called because this processes oopsed, it's possible
919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
920 * continuing. Amongst other possible reasons, this is to prevent
921 * mm_release()->clear_child_tid() from writing to a user-controlled
922 * kernel address.
923 */
924 set_fs(USER_DS);
925
904 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
905 927
906 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index c445f8cc408d..5447dc7defa9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
68 69
69#include <asm/pgtable.h> 70#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -272,6 +273,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
272 273
273 setup_thread_stack(tsk, orig); 274 setup_thread_stack(tsk, orig);
274 clear_user_return_notifier(tsk); 275 clear_user_return_notifier(tsk);
276 clear_tsk_need_resched(tsk);
275 stackend = end_of_stack(tsk); 277 stackend = end_of_stack(tsk);
276 *stackend = STACK_END_MAGIC; /* for overflow detection */ 278 *stackend = STACK_END_MAGIC; /* for overflow detection */
277 279
@@ -488,6 +490,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
488 mm->cached_hole_size = ~0UL; 490 mm->cached_hole_size = ~0UL;
489 mm_init_aio(mm); 491 mm_init_aio(mm);
490 mm_init_owner(mm, p); 492 mm_init_owner(mm, p);
493 atomic_set(&mm->oom_disable_count, 0);
491 494
492 if (likely(!mm_alloc_pgd(mm))) { 495 if (likely(!mm_alloc_pgd(mm))) {
493 mm->def_flags = 0; 496 mm->def_flags = 0;
@@ -741,6 +744,8 @@ good_mm:
741 /* Initializing for Swap token stuff */ 744 /* Initializing for Swap token stuff */
742 mm->token_priority = 0; 745 mm->token_priority = 0;
743 mm->last_interval = 0; 746 mm->last_interval = 0;
747 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
748 atomic_inc(&mm->oom_disable_count);
744 749
745 tsk->mm = mm; 750 tsk->mm = mm;
746 tsk->active_mm = mm; 751 tsk->active_mm = mm;
@@ -904,6 +909,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 sig->oom_adj = current->signal->oom_adj; 909 sig->oom_adj = current->signal->oom_adj;
905 sig->oom_score_adj = current->signal->oom_score_adj; 910 sig->oom_score_adj = current->signal->oom_score_adj;
906 911
912 mutex_init(&sig->cred_guard_mutex);
913
907 return 0; 914 return 0;
908} 915}
909 916
@@ -1299,8 +1306,13 @@ bad_fork_cleanup_io:
1299bad_fork_cleanup_namespaces: 1306bad_fork_cleanup_namespaces:
1300 exit_task_namespaces(p); 1307 exit_task_namespaces(p);
1301bad_fork_cleanup_mm: 1308bad_fork_cleanup_mm:
1302 if (p->mm) 1309 if (p->mm) {
1310 task_lock(p);
1311 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1312 atomic_dec(&p->mm->oom_disable_count);
1313 task_unlock(p);
1303 mmput(p->mm); 1314 mmput(p->mm);
1315 }
1304bad_fork_cleanup_signal: 1316bad_fork_cleanup_signal:
1305 if (!(clone_flags & CLONE_THREAD)) 1317 if (!(clone_flags & CLONE_THREAD))
1306 free_signal_struct(p->signal); 1318 free_signal_struct(p->signal);
@@ -1693,6 +1705,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1693 active_mm = current->active_mm; 1705 active_mm = current->active_mm;
1694 current->mm = new_mm; 1706 current->mm = new_mm;
1695 current->active_mm = new_mm; 1707 current->active_mm = new_mm;
1708 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1709 atomic_dec(&mm->oom_disable_count);
1710 atomic_inc(&new_mm->oom_disable_count);
1711 }
1696 activate_mm(active_mm, new_mm); 1712 activate_mm(active_mm, new_mm);
1697 new_mm = mm; 1713 new_mm = mm;
1698 } 1714 }
diff --git a/kernel/futex.c b/kernel/futex.c
index a118bf160e0b..40a8777a27d0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
169 169
170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
171 case FUT_OFF_INODE: 171 case FUT_OFF_INODE:
172 atomic_inc(&key->shared.inode->i_count); 172 ihold(key->shared.inode);
173 break; 173 break;
174 case FUT_OFF_MMSHARED: 174 case FUT_OFF_MMSHARED:
175 atomic_inc(&key->private.mm->mm_count); 175 atomic_inc(&key->private.mm->mm_count);
@@ -2489,7 +2489,8 @@ void exit_robust_list(struct task_struct *curr)
2489{ 2489{
2490 struct robust_list_head __user *head = curr->robust_list; 2490 struct robust_list_head __user *head = curr->robust_list;
2491 struct robust_list __user *entry, *next_entry, *pending; 2491 struct robust_list __user *entry, *next_entry, *pending;
2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2493 unsigned int uninitialized_var(next_pi);
2493 unsigned long futex_offset; 2494 unsigned long futex_offset;
2494 int rc; 2495 int rc;
2495 2496
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 06da4dfc339b..a7934ac75e5b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index f83972b16564..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
561static const struct file_operations gcov_reset_fops = { 561static const struct file_operations gcov_reset_fops = {
562 .write = reset_write, 562 .write = reset_write,
563 .read = reset_read, 563 .read = reset_read,
564 .llseek = noop_llseek,
564}; 565};
565 566
566/* 567/*
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afca..e5325825aeb6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = {
620 .read = hw_breakpoint_pmu_read, 620 .read = hw_breakpoint_pmu_read,
621}; 621};
622 622
623static int __init init_hw_breakpoint(void) 623int __init init_hw_breakpoint(void)
624{ 624{
625 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
626 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void)
655 655
656 return -ENOMEM; 656 return -ENOMEM;
657} 657}
658core_initcall(init_hw_breakpoint);
659 658
660 659
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9d917ff72675..9988d03797f5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
393 struct irq_desc *desc = irq_to_desc(irq); 393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 394 return desc ? desc->kstat_irqs[cpu] : 0;
395} 395}
396
397#ifdef CONFIG_GENERIC_HARDIRQS
398unsigned int kstat_irqs(unsigned int irq)
399{
400 struct irq_desc *desc = irq_to_desc(irq);
401 int cpu;
402 int sum = 0;
403
404 if (!desc)
405 return 0;
406 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu];
408 return sum;
409}
410#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 644e8d5fa367..5f92acc5f952 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -324,6 +324,10 @@ void enable_irq(unsigned int irq)
324 if (!desc) 324 if (!desc)
325 return; 325 return;
326 326
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
327 chip_bus_lock(desc); 331 chip_bus_lock(desc);
328 raw_spin_lock_irqsave(&desc->lock, flags); 332 raw_spin_lock_irqsave(&desc->lock, flags);
329 __enable_irq(desc, irq, false); 333 __enable_irq(desc, irq, false);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a88983..6c8a2a9f8a7b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 214
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 215static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 216{
217 return single_open(file, irq_spurious_proc_show, NULL); 217 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 218}
219 219
220static const struct file_operations irq_spurious_proc_fops = { 220static const struct file_operations irq_spurious_proc_fops = {
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f16763ff8481..90f881904bb1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -145,7 +145,9 @@ void irq_work_run(void)
145 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
147 */ 147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
149 } 151 }
150} 152}
151EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7be868bf25c6..3b79bd938330 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -39,6 +39,16 @@ struct jump_label_module_entry {
39 struct module *mod; 39 struct module *mod;
40}; 40};
41 41
42void jump_label_lock(void)
43{
44 mutex_lock(&jump_label_mutex);
45}
46
47void jump_label_unlock(void)
48{
49 mutex_unlock(&jump_label_mutex);
50}
51
42static int jump_label_cmp(const void *a, const void *b) 52static int jump_label_cmp(const void *a, const void *b)
43{ 53{
44 const struct jump_entry *jea = a; 54 const struct jump_entry *jea = a;
@@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
152 struct jump_label_module_entry *e_module; 162 struct jump_label_module_entry *e_module;
153 int count; 163 int count;
154 164
155 mutex_lock(&jump_label_mutex); 165 jump_label_lock();
156 entry = get_jump_label_entry((jump_label_t)key); 166 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) { 167 if (entry) {
158 count = entry->nr_entries; 168 count = entry->nr_entries;
@@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
168 count = e_module->nr_entries; 178 count = e_module->nr_entries;
169 iter = e_module->table; 179 iter = e_module->table;
170 while (count--) { 180 while (count--) {
171 if (kernel_text_address(iter->code)) 181 if (iter->key &&
182 kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type); 183 arch_jump_label_transform(iter, type);
173 iter++; 184 iter++;
174 } 185 }
175 } 186 }
176 } 187 }
177 mutex_unlock(&jump_label_mutex); 188 jump_label_unlock();
178} 189}
179 190
180static int addr_conflict(struct jump_entry *entry, void *start, void *end) 191static int addr_conflict(struct jump_entry *entry, void *start, void *end)
@@ -231,6 +242,7 @@ out:
231 * overlaps with any of the jump label patch addresses. Code 242 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that 243 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses. 244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
234 * 246 *
235 * returns 1 if there is an overlap, 0 otherwise 247 * returns 1 if there is an overlap, 0 otherwise
236 */ 248 */
@@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end)
241 struct jump_entry *iter_stop = __start___jump_table; 253 struct jump_entry *iter_stop = __start___jump_table;
242 int conflict = 0; 254 int conflict = 0;
243 255
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start; 256 iter = iter_start;
246 while (iter < iter_stop) { 257 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) { 258 if (addr_conflict(iter, start, end)) {
@@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end)
256 conflict = module_conflict(start, end); 267 conflict = module_conflict(start, end);
257#endif 268#endif
258out: 269out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict; 270 return conflict;
261} 271}
262 272
273/*
274 * Not all archs need this.
275 */
276void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{
278}
279
263static __init int init_jump_label(void) 280static __init int init_jump_label(void)
264{ 281{
265 int ret; 282 int ret;
@@ -267,7 +284,7 @@ static __init int init_jump_label(void)
267 struct jump_entry *iter_stop = __stop___jump_table; 284 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter; 285 struct jump_entry *iter;
269 286
270 mutex_lock(&jump_label_mutex); 287 jump_label_lock();
271 ret = build_jump_label_hashtable(__start___jump_table, 288 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table); 289 __stop___jump_table);
273 iter = iter_start; 290 iter = iter_start;
@@ -275,7 +292,7 @@ static __init int init_jump_label(void)
275 arch_jump_label_text_poke_early(iter->code); 292 arch_jump_label_text_poke_early(iter->code);
276 iter++; 293 iter++;
277 } 294 }
278 mutex_unlock(&jump_label_mutex); 295 jump_label_unlock();
279 return ret; 296 return ret;
280} 297}
281early_initcall(init_jump_label); 298early_initcall(init_jump_label);
@@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod)
366 } 383 }
367} 384}
368 385
386static void remove_jump_label_module_init(struct module *mod)
387{
388 struct hlist_head *head;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
400 head = &jump_label_table[i];
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 }
417}
418
369static int 419static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val, 420jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data) 421 void *data)
@@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
375 425
376 switch (val) { 426 switch (val) {
377 case MODULE_STATE_COMING: 427 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex); 428 jump_label_lock();
379 ret = add_jump_label_module(mod); 429 ret = add_jump_label_module(mod);
380 if (ret) 430 if (ret)
381 remove_jump_label_module(mod); 431 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex); 432 jump_label_unlock();
383 break; 433 break;
384 case MODULE_STATE_GOING: 434 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex); 435 jump_label_lock();
386 remove_jump_label_module(mod); 436 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex); 437 jump_label_unlock();
438 break;
439 case MODULE_STATE_LIVE:
440 jump_label_lock();
441 remove_jump_label_module_init(mod);
442 jump_label_unlock();
388 break; 443 break;
389 } 444 }
390 return ret; 445 return ret;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
816 816
817 ptr = kmap(page); 817 ptr = kmap(page);
818 /* Start with a clear page */ 818 /* Start with a clear page */
819 memset(ptr, 0, PAGE_SIZE); 819 clear_page(ptr);
820 ptr += maddr & ~PAGE_MASK; 820 ptr += maddr & ~PAGE_MASK;
821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
822 if (mchunk > mbytes) 822 if (mchunk > mbytes)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ec4210c6501e..9737a76e106f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
74/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
75static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
76 76
77static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
78static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
79static struct { 80static struct {
80 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
595} 596}
596 597
597#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
598static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
599{ 601{
600 struct hlist_head *head; 602 struct hlist_head *head;
@@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
607 return; 609 return;
608 610
609 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
610 mutex_lock(&text_mutex);
611 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
612 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
613 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
614 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
615 optimize_kprobe(p); 616 optimize_kprobe(p);
616 } 617 }
617 mutex_unlock(&text_mutex);
618 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
619} 619}
620 620
621/* This should be called with kprobe_mutex locked */
621static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
622{ 623{
623 struct hlist_head *head; 624 struct hlist_head *head;
@@ -1144,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p)
1144 if (ret) 1145 if (ret)
1145 return ret; 1146 return ret;
1146 1147
1148 jump_label_lock();
1147 preempt_disable(); 1149 preempt_disable();
1148 if (!kernel_text_address((unsigned long) p->addr) || 1150 if (!kernel_text_address((unsigned long) p->addr) ||
1149 in_kprobes_functions((unsigned long) p->addr) || 1151 in_kprobes_functions((unsigned long) p->addr) ||
1150 ftrace_text_reserved(p->addr, p->addr) || 1152 ftrace_text_reserved(p->addr, p->addr) ||
1151 jump_label_text_reserved(p->addr, p->addr)) { 1153 jump_label_text_reserved(p->addr, p->addr))
1152 preempt_enable(); 1154 goto fail_with_jump_label;
1153 return -EINVAL;
1154 }
1155 1155
1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1157 p->flags &= KPROBE_FLAG_DISABLED; 1157 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1165,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1165 * We must hold a refcount of the probed module while updating 1165 * We must hold a refcount of the probed module while updating
1166 * its code to prohibit unexpected unloading. 1166 * its code to prohibit unexpected unloading.
1167 */ 1167 */
1168 if (unlikely(!try_module_get(probed_mod))) { 1168 if (unlikely(!try_module_get(probed_mod)))
1169 preempt_enable(); 1169 goto fail_with_jump_label;
1170 return -EINVAL; 1170
1171 }
1172 /* 1171 /*
1173 * If the module freed .init.text, we couldn't insert 1172 * If the module freed .init.text, we couldn't insert
1174 * kprobes in there. 1173 * kprobes in there.
@@ -1176,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1176 if (within_module_init((unsigned long)p->addr, probed_mod) && 1175 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1177 probed_mod->state != MODULE_STATE_COMING) { 1176 probed_mod->state != MODULE_STATE_COMING) {
1178 module_put(probed_mod); 1177 module_put(probed_mod);
1179 preempt_enable(); 1178 goto fail_with_jump_label;
1180 return -EINVAL;
1181 } 1179 }
1182 } 1180 }
1183 preempt_enable(); 1181 preempt_enable();
1182 jump_label_unlock();
1184 1183
1185 p->nmissed = 0; 1184 p->nmissed = 0;
1186 INIT_LIST_HEAD(&p->list); 1185 INIT_LIST_HEAD(&p->list);
1187 mutex_lock(&kprobe_mutex); 1186 mutex_lock(&kprobe_mutex);
1188 1187
1188 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1189
1189 get_online_cpus(); /* For avoiding text_mutex deadlock. */ 1190 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1190 mutex_lock(&text_mutex); 1191 mutex_lock(&text_mutex);
1191 1192
@@ -1213,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1213out: 1214out:
1214 mutex_unlock(&text_mutex); 1215 mutex_unlock(&text_mutex);
1215 put_online_cpus(); 1216 put_online_cpus();
1217 jump_label_unlock();
1216 mutex_unlock(&kprobe_mutex); 1218 mutex_unlock(&kprobe_mutex);
1217 1219
1218 if (probed_mod) 1220 if (probed_mod)
1219 module_put(probed_mod); 1221 module_put(probed_mod);
1220 1222
1221 return ret; 1223 return ret;
1224
1225fail_with_jump_label:
1226 preempt_enable();
1227 jump_label_unlock();
1228 return -EINVAL;
1222} 1229}
1223EXPORT_SYMBOL_GPL(register_kprobe); 1230EXPORT_SYMBOL_GPL(register_kprobe);
1224 1231
@@ -2000,6 +2007,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2000static const struct file_operations fops_kp = { 2007static const struct file_operations fops_kp = {
2001 .read = read_enabled_file_bool, 2008 .read = read_enabled_file_bool,
2002 .write = write_enabled_file_bool, 2009 .write = write_enabled_file_bool,
2010 .llseek = default_llseek,
2003}; 2011};
2004 2012
2005static int __kprobes debugfs_kprobe_init(void) 2013static int __kprobes debugfs_kprobe_init(void)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..ca61bbdd44b2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -265,6 +265,17 @@ int kthreadd(void *unused)
265 return 0; 265 return 0;
266} 266}
267 267
268void __init_kthread_worker(struct kthread_worker *worker,
269 const char *name,
270 struct lock_class_key *key)
271{
272 spin_lock_init(&worker->lock);
273 lockdep_set_class_and_name(&worker->lock, key, name);
274 INIT_LIST_HEAD(&worker->work_list);
275 worker->task = NULL;
276}
277EXPORT_SYMBOL_GPL(__init_kthread_worker);
278
268/** 279/**
269 * kthread_worker_fn - kthread function to process kthread_worker 280 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker 281 * @worker_ptr: pointer to initialized kthread_worker
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
230 /* Allocated a new one: */ 229 /* Allocated a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
diff --git a/kernel/module.c b/kernel/module.c
index 2df46301a7a4..d190664f25ff 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
2037{ 2037{
2038} 2038}
2039 2039
2040static void add_kallsyms(struct module *mod, struct load_info *info) 2040static void add_kallsyms(struct module *mod, const struct load_info *info)
2041{ 2041{
2042} 2042}
2043#endif /* CONFIG_KALLSYMS */ 2043#endif /* CONFIG_KALLSYMS */
@@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL); 2327 mod->num_trace_events, GFP_KERNEL);
2328#endif 2328#endif
2329#ifdef CONFIG_TRACING
2330 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2331 sizeof(*mod->trace_bprintk_fmt_start),
2332 &mod->num_trace_bprintk_fmt);
2333 /*
2334 * This section contains pointers to allocated objects in the trace
2335 * code and not scanning it leads to false positives.
2336 */
2337 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2338 sizeof(*mod->trace_bprintk_fmt_start) *
2339 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2340#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2341#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */ 2342 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2343 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
85 return ERR_PTR(-EPERM); 85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current)) 86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM); 87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
88 96
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup) 98 if (!ns_cgroup)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c78..2870feee81dd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,6 +31,7 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
34 35
35#include <asm/irq_regs.h> 36#include <asm/irq_regs.h>
36 37
@@ -417,8 +418,8 @@ event_filter_match(struct perf_event *event)
417 return event->cpu == -1 || event->cpu == smp_processor_id(); 418 return event->cpu == -1 || event->cpu == smp_processor_id();
418} 419}
419 420
420static int 421static void
421__event_sched_out(struct perf_event *event, 422event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 423 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 424 struct perf_event_context *ctx)
424{ 425{
@@ -437,13 +438,14 @@ __event_sched_out(struct perf_event *event,
437 } 438 }
438 439
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 440 if (event->state != PERF_EVENT_STATE_ACTIVE)
440 return 0; 441 return;
441 442
442 event->state = PERF_EVENT_STATE_INACTIVE; 443 event->state = PERF_EVENT_STATE_INACTIVE;
443 if (event->pending_disable) { 444 if (event->pending_disable) {
444 event->pending_disable = 0; 445 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 446 event->state = PERF_EVENT_STATE_OFF;
446 } 447 }
448 event->tstamp_stopped = ctx->time;
447 event->pmu->del(event, 0); 449 event->pmu->del(event, 0);
448 event->oncpu = -1; 450 event->oncpu = -1;
449 451
@@ -452,19 +454,6 @@ __event_sched_out(struct perf_event *event,
452 ctx->nr_active--; 454 ctx->nr_active--;
453 if (event->attr.exclusive || !cpuctx->active_oncpu) 455 if (event->attr.exclusive || !cpuctx->active_oncpu)
454 cpuctx->exclusive = 0; 456 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
468} 457}
469 458
470static void 459static void
@@ -664,7 +653,7 @@ retry:
664} 653}
665 654
666static int 655static int
667__event_sched_in(struct perf_event *event, 656event_sched_in(struct perf_event *event,
668 struct perf_cpu_context *cpuctx, 657 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx) 658 struct perf_event_context *ctx)
670{ 659{
@@ -684,6 +673,10 @@ __event_sched_in(struct perf_event *event,
684 return -EAGAIN; 673 return -EAGAIN;
685 } 674 }
686 675
676 event->tstamp_running += ctx->time - event->tstamp_stopped;
677
678 event->shadow_ctx_time = ctx->time - ctx->timestamp;
679
687 if (!is_software_event(event)) 680 if (!is_software_event(event))
688 cpuctx->active_oncpu++; 681 cpuctx->active_oncpu++;
689 ctx->nr_active++; 682 ctx->nr_active++;
@@ -694,35 +687,6 @@ __event_sched_in(struct perf_event *event,
694 return 0; 687 return 0;
695} 688}
696 689
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
726static int 690static int
727group_sched_in(struct perf_event *group_event, 691group_sched_in(struct perf_event *group_event,
728 struct perf_cpu_context *cpuctx, 692 struct perf_cpu_context *cpuctx,
@@ -730,19 +694,15 @@ group_sched_in(struct perf_event *group_event,
730{ 694{
731 struct perf_event *event, *partial_group = NULL; 695 struct perf_event *event, *partial_group = NULL;
732 struct pmu *pmu = group_event->pmu; 696 struct pmu *pmu = group_event->pmu;
697 u64 now = ctx->time;
698 bool simulate = false;
733 699
734 if (group_event->state == PERF_EVENT_STATE_OFF) 700 if (group_event->state == PERF_EVENT_STATE_OFF)
735 return 0; 701 return 0;
736 702
737 pmu->start_txn(pmu); 703 pmu->start_txn(pmu);
738 704
739 /* 705 if (event_sched_in(group_event, cpuctx, ctx)) {
740 * use __event_sched_in() to delay updating tstamp_running
741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu); 706 pmu->cancel_txn(pmu);
747 return -EAGAIN; 707 return -EAGAIN;
748 } 708 }
@@ -751,31 +711,42 @@ group_sched_in(struct perf_event *group_event,
751 * Schedule in siblings as one group (if any): 711 * Schedule in siblings as one group (if any):
752 */ 712 */
753 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 713 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
754 if (__event_sched_in(event, cpuctx, ctx)) { 714 if (event_sched_in(event, cpuctx, ctx)) {
755 partial_group = event; 715 partial_group = event;
756 goto group_error; 716 goto group_error;
757 } 717 }
758 } 718 }
759 719
760 if (!pmu->commit_txn(pmu)) { 720 if (!pmu->commit_txn(pmu))
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
763 return 0; 721 return 0;
764 } 722
765group_error: 723group_error:
766 /* 724 /*
767 * Groups can be scheduled in as one unit only, so undo any 725 * Groups can be scheduled in as one unit only, so undo any
768 * partial group before returning: 726 * partial group before returning:
727 * The events up to the failed event are scheduled out normally,
728 * tstamp_stopped will be updated.
769 * 729 *
770 * use __event_sched_out() to avoid updating tstamp_stopped 730 * The failed events and the remaining siblings need to have
771 * because the event never actually ran 731 * their timings updated as if they had gone thru event_sched_in()
732 * and event_sched_out(). This is required to get consistent timings
733 * across the group. This also takes care of the case where the group
734 * could never be scheduled by ensuring tstamp_stopped is set to mark
735 * the time the event was actually stopped, such that time delta
736 * calculation in update_event_times() is correct.
772 */ 737 */
773 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 738 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
774 if (event == partial_group) 739 if (event == partial_group)
775 break; 740 simulate = true;
776 __event_sched_out(event, cpuctx, ctx); 741
742 if (simulate) {
743 event->tstamp_running += now - event->tstamp_stopped;
744 event->tstamp_stopped = now;
745 } else {
746 event_sched_out(event, cpuctx, ctx);
747 }
777 } 748 }
778 __event_sched_out(group_event, cpuctx, ctx); 749 event_sched_out(group_event, cpuctx, ctx);
779 750
780 pmu->cancel_txn(pmu); 751 pmu->cancel_txn(pmu);
781 752
@@ -1316,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
1316{ 1287{
1317 int ctxn; 1288 int ctxn;
1318 1289
1319 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1320
1321 for_each_task_context_nr(ctxn) 1290 for_each_task_context_nr(ctxn)
1322 perf_event_context_sched_out(task, ctxn, next); 1291 perf_event_context_sched_out(task, ctxn, next);
1323} 1292}
@@ -1651,8 +1620,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
1651{ 1620{
1652 raw_spin_lock(&ctx->lock); 1621 raw_spin_lock(&ctx->lock);
1653 1622
1654 /* Rotate the first entry last of non-pinned groups */ 1623 /*
1655 list_rotate_left(&ctx->flexible_groups); 1624 * Rotate the first entry last of non-pinned groups. Rotation might be
1625 * disabled by the inheritance code.
1626 */
1627 if (!ctx->rotate_disable)
1628 list_rotate_left(&ctx->flexible_groups);
1656 1629
1657 raw_spin_unlock(&ctx->lock); 1630 raw_spin_unlock(&ctx->lock);
1658} 1631}
@@ -2264,11 +2237,6 @@ int perf_event_release_kernel(struct perf_event *event)
2264 raw_spin_unlock_irq(&ctx->lock); 2237 raw_spin_unlock_irq(&ctx->lock);
2265 mutex_unlock(&ctx->mutex); 2238 mutex_unlock(&ctx->mutex);
2266 2239
2267 mutex_lock(&event->owner->perf_event_mutex);
2268 list_del_init(&event->owner_entry);
2269 mutex_unlock(&event->owner->perf_event_mutex);
2270 put_task_struct(event->owner);
2271
2272 free_event(event); 2240 free_event(event);
2273 2241
2274 return 0; 2242 return 0;
@@ -2281,9 +2249,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2281static int perf_release(struct inode *inode, struct file *file) 2249static int perf_release(struct inode *inode, struct file *file)
2282{ 2250{
2283 struct perf_event *event = file->private_data; 2251 struct perf_event *event = file->private_data;
2252 struct task_struct *owner;
2284 2253
2285 file->private_data = NULL; 2254 file->private_data = NULL;
2286 2255
2256 rcu_read_lock();
2257 owner = ACCESS_ONCE(event->owner);
2258 /*
2259 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2260 * !owner it means the list deletion is complete and we can indeed
2261 * free this event, otherwise we need to serialize on
2262 * owner->perf_event_mutex.
2263 */
2264 smp_read_barrier_depends();
2265 if (owner) {
2266 /*
2267 * Since delayed_put_task_struct() also drops the last
2268 * task reference we can safely take a new reference
2269 * while holding the rcu_read_lock().
2270 */
2271 get_task_struct(owner);
2272 }
2273 rcu_read_unlock();
2274
2275 if (owner) {
2276 mutex_lock(&owner->perf_event_mutex);
2277 /*
2278 * We have to re-check the event->owner field, if it is cleared
2279 * we raced with perf_event_exit_task(), acquiring the mutex
2280 * ensured they're done, and we can proceed with freeing the
2281 * event.
2282 */
2283 if (event->owner)
2284 list_del_init(&event->owner_entry);
2285 mutex_unlock(&owner->perf_event_mutex);
2286 put_task_struct(owner);
2287 }
2288
2287 return perf_event_release_kernel(event); 2289 return perf_event_release_kernel(event);
2288} 2290}
2289 2291
@@ -3428,7 +3430,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3428} 3430}
3429 3431
3430static void perf_output_read_one(struct perf_output_handle *handle, 3432static void perf_output_read_one(struct perf_output_handle *handle,
3431 struct perf_event *event) 3433 struct perf_event *event,
3434 u64 enabled, u64 running)
3432{ 3435{
3433 u64 read_format = event->attr.read_format; 3436 u64 read_format = event->attr.read_format;
3434 u64 values[4]; 3437 u64 values[4];
@@ -3436,11 +3439,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3436 3439
3437 values[n++] = perf_event_count(event); 3440 values[n++] = perf_event_count(event);
3438 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3441 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3439 values[n++] = event->total_time_enabled + 3442 values[n++] = enabled +
3440 atomic64_read(&event->child_total_time_enabled); 3443 atomic64_read(&event->child_total_time_enabled);
3441 } 3444 }
3442 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3445 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3443 values[n++] = event->total_time_running + 3446 values[n++] = running +
3444 atomic64_read(&event->child_total_time_running); 3447 atomic64_read(&event->child_total_time_running);
3445 } 3448 }
3446 if (read_format & PERF_FORMAT_ID) 3449 if (read_format & PERF_FORMAT_ID)
@@ -3453,7 +3456,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3453 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3456 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3454 */ 3457 */
3455static void perf_output_read_group(struct perf_output_handle *handle, 3458static void perf_output_read_group(struct perf_output_handle *handle,
3456 struct perf_event *event) 3459 struct perf_event *event,
3460 u64 enabled, u64 running)
3457{ 3461{
3458 struct perf_event *leader = event->group_leader, *sub; 3462 struct perf_event *leader = event->group_leader, *sub;
3459 u64 read_format = event->attr.read_format; 3463 u64 read_format = event->attr.read_format;
@@ -3463,10 +3467,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3463 values[n++] = 1 + leader->nr_siblings; 3467 values[n++] = 1 + leader->nr_siblings;
3464 3468
3465 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3469 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3466 values[n++] = leader->total_time_enabled; 3470 values[n++] = enabled;
3467 3471
3468 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3472 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3469 values[n++] = leader->total_time_running; 3473 values[n++] = running;
3470 3474
3471 if (leader != event) 3475 if (leader != event)
3472 leader->pmu->read(leader); 3476 leader->pmu->read(leader);
@@ -3491,13 +3495,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3491 } 3495 }
3492} 3496}
3493 3497
3498#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3499 PERF_FORMAT_TOTAL_TIME_RUNNING)
3500
3494static void perf_output_read(struct perf_output_handle *handle, 3501static void perf_output_read(struct perf_output_handle *handle,
3495 struct perf_event *event) 3502 struct perf_event *event)
3496{ 3503{
3504 u64 enabled = 0, running = 0, now, ctx_time;
3505 u64 read_format = event->attr.read_format;
3506
3507 /*
3508 * compute total_time_enabled, total_time_running
3509 * based on snapshot values taken when the event
3510 * was last scheduled in.
3511 *
3512 * we cannot simply called update_context_time()
3513 * because of locking issue as we are called in
3514 * NMI context
3515 */
3516 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3517 now = perf_clock();
3518 ctx_time = event->shadow_ctx_time + now;
3519 enabled = ctx_time - event->tstamp_enabled;
3520 running = ctx_time - event->tstamp_running;
3521 }
3522
3497 if (event->attr.read_format & PERF_FORMAT_GROUP) 3523 if (event->attr.read_format & PERF_FORMAT_GROUP)
3498 perf_output_read_group(handle, event); 3524 perf_output_read_group(handle, event, enabled, running);
3499 else 3525 else
3500 perf_output_read_one(handle, event); 3526 perf_output_read_one(handle, event, enabled, running);
3501} 3527}
3502 3528
3503void perf_output_sample(struct perf_output_handle *handle, 3529void perf_output_sample(struct perf_output_handle *handle,
@@ -3798,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3798 rcu_read_lock(); 3824 rcu_read_lock();
3799 list_for_each_entry_rcu(pmu, &pmus, entry) { 3825 list_for_each_entry_rcu(pmu, &pmus, entry) {
3800 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3826 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3827 if (cpuctx->active_pmu != pmu)
3828 goto next;
3801 perf_event_task_ctx(&cpuctx->ctx, task_event); 3829 perf_event_task_ctx(&cpuctx->ctx, task_event);
3802 3830
3803 ctx = task_event->task_ctx; 3831 ctx = task_event->task_ctx;
@@ -3933,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3933 rcu_read_lock(); 3961 rcu_read_lock();
3934 list_for_each_entry_rcu(pmu, &pmus, entry) { 3962 list_for_each_entry_rcu(pmu, &pmus, entry) {
3935 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3963 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3964 if (cpuctx->active_pmu != pmu)
3965 goto next;
3936 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3966 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3937 3967
3938 ctxn = pmu->task_ctx_nr; 3968 ctxn = pmu->task_ctx_nr;
@@ -4118,6 +4148,8 @@ got_name:
4118 rcu_read_lock(); 4148 rcu_read_lock();
4119 list_for_each_entry_rcu(pmu, &pmus, entry) { 4149 list_for_each_entry_rcu(pmu, &pmus, entry) {
4120 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4150 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4151 if (cpuctx->active_pmu != pmu)
4152 goto next;
4121 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4153 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4122 vma->vm_flags & VM_EXEC); 4154 vma->vm_flags & VM_EXEC);
4123 4155
@@ -4687,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event)
4687 break; 4719 break;
4688 } 4720 }
4689 4721
4690 if (event_id > PERF_COUNT_SW_MAX) 4722 if (event_id >= PERF_COUNT_SW_MAX)
4691 return -ENOENT; 4723 return -ENOENT;
4692 4724
4693 if (!event->parent) { 4725 if (!event->parent) {
@@ -5119,20 +5151,36 @@ static void *find_pmu_context(int ctxn)
5119 return NULL; 5151 return NULL;
5120} 5152}
5121 5153
5122static void free_pmu_context(void * __percpu cpu_context) 5154static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5123{ 5155{
5124 struct pmu *pmu; 5156 int cpu;
5157
5158 for_each_possible_cpu(cpu) {
5159 struct perf_cpu_context *cpuctx;
5160
5161 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5162
5163 if (cpuctx->active_pmu == old_pmu)
5164 cpuctx->active_pmu = pmu;
5165 }
5166}
5167
5168static void free_pmu_context(struct pmu *pmu)
5169{
5170 struct pmu *i;
5125 5171
5126 mutex_lock(&pmus_lock); 5172 mutex_lock(&pmus_lock);
5127 /* 5173 /*
5128 * Like a real lame refcount. 5174 * Like a real lame refcount.
5129 */ 5175 */
5130 list_for_each_entry(pmu, &pmus, entry) { 5176 list_for_each_entry(i, &pmus, entry) {
5131 if (pmu->pmu_cpu_context == cpu_context) 5177 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5178 update_pmu_context(i, pmu);
5132 goto out; 5179 goto out;
5180 }
5133 } 5181 }
5134 5182
5135 free_percpu(cpu_context); 5183 free_percpu(pmu->pmu_cpu_context);
5136out: 5184out:
5137 mutex_unlock(&pmus_lock); 5185 mutex_unlock(&pmus_lock);
5138} 5186}
@@ -5164,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu)
5164 cpuctx->ctx.pmu = pmu; 5212 cpuctx->ctx.pmu = pmu;
5165 cpuctx->jiffies_interval = 1; 5213 cpuctx->jiffies_interval = 1;
5166 INIT_LIST_HEAD(&cpuctx->rotation_list); 5214 INIT_LIST_HEAD(&cpuctx->rotation_list);
5215 cpuctx->active_pmu = pmu;
5167 } 5216 }
5168 5217
5169got_cpu_context: 5218got_cpu_context:
@@ -5215,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu)
5215 synchronize_rcu(); 5264 synchronize_rcu();
5216 5265
5217 free_percpu(pmu->pmu_disable_count); 5266 free_percpu(pmu->pmu_disable_count);
5218 free_pmu_context(pmu->pmu_cpu_context); 5267 free_pmu_context(pmu);
5219} 5268}
5220 5269
5221struct pmu *perf_init_event(struct perf_event *event) 5270struct pmu *perf_init_event(struct perf_event *event)
@@ -5683,7 +5732,7 @@ SYSCALL_DEFINE5(perf_event_open,
5683 mutex_unlock(&ctx->mutex); 5732 mutex_unlock(&ctx->mutex);
5684 5733
5685 event->owner = current; 5734 event->owner = current;
5686 get_task_struct(current); 5735
5687 mutex_lock(&current->perf_event_mutex); 5736 mutex_lock(&current->perf_event_mutex);
5688 list_add_tail(&event->owner_entry, &current->perf_event_list); 5737 list_add_tail(&event->owner_entry, &current->perf_event_list);
5689 mutex_unlock(&current->perf_event_mutex); 5738 mutex_unlock(&current->perf_event_mutex);
@@ -5751,12 +5800,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5751 ++ctx->generation; 5800 ++ctx->generation;
5752 mutex_unlock(&ctx->mutex); 5801 mutex_unlock(&ctx->mutex);
5753 5802
5754 event->owner = current;
5755 get_task_struct(current);
5756 mutex_lock(&current->perf_event_mutex);
5757 list_add_tail(&event->owner_entry, &current->perf_event_list);
5758 mutex_unlock(&current->perf_event_mutex);
5759
5760 return event; 5803 return event;
5761 5804
5762err_free: 5805err_free:
@@ -5907,8 +5950,24 @@ again:
5907 */ 5950 */
5908void perf_event_exit_task(struct task_struct *child) 5951void perf_event_exit_task(struct task_struct *child)
5909{ 5952{
5953 struct perf_event *event, *tmp;
5910 int ctxn; 5954 int ctxn;
5911 5955
5956 mutex_lock(&child->perf_event_mutex);
5957 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
5958 owner_entry) {
5959 list_del_init(&event->owner_entry);
5960
5961 /*
5962 * Ensure the list deletion is visible before we clear
5963 * the owner, closes a race against perf_release() where
5964 * we need to serialize on the owner->perf_event_mutex.
5965 */
5966 smp_wmb();
5967 event->owner = NULL;
5968 }
5969 mutex_unlock(&child->perf_event_mutex);
5970
5912 for_each_task_context_nr(ctxn) 5971 for_each_task_context_nr(ctxn)
5913 perf_event_exit_task_context(child, ctxn); 5972 perf_event_exit_task_context(child, ctxn);
5914} 5973}
@@ -6128,6 +6187,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6128 struct perf_event *event; 6187 struct perf_event *event;
6129 struct task_struct *parent = current; 6188 struct task_struct *parent = current;
6130 int inherited_all = 1; 6189 int inherited_all = 1;
6190 unsigned long flags;
6131 int ret = 0; 6191 int ret = 0;
6132 6192
6133 child->perf_event_ctxp[ctxn] = NULL; 6193 child->perf_event_ctxp[ctxn] = NULL;
@@ -6168,6 +6228,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6168 break; 6228 break;
6169 } 6229 }
6170 6230
6231 /*
6232 * We can't hold ctx->lock when iterating the ->flexible_group list due
6233 * to allocations, but we need to prevent rotation because
6234 * rotate_ctx() will change the list from interrupt context.
6235 */
6236 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6237 parent_ctx->rotate_disable = 1;
6238 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6239
6171 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6240 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6172 ret = inherit_task_group(event, parent, parent_ctx, 6241 ret = inherit_task_group(event, parent, parent_ctx,
6173 child, ctxn, &inherited_all); 6242 child, ctxn, &inherited_all);
@@ -6175,6 +6244,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6175 break; 6244 break;
6176 } 6245 }
6177 6246
6247 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6248 parent_ctx->rotate_disable = 0;
6249 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6250
6178 child_ctx = child->perf_event_ctxp[ctxn]; 6251 child_ctx = child->perf_event_ctxp[ctxn];
6179 6252
6180 if (child_ctx && inherited_all) { 6253 if (child_ctx && inherited_all) {
@@ -6327,6 +6400,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6327 6400
6328void __init perf_event_init(void) 6401void __init perf_event_init(void)
6329{ 6402{
6403 int ret;
6404
6330 perf_event_init_all_cpus(); 6405 perf_event_init_all_cpus();
6331 init_srcu_struct(&pmus_srcu); 6406 init_srcu_struct(&pmus_srcu);
6332 perf_pmu_register(&perf_swevent); 6407 perf_pmu_register(&perf_swevent);
@@ -6334,4 +6409,7 @@ void __init perf_event_init(void)
6334 perf_pmu_register(&perf_task_clock); 6409 perf_pmu_register(&perf_task_clock);
6335 perf_tp_register(); 6410 perf_tp_register();
6336 perf_cpu_notifier(perf_cpu_notify); 6411 perf_cpu_notifier(perf_cpu_notify);
6412
6413 ret = init_hw_breakpoint();
6414 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6337} 6415}
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 645e541a45f6..aeaa7f846821 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 110 .write = pm_qos_power_write,
111 .open = pm_qos_power_open, 111 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
113 .llseek = noop_llseek,
113}; 114};
114 115
115/* unlocked internal variant */ 116/* unlocked internal variant */
@@ -120,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
120 121
121 switch (o->type) { 122 switch (o->type) {
122 case PM_QOS_MIN: 123 case PM_QOS_MIN:
123 return plist_last(&o->requests)->prio; 124 return plist_first(&o->requests)->prio;
124 125
125 case PM_QOS_MAX: 126 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio; 127 return plist_last(&o->requests)->prio;
127 128
128 default: 129 default:
129 /* runtime check for not using enum */ 130 /* runtime check for not using enum */
@@ -398,7 +399,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
398 } else 399 } else
399 return -EINVAL; 400 return -EINVAL;
400 401
401 pm_qos_req = (struct pm_qos_request_list *)filp->private_data; 402 pm_qos_req = filp->private_data;
402 pm_qos_update_request(pm_qos_req, value); 403 pm_qos_update_request(pm_qos_req, value);
403 404
404 return count; 405 return count;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..05bb7173850e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
@@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 390
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 391 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 392
393 read_lock(&tasklist_lock); 393 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 395 if (pid == 0) {
396 p = current; 396 p = current;
@@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 404 p = current->group_leader;
405 } else { 405 } else {
406 p = find_task_by_vpid(pid); 406 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 407 if (p && !has_group_leader_pid(p))
408 p = NULL; 408 p = NULL;
409 } 409 }
410 } 410 }
@@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 414 } else {
415 ret = -EINVAL; 415 ret = -EINVAL;
416 } 416 }
417 read_unlock(&tasklist_lock); 417 rcu_read_unlock();
418 418
419 return ret; 419 return ret;
420} 420}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 29bff6117abc..a5aff3ebad38 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -246,9 +246,13 @@ config PM_OPS
246 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
247 default y 247 default y
248 248
249config ARCH_HAS_OPP
250 bool
251
249config PM_OPP 252config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library" 253 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM 254 depends on PM
255 depends on ARCH_HAS_OPP
252 ---help--- 256 ---help---
253 SOCs have a standard set of tuples consisting of frequency and 257 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This 258 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..048d0b514831 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -327,7 +327,6 @@ static int create_image(int platform_mode)
327int hibernation_snapshot(int platform_mode) 327int hibernation_snapshot(int platform_mode)
328{ 328{
329 int error; 329 int error;
330 gfp_t saved_mask;
331 330
332 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
333 if (error) 332 if (error)
@@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
339 goto Close; 338 goto Close;
340 339
341 suspend_console(); 340 suspend_console();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 341 pm_restrict_gfp_mask();
343 error = dpm_suspend_start(PMSG_FREEZE); 342 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 343 if (error)
345 goto Recover_platform; 344 goto Recover_platform;
@@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode)
348 goto Recover_platform; 347 goto Recover_platform;
349 348
350 error = create_image(platform_mode); 349 error = create_image(platform_mode);
351 /* Control returns here after successful restore */ 350 /*
351 * Control returns here (1) after the image has been created or the
352 * image creation has failed and (2) after a successful restore.
353 */
352 354
353 Resume_devices: 355 Resume_devices:
354 /* We may need to release the preallocated image pages here. */ 356 /* We may need to release the preallocated image pages here. */
@@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode)
357 359
358 dpm_resume_end(in_suspend ? 360 dpm_resume_end(in_suspend ?
359 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 361 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
360 set_gfp_allowed_mask(saved_mask); 362
363 if (error || !in_suspend)
364 pm_restore_gfp_mask();
365
361 resume_console(); 366 resume_console();
362 Close: 367 Close:
363 platform_end(platform_mode); 368 platform_end(platform_mode);
@@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode)
452int hibernation_restore(int platform_mode) 457int hibernation_restore(int platform_mode)
453{ 458{
454 int error; 459 int error;
455 gfp_t saved_mask;
456 460
457 pm_prepare_console(); 461 pm_prepare_console();
458 suspend_console(); 462 suspend_console();
459 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 463 pm_restrict_gfp_mask();
460 error = dpm_suspend_start(PMSG_QUIESCE); 464 error = dpm_suspend_start(PMSG_QUIESCE);
461 if (!error) { 465 if (!error) {
462 error = resume_target_kernel(platform_mode); 466 error = resume_target_kernel(platform_mode);
463 dpm_resume_end(PMSG_RECOVER); 467 dpm_resume_end(PMSG_RECOVER);
464 } 468 }
465 set_gfp_allowed_mask(saved_mask); 469 pm_restore_gfp_mask();
466 resume_console(); 470 resume_console();
467 pm_restore_console(); 471 pm_restore_console();
468 return error; 472 return error;
@@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode)
476int hibernation_platform_enter(void) 480int hibernation_platform_enter(void)
477{ 481{
478 int error; 482 int error;
479 gfp_t saved_mask;
480 483
481 if (!hibernation_ops) 484 if (!hibernation_ops)
482 return -ENOSYS; 485 return -ENOSYS;
@@ -492,7 +495,6 @@ int hibernation_platform_enter(void)
492 495
493 entering_platform_hibernation = true; 496 entering_platform_hibernation = true;
494 suspend_console(); 497 suspend_console();
495 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
496 error = dpm_suspend_start(PMSG_HIBERNATE); 498 error = dpm_suspend_start(PMSG_HIBERNATE);
497 if (error) { 499 if (error) {
498 if (hibernation_ops->recover) 500 if (hibernation_ops->recover)
@@ -536,7 +538,6 @@ int hibernation_platform_enter(void)
536 Resume_devices: 538 Resume_devices:
537 entering_platform_hibernation = false; 539 entering_platform_hibernation = false;
538 dpm_resume_end(PMSG_RESTORE); 540 dpm_resume_end(PMSG_RESTORE);
539 set_gfp_allowed_mask(saved_mask);
540 resume_console(); 541 resume_console();
541 542
542 Close: 543 Close:
@@ -646,6 +647,7 @@ int hibernate(void)
646 swsusp_free(); 647 swsusp_free();
647 if (!error) 648 if (!error)
648 power_down(); 649 power_down();
650 pm_restore_gfp_mask();
649 } else { 651 } else {
650 pr_debug("PM: Image restored successfully.\n"); 652 pr_debug("PM: Image restored successfully.\n");
651 } 653 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ac7eb109f196..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
984 src = kmap_atomic(s_page, KM_USER0); 984 src = kmap_atomic(s_page, KM_USER0);
985 dst = kmap_atomic(d_page, KM_USER1); 985 dst = kmap_atomic(d_page, KM_USER1);
986 do_copy_page(dst, src); 986 do_copy_page(dst, src);
987 kunmap_atomic(src, KM_USER0);
988 kunmap_atomic(dst, KM_USER1); 987 kunmap_atomic(dst, KM_USER1);
988 kunmap_atomic(src, KM_USER0);
989 } else { 989 } else {
990 if (PageHighMem(d_page)) { 990 if (PageHighMem(d_page)) {
991 /* Page pointed to by src may contain some kernel 991 /* Page pointed to by src may contain some kernel
@@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
993 */ 993 */
994 safe_copy_page(buffer, s_page); 994 safe_copy_page(buffer, s_page);
995 dst = kmap_atomic(d_page, KM_USER0); 995 dst = kmap_atomic(d_page, KM_USER0);
996 memcpy(dst, buffer, PAGE_SIZE); 996 copy_page(dst, buffer);
997 kunmap_atomic(dst, KM_USER0); 997 kunmap_atomic(dst, KM_USER0);
998 } else { 998 } else {
999 safe_copy_page(page_address(d_page), s_page); 999 safe_copy_page(page_address(d_page), s_page);
@@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1687 memory_bm_position_reset(&orig_bm); 1687 memory_bm_position_reset(&orig_bm);
1688 memory_bm_position_reset(&copy_bm); 1688 memory_bm_position_reset(&copy_bm);
1689 } else if (handle->cur <= nr_meta_pages) { 1689 } else if (handle->cur <= nr_meta_pages) {
1690 memset(buffer, 0, PAGE_SIZE); 1690 clear_page(buffer);
1691 pack_pfns(buffer, &orig_bm); 1691 pack_pfns(buffer, &orig_bm);
1692 } else { 1692 } else {
1693 struct page *page; 1693 struct page *page;
@@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1701 void *kaddr; 1701 void *kaddr;
1702 1702
1703 kaddr = kmap_atomic(page, KM_USER0); 1703 kaddr = kmap_atomic(page, KM_USER0);
1704 memcpy(buffer, kaddr, PAGE_SIZE); 1704 copy_page(buffer, kaddr);
1705 kunmap_atomic(kaddr, KM_USER0); 1705 kunmap_atomic(kaddr, KM_USER0);
1706 handle->buffer = buffer; 1706 handle->buffer = buffer;
1707 } else { 1707 } else {
@@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void)
1984 void *dst; 1984 void *dst;
1985 1985
1986 dst = kmap_atomic(last_highmem_page, KM_USER0); 1986 dst = kmap_atomic(last_highmem_page, KM_USER0);
1987 memcpy(dst, buffer, PAGE_SIZE); 1987 copy_page(dst, buffer);
1988 kunmap_atomic(dst, KM_USER0); 1988 kunmap_atomic(dst, KM_USER0);
1989 last_highmem_page = NULL; 1989 last_highmem_page = NULL;
1990 } 1990 }
@@ -2270,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2270 2270
2271 kaddr1 = kmap_atomic(p1, KM_USER0); 2271 kaddr1 = kmap_atomic(p1, KM_USER0);
2272 kaddr2 = kmap_atomic(p2, KM_USER1); 2272 kaddr2 = kmap_atomic(p2, KM_USER1);
2273 memcpy(buf, kaddr1, PAGE_SIZE); 2273 copy_page(buf, kaddr1);
2274 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2274 copy_page(kaddr1, kaddr2);
2275 memcpy(kaddr2, buf, PAGE_SIZE); 2275 copy_page(kaddr2, buf);
2276 kunmap_atomic(kaddr1, KM_USER0);
2277 kunmap_atomic(kaddr2, KM_USER1); 2276 kunmap_atomic(kaddr2, KM_USER1);
2277 kunmap_atomic(kaddr1, KM_USER0);
2278} 2278}
2279 2279
2280/** 2280/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..ecf770509d0d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 197int suspend_devices_and_enter(suspend_state_t state)
198{ 198{
199 int error; 199 int error;
200 gfp_t saved_mask;
201 200
202 if (!suspend_ops) 201 if (!suspend_ops)
203 return -ENOSYS; 202 return -ENOSYS;
@@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state)
208 goto Close; 207 goto Close;
209 } 208 }
210 suspend_console(); 209 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 210 pm_restrict_gfp_mask();
212 suspend_test_start(); 211 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 212 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 213 if (error) {
@@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state)
225 suspend_test_start(); 224 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 225 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 226 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask); 227 pm_restore_gfp_mask();
229 resume_console(); 228 resume_console();
230 Close: 229 Close:
231 if (suspend_ops->end) 230 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 916eaa790399..8c7e4832b9be 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -29,7 +30,7 @@
29 30
30#include "power.h" 31#include "power.h"
31 32
32#define HIBERNATE_SIG "LINHIB0001" 33#define HIBERNATE_SIG "S1SUSPEND"
33 34
34/* 35/*
35 * The swap map is a data structure used for keeping track of each page 36 * The swap map is a data structure used for keeping track of each page
@@ -251,7 +252,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
251 if (bio_chain) { 252 if (bio_chain) {
252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 253 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
253 if (src) { 254 if (src) {
254 memcpy(src, buf, PAGE_SIZE); 255 copy_page(src, buf);
255 } else { 256 } else {
256 WARN_ON_ONCE(1); 257 WARN_ON_ONCE(1);
257 bio_chain = NULL; /* Go synchronous */ 258 bio_chain = NULL; /* Go synchronous */
@@ -325,7 +326,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
325 error = write_page(handle->cur, handle->cur_swap, NULL); 326 error = write_page(handle->cur, handle->cur_swap, NULL);
326 if (error) 327 if (error)
327 goto out; 328 goto out;
328 memset(handle->cur, 0, PAGE_SIZE); 329 clear_page(handle->cur);
329 handle->cur_swap = offset; 330 handle->cur_swap = offset;
330 handle->k = 0; 331 handle->k = 0;
331 } 332 }
@@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle,
753{ 754{
754 unsigned int m; 755 unsigned int m;
755 int error = 0; 756 int error = 0;
757 struct bio *bio;
756 struct timeval start; 758 struct timeval start;
757 struct timeval stop; 759 struct timeval stop;
758 unsigned nr_pages; 760 unsigned nr_pages;
759 size_t off, unc_len, cmp_len; 761 size_t i, off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
761 763
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
763 if (!page) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 766 if (!page[i]) {
765 return -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
766 } 774 }
767 775
768 unc = vmalloc(LZO_UNC_SIZE); 776 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) { 777 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page); 779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
772 return -ENOMEM; 783 return -ENOMEM;
773 } 784 }
774 785
775 cmp = vmalloc(LZO_CMP_SIZE); 786 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) { 787 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
778 vfree(unc); 790 vfree(unc);
779 free_page((unsigned long)page); 791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
780 return -ENOMEM; 794 return -ENOMEM;
781 } 795 }
782 796
@@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
787 if (!m) 801 if (!m)
788 m = 1; 802 m = 1;
789 nr_pages = 0; 803 nr_pages = 0;
804 bio = NULL;
790 do_gettimeofday(&start); 805 do_gettimeofday(&start);
791 806
792 error = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
@@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle,
794 goto out_finish; 809 goto out_finish;
795 810
796 for (;;) { 811 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */ 812 error = swap_read_page(handle, page[0], NULL); /* sync */
798 if (error) 813 if (error)
799 break; 814 break;
800 815
801 cmp_len = *(size_t *)page; 816 cmp_len = *(size_t *)page[0];
802 if (unlikely(!cmp_len || 817 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
@@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle,
806 break; 821 break;
807 } 822 }
808 823
809 memcpy(cmp, page, PAGE_SIZE); 824 for (off = PAGE_SIZE, i = 1;
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
811 error = swap_read_page(handle, page, NULL); /* sync */ 826 error = swap_read_page(handle, page[i], &bio);
812 if (error) 827 if (error)
813 goto out_finish; 828 goto out_finish;
829 }
814 830
815 memcpy(cmp + off, page, PAGE_SIZE); 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
816 } 838 }
817 839
818 unc_len = LZO_UNC_SIZE; 840 unc_len = LZO_UNC_SIZE;
@@ -857,7 +879,8 @@ out_finish:
857 879
858 vfree(cmp); 880 vfree(cmp);
859 vfree(unc); 881 vfree(unc);
860 free_page((unsigned long)page); 882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
861 884
862 return error; 885 return error;
863} 886}
@@ -910,7 +933,7 @@ int swsusp_check(void)
910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
911 if (!IS_ERR(hib_resume_bdev)) { 934 if (!IS_ERR(hib_resume_bdev)) {
912 set_blocksize(hib_resume_bdev, PAGE_SIZE); 935 set_blocksize(hib_resume_bdev, PAGE_SIZE);
913 memset(swsusp_header, 0, PAGE_SIZE); 936 clear_page(swsusp_header);
914 error = hib_bio_read_page(swsusp_resume_block, 937 error = hib_bio_read_page(swsusp_resume_block,
915 swsusp_header, NULL); 938 swsusp_header, NULL);
916 if (error) 939 if (error)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen)
139 thaw_processes(); 139 thaw_processes();
140 pm_notifier_call_chain(data->mode == O_WRONLY ? 140 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 141 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 142 atomic_inc(&snapshot_device_available);
143 143
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 263 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 264 if (!data->frozen || data->ready)
265 break; 265 break;
266 pm_restore_gfp_mask();
266 thaw_processes(); 267 thaw_processes();
267 usermodehelper_enable(); 268 usermodehelper_enable();
268 data->frozen = 0; 269 data->frozen = 0;
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 276 error = -EPERM;
276 break; 277 break;
277 } 278 }
279 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 280 error = hibernation_snapshot(data->platform_support);
279 if (!error) 281 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 282 error = put_user(in_suspend, (int __user *)arg);
diff --git a/kernel/printk.c b/kernel/printk.c
index 2531017795f6..a23315dc4498 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
210 210
211#ifdef CONFIG_BOOT_PRINTK_DELAY 211#ifdef CONFIG_BOOT_PRINTK_DELAY
212 212
213static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 213static int boot_delay; /* msecs delay after each printk during bootup */
214static unsigned long long loops_per_msec; /* based on boot_delay */ 214static unsigned long long loops_per_msec; /* based on boot_delay */
215 215
216static int __init boot_delay_setup(char *str) 216static int __init boot_delay_setup(char *str)
@@ -261,6 +261,12 @@ static inline void boot_delay_msec(void)
261} 261}
262#endif 262#endif
263 263
264#ifdef CONFIG_SECURITY_DMESG_RESTRICT
265int dmesg_restrict = 1;
266#else
267int dmesg_restrict;
268#endif
269
264int do_syslog(int type, char __user *buf, int len, bool from_file) 270int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 271{
266 unsigned i, j, limit, count; 272 unsigned i, j, limit, count;
@@ -268,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
268 char c; 274 char c;
269 int error = 0; 275 int error = 0;
270 276
271 error = security_syslog(type, from_file); 277 /*
278 * If this is from /proc/kmsg we only do the capabilities checks
279 * at open time.
280 */
281 if (type == SYSLOG_ACTION_OPEN || !from_file) {
282 if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
283 return -EPERM;
284 if ((type != SYSLOG_ACTION_READ_ALL &&
285 type != SYSLOG_ACTION_SIZE_BUFFER) &&
286 !capable(CAP_SYS_ADMIN))
287 return -EPERM;
288 }
289
290 error = security_syslog(type);
272 if (error) 291 if (error)
273 return error; 292 return error;
274 293
@@ -647,6 +666,7 @@ static inline int can_use_console(unsigned int cpu)
647 * released but interrupts still disabled. 666 * released but interrupts still disabled.
648 */ 667 */
649static int acquire_console_semaphore_for_printk(unsigned int cpu) 668static int acquire_console_semaphore_for_printk(unsigned int cpu)
669 __releases(&logbuf_lock)
650{ 670{
651 int retval = 0; 671 int retval = 0;
652 672
@@ -1062,13 +1082,15 @@ void printk_tick(void)
1062 1082
1063int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1064{ 1084{
1085 if (unlikely(cpu_is_offline(cpu)))
1086 printk_tick();
1065 return per_cpu(printk_pending, cpu); 1087 return per_cpu(printk_pending, cpu);
1066} 1088}
1067 1089
1068void wake_up_klogd(void) 1090void wake_up_klogd(void)
1069{ 1091{
1070 if (waitqueue_active(&log_wait)) 1092 if (waitqueue_active(&log_wait))
1071 __raw_get_cpu_var(printk_pending) = 1; 1093 this_cpu_write(printk_pending, 1);
1072} 1094}
1073 1095
1074/** 1096/**
@@ -1511,7 +1533,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1511} 1533}
1512EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1534EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1513 1535
1514static const char const *kmsg_reasons[] = { 1536static const char * const kmsg_reasons[] = {
1515 [KMSG_DUMP_OOPS] = "oops", 1537 [KMSG_DUMP_OOPS] = "oops",
1516 [KMSG_DUMP_PANIC] = "panic", 1538 [KMSG_DUMP_PANIC] = "panic",
1517 [KMSG_DUMP_KEXEC] = "kexec", 1539 [KMSG_DUMP_KEXEC] = "kexec",
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..66f841b7fbd3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
555static const struct file_operations proc_profile_operations = { 555static const struct file_operations proc_profile_operations = {
556 .read = read_profile, 556 .read = read_profile,
557 .write = write_profile, 557 .write = write_profile,
558 .llseek = default_llseek,
558}; 559};
559 560
560#ifdef CONFIG_SMP 561#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 181 * under ptrace.
182 */ 182 */
183 retval = -ERESTARTNOINTR; 183 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 184 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 185 goto out;
186 186
187 task_lock(task); 187 task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
208unlock_tasklist: 208unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 209 write_unlock_irq(&tasklist_lock);
210unlock_creds: 210unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 211 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 212out:
213 return retval; 213 return retval;
214} 214}
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
329 * and reacquire the lock. 329 * and reacquire the lock.
330 */ 330 */
331void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332 __releases(&tasklist_lock)
333 __acquires(&tasklist_lock)
332{ 334{
333 struct task_struct *p, *n; 335 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 336 LIST_HEAD(ptrace_dead);
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
402 return copied; 404 return copied;
403} 405}
404 406
405static int ptrace_setoptions(struct task_struct *child, long data) 407static int ptrace_setoptions(struct task_struct *child, unsigned long data)
406{ 408{
407 child->ptrace &= ~PT_TRACE_MASK; 409 child->ptrace &= ~PT_TRACE_MASK;
408 410
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
481#define is_sysemu_singlestep(request) 0 483#define is_sysemu_singlestep(request) 0
482#endif 484#endif
483 485
484static int ptrace_resume(struct task_struct *child, long request, long data) 486static int ptrace_resume(struct task_struct *child, long request,
487 unsigned long data)
485{ 488{
486 if (!valid_signal(data)) 489 if (!valid_signal(data))
487 return -EIO; 490 return -EIO;
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
558#endif 561#endif
559 562
560int ptrace_request(struct task_struct *child, long request, 563int ptrace_request(struct task_struct *child, long request,
561 long addr, long data) 564 unsigned long addr, unsigned long data)
562{ 565{
563 int ret = -EIO; 566 int ret = -EIO;
564 siginfo_t siginfo; 567 siginfo_t siginfo;
568 void __user *datavp = (void __user *) data;
569 unsigned long __user *datalp = datavp;
565 570
566 switch (request) { 571 switch (request) {
567 case PTRACE_PEEKTEXT: 572 case PTRACE_PEEKTEXT:
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
578 ret = ptrace_setoptions(child, data); 583 ret = ptrace_setoptions(child, data);
579 break; 584 break;
580 case PTRACE_GETEVENTMSG: 585 case PTRACE_GETEVENTMSG:
581 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 586 ret = put_user(child->ptrace_message, datalp);
582 break; 587 break;
583 588
584 case PTRACE_GETSIGINFO: 589 case PTRACE_GETSIGINFO:
585 ret = ptrace_getsiginfo(child, &siginfo); 590 ret = ptrace_getsiginfo(child, &siginfo);
586 if (!ret) 591 if (!ret)
587 ret = copy_siginfo_to_user((siginfo_t __user *) data, 592 ret = copy_siginfo_to_user(datavp, &siginfo);
588 &siginfo);
589 break; 593 break;
590 594
591 case PTRACE_SETSIGINFO: 595 case PTRACE_SETSIGINFO:
592 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 596 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
593 sizeof siginfo))
594 ret = -EFAULT; 597 ret = -EFAULT;
595 else 598 else
596 ret = ptrace_setsiginfo(child, &siginfo); 599 ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
621 } 624 }
622 mmput(mm); 625 mmput(mm);
623 626
624 ret = put_user(tmp, (unsigned long __user *) data); 627 ret = put_user(tmp, datalp);
625 break; 628 break;
626 } 629 }
627#endif 630#endif
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
650 case PTRACE_SETREGSET: 653 case PTRACE_SETREGSET:
651 { 654 {
652 struct iovec kiov; 655 struct iovec kiov;
653 struct iovec __user *uiov = (struct iovec __user *) data; 656 struct iovec __user *uiov = datavp;
654 657
655 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 658 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
656 return -EFAULT; 659 return -EFAULT;
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
691#define arch_ptrace_attach(child) do { } while (0) 694#define arch_ptrace_attach(child) do { } while (0)
692#endif 695#endif
693 696
694SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 697SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
698 unsigned long, data)
695{ 699{
696 struct task_struct *child; 700 struct task_struct *child;
697 long ret; 701 long ret;
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
732 return ret; 736 return ret;
733} 737}
734 738
735int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 739int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
740 unsigned long data)
736{ 741{
737 unsigned long tmp; 742 unsigned long tmp;
738 int copied; 743 int copied;
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
743 return put_user(tmp, (unsigned long __user *)data); 748 return put_user(tmp, (unsigned long __user *)data);
744} 749}
745 750
746int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 751int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
752 unsigned long data)
747{ 753{
748 int copied; 754 int copied;
749 755
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb929..859ea5a9605f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
70 */ 70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages) 71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{ 72{
73 struct page **array; 73 const size_t pa_size = n_pages * sizeof(struct page *);
74 size_t pa_size = n_pages * sizeof(struct page *); 74 if (pa_size > PAGE_SIZE)
75 75 return vzalloc(pa_size);
76 if (pa_size > PAGE_SIZE) { 76 return kzalloc(pa_size, GFP_KERNEL);
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84} 77}
85 78
86/* 79/*
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -357,6 +357,32 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 358}
359 359
360void __weak arch_remove_reservations(struct resource *avail)
361{
362}
363
364static resource_size_t simple_align_resource(void *data,
365 const struct resource *avail,
366 resource_size_t size,
367 resource_size_t align)
368{
369 return avail->start;
370}
371
372static void resource_clip(struct resource *res, resource_size_t min,
373 resource_size_t max)
374{
375 if (res->start < min)
376 res->start = min;
377 if (res->end > max)
378 res->end = max;
379}
380
381static bool resource_contains(struct resource *res1, struct resource *res2)
382{
383 return res1->start <= res2->start && res1->end >= res2->end;
384}
385
360/* 386/*
361 * Find empty slot in the resource tree given range and alignment. 387 * Find empty slot in the resource tree given range and alignment.
362 */ 388 */
@@ -370,8 +396,9 @@ static int find_resource(struct resource *root, struct resource *new,
370 void *alignf_data) 396 void *alignf_data)
371{ 397{
372 struct resource *this = root->child; 398 struct resource *this = root->child;
373 struct resource tmp = *new; 399 struct resource tmp = *new, avail, alloc;
374 400
401 tmp.flags = new->flags;
375 tmp.start = root->start; 402 tmp.start = root->start;
376 /* 403 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 404 * Skip past an allocated resource that starts at 0, since the assignment
@@ -386,17 +413,22 @@ static int find_resource(struct resource *root, struct resource *new,
386 tmp.end = this->start - 1; 413 tmp.end = this->start - 1;
387 else 414 else
388 tmp.end = root->end; 415 tmp.end = root->end;
389 if (tmp.start < min) 416
390 tmp.start = min; 417 resource_clip(&tmp, min, max);
391 if (tmp.end > max) 418 arch_remove_reservations(&tmp);
392 tmp.end = max; 419
393 tmp.start = ALIGN(tmp.start, align); 420 /* Check for overflow after ALIGN() */
394 if (alignf) 421 avail = *new;
395 tmp.start = alignf(alignf_data, &tmp, size, align); 422 avail.start = ALIGN(tmp.start, align);
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 423 avail.end = tmp.end;
397 new->start = tmp.start; 424 if (avail.start >= tmp.start) {
398 new->end = tmp.start + size - 1; 425 alloc.start = alignf(alignf_data, &avail, size, align);
399 return 0; 426 alloc.end = alloc.start + size - 1;
427 if (resource_contains(&avail, &alloc)) {
428 new->start = alloc.start;
429 new->end = alloc.end;
430 return 0;
431 }
400 } 432 }
401 if (!this) 433 if (!this)
402 break; 434 break;
@@ -428,6 +460,9 @@ int allocate_resource(struct resource *root, struct resource *new,
428{ 460{
429 int err; 461 int err;
430 462
463 if (!alignf)
464 alignf = simple_align_resource;
465
431 write_lock(&resource_lock); 466 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
433 if (err >= 0 && __request_resource(root, new)) 468 if (err >= 0 && __request_resource(root, new))
@@ -453,6 +488,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 488
454 if (first == parent) 489 if (first == parent)
455 return first; 490 return first;
491 if (WARN_ON(first == new)) /* duplicated insertion */
492 return first;
456 493
457 if ((first->start > new->start) || (first->end < new->end)) 494 if ((first->start > new->start) || (first->end < new->end))
458 break; 495 break;
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..66cb89bc5ef1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
76 } 76 }
77 77
78 if (!lockwakeup && td->bkl == 4) { 78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
79 unlock_kernel(); 80 unlock_kernel();
81#endif
80 td->bkl = 0; 82 td->bkl = 0;
81 } 83 }
82 return 0; 84 return 0;
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
133 if (td->bkl) 135 if (td->bkl)
134 return 0; 136 return 0;
135 td->bkl = 1; 137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
136 lock_kernel(); 139 lock_kernel();
140#endif
137 td->bkl = 4; 141 td->bkl = 4;
138 return 0; 142 return 0;
139 143
140 case RTTEST_UNLOCKBKL: 144 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4) 145 if (td->bkl != 4)
142 break; 146 break;
147#ifdef CONFIG_LOCK_KERNEL
143 unlock_kernel(); 148 unlock_kernel();
149#endif
144 td->bkl = 0; 150 td->bkl = 0;
145 return 0; 151 return 0;
146 152
diff --git a/kernel/sched.c b/kernel/sched.c
index d42992bccdfa..297d1a0eedb0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -560,18 +560,8 @@ struct rq {
560 560
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 562
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 563
568 /* 564static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 565
576static inline int cpu_of(struct rq *rq) 566static inline int cpu_of(struct rq *rq)
577{ 567{
@@ -646,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
646 636
647#endif /* CONFIG_CGROUP_SCHED */ 637#endif /* CONFIG_CGROUP_SCHED */
648 638
649static u64 irq_time_cpu(int cpu); 639static void update_rq_clock_task(struct rq *rq, s64 delta);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651 640
652inline void update_rq_clock(struct rq *rq) 641static void update_rq_clock(struct rq *rq)
653{ 642{
654 if (!rq->skip_clock_update) { 643 s64 delta;
655 int cpu = cpu_of(rq);
656 u64 irq_time;
657 644
658 rq->clock = sched_clock_cpu(cpu); 645 if (rq->skip_clock_update)
659 irq_time = irq_time_cpu(cpu); 646 return;
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662 647
663 sched_irq_time_avg_update(rq, irq_time); 648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
664 } 649 rq->clock += delta;
650 update_rq_clock_task(rq, delta);
665} 651}
666 652
667/* 653/*
@@ -1934,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934 * They are read and saved off onto struct rq in update_rq_clock(). 1920 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can 1921 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old 1922 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of 1923 * or new value with a side effect of accounting a slice of irq time to wrong
1938 * accounting a slice of irq time to wrong task when irq is in progress 1924 * task when irq is in progress while we read rq->clock. That is a worthy
1939 * while we read rq->clock. That is a worthy compromise in place of having 1925 * compromise in place of having locks on each irq in account_system_time.
1940 * locks on each irq in account_system_time.
1941 */ 1926 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1927static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time); 1928static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1955,19 +1940,58 @@ void disable_sched_clock_irqtime(void)
1955 sched_clock_irqtime = 0; 1940 sched_clock_irqtime = 0;
1956} 1941}
1957 1942
1958static u64 irq_time_cpu(int cpu) 1943#ifndef CONFIG_64BIT
1944static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1945
1946static inline void irq_time_write_begin(void)
1959{ 1947{
1960 if (!sched_clock_irqtime) 1948 __this_cpu_inc(irq_time_seq.sequence);
1961 return 0; 1949 smp_wmb();
1950}
1962 1951
1952static inline void irq_time_write_end(void)
1953{
1954 smp_wmb();
1955 __this_cpu_inc(irq_time_seq.sequence);
1956}
1957
1958static inline u64 irq_time_read(int cpu)
1959{
1960 u64 irq_time;
1961 unsigned seq;
1962
1963 do {
1964 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1965 irq_time = per_cpu(cpu_softirq_time, cpu) +
1966 per_cpu(cpu_hardirq_time, cpu);
1967 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1968
1969 return irq_time;
1970}
1971#else /* CONFIG_64BIT */
1972static inline void irq_time_write_begin(void)
1973{
1974}
1975
1976static inline void irq_time_write_end(void)
1977{
1978}
1979
1980static inline u64 irq_time_read(int cpu)
1981{
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1982 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964} 1983}
1984#endif /* CONFIG_64BIT */
1965 1985
1986/*
1987 * Called before incrementing preempt_count on {soft,}irq_enter
1988 * and before decrementing preempt_count on {soft,}irq_exit.
1989 */
1966void account_system_vtime(struct task_struct *curr) 1990void account_system_vtime(struct task_struct *curr)
1967{ 1991{
1968 unsigned long flags; 1992 unsigned long flags;
1993 s64 delta;
1969 int cpu; 1994 int cpu;
1970 u64 now, delta;
1971 1995
1972 if (!sched_clock_irqtime) 1996 if (!sched_clock_irqtime)
1973 return; 1997 return;
@@ -1975,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr)
1975 local_irq_save(flags); 1999 local_irq_save(flags);
1976 2000
1977 cpu = smp_processor_id(); 2001 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu); 2002 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1979 delta = now - per_cpu(irq_start_time, cpu); 2003 __this_cpu_add(irq_start_time, delta);
1980 per_cpu(irq_start_time, cpu) = now; 2004
2005 irq_time_write_begin();
1981 /* 2006 /*
1982 * We do not account for softirq time from ksoftirqd here. 2007 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread 2008 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1985,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr)
1985 * that do not consume any time, but still wants to run. 2010 * that do not consume any time, but still wants to run.
1986 */ 2011 */
1987 if (hardirq_count()) 2012 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta; 2013 __this_cpu_add(cpu_hardirq_time, delta);
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 2014 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta; 2015 __this_cpu_add(cpu_softirq_time, delta);
1991 2016
2017 irq_time_write_end();
1992 local_irq_restore(flags); 2018 local_irq_restore(flags);
1993} 2019}
1994EXPORT_SYMBOL_GPL(account_system_vtime); 2020EXPORT_SYMBOL_GPL(account_system_vtime);
1995 2021
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) 2022static void update_rq_clock_task(struct rq *rq, s64 delta)
1997{ 2023{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { 2024 s64 irq_delta;
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time; 2025
2000 rq->prev_irq_time = curr_irq_time; 2026 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2001 sched_rt_avg_update(rq, delta_irq); 2027
2002 } 2028 /*
2029 * Since irq_time is only updated on {soft,}irq_exit, we might run into
2030 * this case when a previous update_rq_clock() happened inside a
2031 * {soft,}irq region.
2032 *
2033 * When this happens, we stop ->clock_task and only update the
2034 * prev_irq_time stamp to account for the part that fit, so that a next
2035 * update will consume the rest. This ensures ->clock_task is
2036 * monotonic.
2037 *
2038 * It does however cause some slight miss-attribution of {soft,}irq
2039 * time, a more accurate solution would be to update the irq_time using
2040 * the current rq->clock timestamp, except that would require using
2041 * atomic ops.
2042 */
2043 if (irq_delta > delta)
2044 irq_delta = delta;
2045
2046 rq->prev_irq_time += irq_delta;
2047 delta -= irq_delta;
2048 rq->clock_task += delta;
2049
2050 if (irq_delta && sched_feat(NONIRQ_POWER))
2051 sched_rt_avg_update(rq, irq_delta);
2003} 2052}
2004 2053
2005#else 2054#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2006 2055
2007static u64 irq_time_cpu(int cpu) 2056static void update_rq_clock_task(struct rq *rq, s64 delta)
2008{ 2057{
2009 return 0; 2058 rq->clock_task += delta;
2010} 2059}
2011 2060
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } 2061#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2013
2014#endif
2015 2062
2016#include "sched_idletask.c" 2063#include "sched_idletask.c"
2017#include "sched_fair.c" 2064#include "sched_fair.c"
@@ -2118,6 +2165,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2165 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2166}
2120 2167
2168static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2169{
2170 const struct sched_class *class;
2171
2172 if (p->sched_class == rq->curr->sched_class) {
2173 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2174 } else {
2175 for_each_class(class) {
2176 if (class == rq->curr->sched_class)
2177 break;
2178 if (class == p->sched_class) {
2179 resched_task(rq->curr);
2180 break;
2181 }
2182 }
2183 }
2184
2185 /*
2186 * A queue event has occurred, and we're going to schedule. In
2187 * this case, we can save a useless back to back clock update.
2188 */
2189 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2190 rq->skip_clock_update = 1;
2191}
2192
2121#ifdef CONFIG_SMP 2193#ifdef CONFIG_SMP
2122/* 2194/*
2123 * Is this task likely cache-hot: 2195 * Is this task likely cache-hot:
@@ -3104,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3104 return delta; 3176 return delta;
3105} 3177}
3106 3178
3179static unsigned long
3180calc_load(unsigned long load, unsigned long exp, unsigned long active)
3181{
3182 load *= exp;
3183 load += active * (FIXED_1 - exp);
3184 load += 1UL << (FSHIFT - 1);
3185 return load >> FSHIFT;
3186}
3187
3107#ifdef CONFIG_NO_HZ 3188#ifdef CONFIG_NO_HZ
3108/* 3189/*
3109 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3190 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3133,6 +3214,128 @@ static long calc_load_fold_idle(void)
3133 3214
3134 return delta; 3215 return delta;
3135} 3216}
3217
3218/**
3219 * fixed_power_int - compute: x^n, in O(log n) time
3220 *
3221 * @x: base of the power
3222 * @frac_bits: fractional bits of @x
3223 * @n: power to raise @x to.
3224 *
3225 * By exploiting the relation between the definition of the natural power
3226 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3227 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3228 * (where: n_i \elem {0, 1}, the binary vector representing n),
3229 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3230 * of course trivially computable in O(log_2 n), the length of our binary
3231 * vector.
3232 */
3233static unsigned long
3234fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3235{
3236 unsigned long result = 1UL << frac_bits;
3237
3238 if (n) for (;;) {
3239 if (n & 1) {
3240 result *= x;
3241 result += 1UL << (frac_bits - 1);
3242 result >>= frac_bits;
3243 }
3244 n >>= 1;
3245 if (!n)
3246 break;
3247 x *= x;
3248 x += 1UL << (frac_bits - 1);
3249 x >>= frac_bits;
3250 }
3251
3252 return result;
3253}
3254
3255/*
3256 * a1 = a0 * e + a * (1 - e)
3257 *
3258 * a2 = a1 * e + a * (1 - e)
3259 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3260 * = a0 * e^2 + a * (1 - e) * (1 + e)
3261 *
3262 * a3 = a2 * e + a * (1 - e)
3263 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3264 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3265 *
3266 * ...
3267 *
3268 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3269 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3270 * = a0 * e^n + a * (1 - e^n)
3271 *
3272 * [1] application of the geometric series:
3273 *
3274 * n 1 - x^(n+1)
3275 * S_n := \Sum x^i = -------------
3276 * i=0 1 - x
3277 */
3278static unsigned long
3279calc_load_n(unsigned long load, unsigned long exp,
3280 unsigned long active, unsigned int n)
3281{
3282
3283 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3284}
3285
3286/*
3287 * NO_HZ can leave us missing all per-cpu ticks calling
3288 * calc_load_account_active(), but since an idle CPU folds its delta into
3289 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3290 * in the pending idle delta if our idle period crossed a load cycle boundary.
3291 *
3292 * Once we've updated the global active value, we need to apply the exponential
3293 * weights adjusted to the number of cycles missed.
3294 */
3295static void calc_global_nohz(unsigned long ticks)
3296{
3297 long delta, active, n;
3298
3299 if (time_before(jiffies, calc_load_update))
3300 return;
3301
3302 /*
3303 * If we crossed a calc_load_update boundary, make sure to fold
3304 * any pending idle changes, the respective CPUs might have
3305 * missed the tick driven calc_load_account_active() update
3306 * due to NO_HZ.
3307 */
3308 delta = calc_load_fold_idle();
3309 if (delta)
3310 atomic_long_add(delta, &calc_load_tasks);
3311
3312 /*
3313 * If we were idle for multiple load cycles, apply them.
3314 */
3315 if (ticks >= LOAD_FREQ) {
3316 n = ticks / LOAD_FREQ;
3317
3318 active = atomic_long_read(&calc_load_tasks);
3319 active = active > 0 ? active * FIXED_1 : 0;
3320
3321 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3322 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3323 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3324
3325 calc_load_update += n * LOAD_FREQ;
3326 }
3327
3328 /*
3329 * Its possible the remainder of the above division also crosses
3330 * a LOAD_FREQ period, the regular check in calc_global_load()
3331 * which comes after this will take care of that.
3332 *
3333 * Consider us being 11 ticks before a cycle completion, and us
3334 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3335 * age us 4 cycles, and the test in calc_global_load() will
3336 * pick up the final one.
3337 */
3338}
3136#else 3339#else
3137static void calc_load_account_idle(struct rq *this_rq) 3340static void calc_load_account_idle(struct rq *this_rq)
3138{ 3341{
@@ -3142,6 +3345,10 @@ static inline long calc_load_fold_idle(void)
3142{ 3345{
3143 return 0; 3346 return 0;
3144} 3347}
3348
3349static void calc_global_nohz(unsigned long ticks)
3350{
3351}
3145#endif 3352#endif
3146 3353
3147/** 3354/**
@@ -3159,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3159 loads[2] = (avenrun[2] + offset) << shift; 3366 loads[2] = (avenrun[2] + offset) << shift;
3160} 3367}
3161 3368
3162static unsigned long
3163calc_load(unsigned long load, unsigned long exp, unsigned long active)
3164{
3165 load *= exp;
3166 load += active * (FIXED_1 - exp);
3167 return load >> FSHIFT;
3168}
3169
3170/* 3369/*
3171 * calc_load - update the avenrun load estimates 10 ticks after the 3370 * calc_load - update the avenrun load estimates 10 ticks after the
3172 * CPUs have updated calc_load_tasks. 3371 * CPUs have updated calc_load_tasks.
3173 */ 3372 */
3174void calc_global_load(void) 3373void calc_global_load(unsigned long ticks)
3175{ 3374{
3176 unsigned long upd = calc_load_update + 10;
3177 long active; 3375 long active;
3178 3376
3179 if (time_before(jiffies, upd)) 3377 calc_global_nohz(ticks);
3378
3379 if (time_before(jiffies, calc_load_update + 10))
3180 return; 3380 return;
3181 3381
3182 active = atomic_long_read(&calc_load_tasks); 3382 active = atomic_long_read(&calc_load_tasks);
@@ -3830,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3830{ 4030{
3831 if (prev->se.on_rq) 4031 if (prev->se.on_rq)
3832 update_rq_clock(rq); 4032 update_rq_clock(rq);
3833 rq->skip_clock_update = 0;
3834 prev->sched_class->put_prev_task(rq, prev); 4033 prev->sched_class->put_prev_task(rq, prev);
3835} 4034}
3836 4035
@@ -3888,7 +4087,6 @@ need_resched_nonpreemptible:
3888 hrtick_clear(rq); 4087 hrtick_clear(rq);
3889 4088
3890 raw_spin_lock_irq(&rq->lock); 4089 raw_spin_lock_irq(&rq->lock);
3891 clear_tsk_need_resched(prev);
3892 4090
3893 switch_count = &prev->nivcsw; 4091 switch_count = &prev->nivcsw;
3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4092 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3920,6 +4118,8 @@ need_resched_nonpreemptible:
3920 4118
3921 put_prev_task(rq, prev); 4119 put_prev_task(rq, prev);
3922 next = pick_next_task(rq); 4120 next = pick_next_task(rq);
4121 clear_tsk_need_resched(prev);
4122 rq->skip_clock_update = 0;
3923 4123
3924 if (likely(prev != next)) { 4124 if (likely(prev != next)) {
3925 sched_info_switch(prev, next); 4125 sched_info_switch(prev, next);
@@ -6960,6 +7160,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 7160 if (cpu != group_first_cpu(sd->groups))
6961 return; 7161 return;
6962 7162
7163 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7164
6963 child = sd->child; 7165 child = sd->child;
6964 7166
6965 sd->groups->cpu_power = 0; 7167 sd->groups->cpu_power = 0;
@@ -8510,12 +8712,12 @@ void sched_move_task(struct task_struct *tsk)
8510 if (unlikely(running)) 8712 if (unlikely(running))
8511 tsk->sched_class->put_prev_task(rq, tsk); 8713 tsk->sched_class->put_prev_task(rq, tsk);
8512 8714
8513 set_task_rq(tsk, task_cpu(tsk));
8514
8515#ifdef CONFIG_FAIR_GROUP_SCHED 8715#ifdef CONFIG_FAIR_GROUP_SCHED
8516 if (tsk->sched_class->moved_group) 8716 if (tsk->sched_class->task_move_group)
8517 tsk->sched_class->moved_group(tsk, on_rq); 8717 tsk->sched_class->task_move_group(tsk, on_rq);
8718 else
8518#endif 8719#endif
8720 set_task_rq(tsk, task_cpu(tsk));
8519 8721
8520 if (unlikely(running)) 8722 if (unlikely(running))
8521 tsk->sched_class->set_curr_task(rq); 8723 tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 933f3d1b62ea..00ebd7686676 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1655 int scale = cfs_rq->nr_running >= sched_nr_latency;
1656 1656
1657 if (unlikely(rt_prio(p->prio)))
1658 goto preempt;
1659
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1657 if (unlikely(se == pse))
1664 return; 1658 return;
1665 1659
@@ -1764,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1758 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1759 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1760 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1771} 1761}
1772 1762
1773/* 1763/*
@@ -2035,13 +2025,16 @@ struct sd_lb_stats {
2035 unsigned long this_load_per_task; 2025 unsigned long this_load_per_task;
2036 unsigned long this_nr_running; 2026 unsigned long this_nr_running;
2037 unsigned long this_has_capacity; 2027 unsigned long this_has_capacity;
2028 unsigned int this_idle_cpus;
2038 2029
2039 /* Statistics of the busiest group */ 2030 /* Statistics of the busiest group */
2031 unsigned int busiest_idle_cpus;
2040 unsigned long max_load; 2032 unsigned long max_load;
2041 unsigned long busiest_load_per_task; 2033 unsigned long busiest_load_per_task;
2042 unsigned long busiest_nr_running; 2034 unsigned long busiest_nr_running;
2043 unsigned long busiest_group_capacity; 2035 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity; 2036 unsigned long busiest_has_capacity;
2037 unsigned int busiest_group_weight;
2045 2038
2046 int group_imb; /* Is there imbalance in this sd */ 2039 int group_imb; /* Is there imbalance in this sd */
2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2040#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2056,8 @@ struct sg_lb_stats {
2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2056 unsigned long sum_nr_running; /* Nr tasks running in the group */
2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2057 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065 unsigned long group_capacity; 2058 unsigned long group_capacity;
2059 unsigned long idle_cpus;
2060 unsigned long group_weight;
2066 int group_imb; /* Is there an imbalance in the group ? */ 2061 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */ 2062 int group_has_capacity; /* Is there extra capacity in the group? */
2068}; 2063};
@@ -2431,7 +2426,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431 sgs->group_load += load; 2426 sgs->group_load += load;
2432 sgs->sum_nr_running += rq->nr_running; 2427 sgs->sum_nr_running += rq->nr_running;
2433 sgs->sum_weighted_load += weighted_cpuload(i); 2428 sgs->sum_weighted_load += weighted_cpuload(i);
2434 2429 if (idle_cpu(i))
2430 sgs->idle_cpus++;
2435 } 2431 }
2436 2432
2437 /* 2433 /*
@@ -2469,6 +2465,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2465 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470 if (!sgs->group_capacity) 2466 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group); 2467 sgs->group_capacity = fix_small_capacity(sd, group);
2468 sgs->group_weight = group->group_weight;
2472 2469
2473 if (sgs->group_capacity > sgs->sum_nr_running) 2470 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1; 2471 sgs->group_has_capacity = 1;
@@ -2576,13 +2573,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576 sds->this_nr_running = sgs.sum_nr_running; 2573 sds->this_nr_running = sgs.sum_nr_running;
2577 sds->this_load_per_task = sgs.sum_weighted_load; 2574 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity; 2575 sds->this_has_capacity = sgs.group_has_capacity;
2576 sds->this_idle_cpus = sgs.idle_cpus;
2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2577 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580 sds->max_load = sgs.avg_load; 2578 sds->max_load = sgs.avg_load;
2581 sds->busiest = sg; 2579 sds->busiest = sg;
2582 sds->busiest_nr_running = sgs.sum_nr_running; 2580 sds->busiest_nr_running = sgs.sum_nr_running;
2581 sds->busiest_idle_cpus = sgs.idle_cpus;
2583 sds->busiest_group_capacity = sgs.group_capacity; 2582 sds->busiest_group_capacity = sgs.group_capacity;
2584 sds->busiest_load_per_task = sgs.sum_weighted_load; 2583 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity; 2584 sds->busiest_has_capacity = sgs.group_has_capacity;
2585 sds->busiest_group_weight = sgs.group_weight;
2586 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2587 } 2587 }
2588 2588
@@ -2860,8 +2860,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860 if (sds.this_load >= sds.avg_load) 2860 if (sds.this_load >= sds.avg_load)
2861 goto out_balanced; 2861 goto out_balanced;
2862 2862
2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 /*
2864 goto out_balanced; 2864 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
2865 * And to check for busy balance use !idle_cpu instead of
2866 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
2867 * even when they are idle.
2868 */
2869 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
2870 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2871 goto out_balanced;
2872 } else {
2873 /*
2874 * This cpu is idle. If the busiest group load doesn't
2875 * have more tasks than the number of available cpu's and
2876 * there is no imbalance between this and busiest group
2877 * wrt to idle cpu's, it is balanced.
2878 */
2879 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
2880 sds.busiest_nr_running <= sds.busiest_group_weight)
2881 goto out_balanced;
2882 }
2865 2883
2866force_balance: 2884force_balance:
2867 /* Looks like there is an imbalance. Compute it */ 2885 /* Looks like there is an imbalance. Compute it */
@@ -3197,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3197 interval = msecs_to_jiffies(sd->balance_interval); 3215 interval = msecs_to_jiffies(sd->balance_interval);
3198 if (time_after(next_balance, sd->last_balance + interval)) 3216 if (time_after(next_balance, sd->last_balance + interval))
3199 next_balance = sd->last_balance + interval; 3217 next_balance = sd->last_balance + interval;
3200 if (pulled_task) 3218 if (pulled_task) {
3219 this_rq->idle_stamp = 0;
3201 break; 3220 break;
3221 }
3202 } 3222 }
3203 3223
3204 raw_spin_lock(&this_rq->lock); 3224 raw_spin_lock(&this_rq->lock);
@@ -3869,13 +3889,26 @@ static void set_curr_task_fair(struct rq *rq)
3869} 3889}
3870 3890
3871#ifdef CONFIG_FAIR_GROUP_SCHED 3891#ifdef CONFIG_FAIR_GROUP_SCHED
3872static void moved_group_fair(struct task_struct *p, int on_rq) 3892static void task_move_group_fair(struct task_struct *p, int on_rq)
3873{ 3893{
3874 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3894 /*
3875 3895 * If the task was not on the rq at the time of this cgroup movement
3876 update_curr(cfs_rq); 3896 * it must have been asleep, sleeping tasks keep their ->vruntime
3897 * absolute on their old rq until wakeup (needed for the fair sleeper
3898 * bonus in place_entity()).
3899 *
3900 * If it was on the rq, we've just 'preempted' it, which does convert
3901 * ->vruntime to a relative base.
3902 *
3903 * Make sure both cases convert their relative position when migrating
3904 * to another cgroup's rq. This does somewhat interfere with the
3905 * fair sleeper stuff for the first placement, but who cares.
3906 */
3907 if (!on_rq)
3908 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
3909 set_task_rq(p, task_cpu(p));
3877 if (!on_rq) 3910 if (!on_rq)
3878 place_entity(cfs_rq, &p->se, 1); 3911 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3879} 3912}
3880#endif 3913#endif
3881 3914
@@ -3927,7 +3960,7 @@ static const struct sched_class fair_sched_class = {
3927 .get_rr_interval = get_rr_interval_fair, 3960 .get_rr_interval = get_rr_interval_fair,
3928 3961
3929#ifdef CONFIG_FAIR_GROUP_SCHED 3962#ifdef CONFIG_FAIR_GROUP_SCHED
3930 .moved_group = moved_group_fair, 3963 .task_move_group = task_move_group_fair,
3931#endif 3964#endif
3932}; 3965};
3933 3966
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..48ddf431db0e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
157} 157}
158 158
159/* 159/*
160 * Called when a process is dequeued from the active array and given 160 * We are interested in knowing how long it was from the *first* time a
161 * the cpu. We should note that with the exception of interactive
162 * tasks, the expired queue will become the active queue after the active
163 * queue is empty, without explicitly dequeuing and requeuing tasks in the
164 * expired queue. (Interactive tasks may be requeued directly to the
165 * active queue, thus delaying tasks in the expired queue from running;
166 * see scheduler_tick()).
167 *
168 * Though we are interested in knowing how long it was from the *first* time a
169 * task was queued to the time that it finally hit a cpu, we call this routine 161 * task was queued to the time that it finally hit a cpu, we call this routine
170 * from dequeue_task() to account for possible rq->clock skew across cpus. The 162 * from dequeue_task() to account for possible rq->clock skew across cpus. The
171 * delta taken on each cpu would annul the skew. 163 * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
203} 195}
204 196
205/* 197/*
206 * Called when a process is queued into either the active or expired
207 * array. The time is noted and later used to determine how long we
208 * had to wait for us to reach the cpu. Since the expired queue will
209 * become the active queue after active queue is empty, without dequeuing
210 * and requeuing any tasks, we are interested in queuing to either. It
211 * is unusual but not impossible for tasks to be dequeued and immediately
212 * requeued in the same or another array: this can happen in sched_yield(),
213 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
214 * to runqueue.
215 *
216 * This function is only called from enqueue_task(), but also only updates 198 * This function is only called from enqueue_task(), but also only updates
217 * the timestamp if it is already not set. It's assumed that 199 * the timestamp if it is already not set. It's assumed that
218 * sched_info_dequeued() will clear that stamp when appropriate. 200 * sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 45bddc0c1048..2bf6b47058c1 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
19static void 19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) 20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{ 21{
22 resched_task(rq->curr); /* we preempt everything */ 22 /* we're never preempted */
23} 23}
24 24
25static struct task_struct *pick_next_task_stop(struct rq *rq) 25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 26{
27 struct task_struct *stop = rq->stop; 27 struct task_struct *stop = rq->stop;
28 28
29 if (stop && stop->state == TASK_RUNNING) 29 if (stop && stop->se.on_rq)
30 return stop; 30 return stop;
31 31
32 return NULL; 32 return NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1105 return count;
1106} 1106}
1107 1107
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1109 unsigned long *flags)
1109{ 1110{
1110 struct sighand_struct *sighand; 1111 struct sighand_struct *sighand;
1111 1112
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
1617 * is gone, we keep current->exit_code unless clear_code. 1618 * is gone, we keep current->exit_code unless clear_code.
1618 */ 1619 */
1619static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock)
1620{ 1623{
1621 if (arch_ptrace_stop_needed(exit_code, info)) { 1624 if (arch_ptrace_stop_needed(exit_code, info)) {
1622 /* 1625 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 267 *
268 * Returns 0 on success, else a negative status code. 268 * Returns 0 on success, else a negative status code.
269 */ 269 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 271 int wait)
272{ 272{
273 struct call_single_data d = { 273 struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 336 * 3) any other online cpu in @mask
337 */ 337 */
338int smp_call_function_any(const struct cpumask *mask, 338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 339 smp_call_func_t func, void *info, int wait)
340{ 340{
341 unsigned int cpu; 341 unsigned int cpu;
342 const struct cpumask *nodemask; 342 const struct cpumask *nodemask;
@@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
416 * must be disabled when calling this function. 416 * must be disabled when calling this function.
417 */ 417 */
418void smp_call_function_many(const struct cpumask *mask, 418void smp_call_function_many(const struct cpumask *mask,
419 void (*func)(void *), void *info, bool wait) 419 smp_call_func_t func, void *info, bool wait)
420{ 420{
421 struct call_function_data *data; 421 struct call_function_data *data;
422 unsigned long flags; 422 unsigned long flags;
@@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
500 * You must not call this function with disabled interrupts or from a 500 * You must not call this function with disabled interrupts or from a
501 * hardware interrupt handler or from a bottom half handler. 501 * hardware interrupt handler or from a bottom half handler.
502 */ 502 */
503int smp_call_function(void (*func)(void *), void *info, int wait) 503int smp_call_function(smp_call_func_t func, void *info, int wait)
504{ 504{
505 preempt_disable(); 505 preempt_disable();
506 smp_call_function_many(cpu_online_mask, func, info, wait); 506 smp_call_function_many(cpu_online_mask, func, info, wait);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fc978889b194..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
67 * to the pending events, so lets the scheduler to balance 67 * to the pending events, so lets the scheduler to balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -229,18 +229,20 @@ restart:
229 229
230 do { 230 do {
231 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
232 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
233 kstat_incr_softirqs_this_cpu(h - softirq_vec);
234 234
235 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
236 h->action(h); 238 h->action(h);
237 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
238 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
239 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
240 "with preempt_count %08x," 242 "with preempt_count %08x,"
241 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
242 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
243 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
244 preempt_count() = prev_count; 246 preempt_count() = prev_count;
245 } 247 }
246 248
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 090c28812ce1..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
305 cpu); 305 cpu);
306 if (IS_ERR(p)) 306 if (IS_ERR(p))
307 return NOTIFY_BAD; 307 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu); 309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p); 310 sched_set_stop_task(cpu, p);
@@ -372,7 +372,7 @@ static int __init cpu_stop_init(void)
372 /* start one for the boot cpu */ 372 /* start one for the boot cpu */
373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
374 bcpu); 374 bcpu);
375 BUG_ON(err == NOTIFY_BAD); 375 BUG_ON(err != NOTIFY_OK);
376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
377 register_cpu_notifier(&cpu_stop_cpu_notifier); 377 register_cpu_notifier(&cpu_stop_cpu_notifier);
378 378
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..5abfa1518554 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -161,8 +161,6 @@ extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 161extern int unaligned_dump_stack;
162#endif 162#endif
163 163
164extern struct ratelimit_state printk_ratelimit_state;
165
166#ifdef CONFIG_PROC_SYSCTL 164#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 165static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 166 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -704,6 +702,15 @@ static struct ctl_table kern_table[] = {
704 .extra1 = &zero, 702 .extra1 = &zero,
705 .extra2 = &ten_thousand, 703 .extra2 = &ten_thousand,
706 }, 704 },
705 {
706 .procname = "dmesg_restrict",
707 .data = &dmesg_restrict,
708 .maxlen = sizeof(int),
709 .mode = 0644,
710 .proc_handler = proc_dointvec_minmax,
711 .extra1 = &zero,
712 .extra2 = &one,
713 },
707#endif 714#endif
708 { 715 {
709 .procname = "ngroups_max", 716 .procname = "ngroups_max",
@@ -1340,28 +1347,28 @@ static struct ctl_table fs_table[] = {
1340 .data = &inodes_stat, 1347 .data = &inodes_stat,
1341 .maxlen = 2*sizeof(int), 1348 .maxlen = 2*sizeof(int),
1342 .mode = 0444, 1349 .mode = 0444,
1343 .proc_handler = proc_dointvec, 1350 .proc_handler = proc_nr_inodes,
1344 }, 1351 },
1345 { 1352 {
1346 .procname = "inode-state", 1353 .procname = "inode-state",
1347 .data = &inodes_stat, 1354 .data = &inodes_stat,
1348 .maxlen = 7*sizeof(int), 1355 .maxlen = 7*sizeof(int),
1349 .mode = 0444, 1356 .mode = 0444,
1350 .proc_handler = proc_dointvec, 1357 .proc_handler = proc_nr_inodes,
1351 }, 1358 },
1352 { 1359 {
1353 .procname = "file-nr", 1360 .procname = "file-nr",
1354 .data = &files_stat, 1361 .data = &files_stat,
1355 .maxlen = 3*sizeof(int), 1362 .maxlen = sizeof(files_stat),
1356 .mode = 0444, 1363 .mode = 0444,
1357 .proc_handler = proc_nr_files, 1364 .proc_handler = proc_nr_files,
1358 }, 1365 },
1359 { 1366 {
1360 .procname = "file-max", 1367 .procname = "file-max",
1361 .data = &files_stat.max_files, 1368 .data = &files_stat.max_files,
1362 .maxlen = sizeof(int), 1369 .maxlen = sizeof(files_stat.max_files),
1363 .mode = 0644, 1370 .mode = 0644,
1364 .proc_handler = proc_dointvec, 1371 .proc_handler = proc_doulongvec_minmax,
1365 }, 1372 },
1366 { 1373 {
1367 .procname = "nr_open", 1374 .procname = "nr_open",
@@ -1377,7 +1384,7 @@ static struct ctl_table fs_table[] = {
1377 .data = &dentry_stat, 1384 .data = &dentry_stat,
1378 .maxlen = 6*sizeof(int), 1385 .maxlen = 6*sizeof(int),
1379 .mode = 0444, 1386 .mode = 0444,
1380 .proc_handler = proc_dointvec, 1387 .proc_handler = proc_nr_dentry,
1381 }, 1388 },
1382 { 1389 {
1383 .procname = "overflowuid", 1390 .procname = "overflowuid",
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..3308fd7f1b52 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 175 up_write(&listeners->sem);
176} 176}
177 177
178static int fill_pid(pid_t pid, struct task_struct *tsk, 178static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 179{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 180 memset(stats, 0, sizeof(*stats));
195 /* 181 /*
196 * Each accounting subsystem adds calls to its functions to 182 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 195
210 /* fill in extended acct fields */ 196 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 197 xacct_add_tsk(stats, tsk);
198}
212 199
213 /* Define err: label here if needed */ 200static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 201{
215 return rc; 202 struct task_struct *tsk;
216 203
204 rcu_read_lock();
205 tsk = find_task_by_vpid(pid);
206 if (tsk)
207 get_task_struct(tsk);
208 rcu_read_unlock();
209 if (!tsk)
210 return -ESRCH;
211 fill_stats(tsk, stats);
212 put_task_struct(tsk);
213 return 0;
217} 214}
218 215
219static int fill_tgid(pid_t tgid, struct task_struct *first, 216static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 217{
222 struct task_struct *tsk; 218 struct task_struct *tsk, *first;
223 unsigned long flags; 219 unsigned long flags;
224 int rc = -ESRCH; 220 int rc = -ESRCH;
225 221
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 224 * leaders who are already counted with the dead tasks
229 */ 225 */
230 rcu_read_lock(); 226 rcu_read_lock();
231 if (!first) 227 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 228
234 if (!first || !lock_task_sighand(first, &flags)) 229 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 230 goto out;
@@ -268,7 +263,6 @@ out:
268 return rc; 263 return rc;
269} 264}
270 265
271
272static void fill_tgid_exit(struct task_struct *tsk) 266static void fill_tgid_exit(struct task_struct *tsk)
273{ 267{
274 unsigned long flags; 268 unsigned long flags;
@@ -355,6 +349,10 @@ static int parse(struct nlattr *na, struct cpumask *mask)
355 return ret; 349 return ret;
356} 350}
357 351
352#ifdef CONFIG_IA64
353#define TASKSTATS_NEEDS_PADDING 1
354#endif
355
358static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 356static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
359{ 357{
360 struct nlattr *na, *ret; 358 struct nlattr *na, *ret;
@@ -364,9 +362,33 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
364 ? TASKSTATS_TYPE_AGGR_PID 362 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 363 : TASKSTATS_TYPE_AGGR_TGID;
366 364
365 /*
366 * The taskstats structure is internally aligned on 8 byte
367 * boundaries but the layout of the aggregrate reply, with
368 * two NLA headers and the pid (each 4 bytes), actually
369 * force the entire structure to be unaligned. This causes
370 * the kernel to issue unaligned access warnings on some
371 * architectures like ia64. Unfortunately, some software out there
372 * doesn't properly unroll the NLA packet and assumes that the start
373 * of the taskstats structure will always be 20 bytes from the start
374 * of the netlink payload. Aligning the start of the taskstats
375 * structure breaks this software, which we don't want. So, for now
376 * the alignment only happens on architectures that require it
377 * and those users will have to update to fixed versions of those
378 * packages. Space is reserved in the packet only when needed.
379 * This ifdef should be removed in several years e.g. 2012 once
380 * we can be confident that fixed versions are installed on most
381 * systems. We add the padding before the aggregate since the
382 * aggregate is already a defined type.
383 */
384#ifdef TASKSTATS_NEEDS_PADDING
385 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
386 goto err;
387#endif
367 na = nla_nest_start(skb, aggr); 388 na = nla_nest_start(skb, aggr);
368 if (!na) 389 if (!na)
369 goto err; 390 goto err;
391
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 392 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371 goto err; 393 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 394 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
@@ -424,74 +446,122 @@ err:
424 return rc; 446 return rc;
425} 447}
426 448
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 449static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 450{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 451 cpumask_var_t mask;
452 int rc;
434 453
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 454 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 455 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 456 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 457 if (rc < 0)
440 goto free_return_rc; 458 goto out;
441 if (rc == 0) { 459 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 460out:
443 goto free_return_rc; 461 free_cpumask_var(mask);
444 } 462 return rc;
463}
464
465static int cmd_attr_deregister_cpumask(struct genl_info *info)
466{
467 cpumask_var_t mask;
468 int rc;
445 469
470 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
471 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 472 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 473 if (rc < 0)
448 goto free_return_rc; 474 goto out;
449 if (rc == 0) { 475 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 476out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 477 free_cpumask_var(mask);
478 return rc;
479}
480
481static size_t taskstats_packet_size(void)
482{
483 size_t size;
456 484
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 485 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 486 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
487#ifdef TASKSTATS_NEEDS_PADDING
488 size += nla_total_size(0); /* Padding for alignment */
489#endif
490 return size;
491}
492
493static int cmd_attr_pid(struct genl_info *info)
494{
495 struct taskstats *stats;
496 struct sk_buff *rep_skb;
497 size_t size;
498 u32 pid;
499 int rc;
500
501 size = taskstats_packet_size();
462 502
463 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 503 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
464 if (rc < 0) 504 if (rc < 0)
465 return rc; 505 return rc;
466 506
467 rc = -EINVAL; 507 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 508 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 509 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 510 if (!stats)
471 if (!stats)
472 goto err;
473
474 rc = fill_pid(pid, NULL, stats);
475 if (rc < 0)
476 goto err;
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
480 if (!stats)
481 goto err;
482
483 rc = fill_tgid(tgid, NULL, stats);
484 if (rc < 0)
485 goto err;
486 } else
487 goto err; 511 goto err;
488 512
513 rc = fill_stats_for_pid(pid, stats);
514 if (rc < 0)
515 goto err;
489 return send_reply(rep_skb, info); 516 return send_reply(rep_skb, info);
490err: 517err:
491 nlmsg_free(rep_skb); 518 nlmsg_free(rep_skb);
492 return rc; 519 return rc;
493} 520}
494 521
522static int cmd_attr_tgid(struct genl_info *info)
523{
524 struct taskstats *stats;
525 struct sk_buff *rep_skb;
526 size_t size;
527 u32 tgid;
528 int rc;
529
530 size = taskstats_packet_size();
531
532 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
533 if (rc < 0)
534 return rc;
535
536 rc = -EINVAL;
537 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
538 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
539 if (!stats)
540 goto err;
541
542 rc = fill_stats_for_tgid(tgid, stats);
543 if (rc < 0)
544 goto err;
545 return send_reply(rep_skb, info);
546err:
547 nlmsg_free(rep_skb);
548 return rc;
549}
550
551static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
552{
553 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
554 return cmd_attr_register_cpumask(info);
555 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
556 return cmd_attr_deregister_cpumask(info);
557 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
558 return cmd_attr_pid(info);
559 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
560 return cmd_attr_tgid(info);
561 else
562 return -EINVAL;
563}
564
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 565static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 566{
497 struct signal_struct *sig = tsk->signal; 567 struct signal_struct *sig = tsk->signal;
@@ -532,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
532 /* 602 /*
533 * Size includes space for nested attributes 603 * Size includes space for nested attributes
534 */ 604 */
535 size = nla_total_size(sizeof(u32)) + 605 size = taskstats_packet_size();
536 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
537 606
538 is_thread_group = !!taskstats_tgid_alloc(tsk); 607 is_thread_group = !!taskstats_tgid_alloc(tsk);
539 if (is_thread_group) { 608 if (is_thread_group) {
@@ -555,9 +624,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 624 if (!stats)
556 goto err; 625 goto err;
557 626
558 rc = fill_pid(-1, tsk, stats); 627 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 628
562 /* 629 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 630 * Doesn't matter if tsk is the leader or the last group member leaving
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b7..353b9227c2ec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1252 struct tvec_base *base = __get_cpu_var(tvec_bases);
1253 unsigned long expires; 1253 unsigned long expires;
1254 1254
1255 /*
1256 * Pretend that there is no timer pending if the cpu is offline.
1257 * Possible pending timers will be migrated later to an active cpu.
1258 */
1259 if (cpu_is_offline(smp_processor_id()))
1260 return now + NEXT_TIMER_MAX_DELTA;
1255 spin_lock(&base->lock); 1261 spin_lock(&base->lock);
1256 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1262 if (time_before_eq(base->next_timer, base->timer_jiffies))
1257 base->next_timer = __next_timer_interrupt(base); 1263 base->next_timer = __next_timer_interrupt(base);
@@ -1319,7 +1325,7 @@ void do_timer(unsigned long ticks)
1319{ 1325{
1320 jiffies_64 += ticks; 1326 jiffies_64 += ticks;
1321 update_wall_time(); 1327 update_wall_time();
1322 calc_global_load(); 1328 calc_global_load(ticks);
1323} 1329}
1324 1330
1325#ifdef __ARCH_WANT_SYS_ALARM 1331#ifdef __ARCH_WANT_SYS_ALARM
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bcdef88..ea37e2ff4164 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -126,7 +126,7 @@ if FTRACE
126config FUNCTION_TRACER 126config FUNCTION_TRACER
127 bool "Kernel Function Tracer" 127 bool "Kernel Function Tracer"
128 depends on HAVE_FUNCTION_TRACER 128 depends on HAVE_FUNCTION_TRACER
129 select FRAME_POINTER if (!ARM_UNWIND) 129 select FRAME_POINTER if !ARM_UNWIND && !S390
130 select KALLSYMS 130 select KALLSYMS
131 select GENERIC_TRACER 131 select GENERIC_TRACER
132 select CONTEXT_SWITCH_TRACER 132 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..7b8ec0281548 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/smp_lock.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -169,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
171 170
172#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
173#define BLK_TC_RAHEAD BLK_TC_AHEAD 171#define BLK_TC_RAHEAD BLK_TC_AHEAD
174 172
175/* The ilog2() calls fall out because they're constant */ 173/* The ilog2() calls fall out because they're constant */
@@ -197,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
197 return; 195 return;
198 196
199 what |= ddir_act[rw & WRITE]; 197 what |= ddir_act[rw & WRITE];
200 what |= MASK_TC_BIT(rw, HARDBARRIER);
201 what |= MASK_TC_BIT(rw, SYNC); 198 what |= MASK_TC_BIT(rw, SYNC);
202 what |= MASK_TC_BIT(rw, RAHEAD); 199 what |= MASK_TC_BIT(rw, RAHEAD);
203 what |= MASK_TC_BIT(rw, META); 200 what |= MASK_TC_BIT(rw, META);
@@ -326,6 +323,7 @@ static const struct file_operations blk_dropped_fops = {
326 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
327 .open = blk_dropped_open, 324 .open = blk_dropped_open,
328 .read = blk_dropped_read, 325 .read = blk_dropped_read,
326 .llseek = default_llseek,
329}; 327};
330 328
331static int blk_msg_open(struct inode *inode, struct file *filp) 329static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +363,7 @@ static const struct file_operations blk_msg_fops = {
365 .owner = THIS_MODULE, 363 .owner = THIS_MODULE,
366 .open = blk_msg_open, 364 .open = blk_msg_open,
367 .write = blk_msg_write, 365 .write = blk_msg_write,
366 .llseek = noop_llseek,
368}; 367};
369 368
370/* 369/*
@@ -639,7 +638,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
639 if (!q) 638 if (!q)
640 return -ENXIO; 639 return -ENXIO;
641 640
642 lock_kernel();
643 mutex_lock(&bdev->bd_mutex); 641 mutex_lock(&bdev->bd_mutex);
644 642
645 switch (cmd) { 643 switch (cmd) {
@@ -667,7 +665,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
667 } 665 }
668 666
669 mutex_unlock(&bdev->bd_mutex); 667 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
671 return ret; 668 return ret;
672} 669}
673 670
@@ -1652,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1652 struct block_device *bdev; 1649 struct block_device *bdev;
1653 ssize_t ret = -ENXIO; 1650 ssize_t ret = -ENXIO;
1654 1651
1655 lock_kernel();
1656 bdev = bdget(part_devt(p)); 1652 bdev = bdget(part_devt(p));
1657 if (bdev == NULL) 1653 if (bdev == NULL)
1658 goto out_unlock_kernel; 1654 goto out;
1659 1655
1660 q = blk_trace_get_queue(bdev); 1656 q = blk_trace_get_queue(bdev);
1661 if (q == NULL) 1657 if (q == NULL)
@@ -1683,8 +1679,7 @@ out_unlock_bdev:
1683 mutex_unlock(&bdev->bd_mutex); 1679 mutex_unlock(&bdev->bd_mutex);
1684out_bdput: 1680out_bdput:
1685 bdput(bdev); 1681 bdput(bdev);
1686out_unlock_kernel: 1682out:
1687 unlock_kernel();
1688 return ret; 1683 return ret;
1689} 1684}
1690 1685
@@ -1714,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1714 1709
1715 ret = -ENXIO; 1710 ret = -ENXIO;
1716 1711
1717 lock_kernel();
1718 p = dev_to_part(dev); 1712 p = dev_to_part(dev);
1719 bdev = bdget(part_devt(p)); 1713 bdev = bdget(part_devt(p));
1720 if (bdev == NULL) 1714 if (bdev == NULL)
1721 goto out_unlock_kernel; 1715 goto out;
1722 1716
1723 q = blk_trace_get_queue(bdev); 1717 q = blk_trace_get_queue(bdev);
1724 if (q == NULL) 1718 if (q == NULL)
@@ -1753,8 +1747,6 @@ out_unlock_bdev:
1753 mutex_unlock(&bdev->bd_mutex); 1747 mutex_unlock(&bdev->bd_mutex);
1754out_bdput: 1748out_bdput:
1755 bdput(bdev); 1749 bdput(bdev);
1756out_unlock_kernel:
1757 unlock_kernel();
1758out: 1750out:
1759 return ret ? ret : count; 1751 return ret ? ret : count;
1760} 1752}
@@ -1813,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1813 1805
1814 if (rw & REQ_RAHEAD) 1806 if (rw & REQ_RAHEAD)
1815 rwbs[i++] = 'A'; 1807 rwbs[i++] = 'A';
1816 if (rw & REQ_HARDBARRIER)
1817 rwbs[i++] = 'B';
1818 if (rw & REQ_SYNC) 1808 if (rw & REQ_SYNC)
1819 rwbs[i++] = 'S'; 1809 rwbs[i++] = 'S';
1820 if (rw & REQ_META) 1810 if (rw & REQ_META)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ebd80d50c474..f3dadae83883 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
800 .open = tracing_open_generic, 800 .open = tracing_open_generic,
801 .read = ftrace_profile_read, 801 .read = ftrace_profile_read,
802 .write = ftrace_profile_write, 802 .write = ftrace_profile_write,
803 .llseek = default_llseek,
803}; 804};
804 805
805/* used to initialize the real stat files */ 806/* used to initialize the real stat files */
@@ -2669,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = {
2669 .read = seq_read, 2670 .read = seq_read,
2670 .write = ftrace_graph_write, 2671 .write = ftrace_graph_write,
2671 .release = ftrace_graph_release, 2672 .release = ftrace_graph_release,
2673 .llseek = seq_lseek,
2672}; 2674};
2673#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2675#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2674 2676
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c5a632a669e1..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1604 * @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the resevered space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
@@ -2308,12 +2298,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2310{ 2300{
2301 u64 delta;
2302
2311 /* 2303 /*
2312 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2313 * time stamp. 2305 * time stamp.
2314 */ 2306 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2317} 2323}
2318 2324
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2359
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2356 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3049 3058
3050 again: 3059 again:
3051 /* 3060 /*
3052 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3053 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3054 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3055 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3056 */ 3065 */
3057 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3058 return NULL; 3067 return NULL;
3059 3068
3060 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3130 return NULL; 3139 return NULL;
3131 3140
3132 /* 3141 /*
3133 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3134 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3135 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3136 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3137 * But we limit them by not adding timestamps if they begin
3138 * at the start of a page.
3139 */ 3146 */
3140 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3141 return NULL; 3148 return NULL;
3142 3149
3143 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3835 if (len > (commit - read)) 3842 if (len > (commit - read))
3836 len = (commit - read); 3843 len = (commit - read);
3837 3844
3838 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3839 3847
3840 if (len < size) 3848 if (len < size)
3841 goto out_unlock; 3849 goto out_unlock;
@@ -3845,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3845 3853
3846 /* Need to copy one event at a time */ 3854 /* Need to copy one event at a time */
3847 do { 3855 do {
3856 /* We need the size of one event, because
3857 * rb_advance_reader only advances by one event,
3858 * whereas rb_event_ts_length may include the size of
3859 * one or two events.
3860 * We have already ensured there's enough space if this
3861 * is a time extend. */
3862 size = rb_event_length(event);
3848 memcpy(bpage->data + pos, rpage->data + rpos, size); 3863 memcpy(bpage->data + pos, rpage->data + rpos, size);
3849 3864
3850 len -= size; 3865 len -= size;
@@ -3857,8 +3872,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3857 break; 3872 break;
3858 3873
3859 event = rb_reader_event(cpu_buffer); 3874 event = rb_reader_event(cpu_buffer);
3860 size = rb_event_length(event); 3875 /* Always keep the time extend and data together */
3861 } while (len > size); 3876 size = rb_event_ts_length(event);
3877 } while (len >= size);
3862 3878
3863 /* update bpage */ 3879 /* update bpage */
3864 local_set(&bpage->commit, pos); 3880 local_set(&bpage->commit, pos);
@@ -3974,6 +3990,7 @@ static const struct file_operations rb_simple_fops = {
3974 .open = tracing_open_generic, 3990 .open = tracing_open_generic,
3975 .read = rb_simple_read, 3991 .read = rb_simple_read,
3976 .write = rb_simple_write, 3992 .write = rb_simple_write,
3993 .llseek = default_llseek,
3977}; 3994};
3978 3995
3979 3996
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..f8cf959bad45 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -1284,6 +1283,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1283 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1284}
1286 1285
1286static DEFINE_PER_CPU(int, user_stack_count);
1287
1287void 1288void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1289ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1290{
@@ -1302,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1303 if (unlikely(in_nmi()))
1303 return; 1304 return;
1304 1305
1306 /*
1307 * prevent recursion, since the user stack tracing may
1308 * trigger other kernel events.
1309 */
1310 preempt_disable();
1311 if (__this_cpu_read(user_stack_count))
1312 goto out;
1313
1314 __this_cpu_inc(user_stack_count);
1315
1316
1317
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1319 sizeof(*entry), flags, pc);
1307 if (!event) 1320 if (!event)
@@ -1319,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1332 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1333 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1334 ring_buffer_unlock_commit(buffer, event);
1335
1336 __this_cpu_dec(user_stack_count);
1337
1338 out:
1339 preempt_enable();
1322} 1340}
1323 1341
1324#ifdef UNUSED 1342#ifdef UNUSED
@@ -2320,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
2320 return count; 2338 return count;
2321} 2339}
2322 2340
2341static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
2342{
2343 if (file->f_mode & FMODE_READ)
2344 return seq_lseek(file, offset, origin);
2345 else
2346 return 0;
2347}
2348
2323static const struct file_operations tracing_fops = { 2349static const struct file_operations tracing_fops = {
2324 .open = tracing_open, 2350 .open = tracing_open,
2325 .read = seq_read, 2351 .read = seq_read,
2326 .write = tracing_write_stub, 2352 .write = tracing_write_stub,
2327 .llseek = seq_lseek, 2353 .llseek = tracing_seek,
2328 .release = tracing_release, 2354 .release = tracing_release,
2329}; 2355};
2330 2356
@@ -3996,13 +4022,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3996{ 4022{
3997 struct dentry *d_percpu = tracing_dentry_percpu(); 4023 struct dentry *d_percpu = tracing_dentry_percpu();
3998 struct dentry *d_cpu; 4024 struct dentry *d_cpu;
3999 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 4025 char cpu_dir[30]; /* 30 characters should be more than enough */
4000 char cpu_dir[7];
4001
4002 if (cpu > 999 || cpu < 0)
4003 return;
4004 4026
4005 sprintf(cpu_dir, "cpu%ld", cpu); 4027 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4006 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4028 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4007 if (!d_cpu) { 4029 if (!d_cpu) {
4008 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4030 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 398c0e8b332c..0725eeab1937 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -932,6 +932,7 @@ static const struct file_operations ftrace_enable_fops = {
932 .open = tracing_open_generic, 932 .open = tracing_open_generic,
933 .read = event_enable_read, 933 .read = event_enable_read,
934 .write = event_enable_write, 934 .write = event_enable_write,
935 .llseek = default_llseek,
935}; 936};
936 937
937static const struct file_operations ftrace_event_format_fops = { 938static const struct file_operations ftrace_event_format_fops = {
@@ -944,29 +945,34 @@ static const struct file_operations ftrace_event_format_fops = {
944static const struct file_operations ftrace_event_id_fops = { 945static const struct file_operations ftrace_event_id_fops = {
945 .open = tracing_open_generic, 946 .open = tracing_open_generic,
946 .read = event_id_read, 947 .read = event_id_read,
948 .llseek = default_llseek,
947}; 949};
948 950
949static const struct file_operations ftrace_event_filter_fops = { 951static const struct file_operations ftrace_event_filter_fops = {
950 .open = tracing_open_generic, 952 .open = tracing_open_generic,
951 .read = event_filter_read, 953 .read = event_filter_read,
952 .write = event_filter_write, 954 .write = event_filter_write,
955 .llseek = default_llseek,
953}; 956};
954 957
955static const struct file_operations ftrace_subsystem_filter_fops = { 958static const struct file_operations ftrace_subsystem_filter_fops = {
956 .open = tracing_open_generic, 959 .open = tracing_open_generic,
957 .read = subsystem_filter_read, 960 .read = subsystem_filter_read,
958 .write = subsystem_filter_write, 961 .write = subsystem_filter_write,
962 .llseek = default_llseek,
959}; 963};
960 964
961static const struct file_operations ftrace_system_enable_fops = { 965static const struct file_operations ftrace_system_enable_fops = {
962 .open = tracing_open_generic, 966 .open = tracing_open_generic,
963 .read = system_enable_read, 967 .read = system_enable_read,
964 .write = system_enable_write, 968 .write = system_enable_write,
969 .llseek = default_llseek,
965}; 970};
966 971
967static const struct file_operations ftrace_show_header_fops = { 972static const struct file_operations ftrace_show_header_fops = {
968 .open = tracing_open_generic, 973 .open = tracing_open_generic,
969 .read = show_header, 974 .read = show_header,
975 .llseek = default_llseek,
970}; 976};
971 977
972static struct dentry *event_trace_events_dir(void) 978static struct dentry *event_trace_events_dir(void)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 7b8ecd751d93..3c5c5dfea0b3 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -13,7 +13,6 @@
13#include <linux/kdb.h> 13#include <linux/kdb.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15 15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h" 16#include "trace.h"
18#include "trace_output.h" 17#include "trace_output.h"
19 18
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 544301d29dee..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h> 33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
36 35
37#include "trace.h" 36#include "trace.h"
@@ -648,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp)
648 } 647 }
649 ret = register_probe_event(tp); 648 ret = register_probe_event(tp);
650 if (ret) { 649 if (ret) {
651 pr_warning("Faild to register probe event(%d)\n", ret); 650 pr_warning("Failed to register probe event(%d)\n", ret);
652 goto end; 651 goto end;
653 } 652 }
654 653
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a6b7e0e0f3eb..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
195 .open = tracing_open_generic, 195 .open = tracing_open_generic,
196 .read = stack_max_size_read, 196 .read = stack_max_size_read,
197 .write = stack_max_size_write, 197 .write = stack_max_size_write,
198 .llseek = default_llseek,
198}; 199};
199 200
200static void * 201static void *
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 91 * upon function exit.
92 */ 92 */
93static void free_user(struct user_struct *up, unsigned long flags) 93static void free_user(struct user_struct *up, unsigned long flags)
94 __releases(&uidhash_lock)
94{ 95{
95 uid_hash_remove(up); 96 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index bafba687a6d8..6e3c41a4024c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __initdata no_watchdog; 46static int no_watchdog;
47 47
48 48
49/* boot commands */ 49/* boot commands */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f77afd939229..e785b0f2aea5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -42,9 +42,6 @@
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44 44
45#define CREATE_TRACE_POINTS
46#include <trace/events/workqueue.h>
47
48#include "workqueue_sched.h" 45#include "workqueue_sched.h"
49 46
50enum { 47enum {
@@ -257,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq);
257EXPORT_SYMBOL_GPL(system_nrt_wq); 254EXPORT_SYMBOL_GPL(system_nrt_wq);
258EXPORT_SYMBOL_GPL(system_unbound_wq); 255EXPORT_SYMBOL_GPL(system_unbound_wq);
259 256
257#define CREATE_TRACE_POINTS
258#include <trace/events/workqueue.h>
259
260#define for_each_busy_worker(worker, i, pos, gcwq) \ 260#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -310,21 +310,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
310 (cpu) < WORK_CPU_NONE; \ 310 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
312 312
313#ifdef CONFIG_LOCKDEP
314/**
315 * in_workqueue_context() - in context of specified workqueue?
316 * @wq: the workqueue of interest
317 *
318 * Checks lockdep state to see if the current task is executing from
319 * within a workqueue item. This function exists only if lockdep is
320 * enabled.
321 */
322int in_workqueue_context(struct workqueue_struct *wq)
323{
324 return lock_is_held(&wq->lockdep_map);
325}
326#endif
327
328#ifdef CONFIG_DEBUG_OBJECTS_WORK 313#ifdef CONFIG_DEBUG_OBJECTS_WORK
329 314
330static struct debug_obj_descr work_debug_descr; 315static struct debug_obj_descr work_debug_descr;
@@ -604,7 +589,9 @@ static bool keep_working(struct global_cwq *gcwq)
604{ 589{
605 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 590 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
606 591
607 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; 592 return !list_empty(&gcwq->worklist) &&
593 (atomic_read(nr_running) <= 1 ||
594 gcwq->flags & GCWQ_HIGHPRI_PENDING);
608} 595}
609 596
610/* Do we need a new worker? Called from manager. */ 597/* Do we need a new worker? Called from manager. */
@@ -674,7 +661,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
674{ 661{
675 struct worker *worker = kthread_data(task); 662 struct worker *worker = kthread_data(task);
676 663
677 if (likely(!(worker->flags & WORKER_NOT_RUNNING))) 664 if (!(worker->flags & WORKER_NOT_RUNNING))
678 atomic_inc(get_gcwq_nr_running(cpu)); 665 atomic_inc(get_gcwq_nr_running(cpu));
679} 666}
680 667
@@ -700,7 +687,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
700 struct global_cwq *gcwq = get_gcwq(cpu); 687 struct global_cwq *gcwq = get_gcwq(cpu);
701 atomic_t *nr_running = get_gcwq_nr_running(cpu); 688 atomic_t *nr_running = get_gcwq_nr_running(cpu);
702 689
703 if (unlikely(worker->flags & WORKER_NOT_RUNNING)) 690 if (worker->flags & WORKER_NOT_RUNNING)
704 return NULL; 691 return NULL;
705 692
706 /* this can only happen on the local cpu */ 693 /* this can only happen on the local cpu */
@@ -997,6 +984,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
997 984
998 /* gcwq determined, get cwq and queue */ 985 /* gcwq determined, get cwq and queue */
999 cwq = get_cwq(gcwq->cpu, wq); 986 cwq = get_cwq(gcwq->cpu, wq);
987 trace_workqueue_queue_work(cpu, cwq, work);
1000 988
1001 BUG_ON(!list_empty(&work->entry)); 989 BUG_ON(!list_empty(&work->entry));
1002 990
@@ -1004,6 +992,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1004 work_flags = work_color_to_flags(cwq->work_color); 992 work_flags = work_color_to_flags(cwq->work_color);
1005 993
1006 if (likely(cwq->nr_active < cwq->max_active)) { 994 if (likely(cwq->nr_active < cwq->max_active)) {
995 trace_workqueue_activate_work(work);
1007 cwq->nr_active++; 996 cwq->nr_active++;
1008 worklist = gcwq_determine_ins_pos(gcwq, cwq); 997 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1009 } else { 998 } else {
@@ -1679,6 +1668,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1679 struct work_struct, entry); 1668 struct work_struct, entry);
1680 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); 1669 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1681 1670
1671 trace_workqueue_activate_work(work);
1682 move_linked_works(work, pos, NULL); 1672 move_linked_works(work, pos, NULL);
1683 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1673 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1684 cwq->nr_active++; 1674 cwq->nr_active++;
@@ -2074,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2074 * checks and call back into the fixup functions where we 2064 * checks and call back into the fixup functions where we
2075 * might deadlock. 2065 * might deadlock.
2076 */ 2066 */
2077 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2067 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2078 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2079 init_completion(&barr->done); 2069 init_completion(&barr->done);
2080 2070
@@ -2326,27 +2316,17 @@ out_unlock:
2326} 2316}
2327EXPORT_SYMBOL_GPL(flush_workqueue); 2317EXPORT_SYMBOL_GPL(flush_workqueue);
2328 2318
2329/** 2319static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2330 * flush_work - block until a work_struct's callback has terminated 2320 bool wait_executing)
2331 * @work: the work which is to be flushed
2332 *
2333 * Returns false if @work has already terminated.
2334 *
2335 * It is expected that, prior to calling flush_work(), the caller has
2336 * arranged for the work to not be requeued, otherwise it doesn't make
2337 * sense to use this function.
2338 */
2339int flush_work(struct work_struct *work)
2340{ 2321{
2341 struct worker *worker = NULL; 2322 struct worker *worker = NULL;
2342 struct global_cwq *gcwq; 2323 struct global_cwq *gcwq;
2343 struct cpu_workqueue_struct *cwq; 2324 struct cpu_workqueue_struct *cwq;
2344 struct wq_barrier barr;
2345 2325
2346 might_sleep(); 2326 might_sleep();
2347 gcwq = get_work_gcwq(work); 2327 gcwq = get_work_gcwq(work);
2348 if (!gcwq) 2328 if (!gcwq)
2349 return 0; 2329 return false;
2350 2330
2351 spin_lock_irq(&gcwq->lock); 2331 spin_lock_irq(&gcwq->lock);
2352 if (!list_empty(&work->entry)) { 2332 if (!list_empty(&work->entry)) {
@@ -2359,28 +2339,127 @@ int flush_work(struct work_struct *work)
2359 cwq = get_work_cwq(work); 2339 cwq = get_work_cwq(work);
2360 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2340 if (unlikely(!cwq || gcwq != cwq->gcwq))
2361 goto already_gone; 2341 goto already_gone;
2362 } else { 2342 } else if (wait_executing) {
2363 worker = find_worker_executing_work(gcwq, work); 2343 worker = find_worker_executing_work(gcwq, work);
2364 if (!worker) 2344 if (!worker)
2365 goto already_gone; 2345 goto already_gone;
2366 cwq = worker->current_cwq; 2346 cwq = worker->current_cwq;
2367 } 2347 } else
2348 goto already_gone;
2368 2349
2369 insert_wq_barrier(cwq, &barr, work, worker); 2350 insert_wq_barrier(cwq, barr, work, worker);
2370 spin_unlock_irq(&gcwq->lock); 2351 spin_unlock_irq(&gcwq->lock);
2371 2352
2372 lock_map_acquire(&cwq->wq->lockdep_map); 2353 lock_map_acquire(&cwq->wq->lockdep_map);
2373 lock_map_release(&cwq->wq->lockdep_map); 2354 lock_map_release(&cwq->wq->lockdep_map);
2374 2355 return true;
2375 wait_for_completion(&barr.done);
2376 destroy_work_on_stack(&barr.work);
2377 return 1;
2378already_gone: 2356already_gone:
2379 spin_unlock_irq(&gcwq->lock); 2357 spin_unlock_irq(&gcwq->lock);
2380 return 0; 2358 return false;
2359}
2360
2361/**
2362 * flush_work - wait for a work to finish executing the last queueing instance
2363 * @work: the work to flush
2364 *
2365 * Wait until @work has finished execution. This function considers
2366 * only the last queueing instance of @work. If @work has been
2367 * enqueued across different CPUs on a non-reentrant workqueue or on
2368 * multiple workqueues, @work might still be executing on return on
2369 * some of the CPUs from earlier queueing.
2370 *
2371 * If @work was queued only on a non-reentrant, ordered or unbound
2372 * workqueue, @work is guaranteed to be idle on return if it hasn't
2373 * been requeued since flush started.
2374 *
2375 * RETURNS:
2376 * %true if flush_work() waited for the work to finish execution,
2377 * %false if it was already idle.
2378 */
2379bool flush_work(struct work_struct *work)
2380{
2381 struct wq_barrier barr;
2382
2383 if (start_flush_work(work, &barr, true)) {
2384 wait_for_completion(&barr.done);
2385 destroy_work_on_stack(&barr.work);
2386 return true;
2387 } else
2388 return false;
2381} 2389}
2382EXPORT_SYMBOL_GPL(flush_work); 2390EXPORT_SYMBOL_GPL(flush_work);
2383 2391
2392static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2393{
2394 struct wq_barrier barr;
2395 struct worker *worker;
2396
2397 spin_lock_irq(&gcwq->lock);
2398
2399 worker = find_worker_executing_work(gcwq, work);
2400 if (unlikely(worker))
2401 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2402
2403 spin_unlock_irq(&gcwq->lock);
2404
2405 if (unlikely(worker)) {
2406 wait_for_completion(&barr.done);
2407 destroy_work_on_stack(&barr.work);
2408 return true;
2409 } else
2410 return false;
2411}
2412
2413static bool wait_on_work(struct work_struct *work)
2414{
2415 bool ret = false;
2416 int cpu;
2417
2418 might_sleep();
2419
2420 lock_map_acquire(&work->lockdep_map);
2421 lock_map_release(&work->lockdep_map);
2422
2423 for_each_gcwq_cpu(cpu)
2424 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2425 return ret;
2426}
2427
2428/**
2429 * flush_work_sync - wait until a work has finished execution
2430 * @work: the work to flush
2431 *
2432 * Wait until @work has finished execution. On return, it's
2433 * guaranteed that all queueing instances of @work which happened
2434 * before this function is called are finished. In other words, if
2435 * @work hasn't been requeued since this function was called, @work is
2436 * guaranteed to be idle on return.
2437 *
2438 * RETURNS:
2439 * %true if flush_work_sync() waited for the work to finish execution,
2440 * %false if it was already idle.
2441 */
2442bool flush_work_sync(struct work_struct *work)
2443{
2444 struct wq_barrier barr;
2445 bool pending, waited;
2446
2447 /* we'll wait for executions separately, queue barr only if pending */
2448 pending = start_flush_work(work, &barr, false);
2449
2450 /* wait for executions to finish */
2451 waited = wait_on_work(work);
2452
2453 /* wait for the pending one */
2454 if (pending) {
2455 wait_for_completion(&barr.done);
2456 destroy_work_on_stack(&barr.work);
2457 }
2458
2459 return pending || waited;
2460}
2461EXPORT_SYMBOL_GPL(flush_work_sync);
2462
2384/* 2463/*
2385 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 2464 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2386 * so this work can't be re-armed in any way. 2465 * so this work can't be re-armed in any way.
@@ -2423,39 +2502,7 @@ static int try_to_grab_pending(struct work_struct *work)
2423 return ret; 2502 return ret;
2424} 2503}
2425 2504
2426static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) 2505static bool __cancel_work_timer(struct work_struct *work,
2427{
2428 struct wq_barrier barr;
2429 struct worker *worker;
2430
2431 spin_lock_irq(&gcwq->lock);
2432
2433 worker = find_worker_executing_work(gcwq, work);
2434 if (unlikely(worker))
2435 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2436
2437 spin_unlock_irq(&gcwq->lock);
2438
2439 if (unlikely(worker)) {
2440 wait_for_completion(&barr.done);
2441 destroy_work_on_stack(&barr.work);
2442 }
2443}
2444
2445static void wait_on_work(struct work_struct *work)
2446{
2447 int cpu;
2448
2449 might_sleep();
2450
2451 lock_map_acquire(&work->lockdep_map);
2452 lock_map_release(&work->lockdep_map);
2453
2454 for_each_gcwq_cpu(cpu)
2455 wait_on_cpu_work(get_gcwq(cpu), work);
2456}
2457
2458static int __cancel_work_timer(struct work_struct *work,
2459 struct timer_list* timer) 2506 struct timer_list* timer)
2460{ 2507{
2461 int ret; 2508 int ret;
@@ -2472,42 +2519,81 @@ static int __cancel_work_timer(struct work_struct *work,
2472} 2519}
2473 2520
2474/** 2521/**
2475 * cancel_work_sync - block until a work_struct's callback has terminated 2522 * cancel_work_sync - cancel a work and wait for it to finish
2476 * @work: the work which is to be flushed 2523 * @work: the work to cancel
2477 *
2478 * Returns true if @work was pending.
2479 * 2524 *
2480 * cancel_work_sync() will cancel the work if it is queued. If the work's 2525 * Cancel @work and wait for its execution to finish. This function
2481 * callback appears to be running, cancel_work_sync() will block until it 2526 * can be used even if the work re-queues itself or migrates to
2482 * has completed. 2527 * another workqueue. On return from this function, @work is
2528 * guaranteed to be not pending or executing on any CPU.
2483 * 2529 *
2484 * It is possible to use this function if the work re-queues itself. It can 2530 * cancel_work_sync(&delayed_work->work) must not be used for
2485 * cancel the work even if it migrates to another workqueue, however in that 2531 * delayed_work's. Use cancel_delayed_work_sync() instead.
2486 * case it only guarantees that work->func() has completed on the last queued
2487 * workqueue.
2488 * 2532 *
2489 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not 2533 * The caller must ensure that the workqueue on which @work was last
2490 * pending, otherwise it goes into a busy-wait loop until the timer expires.
2491 *
2492 * The caller must ensure that workqueue_struct on which this work was last
2493 * queued can't be destroyed before this function returns. 2534 * queued can't be destroyed before this function returns.
2535 *
2536 * RETURNS:
2537 * %true if @work was pending, %false otherwise.
2494 */ 2538 */
2495int cancel_work_sync(struct work_struct *work) 2539bool cancel_work_sync(struct work_struct *work)
2496{ 2540{
2497 return __cancel_work_timer(work, NULL); 2541 return __cancel_work_timer(work, NULL);
2498} 2542}
2499EXPORT_SYMBOL_GPL(cancel_work_sync); 2543EXPORT_SYMBOL_GPL(cancel_work_sync);
2500 2544
2501/** 2545/**
2502 * cancel_delayed_work_sync - reliably kill off a delayed work. 2546 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2503 * @dwork: the delayed work struct 2547 * @dwork: the delayed work to flush
2548 *
2549 * Delayed timer is cancelled and the pending work is queued for
2550 * immediate execution. Like flush_work(), this function only
2551 * considers the last queueing instance of @dwork.
2552 *
2553 * RETURNS:
2554 * %true if flush_work() waited for the work to finish execution,
2555 * %false if it was already idle.
2556 */
2557bool flush_delayed_work(struct delayed_work *dwork)
2558{
2559 if (del_timer_sync(&dwork->timer))
2560 __queue_work(raw_smp_processor_id(),
2561 get_work_cwq(&dwork->work)->wq, &dwork->work);
2562 return flush_work(&dwork->work);
2563}
2564EXPORT_SYMBOL(flush_delayed_work);
2565
2566/**
2567 * flush_delayed_work_sync - wait for a dwork to finish
2568 * @dwork: the delayed work to flush
2504 * 2569 *
2505 * Returns true if @dwork was pending. 2570 * Delayed timer is cancelled and the pending work is queued for
2571 * execution immediately. Other than timer handling, its behavior
2572 * is identical to flush_work_sync().
2506 * 2573 *
2507 * It is possible to use this function if @dwork rearms itself via queue_work() 2574 * RETURNS:
2508 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 2575 * %true if flush_work_sync() waited for the work to finish execution,
2576 * %false if it was already idle.
2509 */ 2577 */
2510int cancel_delayed_work_sync(struct delayed_work *dwork) 2578bool flush_delayed_work_sync(struct delayed_work *dwork)
2579{
2580 if (del_timer_sync(&dwork->timer))
2581 __queue_work(raw_smp_processor_id(),
2582 get_work_cwq(&dwork->work)->wq, &dwork->work);
2583 return flush_work_sync(&dwork->work);
2584}
2585EXPORT_SYMBOL(flush_delayed_work_sync);
2586
2587/**
2588 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2589 * @dwork: the delayed work cancel
2590 *
2591 * This is cancel_work_sync() for delayed works.
2592 *
2593 * RETURNS:
2594 * %true if @dwork was pending, %false otherwise.
2595 */
2596bool cancel_delayed_work_sync(struct delayed_work *dwork)
2511{ 2597{
2512 return __cancel_work_timer(&dwork->work, &dwork->timer); 2598 return __cancel_work_timer(&dwork->work, &dwork->timer);
2513} 2599}
@@ -2559,23 +2645,6 @@ int schedule_delayed_work(struct delayed_work *dwork,
2559EXPORT_SYMBOL(schedule_delayed_work); 2645EXPORT_SYMBOL(schedule_delayed_work);
2560 2646
2561/** 2647/**
2562 * flush_delayed_work - block until a dwork_struct's callback has terminated
2563 * @dwork: the delayed work which is to be flushed
2564 *
2565 * Any timeout is cancelled, and any pending work is run immediately.
2566 */
2567void flush_delayed_work(struct delayed_work *dwork)
2568{
2569 if (del_timer_sync(&dwork->timer)) {
2570 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
2571 &dwork->work);
2572 put_cpu();
2573 }
2574 flush_work(&dwork->work);
2575}
2576EXPORT_SYMBOL(flush_delayed_work);
2577
2578/**
2579 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2648 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2580 * @cpu: cpu to use 2649 * @cpu: cpu to use
2581 * @dwork: job to be done 2650 * @dwork: job to be done
@@ -2592,13 +2661,15 @@ int schedule_delayed_work_on(int cpu,
2592EXPORT_SYMBOL(schedule_delayed_work_on); 2661EXPORT_SYMBOL(schedule_delayed_work_on);
2593 2662
2594/** 2663/**
2595 * schedule_on_each_cpu - call a function on each online CPU from keventd 2664 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2596 * @func: the function to call 2665 * @func: the function to call
2597 * 2666 *
2598 * Returns zero on success. 2667 * schedule_on_each_cpu() executes @func on each online CPU using the
2599 * Returns -ve errno on failure. 2668 * system workqueue and blocks until all CPUs have completed.
2600 *
2601 * schedule_on_each_cpu() is very slow. 2669 * schedule_on_each_cpu() is very slow.
2670 *
2671 * RETURNS:
2672 * 0 on success, -errno on failure.
2602 */ 2673 */
2603int schedule_on_each_cpu(work_func_t func) 2674int schedule_on_each_cpu(work_func_t func)
2604{ 2675{
@@ -2720,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2720 } 2791 }
2721 } 2792 }
2722 2793
2723 /* just in case, make sure it's actually aligned */ 2794 /* just in case, make sure it's actually aligned
2795 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2796 */
2724 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2797 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2725 return wq->cpu_wq.v ? 0 : -ENOMEM; 2798 return wq->cpu_wq.v ? 0 : -ENOMEM;
2726} 2799}
@@ -2764,6 +2837,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2764 unsigned int cpu; 2837 unsigned int cpu;
2765 2838
2766 /* 2839 /*
2840 * Workqueues which may be used during memory reclaim should
2841 * have a rescuer to guarantee forward progress.
2842 */
2843 if (flags & WQ_MEM_RECLAIM)
2844 flags |= WQ_RESCUER;
2845
2846 /*
2767 * Unbound workqueues aren't concurrency managed and should be 2847 * Unbound workqueues aren't concurrency managed and should be
2768 * dispatched to workers immediately. 2848 * dispatched to workers immediately.
2769 */ 2849 */
@@ -3612,7 +3692,8 @@ static int __init init_workqueues(void)
3612 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3692 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3613 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3693 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3614 WQ_UNBOUND_MAX_ACTIVE); 3694 WQ_UNBOUND_MAX_ACTIVE);
3615 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); 3695 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3696 !system_unbound_wq);
3616 return 0; 3697 return 0;
3617} 3698}
3618early_initcall(init_workqueues); 3699early_initcall(init_workqueues);