about summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 4
-rw-r--r-- kernel/audit.c 32
-rw-r--r-- kernel/audit_tree.c 91
-rw-r--r-- kernel/auditfilter.c 14
-rw-r--r-- kernel/auditsc.c 24
-rw-r--r-- kernel/cgroup.c 31
-rw-r--r-- kernel/cgroup_freezer.c 19
-rw-r--r-- kernel/cpu.c 2
-rw-r--r-- kernel/cpuset.c 33
-rw-r--r-- kernel/exit.c 14
-rw-r--r-- kernel/fork.c 27
-rw-r--r-- kernel/hrtimer.c 325
-rw-r--r-- kernel/irq/internals.h 2
-rw-r--r-- kernel/irq/manage.c 68
-rw-r--r-- kernel/irq/migration.c 11
-rw-r--r-- kernel/irq/proc.c 2
-rw-r--r-- kernel/kallsyms.c 17
-rw-r--r-- kernel/kprobes.c 23
-rw-r--r-- kernel/latencytop.c 2
-rw-r--r-- kernel/lockdep.c 4
-rw-r--r-- kernel/panic.c 1
-rw-r--r-- kernel/posix-cpu-timers.c 9
-rw-r--r-- kernel/posix-timers.c 46
-rw-r--r-- kernel/power/main.c 2
-rw-r--r-- kernel/power/swap.c 2
-rw-r--r-- kernel/profile.c 6
-rw-r--r-- kernel/ptrace.c 4
-rw-r--r-- kernel/relay.c 16
-rw-r--r-- kernel/sched.c 37
-rw-r--r-- kernel/sched_clock.c 6
-rw-r--r-- kernel/sched_debug.c 46
-rw-r--r-- kernel/sched_fair.c 17
-rw-r--r-- kernel/sched_stats.h 15
-rw-r--r-- kernel/softirq.c 7
-rw-r--r-- kernel/softlockup.c 2
-rw-r--r-- kernel/stop_machine.c 5
-rw-r--r-- kernel/sys_ni.c 2
-rw-r--r-- kernel/sysctl.c 10
-rw-r--r-- kernel/time/ntp.c 4
-rw-r--r-- kernel/time/tick-sched.c 48
-rw-r--r-- kernel/time/timekeeping.c 22
-rw-r--r-- kernel/trace/ftrace.c 147
-rw-r--r-- kernel/trace/ring_buffer.c 120
-rw-r--r-- kernel/trace/trace.c 20
-rw-r--r-- kernel/trace/trace_mmiotrace.c 16
-rw-r--r-- kernel/trace/trace_stack.c 24
-rw-r--r-- kernel/trace/trace_sysprof.c 1
47 files changed, 762 insertions, 618 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d84..19fad003b19d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14CFLAGS_REMOVE_sched.o = -mno-spe
15
16ifdef CONFIG_FUNCTION_TRACER 14ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 15# Do not trace debug files and internal ftrace files
18CFLAGS_REMOVE_lockdep.o = -pg 16CFLAGS_REMOVE_lockdep.o = -pg
@@ -21,7 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 19CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 20CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 21CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_sched.o = -mno-spe -pg 22CFLAGS_REMOVE_sched.o = -pg
25endif 23endif
26 24
27obj-$(CONFIG_FREEZER) += freezer.o 25obj-$(CONFIG_FREEZER) += freezer.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 4414e93d8750..ce6d8ea3131e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,8 +61,11 @@
61 61
62#include "audit.h" 62#include "audit.h"
63 63
64/* No auditing will take place until audit_initialized != 0. 64/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
65 * (Initialization happens after skb_init is called.) */ 65 * (Initialization happens after skb_init is called.) */
66#define AUDIT_DISABLED -1
67#define AUDIT_UNINITIALIZED 0
68#define AUDIT_INITIALIZED 1
66static int audit_initialized; 69static int audit_initialized;
67 70
68#define AUDIT_OFF 0 71#define AUDIT_OFF 0
@@ -965,6 +968,9 @@ static int __init audit_init(void)
965{ 968{
966 int i; 969 int i;
967 970
971 if (audit_initialized == AUDIT_DISABLED)
972 return 0;
973
968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 974 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
969 audit_default ? "enabled" : "disabled"); 975 audit_default ? "enabled" : "disabled");
970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, 976 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
@@ -976,7 +982,7 @@ static int __init audit_init(void)
976 982
977 skb_queue_head_init(&audit_skb_queue); 983 skb_queue_head_init(&audit_skb_queue);
978 skb_queue_head_init(&audit_skb_hold_queue); 984 skb_queue_head_init(&audit_skb_hold_queue);
979 audit_initialized = 1; 985 audit_initialized = AUDIT_INITIALIZED;
980 audit_enabled = audit_default; 986 audit_enabled = audit_default;
981 audit_ever_enabled |= !!audit_default; 987 audit_ever_enabled |= !!audit_default;
982 988
@@ -999,13 +1005,21 @@ __initcall(audit_init);
999static int __init audit_enable(char *str) 1005static int __init audit_enable(char *str)
1000{ 1006{
1001 audit_default = !!simple_strtol(str, NULL, 0); 1007 audit_default = !!simple_strtol(str, NULL, 0);
1002 printk(KERN_INFO "audit: %s%s\n", 1008 if (!audit_default)
1003 audit_default ? "enabled" : "disabled", 1009 audit_initialized = AUDIT_DISABLED;
1004 audit_initialized ? "" : " (after initialization)"); 1010
1005 if (audit_initialized) { 1011 printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
1012
1013 if (audit_initialized == AUDIT_INITIALIZED) {
1006 audit_enabled = audit_default; 1014 audit_enabled = audit_default;
1007 audit_ever_enabled |= !!audit_default; 1015 audit_ever_enabled |= !!audit_default;
1016 } else if (audit_initialized == AUDIT_UNINITIALIZED) {
1017 printk(" (after initialization)");
1018 } else {
1019 printk(" (until reboot)");
1008 } 1020 }
1021 printk("\n");
1022
1009 return 1; 1023 return 1;
1010} 1024}
1011 1025
@@ -1107,9 +1121,7 @@ unsigned int audit_serial(void)
1107static inline void audit_get_stamp(struct audit_context *ctx, 1121static inline void audit_get_stamp(struct audit_context *ctx,
1108 struct timespec *t, unsigned int *serial) 1122 struct timespec *t, unsigned int *serial)
1109{ 1123{
1110 if (ctx) 1124 if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
1111 auditsc_get_stamp(ctx, t, serial);
1112 else {
1113 *t = CURRENT_TIME; 1125 *t = CURRENT_TIME;
1114 *serial = audit_serial(); 1126 *serial = audit_serial();
1115 } 1127 }
@@ -1146,7 +1158,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1146 int reserve; 1158 int reserve;
1147 unsigned long timeout_start = jiffies; 1159 unsigned long timeout_start = jiffies;
1148 1160
1149 if (!audit_initialized) 1161 if (audit_initialized != AUDIT_INITIALIZED)
1150 return NULL; 1162 return NULL;
1151 1163
1152 if (unlikely(audit_filter_type(type))) 1164 if (unlikely(audit_filter_type(type)))
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ba0e0d934f2..8b509441f49a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
24 struct list_head trees; /* with root here */ 24 struct list_head trees; /* with root here */
25 int dead; 25 int dead;
26 int count; 26 int count;
27 atomic_long_t refs;
27 struct rcu_head head; 28 struct rcu_head head;
28 struct node { 29 struct node {
29 struct list_head list; 30 struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
56 * tree is refcounted; one reference for "some rules on rules_list refer to 57 * tree is refcounted; one reference for "some rules on rules_list refer to
57 * it", one for each chunk with pointer to it. 58 * it", one for each chunk with pointer to it.
58 * 59 *
59 * chunk is refcounted by embedded inotify_watch. 60 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
61 * of watch contributes 1 to .refs).
60 * 62 *
61 * node.index allows to get from node.list to containing chunk. 63 * node.index allows to get from node.list to containing chunk.
62 * MSB of that sucker is stolen to mark taggings that we might have to 64 * MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
121 INIT_LIST_HEAD(&chunk->hash); 123 INIT_LIST_HEAD(&chunk->hash);
122 INIT_LIST_HEAD(&chunk->trees); 124 INIT_LIST_HEAD(&chunk->trees);
123 chunk->count = count; 125 chunk->count = count;
126 atomic_long_set(&chunk->refs, 1);
124 for (i = 0; i < count; i++) { 127 for (i = 0; i < count; i++) {
125 INIT_LIST_HEAD(&chunk->owners[i].list); 128 INIT_LIST_HEAD(&chunk->owners[i].list);
126 chunk->owners[i].index = i; 129 chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
129 return chunk; 132 return chunk;
130} 133}
131 134
132static void __free_chunk(struct rcu_head *rcu) 135static void free_chunk(struct audit_chunk *chunk)
133{ 136{
134 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
135 int i; 137 int i;
136 138
137 for (i = 0; i < chunk->count; i++) { 139 for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
141 kfree(chunk); 143 kfree(chunk);
142} 144}
143 145
144static inline void free_chunk(struct audit_chunk *chunk) 146void audit_put_chunk(struct audit_chunk *chunk)
145{ 147{
146 call_rcu(&chunk->head, __free_chunk); 148 if (atomic_long_dec_and_test(&chunk->refs))
149 free_chunk(chunk);
147} 150}
148 151
149void audit_put_chunk(struct audit_chunk *chunk) 152static void __put_chunk(struct rcu_head *rcu)
150{ 153{
151 put_inotify_watch(&chunk->watch); 154 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
155 audit_put_chunk(chunk);
152} 156}
153 157
154enum {HASH_SIZE = 128}; 158enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
176 180
177 list_for_each_entry_rcu(p, list, hash) { 181 list_for_each_entry_rcu(p, list, hash) {
178 if (p->watch.inode == inode) { 182 if (p->watch.inode == inode) {
179 get_inotify_watch(&p->watch); 183 atomic_long_inc(&p->refs);
180 return p; 184 return p;
181 } 185 }
182 } 186 }
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
194 198
195/* tagging and untagging inodes with trees */ 199/* tagging and untagging inodes with trees */
196 200
197static void untag_chunk(struct audit_chunk *chunk, struct node *p) 201static struct audit_chunk *find_chunk(struct node *p)
202{
203 int index = p->index & ~(1U<<31);
204 p -= index;
205 return container_of(p, struct audit_chunk, owners[0]);
206}
207
208static void untag_chunk(struct node *p)
198{ 209{
210 struct audit_chunk *chunk = find_chunk(p);
199 struct audit_chunk *new; 211 struct audit_chunk *new;
200 struct audit_tree *owner; 212 struct audit_tree *owner;
201 int size = chunk->count - 1; 213 int size = chunk->count - 1;
202 int i, j; 214 int i, j;
203 215
216 if (!pin_inotify_watch(&chunk->watch)) {
217 /*
218 * Filesystem is shutting down; all watches are getting
219 * evicted, just take it off the node list for this
220 * tree and let the eviction logics take care of the
221 * rest.
222 */
223 owner = p->owner;
224 if (owner->root == chunk) {
225 list_del_init(&owner->same_root);
226 owner->root = NULL;
227 }
228 list_del_init(&p->list);
229 p->owner = NULL;
230 put_tree(owner);
231 return;
232 }
233
234 spin_unlock(&hash_lock);
235
236 /*
237 * pin_inotify_watch() succeeded, so the watch won't go away
238 * from under us.
239 */
204 mutex_lock(&chunk->watch.inode->inotify_mutex); 240 mutex_lock(&chunk->watch.inode->inotify_mutex);
205 if (chunk->dead) { 241 if (chunk->dead) {
206 mutex_unlock(&chunk->watch.inode->inotify_mutex); 242 mutex_unlock(&chunk->watch.inode->inotify_mutex);
207 return; 243 goto out;
208 } 244 }
209 245
210 owner = p->owner; 246 owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
221 inotify_evict_watch(&chunk->watch); 257 inotify_evict_watch(&chunk->watch);
222 mutex_unlock(&chunk->watch.inode->inotify_mutex); 258 mutex_unlock(&chunk->watch.inode->inotify_mutex);
223 put_inotify_watch(&chunk->watch); 259 put_inotify_watch(&chunk->watch);
224 return; 260 goto out;
225 } 261 }
226 262
227 new = alloc_chunk(size); 263 new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
263 inotify_evict_watch(&chunk->watch); 299 inotify_evict_watch(&chunk->watch);
264 mutex_unlock(&chunk->watch.inode->inotify_mutex); 300 mutex_unlock(&chunk->watch.inode->inotify_mutex);
265 put_inotify_watch(&chunk->watch); 301 put_inotify_watch(&chunk->watch);
266 return; 302 goto out;
267 303
268Fallback: 304Fallback:
269 // do the best we can 305 // do the best we can
@@ -277,6 +313,9 @@ Fallback:
277 put_tree(owner); 313 put_tree(owner);
278 spin_unlock(&hash_lock); 314 spin_unlock(&hash_lock);
279 mutex_unlock(&chunk->watch.inode->inotify_mutex); 315 mutex_unlock(&chunk->watch.inode->inotify_mutex);
316out:
317 unpin_inotify_watch(&chunk->watch);
318 spin_lock(&hash_lock);
280} 319}
281 320
282static int create_chunk(struct inode *inode, struct audit_tree *tree) 321static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
387 return 0; 426 return 0;
388} 427}
389 428
390static struct audit_chunk *find_chunk(struct node *p)
391{
392 int index = p->index & ~(1U<<31);
393 p -= index;
394 return container_of(p, struct audit_chunk, owners[0]);
395}
396
397static void kill_rules(struct audit_tree *tree) 429static void kill_rules(struct audit_tree *tree)
398{ 430{
399 struct audit_krule *rule, *next; 431 struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
431 spin_lock(&hash_lock); 463 spin_lock(&hash_lock);
432 while (!list_empty(&victim->chunks)) { 464 while (!list_empty(&victim->chunks)) {
433 struct node *p; 465 struct node *p;
434 struct audit_chunk *chunk;
435 466
436 p = list_entry(victim->chunks.next, struct node, list); 467 p = list_entry(victim->chunks.next, struct node, list);
437 chunk = find_chunk(p);
438 get_inotify_watch(&chunk->watch);
439 spin_unlock(&hash_lock);
440
441 untag_chunk(chunk, p);
442 468
443 put_inotify_watch(&chunk->watch); 469 untag_chunk(p);
444 spin_lock(&hash_lock);
445 } 470 }
446 spin_unlock(&hash_lock); 471 spin_unlock(&hash_lock);
447 put_tree(victim); 472 put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
469 494
470 while (!list_empty(&tree->chunks)) { 495 while (!list_empty(&tree->chunks)) {
471 struct node *node; 496 struct node *node;
472 struct audit_chunk *chunk;
473 497
474 node = list_entry(tree->chunks.next, struct node, list); 498 node = list_entry(tree->chunks.next, struct node, list);
475 499
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
477 if (!(node->index & (1U<<31))) 501 if (!(node->index & (1U<<31)))
478 break; 502 break;
479 503
480 chunk = find_chunk(node); 504 untag_chunk(node);
481 get_inotify_watch(&chunk->watch);
482 spin_unlock(&hash_lock);
483
484 untag_chunk(chunk, node);
485
486 put_inotify_watch(&chunk->watch);
487 spin_lock(&hash_lock);
488 } 505 }
489 if (!tree->root && !tree->goner) { 506 if (!tree->root && !tree->goner) {
490 tree->goner = 1; 507 tree->goner = 1;
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
878static void destroy_watch(struct inotify_watch *watch) 895static void destroy_watch(struct inotify_watch *watch)
879{ 896{
880 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 897 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
881 free_chunk(chunk); 898 call_rcu(&chunk->head, __put_chunk);
882} 899}
883 900
884static const struct inotify_operations rtree_inotify_ops = { 901static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0ef..9fd85a4640a0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
1094 list_for_each_entry_safe(p, n, in_list, ilist) { 1094 list_for_each_entry_safe(p, n, in_list, ilist) {
1095 list_del(&p->ilist); 1095 list_del(&p->ilist);
1096 inotify_rm_watch(audit_ih, &p->wdata); 1096 inotify_rm_watch(audit_ih, &p->wdata);
1097 /* the put matching the get in audit_do_del_rule() */ 1097 /* the unpin matching the pin in audit_do_del_rule() */
1098 put_inotify_watch(&p->wdata); 1098 unpin_inotify_watch(&p->wdata);
1099 } 1099 }
1100} 1100}
1101 1101
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
1389 /* Put parent on the inotify un-registration 1389 /* Put parent on the inotify un-registration
1390 * list. Grab a reference before releasing 1390 * list. Grab a reference before releasing
1391 * audit_filter_mutex, to be released in 1391 * audit_filter_mutex, to be released in
1392 * audit_inotify_unregister(). */ 1392 * audit_inotify_unregister().
1393 list_add(&parent->ilist, &inotify_list); 1393 * If filesystem is going away, just leave
1394 get_inotify_watch(&parent->wdata); 1394 * the sucker alone, eviction will take
1395 * care of it.
1396 */
1397 if (pin_inotify_watch(&parent->wdata))
1398 list_add(&parent->ilist, &inotify_list);
1395 } 1399 }
1396 } 1400 }
1397 } 1401 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c3..2a3f0afc4d2a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1459,7 +1459,6 @@ void audit_free(struct task_struct *tsk)
1459 1459
1460/** 1460/**
1461 * audit_syscall_entry - fill in an audit record at syscall entry 1461 * audit_syscall_entry - fill in an audit record at syscall entry
1462 * @tsk: task being audited
1463 * @arch: architecture type 1462 * @arch: architecture type
1464 * @major: major syscall type (function) 1463 * @major: major syscall type (function)
1465 * @a1: additional syscall register 1 1464 * @a1: additional syscall register 1
@@ -1548,9 +1547,25 @@ void audit_syscall_entry(int arch, int major,
1548 context->ppid = 0; 1547 context->ppid = 0;
1549} 1548}
1550 1549
1550void audit_finish_fork(struct task_struct *child)
1551{
1552 struct audit_context *ctx = current->audit_context;
1553 struct audit_context *p = child->audit_context;
1554 if (!p || !ctx || !ctx->auditable)
1555 return;
1556 p->arch = ctx->arch;
1557 p->major = ctx->major;
1558 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1559 p->ctime = ctx->ctime;
1560 p->dummy = ctx->dummy;
1561 p->auditable = ctx->auditable;
1562 p->in_syscall = ctx->in_syscall;
1563 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1564 p->ppid = current->pid;
1565}
1566
1551/** 1567/**
1552 * audit_syscall_exit - deallocate audit context after a system call 1568 * audit_syscall_exit - deallocate audit context after a system call
1553 * @tsk: task being audited
1554 * @valid: success/failure flag 1569 * @valid: success/failure flag
1555 * @return_code: syscall return value 1570 * @return_code: syscall return value
1556 * 1571 *
@@ -1942,15 +1957,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
1942 * 1957 *
1943 * Also sets the context as auditable. 1958 * Also sets the context as auditable.
1944 */ 1959 */
1945void auditsc_get_stamp(struct audit_context *ctx, 1960int auditsc_get_stamp(struct audit_context *ctx,
1946 struct timespec *t, unsigned int *serial) 1961 struct timespec *t, unsigned int *serial)
1947{ 1962{
1963 if (!ctx->in_syscall)
1964 return 0;
1948 if (!ctx->serial) 1965 if (!ctx->serial)
1949 ctx->serial = audit_serial(); 1966 ctx->serial = audit_serial();
1950 t->tv_sec = ctx->ctime.tv_sec; 1967 t->tv_sec = ctx->ctime.tv_sec;
1951 t->tv_nsec = ctx->ctime.tv_nsec; 1968 t->tv_nsec = ctx->ctime.tv_nsec;
1952 *serial = ctx->serial; 1969 *serial = ctx->serial;
1953 ctx->auditable = 1; 1970 ctx->auditable = 1;
1971 return 1;
1954} 1972}
1955 1973
1956/* global counter which is incremented every time something logs in */ 1974/* global counter which is incremented every time something logs in */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 358e77564e6f..2606d0fb4e54 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -702,7 +702,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
702 * any child cgroups exist. This is theoretically supportable 702 * any child cgroups exist. This is theoretically supportable
703 * but involves complex error handling, so it's being left until 703 * but involves complex error handling, so it's being left until
704 * later */ 704 * later */
705 if (!list_empty(&cgrp->children)) 705 if (root->number_of_cgroups > 1)
706 return -EBUSY; 706 return -EBUSY;
707 707
708 /* Process each subsystem */ 708 /* Process each subsystem */
@@ -1024,7 +1024,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1024 if (ret == -EBUSY) { 1024 if (ret == -EBUSY) {
1025 mutex_unlock(&cgroup_mutex); 1025 mutex_unlock(&cgroup_mutex);
1026 mutex_unlock(&inode->i_mutex); 1026 mutex_unlock(&inode->i_mutex);
1027 goto drop_new_super; 1027 goto free_cg_links;
1028 } 1028 }
1029 1029
1030 /* EBUSY should be the only error here */ 1030 /* EBUSY should be the only error here */
@@ -1073,10 +1073,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1073 1073
1074 return simple_set_mnt(mnt, sb); 1074 return simple_set_mnt(mnt, sb);
1075 1075
1076 free_cg_links:
1077 free_cg_links(&tmp_cg_links);
1076 drop_new_super: 1078 drop_new_super:
1077 up_write(&sb->s_umount); 1079 up_write(&sb->s_umount);
1078 deactivate_super(sb); 1080 deactivate_super(sb);
1079 free_cg_links(&tmp_cg_links);
1080 return ret; 1081 return ret;
1081} 1082}
1082 1083
@@ -2039,10 +2040,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2039 struct cgroup *cgrp; 2040 struct cgroup *cgrp;
2040 struct cgroup_iter it; 2041 struct cgroup_iter it;
2041 struct task_struct *tsk; 2042 struct task_struct *tsk;
2043
2042 /* 2044 /*
2043 * Validate dentry by checking the superblock operations 2045 * Validate dentry by checking the superblock operations,
2046 * and make sure it's a directory.
2044 */ 2047 */
2045 if (dentry->d_sb->s_op != &cgroup_ops) 2048 if (dentry->d_sb->s_op != &cgroup_ops ||
2049 !S_ISDIR(dentry->d_inode->i_mode))
2046 goto err; 2050 goto err;
2047 2051
2048 ret = 0; 2052 ret = 0;
@@ -2472,10 +2476,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2472 mutex_unlock(&cgroup_mutex); 2476 mutex_unlock(&cgroup_mutex);
2473 return -EBUSY; 2477 return -EBUSY;
2474 } 2478 }
2475 2479 mutex_unlock(&cgroup_mutex);
2476 parent = cgrp->parent;
2477 root = cgrp->root;
2478 sb = root->sb;
2479 2480
2480 /* 2481 /*
2481 * Call pre_destroy handlers of subsys. Notify subsystems 2482 * Call pre_destroy handlers of subsys. Notify subsystems
@@ -2483,7 +2484,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2483 */ 2484 */
2484 cgroup_call_pre_destroy(cgrp); 2485 cgroup_call_pre_destroy(cgrp);
2485 2486
2486 if (cgroup_has_css_refs(cgrp)) { 2487 mutex_lock(&cgroup_mutex);
2488 parent = cgrp->parent;
2489 root = cgrp->root;
2490 sb = root->sb;
2491
2492 if (atomic_read(&cgrp->count)
2493 || !list_empty(&cgrp->children)
2494 || cgroup_has_css_refs(cgrp)) {
2487 mutex_unlock(&cgroup_mutex); 2495 mutex_unlock(&cgroup_mutex);
2488 return -EBUSY; 2496 return -EBUSY;
2489 } 2497 }
@@ -2927,9 +2935,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2927 again: 2935 again:
2928 root = subsys->root; 2936 root = subsys->root;
2929 if (root == &rootnode) { 2937 if (root == &rootnode) {
2930 printk(KERN_INFO
2931 "Not cloning cgroup for unused subsystem %s\n",
2932 subsys->name);
2933 mutex_unlock(&cgroup_mutex); 2938 mutex_unlock(&cgroup_mutex);
2934 return 0; 2939 return 0;
2935 } 2940 }
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 7fa476f01d05..fb249e2bcada 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -184,9 +184,20 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
184{ 184{
185 struct freezer *freezer; 185 struct freezer *freezer;
186 186
187 task_lock(task); 187 /*
188 * No lock is needed, since the task isn't on tasklist yet,
189 * so it can't be moved to another cgroup, which means the
190 * freezer won't be removed and will be valid during this
191 * function call.
192 */
188 freezer = task_freezer(task); 193 freezer = task_freezer(task);
189 task_unlock(task); 194
195 /*
196 * The root cgroup is non-freezable, so we can skip the
197 * following check.
198 */
199 if (!freezer->css.cgroup->parent)
200 return;
190 201
191 spin_lock_irq(&freezer->lock); 202 spin_lock_irq(&freezer->lock);
192 BUG_ON(freezer->state == CGROUP_FROZEN); 203 BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -331,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup,
331 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 342 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
332 goal_state = CGROUP_FROZEN; 343 goal_state = CGROUP_FROZEN;
333 else 344 else
334 return -EIO; 345 return -EINVAL;
335 346
336 if (!cgroup_lock_live_group(cgroup)) 347 if (!cgroup_lock_live_group(cgroup))
337 return -ENODEV; 348 return -ENODEV;
@@ -350,6 +361,8 @@ static struct cftype files[] = {
350 361
351static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) 362static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
352{ 363{
364 if (!cgroup->parent)
365 return 0;
353 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); 366 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
354} 367}
355 368
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5a732c5ef08b..8ea32e8d68b0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,7 +462,7 @@ out:
462 * It must be called by the arch code on the new cpu, before the new cpu 462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */ 464 */
465void notify_cpu_starting(unsigned int cpu) 465void __cpuinit notify_cpu_starting(unsigned int cpu)
466{ 466{
467 unsigned long val = CPU_STARTING; 467 unsigned long val = CPU_STARTING;
468 468
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e00526f52ec..96c0ba13b8cd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h>
39#include <linux/module.h> 40#include <linux/module.h>
40#include <linux/mount.h> 41#include <linux/mount.h>
41#include <linux/namei.h> 42#include <linux/namei.h>
@@ -584,10 +585,9 @@ static int generate_sched_domains(cpumask_t **domains,
584 int i, j, k; /* indices for partition finding loops */ 585 int i, j, k; /* indices for partition finding loops */
585 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 586 cpumask_t *doms; /* resulting partition; i.e. sched domains */
586 struct sched_domain_attr *dattr; /* attributes for custom domains */ 587 struct sched_domain_attr *dattr; /* attributes for custom domains */
587 int ndoms; /* number of sched domains in result */ 588 int ndoms = 0; /* number of sched domains in result */
588 int nslot; /* next empty doms[] cpumask_t slot */ 589 int nslot; /* next empty doms[] cpumask_t slot */
589 590
590 ndoms = 0;
591 doms = NULL; 591 doms = NULL;
592 dattr = NULL; 592 dattr = NULL;
593 csa = NULL; 593 csa = NULL;
@@ -674,10 +674,8 @@ restart:
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 675 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
677 if (!doms) { 677 if (!doms)
678 ndoms = 0;
679 goto done; 678 goto done;
680 }
681 679
682 /* 680 /*
683 * The rest of the code, including the scheduler, can deal with 681 * The rest of the code, including the scheduler, can deal with
@@ -732,6 +730,13 @@ restart:
732done: 730done:
733 kfree(csa); 731 kfree(csa);
734 732
733 /*
734 * Fallback to the default domain if kmalloc() failed.
735 * See comments in partition_sched_domains().
736 */
737 if (doms == NULL)
738 ndoms = 1;
739
735 *domains = doms; 740 *domains = doms;
736 *attributes = dattr; 741 *attributes = dattr;
737 return ndoms; 742 return ndoms;
@@ -2011,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2011 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2016 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2012 * See also the previous routine cpuset_track_online_cpus(). 2017 * See also the previous routine cpuset_track_online_cpus().
2013 */ 2018 */
2014void cpuset_track_online_nodes(void) 2019static int cpuset_track_online_nodes(struct notifier_block *self,
2020 unsigned long action, void *arg)
2015{ 2021{
2016 cgroup_lock(); 2022 cgroup_lock();
2017 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2023 switch (action) {
2018 scan_for_empty_cpusets(&top_cpuset); 2024 case MEM_ONLINE:
2025 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2026 break;
2027 case MEM_OFFLINE:
2028 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2029 scan_for_empty_cpusets(&top_cpuset);
2030 break;
2031 default:
2032 break;
2033 }
2019 cgroup_unlock(); 2034 cgroup_unlock();
2035 return NOTIFY_OK;
2020} 2036}
2021#endif 2037#endif
2022 2038
@@ -2032,6 +2048,7 @@ void __init cpuset_init_smp(void)
2032 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2033 2049
2034 hotcpu_notifier(cpuset_track_online_cpus, 0); 2050 hotcpu_notifier(cpuset_track_online_cpus, 0);
2051 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2035} 2052}
2036 2053
2037/** 2054/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 80137a5d9467..2d8be7ebb0f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -40,7 +40,6 @@
40#include <linux/cn_proc.h> 40#include <linux/cn_proc.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
44#include <linux/pipe_fs_i.h> 43#include <linux/pipe_fs_i.h>
45#include <linux/audit.h> /* for audit_free() */ 44#include <linux/audit.h> /* for audit_free() */
46#include <linux/resource.h> 45#include <linux/resource.h>
@@ -141,6 +140,11 @@ static void __exit_signal(struct task_struct *tsk)
141 if (sig) { 140 if (sig) {
142 flush_sigqueue(&sig->shared_pending); 141 flush_sigqueue(&sig->shared_pending);
143 taskstats_tgid_free(sig); 142 taskstats_tgid_free(sig);
143 /*
144 * Make sure ->signal can't go away under rq->lock,
145 * see account_group_exec_runtime().
146 */
147 task_rq_unlock_wait(tsk);
144 __cleanup_signal(sig); 148 __cleanup_signal(sig);
145 } 149 }
146} 150}
@@ -1054,14 +1058,6 @@ NORET_TYPE void do_exit(long code)
1054 exit_itimers(tsk->signal); 1058 exit_itimers(tsk->signal);
1055 } 1059 }
1056 acct_collect(code, group_dead); 1060 acct_collect(code, group_dead);
1057#ifdef CONFIG_FUTEX
1058 if (unlikely(tsk->robust_list))
1059 exit_robust_list(tsk);
1060#ifdef CONFIG_COMPAT
1061 if (unlikely(tsk->compat_robust_list))
1062 compat_exit_robust_list(tsk);
1063#endif
1064#endif
1065 if (group_dead) 1061 if (group_dead)
1066 tty_audit_exit(); 1062 tty_audit_exit();
1067 if (unlikely(tsk->audit_context)) 1063 if (unlikely(tsk->audit_context))
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe0..495da2e9a8b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/tracehook.h> 41#include <linux/tracehook.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -314,17 +315,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
314 file = tmp->vm_file; 315 file = tmp->vm_file;
315 if (file) { 316 if (file) {
316 struct inode *inode = file->f_path.dentry->d_inode; 317 struct inode *inode = file->f_path.dentry->d_inode;
318 struct address_space *mapping = file->f_mapping;
319
317 get_file(file); 320 get_file(file);
318 if (tmp->vm_flags & VM_DENYWRITE) 321 if (tmp->vm_flags & VM_DENYWRITE)
319 atomic_dec(&inode->i_writecount); 322 atomic_dec(&inode->i_writecount);
320 323 spin_lock(&mapping->i_mmap_lock);
321 /* insert tmp into the share list, just after mpnt */ 324 if (tmp->vm_flags & VM_SHARED)
322 spin_lock(&file->f_mapping->i_mmap_lock); 325 mapping->i_mmap_writable++;
323 tmp->vm_truncate_count = mpnt->vm_truncate_count; 326 tmp->vm_truncate_count = mpnt->vm_truncate_count;
324 flush_dcache_mmap_lock(file->f_mapping); 327 flush_dcache_mmap_lock(mapping);
328 /* insert tmp into the share list, just after mpnt */
325 vma_prio_tree_add(tmp, mpnt); 329 vma_prio_tree_add(tmp, mpnt);
326 flush_dcache_mmap_unlock(file->f_mapping); 330 flush_dcache_mmap_unlock(mapping);
327 spin_unlock(&file->f_mapping->i_mmap_lock); 331 spin_unlock(&mapping->i_mmap_lock);
328 } 332 }
329 333
330 /* 334 /*
@@ -519,6 +523,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
519{ 523{
520 struct completion *vfork_done = tsk->vfork_done; 524 struct completion *vfork_done = tsk->vfork_done;
521 525
526 /* Get rid of any futexes when releasing the mm */
527#ifdef CONFIG_FUTEX
528 if (unlikely(tsk->robust_list))
529 exit_robust_list(tsk);
530#ifdef CONFIG_COMPAT
531 if (unlikely(tsk->compat_robust_list))
532 compat_exit_robust_list(tsk);
533#endif
534#endif
535
522 /* Get rid of any cached register state */ 536 /* Get rid of any cached register state */
523 deactivate_mm(tsk, mm); 537 deactivate_mm(tsk, mm);
524 538
@@ -1387,6 +1401,7 @@ long do_fork(unsigned long clone_flags,
1387 init_completion(&vfork); 1401 init_completion(&vfork);
1388 } 1402 }
1389 1403
1404 audit_finish_fork(p);
1390 tracehook_report_clone(trace, regs, clone_flags, nr, p); 1405 tracehook_report_clone(trace, regs, clone_flags, nr, p);
1391 1406
1392 /* 1407 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b465dfde426..bda9cb924276 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
443#endif 443#endif
444 444
445/*
446 * Check, whether the timer is on the callback pending list
447 */
448static inline int hrtimer_cb_pending(const struct hrtimer *timer)
449{
450 return timer->state & HRTIMER_STATE_PENDING;
451}
452
453/*
454 * Remove a timer from the callback pending list
455 */
456static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
457{
458 list_del_init(&timer->cb_entry);
459}
460
461/* High resolution timer related functions */ 445/* High resolution timer related functions */
462#ifdef CONFIG_HIGH_RES_TIMERS 446#ifdef CONFIG_HIGH_RES_TIMERS
463 447
@@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651{ 635{
652} 636}
653 637
638static void __run_hrtimer(struct hrtimer *timer);
639
654/* 640/*
655 * When High resolution timers are active, try to reprogram. Note, that in case 641 * When High resolution timers are active, try to reprogram. Note, that in case
656 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 642 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,40 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
661 struct hrtimer_clock_base *base) 647 struct hrtimer_clock_base *base)
662{ 648{
663 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 649 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
664 650 /*
665 /* Timer is expired, act upon the callback mode */ 651 * XXX: recursion check?
666 switch(timer->cb_mode) { 652 * hrtimer_forward() should round up with timer granularity
667 case HRTIMER_CB_IRQSAFE_NO_RESTART: 653 * so that we never get into inf recursion here,
668 debug_hrtimer_deactivate(timer); 654 * it doesn't do that though
669 /* 655 */
670 * We can call the callback from here. No restart 656 __run_hrtimer(timer);
671 * happens, so no danger of recursion 657 return 1;
672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1;
675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
677 /*
678 * This is solely for the sched tick emulation with
679 * dynamic tick support to ensure that we do not
680 * restart the tick right on the edge and end up with
681 * the tick timer in the softirq ! The calling site
682 * takes care of this. Also used for hrtimer sleeper !
683 */
684 debug_hrtimer_deactivate(timer);
685 return 1;
686 case HRTIMER_CB_IRQSAFE:
687 case HRTIMER_CB_SOFTIRQ:
688 /*
689 * Move everything else into the softirq pending list !
690 */
691 list_add_tail(&timer->cb_entry,
692 &base->cpu_base->cb_pending);
693 timer->state = HRTIMER_STATE_PENDING;
694 return 1;
695 default:
696 BUG();
697 }
698 } 658 }
699 return 0; 659 return 0;
700} 660}
@@ -733,11 +693,6 @@ static int hrtimer_switch_to_hres(void)
733 return 1; 693 return 1;
734} 694}
735 695
736static inline void hrtimer_raise_softirq(void)
737{
738 raise_softirq(HRTIMER_SOFTIRQ);
739}
740
741#else 696#else
742 697
743static inline int hrtimer_hres_active(void) { return 0; } 698static inline int hrtimer_hres_active(void) { return 0; }
@@ -756,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
756{ 711{
757 return 0; 712 return 0;
758} 713}
759static inline void hrtimer_raise_softirq(void) { }
760 714
761#endif /* CONFIG_HIGH_RES_TIMERS */ 715#endif /* CONFIG_HIGH_RES_TIMERS */
762 716
@@ -899,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
899 struct hrtimer_clock_base *base, 853 struct hrtimer_clock_base *base,
900 unsigned long newstate, int reprogram) 854 unsigned long newstate, int reprogram)
901{ 855{
902 /* High res. callback list. NOP for !HIGHRES */ 856 if (timer->state & HRTIMER_STATE_ENQUEUED) {
903 if (hrtimer_cb_pending(timer))
904 hrtimer_remove_cb_pending(timer);
905 else {
906 /* 857 /*
907 * Remove the timer from the rbtree and replace the 858 * Remove the timer from the rbtree and replace the
908 * first entry pointer if necessary. 859 * first entry pointer if necessary.
@@ -962,7 +913,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
962{ 913{
963 struct hrtimer_clock_base *base, *new_base; 914 struct hrtimer_clock_base *base, *new_base;
964 unsigned long flags; 915 unsigned long flags;
965 int ret, raise; 916 int ret;
966 917
967 base = lock_hrtimer_base(timer, &flags); 918 base = lock_hrtimer_base(timer, &flags);
968 919
@@ -997,26 +948,8 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
997 enqueue_hrtimer(timer, new_base, 948 enqueue_hrtimer(timer, new_base,
998 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 949 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
999 950
1000 /*
1001 * The timer may be expired and moved to the cb_pending
1002 * list. We can not raise the softirq with base lock held due
1003 * to a possible deadlock with runqueue lock.
1004 */
1005 raise = timer->state == HRTIMER_STATE_PENDING;
1006
1007 /*
1008 * We use preempt_disable to prevent this task from migrating after
1009 * setting up the softirq and raising it. Otherwise, if me migrate
1010 * we will raise the softirq on the wrong CPU.
1011 */
1012 preempt_disable();
1013
1014 unlock_hrtimer_base(timer, &flags); 951 unlock_hrtimer_base(timer, &flags);
1015 952
1016 if (raise)
1017 hrtimer_raise_softirq();
1018 preempt_enable();
1019
1020 return ret; 953 return ret;
1021} 954}
1022EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 955EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1201,60 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1201} 1134}
1202EXPORT_SYMBOL_GPL(hrtimer_get_res); 1135EXPORT_SYMBOL_GPL(hrtimer_get_res);
1203 1136
1204static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1205{
1206 spin_lock_irq(&cpu_base->lock);
1207
1208 while (!list_empty(&cpu_base->cb_pending)) {
1209 enum hrtimer_restart (*fn)(struct hrtimer *);
1210 struct hrtimer *timer;
1211 int restart;
1212
1213 timer = list_entry(cpu_base->cb_pending.next,
1214 struct hrtimer, cb_entry);
1215
1216 debug_hrtimer_deactivate(timer);
1217 timer_stats_account_hrtimer(timer);
1218
1219 fn = timer->function;
1220 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1221 spin_unlock_irq(&cpu_base->lock);
1222
1223 restart = fn(timer);
1224
1225 spin_lock_irq(&cpu_base->lock);
1226
1227 timer->state &= ~HRTIMER_STATE_CALLBACK;
1228 if (restart == HRTIMER_RESTART) {
1229 BUG_ON(hrtimer_active(timer));
1230 /*
1231 * Enqueue the timer, allow reprogramming of the event
1232 * device
1233 */
1234 enqueue_hrtimer(timer, timer->base, 1);
1235 } else if (hrtimer_active(timer)) {
1236 /*
1237 * If the timer was rearmed on another CPU, reprogram
1238 * the event device.
1239 */
1240 struct hrtimer_clock_base *base = timer->base;
1241
1242 if (base->first == &timer->node &&
1243 hrtimer_reprogram(timer, base)) {
1244 /*
1245 * Timer is expired. Thus move it from tree to
1246 * pending list again.
1247 */
1248 __remove_hrtimer(timer, base,
1249 HRTIMER_STATE_PENDING, 0);
1250 list_add_tail(&timer->cb_entry,
1251 &base->cpu_base->cb_pending);
1252 }
1253 }
1254 }
1255 spin_unlock_irq(&cpu_base->lock);
1256}
1257
1258static void __run_hrtimer(struct hrtimer *timer) 1137static void __run_hrtimer(struct hrtimer *timer)
1259{ 1138{
1260 struct hrtimer_clock_base *base = timer->base; 1139 struct hrtimer_clock_base *base = timer->base;
@@ -1262,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer)
1262 enum hrtimer_restart (*fn)(struct hrtimer *); 1141 enum hrtimer_restart (*fn)(struct hrtimer *);
1263 int restart; 1142 int restart;
1264 1143
1144 WARN_ON(!irqs_disabled());
1145
1265 debug_hrtimer_deactivate(timer); 1146 debug_hrtimer_deactivate(timer);
1266 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1147 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1267 timer_stats_account_hrtimer(timer); 1148 timer_stats_account_hrtimer(timer);
1268
1269 fn = timer->function; 1149 fn = timer->function;
1270 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || 1150
1271 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { 1151 /*
1272 /* 1152 * Because we run timers from hardirq context, there is no chance
1273 * Used for scheduler timers, avoid lock inversion with 1153 * they get migrated to another cpu, therefore its safe to unlock
1274 * rq->lock and tasklist_lock. 1154 * the timer base.
1275 * 1155 */
1276 * These timers are required to deal with enqueue expiry 1156 spin_unlock(&cpu_base->lock);
1277 * themselves and are not allowed to migrate. 1157 restart = fn(timer);
1278 */ 1158 spin_lock(&cpu_base->lock);
1279 spin_unlock(&cpu_base->lock);
1280 restart = fn(timer);
1281 spin_lock(&cpu_base->lock);
1282 } else
1283 restart = fn(timer);
1284 1159
1285 /* 1160 /*
1286 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1161 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
@@ -1305,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1305 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1180 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1306 struct hrtimer_clock_base *base; 1181 struct hrtimer_clock_base *base;
1307 ktime_t expires_next, now; 1182 ktime_t expires_next, now;
1308 int i, raise = 0; 1183 int i;
1309 1184
1310 BUG_ON(!cpu_base->hres_active); 1185 BUG_ON(!cpu_base->hres_active);
1311 cpu_base->nr_events++; 1186 cpu_base->nr_events++;
@@ -1354,16 +1229,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1354 break; 1229 break;
1355 } 1230 }
1356 1231
1357 /* Move softirq callbacks to the pending list */
1358 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1359 __remove_hrtimer(timer, base,
1360 HRTIMER_STATE_PENDING, 0);
1361 list_add_tail(&timer->cb_entry,
1362 &base->cpu_base->cb_pending);
1363 raise = 1;
1364 continue;
1365 }
1366
1367 __run_hrtimer(timer); 1232 __run_hrtimer(timer);
1368 } 1233 }
1369 spin_unlock(&cpu_base->lock); 1234 spin_unlock(&cpu_base->lock);
@@ -1377,10 +1242,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1377 if (tick_program_event(expires_next, 0)) 1242 if (tick_program_event(expires_next, 0))
1378 goto retry; 1243 goto retry;
1379 } 1244 }
1380
1381 /* Raise softirq ? */
1382 if (raise)
1383 raise_softirq(HRTIMER_SOFTIRQ);
1384} 1245}
1385 1246
1386/** 1247/**
@@ -1407,11 +1268,6 @@ void hrtimer_peek_ahead_timers(void)
1407 local_irq_restore(flags); 1268 local_irq_restore(flags);
1408} 1269}
1409 1270
1410static void run_hrtimer_softirq(struct softirq_action *h)
1411{
1412 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
1413}
1414
1415#endif /* CONFIG_HIGH_RES_TIMERS */ 1271#endif /* CONFIG_HIGH_RES_TIMERS */
1416 1272
1417/* 1273/*
@@ -1423,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
1423 */ 1279 */
1424void hrtimer_run_pending(void) 1280void hrtimer_run_pending(void)
1425{ 1281{
1426 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1427
1428 if (hrtimer_hres_active()) 1282 if (hrtimer_hres_active())
1429 return; 1283 return;
1430 1284
@@ -1438,8 +1292,6 @@ void hrtimer_run_pending(void)
1438 */ 1292 */
1439 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) 1293 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1440 hrtimer_switch_to_hres(); 1294 hrtimer_switch_to_hres();
1441
1442 run_hrtimer_pending(cpu_base);
1443} 1295}
1444 1296
1445/* 1297/*
@@ -1476,14 +1328,6 @@ void hrtimer_run_queues(void)
1476 hrtimer_get_expires_tv64(timer)) 1328 hrtimer_get_expires_tv64(timer))
1477 break; 1329 break;
1478 1330
1479 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1480 __remove_hrtimer(timer, base,
1481 HRTIMER_STATE_PENDING, 0);
1482 list_add_tail(&timer->cb_entry,
1483 &base->cpu_base->cb_pending);
1484 continue;
1485 }
1486
1487 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1488 } 1332 }
1489 spin_unlock(&cpu_base->lock); 1333 spin_unlock(&cpu_base->lock);
@@ -1510,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1510{ 1354{
1511 sl->timer.function = hrtimer_wakeup; 1355 sl->timer.function = hrtimer_wakeup;
1512 sl->task = task; 1356 sl->task = task;
1513#ifdef CONFIG_HIGH_RES_TIMERS
1514 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1515#endif
1516} 1357}
1517 1358
1518static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1359static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1649,18 +1490,16 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1649 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1490 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1650 cpu_base->clock_base[i].cpu_base = cpu_base; 1491 cpu_base->clock_base[i].cpu_base = cpu_base;
1651 1492
1652 INIT_LIST_HEAD(&cpu_base->cb_pending);
1653 hrtimer_init_hres(cpu_base); 1493 hrtimer_init_hres(cpu_base);
1654} 1494}
1655 1495
1656#ifdef CONFIG_HOTPLUG_CPU 1496#ifdef CONFIG_HOTPLUG_CPU
1657 1497
1658static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1498static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1659 struct hrtimer_clock_base *new_base, int dcpu) 1499 struct hrtimer_clock_base *new_base)
1660{ 1500{
1661 struct hrtimer *timer; 1501 struct hrtimer *timer;
1662 struct rb_node *node; 1502 struct rb_node *node;
1663 int raise = 0;
1664 1503
1665 while ((node = rb_first(&old_base->active))) { 1504 while ((node = rb_first(&old_base->active))) {
1666 timer = rb_entry(node, struct hrtimer, node); 1505 timer = rb_entry(node, struct hrtimer, node);
@@ -1668,18 +1507,6 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1668 debug_hrtimer_deactivate(timer); 1507 debug_hrtimer_deactivate(timer);
1669 1508
1670 /* 1509 /*
1671 * Should not happen. Per CPU timers should be
1672 * canceled _before_ the migration code is called
1673 */
1674 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1675 __remove_hrtimer(timer, old_base,
1676 HRTIMER_STATE_INACTIVE, 0);
1677 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1678 timer, timer->function, dcpu);
1679 continue;
1680 }
1681
1682 /*
1683 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1510 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1684 * timer could be seen as !active and just vanish away 1511 * timer could be seen as !active and just vanish away
1685 * under us on another CPU 1512 * under us on another CPU
@@ -1687,69 +1514,34 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1687 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1514 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1688 timer->base = new_base; 1515 timer->base = new_base;
1689 /* 1516 /*
1690 * Enqueue the timer. Allow reprogramming of the event device 1517 * Enqueue the timers on the new cpu, but do not reprogram
1518 * the timer as that would enable a deadlock between
1519 * hrtimer_enqueue_reprogramm() running the timer and us still
1520 * holding a nested base lock.
1521 *
1522 * Instead we tickle the hrtimer interrupt after the migration
1523 * is done, which will run all expired timers and re-programm
1524 * the timer device.
1691 */ 1525 */
1692 enqueue_hrtimer(timer, new_base, 1); 1526 enqueue_hrtimer(timer, new_base, 0);
1693 1527
1694#ifdef CONFIG_HIGH_RES_TIMERS
1695 /*
1696 * Happens with high res enabled when the timer was
1697 * already expired and the callback mode is
1698 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1699 * enqueue code does not move them to the soft irq
1700 * pending list for performance/latency reasons, but
1701 * in the migration state, we need to do that
1702 * otherwise we end up with a stale timer.
1703 */
1704 if (timer->state == HRTIMER_STATE_MIGRATE) {
1705 timer->state = HRTIMER_STATE_PENDING;
1706 list_add_tail(&timer->cb_entry,
1707 &new_base->cpu_base->cb_pending);
1708 raise = 1;
1709 }
1710#endif
1711 /* Clear the migration state bit */ 1528 /* Clear the migration state bit */
1712 timer->state &= ~HRTIMER_STATE_MIGRATE; 1529 timer->state &= ~HRTIMER_STATE_MIGRATE;
1713 } 1530 }
1714 return raise;
1715}
1716
1717#ifdef CONFIG_HIGH_RES_TIMERS
1718static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1719 struct hrtimer_cpu_base *new_base)
1720{
1721 struct hrtimer *timer;
1722 int raise = 0;
1723
1724 while (!list_empty(&old_base->cb_pending)) {
1725 timer = list_entry(old_base->cb_pending.next,
1726 struct hrtimer, cb_entry);
1727
1728 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1729 timer->base = &new_base->clock_base[timer->base->index];
1730 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1731 raise = 1;
1732 }
1733 return raise;
1734}
1735#else
1736static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1737 struct hrtimer_cpu_base *new_base)
1738{
1739 return 0;
1740} 1531}
1741#endif
1742 1532
1743static void migrate_hrtimers(int cpu) 1533static int migrate_hrtimers(int scpu)
1744{ 1534{
1745 struct hrtimer_cpu_base *old_base, *new_base; 1535 struct hrtimer_cpu_base *old_base, *new_base;
1746 int i, raise = 0; 1536 int dcpu, i;
1747 1537
1748 BUG_ON(cpu_online(cpu)); 1538 BUG_ON(cpu_online(scpu));
1749 old_base = &per_cpu(hrtimer_bases, cpu); 1539 old_base = &per_cpu(hrtimer_bases, scpu);
1750 new_base = &get_cpu_var(hrtimer_bases); 1540 new_base = &get_cpu_var(hrtimer_bases);
1751 1541
1752 tick_cancel_sched_timer(cpu); 1542 dcpu = smp_processor_id();
1543
1544 tick_cancel_sched_timer(scpu);
1753 /* 1545 /*
1754 * The caller is globally serialized and nobody else 1546 * The caller is globally serialized and nobody else
1755 * takes two locks at once, deadlock is not possible. 1547 * takes two locks at once, deadlock is not possible.
@@ -1758,41 +1550,47 @@ static void migrate_hrtimers(int cpu)
1758 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1550 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1759 1551
1760 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1552 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1761 if (migrate_hrtimer_list(&old_base->clock_base[i], 1553 migrate_hrtimer_list(&old_base->clock_base[i],
1762 &new_base->clock_base[i], cpu)) 1554 &new_base->clock_base[i]);
1763 raise = 1;
1764 } 1555 }
1765 1556
1766 if (migrate_hrtimer_pending(old_base, new_base))
1767 raise = 1;
1768
1769 spin_unlock(&old_base->lock); 1557 spin_unlock(&old_base->lock);
1770 spin_unlock_irq(&new_base->lock); 1558 spin_unlock_irq(&new_base->lock);
1771 put_cpu_var(hrtimer_bases); 1559 put_cpu_var(hrtimer_bases);
1772 1560
1773 if (raise) 1561 return dcpu;
1774 hrtimer_raise_softirq(); 1562}
1563
1564static void tickle_timers(void *arg)
1565{
1566 hrtimer_peek_ahead_timers();
1775} 1567}
1568
1776#endif /* CONFIG_HOTPLUG_CPU */ 1569#endif /* CONFIG_HOTPLUG_CPU */
1777 1570
1778static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1571static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1779 unsigned long action, void *hcpu) 1572 unsigned long action, void *hcpu)
1780{ 1573{
1781 unsigned int cpu = (long)hcpu; 1574 int scpu = (long)hcpu;
1782 1575
1783 switch (action) { 1576 switch (action) {
1784 1577
1785 case CPU_UP_PREPARE: 1578 case CPU_UP_PREPARE:
1786 case CPU_UP_PREPARE_FROZEN: 1579 case CPU_UP_PREPARE_FROZEN:
1787 init_hrtimers_cpu(cpu); 1580 init_hrtimers_cpu(scpu);
1788 break; 1581 break;
1789 1582
1790#ifdef CONFIG_HOTPLUG_CPU 1583#ifdef CONFIG_HOTPLUG_CPU
1791 case CPU_DEAD: 1584 case CPU_DEAD:
1792 case CPU_DEAD_FROZEN: 1585 case CPU_DEAD_FROZEN:
1793 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1586 {
1794 migrate_hrtimers(cpu); 1587 int dcpu;
1588
1589 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1590 dcpu = migrate_hrtimers(scpu);
1591 smp_call_function_single(dcpu, tickle_timers, NULL, 0);
1795 break; 1592 break;
1593 }
1796#endif 1594#endif
1797 1595
1798 default: 1596 default:
@@ -1811,9 +1609,6 @@ void __init hrtimers_init(void)
1811 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1609 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1812 (void *)(long)smp_processor_id()); 1610 (void *)(long)smp_processor_id());
1813 register_cpu_notifier(&hrtimers_nb); 1611 register_cpu_notifier(&hrtimers_nb);
1814#ifdef CONFIG_HIGH_RES_TIMERS
1815 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1816#endif
1817} 1612}
1818 1613
1819/** 1614/**
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c9767e641980..64c1c7253dae 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -25,6 +25,8 @@ static inline void unregister_handler_proc(unsigned int irq,
25 struct irqaction *action) { } 25 struct irqaction *action) { }
26#endif 26#endif
27 27
28extern int irq_select_affinity_usr(unsigned int irq);
29
28/* 30/*
29 * Debugging printout: 31 * Debugging printout:
30 */ 32 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c498a1b8c621..801addda3c43 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -82,24 +82,27 @@ int irq_can_set_affinity(unsigned int irq)
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 82int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
83{ 83{
84 struct irq_desc *desc = irq_to_desc(irq); 84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags;
85 86
86 if (!desc->chip->set_affinity) 87 if (!desc->chip->set_affinity)
87 return -EINVAL; 88 return -EINVAL;
88 89
90 spin_lock_irqsave(&desc->lock, flags);
91
89#ifdef CONFIG_GENERIC_PENDING_IRQ 92#ifdef CONFIG_GENERIC_PENDING_IRQ
90 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 93 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
91 unsigned long flags;
92
93 spin_lock_irqsave(&desc->lock, flags);
94 desc->affinity = cpumask; 94 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask); 95 desc->chip->set_affinity(irq, cpumask);
96 spin_unlock_irqrestore(&desc->lock, flags); 96 } else {
97 } else 97 desc->status |= IRQ_MOVE_PENDING;
98 set_pending_irq(irq, cpumask); 98 desc->pending_mask = cpumask;
99 }
99#else 100#else
100 desc->affinity = cpumask; 101 desc->affinity = cpumask;
101 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
102#endif 103#endif
104 desc->status |= IRQ_AFFINITY_SET;
105 spin_unlock_irqrestore(&desc->lock, flags);
103 return 0; 106 return 0;
104} 107}
105 108
@@ -107,24 +110,59 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
107/* 110/*
108 * Generic version of the affinity autoselector. 111 * Generic version of the affinity autoselector.
109 */ 112 */
110int irq_select_affinity(unsigned int irq) 113int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
111{ 114{
112 cpumask_t mask; 115 cpumask_t mask;
113 struct irq_desc *desc;
114 116
115 if (!irq_can_set_affinity(irq)) 117 if (!irq_can_set_affinity(irq))
116 return 0; 118 return 0;
117 119
118 cpus_and(mask, cpu_online_map, irq_default_affinity); 120 cpus_and(mask, cpu_online_map, irq_default_affinity);
119 121
120 desc = irq_to_desc(irq); 122 /*
123 * Preserve an userspace affinity setup, but make sure that
124 * one of the targets is online.
125 */
126 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
127 if (cpus_intersects(desc->affinity, cpu_online_map))
128 mask = desc->affinity;
129 else
130 desc->status &= ~IRQ_AFFINITY_SET;
131 }
132
121 desc->affinity = mask; 133 desc->affinity = mask;
122 desc->chip->set_affinity(irq, mask); 134 desc->chip->set_affinity(irq, mask);
123 135
124 return 0; 136 return 0;
125} 137}
138#else
139static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
140{
141 return irq_select_affinity(irq);
142}
126#endif 143#endif
127 144
145/*
146 * Called when affinity is set via /proc/irq
147 */
148int irq_select_affinity_usr(unsigned int irq)
149{
150 struct irq_desc *desc = irq_to_desc(irq);
151 unsigned long flags;
152 int ret;
153
154 spin_lock_irqsave(&desc->lock, flags);
155 ret = do_irq_select_affinity(irq, desc);
156 spin_unlock_irqrestore(&desc->lock, flags);
157
158 return ret;
159}
160
161#else
162static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
163{
164 return 0;
165}
128#endif 166#endif
129 167
130/** 168/**
@@ -327,7 +365,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
327 * IRQF_TRIGGER_* but the PIC does not support multiple 365 * IRQF_TRIGGER_* but the PIC does not support multiple
328 * flow-types? 366 * flow-types?
329 */ 367 */
330 pr_warning("No set_type function for IRQ %d (%s)\n", irq, 368 pr_debug("No set_type function for IRQ %d (%s)\n", irq,
331 chip ? (chip->name ? : "unknown") : "unknown"); 369 chip ? (chip->name ? : "unknown") : "unknown");
332 return 0; 370 return 0;
333 } 371 }
@@ -445,8 +483,12 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
445 /* Undo nested disables: */ 483 /* Undo nested disables: */
446 desc->depth = 1; 484 desc->depth = 1;
447 485
486 /* Exclude IRQ from balancing if requested */
487 if (new->flags & IRQF_NOBALANCING)
488 desc->status |= IRQ_NO_BALANCING;
489
448 /* Set default affinity mask once everything is setup */ 490 /* Set default affinity mask once everything is setup */
449 irq_select_affinity(irq); 491 do_irq_select_affinity(irq, desc);
450 492
451 } else if ((new->flags & IRQF_TRIGGER_MASK) 493 } else if ((new->flags & IRQF_TRIGGER_MASK)
452 && (new->flags & IRQF_TRIGGER_MASK) 494 && (new->flags & IRQF_TRIGGER_MASK)
@@ -459,10 +501,6 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
459 501
460 *p = new; 502 *p = new;
461 503
462 /* Exclude IRQ from balancing */
463 if (new->flags & IRQF_NOBALANCING)
464 desc->status |= IRQ_NO_BALANCING;
465
466 /* Reset broken irq detection when installing new handler */ 504 /* Reset broken irq detection when installing new handler */
467 desc->irq_count = 0; 505 desc->irq_count = 0;
468 desc->irqs_unhandled = 0; 506 desc->irqs_unhandled = 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 90b920d3f52b..9db681d95814 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,17 +1,6 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{
6 struct irq_desc *desc = irq_to_desc(irq);
7 unsigned long flags;
8
9 spin_lock_irqsave(&desc->lock, flags);
10 desc->status |= IRQ_MOVE_PENDING;
11 desc->pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags);
13}
14
15void move_masked_irq(int irq) 4void move_masked_irq(int irq)
16{ 5{
17 struct irq_desc *desc = irq_to_desc(irq); 6 struct irq_desc *desc = irq_to_desc(irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4d161c70ba55..d257e7d6a8a4 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
62 if (!cpus_intersects(new_value, cpu_online_map)) 62 if (!cpus_intersects(new_value, cpu_online_map))
63 /* Special case for empty set - allow the architecture 63 /* Special case for empty set - allow the architecture
64 code to set default SMP affinity. */ 64 code to set default SMP affinity. */
65 return irq_select_affinity(irq) ? -EINVAL : count; 65 return irq_select_affinity_usr(irq) ? -EINVAL : count;
66 66
67 irq_set_affinity(irq, new_value); 67 irq_set_affinity(irq, new_value);
68 68
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5072cf1685a2..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -304,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address)
304 char *modname; 304 char *modname;
305 const char *name; 305 const char *name;
306 unsigned long offset, size; 306 unsigned long offset, size;
307 char namebuf[KSYM_NAME_LEN]; 307 int len;
308 308
309 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 309 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
310 if (!name) 310 if (!name)
311 return sprintf(buffer, "0x%lx", address); 311 return sprintf(buffer, "0x%lx", address);
312 312
313 if (name != buffer)
314 strcpy(buffer, name);
315 len = strlen(buffer);
316 buffer += len;
317
313 if (modname) 318 if (modname)
314 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, 319 len += sprintf(buffer, "+%#lx/%#lx [%s]",
315 size, modname); 320 offset, size, modname);
316 else 321 else
317 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); 322 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
323
324 return len;
318} 325}
319 326
320/* Look up a kernel symbol and print it to the kernel messages. */ 327/* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 8b57a2597f21..9f8a3f25259a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned; 75 spinlock_t lock ____cacheline_aligned_in_smp;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77 77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
613 return -EINVAL; 613 return -EINVAL;
614 p->addr = addr; 614 p->addr = addr;
615 615
616 if (!kernel_text_address((unsigned long) p->addr) || 616 preempt_disable();
617 in_kprobes_functions((unsigned long) p->addr)) 617 if (!__kernel_text_address((unsigned long) p->addr) ||
618 in_kprobes_functions((unsigned long) p->addr)) {
619 preempt_enable();
618 return -EINVAL; 620 return -EINVAL;
621 }
619 622
620 p->mod_refcounted = 0; 623 p->mod_refcounted = 0;
621 624
622 /* 625 /*
623 * Check if are we probing a module. 626 * Check if are we probing a module.
624 */ 627 */
625 probed_mod = module_text_address((unsigned long) p->addr); 628 probed_mod = __module_text_address((unsigned long) p->addr);
626 if (probed_mod) { 629 if (probed_mod) {
627 struct module *calling_mod = module_text_address(called_from); 630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
628 /* 632 /*
629 * We must allow modules to probe themself and in this case 633 * We must allow modules to probe themself and in this case
630 * avoid incrementing the module refcount, so as to allow 634 * avoid incrementing the module refcount, so as to allow
631 * unloading of self probing modules. 635 * unloading of self probing modules.
632 */ 636 */
633 if (calling_mod && calling_mod != probed_mod) { 637 if (calling_mod && calling_mod != probed_mod) {
634 if (unlikely(!try_module_get(probed_mod))) 638 if (unlikely(!try_module_get(probed_mod))) {
639 preempt_enable();
635 return -EINVAL; 640 return -EINVAL;
641 }
636 p->mod_refcounted = 1; 642 p->mod_refcounted = 1;
637 } else 643 } else
638 probed_mod = NULL; 644 probed_mod = NULL;
639 } 645 }
646 preempt_enable();
640 647
641 p->nmissed = 0; 648 p->nmissed = 0;
642 INIT_LIST_HEAD(&p->list); 649 INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
718 struct kprobe *old_p; 725 struct kprobe *old_p;
719 726
720 if (p->mod_refcounted) { 727 if (p->mod_refcounted) {
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
721 mod = module_text_address((unsigned long)p->addr); 732 mod = module_text_address((unsigned long)p->addr);
722 if (mod) 733 if (mod)
723 module_put(mod); 734 module_put(mod);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c56923..449db466bdbc 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
191 latency_record[i].time, 191 latency_record[i].time,
192 latency_record[i].max); 192 latency_record[i].max);
193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
194 char sym[KSYM_NAME_LEN]; 194 char sym[KSYM_SYMBOL_LEN];
195 char *c; 195 char *c;
196 if (!latency_record[i].backtrace[q]) 196 if (!latency_record[i].backtrace[q])
197 break; 197 break;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06e157119d2b..46a404173db2 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3276,10 +3276,10 @@ void __init lockdep_info(void)
3276{ 3276{
3277 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); 3277 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
3278 3278
3279 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); 3279 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
3280 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); 3280 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
3281 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); 3281 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
3282 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); 3282 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
3283 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); 3283 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
3284 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); 3284 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
3285 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); 3285 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
diff --git a/kernel/panic.c b/kernel/panic.c
index 6513aac8e992..4d5088355bfe 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -167,6 +167,7 @@ static const struct tnt tnts[] = {
167 * 'M' - System experienced a machine check exception. 167 * 'M' - System experienced a machine check exception.
168 * 'B' - System has hit bad_page. 168 * 'B' - System has hit bad_page.
169 * 'U' - Userspace-defined naughtiness. 169 * 'U' - Userspace-defined naughtiness.
170 * 'D' - Kernel has oopsed before
170 * 'A' - ACPI table overridden. 171 * 'A' - ACPI table overridden.
171 * 'W' - Taint on warning. 172 * 'W' - Taint on warning.
172 * 'C' - modules from drivers/staging are loaded. 173 * 'C' - modules from drivers/staging are loaded.
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 153dcb2639c3..4e5288a831de 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -311,7 +311,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
311 struct task_cputime cputime; 311 struct task_cputime cputime;
312 312
313 thread_group_cputime(p, &cputime); 313 thread_group_cputime(p, &cputime);
314 switch (which_clock) { 314 switch (CPUCLOCK_WHICH(which_clock)) {
315 default: 315 default:
316 return -EINVAL; 316 return -EINVAL;
317 case CPUCLOCK_PROF: 317 case CPUCLOCK_PROF:
@@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1308 */ 1308 */
1309static inline int fastpath_timer_check(struct task_struct *tsk) 1309static inline int fastpath_timer_check(struct task_struct *tsk)
1310{ 1310{
1311 struct signal_struct *sig = tsk->signal; 1311 struct signal_struct *sig;
1312 1312
1313 if (unlikely(!sig)) 1313 /* tsk == current, ensure it is safe to use ->signal/sighand */
1314 if (unlikely(tsk->exit_state))
1314 return 0; 1315 return 0;
1315 1316
1316 if (!task_cputime_zero(&tsk->cputime_expires)) { 1317 if (!task_cputime_zero(&tsk->cputime_expires)) {
@@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1323 if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) 1324 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1324 return 1; 1325 return 1;
1325 } 1326 }
1327
1328 sig = tsk->signal;
1326 if (!task_cputime_zero(&sig->cputime_expires)) { 1329 if (!task_cputime_zero(&sig->cputime_expires)) {
1327 struct task_cputime group_sample; 1330 struct task_cputime group_sample;
1328 1331
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5e79c662294b..887c63787de6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
116 * must supply functions here, even if the function just returns 116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the 117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the 118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_process 119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code. 120 * fields are not modified by timer code.
121 * 121 *
122 * At this time all functions EXCEPT clock_nanosleep can be 122 * At this time all functions EXCEPT clock_nanosleep can be
@@ -197,6 +197,11 @@ static int common_timer_create(struct k_itimer *new_timer)
197 return 0; 197 return 0;
198} 198}
199 199
200static int no_timer_create(struct k_itimer *new_timer)
201{
202 return -EOPNOTSUPP;
203}
204
200/* 205/*
201 * Return nonzero if we know a priori this clockid_t value is bogus. 206 * Return nonzero if we know a priori this clockid_t value is bogus.
202 */ 207 */
@@ -248,6 +253,7 @@ static __init int init_posix_timers(void)
248 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
249 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
250 .clock_set = do_posix_clock_nosettime, 255 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create,
251 }; 257 };
252 258
253 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 259 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -313,7 +319,8 @@ void do_schedule_next_timer(struct siginfo *info)
313 319
314int posix_timer_event(struct k_itimer *timr, int si_private) 320int posix_timer_event(struct k_itimer *timr, int si_private)
315{ 321{
316 int shared, ret; 322 struct task_struct *task;
323 int shared, ret = -1;
317 /* 324 /*
318 * FIXME: if ->sigq is queued we can race with 325 * FIXME: if ->sigq is queued we can race with
319 * dequeue_signal()->do_schedule_next_timer(). 326 * dequeue_signal()->do_schedule_next_timer().
@@ -327,8 +334,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
327 */ 334 */
328 timr->sigq->info.si_sys_private = si_private; 335 timr->sigq->info.si_sys_private = si_private;
329 336
330 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); 337 rcu_read_lock();
331 ret = send_sigqueue(timr->sigq, timr->it_process, shared); 338 task = pid_task(timr->it_pid, PIDTYPE_PID);
339 if (task) {
340 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
341 ret = send_sigqueue(timr->sigq, task, shared);
342 }
343 rcu_read_unlock();
332 /* If we failed to send the signal the timer stops. */ 344 /* If we failed to send the signal the timer stops. */
333 return ret > 0; 345 return ret > 0;
334} 346}
@@ -405,7 +417,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
405 return ret; 417 return ret;
406} 418}
407 419
408static struct task_struct * good_sigevent(sigevent_t * event) 420static struct pid *good_sigevent(sigevent_t * event)
409{ 421{
410 struct task_struct *rtn = current->group_leader; 422 struct task_struct *rtn = current->group_leader;
411 423
@@ -419,7 +431,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
419 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) 431 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
420 return NULL; 432 return NULL;
421 433
422 return rtn; 434 return task_pid(rtn);
423} 435}
424 436
425void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 437void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -458,6 +470,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
458 idr_remove(&posix_timers_id, tmr->it_id); 470 idr_remove(&posix_timers_id, tmr->it_id);
459 spin_unlock_irqrestore(&idr_lock, flags); 471 spin_unlock_irqrestore(&idr_lock, flags);
460 } 472 }
473 put_pid(tmr->it_pid);
461 sigqueue_free(tmr->sigq); 474 sigqueue_free(tmr->sigq);
462 kmem_cache_free(posix_timers_cache, tmr); 475 kmem_cache_free(posix_timers_cache, tmr);
463} 476}
@@ -471,7 +484,6 @@ sys_timer_create(const clockid_t which_clock,
471{ 484{
472 struct k_itimer *new_timer; 485 struct k_itimer *new_timer;
473 int error, new_timer_id; 486 int error, new_timer_id;
474 struct task_struct *process;
475 sigevent_t event; 487 sigevent_t event;
476 int it_id_set = IT_ID_NOT_SET; 488 int it_id_set = IT_ID_NOT_SET;
477 489
@@ -525,11 +537,9 @@ sys_timer_create(const clockid_t which_clock,
525 goto out; 537 goto out;
526 } 538 }
527 rcu_read_lock(); 539 rcu_read_lock();
528 process = good_sigevent(&event); 540 new_timer->it_pid = get_pid(good_sigevent(&event));
529 if (process)
530 get_task_struct(process);
531 rcu_read_unlock(); 541 rcu_read_unlock();
532 if (!process) { 542 if (!new_timer->it_pid) {
533 error = -EINVAL; 543 error = -EINVAL;
534 goto out; 544 goto out;
535 } 545 }
@@ -537,8 +547,7 @@ sys_timer_create(const clockid_t which_clock,
537 event.sigev_notify = SIGEV_SIGNAL; 547 event.sigev_notify = SIGEV_SIGNAL;
538 event.sigev_signo = SIGALRM; 548 event.sigev_signo = SIGALRM;
539 event.sigev_value.sival_int = new_timer->it_id; 549 event.sigev_value.sival_int = new_timer->it_id;
540 process = current->group_leader; 550 new_timer->it_pid = get_pid(task_tgid(current));
541 get_task_struct(process);
542 } 551 }
543 552
544 new_timer->it_sigev_notify = event.sigev_notify; 553 new_timer->it_sigev_notify = event.sigev_notify;
@@ -548,7 +557,7 @@ sys_timer_create(const clockid_t which_clock,
548 new_timer->sigq->info.si_code = SI_TIMER; 557 new_timer->sigq->info.si_code = SI_TIMER;
549 558
550 spin_lock_irq(&current->sighand->siglock); 559 spin_lock_irq(&current->sighand->siglock);
551 new_timer->it_process = process; 560 new_timer->it_signal = current->signal;
552 list_add(&new_timer->list, &current->signal->posix_timers); 561 list_add(&new_timer->list, &current->signal->posix_timers);
553 spin_unlock_irq(&current->sighand->siglock); 562 spin_unlock_irq(&current->sighand->siglock);
554 563
@@ -583,8 +592,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
583 timr = idr_find(&posix_timers_id, (int)timer_id); 592 timr = idr_find(&posix_timers_id, (int)timer_id);
584 if (timr) { 593 if (timr) {
585 spin_lock(&timr->it_lock); 594 spin_lock(&timr->it_lock);
586 if (timr->it_process && 595 if (timr->it_signal == current->signal) {
587 same_thread_group(timr->it_process, current)) {
588 spin_unlock(&idr_lock); 596 spin_unlock(&idr_lock);
589 return timr; 597 return timr;
590 } 598 }
@@ -831,8 +839,7 @@ retry_delete:
831 * This keeps any tasks waiting on the spin lock from thinking 839 * This keeps any tasks waiting on the spin lock from thinking
832 * they got something (see the lock code above). 840 * they got something (see the lock code above).
833 */ 841 */
834 put_task_struct(timer->it_process); 842 timer->it_signal = NULL;
835 timer->it_process = NULL;
836 843
837 unlock_timer(timer, flags); 844 unlock_timer(timer, flags);
838 release_posix_timer(timer, IT_ID_SET); 845 release_posix_timer(timer, IT_ID_SET);
@@ -858,8 +865,7 @@ retry_delete:
858 * This keeps any tasks waiting on the spin lock from thinking 865 * This keeps any tasks waiting on the spin lock from thinking
859 * they got something (see the lock code above). 866 * they got something (see the lock code above).
860 */ 867 */
861 put_task_struct(timer->it_process); 868 timer->it_signal = NULL;
862 timer->it_process = NULL;
863 869
864 unlock_timer(timer, flags); 870 unlock_timer(timer, flags);
865 release_posix_timer(timer, IT_ID_SET); 871 release_posix_timer(timer, IT_ID_SET);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 19122cf6d827..b8f7ce9473e8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -174,7 +174,7 @@ static void suspend_test_finish(const char *label)
174 * has some performance issues. The stack dump of a WARN_ON 174 * has some performance issues. The stack dump of a WARN_ON
175 * is more likely to get the right attention than a printk... 175 * is more likely to get the right attention than a printk...
176 */ 176 */
177 WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); 177 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
178} 178}
179 179
180#else 180#else
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b7713b53d07a..6da14358537c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -633,7 +633,7 @@ void swsusp_close(fmode_t mode)
633 return; 633 return;
634 } 634 }
635 635
636 blkdev_put(resume_bdev, mode); /* move up */ 636 blkdev_put(resume_bdev, mode);
637} 637}
638 638
639static int swsusp_header_init(void) 639static int swsusp_header_init(void)
diff --git a/kernel/profile.c b/kernel/profile.c
index 9830a037d8db..dc41827fbfee 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -351,7 +351,7 @@ out:
351 put_cpu(); 351 put_cpu();
352} 352}
353 353
354static int __devinit profile_cpu_callback(struct notifier_block *info, 354static int __cpuinit profile_cpu_callback(struct notifier_block *info,
355 unsigned long action, void *__cpu) 355 unsigned long action, void *__cpu)
356{ 356{
357 int node, cpu = (unsigned long)__cpu; 357 int node, cpu = (unsigned long)__cpu;
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
544}; 544};
545 545
546#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
547static void __init profile_nop(void *unused) 547static inline void profile_nop(void *unused)
548{ 548{
549} 549}
550 550
@@ -596,7 +596,7 @@ out_cleanup:
596#define create_hash_tables() ({ 0; }) 596#define create_hash_tables() ({ 0; })
597#endif 597#endif
598 598
599int create_proc_profile(void) 599int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
600{ 600{
601 struct proc_dir_entry *entry; 601 struct proc_dir_entry *entry;
602 602
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1e68e4c39e2c..4c8bcd7dd8e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
612 return (copied == sizeof(data)) ? 0 : -EIO; 612 return (copied == sizeof(data)) ? 0 : -EIO;
613} 613}
614 614
615#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE 615#if defined CONFIG_COMPAT
616#include <linux/compat.h> 616#include <linux/compat.h>
617 617
618int compat_ptrace_request(struct task_struct *child, compat_long_t request, 618int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -709,4 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
709 unlock_kernel(); 709 unlock_kernel();
710 return ret; 710 return ret;
711} 711}
712#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ 712#endif /* CONFIG_COMPAT */
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a7855c08..09ac2008f77b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
400 } 400 }
401 401
402 mutex_lock(&relay_channels_mutex); 402 mutex_lock(&relay_channels_mutex);
403 for_each_online_cpu(i) 403 for_each_possible_cpu(i)
404 if (chan->buf[i]) 404 if (chan->buf[i])
405 __relay_reset(chan->buf[i], 0); 405 __relay_reset(chan->buf[i], 0);
406 mutex_unlock(&relay_channels_mutex); 406 mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
611 return chan; 611 return chan;
612 612
613free_bufs: 613free_bufs:
614 for_each_online_cpu(i) { 614 for_each_possible_cpu(i) {
615 if (!chan->buf[i]) 615 if (chan->buf[i])
616 break; 616 relay_close_buf(chan->buf[i]);
617 relay_close_buf(chan->buf[i]);
618 } 617 }
619 618
620 kref_put(&chan->kref, relay_destroy_channel); 619 kref_put(&chan->kref, relay_destroy_channel);
@@ -1318,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in,
1318 if (ret < 0) 1317 if (ret < 0)
1319 break; 1318 break;
1320 else if (!ret) { 1319 else if (!ret) {
1321 if (spliced) 1320 if (flags & SPLICE_F_NONBLOCK)
1322 break;
1323 if (flags & SPLICE_F_NONBLOCK) {
1324 ret = -EAGAIN; 1321 ret = -EAGAIN;
1325 break; 1322 break;
1326 }
1327 } 1323 }
1328 1324
1329 *ppos += ret; 1325 *ppos += ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index 57c933ffbee1..22c532a6f82c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -203,7 +203,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
203 hrtimer_init(&rt_b->rt_period_timer, 203 hrtimer_init(&rt_b->rt_period_timer,
204 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 204 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 rt_b->rt_period_timer.function = sched_rt_period_timer; 205 rt_b->rt_period_timer.function = sched_rt_period_timer;
206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207} 206}
208 207
209static inline int rt_bandwidth_enabled(void) 208static inline int rt_bandwidth_enabled(void)
@@ -399,7 +398,7 @@ struct cfs_rq {
399 */ 398 */
400 struct sched_entity *curr, *next, *last; 399 struct sched_entity *curr, *next, *last;
401 400
402 unsigned long nr_spread_over; 401 unsigned int nr_spread_over;
403 402
404#ifdef CONFIG_FAIR_GROUP_SCHED 403#ifdef CONFIG_FAIR_GROUP_SCHED
405 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 404 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -969,6 +968,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
969 } 968 }
970} 969}
971 970
971void task_rq_unlock_wait(struct task_struct *p)
972{
973 struct rq *rq = task_rq(p);
974
975 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
976 spin_unlock_wait(&rq->lock);
977}
978
972static void __task_rq_unlock(struct rq *rq) 979static void __task_rq_unlock(struct rq *rq)
973 __releases(rq->lock) 980 __releases(rq->lock)
974{ 981{
@@ -1131,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
1131 1138
1132 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1139 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1133 rq->hrtick_timer.function = hrtick; 1140 rq->hrtick_timer.function = hrtick;
1134 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1135} 1141}
1136#else /* CONFIG_SCHED_HRTICK */ 1142#else /* CONFIG_SCHED_HRTICK */
1137static inline void hrtick_clear(struct rq *rq) 1143static inline void hrtick_clear(struct rq *rq)
@@ -1445,9 +1451,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1445static unsigned long cpu_avg_load_per_task(int cpu) 1451static unsigned long cpu_avg_load_per_task(int cpu)
1446{ 1452{
1447 struct rq *rq = cpu_rq(cpu); 1453 struct rq *rq = cpu_rq(cpu);
1454 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1448 1455
1449 if (rq->nr_running) 1456 if (nr_running)
1450 rq->avg_load_per_task = rq->load.weight / rq->nr_running; 1457 rq->avg_load_per_task = rq->load.weight / nr_running;
1458 else
1459 rq->avg_load_per_task = 0;
1451 1460
1452 return rq->avg_load_per_task; 1461 return rq->avg_load_per_task;
1453} 1462}
@@ -5860,6 +5869,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5860 struct rq *rq = cpu_rq(cpu); 5869 struct rq *rq = cpu_rq(cpu);
5861 unsigned long flags; 5870 unsigned long flags;
5862 5871
5872 spin_lock_irqsave(&rq->lock, flags);
5873
5863 __sched_fork(idle); 5874 __sched_fork(idle);
5864 idle->se.exec_start = sched_clock(); 5875 idle->se.exec_start = sched_clock();
5865 5876
@@ -5867,7 +5878,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5867 idle->cpus_allowed = cpumask_of_cpu(cpu); 5878 idle->cpus_allowed = cpumask_of_cpu(cpu);
5868 __set_task_cpu(idle, cpu); 5879 __set_task_cpu(idle, cpu);
5869 5880
5870 spin_lock_irqsave(&rq->lock, flags);
5871 rq->curr = rq->idle = idle; 5881 rq->curr = rq->idle = idle;
5872#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5882#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5873 idle->oncpu = 1; 5883 idle->oncpu = 1;
@@ -6575,7 +6585,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6575 req = list_entry(rq->migration_queue.next, 6585 req = list_entry(rq->migration_queue.next,
6576 struct migration_req, list); 6586 struct migration_req, list);
6577 list_del_init(&req->list); 6587 list_del_init(&req->list);
6588 spin_unlock_irq(&rq->lock);
6578 complete(&req->done); 6589 complete(&req->done);
6590 spin_lock_irq(&rq->lock);
6579 } 6591 }
6580 spin_unlock_irq(&rq->lock); 6592 spin_unlock_irq(&rq->lock);
6581 break; 6593 break;
@@ -7778,13 +7790,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7778 * 7790 *
7779 * The passed in 'doms_new' should be kmalloc'd. This routine takes 7791 * The passed in 'doms_new' should be kmalloc'd. This routine takes
7780 * ownership of it and will kfree it when done with it. If the caller 7792 * ownership of it and will kfree it when done with it. If the caller
7781 * failed the kmalloc call, then it can pass in doms_new == NULL, 7793 * failed the kmalloc call, then it can pass in doms_new == NULL &&
7782 * and partition_sched_domains() will fallback to the single partition 7794 * ndoms_new == 1, and partition_sched_domains() will fallback to
7783 * 'fallback_doms', it also forces the domains to be rebuilt. 7795 * the single partition 'fallback_doms', it also forces the domains
7796 * to be rebuilt.
7784 * 7797 *
7785 * If doms_new==NULL it will be replaced with cpu_online_map. 7798 * If doms_new == NULL it will be replaced with cpu_online_map.
7786 * ndoms_new==0 is a special case for destroying existing domains. 7799 * ndoms_new == 0 is a special case for destroying existing domains,
7787 * It will not create the default domain. 7800 * and it will not create the default domain.
7788 * 7801 *
7789 * Call with hotplug lock held 7802 * Call with hotplug lock held
7790 */ 7803 */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 81787248b60f..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
118 118
119 /* 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 121 * max(scd->tick_gtod, scd->clock),
122 * max(scd->clock, scd->tick_gtod + TICK_NSEC)); 122 * scd->tick_gtod + TICK_NSEC);
123 */ 123 */
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 127 max_clock = scd->tick_gtod + TICK_NSEC;
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ae17762ec32..26ed8e3d1c15 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
144 last = __pick_last_entity(cfs_rq); 144 last = __pick_last_entity(cfs_rq);
145 if (last) 145 if (last)
146 max_vruntime = last->vruntime; 146 max_vruntime = last->vruntime;
147 min_vruntime = rq->cfs.min_vruntime; 147 min_vruntime = cfs_rq->min_vruntime;
148 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 148 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
149 spin_unlock_irqrestore(&rq->lock, flags); 149 spin_unlock_irqrestore(&rq->lock, flags);
150 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 150 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
161 SPLIT_NS(spread0)); 161 SPLIT_NS(spread0));
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS
165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171 164
172 P(sched_switch); 165 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
173 P(sched_count);
174 P(sched_goidle);
175
176 P(ttwu_count);
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
182#endif
183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
184 cfs_rq->nr_spread_over); 166 cfs_rq->nr_spread_over);
185#ifdef CONFIG_FAIR_GROUP_SCHED 167#ifdef CONFIG_FAIR_GROUP_SCHED
186#ifdef CONFIG_SMP 168#ifdef CONFIG_SMP
@@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu)
260#undef P 242#undef P
261#undef PN 243#undef PN
262 244
245#ifdef CONFIG_SCHEDSTATS
246#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
247
248 P(yld_exp_empty);
249 P(yld_act_empty);
250 P(yld_both_empty);
251 P(yld_count);
252
253 P(sched_switch);
254 P(sched_count);
255 P(sched_goidle);
256
257 P(ttwu_count);
258 P(ttwu_local);
259
260 P(bkl_count);
261
262#undef P
263#endif
263 print_cfs_stats(m, cpu); 264 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu); 265 print_rt_stats(m, cpu);
265 266
@@ -422,10 +423,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
422#undef __P 423#undef __P
423 424
424 { 425 {
426 unsigned int this_cpu = raw_smp_processor_id();
425 u64 t0, t1; 427 u64 t0, t1;
426 428
427 t0 = sched_clock(); 429 t0 = cpu_clock(this_cpu);
428 t1 = sched_clock(); 430 t1 = cpu_clock(this_cpu);
429 SEQ_printf(m, "%-35s:%21Ld\n", 431 SEQ_printf(m, "%-35s:%21Ld\n",
430 "clock-delta", (long long)(t1-t0)); 432 "clock-delta", (long long)(t1-t0));
431 } 433 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 51aa3e102acb..98345e45b059 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -716,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
716 __enqueue_entity(cfs_rq, se); 716 __enqueue_entity(cfs_rq, se);
717} 717}
718 718
719static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
720{
721 if (cfs_rq->last == se)
722 cfs_rq->last = NULL;
723
724 if (cfs_rq->next == se)
725 cfs_rq->next = NULL;
726}
727
719static void 728static void
720dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 729dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
721{ 730{
@@ -738,11 +747,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
738#endif 747#endif
739 } 748 }
740 749
741 if (cfs_rq->last == se) 750 clear_buddies(cfs_rq, se);
742 cfs_rq->last = NULL;
743
744 if (cfs_rq->next == se)
745 cfs_rq->next = NULL;
746 751
747 if (se != cfs_rq->curr) 752 if (se != cfs_rq->curr)
748 __dequeue_entity(cfs_rq, se); 753 __dequeue_entity(cfs_rq, se);
@@ -977,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
977 if (unlikely(cfs_rq->nr_running == 1)) 982 if (unlikely(cfs_rq->nr_running == 1))
978 return; 983 return;
979 984
985 clear_buddies(cfs_rq, se);
986
980 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 987 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
981 update_rq_clock(rq); 988 update_rq_clock(rq);
982 /* 989 /*
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index ee71bec1da66..7dbf72a2b02c 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -298,9 +298,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
298{ 298{
299 struct signal_struct *sig; 299 struct signal_struct *sig;
300 300
301 sig = tsk->signal; 301 /* tsk == current, ensure it is safe to use ->signal */
302 if (unlikely(!sig)) 302 if (unlikely(tsk->exit_state))
303 return; 303 return;
304
305 sig = tsk->signal;
304 if (sig->cputime.totals) { 306 if (sig->cputime.totals) {
305 struct task_cputime *times; 307 struct task_cputime *times;
306 308
@@ -325,9 +327,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
325{ 327{
326 struct signal_struct *sig; 328 struct signal_struct *sig;
327 329
328 sig = tsk->signal; 330 /* tsk == current, ensure it is safe to use ->signal */
329 if (unlikely(!sig)) 331 if (unlikely(tsk->exit_state))
330 return; 332 return;
333
334 sig = tsk->signal;
331 if (sig->cputime.totals) { 335 if (sig->cputime.totals) {
332 struct task_cputime *times; 336 struct task_cputime *times;
333 337
@@ -353,8 +357,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
353 struct signal_struct *sig; 357 struct signal_struct *sig;
354 358
355 sig = tsk->signal; 359 sig = tsk->signal;
360 /* see __exit_signal()->task_rq_unlock_wait() */
361 barrier();
356 if (unlikely(!sig)) 362 if (unlikely(!sig))
357 return; 363 return;
364
358 if (sig->cputime.totals) { 365 if (sig->cputime.totals) {
359 struct task_cputime *times; 366 struct task_cputime *times;
360 367
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7110daeb9a90..e7c69a720d69 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,10 +269,11 @@ void irq_enter(void)
269{ 269{
270 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
271 271
272 if (idle_cpu(cpu) && !in_interrupt()) 272 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter();
273 tick_check_idle(cpu); 274 tick_check_idle(cpu);
274 275 } else
275 __irq_enter(); 276 __irq_enter();
276} 277}
277 278
278#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 279#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 3953e4aed733..dc0b3be6b7d5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
188 if ((long)(now - t->last_switch_timestamp) < 188 if ((long)(now - t->last_switch_timestamp) <
189 sysctl_hung_task_timeout_secs) 189 sysctl_hung_task_timeout_secs)
190 return; 190 return;
191 if (sysctl_hung_task_warnings < 0) 191 if (!sysctl_hung_task_warnings)
192 return; 192 return;
193 sysctl_hung_task_warnings--; 193 sysctl_hung_task_warnings--;
194 194
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bc4c00872c9..24e8ceacc388 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -112,7 +112,7 @@ static int chill(void *unused)
112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
113{ 113{
114 struct work_struct *sm_work; 114 struct work_struct *sm_work;
115 int i; 115 int i, ret;
116 116
117 /* Set up initial state. */ 117 /* Set up initial state. */
118 mutex_lock(&lock); 118 mutex_lock(&lock);
@@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
137 /* This will release the thread on our CPU. */ 137 /* This will release the thread on our CPU. */
138 put_cpu(); 138 put_cpu();
139 flush_workqueue(stop_machine_wq); 139 flush_workqueue(stop_machine_wq);
140 ret = active.fnret;
140 mutex_unlock(&lock); 141 mutex_unlock(&lock);
141 return active.fnret; 142 return ret;
142} 143}
143 144
144int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b04..e14a23281707 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 31cond_syscall(sys_bind);
32cond_syscall(sys_listen); 32cond_syscall(sys_listen);
33cond_syscall(sys_accept); 33cond_syscall(sys_accept);
34cond_syscall(sys_paccept); 34cond_syscall(sys_accept4);
35cond_syscall(sys_connect); 35cond_syscall(sys_connect);
36cond_syscall(sys_getsockname); 36cond_syscall(sys_getsockname);
37cond_syscall(sys_getpeername); 37cond_syscall(sys_getpeername);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d048fa2d902..3d56fe7570da 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
176#ifdef CONFIG_INOTIFY_USER 176#ifdef CONFIG_INOTIFY_USER
177extern struct ctl_table inotify_table[]; 177extern struct ctl_table inotify_table[];
178#endif 178#endif
179#ifdef CONFIG_EPOLL
180extern struct ctl_table epoll_table[];
181#endif
179 182
180#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 183#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
181int sysctl_legacy_va_layout; 184int sysctl_legacy_va_layout;
@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = {
1325 .child = inotify_table, 1328 .child = inotify_table,
1326 }, 1329 },
1327#endif 1330#endif
1331#ifdef CONFIG_EPOLL
1332 {
1333 .procname = "epoll",
1334 .mode = 0555,
1335 .child = epoll_table,
1336 },
1337#endif
1328#endif 1338#endif
1329 { 1339 {
1330 .ctl_name = KERN_SETUID_DUMPABLE, 1340 .ctl_name = KERN_SETUID_DUMPABLE,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ff15e5d486b..f5f793d92415 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
131{ 131{
132 enum hrtimer_restart res = HRTIMER_NORESTART; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
133 133
134 write_seqlock_irq(&xtime_lock); 134 write_seqlock(&xtime_lock);
135 135
136 switch (time_state) { 136 switch (time_state) {
137 case TIME_OK: 137 case TIME_OK:
@@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
164 } 164 }
165 update_vsyscall(&xtime, clock); 165 update_vsyscall(&xtime, clock);
166 166
167 write_sequnlock_irq(&xtime_lock); 167 write_sequnlock(&xtime_lock);
168 168
169 return res; 169 return res;
170} 170}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5bbb1044f847..8f3fc2582d38 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle)
247 if (need_resched()) 247 if (need_resched())
248 goto end; 248 goto end;
249 249
250 if (unlikely(local_softirq_pending())) { 250 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
251 static int ratelimit; 251 static int ratelimit;
252 252
253 if (ratelimit < 10) { 253 if (ratelimit < 10) {
@@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle)
282 /* Schedule the tick, if we are at least one jiffie off */ 282 /* Schedule the tick, if we are at least one jiffie off */
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 /*
286 * calculate the expiry time for the next timer wheel
287 * timer
288 */
289 expires = ktime_add_ns(last_update, tick_period.tv64 *
290 delta_jiffies);
291
292 /*
293 * If this cpu is the one which updates jiffies, then
294 * give up the assignment and let it be taken by the
295 * cpu which runs the tick timer next, which might be
296 * this cpu as well. If we don't drop this here the
297 * jiffies might be stale and do_timer() never
298 * invoked.
299 */
300 if (cpu == tick_do_timer_cpu)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302
285 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 304 cpu_set(cpu, nohz_cpu_mask);
305
306 /* Skip reprogramming the event if it has not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
308 goto out;
309
287 /* 310 /*
288 * nohz_stop_sched_tick can be called several times before 311 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 312 * the nohz_restart_sched_tick is called. This happens when
@@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle)
306 rcu_enter_nohz(); 329 rcu_enter_nohz();
307 } 330 }
308 331
309 /*
310 * If this cpu is the one which updates jiffies, then
311 * give up the assignment and let it be taken by the
312 * cpu which runs the tick timer next, which might be
313 * this cpu as well. If we don't drop this here the
314 * jiffies might be stale and do_timer() never
315 * invoked.
316 */
317 if (cpu == tick_do_timer_cpu)
318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
319
320 ts->idle_sleeps++; 332 ts->idle_sleeps++;
321 333
322 /* 334 /*
@@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle)
332 goto out; 344 goto out;
333 } 345 }
334 346
335 /* 347 /* Mark expiries */
336 * calculate the expiry time for the next timer wheel
337 * timer
338 */
339 expires = ktime_add_ns(last_update, tick_period.tv64 *
340 delta_jiffies);
341 ts->idle_expires = expires; 348 ts->idle_expires = expires;
342 349
343 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@ -568,6 +575,9 @@ static void tick_nohz_switch_to_nohz(void)
568 */ 575 */
569static void tick_nohz_kick_tick(int cpu) 576static void tick_nohz_kick_tick(int cpu)
570{ 577{
578#if 0
579 /* Switch back to 2.6.27 behaviour */
580
571 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 581 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
572 ktime_t delta, now; 582 ktime_t delta, now;
573 583
@@ -584,6 +594,7 @@ static void tick_nohz_kick_tick(int cpu)
584 return; 594 return;
585 595
586 tick_nohz_restart(ts, now); 596 tick_nohz_restart(ts, now);
597#endif
587} 598}
588 599
589#else 600#else
@@ -677,7 +688,6 @@ void tick_setup_sched_timer(void)
677 */ 688 */
678 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 689 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
679 ts->sched_timer.function = tick_sched_timer; 690 ts->sched_timer.function = tick_sched_timer;
680 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
681 691
682 /* Get the next period (per cpu) */ 692 /* Get the next period (per cpu) */
683 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 693 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e7acfb482a68..fa05e88aa76f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -518,6 +518,28 @@ void update_wall_time(void)
518 /* correct the clock when NTP error is too big */ 518 /* correct the clock when NTP error is too big */
519 clocksource_adjust(offset); 519 clocksource_adjust(offset);
520 520
521 /*
522 * Since in the loop above, we accumulate any amount of time
523 * in xtime_nsec over a second into xtime.tv_sec, it's possible for
524 * xtime_nsec to be fairly small after the loop. Further, if we're
525 * slightly speeding the clocksource up in clocksource_adjust(),
526 * it's possible the required corrective factor to xtime_nsec could
527 * cause it to underflow.
528 *
529 * Now, we cannot simply roll the accumulated second back, since
530 * the NTP subsystem has been notified via second_overflow. So
531 * instead we push xtime_nsec forward by the amount we underflowed,
532 * and add that amount into the error.
533 *
534 * We'll correct this error next time through this function, when
535 * xtime_nsec is not as small.
536 */
537 if (unlikely((s64)clock->xtime_nsec < 0)) {
538 s64 neg = -(s64)clock->xtime_nsec;
539 clock->xtime_nsec = 0;
540 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
541 }
542
521 /* store full nanoseconds into xtime after rounding it up and 543 /* store full nanoseconds into xtime after rounding it up and
522 * add the remainder to the error difference. 544 * add the remainder to the error difference.
523 */ 545 */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a39d24568c8..78db083390f0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -185,7 +185,6 @@ enum {
185}; 185};
186 186
187static int ftrace_filtered; 187static int ftrace_filtered;
188static int tracing_on;
189 188
190static LIST_HEAD(ftrace_new_addrs); 189static LIST_HEAD(ftrace_new_addrs);
191 190
@@ -327,96 +326,89 @@ ftrace_record_ip(unsigned long ip)
327 326
328static int 327static int
329__ftrace_replace_code(struct dyn_ftrace *rec, 328__ftrace_replace_code(struct dyn_ftrace *rec,
330 unsigned char *old, unsigned char *new, int enable) 329 unsigned char *nop, int enable)
331{ 330{
332 unsigned long ip, fl; 331 unsigned long ip, fl;
332 unsigned char *call, *old, *new;
333 333
334 ip = rec->ip; 334 ip = rec->ip;
335 335
336 if (ftrace_filtered && enable) { 336 /*
337 * If this record is not to be traced and
338 * it is not enabled then do nothing.
339 *
340 * If this record is not to be traced and
341 * it is enabled then disable it.
342 *
343 */
344 if (rec->flags & FTRACE_FL_NOTRACE) {
345 if (rec->flags & FTRACE_FL_ENABLED)
346 rec->flags &= ~FTRACE_FL_ENABLED;
347 else
348 return 0;
349
350 } else if (ftrace_filtered && enable) {
337 /* 351 /*
338 * If filtering is on: 352 * Filtering is on:
339 *
340 * If this record is set to be filtered and
341 * is enabled then do nothing.
342 *
343 * If this record is set to be filtered and
344 * it is not enabled, enable it.
345 *
346 * If this record is not set to be filtered
347 * and it is not enabled do nothing.
348 *
349 * If this record is set not to trace then
350 * do nothing.
351 *
352 * If this record is set not to trace and
353 * it is enabled then disable it.
354 *
355 * If this record is not set to be filtered and
356 * it is enabled, disable it.
357 */ 353 */
358 354
359 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | 355 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
360 FTRACE_FL_ENABLED);
361 356
362 if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || 357 /* Record is filtered and enabled, do nothing */
363 (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || 358 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
364 !fl || (fl == FTRACE_FL_NOTRACE))
365 return 0; 359 return 0;
366 360
367 /* 361 /* Record is not filtered and is not enabled do nothing */
368 * If it is enabled disable it, 362 if (!fl)
369 * otherwise enable it! 363 return 0;
370 */ 364
371 if (fl & FTRACE_FL_ENABLED) { 365 /* Record is not filtered but enabled, disable it */
372 /* swap new and old */ 366 if (fl == FTRACE_FL_ENABLED)
373 new = old;
374 old = ftrace_call_replace(ip, FTRACE_ADDR);
375 rec->flags &= ~FTRACE_FL_ENABLED; 367 rec->flags &= ~FTRACE_FL_ENABLED;
376 } else { 368 else
377 new = ftrace_call_replace(ip, FTRACE_ADDR); 369 /* Otherwise record is filtered but not enabled, enable it */
378 rec->flags |= FTRACE_FL_ENABLED; 370 rec->flags |= FTRACE_FL_ENABLED;
379 }
380 } else { 371 } else {
372 /* Disable or not filtered */
381 373
382 if (enable) { 374 if (enable) {
383 /* 375 /* if record is enabled, do nothing */
384 * If this record is set not to trace and is
385 * not enabled, do nothing.
386 */
387 fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
388 if (fl == FTRACE_FL_NOTRACE)
389 return 0;
390
391 new = ftrace_call_replace(ip, FTRACE_ADDR);
392 } else
393 old = ftrace_call_replace(ip, FTRACE_ADDR);
394
395 if (enable) {
396 if (rec->flags & FTRACE_FL_ENABLED) 376 if (rec->flags & FTRACE_FL_ENABLED)
397 return 0; 377 return 0;
378
398 rec->flags |= FTRACE_FL_ENABLED; 379 rec->flags |= FTRACE_FL_ENABLED;
380
399 } else { 381 } else {
382
383 /* if record is not enabled do nothing */
400 if (!(rec->flags & FTRACE_FL_ENABLED)) 384 if (!(rec->flags & FTRACE_FL_ENABLED))
401 return 0; 385 return 0;
386
402 rec->flags &= ~FTRACE_FL_ENABLED; 387 rec->flags &= ~FTRACE_FL_ENABLED;
403 } 388 }
404 } 389 }
405 390
391 call = ftrace_call_replace(ip, FTRACE_ADDR);
392
393 if (rec->flags & FTRACE_FL_ENABLED) {
394 old = nop;
395 new = call;
396 } else {
397 old = call;
398 new = nop;
399 }
400
406 return ftrace_modify_code(ip, old, new); 401 return ftrace_modify_code(ip, old, new);
407} 402}
408 403
409static void ftrace_replace_code(int enable) 404static void ftrace_replace_code(int enable)
410{ 405{
411 int i, failed; 406 int i, failed;
412 unsigned char *new = NULL, *old = NULL; 407 unsigned char *nop = NULL;
413 struct dyn_ftrace *rec; 408 struct dyn_ftrace *rec;
414 struct ftrace_page *pg; 409 struct ftrace_page *pg;
415 410
416 if (enable) 411 nop = ftrace_nop_replace();
417 old = ftrace_nop_replace();
418 else
419 new = ftrace_nop_replace();
420 412
421 for (pg = ftrace_pages_start; pg; pg = pg->next) { 413 for (pg = ftrace_pages_start; pg; pg = pg->next) {
422 for (i = 0; i < pg->index; i++) { 414 for (i = 0; i < pg->index; i++) {
@@ -434,7 +426,7 @@ static void ftrace_replace_code(int enable)
434 unfreeze_record(rec); 426 unfreeze_record(rec);
435 } 427 }
436 428
437 failed = __ftrace_replace_code(rec, old, new, enable); 429 failed = __ftrace_replace_code(rec, nop, enable);
438 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 430 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
439 rec->flags |= FTRACE_FL_FAILED; 431 rec->flags |= FTRACE_FL_FAILED;
440 if ((system_state == SYSTEM_BOOTING) || 432 if ((system_state == SYSTEM_BOOTING) ||
@@ -506,13 +498,10 @@ static int __ftrace_modify_code(void *data)
506{ 498{
507 int *command = data; 499 int *command = data;
508 500
509 if (*command & FTRACE_ENABLE_CALLS) { 501 if (*command & FTRACE_ENABLE_CALLS)
510 ftrace_replace_code(1); 502 ftrace_replace_code(1);
511 tracing_on = 1; 503 else if (*command & FTRACE_DISABLE_CALLS)
512 } else if (*command & FTRACE_DISABLE_CALLS) {
513 ftrace_replace_code(0); 504 ftrace_replace_code(0);
514 tracing_on = 0;
515 }
516 505
517 if (*command & FTRACE_UPDATE_TRACE_FUNC) 506 if (*command & FTRACE_UPDATE_TRACE_FUNC)
518 ftrace_update_ftrace_func(ftrace_trace_function); 507 ftrace_update_ftrace_func(ftrace_trace_function);
@@ -538,8 +527,7 @@ static void ftrace_startup(void)
538 527
539 mutex_lock(&ftrace_start_lock); 528 mutex_lock(&ftrace_start_lock);
540 ftrace_start++; 529 ftrace_start++;
541 if (ftrace_start == 1) 530 command |= FTRACE_ENABLE_CALLS;
542 command |= FTRACE_ENABLE_CALLS;
543 531
544 if (saved_ftrace_func != ftrace_trace_function) { 532 if (saved_ftrace_func != ftrace_trace_function) {
545 saved_ftrace_func = ftrace_trace_function; 533 saved_ftrace_func = ftrace_trace_function;
@@ -677,7 +665,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
677 665
678 cnt = num_to_init / ENTRIES_PER_PAGE; 666 cnt = num_to_init / ENTRIES_PER_PAGE;
679 pr_info("ftrace: allocating %ld entries in %d pages\n", 667 pr_info("ftrace: allocating %ld entries in %d pages\n",
680 num_to_init, cnt); 668 num_to_init, cnt + 1);
681 669
682 for (i = 0; i < cnt; i++) { 670 for (i = 0; i < cnt; i++) {
683 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 671 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -738,6 +726,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
738 ((iter->flags & FTRACE_ITER_FAILURES) && 726 ((iter->flags & FTRACE_ITER_FAILURES) &&
739 !(rec->flags & FTRACE_FL_FAILED)) || 727 !(rec->flags & FTRACE_FL_FAILED)) ||
740 728
729 ((iter->flags & FTRACE_ITER_FILTER) &&
730 !(rec->flags & FTRACE_FL_FILTER)) ||
731
741 ((iter->flags & FTRACE_ITER_NOTRACE) && 732 ((iter->flags & FTRACE_ITER_NOTRACE) &&
742 !(rec->flags & FTRACE_FL_NOTRACE))) { 733 !(rec->flags & FTRACE_FL_NOTRACE))) {
743 rec = NULL; 734 rec = NULL;
@@ -757,13 +748,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
757 void *p = NULL; 748 void *p = NULL;
758 loff_t l = -1; 749 loff_t l = -1;
759 750
760 if (*pos != iter->pos) { 751 if (*pos > iter->pos)
761 for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) 752 *pos = iter->pos;
762 ; 753
763 } else { 754 l = *pos;
764 l = *pos; 755 p = t_next(m, p, &l);
765 p = t_next(m, p, &l);
766 }
767 756
768 return p; 757 return p;
769} 758}
@@ -774,15 +763,21 @@ static void t_stop(struct seq_file *m, void *p)
774 763
775static int t_show(struct seq_file *m, void *v) 764static int t_show(struct seq_file *m, void *v)
776{ 765{
766 struct ftrace_iterator *iter = m->private;
777 struct dyn_ftrace *rec = v; 767 struct dyn_ftrace *rec = v;
778 char str[KSYM_SYMBOL_LEN]; 768 char str[KSYM_SYMBOL_LEN];
769 int ret = 0;
779 770
780 if (!rec) 771 if (!rec)
781 return 0; 772 return 0;
782 773
783 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 774 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
784 775
785 seq_printf(m, "%s\n", str); 776 ret = seq_printf(m, "%s\n", str);
777 if (ret < 0) {
778 iter->pos--;
779 iter->idx--;
780 }
786 781
787 return 0; 782 return 0;
788} 783}
@@ -808,7 +803,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
808 return -ENOMEM; 803 return -ENOMEM;
809 804
810 iter->pg = ftrace_pages_start; 805 iter->pg = ftrace_pages_start;
811 iter->pos = -1; 806 iter->pos = 0;
812 807
813 ret = seq_open(file, &show_ftrace_seq_ops); 808 ret = seq_open(file, &show_ftrace_seq_ops);
814 if (!ret) { 809 if (!ret) {
@@ -895,7 +890,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
895 890
896 if (file->f_mode & FMODE_READ) { 891 if (file->f_mode & FMODE_READ) {
897 iter->pg = ftrace_pages_start; 892 iter->pg = ftrace_pages_start;
898 iter->pos = -1; 893 iter->pos = 0;
899 iter->flags = enable ? FTRACE_ITER_FILTER : 894 iter->flags = enable ? FTRACE_ITER_FILTER :
900 FTRACE_ITER_NOTRACE; 895 FTRACE_ITER_NOTRACE;
901 896
@@ -1186,7 +1181,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1186 1181
1187 mutex_lock(&ftrace_sysctl_lock); 1182 mutex_lock(&ftrace_sysctl_lock);
1188 mutex_lock(&ftrace_start_lock); 1183 mutex_lock(&ftrace_start_lock);
1189 if (iter->filtered && ftrace_start && ftrace_enabled) 1184 if (ftrace_start && ftrace_enabled)
1190 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1185 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1191 mutex_unlock(&ftrace_start_lock); 1186 mutex_unlock(&ftrace_start_lock);
1192 mutex_unlock(&ftrace_sysctl_lock); 1187 mutex_unlock(&ftrace_sysctl_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3f3380638646..668bbb5ef2bd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,14 +16,49 @@
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18 18
19#include "trace.h"
20
21/* Global flag to disable all recording to ring buffers */
22static int ring_buffers_off __read_mostly;
23
24/**
25 * tracing_on - enable all tracing buffers
26 *
27 * This function enables all tracing buffers that may have been
28 * disabled with tracing_off.
29 */
30void tracing_on(void)
31{
32 ring_buffers_off = 0;
33}
34
35/**
36 * tracing_off - turn off all tracing buffers
37 *
38 * This function stops all tracing buffers from recording data.
39 * It does not disable any overhead the tracers themselves may
40 * be causing. This function simply causes all recording to
41 * the ring buffers to fail.
42 */
43void tracing_off(void)
44{
45 ring_buffers_off = 1;
46}
47
19/* Up this if you want to test the TIME_EXTENTS and normalization */ 48/* Up this if you want to test the TIME_EXTENTS and normalization */
20#define DEBUG_SHIFT 0 49#define DEBUG_SHIFT 0
21 50
22/* FIXME!!! */ 51/* FIXME!!! */
23u64 ring_buffer_time_stamp(int cpu) 52u64 ring_buffer_time_stamp(int cpu)
24{ 53{
54 u64 time;
55
56 preempt_disable_notrace();
25 /* shift to debug/test normalization and TIME_EXTENTS */ 57 /* shift to debug/test normalization and TIME_EXTENTS */
26 return sched_clock() << DEBUG_SHIFT; 58 time = sched_clock() << DEBUG_SHIFT;
59 preempt_enable_notrace();
60
61 return time;
27} 62}
28 63
29void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 64void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
@@ -503,6 +538,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
503 LIST_HEAD(pages); 538 LIST_HEAD(pages);
504 int i, cpu; 539 int i, cpu;
505 540
541 /*
542 * Always succeed at resizing a non-existent buffer:
543 */
544 if (!buffer)
545 return size;
546
506 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 547 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
507 size *= BUF_PAGE_SIZE; 548 size *= BUF_PAGE_SIZE;
508 buffer_size = buffer->pages * BUF_PAGE_SIZE; 549 buffer_size = buffer->pages * BUF_PAGE_SIZE;
@@ -576,6 +617,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
576 list_del_init(&page->list); 617 list_del_init(&page->list);
577 free_buffer_page(page); 618 free_buffer_page(page);
578 } 619 }
620 mutex_unlock(&buffer->mutex);
579 return -ENOMEM; 621 return -ENOMEM;
580} 622}
581 623
@@ -1060,7 +1102,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1060 1102
1061 /* Did the write stamp get updated already? */ 1103 /* Did the write stamp get updated already? */
1062 if (unlikely(ts < cpu_buffer->write_stamp)) 1104 if (unlikely(ts < cpu_buffer->write_stamp))
1063 goto again; 1105 delta = 0;
1064 1106
1065 if (test_time_stamp(delta)) { 1107 if (test_time_stamp(delta)) {
1066 1108
@@ -1133,6 +1175,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1133 struct ring_buffer_event *event; 1175 struct ring_buffer_event *event;
1134 int cpu, resched; 1176 int cpu, resched;
1135 1177
1178 if (ring_buffers_off)
1179 return NULL;
1180
1136 if (atomic_read(&buffer->record_disabled)) 1181 if (atomic_read(&buffer->record_disabled))
1137 return NULL; 1182 return NULL;
1138 1183
@@ -1170,7 +1215,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1170 1215
1171 out: 1216 out:
1172 if (resched) 1217 if (resched)
1173 preempt_enable_notrace(); 1218 preempt_enable_no_resched_notrace();
1174 else 1219 else
1175 preempt_enable_notrace(); 1220 preempt_enable_notrace();
1176 return NULL; 1221 return NULL;
@@ -1249,6 +1294,9 @@ int ring_buffer_write(struct ring_buffer *buffer,
1249 int ret = -EBUSY; 1294 int ret = -EBUSY;
1250 int cpu, resched; 1295 int cpu, resched;
1251 1296
1297 if (ring_buffers_off)
1298 return -EBUSY;
1299
1252 if (atomic_read(&buffer->record_disabled)) 1300 if (atomic_read(&buffer->record_disabled))
1253 return -EBUSY; 1301 return -EBUSY;
1254 1302
@@ -2070,3 +2118,69 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2070 return 0; 2118 return 0;
2071} 2119}
2072 2120
2121static ssize_t
2122rb_simple_read(struct file *filp, char __user *ubuf,
2123 size_t cnt, loff_t *ppos)
2124{
2125 int *p = filp->private_data;
2126 char buf[64];
2127 int r;
2128
2129 /* !ring_buffers_off == tracing_on */
2130 r = sprintf(buf, "%d\n", !*p);
2131
2132 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2133}
2134
2135static ssize_t
2136rb_simple_write(struct file *filp, const char __user *ubuf,
2137 size_t cnt, loff_t *ppos)
2138{
2139 int *p = filp->private_data;
2140 char buf[64];
2141 long val;
2142 int ret;
2143
2144 if (cnt >= sizeof(buf))
2145 return -EINVAL;
2146
2147 if (copy_from_user(&buf, ubuf, cnt))
2148 return -EFAULT;
2149
2150 buf[cnt] = 0;
2151
2152 ret = strict_strtoul(buf, 10, &val);
2153 if (ret < 0)
2154 return ret;
2155
2156 /* !ring_buffers_off == tracing_on */
2157 *p = !val;
2158
2159 (*ppos)++;
2160
2161 return cnt;
2162}
2163
2164static struct file_operations rb_simple_fops = {
2165 .open = tracing_open_generic,
2166 .read = rb_simple_read,
2167 .write = rb_simple_write,
2168};
2169
2170
2171static __init int rb_init_debugfs(void)
2172{
2173 struct dentry *d_tracer;
2174 struct dentry *entry;
2175
2176 d_tracer = tracing_init_dentry();
2177
2178 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2179 &ring_buffers_off, &rb_simple_fops);
2180 if (!entry)
2181 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2182
2183 return 0;
2184}
2185
2186fs_initcall(rb_init_debugfs);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9f3b478f9171..d86e3252f300 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1755,7 +1755,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1755 return TRACE_TYPE_HANDLED; 1755 return TRACE_TYPE_HANDLED;
1756 1756
1757 SEQ_PUT_FIELD_RET(s, entry->pid); 1757 SEQ_PUT_FIELD_RET(s, entry->pid);
1758 SEQ_PUT_FIELD_RET(s, iter->cpu); 1758 SEQ_PUT_FIELD_RET(s, entry->cpu);
1759 SEQ_PUT_FIELD_RET(s, iter->ts); 1759 SEQ_PUT_FIELD_RET(s, iter->ts);
1760 1760
1761 switch (entry->type) { 1761 switch (entry->type) {
@@ -1936,6 +1936,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1936 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1936 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1937 } 1937 }
1938 mutex_unlock(&trace_types_lock); 1938 mutex_unlock(&trace_types_lock);
1939 kfree(iter);
1939 1940
1940 return ERR_PTR(-ENOMEM); 1941 return ERR_PTR(-ENOMEM);
1941} 1942}
@@ -2676,7 +2677,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2676{ 2677{
2677 unsigned long val; 2678 unsigned long val;
2678 char buf[64]; 2679 char buf[64];
2679 int ret; 2680 int ret, cpu;
2680 struct trace_array *tr = filp->private_data; 2681 struct trace_array *tr = filp->private_data;
2681 2682
2682 if (cnt >= sizeof(buf)) 2683 if (cnt >= sizeof(buf))
@@ -2704,6 +2705,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2704 goto out; 2705 goto out;
2705 } 2706 }
2706 2707
2708 /* disable all cpu buffers */
2709 for_each_tracing_cpu(cpu) {
2710 if (global_trace.data[cpu])
2711 atomic_inc(&global_trace.data[cpu]->disabled);
2712 if (max_tr.data[cpu])
2713 atomic_inc(&max_tr.data[cpu]->disabled);
2714 }
2715
2707 if (val != global_trace.entries) { 2716 if (val != global_trace.entries) {
2708 ret = ring_buffer_resize(global_trace.buffer, val); 2717 ret = ring_buffer_resize(global_trace.buffer, val);
2709 if (ret < 0) { 2718 if (ret < 0) {
@@ -2735,6 +2744,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2735 if (tracing_disabled) 2744 if (tracing_disabled)
2736 cnt = -ENOMEM; 2745 cnt = -ENOMEM;
2737 out: 2746 out:
2747 for_each_tracing_cpu(cpu) {
2748 if (global_trace.data[cpu])
2749 atomic_dec(&global_trace.data[cpu]->disabled);
2750 if (max_tr.data[cpu])
2751 atomic_dec(&max_tr.data[cpu]->disabled);
2752 }
2753
2738 max_tr.entries = global_trace.entries; 2754 max_tr.entries = global_trace.entries;
2739 mutex_unlock(&trace_types_lock); 2755 mutex_unlock(&trace_types_lock);
2740 2756
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index f28484618ff0..e62cbf78eab6 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -18,12 +18,14 @@ struct header_iter {
18 18
19static struct trace_array *mmio_trace_array; 19static struct trace_array *mmio_trace_array;
20static bool overrun_detected; 20static bool overrun_detected;
21static unsigned long prev_overruns;
21 22
22static void mmio_reset_data(struct trace_array *tr) 23static void mmio_reset_data(struct trace_array *tr)
23{ 24{
24 int cpu; 25 int cpu;
25 26
26 overrun_detected = false; 27 overrun_detected = false;
28 prev_overruns = 0;
27 tr->time_start = ftrace_now(tr->cpu); 29 tr->time_start = ftrace_now(tr->cpu);
28 30
29 for_each_online_cpu(cpu) 31 for_each_online_cpu(cpu)
@@ -128,16 +130,12 @@ static void mmio_close(struct trace_iterator *iter)
128 130
129static unsigned long count_overruns(struct trace_iterator *iter) 131static unsigned long count_overruns(struct trace_iterator *iter)
130{ 132{
131 int cpu;
132 unsigned long cnt = 0; 133 unsigned long cnt = 0;
133/* FIXME: */ 134 unsigned long over = ring_buffer_overruns(iter->tr->buffer);
134#if 0 135
135 for_each_online_cpu(cpu) { 136 if (over > prev_overruns)
136 cnt += iter->overrun[cpu]; 137 cnt = over - prev_overruns;
137 iter->overrun[cpu] = 0; 138 prev_overruns = over;
138 }
139#endif
140 (void)cpu;
141 return cnt; 139 return cnt;
142} 140}
143 141
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index be682b62fe58..3bdb44bde4b7 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -184,11 +184,16 @@ static struct file_operations stack_max_size_fops = {
184static void * 184static void *
185t_next(struct seq_file *m, void *v, loff_t *pos) 185t_next(struct seq_file *m, void *v, loff_t *pos)
186{ 186{
187 long i = (long)m->private; 187 long i;
188 188
189 (*pos)++; 189 (*pos)++;
190 190
191 i++; 191 if (v == SEQ_START_TOKEN)
192 i = 0;
193 else {
194 i = *(long *)v;
195 i++;
196 }
192 197
193 if (i >= max_stack_trace.nr_entries || 198 if (i >= max_stack_trace.nr_entries ||
194 stack_dump_trace[i] == ULONG_MAX) 199 stack_dump_trace[i] == ULONG_MAX)
@@ -201,12 +206,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
201 206
202static void *t_start(struct seq_file *m, loff_t *pos) 207static void *t_start(struct seq_file *m, loff_t *pos)
203{ 208{
204 void *t = &m->private; 209 void *t = SEQ_START_TOKEN;
205 loff_t l = 0; 210 loff_t l = 0;
206 211
207 local_irq_disable(); 212 local_irq_disable();
208 __raw_spin_lock(&max_stack_lock); 213 __raw_spin_lock(&max_stack_lock);
209 214
215 if (*pos == 0)
216 return SEQ_START_TOKEN;
217
210 for (; t && l < *pos; t = t_next(m, t, &l)) 218 for (; t && l < *pos; t = t_next(m, t, &l))
211 ; 219 ;
212 220
@@ -235,10 +243,10 @@ static int trace_lookup_stack(struct seq_file *m, long i)
235 243
236static int t_show(struct seq_file *m, void *v) 244static int t_show(struct seq_file *m, void *v)
237{ 245{
238 long i = *(long *)v; 246 long i;
239 int size; 247 int size;
240 248
241 if (i < 0) { 249 if (v == SEQ_START_TOKEN) {
242 seq_printf(m, " Depth Size Location" 250 seq_printf(m, " Depth Size Location"
243 " (%d entries)\n" 251 " (%d entries)\n"
244 " ----- ---- --------\n", 252 " ----- ---- --------\n",
@@ -246,6 +254,8 @@ static int t_show(struct seq_file *m, void *v)
246 return 0; 254 return 0;
247 } 255 }
248 256
257 i = *(long *)v;
258
249 if (i >= max_stack_trace.nr_entries || 259 if (i >= max_stack_trace.nr_entries ||
250 stack_dump_trace[i] == ULONG_MAX) 260 stack_dump_trace[i] == ULONG_MAX)
251 return 0; 261 return 0;
@@ -275,10 +285,6 @@ static int stack_trace_open(struct inode *inode, struct file *file)
275 int ret; 285 int ret;
276 286
277 ret = seq_open(file, &stack_trace_seq_ops); 287 ret = seq_open(file, &stack_trace_seq_ops);
278 if (!ret) {
279 struct seq_file *m = file->private_data;
280 m->private = (void *)-1;
281 }
282 288
283 return ret; 289 return ret;
284} 290}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba55..ae542e2e38d5 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,6 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 205
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 207}