aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@zytor.com>2009-08-26 20:17:51 -0400
committerH. Peter Anvin <hpa@zytor.com>2009-08-26 20:24:28 -0400
commitb855192c08fcb14adbc5d3a7cab182022d433cca (patch)
tree3a10cafbfbf98cafacf667eb218c71300b351bfa /kernel
parentd886c73cd4cf02a71e1650cbcb6176799d78aac1 (diff)
parent3e0e1e9c5a327d4dba8490d83ef55c0564e6e8a7 (diff)
Merge branch 'x86/urgent' into x86/pat
Reason: Change to is_new_memtype_allowed() in x86/urgent Resolved semantic conflicts in: arch/x86/mm/pat.c arch/x86/mm/ioremap.c Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c151
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c31
-rw-r--r--kernel/freezer.c7
-rw-r--r--kernel/futex.c29
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/hrtimer.c110
-rw-r--r--kernel/irq/internals.h3
-rw-r--r--kernel/irq/manage.c72
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c1
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/lockdep_proc.c3
-rw-r--r--kernel/module.c9
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/perf_counter.c842
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/power/user.c1
-rw-r--r--kernel/profile.c5
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/rcutree.c3
-rw-r--r--kernel/rtmutex.c4
-rw-r--r--kernel/sched.c61
-rw-r--r--kernel/sched_cpupri.c15
-rw-r--r--kernel/sched_fair.c45
-rw-r--r--kernel/sched_rt.c18
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c64
-rw-r--r--kernel/time/clockevents.c11
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/blktrace.c13
-rw-r--r--kernel/trace/ftrace.c28
-rw-r--r--kernel/trace/ring_buffer.c15
-rw-r--r--kernel/trace/trace.c14
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_event_profile.c2
-rw-r--r--kernel/trace/trace_event_types.h3
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_filter.c20
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_functions_graph.c11
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_stack.c11
-rw-r--r--kernel/trace/trace_stat.c34
-rw-r--r--kernel/wait.c5
54 files changed, 1143 insertions, 609 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a682cdf5..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
50 51
51#include <asm/atomic.h> 52#include <asm/atomic.h>
52 53
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
734 * reference to css->refcnt. In general, this refcnt is expected to goes down 735 * reference to css->refcnt. In general, this refcnt is expected to goes down
735 * to zero, soon. 736 * to zero, soon.
736 * 737 *
737 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
738 */ 739 */
739DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
740 741
741static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
742{ 743{
743 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
744 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
745} 746}
746 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
747static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
748 unsigned long final_bits) 761 unsigned long final_bits)
749{ 762{
@@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
960 INIT_LIST_HEAD(&cgrp->children); 973 INIT_LIST_HEAD(&cgrp->children);
961 INIT_LIST_HEAD(&cgrp->css_sets); 974 INIT_LIST_HEAD(&cgrp->css_sets);
962 INIT_LIST_HEAD(&cgrp->release_list); 975 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list);
963 init_rwsem(&cgrp->pids_mutex); 977 init_rwsem(&cgrp->pids_mutex);
964} 978}
965static void init_cgroup_root(struct cgroupfs_root *root) 979static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1357 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1358 * is no longer empty. 1372 * is no longer empty.
1359 */ 1373 */
1360 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1361 return 0; 1375 return 0;
1362} 1376}
1363 1377
@@ -2201,12 +2215,30 @@ err:
2201 return ret; 2215 return ret;
2202} 2216}
2203 2217
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
2227 /* The namepsace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
2231 /* How many files are using the this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2204static int cmppid(const void *a, const void *b) 2237static int cmppid(const void *a, const void *b)
2205{ 2238{
2206 return *(pid_t *)a - *(pid_t *)b; 2239 return *(pid_t *)a - *(pid_t *)b;
2207} 2240}
2208 2241
2209
2210/* 2242/*
2211 * seq_file methods for the "tasks" file. The seq_file position is the 2243 * seq_file methods for the "tasks" file. The seq_file position is the
2212 * next pid to display; the seq_file iterator is a pointer to the pid 2244 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2221 * after a seek to the start). Use a binary-search to find the 2253 * after a seek to the start). Use a binary-search to find the
2222 * next pid to display, if any 2254 * next pid to display, if any
2223 */ 2255 */
2224 struct cgroup *cgrp = s->private; 2256 struct cgroup_pids *cp = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2225 int index = 0, pid = *pos; 2258 int index = 0, pid = *pos;
2226 int *iter; 2259 int *iter;
2227 2260
2228 down_read(&cgrp->pids_mutex); 2261 down_read(&cgrp->pids_mutex);
2229 if (pid) { 2262 if (pid) {
2230 int end = cgrp->pids_length; 2263 int end = cp->length;
2231 2264
2232 while (index < end) { 2265 while (index < end) {
2233 int mid = (index + end) / 2; 2266 int mid = (index + end) / 2;
2234 if (cgrp->tasks_pids[mid] == pid) { 2267 if (cp->tasks_pids[mid] == pid) {
2235 index = mid; 2268 index = mid;
2236 break; 2269 break;
2237 } else if (cgrp->tasks_pids[mid] <= pid) 2270 } else if (cp->tasks_pids[mid] <= pid)
2238 index = mid + 1; 2271 index = mid + 1;
2239 else 2272 else
2240 end = mid; 2273 end = mid;
2241 } 2274 }
2242 } 2275 }
2243 /* If we're off the end of the array, we're done */ 2276 /* If we're off the end of the array, we're done */
2244 if (index >= cgrp->pids_length) 2277 if (index >= cp->length)
2245 return NULL; 2278 return NULL;
2246 /* Update the abstract position to be the actual pid that we found */ 2279 /* Update the abstract position to be the actual pid that we found */
2247 iter = cgrp->tasks_pids + index; 2280 iter = cp->tasks_pids + index;
2248 *pos = *iter; 2281 *pos = *iter;
2249 return iter; 2282 return iter;
2250} 2283}
2251 2284
2252static void cgroup_tasks_stop(struct seq_file *s, void *v) 2285static void cgroup_tasks_stop(struct seq_file *s, void *v)
2253{ 2286{
2254 struct cgroup *cgrp = s->private; 2287 struct cgroup_pids *cp = s->private;
2288 struct cgroup *cgrp = cp->cgrp;
2255 up_read(&cgrp->pids_mutex); 2289 up_read(&cgrp->pids_mutex);
2256} 2290}
2257 2291
2258static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2259{ 2293{
2260 struct cgroup *cgrp = s->private; 2294 struct cgroup_pids *cp = s->private;
2261 int *p = v; 2295 int *p = v;
2262 int *end = cgrp->tasks_pids + cgrp->pids_length; 2296 int *end = cp->tasks_pids + cp->length;
2263 2297
2264 /* 2298 /*
2265 * Advance to the next pid in the array. If this goes off the 2299 * Advance to the next pid in the array. If this goes off the
@@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2286 .show = cgroup_tasks_show, 2320 .show = cgroup_tasks_show,
2287}; 2321};
2288 2322
2289static void release_cgroup_pid_array(struct cgroup *cgrp) 2323static void release_cgroup_pid_array(struct cgroup_pids *cp)
2290{ 2324{
2325 struct cgroup *cgrp = cp->cgrp;
2326
2291 down_write(&cgrp->pids_mutex); 2327 down_write(&cgrp->pids_mutex);
2292 BUG_ON(!cgrp->pids_use_count); 2328 BUG_ON(!cp->use_count);
2293 if (!--cgrp->pids_use_count) { 2329 if (!--cp->use_count) {
2294 kfree(cgrp->tasks_pids); 2330 list_del(&cp->list);
2295 cgrp->tasks_pids = NULL; 2331 put_pid_ns(cp->ns);
2296 cgrp->pids_length = 0; 2332 kfree(cp->tasks_pids);
2333 kfree(cp);
2297 } 2334 }
2298 up_write(&cgrp->pids_mutex); 2335 up_write(&cgrp->pids_mutex);
2299} 2336}
2300 2337
2301static int cgroup_tasks_release(struct inode *inode, struct file *file) 2338static int cgroup_tasks_release(struct inode *inode, struct file *file)
2302{ 2339{
2303 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2340 struct seq_file *seq;
2341 struct cgroup_pids *cp;
2304 2342
2305 if (!(file->f_mode & FMODE_READ)) 2343 if (!(file->f_mode & FMODE_READ))
2306 return 0; 2344 return 0;
2307 2345
2308 release_cgroup_pid_array(cgrp); 2346 seq = file->private_data;
2347 cp = seq->private;
2348
2349 release_cgroup_pid_array(cp);
2309 return seq_release(inode, file); 2350 return seq_release(inode, file);
2310} 2351}
2311 2352
@@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
2324static int cgroup_tasks_open(struct inode *unused, struct file *file) 2365static int cgroup_tasks_open(struct inode *unused, struct file *file)
2325{ 2366{
2326 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns;
2369 struct cgroup_pids *cp;
2327 pid_t *pidarray; 2370 pid_t *pidarray;
2328 int npids; 2371 int npids;
2329 int retval; 2372 int retval;
@@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2350 * array if necessary 2393 * array if necessary
2351 */ 2394 */
2352 down_write(&cgrp->pids_mutex); 2395 down_write(&cgrp->pids_mutex);
2353 kfree(cgrp->tasks_pids); 2396
2354 cgrp->tasks_pids = pidarray; 2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2355 cgrp->pids_length = npids; 2398 if (ns == cp->ns)
2356 cgrp->pids_use_count++; 2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2357 up_write(&cgrp->pids_mutex); 2417 up_write(&cgrp->pids_mutex);
2358 2418
2359 file->f_op = &cgroup_tasks_operations; 2419 file->f_op = &cgroup_tasks_operations;
2360 2420
2361 retval = seq_open(file, &cgroup_tasks_seq_operations); 2421 retval = seq_open(file, &cgroup_tasks_seq_operations);
2362 if (retval) { 2422 if (retval) {
2363 release_cgroup_pid_array(cgrp); 2423 release_cgroup_pid_array(cp);
2364 return retval; 2424 return retval;
2365 } 2425 }
2366 ((struct seq_file *)file->private_data)->private = cgrp; 2426 ((struct seq_file *)file->private_data)->private = cp;
2367 return 0; 2427 return 0;
2368} 2428}
2369 2429
@@ -2696,33 +2756,42 @@ again:
2696 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2697 2757
2698 /* 2758 /*
2759 * In general, subsystem has no css->refcnt after pre_destroy(). But
2760 * in racy cases, subsystem may have to get css->refcnt after
2761 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
2762 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
2763 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
2764 * and subsystem's reference count handling. Please see css_get/put
2765 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2699 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2700 * that rmdir() request comes. 2771 * that rmdir() request comes.
2701 */ 2772 */
2702 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2703 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2704 return ret; 2776 return ret;
2777 }
2705 2778
2706 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2707 parent = cgrp->parent; 2780 parent = cgrp->parent;
2708 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2709 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2710 return -EBUSY; 2784 return -EBUSY;
2711 } 2785 }
2712 /*
2713 * css_put/get is provided for subsys to grab refcnt to css. In typical
2714 * case, subsystem has no reference after pre_destroy(). But, under
2715 * hierarchy management, some *temporal* refcnt can be hold.
2716 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2717 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2718 * is called when css_put() is called and refcnt goes down to 0.
2719 */
2720 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2721 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2722
2723 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2724 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2725 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2726 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2727 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2728 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3294 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3295 check_for_release(cgrp); 3364 check_for_release(cgrp);
3296 } 3365 }
3297 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3298 } 3367 }
3299 rcu_read_unlock(); 3368 rcu_read_unlock();
3300} 3369}
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f0dd54..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0aa..021e1138556e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -427,6 +426,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
427 init_rwsem(&mm->mmap_sem); 426 init_rwsem(&mm->mmap_sem);
428 INIT_LIST_HEAD(&mm->mmlist); 427 INIT_LIST_HEAD(&mm->mmlist);
429 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 428 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
429 mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0;
430 mm->core_state = NULL; 430 mm->core_state = NULL;
431 mm->nr_ptes = 0; 431 mm->nr_ptes = 0;
432 set_mm_counter(mm, file_rss, 0); 432 set_mm_counter(mm, file_rss, 0);
@@ -568,18 +568,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 568 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 569 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 570 */
571 if (tsk->clear_child_tid 571 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 572 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 573 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 574 /*
575 * We don't check the error code - if userspace has
576 * not set up a proper pointer then tough luck.
577 */
578 put_user(0, tsk->clear_child_tid);
579 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
580 1, NULL, NULL, 0);
581 }
575 tsk->clear_child_tid = NULL; 582 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 583 }
584} 584}
585 585
@@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1269 write_unlock_irq(&tasklist_lock); 1269 write_unlock_irq(&tasklist_lock);
1270 proc_fork_connector(p); 1270 proc_fork_connector(p);
1271 cgroup_post_fork(p); 1271 cgroup_post_fork(p);
1272 perf_counter_fork(p);
1272 return p; 1273 return p;
1273 1274
1274bad_fork_free_pid: 1275bad_fork_free_pid:
@@ -1408,12 +1409,6 @@ long do_fork(unsigned long clone_flags,
1408 if (clone_flags & CLONE_VFORK) { 1409 if (clone_flags & CLONE_VFORK) {
1409 p->vfork_done = &vfork; 1410 p->vfork_done = &vfork;
1410 init_completion(&vfork); 1411 init_completion(&vfork);
1411 } else if (!(clone_flags & CLONE_VM)) {
1412 /*
1413 * vfork will do an exec which will call
1414 * set_task_comm()
1415 */
1416 perf_counter_fork(p);
1417 } 1412 }
1418 1413
1419 audit_finish_fork(p); 1414 audit_finish_fork(p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
diff --git a/kernel/futex.c b/kernel/futex.c
index 794c862125fe..e18cfbdc7190 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -247,6 +247,7 @@ again:
247 if (err < 0) 247 if (err < 0)
248 return err; 248 return err;
249 249
250 page = compound_head(page);
250 lock_page(page); 251 lock_page(page);
251 if (!page->mapping) { 252 if (!page->mapping) {
252 unlock_page(page); 253 unlock_page(page);
@@ -1009,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1009 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1010 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1010 * q: the futex_q 1011 * q: the futex_q
1011 * key: the key of the requeue target futex 1012 * key: the key of the requeue target futex
1013 * hb: the hash_bucket of the requeue target futex
1012 * 1014 *
1013 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1015 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1014 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1016 * target futex if it is uncontended or via a lock steal. Set the futex_q key
1015 * to the requeue target futex so the waiter can detect the wakeup on the right 1017 * to the requeue target futex so the waiter can detect the wakeup on the right
1016 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1018 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1017 * atomic lock acquisition. Must be called with the q->lock_ptr held. 1019 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1020 * to protect access to the pi_state to fixup the owner later. Must be called
1021 * with both q->lock_ptr and hb->lock held.
1018 */ 1022 */
1019static inline 1023static inline
1020void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) 1024void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1025 struct futex_hash_bucket *hb)
1021{ 1026{
1022 drop_futex_key_refs(&q->key); 1027 drop_futex_key_refs(&q->key);
1023 get_futex_key_refs(key); 1028 get_futex_key_refs(key);
@@ -1029,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1029 WARN_ON(!q->rt_waiter); 1034 WARN_ON(!q->rt_waiter);
1030 q->rt_waiter = NULL; 1035 q->rt_waiter = NULL;
1031 1036
1037 q->lock_ptr = &hb->lock;
1038#ifdef CONFIG_DEBUG_PI_LIST
1039 q->list.plist.lock = &hb->lock;
1040#endif
1041
1032 wake_up_state(q->task, TASK_NORMAL); 1042 wake_up_state(q->task, TASK_NORMAL);
1033} 1043}
1034 1044
@@ -1087,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1087 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1097 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1088 set_waiters); 1098 set_waiters);
1089 if (ret == 1) 1099 if (ret == 1)
1090 requeue_pi_wake_futex(top_waiter, key2); 1100 requeue_pi_wake_futex(top_waiter, key2, hb2);
1091 1101
1092 return ret; 1102 return ret;
1093} 1103}
@@ -1246,8 +1256,15 @@ retry_private:
1246 if (!match_futex(&this->key, &key1)) 1256 if (!match_futex(&this->key, &key1))
1247 continue; 1257 continue;
1248 1258
1249 WARN_ON(!requeue_pi && this->rt_waiter); 1259 /*
1250 WARN_ON(requeue_pi && !this->rt_waiter); 1260 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1261 * be paired with each other and no other futex ops.
1262 */
1263 if ((requeue_pi && !this->rt_waiter) ||
1264 (!requeue_pi && this->rt_waiter)) {
1265 ret = -EINVAL;
1266 break;
1267 }
1251 1268
1252 /* 1269 /*
1253 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1270 * Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1272,7 +1289,7 @@ retry_private:
1272 this->task, 1); 1289 this->task, 1);
1273 if (ret == 1) { 1290 if (ret == 1) {
1274 /* We got the lock. */ 1291 /* We got the lock. */
1275 requeue_pi_wake_futex(this, &key2); 1292 requeue_pi_wake_futex(this, &key2, hb2);
1276 continue; 1293 continue;
1277 } else if (ret) { 1294 } else if (ret) {
1278 /* -EDEADLK */ 1295 /* -EDEADLK */
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9002958a96e7..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
191 } 191 }
192} 192}
193 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
194/* 234/*
195 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
196 */ 236 */
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
200{ 240{
201 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1; 243 int this_cpu = smp_processor_id();
204 244 int cpu = hrtimer_get_target(this_cpu, pinned);
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
213 245
214again: 246again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
217 249
218 if (base != new_base) { 250 if (base != new_base) {
219 /* 251 /*
220 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move timer to new_base.
221 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
222 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
223 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
233 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
234 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
235 267
236 /* Optimized away for NOHZ=n SMP=n */ 268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
237 if (cpu == preferred_cpu) { 269 cpu = this_cpu;
238 /* Calculate clock monotonic expiry time */ 270 spin_unlock(&new_base->cpu_base->lock);
239#ifdef CONFIG_HIGH_RES_TIMERS 271 spin_lock(&base->cpu_base->lock);
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), 272 timer->base = base;
241 new_base->offset); 273 goto again;
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 } 274 }
269 timer->base = new_base; 275 timer->base = new_base;
270 } 276 }
@@ -1276,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1276 1282
1277 expires_next.tv64 = KTIME_MAX; 1283 expires_next.tv64 = KTIME_MAX;
1278 1284
1285 spin_lock(&cpu_base->lock);
1286 /*
1287 * We set expires_next to KTIME_MAX here with cpu_base->lock
1288 * held to prevent that a timer is enqueued in our queue via
1289 * the migration code. This does not affect enqueueing of
1290 * timers which run their callback and need to be requeued on
1291 * this CPU.
1292 */
1293 cpu_base->expires_next.tv64 = KTIME_MAX;
1294
1279 base = cpu_base->clock_base; 1295 base = cpu_base->clock_base;
1280 1296
1281 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1282 ktime_t basenow; 1298 ktime_t basenow;
1283 struct rb_node *node; 1299 struct rb_node *node;
1284 1300
1285 spin_lock(&cpu_base->lock);
1286
1287 basenow = ktime_add(now, base->offset); 1301 basenow = ktime_add(now, base->offset);
1288 1302
1289 while ((node = base->first)) { 1303 while ((node = base->first)) {
@@ -1316,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1316 1330
1317 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1318 } 1332 }
1319 spin_unlock(&cpu_base->lock);
1320 base++; 1333 base++;
1321 } 1334 }
1322 1335
1336 /*
1337 * Store the new expiry value so the migration code can verify
1338 * against it.
1339 */
1323 cpu_base->expires_next = expires_next; 1340 cpu_base->expires_next = expires_next;
1341 spin_unlock(&cpu_base->lock);
1324 1342
1325 /* Reprogramming necessary ? */ 1343 /* Reprogramming necessary ? */
1326 if (expires_next.tv64 != KTIME_MAX) { 1344 if (expires_next.tv64 != KTIME_MAX) {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 73468253143b..e70ed5592eb9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47 46
48/* 47/*
49 * Debugging printout: 48 * Debugging printout:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 50da67672901..d222515a5a06 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
85 * @desc: irq descriptor which has affitnity changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We can not call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
112 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 if (!desc->chip->set_affinity(irq, cpumask)) { 121 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask); 122 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask); 123 irq_set_thread_affinity(desc);
116 } 124 }
117 } 125 }
118 else { 126 else {
@@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
122#else 130#else
123 if (!desc->chip->set_affinity(irq, cpumask)) { 131 if (!desc->chip->set_affinity(irq, cpumask)) {
124 cpumask_copy(desc->affinity, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask); 133 irq_set_thread_affinity(desc);
126 } 134 }
127#endif 135#endif
128 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
@@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
176 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
177 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
178 if (!ret) 186 if (!ret)
179 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
180 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
181 189
182 return ret; 190 return ret;
@@ -443,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
443 return -1; 451 return -1;
444} 452}
445 453
454#ifdef CONFIG_SMP
455/*
456 * Check whether we need to change the affinity of the interrupt thread.
457 */
458static void
459irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
460{
461 cpumask_var_t mask;
462
463 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
464 return;
465
466 /*
467 * In case we are out of memory we set IRQTF_AFFINITY again and
468 * try again next time
469 */
470 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
471 set_bit(IRQTF_AFFINITY, &action->thread_flags);
472 return;
473 }
474
475 spin_lock_irq(&desc->lock);
476 cpumask_copy(mask, desc->affinity);
477 spin_unlock_irq(&desc->lock);
478
479 set_cpus_allowed_ptr(current, mask);
480 free_cpumask_var(mask);
481}
482#else
483static inline void
484irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
485#endif
486
446/* 487/*
447 * Interrupt handler thread 488 * Interrupt handler thread
448 */ 489 */
@@ -458,6 +499,8 @@ static int irq_thread(void *data)
458 499
459 while (!irq_wait_for_interrupt(action)) { 500 while (!irq_wait_for_interrupt(action)) {
460 501
502 irq_thread_check_affinity(desc, action);
503
461 atomic_inc(&desc->threads_active); 504 atomic_inc(&desc->threads_active);
462 505
463 spin_lock_irq(&desc->lock); 506 spin_lock_irq(&desc->lock);
@@ -718,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
718{ 761{
719 struct irq_desc *desc = irq_to_desc(irq); 762 struct irq_desc *desc = irq_to_desc(irq);
720 struct irqaction *action, **action_ptr; 763 struct irqaction *action, **action_ptr;
721 struct task_struct *irqthread;
722 unsigned long flags; 764 unsigned long flags;
723 765
724 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 766 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -766,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
766 desc->chip->disable(irq); 808 desc->chip->disable(irq);
767 } 809 }
768 810
769 irqthread = action->thread;
770 action->thread = NULL;
771
772 spin_unlock_irqrestore(&desc->lock, flags); 811 spin_unlock_irqrestore(&desc->lock, flags);
773 812
774 unregister_handler_proc(irq, action); 813 unregister_handler_proc(irq, action);
@@ -776,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
776 /* Make sure it's not being used on another CPU: */ 815 /* Make sure it's not being used on another CPU: */
777 synchronize_irq(irq); 816 synchronize_irq(irq);
778 817
779 if (irqthread) {
780 if (!test_bit(IRQTF_DIED, &action->thread_flags))
781 kthread_stop(irqthread);
782 put_task_struct(irqthread);
783 }
784
785#ifdef CONFIG_DEBUG_SHIRQ 818#ifdef CONFIG_DEBUG_SHIRQ
786 /* 819 /*
787 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 820 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -797,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
797 local_irq_restore(flags); 830 local_irq_restore(flags);
798 } 831 }
799#endif 832#endif
833
834 if (action->thread) {
835 if (!test_bit(IRQTF_DIED, &action->thread_flags))
836 kthread_stop(action->thread);
837 put_task_struct(action->thread);
838 }
839
800 return action; 840 return action;
801} 841}
802 842
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index cfe767ca1545..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -45,7 +45,7 @@ void move_masked_irq(int irq)
45 < nr_cpu_ids)) 45 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
47 cpumask_copy(desc->affinity, desc->pending_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc, desc->pending_mask); 48 irq_set_thread_affinity(desc);
49 } 49 }
50 50
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
107 107
108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
109{ 109{
110 /* those all static, do move them */ 110 /* those static or target node is -1, do not move them */
111 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
112 return desc; 112 return desc;
113 113
114 if (desc->node != node) 114 if (desc->node != node)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ae1c35201cc8..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..0540948e29ab 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
237{ 237{
238 struct kprobe_insn_page *kip; 238 struct kprobe_insn_page *kip;
239 struct hlist_node *pos, *next; 239 struct hlist_node *pos, *next;
240 int safety;
241 240
242 /* Ensure no-one is preepmted on the garbages */ 241 /* Ensure no-one is preepmted on the garbages */
243 mutex_unlock(&kprobe_insn_mutex); 242 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 243 return -EAGAIN;
248 244
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 694 p->addr = addr;
699 695
700 preempt_disable(); 696 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 697 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 698 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 699 preempt_enable();
704 return -EINVAL; 700 return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9b1a7de26979..eb8751aa0418 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind);
180 * @k: thread created by kthread_create(). 180 * @k: thread created by kthread_create().
181 * 181 *
182 * Sets kthread_should_stop() for @k to return true, wakes it, and 182 * Sets kthread_should_stop() for @k to return true, wakes it, and
183 * waits for it to exit. Your threadfn() must not call do_exit() 183 * waits for it to exit. This can also be called after kthread_create()
184 * itself if you use this function! This can also be called after 184 * instead of calling wake_up_process(): the thread will exit without
185 * kthread_create() instead of calling wake_up_process(): the thread 185 * calling threadfn().
186 * will exit without calling threadfn(). 186 *
187 * If threadfn() may call do_exit() itself, the caller must ensure
188 * task_struct can't go away.
187 * 189 *
188 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 190 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
189 * was never called. 191 * was never called.
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..e94caa666dba 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 758 &proc_lockdep_stats_operations);
759 759
760#ifdef CONFIG_LOCK_STAT 760#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 761 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
762 &proc_lock_stat_operations);
762#endif 763#endif
763 764
764 return 0; 765 return 0;
diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2b..fd1411403558 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1068{ 1068{
1069 const unsigned long *crc; 1069 const unsigned long *crc;
1070 1070
1071 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1071 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1072 &crc, true, false))
1072 BUG(); 1073 BUG();
1073 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1074 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1074} 1075}
@@ -2451,9 +2452,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2451 return ret; 2452 return ret;
2452 } 2453 }
2453 if (ret > 0) { 2454 if (ret > 0) {
2454 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2455 printk(KERN_WARNING
2455 "it should follow 0/-E convention\n" 2456"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2456 KERN_WARNING "%s: loading module anyway...\n", 2457"%s: loading module anyway...\n",
2457 __func__, mod->name, ret, 2458 __func__, mod->name, ret,
2458 __func__); 2459 __func__);
2459 dump_stack(); 2460 dump_stack();
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50da2347..534e20d14d63 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
42static atomic_t nr_counters __read_mostly; 42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly; 43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly; 44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
45 46
46/* 47/*
47 * perf counter paranoia level: 48 * perf counter paranoia level:
@@ -87,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); } 88void __weak hw_perf_enable(void) { barrier(); }
88 89
89void __weak hw_perf_counter_setup(int cpu) { barrier(); } 90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
90 92
91int __weak 93int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader, 94hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -146,6 +148,28 @@ static void put_ctx(struct perf_counter_context *ctx)
146 } 148 }
147} 149}
148 150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
149/* 173/*
150 * Get the perf_counter_context for a task and lock it. 174 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with with the fact that until it is locked, 175 * This has to cope with with the fact that until it is locked,
@@ -283,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
283 return; 307 return;
284 308
285 counter->state = PERF_COUNTER_STATE_INACTIVE; 309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
286 counter->tstamp_stopped = ctx->time; 314 counter->tstamp_stopped = ctx->time;
287 counter->pmu->disable(counter); 315 counter->pmu->disable(counter);
288 counter->oncpu = -1; 316 counter->oncpu = -1;
@@ -1081,7 +1109,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1081 __perf_counter_sync_stat(counter, next_counter); 1109 __perf_counter_sync_stat(counter, next_counter);
1082 1110
1083 counter = list_next_entry(counter, event_entry); 1111 counter = list_next_entry(counter, event_entry);
1084 next_counter = list_next_entry(counter, event_entry); 1112 next_counter = list_next_entry(next_counter, event_entry);
1085 } 1113 }
1086} 1114}
1087 1115
@@ -1288,7 +1316,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1288#define MAX_INTERRUPTS (~0ULL) 1316#define MAX_INTERRUPTS (~0ULL)
1289 1317
1290static void perf_log_throttle(struct perf_counter *counter, int enable); 1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1291static void perf_log_period(struct perf_counter *counter, u64 period);
1292 1319
1293static void perf_adjust_period(struct perf_counter *counter, u64 events) 1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1294{ 1321{
@@ -1307,8 +1334,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
1307 if (!sample_period) 1334 if (!sample_period)
1308 sample_period = 1; 1335 sample_period = 1;
1309 1336
1310 perf_log_period(counter, sample_period);
1311
1312 hwc->sample_period = sample_period; 1337 hwc->sample_period = sample_period;
1313} 1338}
1314 1339
@@ -1463,10 +1488,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
1463 /* 1488 /*
1464 * Unclone this context if we enabled any counter. 1489 * Unclone this context if we enabled any counter.
1465 */ 1490 */
1466 if (enabled && ctx->parent_ctx) { 1491 if (enabled)
1467 put_ctx(ctx->parent_ctx); 1492 unclone_ctx(ctx);
1468 ctx->parent_ctx = NULL;
1469 }
1470 1493
1471 spin_unlock(&ctx->lock); 1494 spin_unlock(&ctx->lock);
1472 1495
@@ -1526,7 +1549,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1526 1549
1527static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1550static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1528{ 1551{
1529 struct perf_counter_context *parent_ctx;
1530 struct perf_counter_context *ctx; 1552 struct perf_counter_context *ctx;
1531 struct perf_cpu_context *cpuctx; 1553 struct perf_cpu_context *cpuctx;
1532 struct task_struct *task; 1554 struct task_struct *task;
@@ -1586,11 +1608,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1586 retry: 1608 retry:
1587 ctx = perf_lock_task_context(task, &flags); 1609 ctx = perf_lock_task_context(task, &flags);
1588 if (ctx) { 1610 if (ctx) {
1589 parent_ctx = ctx->parent_ctx; 1611 unclone_ctx(ctx);
1590 if (parent_ctx) {
1591 put_ctx(parent_ctx);
1592 ctx->parent_ctx = NULL; /* no longer a clone */
1593 }
1594 spin_unlock_irqrestore(&ctx->lock, flags); 1612 spin_unlock_irqrestore(&ctx->lock, flags);
1595 } 1613 }
1596 1614
@@ -1642,6 +1660,8 @@ static void free_counter(struct perf_counter *counter)
1642 atomic_dec(&nr_mmap_counters); 1660 atomic_dec(&nr_mmap_counters);
1643 if (counter->attr.comm) 1661 if (counter->attr.comm)
1644 atomic_dec(&nr_comm_counters); 1662 atomic_dec(&nr_comm_counters);
1663 if (counter->attr.task)
1664 atomic_dec(&nr_task_counters);
1645 } 1665 }
1646 1666
1647 if (counter->destroy) 1667 if (counter->destroy)
@@ -1676,14 +1696,133 @@ static int perf_release(struct inode *inode, struct file *file)
1676 return 0; 1696 return 0;
1677} 1697}
1678 1698
1699static int perf_counter_read_size(struct perf_counter *counter)
1700{
1701 int entry = sizeof(u64); /* value */
1702 int size = 0;
1703 int nr = 1;
1704
1705 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1706 size += sizeof(u64);
1707
1708 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1709 size += sizeof(u64);
1710
1711 if (counter->attr.read_format & PERF_FORMAT_ID)
1712 entry += sizeof(u64);
1713
1714 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1715 nr += counter->group_leader->nr_siblings;
1716 size += sizeof(u64);
1717 }
1718
1719 size += entry * nr;
1720
1721 return size;
1722}
1723
1724static u64 perf_counter_read_value(struct perf_counter *counter)
1725{
1726 struct perf_counter *child;
1727 u64 total = 0;
1728
1729 total += perf_counter_read(counter);
1730 list_for_each_entry(child, &counter->child_list, child_list)
1731 total += perf_counter_read(child);
1732
1733 return total;
1734}
1735
1736static int perf_counter_read_entry(struct perf_counter *counter,
1737 u64 read_format, char __user *buf)
1738{
1739 int n = 0, count = 0;
1740 u64 values[2];
1741
1742 values[n++] = perf_counter_read_value(counter);
1743 if (read_format & PERF_FORMAT_ID)
1744 values[n++] = primary_counter_id(counter);
1745
1746 count = n * sizeof(u64);
1747
1748 if (copy_to_user(buf, values, count))
1749 return -EFAULT;
1750
1751 return count;
1752}
1753
1754static int perf_counter_read_group(struct perf_counter *counter,
1755 u64 read_format, char __user *buf)
1756{
1757 struct perf_counter *leader = counter->group_leader, *sub;
1758 int n = 0, size = 0, err = -EFAULT;
1759 u64 values[3];
1760
1761 values[n++] = 1 + leader->nr_siblings;
1762 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1763 values[n++] = leader->total_time_enabled +
1764 atomic64_read(&leader->child_total_time_enabled);
1765 }
1766 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1767 values[n++] = leader->total_time_running +
1768 atomic64_read(&leader->child_total_time_running);
1769 }
1770
1771 size = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, size))
1774 return -EFAULT;
1775
1776 err = perf_counter_read_entry(leader, read_format, buf + size);
1777 if (err < 0)
1778 return err;
1779
1780 size += err;
1781
1782 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1783 err = perf_counter_read_entry(counter, read_format,
1784 buf + size);
1785 if (err < 0)
1786 return err;
1787
1788 size += err;
1789 }
1790
1791 return size;
1792}
1793
1794static int perf_counter_read_one(struct perf_counter *counter,
1795 u64 read_format, char __user *buf)
1796{
1797 u64 values[4];
1798 int n = 0;
1799
1800 values[n++] = perf_counter_read_value(counter);
1801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1802 values[n++] = counter->total_time_enabled +
1803 atomic64_read(&counter->child_total_time_enabled);
1804 }
1805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1806 values[n++] = counter->total_time_running +
1807 atomic64_read(&counter->child_total_time_running);
1808 }
1809 if (read_format & PERF_FORMAT_ID)
1810 values[n++] = primary_counter_id(counter);
1811
1812 if (copy_to_user(buf, values, n * sizeof(u64)))
1813 return -EFAULT;
1814
1815 return n * sizeof(u64);
1816}
1817
1679/* 1818/*
1680 * Read the performance counter - simple non blocking version for now 1819 * Read the performance counter - simple non blocking version for now
1681 */ 1820 */
1682static ssize_t 1821static ssize_t
1683perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1822perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1684{ 1823{
1685 u64 values[4]; 1824 u64 read_format = counter->attr.read_format;
1686 int n; 1825 int ret;
1687 1826
1688 /* 1827 /*
1689 * Return end-of-file for a read on a counter that is in 1828 * Return end-of-file for a read on a counter that is in
@@ -1693,28 +1832,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1693 if (counter->state == PERF_COUNTER_STATE_ERROR) 1832 if (counter->state == PERF_COUNTER_STATE_ERROR)
1694 return 0; 1833 return 0;
1695 1834
1835 if (count < perf_counter_read_size(counter))
1836 return -ENOSPC;
1837
1696 WARN_ON_ONCE(counter->ctx->parent_ctx); 1838 WARN_ON_ONCE(counter->ctx->parent_ctx);
1697 mutex_lock(&counter->child_mutex); 1839 mutex_lock(&counter->child_mutex);
1698 values[0] = perf_counter_read(counter); 1840 if (read_format & PERF_FORMAT_GROUP)
1699 n = 1; 1841 ret = perf_counter_read_group(counter, read_format, buf);
1700 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1842 else
1701 values[n++] = counter->total_time_enabled + 1843 ret = perf_counter_read_one(counter, read_format, buf);
1702 atomic64_read(&counter->child_total_time_enabled);
1703 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1704 values[n++] = counter->total_time_running +
1705 atomic64_read(&counter->child_total_time_running);
1706 if (counter->attr.read_format & PERF_FORMAT_ID)
1707 values[n++] = counter->id;
1708 mutex_unlock(&counter->child_mutex); 1844 mutex_unlock(&counter->child_mutex);
1709 1845
1710 if (count < n * sizeof(u64)) 1846 return ret;
1711 return -EINVAL;
1712 count = n * sizeof(u64);
1713
1714 if (copy_to_user(buf, values, count))
1715 return -EFAULT;
1716
1717 return count;
1718} 1847}
1719 1848
1720static ssize_t 1849static ssize_t
@@ -1811,8 +1940,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1811 1940
1812 counter->attr.sample_freq = value; 1941 counter->attr.sample_freq = value;
1813 } else { 1942 } else {
1814 perf_log_period(counter, value);
1815
1816 counter->attr.sample_period = value; 1943 counter->attr.sample_period = value;
1817 counter->hw.sample_period = value; 1944 counter->hw.sample_period = value;
1818 } 1945 }
@@ -2020,7 +2147,7 @@ fail:
2020 2147
2021static void perf_mmap_free_page(unsigned long addr) 2148static void perf_mmap_free_page(unsigned long addr)
2022{ 2149{
2023 struct page *page = virt_to_page(addr); 2150 struct page *page = virt_to_page((void *)addr);
2024 2151
2025 page->mapping = NULL; 2152 page->mapping = NULL;
2026 __free_page(page); 2153 __free_page(page);
@@ -2220,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
2220 2347
2221 if (counter->pending_disable) { 2348 if (counter->pending_disable) {
2222 counter->pending_disable = 0; 2349 counter->pending_disable = 0;
2223 perf_counter_disable(counter); 2350 __perf_counter_disable(counter);
2224 } 2351 }
2225 2352
2226 if (counter->pending_wakeup) { 2353 if (counter->pending_wakeup) {
@@ -2605,7 +2732,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2605 return task_pid_nr_ns(p, counter->ns); 2732 return task_pid_nr_ns(p, counter->ns);
2606} 2733}
2607 2734
2608static void perf_counter_output(struct perf_counter *counter, int nmi, 2735static void perf_output_read_one(struct perf_output_handle *handle,
2736 struct perf_counter *counter)
2737{
2738 u64 read_format = counter->attr.read_format;
2739 u64 values[4];
2740 int n = 0;
2741
2742 values[n++] = atomic64_read(&counter->count);
2743 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2744 values[n++] = counter->total_time_enabled +
2745 atomic64_read(&counter->child_total_time_enabled);
2746 }
2747 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2748 values[n++] = counter->total_time_running +
2749 atomic64_read(&counter->child_total_time_running);
2750 }
2751 if (read_format & PERF_FORMAT_ID)
2752 values[n++] = primary_counter_id(counter);
2753
2754 perf_output_copy(handle, values, n * sizeof(u64));
2755}
2756
2757/*
2758 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2759 */
2760static void perf_output_read_group(struct perf_output_handle *handle,
2761 struct perf_counter *counter)
2762{
2763 struct perf_counter *leader = counter->group_leader, *sub;
2764 u64 read_format = counter->attr.read_format;
2765 u64 values[5];
2766 int n = 0;
2767
2768 values[n++] = 1 + leader->nr_siblings;
2769
2770 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2771 values[n++] = leader->total_time_enabled;
2772
2773 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2774 values[n++] = leader->total_time_running;
2775
2776 if (leader != counter)
2777 leader->pmu->read(leader);
2778
2779 values[n++] = atomic64_read(&leader->count);
2780 if (read_format & PERF_FORMAT_ID)
2781 values[n++] = primary_counter_id(leader);
2782
2783 perf_output_copy(handle, values, n * sizeof(u64));
2784
2785 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2786 n = 0;
2787
2788 if (sub != counter)
2789 sub->pmu->read(sub);
2790
2791 values[n++] = atomic64_read(&sub->count);
2792 if (read_format & PERF_FORMAT_ID)
2793 values[n++] = primary_counter_id(sub);
2794
2795 perf_output_copy(handle, values, n * sizeof(u64));
2796 }
2797}
2798
2799static void perf_output_read(struct perf_output_handle *handle,
2800 struct perf_counter *counter)
2801{
2802 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2803 perf_output_read_group(handle, counter);
2804 else
2805 perf_output_read_one(handle, counter);
2806}
2807
2808void perf_counter_output(struct perf_counter *counter, int nmi,
2609 struct perf_sample_data *data) 2809 struct perf_sample_data *data)
2610{ 2810{
2611 int ret; 2811 int ret;
@@ -2616,10 +2816,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2616 struct { 2816 struct {
2617 u32 pid, tid; 2817 u32 pid, tid;
2618 } tid_entry; 2818 } tid_entry;
2619 struct {
2620 u64 id;
2621 u64 counter;
2622 } group_entry;
2623 struct perf_callchain_entry *callchain = NULL; 2819 struct perf_callchain_entry *callchain = NULL;
2624 int callchain_size = 0; 2820 int callchain_size = 0;
2625 u64 time; 2821 u64 time;
@@ -2661,19 +2857,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2661 if (sample_type & PERF_SAMPLE_ID) 2857 if (sample_type & PERF_SAMPLE_ID)
2662 header.size += sizeof(u64); 2858 header.size += sizeof(u64);
2663 2859
2860 if (sample_type & PERF_SAMPLE_STREAM_ID)
2861 header.size += sizeof(u64);
2862
2664 if (sample_type & PERF_SAMPLE_CPU) { 2863 if (sample_type & PERF_SAMPLE_CPU) {
2665 header.size += sizeof(cpu_entry); 2864 header.size += sizeof(cpu_entry);
2666 2865
2667 cpu_entry.cpu = raw_smp_processor_id(); 2866 cpu_entry.cpu = raw_smp_processor_id();
2867 cpu_entry.reserved = 0;
2668 } 2868 }
2669 2869
2670 if (sample_type & PERF_SAMPLE_PERIOD) 2870 if (sample_type & PERF_SAMPLE_PERIOD)
2671 header.size += sizeof(u64); 2871 header.size += sizeof(u64);
2672 2872
2673 if (sample_type & PERF_SAMPLE_GROUP) { 2873 if (sample_type & PERF_SAMPLE_READ)
2674 header.size += sizeof(u64) + 2874 header.size += perf_counter_read_size(counter);
2675 counter->nr_siblings * sizeof(group_entry);
2676 }
2677 2875
2678 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2876 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2679 callchain = perf_callchain(data->regs); 2877 callchain = perf_callchain(data->regs);
@@ -2685,6 +2883,18 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2685 header.size += sizeof(u64); 2883 header.size += sizeof(u64);
2686 } 2884 }
2687 2885
2886 if (sample_type & PERF_SAMPLE_RAW) {
2887 int size = sizeof(u32);
2888
2889 if (data->raw)
2890 size += data->raw->size;
2891 else
2892 size += sizeof(u32);
2893
2894 WARN_ON_ONCE(size & (sizeof(u64)-1));
2895 header.size += size;
2896 }
2897
2688 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2898 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2689 if (ret) 2899 if (ret)
2690 return; 2900 return;
@@ -2703,7 +2913,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2703 if (sample_type & PERF_SAMPLE_ADDR) 2913 if (sample_type & PERF_SAMPLE_ADDR)
2704 perf_output_put(&handle, data->addr); 2914 perf_output_put(&handle, data->addr);
2705 2915
2706 if (sample_type & PERF_SAMPLE_ID) 2916 if (sample_type & PERF_SAMPLE_ID) {
2917 u64 id = primary_counter_id(counter);
2918
2919 perf_output_put(&handle, id);
2920 }
2921
2922 if (sample_type & PERF_SAMPLE_STREAM_ID)
2707 perf_output_put(&handle, counter->id); 2923 perf_output_put(&handle, counter->id);
2708 2924
2709 if (sample_type & PERF_SAMPLE_CPU) 2925 if (sample_type & PERF_SAMPLE_CPU)
@@ -2712,26 +2928,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2712 if (sample_type & PERF_SAMPLE_PERIOD) 2928 if (sample_type & PERF_SAMPLE_PERIOD)
2713 perf_output_put(&handle, data->period); 2929 perf_output_put(&handle, data->period);
2714 2930
2715 /* 2931 if (sample_type & PERF_SAMPLE_READ)
2716 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 2932 perf_output_read(&handle, counter);
2717 */
2718 if (sample_type & PERF_SAMPLE_GROUP) {
2719 struct perf_counter *leader, *sub;
2720 u64 nr = counter->nr_siblings;
2721
2722 perf_output_put(&handle, nr);
2723
2724 leader = counter->group_leader;
2725 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2726 if (sub != counter)
2727 sub->pmu->read(sub);
2728
2729 group_entry.id = sub->id;
2730 group_entry.counter = atomic64_read(&sub->count);
2731
2732 perf_output_put(&handle, group_entry);
2733 }
2734 }
2735 2933
2736 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2737 if (callchain) 2935 if (callchain)
@@ -2742,6 +2940,22 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2742 } 2940 }
2743 } 2941 }
2744 2942
2943 if (sample_type & PERF_SAMPLE_RAW) {
2944 if (data->raw) {
2945 perf_output_put(&handle, data->raw->size);
2946 perf_output_copy(&handle, data->raw->data, data->raw->size);
2947 } else {
2948 struct {
2949 u32 size;
2950 u32 data;
2951 } raw = {
2952 .size = sizeof(u32),
2953 .data = 0,
2954 };
2955 perf_output_put(&handle, raw);
2956 }
2957 }
2958
2745 perf_output_end(&handle); 2959 perf_output_end(&handle);
2746} 2960}
2747 2961
@@ -2754,8 +2968,6 @@ struct perf_read_event {
2754 2968
2755 u32 pid; 2969 u32 pid;
2756 u32 tid; 2970 u32 tid;
2757 u64 value;
2758 u64 format[3];
2759}; 2971};
2760 2972
2761static void 2973static void
@@ -2767,87 +2979,74 @@ perf_counter_read_event(struct perf_counter *counter,
2767 .header = { 2979 .header = {
2768 .type = PERF_EVENT_READ, 2980 .type = PERF_EVENT_READ,
2769 .misc = 0, 2981 .misc = 0,
2770 .size = sizeof(event) - sizeof(event.format), 2982 .size = sizeof(event) + perf_counter_read_size(counter),
2771 }, 2983 },
2772 .pid = perf_counter_pid(counter, task), 2984 .pid = perf_counter_pid(counter, task),
2773 .tid = perf_counter_tid(counter, task), 2985 .tid = perf_counter_tid(counter, task),
2774 .value = atomic64_read(&counter->count),
2775 }; 2986 };
2776 int ret, i = 0; 2987 int ret;
2777
2778 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2779 event.header.size += sizeof(u64);
2780 event.format[i++] = counter->total_time_enabled;
2781 }
2782
2783 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2784 event.header.size += sizeof(u64);
2785 event.format[i++] = counter->total_time_running;
2786 }
2787
2788 if (counter->attr.read_format & PERF_FORMAT_ID) {
2789 u64 id;
2790
2791 event.header.size += sizeof(u64);
2792 if (counter->parent)
2793 id = counter->parent->id;
2794 else
2795 id = counter->id;
2796
2797 event.format[i++] = id;
2798 }
2799 2988
2800 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); 2989 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
2801 if (ret) 2990 if (ret)
2802 return; 2991 return;
2803 2992
2804 perf_output_copy(&handle, &event, event.header.size); 2993 perf_output_put(&handle, event);
2994 perf_output_read(&handle, counter);
2995
2805 perf_output_end(&handle); 2996 perf_output_end(&handle);
2806} 2997}
2807 2998
2808/* 2999/*
2809 * fork tracking 3000 * task tracking -- fork/exit
3001 *
3002 * enabled by: attr.comm | attr.mmap | attr.task
2810 */ 3003 */
2811 3004
2812struct perf_fork_event { 3005struct perf_task_event {
2813 struct task_struct *task; 3006 struct task_struct *task;
3007 struct perf_counter_context *task_ctx;
2814 3008
2815 struct { 3009 struct {
2816 struct perf_event_header header; 3010 struct perf_event_header header;
2817 3011
2818 u32 pid; 3012 u32 pid;
2819 u32 ppid; 3013 u32 ppid;
3014 u32 tid;
3015 u32 ptid;
2820 } event; 3016 } event;
2821}; 3017};
2822 3018
2823static void perf_counter_fork_output(struct perf_counter *counter, 3019static void perf_counter_task_output(struct perf_counter *counter,
2824 struct perf_fork_event *fork_event) 3020 struct perf_task_event *task_event)
2825{ 3021{
2826 struct perf_output_handle handle; 3022 struct perf_output_handle handle;
2827 int size = fork_event->event.header.size; 3023 int size = task_event->event.header.size;
2828 struct task_struct *task = fork_event->task; 3024 struct task_struct *task = task_event->task;
2829 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3025 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2830 3026
2831 if (ret) 3027 if (ret)
2832 return; 3028 return;
2833 3029
2834 fork_event->event.pid = perf_counter_pid(counter, task); 3030 task_event->event.pid = perf_counter_pid(counter, task);
2835 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 3031 task_event->event.ppid = perf_counter_pid(counter, current);
3032
3033 task_event->event.tid = perf_counter_tid(counter, task);
3034 task_event->event.ptid = perf_counter_tid(counter, current);
2836 3035
2837 perf_output_put(&handle, fork_event->event); 3036 perf_output_put(&handle, task_event->event);
2838 perf_output_end(&handle); 3037 perf_output_end(&handle);
2839} 3038}
2840 3039
2841static int perf_counter_fork_match(struct perf_counter *counter) 3040static int perf_counter_task_match(struct perf_counter *counter)
2842{ 3041{
2843 if (counter->attr.comm || counter->attr.mmap) 3042 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2844 return 1; 3043 return 1;
2845 3044
2846 return 0; 3045 return 0;
2847} 3046}
2848 3047
2849static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 3048static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2850 struct perf_fork_event *fork_event) 3049 struct perf_task_event *task_event)
2851{ 3050{
2852 struct perf_counter *counter; 3051 struct perf_counter *counter;
2853 3052
@@ -2856,51 +3055,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2856 3055
2857 rcu_read_lock(); 3056 rcu_read_lock();
2858 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3057 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2859 if (perf_counter_fork_match(counter)) 3058 if (perf_counter_task_match(counter))
2860 perf_counter_fork_output(counter, fork_event); 3059 perf_counter_task_output(counter, task_event);
2861 } 3060 }
2862 rcu_read_unlock(); 3061 rcu_read_unlock();
2863} 3062}
2864 3063
2865static void perf_counter_fork_event(struct perf_fork_event *fork_event) 3064static void perf_counter_task_event(struct perf_task_event *task_event)
2866{ 3065{
2867 struct perf_cpu_context *cpuctx; 3066 struct perf_cpu_context *cpuctx;
2868 struct perf_counter_context *ctx; 3067 struct perf_counter_context *ctx = task_event->task_ctx;
2869 3068
2870 cpuctx = &get_cpu_var(perf_cpu_context); 3069 cpuctx = &get_cpu_var(perf_cpu_context);
2871 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 3070 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2872 put_cpu_var(perf_cpu_context); 3071 put_cpu_var(perf_cpu_context);
2873 3072
2874 rcu_read_lock(); 3073 rcu_read_lock();
2875 /* 3074 if (!ctx)
2876 * doesn't really matter which of the child contexts the 3075 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
2877 * events ends up in.
2878 */
2879 ctx = rcu_dereference(current->perf_counter_ctxp);
2880 if (ctx) 3076 if (ctx)
2881 perf_counter_fork_ctx(ctx, fork_event); 3077 perf_counter_task_ctx(ctx, task_event);
2882 rcu_read_unlock(); 3078 rcu_read_unlock();
2883} 3079}
2884 3080
2885void perf_counter_fork(struct task_struct *task) 3081static void perf_counter_task(struct task_struct *task,
3082 struct perf_counter_context *task_ctx,
3083 int new)
2886{ 3084{
2887 struct perf_fork_event fork_event; 3085 struct perf_task_event task_event;
2888 3086
2889 if (!atomic_read(&nr_comm_counters) && 3087 if (!atomic_read(&nr_comm_counters) &&
2890 !atomic_read(&nr_mmap_counters)) 3088 !atomic_read(&nr_mmap_counters) &&
3089 !atomic_read(&nr_task_counters))
2891 return; 3090 return;
2892 3091
2893 fork_event = (struct perf_fork_event){ 3092 task_event = (struct perf_task_event){
2894 .task = task, 3093 .task = task,
2895 .event = { 3094 .task_ctx = task_ctx,
3095 .event = {
2896 .header = { 3096 .header = {
2897 .type = PERF_EVENT_FORK, 3097 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2898 .size = sizeof(fork_event.event), 3098 .misc = 0,
3099 .size = sizeof(task_event.event),
2899 }, 3100 },
3101 /* .pid */
3102 /* .ppid */
3103 /* .tid */
3104 /* .ptid */
2900 }, 3105 },
2901 }; 3106 };
2902 3107
2903 perf_counter_fork_event(&fork_event); 3108 perf_counter_task_event(&task_event);
3109}
3110
3111void perf_counter_fork(struct task_struct *task)
3112{
3113 perf_counter_task(task, NULL, 1);
2904} 3114}
2905 3115
2906/* 3116/*
@@ -2968,8 +3178,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2968 struct perf_cpu_context *cpuctx; 3178 struct perf_cpu_context *cpuctx;
2969 struct perf_counter_context *ctx; 3179 struct perf_counter_context *ctx;
2970 unsigned int size; 3180 unsigned int size;
2971 char *comm = comm_event->task->comm; 3181 char comm[TASK_COMM_LEN];
2972 3182
3183 memset(comm, 0, sizeof(comm));
3184 strncpy(comm, comm_event->task->comm, sizeof(comm));
2973 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3185 size = ALIGN(strlen(comm)+1, sizeof(u64));
2974 3186
2975 comm_event->comm = comm; 3187 comm_event->comm = comm;
@@ -3004,8 +3216,16 @@ void perf_counter_comm(struct task_struct *task)
3004 3216
3005 comm_event = (struct perf_comm_event){ 3217 comm_event = (struct perf_comm_event){
3006 .task = task, 3218 .task = task,
3219 /* .comm */
3220 /* .comm_size */
3007 .event = { 3221 .event = {
3008 .header = { .type = PERF_EVENT_COMM, }, 3222 .header = {
3223 .type = PERF_EVENT_COMM,
3224 .misc = 0,
3225 /* .size */
3226 },
3227 /* .pid */
3228 /* .tid */
3009 }, 3229 },
3010 }; 3230 };
3011 3231
@@ -3088,8 +3308,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3088 char *buf = NULL; 3308 char *buf = NULL;
3089 const char *name; 3309 const char *name;
3090 3310
3311 memset(tmp, 0, sizeof(tmp));
3312
3091 if (file) { 3313 if (file) {
3092 buf = kzalloc(PATH_MAX, GFP_KERNEL); 3314 /*
3315 * d_path works from the end of the buffer backwards, so we
3316 * need to add enough zero bytes after the string to handle
3317 * the 64bit alignment we do later.
3318 */
3319 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3093 if (!buf) { 3320 if (!buf) {
3094 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3321 name = strncpy(tmp, "//enomem", sizeof(tmp));
3095 goto got_name; 3322 goto got_name;
@@ -3100,9 +3327,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3100 goto got_name; 3327 goto got_name;
3101 } 3328 }
3102 } else { 3329 } else {
3103 name = arch_vma_name(mmap_event->vma); 3330 if (arch_vma_name(mmap_event->vma)) {
3104 if (name) 3331 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3332 sizeof(tmp));
3105 goto got_name; 3333 goto got_name;
3334 }
3106 3335
3107 if (!vma->vm_mm) { 3336 if (!vma->vm_mm) {
3108 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3337 name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@ -3147,8 +3376,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
3147 3376
3148 mmap_event = (struct perf_mmap_event){ 3377 mmap_event = (struct perf_mmap_event){
3149 .vma = vma, 3378 .vma = vma,
3379 /* .file_name */
3380 /* .file_size */
3150 .event = { 3381 .event = {
3151 .header = { .type = PERF_EVENT_MMAP, }, 3382 .header = {
3383 .type = PERF_EVENT_MMAP,
3384 .misc = 0,
3385 /* .size */
3386 },
3387 /* .pid */
3388 /* .tid */
3152 .start = vma->vm_start, 3389 .start = vma->vm_start,
3153 .len = vma->vm_end - vma->vm_start, 3390 .len = vma->vm_end - vma->vm_start,
3154 .pgoff = vma->vm_pgoff, 3391 .pgoff = vma->vm_pgoff,
@@ -3159,49 +3396,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
3159} 3396}
3160 3397
3161/* 3398/*
3162 * Log sample_period changes so that analyzing tools can re-normalize the
3163 * event flow.
3164 */
3165
3166struct freq_event {
3167 struct perf_event_header header;
3168 u64 time;
3169 u64 id;
3170 u64 period;
3171};
3172
3173static void perf_log_period(struct perf_counter *counter, u64 period)
3174{
3175 struct perf_output_handle handle;
3176 struct freq_event event;
3177 int ret;
3178
3179 if (counter->hw.sample_period == period)
3180 return;
3181
3182 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
3183 return;
3184
3185 event = (struct freq_event) {
3186 .header = {
3187 .type = PERF_EVENT_PERIOD,
3188 .misc = 0,
3189 .size = sizeof(event),
3190 },
3191 .time = sched_clock(),
3192 .id = counter->id,
3193 .period = period,
3194 };
3195
3196 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
3197 if (ret)
3198 return;
3199
3200 perf_output_put(&handle, event);
3201 perf_output_end(&handle);
3202}
3203
3204/*
3205 * IRQ throttle logging 3399 * IRQ throttle logging
3206 */ 3400 */
3207 3401
@@ -3214,16 +3408,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
3214 struct perf_event_header header; 3408 struct perf_event_header header;
3215 u64 time; 3409 u64 time;
3216 u64 id; 3410 u64 id;
3411 u64 stream_id;
3217 } throttle_event = { 3412 } throttle_event = {
3218 .header = { 3413 .header = {
3219 .type = PERF_EVENT_THROTTLE + 1, 3414 .type = PERF_EVENT_THROTTLE,
3220 .misc = 0, 3415 .misc = 0,
3221 .size = sizeof(throttle_event), 3416 .size = sizeof(throttle_event),
3222 }, 3417 },
3223 .time = sched_clock(), 3418 .time = sched_clock(),
3224 .id = counter->id, 3419 .id = primary_counter_id(counter),
3420 .stream_id = counter->id,
3225 }; 3421 };
3226 3422
3423 if (enable)
3424 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3425
3227 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); 3426 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3228 if (ret) 3427 if (ret)
3229 return; 3428 return;
@@ -3300,125 +3499,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3300 * Generic software counter infrastructure 3499 * Generic software counter infrastructure
3301 */ 3500 */
3302 3501
3303static void perf_swcounter_update(struct perf_counter *counter) 3502/*
3503 * We directly increment counter->count and keep a second value in
3504 * counter->hw.period_left to count intervals. This period counter
3505 * is kept in the range [-sample_period, 0] so that we can use the
3506 * sign as trigger.
3507 */
3508
3509static u64 perf_swcounter_set_period(struct perf_counter *counter)
3304{ 3510{
3305 struct hw_perf_counter *hwc = &counter->hw; 3511 struct hw_perf_counter *hwc = &counter->hw;
3306 u64 prev, now; 3512 u64 period = hwc->last_period;
3307 s64 delta; 3513 u64 nr, offset;
3514 s64 old, val;
3515
3516 hwc->last_period = hwc->sample_period;
3308 3517
3309again: 3518again:
3310 prev = atomic64_read(&hwc->prev_count); 3519 old = val = atomic64_read(&hwc->period_left);
3311 now = atomic64_read(&hwc->count); 3520 if (val < 0)
3312 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) 3521 return 0;
3313 goto again;
3314 3522
3315 delta = now - prev; 3523 nr = div64_u64(period + val, period);
3524 offset = nr * period;
3525 val -= offset;
3526 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3527 goto again;
3316 3528
3317 atomic64_add(delta, &counter->count); 3529 return nr;
3318 atomic64_sub(delta, &hwc->period_left);
3319} 3530}
3320 3531
3321static void perf_swcounter_set_period(struct perf_counter *counter) 3532static void perf_swcounter_overflow(struct perf_counter *counter,
3533 int nmi, struct perf_sample_data *data)
3322{ 3534{
3323 struct hw_perf_counter *hwc = &counter->hw; 3535 struct hw_perf_counter *hwc = &counter->hw;
3324 s64 left = atomic64_read(&hwc->period_left); 3536 u64 overflow;
3325 s64 period = hwc->sample_period;
3326 3537
3327 if (unlikely(left <= -period)) { 3538 data->period = counter->hw.last_period;
3328 left = period; 3539 overflow = perf_swcounter_set_period(counter);
3329 atomic64_set(&hwc->period_left, left);
3330 hwc->last_period = period;
3331 }
3332 3540
3333 if (unlikely(left <= 0)) { 3541 if (hwc->interrupts == MAX_INTERRUPTS)
3334 left += period; 3542 return;
3335 atomic64_add(period, &hwc->period_left);
3336 hwc->last_period = period;
3337 }
3338 3543
3339 atomic64_set(&hwc->prev_count, -left); 3544 for (; overflow; overflow--) {
3340 atomic64_set(&hwc->count, -left); 3545 if (perf_counter_overflow(counter, nmi, data)) {
3546 /*
3547 * We inhibit the overflow from happening when
3548 * hwc->interrupts == MAX_INTERRUPTS.
3549 */
3550 break;
3551 }
3552 }
3341} 3553}
3342 3554
3343static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) 3555static void perf_swcounter_unthrottle(struct perf_counter *counter)
3344{ 3556{
3345 enum hrtimer_restart ret = HRTIMER_RESTART;
3346 struct perf_sample_data data;
3347 struct perf_counter *counter;
3348 u64 period;
3349
3350 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3351 counter->pmu->read(counter);
3352
3353 data.addr = 0;
3354 data.regs = get_irq_regs();
3355 /* 3557 /*
3356 * In case we exclude kernel IPs or are somehow not in interrupt 3558 * Nothing to do, we already reset hwc->interrupts.
3357 * context, provide the next best thing, the user IP.
3358 */ 3559 */
3359 if ((counter->attr.exclude_kernel || !data.regs) && 3560}
3360 !counter->attr.exclude_user)
3361 data.regs = task_pt_regs(current);
3362 3561
3363 if (data.regs) { 3562static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3364 if (perf_counter_overflow(counter, 0, &data)) 3563 int nmi, struct perf_sample_data *data)
3365 ret = HRTIMER_NORESTART; 3564{
3366 } 3565 struct hw_perf_counter *hwc = &counter->hw;
3367 3566
3368 period = max_t(u64, 10000, counter->hw.sample_period); 3567 atomic64_add(nr, &counter->count);
3369 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3370 3568
3371 return ret; 3569 if (!hwc->sample_period)
3372} 3570 return;
3373 3571
3374static void perf_swcounter_overflow(struct perf_counter *counter, 3572 if (!data->regs)
3375 int nmi, struct perf_sample_data *data) 3573 return;
3376{
3377 data->period = counter->hw.last_period;
3378 3574
3379 perf_swcounter_update(counter); 3575 if (!atomic64_add_negative(nr, &hwc->period_left))
3380 perf_swcounter_set_period(counter); 3576 perf_swcounter_overflow(counter, nmi, data);
3381 if (perf_counter_overflow(counter, nmi, data))
3382 /* soft-disable the counter */
3383 ;
3384} 3577}
3385 3578
3386static int perf_swcounter_is_counting(struct perf_counter *counter) 3579static int perf_swcounter_is_counting(struct perf_counter *counter)
3387{ 3580{
3388 struct perf_counter_context *ctx; 3581 /*
3389 unsigned long flags; 3582 * The counter is active, we're good!
3390 int count; 3583 */
3391
3392 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 3584 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3393 return 1; 3585 return 1;
3394 3586
3587 /*
3588 * The counter is off/error, not counting.
3589 */
3395 if (counter->state != PERF_COUNTER_STATE_INACTIVE) 3590 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3396 return 0; 3591 return 0;
3397 3592
3398 /* 3593 /*
3399 * If the counter is inactive, it could be just because 3594 * The counter is inactive, if the context is active
3400 * its task is scheduled out, or because it's in a group 3595 * we're part of a group that didn't make it on the 'pmu',
3401 * which could not go on the PMU. We want to count in 3596 * not counting.
3402 * the first case but not the second. If the context is
3403 * currently active then an inactive software counter must
3404 * be the second case. If it's not currently active then
3405 * we need to know whether the counter was active when the
3406 * context was last active, which we can determine by
3407 * comparing counter->tstamp_stopped with ctx->time.
3408 *
3409 * We are within an RCU read-side critical section,
3410 * which protects the existence of *ctx.
3411 */ 3597 */
3412 ctx = counter->ctx; 3598 if (counter->ctx->is_active)
3413 spin_lock_irqsave(&ctx->lock, flags); 3599 return 0;
3414 count = 1; 3600
3415 /* Re-check state now we have the lock */ 3601 /*
3416 if (counter->state < PERF_COUNTER_STATE_INACTIVE || 3602 * We're inactive and the context is too, this means the
3417 counter->ctx->is_active || 3603 * task is scheduled out, we're counting events that happen
3418 counter->tstamp_stopped < ctx->time) 3604 * to us, like migration events.
3419 count = 0; 3605 */
3420 spin_unlock_irqrestore(&ctx->lock, flags); 3606 return 1;
3421 return count;
3422} 3607}
3423 3608
3424static int perf_swcounter_match(struct perf_counter *counter, 3609static int perf_swcounter_match(struct perf_counter *counter,
@@ -3444,15 +3629,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
3444 return 1; 3629 return 1;
3445} 3630}
3446 3631
3447static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3448 int nmi, struct perf_sample_data *data)
3449{
3450 int neg = atomic64_add_negative(nr, &counter->hw.count);
3451
3452 if (counter->hw.sample_period && !neg && data->regs)
3453 perf_swcounter_overflow(counter, nmi, data);
3454}
3455
3456static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3632static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3457 enum perf_type_id type, 3633 enum perf_type_id type,
3458 u32 event, u64 nr, int nmi, 3634 u32 event, u64 nr, int nmi,
@@ -3531,27 +3707,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3531 3707
3532static void perf_swcounter_read(struct perf_counter *counter) 3708static void perf_swcounter_read(struct perf_counter *counter)
3533{ 3709{
3534 perf_swcounter_update(counter);
3535} 3710}
3536 3711
3537static int perf_swcounter_enable(struct perf_counter *counter) 3712static int perf_swcounter_enable(struct perf_counter *counter)
3538{ 3713{
3539 perf_swcounter_set_period(counter); 3714 struct hw_perf_counter *hwc = &counter->hw;
3715
3716 if (hwc->sample_period) {
3717 hwc->last_period = hwc->sample_period;
3718 perf_swcounter_set_period(counter);
3719 }
3540 return 0; 3720 return 0;
3541} 3721}
3542 3722
3543static void perf_swcounter_disable(struct perf_counter *counter) 3723static void perf_swcounter_disable(struct perf_counter *counter)
3544{ 3724{
3545 perf_swcounter_update(counter);
3546} 3725}
3547 3726
3548static const struct pmu perf_ops_generic = { 3727static const struct pmu perf_ops_generic = {
3549 .enable = perf_swcounter_enable, 3728 .enable = perf_swcounter_enable,
3550 .disable = perf_swcounter_disable, 3729 .disable = perf_swcounter_disable,
3551 .read = perf_swcounter_read, 3730 .read = perf_swcounter_read,
3731 .unthrottle = perf_swcounter_unthrottle,
3552}; 3732};
3553 3733
3554/* 3734/*
3735 * hrtimer based swcounter callback
3736 */
3737
3738static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3739{
3740 enum hrtimer_restart ret = HRTIMER_RESTART;
3741 struct perf_sample_data data;
3742 struct perf_counter *counter;
3743 u64 period;
3744
3745 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3746 counter->pmu->read(counter);
3747
3748 data.addr = 0;
3749 data.regs = get_irq_regs();
3750 /*
3751 * In case we exclude kernel IPs or are somehow not in interrupt
3752 * context, provide the next best thing, the user IP.
3753 */
3754 if ((counter->attr.exclude_kernel || !data.regs) &&
3755 !counter->attr.exclude_user)
3756 data.regs = task_pt_regs(current);
3757
3758 if (data.regs) {
3759 if (perf_counter_overflow(counter, 0, &data))
3760 ret = HRTIMER_NORESTART;
3761 }
3762
3763 period = max_t(u64, 10000, counter->hw.sample_period);
3764 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3765
3766 return ret;
3767}
3768
3769/*
3555 * Software counter: cpu wall time clock 3770 * Software counter: cpu wall time clock
3556 */ 3771 */
3557 3772
@@ -3668,17 +3883,24 @@ static const struct pmu perf_ops_task_clock = {
3668}; 3883};
3669 3884
3670#ifdef CONFIG_EVENT_PROFILE 3885#ifdef CONFIG_EVENT_PROFILE
3671void perf_tpcounter_event(int event_id) 3886void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3887 int entry_size)
3672{ 3888{
3889 struct perf_raw_record raw = {
3890 .size = entry_size,
3891 .data = record,
3892 };
3893
3673 struct perf_sample_data data = { 3894 struct perf_sample_data data = {
3674 .regs = get_irq_regs(); 3895 .regs = get_irq_regs(),
3675 .addr = 0, 3896 .addr = addr,
3897 .raw = &raw,
3676 }; 3898 };
3677 3899
3678 if (!data.regs) 3900 if (!data.regs)
3679 data.regs = task_pt_regs(current); 3901 data.regs = task_pt_regs(current);
3680 3902
3681 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); 3903 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3682} 3904}
3683EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3905EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3684 3906
@@ -3687,16 +3909,20 @@ extern void ftrace_profile_disable(int);
3687 3909
3688static void tp_perf_counter_destroy(struct perf_counter *counter) 3910static void tp_perf_counter_destroy(struct perf_counter *counter)
3689{ 3911{
3690 ftrace_profile_disable(perf_event_id(&counter->attr)); 3912 ftrace_profile_disable(counter->attr.config);
3691} 3913}
3692 3914
3693static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3915static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3694{ 3916{
3695 int event_id = perf_event_id(&counter->attr); 3917 /*
3696 int ret; 3918 * Raw tracepoint data is a severe data leak, only allow root to
3919 * have these.
3920 */
3921 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3922 !capable(CAP_SYS_ADMIN))
3923 return ERR_PTR(-EPERM);
3697 3924
3698 ret = ftrace_profile_enable(event_id); 3925 if (ftrace_profile_enable(counter->attr.config))
3699 if (ret)
3700 return NULL; 3926 return NULL;
3701 3927
3702 counter->destroy = tp_perf_counter_destroy; 3928 counter->destroy = tp_perf_counter_destroy;
@@ -3829,9 +4055,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3829 atomic64_set(&hwc->period_left, hwc->sample_period); 4055 atomic64_set(&hwc->period_left, hwc->sample_period);
3830 4056
3831 /* 4057 /*
3832 * we currently do not support PERF_SAMPLE_GROUP on inherited counters 4058 * we currently do not support PERF_FORMAT_GROUP on inherited counters
3833 */ 4059 */
3834 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) 4060 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
3835 goto done; 4061 goto done;
3836 4062
3837 switch (attr->type) { 4063 switch (attr->type) {
@@ -3874,6 +4100,8 @@ done:
3874 atomic_inc(&nr_mmap_counters); 4100 atomic_inc(&nr_mmap_counters);
3875 if (counter->attr.comm) 4101 if (counter->attr.comm)
3876 atomic_inc(&nr_comm_counters); 4102 atomic_inc(&nr_comm_counters);
4103 if (counter->attr.task)
4104 atomic_inc(&nr_task_counters);
3877 } 4105 }
3878 4106
3879 return counter; 4107 return counter;
@@ -4235,8 +4463,10 @@ void perf_counter_exit_task(struct task_struct *child)
4235 struct perf_counter_context *child_ctx; 4463 struct perf_counter_context *child_ctx;
4236 unsigned long flags; 4464 unsigned long flags;
4237 4465
4238 if (likely(!child->perf_counter_ctxp)) 4466 if (likely(!child->perf_counter_ctxp)) {
4467 perf_counter_task(child, NULL, 0);
4239 return; 4468 return;
4469 }
4240 4470
4241 local_irq_save(flags); 4471 local_irq_save(flags);
4242 /* 4472 /*
@@ -4255,17 +4485,20 @@ void perf_counter_exit_task(struct task_struct *child)
4255 */ 4485 */
4256 spin_lock(&child_ctx->lock); 4486 spin_lock(&child_ctx->lock);
4257 child->perf_counter_ctxp = NULL; 4487 child->perf_counter_ctxp = NULL;
4258 if (child_ctx->parent_ctx) { 4488 /*
4259 /* 4489 * If this context is a clone; unclone it so it can't get
4260 * This context is a clone; unclone it so it can't get 4490 * swapped to another process while we're removing all
4261 * swapped to another process while we're removing all 4491 * the counters from it.
4262 * the counters from it. 4492 */
4263 */ 4493 unclone_ctx(child_ctx);
4264 put_ctx(child_ctx->parent_ctx); 4494 spin_unlock_irqrestore(&child_ctx->lock, flags);
4265 child_ctx->parent_ctx = NULL; 4495
4266 } 4496 /*
4267 spin_unlock(&child_ctx->lock); 4497 * Report the task dead after unscheduling the counters so that we
4268 local_irq_restore(flags); 4498 * won't get any samples after PERF_EVENT_EXIT. We can however still
4499 * get a few PERF_EVENT_READ events.
4500 */
4501 perf_counter_task(child, child_ctx, 0);
4269 4502
4270 /* 4503 /*
4271 * We can recurse on the same lock type through: 4504 * We can recurse on the same lock type through:
@@ -4486,6 +4719,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4486 perf_counter_init_cpu(cpu); 4719 perf_counter_init_cpu(cpu);
4487 break; 4720 break;
4488 4721
4722 case CPU_ONLINE:
4723 case CPU_ONLINE_FROZEN:
4724 hw_perf_counter_setup_online(cpu);
4725 break;
4726
4489 case CPU_DOWN_PREPARE: 4727 case CPU_DOWN_PREPARE:
4490 case CPU_DOWN_PREPARE_FROZEN: 4728 case CPU_DOWN_PREPARE_FROZEN:
4491 perf_counter_exit_cpu(cpu); 4729 perf_counter_exit_cpu(cpu);
@@ -4510,6 +4748,8 @@ void __init perf_counter_init(void)
4510{ 4748{
4511 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 4749 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4512 (void *)(long)smp_processor_id()); 4750 (void *)(long)smp_processor_id());
4751 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4752 (void *)(long)smp_processor_id());
4513 register_cpu_notifier(&perf_cpu_nb); 4753 register_cpu_notifier(&perf_cpu_nb);
4514} 4754}
4515 4755
diff --git a/kernel/pid.c b/kernel/pid.c
index 5fa1db48d8b7..31310b5d3f50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,7 +36,6 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/kmemleak.h>
40 39
41#define pid_hashfn(nr, ns) \ 40#define pid_hashfn(nr, ns) \
42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -513,12 +512,6 @@ void __init pidhash_init(void)
513 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); 512 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
514 if (!pid_hash) 513 if (!pid_hash)
515 panic("Could not alloc pidhash!\n"); 514 panic("Could not alloc pidhash!\n");
516 /*
517 * pid_hash contains references to allocated struct pid objects and it
518 * must be scanned by kmemleak to avoid false positives.
519 */
520 kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0,
521 GFP_KERNEL);
522 for (i = 0; i < pidhash_size; i++) 515 for (i = 0; i < pidhash_size; i++)
523 INIT_HLIST_HEAD(&pid_hash[i]); 516 INIT_HLIST_HEAD(&pid_hash[i]);
524} 517}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
521} 521}
522void posix_cpu_timers_exit_group(struct task_struct *tsk) 522void posix_cpu_timers_exit_group(struct task_struct *tsk)
523{ 523{
524 struct task_cputime cputime; 524 struct signal_struct *const sig = tsk->signal;
525 525
526 thread_group_cputimer(tsk, &cputime);
527 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
528 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime_add(tsk->utime, sig->utime),
528 cputime_add(tsk->stime, sig->stime),
529 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
529} 530}
530 531
531static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 532static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 69911b5745eb..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -117,11 +117,12 @@ int __ref profile_init(void)
117 117
118 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
119 119
120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
121 if (prof_buffer) 121 if (prof_buffer)
122 return 0; 122 return 0;
123 123
124 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
125 if (prof_buffer) 126 if (prof_buffer)
126 return 0; 127 return 0;
127 128
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61c78b2c07ba..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task)
181 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
182 * under ptrace. 182 * under ptrace.
183 */ 183 */
184 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
185 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
186 goto out; 186 goto out;
187 187
188 task_lock(task); 188 task_lock(task);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0dccfbba6d26..7717b95c2027 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1533,7 +1533,7 @@ void __init __rcu_init(void)
1533 int j; 1533 int j;
1534 struct rcu_node *rnp; 1534 struct rcu_node *rnp;
1535 1535
1536 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1536 printk(KERN_INFO "Hierarchical RCU implementation.\n");
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1546,7 +1546,6 @@ void __init __rcu_init(void)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); 1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */ 1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb); 1548 register_cpu_notifier(&rcu_nb);
1549 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1550} 1549}
1551 1550
1552module_param(blimit, int, 0); 1551module_param(blimit, int, 0);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a78c5a..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { 1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1044 1043 spin_unlock(&lock->wait_lock);
1045 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1; 1045 return 1;
1047 } 1046 }
1048 1047
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050 1049
1051
1052 if (ret && !waiter->task) { 1050 if (ret && !waiter->task) {
1053 /* 1051 /*
1054 * Reset the return value. We might have 1052 * Reset the return value. We might have
diff --git a/kernel/sched.c b/kernel/sched.c
index 7c9098d186e6..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -493,6 +493,7 @@ struct rt_rq {
493#endif 493#endif
494#ifdef CONFIG_SMP 494#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory; 495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
496 int overloaded; 497 int overloaded;
497 struct plist_head pushable_tasks; 498 struct plist_head pushable_tasks;
498#endif 499#endif
@@ -2571,15 +2572,37 @@ static void __sched_fork(struct task_struct *p)
2571 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2572 2573
2573#ifdef CONFIG_SCHEDSTATS 2574#ifdef CONFIG_SCHEDSTATS
2574 p->se.wait_start = 0; 2575 p->se.wait_start = 0;
2575 p->se.sum_sleep_runtime = 0; 2576 p->se.wait_max = 0;
2576 p->se.sleep_start = 0; 2577 p->se.wait_count = 0;
2577 p->se.block_start = 0; 2578 p->se.wait_sum = 0;
2578 p->se.sleep_max = 0; 2579
2579 p->se.block_max = 0; 2580 p->se.sleep_start = 0;
2580 p->se.exec_max = 0; 2581 p->se.sleep_max = 0;
2581 p->se.slice_max = 0; 2582 p->se.sum_sleep_runtime = 0;
2582 p->se.wait_max = 0; 2583
2584 p->se.block_start = 0;
2585 p->se.block_max = 0;
2586 p->se.exec_max = 0;
2587 p->se.slice_max = 0;
2588
2589 p->se.nr_migrations_cold = 0;
2590 p->se.nr_failed_migrations_affine = 0;
2591 p->se.nr_failed_migrations_running = 0;
2592 p->se.nr_failed_migrations_hot = 0;
2593 p->se.nr_forced_migrations = 0;
2594 p->se.nr_forced2_migrations = 0;
2595
2596 p->se.nr_wakeups = 0;
2597 p->se.nr_wakeups_sync = 0;
2598 p->se.nr_wakeups_migrate = 0;
2599 p->se.nr_wakeups_local = 0;
2600 p->se.nr_wakeups_remote = 0;
2601 p->se.nr_wakeups_affine = 0;
2602 p->se.nr_wakeups_affine_attempts = 0;
2603 p->se.nr_wakeups_passive = 0;
2604 p->se.nr_wakeups_idle = 0;
2605
2583#endif 2606#endif
2584 2607
2585 INIT_LIST_HEAD(&p->rt.run_list); 2608 INIT_LIST_HEAD(&p->rt.run_list);
@@ -6541,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
6541 return 0; 6564 return 0;
6542} 6565}
6543 6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
6544static void __cond_resched(void) 6572static void __cond_resched(void)
6545{ 6573{
6546#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6560,8 +6588,7 @@ static void __cond_resched(void)
6560 6588
6561int __sched _cond_resched(void) 6589int __sched _cond_resched(void)
6562{ 6590{
6563 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6591 if (should_resched()) {
6564 system_state == SYSTEM_RUNNING) {
6565 __cond_resched(); 6592 __cond_resched();
6566 return 1; 6593 return 1;
6567 } 6594 }
@@ -6579,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
6579 */ 6606 */
6580int cond_resched_lock(spinlock_t *lock) 6607int cond_resched_lock(spinlock_t *lock)
6581{ 6608{
6582 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6609 int resched = should_resched();
6583 int ret = 0; 6610 int ret = 0;
6584 6611
6585 if (spin_needbreak(lock) || resched) { 6612 if (spin_needbreak(lock) || resched) {
6586 spin_unlock(lock); 6613 spin_unlock(lock);
6587 if (resched && need_resched()) 6614 if (resched)
6588 __cond_resched(); 6615 __cond_resched();
6589 else 6616 else
6590 cpu_relax(); 6617 cpu_relax();
@@ -6599,7 +6626,7 @@ int __sched cond_resched_softirq(void)
6599{ 6626{
6600 BUG_ON(!in_softirq()); 6627 BUG_ON(!in_softirq());
6601 6628
6602 if (need_resched() && system_state == SYSTEM_RUNNING) { 6629 if (should_resched()) {
6603 local_bh_enable(); 6630 local_bh_enable();
6604 __cond_resched(); 6631 __cond_resched();
6605 local_bh_disable(); 6632 local_bh_disable();
@@ -7262,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7262static void calc_global_load_remove(struct rq *rq) 7289static void calc_global_load_remove(struct rq *rq)
7263{ 7290{
7264 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7291 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7292 rq->calc_load_active = 0;
7265} 7293}
7266#endif /* CONFIG_HOTPLUG_CPU */ 7294#endif /* CONFIG_HOTPLUG_CPU */
7267 7295
@@ -7488,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7488 task_rq_unlock(rq, &flags); 7516 task_rq_unlock(rq, &flags);
7489 get_task_struct(p); 7517 get_task_struct(p);
7490 cpu_rq(cpu)->migration_thread = p; 7518 cpu_rq(cpu)->migration_thread = p;
7519 rq->calc_load_update = calc_load_update;
7491 break; 7520 break;
7492 7521
7493 case CPU_ONLINE: 7522 case CPU_ONLINE:
@@ -7498,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7498 /* Update our root-domain */ 7527 /* Update our root-domain */
7499 rq = cpu_rq(cpu); 7528 rq = cpu_rq(cpu);
7500 spin_lock_irqsave(&rq->lock, flags); 7529 spin_lock_irqsave(&rq->lock, flags);
7501 rq->calc_load_update = calc_load_update;
7502 rq->calc_load_active = 0;
7503 if (rq->rd) { 7530 if (rq->rd) {
7504 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7531 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7505 7532
@@ -9070,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9070#ifdef CONFIG_SMP 9097#ifdef CONFIG_SMP
9071 rt_rq->rt_nr_migratory = 0; 9098 rt_rq->rt_nr_migratory = 0;
9072 rt_rq->overloaded = 0; 9099 rt_rq->overloaded = 0;
9073 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9100 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9074#endif 9101#endif
9075 9102
9076 rt_rq->rt_time = 0; 9103 rt_rq->rt_time = 0;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6c251790dde..d014efbf947a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ba7fd6e9556f..652e8bdef9aa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 266 return min_vruntime;
267} 267}
268 268
269static inline int entity_before(struct sched_entity *a,
270 struct sched_entity *b)
271{
272 return (s64)(a->vruntime - b->vruntime) < 0;
273}
274
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 275static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 276{
271 return se->vruntime - cfs_rq->min_vruntime; 277 return se->vruntime - cfs_rq->min_vruntime;
@@ -605,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
605static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 611static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
606{ 612{
607#ifdef CONFIG_SCHEDSTATS 613#ifdef CONFIG_SCHEDSTATS
614 struct task_struct *tsk = NULL;
615
616 if (entity_is_task(se))
617 tsk = task_of(se);
618
608 if (se->sleep_start) { 619 if (se->sleep_start) {
609 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
610 struct task_struct *tsk = task_of(se);
611 621
612 if ((s64)delta < 0) 622 if ((s64)delta < 0)
613 delta = 0; 623 delta = 0;
@@ -618,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
618 se->sleep_start = 0; 628 se->sleep_start = 0;
619 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
620 630
621 account_scheduler_latency(tsk, delta >> 10, 1); 631 if (tsk)
632 account_scheduler_latency(tsk, delta >> 10, 1);
622 } 633 }
623 if (se->block_start) { 634 if (se->block_start) {
624 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
625 struct task_struct *tsk = task_of(se);
626 636
627 if ((s64)delta < 0) 637 if ((s64)delta < 0)
628 delta = 0; 638 delta = 0;
@@ -633,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 se->block_start = 0; 643 se->block_start = 0;
634 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
635 645
636 /* 646 if (tsk) {
637 * Blocking time is in units of nanosecs, so shift by 20 to 647 /*
638 * get a milliseconds-range estimation of the amount of 648 * Blocking time is in units of nanosecs, so shift by
639 * time that the task spent sleeping: 649 * 20 to get a milliseconds-range estimation of the
640 */ 650 * amount of time that the task spent sleeping:
641 if (unlikely(prof_on == SLEEP_PROFILING)) { 651 */
642 652 if (unlikely(prof_on == SLEEP_PROFILING)) {
643 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 653 profile_hits(SLEEP_PROFILING,
644 delta >> 20); 654 (void *)get_wchan(tsk),
655 delta >> 20);
656 }
657 account_scheduler_latency(tsk, delta >> 10, 0);
645 } 658 }
646 account_scheduler_latency(tsk, delta >> 10, 0);
647 } 659 }
648#endif 660#endif
649} 661}
@@ -687,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
687 * all of which have the same weight. 699 * all of which have the same weight.
688 */ 700 */
689 if (sched_feat(NORMALIZED_SLEEPER) && 701 if (sched_feat(NORMALIZED_SLEEPER) &&
690 task_of(se)->policy != SCHED_IDLE) 702 (!entity_is_task(se) ||
703 task_of(se)->policy != SCHED_IDLE))
691 thresh = calc_delta_fair(thresh, se); 704 thresh = calc_delta_fair(thresh, se);
692 705
693 vruntime -= thresh; 706 vruntime -= thresh;
@@ -1016,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
1016 /* 1029 /*
1017 * Already in the rightmost position? 1030 * Already in the rightmost position?
1018 */ 1031 */
1019 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1032 if (unlikely(!rightmost || entity_before(rightmost, se)))
1020 return; 1033 return;
1021 1034
1022 /* 1035 /*
@@ -1712,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1712 1725
1713 /* 'curr' will be NULL if the child belongs to a different group */ 1726 /* 'curr' will be NULL if the child belongs to a different group */
1714 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1727 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1715 curr && curr->vruntime < se->vruntime) { 1728 curr && entity_before(curr, se)) {
1716 /* 1729 /*
1717 * Upon rescheduling, sched_class::put_prev_task() will place 1730 * Upon rescheduling, sched_class::put_prev_task() will place
1718 * 'current' within the tree based on its new key value. 1731 * 'current' within the tree based on its new key value.
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..3918e01994e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
10 10
11#ifdef CONFIG_RT_GROUP_SCHED 11#ifdef CONFIG_RT_GROUP_SCHED
12 12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 16{
15 return rt_rq->rq; 17 return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 24
23#else /* CONFIG_RT_GROUP_SCHED */ 25#else /* CONFIG_RT_GROUP_SCHED */
24 26
27#define rt_entity_is_task(rt_se) (1)
28
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 30{
27 return container_of(rt_rq, struct rq, rt); 31 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 77
74static void update_rt_migration(struct rt_rq *rt_rq) 78static void update_rt_migration(struct rt_rq *rt_rq)
75{ 79{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 80 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 81 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 82 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 83 rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 90
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 92{
93 if (!rt_entity_is_task(rt_se))
94 return;
95
96 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
97
98 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 99 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 100 rt_rq->rt_nr_migratory++;
91 101
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 104
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 106{
107 if (!rt_entity_is_task(rt_se))
108 return;
109
110 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
111
112 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 113 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 114 rt_rq->rt_nr_migratory--;
99 115
diff --git a/kernel/signal.c b/kernel/signal.c
index ccf1ceedaebe..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2454 stack_t oss; 2454 stack_t oss;
2455 int error; 2455 int error;
2456 2456
2457 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2458 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2459 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2460 oss.ss_flags = sas_ss_flags(sp);
2461 }
2462 2460
2463 if (uss) { 2461 if (uss) {
2464 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2466 int ss_flags; 2464 int ss_flags;
2467 2465
2468 error = -EFAULT; 2466 error = -EFAULT;
2469 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2470 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2471 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2472 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2473 goto out; 2473 goto out;
2474 2474
2475 error = -EPERM; 2475 error = -EPERM;
@@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2501 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2502 } 2502 }
2503 2503
2504 error = 0;
2504 if (uoss) { 2505 if (uoss) {
2505 error = -EFAULT; 2506 error = -EFAULT;
2506 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2507 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2508 } 2512 }
2509 2513
2510 error = 0;
2511out: 2514out:
2512 return error; 2515 return error;
2513} 2516}
diff --git a/kernel/smp.c b/kernel/smp.c
index 2accdf6712e5..8e218500ab14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3a94905fa5d2..eb5e131a0485 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
345 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
346} 346}
347 347
348/* Tasklets */ 348/*
349 * Tasklets
350 */
349struct tasklet_head 351struct tasklet_head
350{ 352{
351 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
493 495
494EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
495 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
538 * @function: hrtimer callback funtion which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
496DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
497EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
498 560
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..a6dcd67b041d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg)
254 spin_unlock(&clockevents_lock); 254 spin_unlock(&clockevents_lock);
255} 255}
256EXPORT_SYMBOL_GPL(clockevents_notify); 256EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 257#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 592bf584d1d2..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
513 * Check to make sure we don't switch to a non-highres capable 513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz) 514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */ 515 */
516 if (tick_oneshot_mode_active() && 516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { 517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. " 518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name); 519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
diff --git a/kernel/timer.c b/kernel/timer.c
index 0b36b9e5cc8b..a7f07d5a6241 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -714,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
714 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
715 * to be the same thing then just return: 715 * to be the same thing then just return:
716 */ 716 */
717 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
718 return 1; 718 return 1;
719 719
720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1551f47e7669..019f380fd764 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -226,13 +226,13 @@ config BOOT_TRACER
226 the timings of the initcalls and traces key events and the identity 226 the timings of the initcalls and traces key events and the identity
227 of tasks that can cause boot delays, such as context-switches. 227 of tasks that can cause boot delays, such as context-switches.
228 228
229 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 229 Its aim is to be parsed by the scripts/bootgraph.pl tool to
230 produce pretty graphics about boot inefficiencies, giving a visual 230 produce pretty graphics about boot inefficiencies, giving a visual
231 representation of the delays during initcalls - but the raw 231 representation of the delays during initcalls - but the raw
232 /debug/tracing/trace text output is readable too. 232 /debug/tracing/trace text output is readable too.
233 233
234 You must pass in ftrace=initcall to the kernel command line 234 You must pass in initcall_debug and ftrace=initcall to the kernel
235 to enable this on bootup. 235 command line to enable this on bootup.
236 236
237config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
238 bool 238 bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..7a34cb563fec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
@@ -266,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt)
266{ 267{
267 debugfs_remove(bt->msg_file); 268 debugfs_remove(bt->msg_file);
268 debugfs_remove(bt->dropped_file); 269 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
270 relay_close(bt->rchan); 270 relay_close(bt->rchan);
271 debugfs_remove(bt->dir);
271 free_percpu(bt->sequence); 272 free_percpu(bt->sequence);
272 free_percpu(bt->msg_data); 273 free_percpu(bt->msg_data);
273 kfree(bt); 274 kfree(bt);
@@ -377,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
377 378
378static int blk_remove_buf_file_callback(struct dentry *dentry) 379static int blk_remove_buf_file_callback(struct dentry *dentry)
379{ 380{
380 struct dentry *parent = dentry->d_parent;
381 debugfs_remove(dentry); 381 debugfs_remove(dentry);
382 382
383 /*
384 * this will fail for all but the last file, but that is ok. what we
385 * care about is the top level buts->name directory going away, when
386 * the last trace file is gone. Then we don't have to rmdir() that
387 * manually on trace stop, so it nicely solves the issue with
388 * force killing of running traces.
389 */
390
391 debugfs_remove(parent);
392 return 0; 383 return 0;
393} 384}
394 385
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3716bf04df6..1e1d23c26308 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -768,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
768 .stat_show = function_stat_show 768 .stat_show = function_stat_show
769}; 769};
770 770
771static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
772{ 772{
773 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
774 struct dentry *entry; 774 struct dentry *entry;
@@ -786,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
786 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
787 * we still do not free memory. 787 * we still do not free memory.
788 */ 788 */
789 kfree(stat);
790 WARN(1, 789 WARN(1,
791 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
792 cpu); 791 cpu);
@@ -813,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
813} 812}
814 813
815#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
816static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
817{ 816{
818} 817}
819#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1663,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1663 1662
1664 mutex_lock(&ftrace_regex_lock); 1663 mutex_lock(&ftrace_regex_lock);
1665 if ((file->f_mode & FMODE_WRITE) && 1664 if ((file->f_mode & FMODE_WRITE) &&
1666 !(file->f_flags & O_APPEND)) 1665 (file->f_flags & O_TRUNC))
1667 ftrace_filter_reset(enable); 1666 ftrace_filter_reset(enable);
1668 1667
1669 if (file->f_mode & FMODE_READ) { 1668 if (file->f_mode & FMODE_READ) {
@@ -2578,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2578 2577
2579 mutex_lock(&graph_lock); 2578 mutex_lock(&graph_lock);
2580 if ((file->f_mode & FMODE_WRITE) && 2579 if ((file->f_mode & FMODE_WRITE) &&
2581 !(file->f_flags & O_APPEND)) { 2580 (file->f_flags & O_TRUNC)) {
2582 ftrace_graph_count = 0; 2581 ftrace_graph_count = 0;
2583 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2582 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2584 } 2583 }
@@ -2597,6 +2596,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2597} 2596}
2598 2597
2599static int 2598static int
2599ftrace_graph_release(struct inode *inode, struct file *file)
2600{
2601 if (file->f_mode & FMODE_READ)
2602 seq_release(inode, file);
2603 return 0;
2604}
2605
2606static int
2600ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2607ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2601{ 2608{
2602 struct dyn_ftrace *rec; 2609 struct dyn_ftrace *rec;
@@ -2725,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2725} 2732}
2726 2733
2727static const struct file_operations ftrace_graph_fops = { 2734static const struct file_operations ftrace_graph_fops = {
2728 .open = ftrace_graph_open, 2735 .open = ftrace_graph_open,
2729 .read = seq_read, 2736 .read = seq_read,
2730 .write = ftrace_graph_write, 2737 .write = ftrace_graph_write,
2738 .release = ftrace_graph_release,
2731}; 2739};
2732#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2740#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2733 2741
@@ -3160,10 +3168,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3160 3168
3161 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3169 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3162 3170
3163 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3171 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3164 goto out; 3172 goto out;
3165 3173
3166 last_ftrace_enabled = ftrace_enabled; 3174 last_ftrace_enabled = !!ftrace_enabled;
3167 3175
3168 if (ftrace_enabled) { 3176 if (ftrace_enabled) {
3169 3177
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bf27bb7a63e2..a330513d96ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
735 735
736 put_online_cpus(); 736 put_online_cpus();
737 737
738 kfree(buffer->buffers);
738 free_cpumask_var(buffer->cpumask); 739 free_cpumask_var(buffer->cpumask);
739 740
740 kfree(buffer); 741 kfree(buffer);
@@ -1785,7 +1786,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1785 */ 1786 */
1786 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1787 1788
1788 if (!rb_try_to_discard(cpu_buffer, event)) 1789 if (rb_try_to_discard(cpu_buffer, event))
1789 goto out; 1790 goto out;
1790 1791
1791 /* 1792 /*
@@ -2383,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2383 * the box. Return the padding, and we will release 2384 * the box. Return the padding, and we will release
2384 * the current locks, and try again. 2385 * the current locks, and try again.
2385 */ 2386 */
2386 rb_advance_reader(cpu_buffer);
2387 return event; 2387 return event;
2388 2388
2389 case RINGBUF_TYPE_TIME_EXTEND: 2389 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void)
2486 * buffer too. A one time deal is all you get from reading 2486 * buffer too. A one time deal is all you get from reading
2487 * the ring buffer from an NMI. 2487 * the ring buffer from an NMI.
2488 */ 2488 */
2489 if (likely(!in_nmi() && !oops_in_progress)) 2489 if (likely(!in_nmi()))
2490 return 1; 2490 return 1;
2491 2491
2492 tracing_off_permanent(); 2492 tracing_off_permanent();
@@ -2519,6 +2519,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2519 if (dolock) 2519 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock); 2520 spin_lock(&cpu_buffer->reader_lock);
2521 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2522 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2523 rb_advance_reader(cpu_buffer);
2522 if (dolock) 2524 if (dolock)
2523 spin_unlock(&cpu_buffer->reader_lock); 2525 spin_unlock(&cpu_buffer->reader_lock);
2524 local_irq_restore(flags); 2526 local_irq_restore(flags);
@@ -2590,12 +2592,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2590 spin_lock(&cpu_buffer->reader_lock); 2592 spin_lock(&cpu_buffer->reader_lock);
2591 2593
2592 event = rb_buffer_peek(buffer, cpu, ts); 2594 event = rb_buffer_peek(buffer, cpu, ts);
2593 if (!event) 2595 if (event)
2594 goto out_unlock; 2596 rb_advance_reader(cpu_buffer);
2595
2596 rb_advance_reader(cpu_buffer);
2597 2597
2598 out_unlock:
2599 if (dolock) 2598 if (dolock)
2600 spin_unlock(&cpu_buffer->reader_lock); 2599 spin_unlock(&cpu_buffer->reader_lock);
2601 local_irq_restore(flags); 2600 local_irq_restore(flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3aa0a0dfdfa8..c22b40f8f576 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -847,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
847 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
848 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
849} 850}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
850 852
851struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
852 int type, 854 int type,
@@ -2030,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2030 2032
2031 /* If this file was open for write, then erase contents */ 2033 /* If this file was open for write, then erase contents */
2032 if ((file->f_mode & FMODE_WRITE) && 2034 if ((file->f_mode & FMODE_WRITE) &&
2033 !(file->f_flags & O_APPEND)) { 2035 (file->f_flags & O_TRUNC)) {
2034 long cpu = (long) inode->i_private; 2036 long cpu = (long) inode->i_private;
2035 2037
2036 if (cpu == TRACE_PIPE_ALL_CPU) 2038 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -3084,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3084 break; 3086 break;
3085 } 3087 }
3086 3088
3087 trace_consume(iter); 3089 if (ret != TRACE_TYPE_NO_CONSUME)
3090 trace_consume(iter);
3088 rem -= count; 3091 rem -= count;
3089 if (!find_next_entry_inc(iter)) { 3092 if (!find_next_entry_inc(iter)) {
3090 rem = 0; 3093 rem = 0;
@@ -4232,8 +4235,11 @@ static void __ftrace_dump(bool disable_tracing)
4232 iter.pos = -1; 4235 iter.pos = -1;
4233 4236
4234 if (find_next_entry_inc(&iter) != NULL) { 4237 if (find_next_entry_inc(&iter) != NULL) {
4235 print_trace_line(&iter); 4238 int ret;
4236 trace_consume(&iter); 4239
4240 ret = print_trace_line(&iter);
4241 if (ret != TRACE_TYPE_NO_CONSUME)
4242 trace_consume(&iter);
4237 } 4243 }
4238 4244
4239 trace_printk_seq(&iter.seq); 4245 trace_printk_seq(&iter.seq);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3548ae5cc780..8b9f4f6e9559 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
439 int *ent_cpu, u64 *ent_ts); 439 int *ent_cpu, u64 *ent_ts);
440 440
441void tracing_generic_entry_update(struct trace_entry *entry,
442 unsigned long flags,
443 int pc);
444
445void default_wait_pipe(struct trace_iterator *iter); 441void default_wait_pipe(struct trace_iterator *iter);
446void poll_wait_pipe(struct trace_iterator *iter); 442void poll_wait_pipe(struct trace_iterator *iter);
447 443
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecfe..11ba5bb4ed0a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)
14 14
15 mutex_lock(&event_mutex); 15 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 16 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id) { 17 if (event->id == event_id && event->profile_enable) {
18 ret = event->profile_enable(event); 18 ret = event->profile_enable(event);
19 break; 19 break;
20 } 20 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5e32e375134d..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53c8fd376a88..e75276a49cf5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
376 const struct seq_operations *seq_ops; 376 const struct seq_operations *seq_ops;
377 377
378 if ((file->f_mode & FMODE_WRITE) && 378 if ((file->f_mode & FMODE_WRITE) &&
379 !(file->f_flags & O_APPEND)) 379 (file->f_flags & O_TRUNC))
380 ftrace_clear_events(); 380 ftrace_clear_events();
381 381
382 seq_ops = inode->i_private; 382 seq_ops = inode->i_private;
@@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
940 entry = trace_create_file("enable", 0644, call->dir, call, 940 entry = trace_create_file("enable", 0644, call->dir, call,
941 enable); 941 enable);
942 942
943 if (call->id) 943 if (call->id && call->profile_enable)
944 entry = trace_create_file("id", 0444, call->dir, call, 944 entry = trace_create_file("id", 0444, call->dir, call,
945 id); 945 id);
946 946
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 936c621bbf46..f32dc9d1ea7b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
624 return -ENOSPC; 624 return -ENOSPC;
625 } 625 }
626 626
627 filter->preds[filter->n_preds] = pred;
628 filter->n_preds++;
629
630 list_for_each_entry(call, &ftrace_events, list) { 627 list_for_each_entry(call, &ftrace_events, list) {
631 628
632 if (!call->define_fields) 629 if (!call->define_fields)
@@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
643 } 640 }
644 replace_filter_string(call->filter, filter_string); 641 replace_filter_string(call->filter, filter_string);
645 } 642 }
643
644 filter->preds[filter->n_preds] = pred;
645 filter->n_preds++;
646out: 646out:
647 return err; 647 return err;
648} 648}
@@ -1029,12 +1029,17 @@ static int replace_preds(struct event_subsystem *system,
1029 1029
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1030 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1031 pred = create_logical_pred(elt->op);
1032 if (!pred)
1033 return -ENOMEM;
1032 if (call) { 1034 if (call) {
1033 err = filter_add_pred(ps, call, pred); 1035 err = filter_add_pred(ps, call, pred);
1034 filter_free_pred(pred); 1036 filter_free_pred(pred);
1035 } else 1037 } else {
1036 err = filter_add_subsystem_pred(ps, system, 1038 err = filter_add_subsystem_pred(ps, system,
1037 pred, filter_string); 1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1038 if (err) 1043 if (err)
1039 return err; 1044 return err;
1040 1045
@@ -1048,12 +1053,17 @@ static int replace_preds(struct event_subsystem *system,
1048 } 1053 }
1049 1054
1050 pred = create_pred(elt->op, operand1, operand2); 1055 pred = create_pred(elt->op, operand1, operand2);
1056 if (!pred)
1057 return -ENOMEM;
1051 if (call) { 1058 if (call) {
1052 err = filter_add_pred(ps, call, pred); 1059 err = filter_add_pred(ps, call, pred);
1053 filter_free_pred(pred); 1060 filter_free_pred(pred);
1054 } else 1061 } else {
1055 err = filter_add_subsystem_pred(ps, system, pred, 1062 err = filter_add_subsystem_pred(ps, system, pred,
1056 filter_string); 1063 filter_string);
1064 if (err)
1065 filter_free_pred(pred);
1066 }
1057 if (err) 1067 if (err)
1058 return err; 1068 return err;
1059 1069
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 7402144bff21..75ef000613c3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -363,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
363 out_reg: 363 out_reg:
364 ret = register_ftrace_function_probe(glob, ops, count); 364 ret = register_ftrace_function_probe(glob, ops, count);
365 365
366 return ret; 366 return ret < 0 ? ret : 0;
367} 367}
368 368
369static struct ftrace_func_command ftrace_traceon_cmd = { 369static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d2249abafb53..420ec3487579 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
843 843
844 switch (entry->type) { 844 switch (entry->type) {
845 case TRACE_GRAPH_ENT: { 845 case TRACE_GRAPH_ENT: {
846 struct ftrace_graph_ent_entry *field; 846 /*
847 * print_graph_entry() may consume the current event,
848 * thus @field may become invalid, so we need to save it.
849 * sizeof(struct ftrace_graph_ent_entry) is very small,
850 * it can be safely saved at the stack.
851 */
852 struct ftrace_graph_ent_entry *field, saved;
847 trace_assign_type(field, entry); 853 trace_assign_type(field, entry);
848 return print_graph_entry(field, s, iter); 854 saved = *field;
855 return print_graph_entry(&saved, s, iter);
849 } 856 }
850 case TRACE_GRAPH_RET: { 857 case TRACE_GRAPH_RET: {
851 struct ftrace_graph_ret_entry *field; 858 struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 7b6278110827..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
176 const char *str = *fmt; 176 const char *str = *fmt;
177 int i; 177 int i;
178 178
179 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
180 180
181 /* 181 /*
182 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..6a2a9d484cd6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 301
302static int stack_trace_open(struct inode *inode, struct file *file) 302static int stack_trace_open(struct inode *inode, struct file *file)
303{ 303{
304 int ret; 304 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 305}
310 306
311static const struct file_operations stack_trace_fops = { 307static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 308 .open = stack_trace_open,
313 .read = seq_read, 309 .read = seq_read,
314 .llseek = seq_lseek, 310 .llseek = seq_lseek,
311 .release = seq_release,
315}; 312};
316 313
317int 314int
@@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 323 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 324
328 if (ret || !write || 325 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 326 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 327 goto out;
331 328
332 last_stack_tracer_enabled = stack_tracer_enabled; 329 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 330
334 if (stack_tracer_enabled) 331 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 332 register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index e66f5e493342..aea321c82fa0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node)
73 } 73 }
74} 74}
75 75
76static void reset_stat_session(struct stat_session *session) 76static void __reset_stat_session(struct stat_session *session)
77{ 77{
78 struct rb_node *node = session->stat_root.rb_node; 78 struct rb_node *node = session->stat_root.rb_node;
79 79
@@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session)
83 session->stat_root = RB_ROOT; 83 session->stat_root = RB_ROOT;
84} 84}
85 85
86static void reset_stat_session(struct stat_session *session)
87{
88 mutex_lock(&session->stat_mutex);
89 __reset_stat_session(session);
90 mutex_unlock(&session->stat_mutex);
91}
92
86static void destroy_session(struct stat_session *session) 93static void destroy_session(struct stat_session *session)
87{ 94{
88 debugfs_remove(session->file); 95 debugfs_remove(session->file);
89 reset_stat_session(session); 96 __reset_stat_session(session);
90 mutex_destroy(&session->stat_mutex); 97 mutex_destroy(&session->stat_mutex);
91 kfree(session); 98 kfree(session);
92} 99}
@@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session)
150 int i; 157 int i;
151 158
152 mutex_lock(&session->stat_mutex); 159 mutex_lock(&session->stat_mutex);
153 reset_stat_session(session); 160 __reset_stat_session(session);
154 161
155 if (!ts->stat_cmp) 162 if (!ts->stat_cmp)
156 ts->stat_cmp = dummy_cmp; 163 ts->stat_cmp = dummy_cmp;
@@ -183,7 +190,7 @@ exit:
183 return ret; 190 return ret;
184 191
185exit_free_rbtree: 192exit_free_rbtree:
186 reset_stat_session(session); 193 __reset_stat_session(session);
187 mutex_unlock(&session->stat_mutex); 194 mutex_unlock(&session->stat_mutex);
188 return ret; 195 return ret;
189} 196}
@@ -250,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = {
250static int tracing_stat_open(struct inode *inode, struct file *file) 257static int tracing_stat_open(struct inode *inode, struct file *file)
251{ 258{
252 int ret; 259 int ret;
253 260 struct seq_file *m;
254 struct stat_session *session = inode->i_private; 261 struct stat_session *session = inode->i_private;
255 262
263 ret = stat_seq_init(session);
264 if (ret)
265 return ret;
266
256 ret = seq_open(file, &trace_stat_seq_ops); 267 ret = seq_open(file, &trace_stat_seq_ops);
257 if (!ret) { 268 if (ret) {
258 struct seq_file *m = file->private_data; 269 reset_stat_session(session);
259 m->private = session; 270 return ret;
260 ret = stat_seq_init(session);
261 } 271 }
262 272
273 m = file->private_data;
274 m->private = session;
263 return ret; 275 return ret;
264} 276}
265 277
@@ -270,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
270{ 282{
271 struct stat_session *session = i->i_private; 283 struct stat_session *session = i->i_private;
272 284
273 mutex_lock(&session->stat_mutex);
274 reset_stat_session(session); 285 reset_stat_session(session);
275 mutex_unlock(&session->stat_mutex);
276 286
277 return 0; 287 return seq_release(i, f);
278} 288}
279 289
280static const struct file_operations tracing_stat_fops = { 290static const struct file_operations tracing_stat_fops = {
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4275cf..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void init_waitqueue_head(wait_queue_head_t *q) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key);
16 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
17} 18}
18 19
19EXPORT_SYMBOL(init_waitqueue_head); 20EXPORT_SYMBOL(__init_waitqueue_head);
20 21
21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 22void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 23{