author	Dan Williams <dan.j.williams@intel.com>	2009-09-08 20:55:54 -0400
committer	Dan Williams <dan.j.williams@intel.com>	2009-09-08 20:55:54 -0400
commit	9134d02bc0af4a8747d448d1f811ec5f8eb96df6 (patch)
tree	704c3e5dcc10f360815c4868a74711f82fb62e27 /kernel
parent	bbb20089a3275a19e475dbc21320c3742e3ca423 (diff)
parent	80ffb3cceaefa405f2ecd46d66500ed8d53efe74 (diff)
Merge commit 'md/for-linus' into async-tx-next
Conflicts: drivers/md/raid5.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/cgroup.c | 151
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/fork.c | 31
-rw-r--r--  kernel/freezer.c | 7
-rw-r--r--  kernel/futex.c | 3
-rw-r--r--  kernel/hrtimer.c | 110
-rw-r--r--  kernel/irq/internals.h | 3
-rw-r--r--  kernel/irq/manage.c | 55
-rw-r--r--  kernel/irq/migration.c | 2
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 1
-rw-r--r--  kernel/kprobes.c | 8
-rw-r--r--  kernel/kthread.c | 10
-rw-r--r--  kernel/module.c | 9
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/perf_counter.c | 583
-rw-r--r--  kernel/posix-timers.c | 7
-rw-r--r--  kernel/power/user.c | 1
-rw-r--r--  kernel/profile.c | 5
-rw-r--r--  kernel/ptrace.c | 4
-rw-r--r--  kernel/rcutree.c | 3
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/sched.c | 61
-rw-r--r--  kernel/sched_cpupri.c | 15
-rw-r--r--  kernel/sched_fair.c | 45
-rw-r--r--  kernel/sched_rt.c | 18
-rw-r--r--  kernel/signal.c | 25
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 64
-rw-r--r--  kernel/sysctl.c | 13
-rw-r--r--  kernel/time/clockevents.c | 11
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 16
-rw-r--r--  kernel/timer.c | 4
-rw-r--r--  kernel/trace/Kconfig | 6
-rw-r--r--  kernel/trace/blktrace.c | 1
-rw-r--r--  kernel/trace/ftrace.c | 84
-rw-r--r--  kernel/trace/ring_buffer.c | 11
-rw-r--r--  kernel/trace/trace.c | 36
-rw-r--r--  kernel/trace/trace.h | 7
-rw-r--r--  kernel/trace/trace_event_profile.c | 2
-rw-r--r--  kernel/trace/trace_event_types.h | 3
-rw-r--r--  kernel/trace/trace_events.c | 32
-rw-r--r--  kernel/trace/trace_functions.c | 5
-rw-r--r--  kernel/trace/trace_functions_graph.c | 11
-rw-r--r--  kernel/trace/trace_output.c | 3
-rw-r--r--  kernel/trace/trace_printk.c | 28
-rw-r--r--  kernel/trace/trace_stack.c | 11
-rw-r--r--  kernel/trace/trace_stat.c | 40
51 files changed, 1070 insertions, 492 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 780c8dcf4516..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 static int acct_on(char *name)
 {
 	struct file *file;
+	struct vfsmount *mnt;
 	int error;
 	struct pid_namespace *ns;
 	struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
 		acct = NULL;
 	}
 
-	mnt_pin(file->f_path.mnt);
+	mnt = file->f_path.mnt;
+	mnt_pin(mnt);
 	acct_file_reopen(ns->bacct, file, ns);
 	spin_unlock(&acct_lock);
 
-	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+	mntput(mnt); /* it's pinned, now give up active reference */
 	kfree(acct);
 
 	return 0;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a682cdf5..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
 #include <linux/hash.h>
 #include <linux/namei.h>
 #include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
 
 #include <asm/atomic.h>
 
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * reference to css->refcnt. In general, this refcnt is expected to goes down
  * to zero, soon.
  *
- * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
  */
 DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
 
-static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
 {
-	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
 		wake_up_all(&cgroup_rmdir_waitq);
 }
 
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+	css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+	cgroup_wakeup_rmdir_waiter(css->cgroup);
+	css_put(css);
+}
+
+
 static int rebind_subsystems(struct cgroupfs_root *root,
 			     unsigned long final_bits)
 {
@@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
+	INIT_LIST_HEAD(&cgrp->pids_list);
 	init_rwsem(&cgrp->pids_mutex);
 }
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
 	 * is no longer empty.
 	 */
-	cgroup_wakeup_rmdir_waiters(cgrp);
+	cgroup_wakeup_rmdir_waiter(cgrp);
 	return 0;
 }
 
@@ -2201,12 +2215,30 @@ err:
 	return ret;
 }
 
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+	/* The node in cgrp->pids_list */
+	struct list_head list;
+	/* The cgroup those pids belong to */
+	struct cgroup *cgrp;
+	/* The namepsace those pids belong to */
+	struct pid_namespace *ns;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using the this tasks_pids array */
+	int use_count;
+	/* Length of the current tasks_pids array */
+	int length;
+};
+
 static int cmppid(const void *a, const void *b)
 {
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
-
 /*
  * seq_file methods for the "tasks" file. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	 * after a seek to the start). Use a binary-search to find the
 	 * next pid to display, if any
 	 */
-	struct cgroup *cgrp = s->private;
+	struct cgroup_pids *cp = s->private;
+	struct cgroup *cgrp = cp->cgrp;
 	int index = 0, pid = *pos;
 	int *iter;
 
 	down_read(&cgrp->pids_mutex);
 	if (pid) {
-		int end = cgrp->pids_length;
+		int end = cp->length;
 
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (cgrp->tasks_pids[mid] == pid) {
+			if (cp->tasks_pids[mid] == pid) {
 				index = mid;
 				break;
-			} else if (cgrp->tasks_pids[mid] <= pid)
+			} else if (cp->tasks_pids[mid] <= pid)
 				index = mid + 1;
 			else
 				end = mid;
 		}
 	}
 	/* If we're off the end of the array, we're done */
-	if (index >= cgrp->pids_length)
+	if (index >= cp->length)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
-	iter = cgrp->tasks_pids + index;
+	iter = cp->tasks_pids + index;
 	*pos = *iter;
 	return iter;
 }
 
 static void cgroup_tasks_stop(struct seq_file *s, void *v)
 {
-	struct cgroup *cgrp = s->private;
+	struct cgroup_pids *cp = s->private;
+	struct cgroup *cgrp = cp->cgrp;
 	up_read(&cgrp->pids_mutex);
 }
 
 static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup *cgrp = s->private;
+	struct cgroup_pids *cp = s->private;
 	int *p = v;
-	int *end = cgrp->tasks_pids + cgrp->pids_length;
+	int *end = cp->tasks_pids + cp->length;
 
 	/*
 	 * Advance to the next pid in the array. If this goes off the
@@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
 	.show = cgroup_tasks_show,
 };
 
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void release_cgroup_pid_array(struct cgroup_pids *cp)
 {
+	struct cgroup *cgrp = cp->cgrp;
+
 	down_write(&cgrp->pids_mutex);
-	BUG_ON(!cgrp->pids_use_count);
-	if (!--cgrp->pids_use_count) {
-		kfree(cgrp->tasks_pids);
-		cgrp->tasks_pids = NULL;
-		cgrp->pids_length = 0;
+	BUG_ON(!cp->use_count);
+	if (!--cp->use_count) {
+		list_del(&cp->list);
+		put_pid_ns(cp->ns);
+		kfree(cp->tasks_pids);
+		kfree(cp);
 	}
 	up_write(&cgrp->pids_mutex);
 }
 
 static int cgroup_tasks_release(struct inode *inode, struct file *file)
 {
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+	struct seq_file *seq;
+	struct cgroup_pids *cp;
 
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	release_cgroup_pid_array(cgrp);
+	seq = file->private_data;
+	cp = seq->private;
+
+	release_cgroup_pid_array(cp);
 	return seq_release(inode, file);
 }
 
@@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+	struct pid_namespace *ns = current->nsproxy->pid_ns;
+	struct cgroup_pids *cp;
 	pid_t *pidarray;
 	int npids;
 	int retval;
@@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * array if necessary
 	 */
 	down_write(&cgrp->pids_mutex);
-	kfree(cgrp->tasks_pids);
-	cgrp->tasks_pids = pidarray;
-	cgrp->pids_length = npids;
-	cgrp->pids_use_count++;
+
+	list_for_each_entry(cp, &cgrp->pids_list, list) {
+		if (ns == cp->ns)
+			goto found;
+	}
+
+	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+	if (!cp) {
+		up_write(&cgrp->pids_mutex);
+		kfree(pidarray);
+		return -ENOMEM;
+	}
+	cp->cgrp = cgrp;
+	cp->ns = ns;
+	get_pid_ns(ns);
+	list_add(&cp->list, &cgrp->pids_list);
+found:
+	kfree(cp->tasks_pids);
+	cp->tasks_pids = pidarray;
+	cp->length = npids;
+	cp->use_count++;
 	up_write(&cgrp->pids_mutex);
 
 	file->f_op = &cgroup_tasks_operations;
 
 	retval = seq_open(file, &cgroup_tasks_seq_operations);
 	if (retval) {
-		release_cgroup_pid_array(cgrp);
+		release_cgroup_pid_array(cp);
 		return retval;
 	}
-	((struct seq_file *)file->private_data)->private = cgrp;
+	((struct seq_file *)file->private_data)->private = cp;
 	return 0;
 }
 
@@ -2696,33 +2756,42 @@ again:
 	mutex_unlock(&cgroup_mutex);
 
 	/*
+	 * In general, subsystem has no css->refcnt after pre_destroy(). But
+	 * in racy cases, subsystem may have to get css->refcnt after
+	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
+	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
+	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
+	 * and subsystem's reference count handling. Please see css_get/put
+	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+	 */
+	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+	/*
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
 	ret = cgroup_call_pre_destroy(cgrp);
-	if (ret)
+	if (ret) {
+		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 		return ret;
+	}
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
 	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
-	/*
-	 * css_put/get is provided for subsys to grab refcnt to css. In typical
-	 * case, subsystem has no reference after pre_destroy(). But, under
-	 * hierarchy management, some *temporal* refcnt can be hold.
-	 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
-	 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
-	 * is called when css_put() is called and refcnt goes down to 0.
-	 */
-	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-
 	if (!cgroup_clear_css_refs(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
-		schedule();
+		/*
+		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
+		 * prepare_to_wait(), we need to check this flag.
+		 */
+		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
+			schedule();
 		finish_wait(&cgroup_rmdir_waitq, &wait);
 		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 		if (signal_pending(current))
@@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		cgroup_wakeup_rmdir_waiters(cgrp);
+		cgroup_wakeup_rmdir_waiter(cgrp);
 	}
 	rcu_read_unlock();
 }
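
The cgroup.c hunks above add a pair of helpers, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir(), so that a subsystem can hold a temporary css reference without making a concurrent rmdir() fail with -EBUSY. A minimal sketch of the intended pairing follows; the subsystem callback name is hypothetical and not part of this commit:

	/* Hypothetical subsystem path that temporarily pins a css. */
	static void example_subsys_operation(struct cgroup_subsys_state *css)
	{
		cgroup_exclude_rmdir(css);	/* css_get(): a racing rmdir() will wait */

		/* ... work that may briefly keep css->refcnt elevated ... */

		cgroup_release_and_wakeup_rmdir(css);	/* css_put() + wake CGRP_WAIT_ON_RMDIR waiter */
	}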
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f0dd54..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
 #include <linux/completion.h>
 #include <linux/personality.h>
 #include <linux/tty.h>
-#include <linux/mnt_namespace.h>
 #include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/security.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0aa..021e1138556e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/completion.h>
-#include <linux/mnt_namespace.h>
 #include <linux/personality.h>
 #include <linux/mempolicy.h>
 #include <linux/sem.h>
@@ -427,6 +426,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
+	mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
@@ -568,18 +568,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	 * the value intact in a core dump, and to save the unnecessary
 	 * trouble otherwise.  Userland only wants this done for a sys_exit.
 	 */
-	if (tsk->clear_child_tid
-	    && !(tsk->flags & PF_SIGNALED)
-	    && atomic_read(&mm->mm_users) > 1) {
-		u32 __user * tidptr = tsk->clear_child_tid;
+	if (tsk->clear_child_tid) {
+		if (!(tsk->flags & PF_SIGNALED) &&
+		    atomic_read(&mm->mm_users) > 1) {
+			/*
+			 * We don't check the error code - if userspace has
+			 * not set up a proper pointer then tough luck.
+			 */
+			put_user(0, tsk->clear_child_tid);
+			sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+					1, NULL, NULL, 0);
+		}
 		tsk->clear_child_tid = NULL;
-
-		/*
-		 * We don't check the error code - if userspace has
-		 * not set up a proper pointer then tough luck.
-		 */
-		put_user(0, tidptr);
-		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
 	}
 }
 
@@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
+	perf_counter_fork(p);
 	return p;
 
 bad_fork_free_pid:
@@ -1408,12 +1409,6 @@ long do_fork(unsigned long clone_flags,
 	if (clone_flags & CLONE_VFORK) {
 		p->vfork_done = &vfork;
 		init_completion(&vfork);
-	} else if (!(clone_flags & CLONE_VM)) {
-		/*
-		 * vfork will do an exec which will call
-		 * set_task_comm()
-		 */
-		perf_counter_fork(p);
 	}
 
 	audit_finish_fork(p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
 	recalc_sigpending(); /* We sent fake signal, clean it up */
 	spin_unlock_irq(&current->sighand->siglock);
 
+	/* prevent accounting of that task to load */
+	current->flags |= PF_FREEZING;
+
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!frozen(current))
 			break;
 		schedule();
 	}
+
+	/* Remove the accounting blocker */
+	current->flags &= ~PF_FREEZING;
+
 	pr_debug("%s left refrigerator\n", current->comm);
 	__set_current_state(save);
 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 1c337112335c..0672ff88f159 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -247,6 +247,7 @@ again:
 	if (err < 0)
 		return err;
 
+	page = compound_head(page);
 	lock_page(page);
 	if (!page->mapping) {
 		unlock_page(page);
@@ -299,7 +300,7 @@ void put_futex_key(int fshared, union futex_key *key)
 static int fault_in_user_writeable(u32 __user *uaddr)
 {
 	int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
-				 sizeof(*uaddr), 1, 0, NULL, NULL);
+				 1, 1, 0, NULL, NULL);
 	return ret < 0 ? ret : 0;
 }
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9002958a96e7..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 	}
 }
 
+
+/*
+ * Get the preferred target CPU for NOHZ
+ */
+static int hrtimer_get_target(int this_cpu, int pinned)
+{
+#ifdef CONFIG_NO_HZ
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
+		int preferred_cpu = get_nohz_load_balancer();
+
+		if (preferred_cpu >= 0)
+			return preferred_cpu;
+	}
+#endif
+	return this_cpu;
+}
+
+/*
+ * With HIGHRES=y we do not migrate the timer when it is expiring
+ * before the next event on the target cpu because we cannot reprogram
+ * the target cpu hardware and we would cause it to fire late.
+ *
+ * Called with cpu_base->lock of target cpu held.
+ */
+static int
+hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+	ktime_t expires;
+
+	if (!new_base->cpu_base->hres_active)
+		return 0;
+
+	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
+	return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Switch the timer base to the current CPU when possible.
  */
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 {
 	struct hrtimer_clock_base *new_base;
 	struct hrtimer_cpu_base *new_cpu_base;
-	int cpu, preferred_cpu = -1;
-
-	cpu = smp_processor_id();
-#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
-		preferred_cpu = get_nohz_load_balancer();
-		if (preferred_cpu >= 0)
-			cpu = preferred_cpu;
-	}
-#endif
+	int this_cpu = smp_processor_id();
+	int cpu = hrtimer_get_target(this_cpu, pinned);
 
 again:
 	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
 
 	if (base != new_base) {
 		/*
-		 * We are trying to schedule the timer on the local CPU.
+		 * We are trying to move timer to new_base.
 		 * However we can't change timer's base while it is running,
 		 * so we keep it on the same CPU. No hassle vs. reprogramming
 		 * the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
 		spin_unlock(&base->cpu_base->lock);
 		spin_lock(&new_base->cpu_base->lock);
 
-		/* Optimized away for NOHZ=n SMP=n */
-		if (cpu == preferred_cpu) {
-			/* Calculate clock monotonic expiry time */
-#ifdef CONFIG_HIGH_RES_TIMERS
-			ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
-							new_base->offset);
-#else
-			ktime_t expires = hrtimer_get_expires(timer);
-#endif
-
-			/*
-			 * Get the next event on target cpu from the
-			 * clock events layer.
-			 * This covers the highres=off nohz=on case as well.
-			 */
-			ktime_t next = clockevents_get_next_event(cpu);
-
-			ktime_t delta = ktime_sub(expires, next);
-
-			/*
-			 * We do not migrate the timer when it is expiring
-			 * before the next event on the target cpu because
-			 * we cannot reprogram the target cpu hardware and
-			 * we would cause it to fire late.
-			 */
-			if (delta.tv64 < 0) {
-				cpu = smp_processor_id();
-				spin_unlock(&new_base->cpu_base->lock);
-				spin_lock(&base->cpu_base->lock);
-				timer->base = base;
-				goto again;
-			}
+		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+			cpu = this_cpu;
+			spin_unlock(&new_base->cpu_base->lock);
+			spin_lock(&base->cpu_base->lock);
+			timer->base = base;
+			goto again;
 		}
 		timer->base = new_base;
 	}
@@ -1276,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 	expires_next.tv64 = KTIME_MAX;
 
+	spin_lock(&cpu_base->lock);
+	/*
+	 * We set expires_next to KTIME_MAX here with cpu_base->lock
+	 * held to prevent that a timer is enqueued in our queue via
+	 * the migration code. This does not affect enqueueing of
+	 * timers which run their callback and need to be requeued on
+	 * this CPU.
+	 */
+	cpu_base->expires_next.tv64 = KTIME_MAX;
+
 	base = cpu_base->clock_base;
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		ktime_t basenow;
 		struct rb_node *node;
 
-		spin_lock(&cpu_base->lock);
-
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
@@ -1316,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 			__run_hrtimer(timer);
 		}
-		spin_unlock(&cpu_base->lock);
 		base++;
 	}
 
+	/*
+	 * Store the new expiry value so the migration code can verify
+	 * against it.
+	 */
 	cpu_base->expires_next = expires_next;
+	spin_unlock(&cpu_base->lock);
 
 	/* Reprogramming necessary ? */
 	if (expires_next.tv64 != KTIME_MAX) {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 73468253143b..e70ed5592eb9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq,
 
 extern int irq_select_affinity_usr(unsigned int irq);
 
-extern void
-irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
+extern void irq_set_thread_affinity(struct irq_desc *desc);
 
 /*
  * Debugging printout:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 50da67672901..61c679db4687 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
 	return 1;
 }
 
-void
-irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
+/**
+ *	irq_set_thread_affinity - Notify irq threads to adjust affinity
+ *	@desc:		irq descriptor which has affitnity changed
+ *
+ *	We just set IRQTF_AFFINITY and delegate the affinity setting
+ *	to the interrupt thread itself. We can not call
+ *	set_cpus_allowed_ptr() here as we hold desc->lock and this
+ *	code can be called from hard interrupt context.
+ */
+void irq_set_thread_affinity(struct irq_desc *desc)
 {
 	struct irqaction *action = desc->action;
 
 	while (action) {
 		if (action->thread)
-			set_cpus_allowed_ptr(action->thread, cpumask);
+			set_bit(IRQTF_AFFINITY, &action->thread_flags);
 		action = action->next;
 	}
 }
@@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	if (desc->status & IRQ_MOVE_PCNTXT) {
 		if (!desc->chip->set_affinity(irq, cpumask)) {
 			cpumask_copy(desc->affinity, cpumask);
-			irq_set_thread_affinity(desc, cpumask);
+			irq_set_thread_affinity(desc);
 		}
 	}
 	else {
@@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 #else
 	if (!desc->chip->set_affinity(irq, cpumask)) {
 		cpumask_copy(desc->affinity, cpumask);
-		irq_set_thread_affinity(desc, cpumask);
+		irq_set_thread_affinity(desc);
 	}
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
@@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
 	spin_lock_irqsave(&desc->lock, flags);
 	ret = setup_affinity(irq, desc);
 	if (!ret)
-		irq_set_thread_affinity(desc, desc->affinity);
+		irq_set_thread_affinity(desc);
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return ret;
@@ -443,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
 	return -1;
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Check whether we need to change the affinity of the interrupt thread.
+ */
+static void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+{
+	cpumask_var_t mask;
+
+	if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
+		return;
+
+	/*
+	 * In case we are out of memory we set IRQTF_AFFINITY again and
+	 * try again next time
+	 */
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+		set_bit(IRQTF_AFFINITY, &action->thread_flags);
+		return;
+	}
+
+	spin_lock_irq(&desc->lock);
+	cpumask_copy(mask, desc->affinity);
+	spin_unlock_irq(&desc->lock);
+
+	set_cpus_allowed_ptr(current, mask);
+	free_cpumask_var(mask);
+}
+#else
+static inline void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
+#endif
+
 /*
  * Interrupt handler thread
  */
@@ -458,6 +499,8 @@ static int irq_thread(void *data)
 
 	while (!irq_wait_for_interrupt(action)) {
 
+		irq_thread_check_affinity(desc, action);
+
 		atomic_inc(&desc->threads_active);
 
 		spin_lock_irq(&desc->lock);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index cfe767ca1545..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -45,7 +45,7 @@ void move_masked_irq(int irq)
 		   < nr_cpu_ids))
 		if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
 			cpumask_copy(desc->affinity, desc->pending_mask);
-			irq_set_thread_affinity(desc, desc->pending_mask);
+			irq_set_thread_affinity(desc);
 		}
 
 	cpumask_clear(desc->pending_mask);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ae1c35201cc8..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
 	} while (*cur++ == ',');
 
 	if (*crash_size > 0) {
-		while (*cur != ' ' && *cur != '@')
+		while (*cur && *cur != ' ' && *cur != '@')
 			cur++;
 		if (*cur == '@') {
 			cur++;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
 #include <linux/unistd.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
-#include <linux/mnt_namespace.h>
 #include <linux/completion.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..0540948e29ab 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos, *next;
-	int safety;
 
 	/* Ensure no-one is preepmted on the garbages */
-	mutex_unlock(&kprobe_insn_mutex);
-	safety = check_safety();
-	mutex_lock(&kprobe_insn_mutex);
-	if (safety != 0)
+	if (check_safety())
 		return -EAGAIN;
 
 	hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)
 	p->addr = addr;
 
 	preempt_disable();
-	if (!__kernel_text_address((unsigned long) p->addr) ||
+	if (!kernel_text_address((unsigned long) p->addr) ||
 	    in_kprobes_functions((unsigned long) p->addr)) {
 		preempt_enable();
 		return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9b1a7de26979..eb8751aa0418 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind);
  * @k: thread created by kthread_create().
  *
  * Sets kthread_should_stop() for @k to return true, wakes it, and
- * waits for it to exit. Your threadfn() must not call do_exit()
- * itself if you use this function! This can also be called after
- * kthread_create() instead of calling wake_up_process(): the thread
- * will exit without calling threadfn().
+ * waits for it to exit. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will exit without
+ * calling threadfn().
+ *
+ * If threadfn() may call do_exit() itself, the caller must ensure
+ * task_struct can't go away.
  *
  * Returns the result of threadfn(), or %-EINTR if wake_up_process()
  * was never called.
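
The kthread.c hunk only reworks the kthread_stop() comment: threadfn() may now call do_exit() itself, provided the caller keeps the task_struct alive. A sketch of one common caller-side pattern that satisfies this (my_thread_fn and the thread name are hypothetical, not from this commit):

	struct task_struct *worker;

	worker = kthread_create(my_thread_fn, NULL, "example-worker");
	if (!IS_ERR(worker)) {
		get_task_struct(worker);	/* pin task_struct across a possible self-exit */
		wake_up_process(worker);
		/* ... */
		kthread_stop(worker);		/* returns threadfn() result, or -EINTR if never woken */
		put_task_struct(worker);
	}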
diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2b..fd1411403558 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 {
 	const unsigned long *crc;
 
-	if (!find_symbol("module_layout", NULL, &crc, true, false))
+	if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+			 &crc, true, false))
 		BUG();
 	return check_version(sechdrs, versindex, "module_layout", mod, crc);
 }
@@ -2451,9 +2452,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 		return ret;
 	}
 	if (ret > 0) {
-		printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
-				    "it should follow 0/-E convention\n"
-		       KERN_WARNING "%s: loading module anyway...\n",
+		printk(KERN_WARNING
+"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+"%s: loading module anyway...\n",
 		       __func__, mod->name, ret,
 		       __func__);
 		dump_stack();
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
  */
 void oops_enter(void)
 {
+	tracing_off();
 	/* can't trust the integrity of the kernel anymore: */
 	debug_locks_off();
 	do_oops_enter_exit();
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1a933a221ea4..673c1aaf7332 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
 static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
+static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
@@ -146,6 +147,28 @@ static void put_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+	if (ctx->parent_ctx) {
+		put_ctx(ctx->parent_ctx);
+		ctx->parent_ctx = NULL;
+	}
+}
+
+/*
+ * If we inherit counters we want to return the parent counter id
+ * to userspace.
+ */
+static u64 primary_counter_id(struct perf_counter *counter)
+{
+	u64 id = counter->id;
+
+	if (counter->parent)
+		id = counter->parent->id;
+
+	return id;
+}
+
 /*
  * Get the perf_counter_context for a task and lock it.
  * This has to cope with with the fact that until it is locked,
@@ -236,6 +259,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 	ctx->nr_counters++;
+	if (counter->attr.inherit_stat)
+		ctx->nr_stat++;
 }
 
 /*
@@ -250,6 +275,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	if (list_empty(&counter->list_entry))
 		return;
 	ctx->nr_counters--;
+	if (counter->attr.inherit_stat)
+		ctx->nr_stat--;
 
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
@@ -1006,6 +1033,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
 		&& !ctx1->pin_count && !ctx2->pin_count;
 }
 
+static void __perf_counter_read(void *counter);
+
+static void __perf_counter_sync_stat(struct perf_counter *counter,
+				     struct perf_counter *next_counter)
+{
+	u64 value;
+
+	if (!counter->attr.inherit_stat)
+		return;
+
+	/*
+	 * Update the counter value, we cannot use perf_counter_read()
+	 * because we're in the middle of a context switch and have IRQs
+	 * disabled, which upsets smp_call_function_single(), however
+	 * we know the counter must be on the current CPU, therefore we
+	 * don't need to use it.
+	 */
+	switch (counter->state) {
+	case PERF_COUNTER_STATE_ACTIVE:
+		__perf_counter_read(counter);
+		break;
+
+	case PERF_COUNTER_STATE_INACTIVE:
+		update_counter_times(counter);
+		break;
+
+	default:
+		break;
+	}
+
+	/*
+	 * In order to keep per-task stats reliable we need to flip the counter
+	 * values when we flip the contexts.
+	 */
+	value = atomic64_read(&next_counter->count);
+	value = atomic64_xchg(&counter->count, value);
+	atomic64_set(&next_counter->count, value);
+
+	swap(counter->total_time_enabled, next_counter->total_time_enabled);
+	swap(counter->total_time_running, next_counter->total_time_running);
+
+	/*
+	 * Since we swizzled the values, update the user visible data too.
+	 */
+	perf_counter_update_userpage(counter);
+	perf_counter_update_userpage(next_counter);
+}
+
+#define list_next_entry(pos, member) \
+	list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_counter_sync_stat(struct perf_counter_context *ctx,
+				   struct perf_counter_context *next_ctx)
+{
+	struct perf_counter *counter, *next_counter;
+
+	if (!ctx->nr_stat)
+		return;
+
+	counter = list_first_entry(&ctx->event_list,
+				   struct perf_counter, event_entry);
+
+	next_counter = list_first_entry(&next_ctx->event_list,
+					struct perf_counter, event_entry);
+
+	while (&counter->event_entry != &ctx->event_list &&
+	       &next_counter->event_entry != &next_ctx->event_list) {
+
+		__perf_counter_sync_stat(counter, next_counter);
+
+		counter = list_next_entry(counter, event_entry);
+		next_counter = list_next_entry(next_counter, event_entry);
+	}
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -1061,6 +1163,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
 		ctx->task = next;
 		next_ctx->task = task;
 		do_switch = 0;
+
+		perf_counter_sync_stat(ctx, next_ctx);
 	}
 	spin_unlock(&next_ctx->lock);
 	spin_unlock(&ctx->lock);
@@ -1207,7 +1311,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_counter *counter, int enable);
-static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_period(struct perf_counter *counter, u64 events)
 {
@@ -1226,8 +1329,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
 	if (!sample_period)
 		sample_period = 1;
 
-	perf_log_period(counter, sample_period);
-
 	hwc->sample_period = sample_period;
 }
 
@@ -1348,9 +1449,54 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 }
 
 /*
+ * Enable all of a task's counters that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_counter_enable_on_exec(struct task_struct *task)
+{
+	struct perf_counter_context *ctx;
+	struct perf_counter *counter;
+	unsigned long flags;
+	int enabled = 0;
+
+	local_irq_save(flags);
+	ctx = task->perf_counter_ctxp;
+	if (!ctx || !ctx->nr_counters)
+		goto out;
+
+	__perf_counter_task_sched_out(ctx);
+
+	spin_lock(&ctx->lock);
+
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (!counter->attr.enable_on_exec)
+			continue;
+		counter->attr.enable_on_exec = 0;
+		if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+			continue;
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled =
+			ctx->time - counter->total_time_enabled;
+		enabled = 1;
+	}
+
+	/*
+	 * Unclone this context if we enabled any counter.
+	 */
+	if (enabled)
+		unclone_ctx(ctx);
+
+	spin_unlock(&ctx->lock);
+
+	perf_counter_task_sched_in(task, smp_processor_id());
+ out:
+	local_irq_restore(flags);
+}
+
+/*
  * Cross CPU call to read the hardware counter
  */
-static void __read(void *info)
+static void __perf_counter_read(void *info)
 {
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
@@ -1372,7 +1518,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
-					 __read, counter, 1);
+					 __perf_counter_read, counter, 1);
 	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 		update_counter_times(counter);
 	}
@@ -1398,7 +1544,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
-	struct perf_counter_context *parent_ctx;
 	struct perf_counter_context *ctx;
 	struct perf_cpu_context *cpuctx;
 	struct task_struct *task;
@@ -1458,11 +1603,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
  retry:
 	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
-		parent_ctx = ctx->parent_ctx;
-		if (parent_ctx) {
-			put_ctx(parent_ctx);
-			ctx->parent_ctx = NULL; /* no longer a clone */
-		}
+		unclone_ctx(ctx);
 		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -1508,11 +1649,15 @@ static void free_counter(struct perf_counter *counter)
 {
 	perf_pending_sync(counter);
 
-	atomic_dec(&nr_counters);
-	if (counter->attr.mmap)
-		atomic_dec(&nr_mmap_counters);
-	if (counter->attr.comm)
-		atomic_dec(&nr_comm_counters);
+	if (!counter->parent) {
+		atomic_dec(&nr_counters);
+		if (counter->attr.mmap)
+			atomic_dec(&nr_mmap_counters);
+		if (counter->attr.comm)
+			atomic_dec(&nr_comm_counters);
+		if (counter->attr.task)
+			atomic_dec(&nr_task_counters);
+	}
 
 	if (counter->destroy)
 		counter->destroy(counter);
@@ -1546,6 +1691,18 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static u64 perf_counter_read_tree(struct perf_counter *counter)
+{
+	struct perf_counter *child;
+	u64 total = 0;
+
+	total += perf_counter_read(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		total += perf_counter_read(child);
+
+	return total;
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
@@ -1565,7 +1722,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
-	values[0] = perf_counter_read(counter);
+	values[0] = perf_counter_read_tree(counter);
 	n = 1;
 	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		values[n++] = counter->total_time_enabled +
@@ -1574,7 +1731,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
 	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = counter->id;
+		values[n++] = primary_counter_id(counter);
 	mutex_unlock(&counter->child_mutex);
 
 	if (count < n * sizeof(u64))
@@ -1681,8 +1838,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 
 		counter->attr.sample_freq = value;
 	} else {
-		perf_log_period(counter, value);
-
 		counter->attr.sample_period = value;
 		counter->hw.sample_period = value;
 	}
@@ -1751,6 +1906,14 @@ int perf_counter_task_disable(void)
 	return 0;
 }
 
+static int perf_counter_index(struct perf_counter *counter)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return 0;
+
+	return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1775,11 +1938,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
 	preempt_disable();
 	++userpg->lock;
 	barrier();
-	userpg->index = counter->hw.idx;
+	userpg->index = perf_counter_index(counter);
 	userpg->offset = atomic64_read(&counter->count);
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
 
+	userpg->time_enabled = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+
+	userpg->time_running = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+
 	barrier();
 	++userpg->lock;
 	preempt_enable();
@@ -1876,7 +2045,7 @@ fail:
 
 static void perf_mmap_free_page(unsigned long addr)
 {
-	struct page *page = virt_to_page(addr);
+	struct page *page = virt_to_page((void *)addr);
 
 	page->mapping = NULL;
 	__free_page(page);
@@ -2483,15 +2652,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u32 cpu, reserved;
 	} cpu_entry;
 
-	header.type = 0;
+	header.type = PERF_EVENT_SAMPLE;
 	header.size = sizeof(header);
 
-	header.misc = PERF_EVENT_MISC_OVERFLOW;
+	header.misc = 0;
 	header.misc |= perf_misc_flags(data->regs);
 
 	if (sample_type & PERF_SAMPLE_IP) {
 		ip = perf_instruction_pointer(data->regs);
-		header.type |= PERF_SAMPLE_IP;
 		header.size += sizeof(ip);
 	}
 
@@ -2500,7 +2668,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		tid_entry.pid = perf_counter_pid(counter, current);
 		tid_entry.tid = perf_counter_tid(counter, current);
 
-		header.type |= PERF_SAMPLE_TID;
 		header.size += sizeof(tid_entry);
 	}
 
@@ -2510,34 +2677,29 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		 */
 		time = sched_clock();
 
-		header.type |= PERF_SAMPLE_TIME;
 		header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_ADDR) {
-		header.type |= PERF_SAMPLE_ADDR;
+	if (sample_type & PERF_SAMPLE_ADDR)
+		header.size += sizeof(u64);
+
+	if (sample_type & PERF_SAMPLE_ID)
 		header.size += sizeof(u64);
-	}
 
-	if (sample_type & PERF_SAMPLE_ID) {
-		header.type |= PERF_SAMPLE_ID;
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
 		header.size += sizeof(u64);
-	}
 
 	if (sample_type & PERF_SAMPLE_CPU) {
-		header.type |= PERF_SAMPLE_CPU;
 		header.size += sizeof(cpu_entry);
 
 		cpu_entry.cpu = raw_smp_processor_id();
+		cpu_entry.reserved = 0;
 	}
 
-	if (sample_type & PERF_SAMPLE_PERIOD) {
-		header.type |= PERF_SAMPLE_PERIOD;
+	if (sample_type & PERF_SAMPLE_PERIOD)
 		header.size += sizeof(u64);
-	}
 
 	if (sample_type & PERF_SAMPLE_GROUP) {
-		header.type |= PERF_SAMPLE_GROUP;
 		header.size += sizeof(u64) +
 			counter->nr_siblings * sizeof(group_entry);
 	}
@@ -2547,10 +2709,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2547 2709
2548 if (callchain) { 2710 if (callchain) {
2549 callchain_size = (1 + callchain->nr) * sizeof(u64); 2711 callchain_size = (1 + callchain->nr) * sizeof(u64);
2550
2551 header.type |= PERF_SAMPLE_CALLCHAIN;
2552 header.size += callchain_size; 2712 header.size += callchain_size;
2553 } 2713 } else
2714 header.size += sizeof(u64);
2554 } 2715 }
2555 2716
2556 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2717 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
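Note on the hunks above: the PERF_SAMPLE_* bits are no longer OR-ed into header.type (it is now fixed at PERF_EVENT_SAMPLE), so only header.size is accumulated per enabled field, and a single zero u64 is reserved when PERF_SAMPLE_CALLCHAIN is requested but no chain is available; the record layout is then described by attr.sample_type. Below is a minimal userspace sketch of that size computation. The flag constants are illustrative stand-ins, not the ABI header's real values, and the usual 8-byte event header plus the two-u64 group entry written out further down are assumed.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative flag bits; placeholders, not the ABI's real values. */
enum {
	SAMPLE_IP        = 1 << 0,
	SAMPLE_TID       = 1 << 1,
	SAMPLE_TIME      = 1 << 2,
	SAMPLE_ADDR      = 1 << 3,
	SAMPLE_GROUP     = 1 << 4,
	SAMPLE_CALLCHAIN = 1 << 5,
	SAMPLE_ID        = 1 << 6,
	SAMPLE_CPU       = 1 << 7,
	SAMPLE_PERIOD    = 1 << 8,
	SAMPLE_STREAM_ID = 1 << 9,
};

/* Mirrors the accumulation above: 8-byte header, one u64-sized slot per
 * enabled field, a (u64 nr + two u64s per sibling) group block, and
 * either the callchain or a single zero u64 placeholder. */
static size_t sample_size(unsigned type, unsigned nr_siblings,
			  const uint64_t *callchain_nr)
{
	size_t size = 8;			/* event header */

	if (type & SAMPLE_IP)        size += sizeof(uint64_t);
	if (type & SAMPLE_TID)       size += sizeof(uint64_t); /* pid+tid */
	if (type & SAMPLE_TIME)      size += sizeof(uint64_t);
	if (type & SAMPLE_ADDR)      size += sizeof(uint64_t);
	if (type & SAMPLE_ID)        size += sizeof(uint64_t);
	if (type & SAMPLE_STREAM_ID) size += sizeof(uint64_t);
	if (type & SAMPLE_CPU)       size += sizeof(uint64_t); /* cpu+res */
	if (type & SAMPLE_PERIOD)    size += sizeof(uint64_t);
	if (type & SAMPLE_GROUP)
		size += sizeof(uint64_t) + nr_siblings * 2 * sizeof(uint64_t);
	if (type & SAMPLE_CALLCHAIN)
		size += callchain_nr ? (1 + *callchain_nr) * sizeof(uint64_t)
				     : sizeof(uint64_t);
	return size;
}

int main(void)
{
	uint64_t nr = 3;	/* e.g. a three-entry callchain */

	printf("%zu bytes\n", sample_size(SAMPLE_IP | SAMPLE_TID |
					  SAMPLE_CALLCHAIN, 0, &nr));
	return 0;
}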
@@ -2571,7 +2732,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2571 if (sample_type & PERF_SAMPLE_ADDR) 2732 if (sample_type & PERF_SAMPLE_ADDR)
2572 perf_output_put(&handle, data->addr); 2733 perf_output_put(&handle, data->addr);
2573 2734
2574 if (sample_type & PERF_SAMPLE_ID) 2735 if (sample_type & PERF_SAMPLE_ID) {
2736 u64 id = primary_counter_id(counter);
2737
2738 perf_output_put(&handle, id);
2739 }
2740
2741 if (sample_type & PERF_SAMPLE_STREAM_ID)
2575 perf_output_put(&handle, counter->id); 2742 perf_output_put(&handle, counter->id);
2576 2743
2577 if (sample_type & PERF_SAMPLE_CPU) 2744 if (sample_type & PERF_SAMPLE_CPU)
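With this hunk PERF_SAMPLE_ID reports the id of the primary counter (the parent, when the counter is an inherited clone), while the new PERF_SAMPLE_STREAM_ID still carries the per-instance counter->id. The body of primary_counter_id() is not visible in this hunk; judging from how it is used here it presumably collapses children onto their parent, roughly like the userspace analogue below (hypothetical types and names, for illustration only).

#include <stdio.h>
#include <stdint.h>

/* Minimal model of the id/stream_id split; not the kernel's types. */
struct counter {
	uint64_t id;			/* unique per counter instance   */
	const struct counter *parent;	/* set for inherited children    */
};

/* PERF_SAMPLE_ID: fold inherited children onto the parent's id. */
static uint64_t primary_id(const struct counter *c)
{
	return c->parent ? c->parent->id : c->id;
}

int main(void)
{
	struct counter parent = { .id = 1, .parent = NULL };
	struct counter child  = { .id = 7, .parent = &parent };

	/* PERF_SAMPLE_ID groups both samples under id 1 ...            */
	printf("id:        parent=%llu child=%llu\n",
	       (unsigned long long)primary_id(&parent),
	       (unsigned long long)primary_id(&child));
	/* ... PERF_SAMPLE_STREAM_ID still tells the streams apart.     */
	printf("stream_id: parent=%llu child=%llu\n",
	       (unsigned long long)parent.id,
	       (unsigned long long)child.id);
	return 0;
}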
@@ -2594,24 +2761,85 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2594 if (sub != counter) 2761 if (sub != counter)
2595 sub->pmu->read(sub); 2762 sub->pmu->read(sub);
2596 2763
2597 group_entry.id = sub->id; 2764 group_entry.id = primary_counter_id(sub);
2598 group_entry.counter = atomic64_read(&sub->count); 2765 group_entry.counter = atomic64_read(&sub->count);
2599 2766
2600 perf_output_put(&handle, group_entry); 2767 perf_output_put(&handle, group_entry);
2601 } 2768 }
2602 } 2769 }
2603 2770
2604 if (callchain) 2771 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2605 perf_output_copy(&handle, callchain, callchain_size); 2772 if (callchain)
2773 perf_output_copy(&handle, callchain, callchain_size);
2774 else {
2775 u64 nr = 0;
2776 perf_output_put(&handle, nr);
2777 }
2778 }
2779
2780 perf_output_end(&handle);
2781}
2782
2783/*
2784 * read event
2785 */
2786
2787struct perf_read_event {
2788 struct perf_event_header header;
2789
2790 u32 pid;
2791 u32 tid;
2792 u64 value;
2793 u64 format[3];
2794};
2795
2796static void
2797perf_counter_read_event(struct perf_counter *counter,
2798 struct task_struct *task)
2799{
2800 struct perf_output_handle handle;
2801 struct perf_read_event event = {
2802 .header = {
2803 .type = PERF_EVENT_READ,
2804 .misc = 0,
2805 .size = sizeof(event) - sizeof(event.format),
2806 },
2807 .pid = perf_counter_pid(counter, task),
2808 .tid = perf_counter_tid(counter, task),
2809 .value = atomic64_read(&counter->count),
2810 };
2811 int ret, i = 0;
2812
2813 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2814 event.header.size += sizeof(u64);
2815 event.format[i++] = counter->total_time_enabled;
2816 }
2817
2818 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2819 event.header.size += sizeof(u64);
2820 event.format[i++] = counter->total_time_running;
2821 }
2822
2823 if (counter->attr.read_format & PERF_FORMAT_ID) {
2824 event.header.size += sizeof(u64);
2825 event.format[i++] = primary_counter_id(counter);
2826 }
2606 2827
2828 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
2829 if (ret)
2830 return;
2831
2832 perf_output_copy(&handle, &event, event.header.size);
2607 perf_output_end(&handle); 2833 perf_output_end(&handle);
2608} 2834}
2609 2835
2610/* 2836/*
2611 * fork tracking 2837 * task tracking -- fork/exit
2838 *
2839 * enabled by: attr.comm | attr.mmap | attr.task
2612 */ 2840 */
2613 2841
2614struct perf_fork_event { 2842struct perf_task_event {
2615 struct task_struct *task; 2843 struct task_struct *task;
2616 2844
2617 struct { 2845 struct {
@@ -2619,37 +2847,42 @@ struct perf_fork_event {
2619 2847
2620 u32 pid; 2848 u32 pid;
2621 u32 ppid; 2849 u32 ppid;
2850 u32 tid;
2851 u32 ptid;
2622 } event; 2852 } event;
2623}; 2853};
2624 2854
2625static void perf_counter_fork_output(struct perf_counter *counter, 2855static void perf_counter_task_output(struct perf_counter *counter,
2626 struct perf_fork_event *fork_event) 2856 struct perf_task_event *task_event)
2627{ 2857{
2628 struct perf_output_handle handle; 2858 struct perf_output_handle handle;
2629 int size = fork_event->event.header.size; 2859 int size = task_event->event.header.size;
2630 struct task_struct *task = fork_event->task; 2860 struct task_struct *task = task_event->task;
2631 int ret = perf_output_begin(&handle, counter, size, 0, 0); 2861 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2632 2862
2633 if (ret) 2863 if (ret)
2634 return; 2864 return;
2635 2865
2636 fork_event->event.pid = perf_counter_pid(counter, task); 2866 task_event->event.pid = perf_counter_pid(counter, task);
2637 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 2867 task_event->event.ppid = perf_counter_pid(counter, task->real_parent);
2638 2868
2639 perf_output_put(&handle, fork_event->event); 2869 task_event->event.tid = perf_counter_tid(counter, task);
2870 task_event->event.ptid = perf_counter_tid(counter, task->real_parent);
2871
2872 perf_output_put(&handle, task_event->event);
2640 perf_output_end(&handle); 2873 perf_output_end(&handle);
2641} 2874}
2642 2875
2643static int perf_counter_fork_match(struct perf_counter *counter) 2876static int perf_counter_task_match(struct perf_counter *counter)
2644{ 2877{
2645 if (counter->attr.comm || counter->attr.mmap) 2878 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2646 return 1; 2879 return 1;
2647 2880
2648 return 0; 2881 return 0;
2649} 2882}
2650 2883
2651static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 2884static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2652 struct perf_fork_event *fork_event) 2885 struct perf_task_event *task_event)
2653{ 2886{
2654 struct perf_counter *counter; 2887 struct perf_counter *counter;
2655 2888
@@ -2658,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2658 2891
2659 rcu_read_lock(); 2892 rcu_read_lock();
2660 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 2893 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2661 if (perf_counter_fork_match(counter)) 2894 if (perf_counter_task_match(counter))
2662 perf_counter_fork_output(counter, fork_event); 2895 perf_counter_task_output(counter, task_event);
2663 } 2896 }
2664 rcu_read_unlock(); 2897 rcu_read_unlock();
2665} 2898}
2666 2899
2667static void perf_counter_fork_event(struct perf_fork_event *fork_event) 2900static void perf_counter_task_event(struct perf_task_event *task_event)
2668{ 2901{
2669 struct perf_cpu_context *cpuctx; 2902 struct perf_cpu_context *cpuctx;
2670 struct perf_counter_context *ctx; 2903 struct perf_counter_context *ctx;
2671 2904
2672 cpuctx = &get_cpu_var(perf_cpu_context); 2905 cpuctx = &get_cpu_var(perf_cpu_context);
2673 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 2906 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2674 put_cpu_var(perf_cpu_context); 2907 put_cpu_var(perf_cpu_context);
2675 2908
2676 rcu_read_lock(); 2909 rcu_read_lock();
@@ -2680,29 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event)
2680 */ 2913 */
2681 ctx = rcu_dereference(current->perf_counter_ctxp); 2914 ctx = rcu_dereference(current->perf_counter_ctxp);
2682 if (ctx) 2915 if (ctx)
2683 perf_counter_fork_ctx(ctx, fork_event); 2916 perf_counter_task_ctx(ctx, task_event);
2684 rcu_read_unlock(); 2917 rcu_read_unlock();
2685} 2918}
2686 2919
2687void perf_counter_fork(struct task_struct *task) 2920static void perf_counter_task(struct task_struct *task, int new)
2688{ 2921{
2689 struct perf_fork_event fork_event; 2922 struct perf_task_event task_event;
2690 2923
2691 if (!atomic_read(&nr_comm_counters) && 2924 if (!atomic_read(&nr_comm_counters) &&
2692 !atomic_read(&nr_mmap_counters)) 2925 !atomic_read(&nr_mmap_counters) &&
2926 !atomic_read(&nr_task_counters))
2693 return; 2927 return;
2694 2928
2695 fork_event = (struct perf_fork_event){ 2929 task_event = (struct perf_task_event){
2696 .task = task, 2930 .task = task,
2697 .event = { 2931 .event = {
2698 .header = { 2932 .header = {
2699 .type = PERF_EVENT_FORK, 2933 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2700 .size = sizeof(fork_event.event), 2934 .misc = 0,
2935 .size = sizeof(task_event.event),
2701 }, 2936 },
2937 /* .pid */
2938 /* .ppid */
2939 /* .tid */
2940 /* .ptid */
2702 }, 2941 },
2703 }; 2942 };
2704 2943
2705 perf_counter_fork_event(&fork_event); 2944 perf_counter_task_event(&task_event);
2945}
2946
2947void perf_counter_fork(struct task_struct *task)
2948{
2949 perf_counter_task(task, 1);
2706} 2950}
2707 2951
2708/* 2952/*
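The renamed task-tracking code above emits one record layout for both PERF_EVENT_FORK and PERF_EVENT_EXIT: the event header followed by pid/ppid and the new tid/ptid pair. A hedged consumer-side sketch follows; the struct layout mirrors the fields written above, but the names and the numeric type values are placeholders chosen for illustration, not the ABI header's definitions.

#include <stdio.h>
#include <stdint.h>

/* Placeholder layout mirroring the fields written above. */
struct event_header { uint32_t type; uint16_t misc; uint16_t size; };

struct task_record {
	struct event_header header;
	uint32_t pid, ppid;	/* task and its real parent          */
	uint32_t tid, ptid;	/* thread and the parent's thread    */
};

enum { EV_EXIT = 4, EV_FORK = 7 };	/* illustrative values only  */

static void handle_task_record(const struct task_record *r)
{
	if (r->header.type == EV_FORK)
		printf("fork: %u/%u from %u/%u\n",
		       r->pid, r->tid, r->ppid, r->ptid);
	else if (r->header.type == EV_EXIT)
		printf("exit: %u/%u (parent %u/%u)\n",
		       r->pid, r->tid, r->ppid, r->ptid);
}

int main(void)
{
	struct task_record r = {
		.header = { .type = EV_FORK, .size = sizeof(r) },
		.pid = 1042, .ppid = 1, .tid = 1042, .ptid = 1,
	};

	handle_task_record(&r);
	return 0;
}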
@@ -2770,8 +3014,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2770 struct perf_cpu_context *cpuctx; 3014 struct perf_cpu_context *cpuctx;
2771 struct perf_counter_context *ctx; 3015 struct perf_counter_context *ctx;
2772 unsigned int size; 3016 unsigned int size;
2773 char *comm = comm_event->task->comm; 3017 char comm[TASK_COMM_LEN];
2774 3018
3019 memset(comm, 0, sizeof(comm));
3020 strncpy(comm, comm_event->task->comm, sizeof(comm));
2775 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3021 size = ALIGN(strlen(comm)+1, sizeof(u64));
2776 3022
2777 comm_event->comm = comm; 3023 comm_event->comm = comm;
@@ -2798,13 +3044,24 @@ void perf_counter_comm(struct task_struct *task)
2798{ 3044{
2799 struct perf_comm_event comm_event; 3045 struct perf_comm_event comm_event;
2800 3046
3047 if (task->perf_counter_ctxp)
3048 perf_counter_enable_on_exec(task);
3049
2801 if (!atomic_read(&nr_comm_counters)) 3050 if (!atomic_read(&nr_comm_counters))
2802 return; 3051 return;
2803 3052
2804 comm_event = (struct perf_comm_event){ 3053 comm_event = (struct perf_comm_event){
2805 .task = task, 3054 .task = task,
3055 /* .comm */
3056 /* .comm_size */
2806 .event = { 3057 .event = {
2807 .header = { .type = PERF_EVENT_COMM, }, 3058 .header = {
3059 .type = PERF_EVENT_COMM,
3060 .misc = 0,
3061 /* .size */
3062 },
3063 /* .pid */
3064 /* .tid */
2808 }, 3065 },
2809 }; 3066 };
2810 3067
@@ -2887,8 +3144,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2887 char *buf = NULL; 3144 char *buf = NULL;
2888 const char *name; 3145 const char *name;
2889 3146
3147 memset(tmp, 0, sizeof(tmp));
3148
2890 if (file) { 3149 if (file) {
2891 buf = kzalloc(PATH_MAX, GFP_KERNEL); 3150 /*
3151 * d_path works from the end of the buffer backwards, so we
3152 * need to add enough zero bytes after the string to handle
3153 * the 64bit alignment we do later.
3154 */
3155 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
2892 if (!buf) { 3156 if (!buf) {
2893 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3157 name = strncpy(tmp, "//enomem", sizeof(tmp));
2894 goto got_name; 3158 goto got_name;
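The comment in the hunk above is the whole story: d_path() builds the path at the tail of the buffer, and the mmap event later copies a length rounded up to a multiple of sizeof(u64), so up to seven bytes beyond the terminating NUL are copied into the ring buffer. Allocating PATH_MAX + sizeof(u64) zeroed bytes guarantees those trailing bytes are zeros rather than stale heap contents. A tiny userspace illustration of the arithmetic (example path only):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	const char *name = "/lib/libc-2.9.so";	/* example path      */
	size_t len    = strlen(name) + 1;	/* including NUL     */
	size_t copied = ALIGN_UP(len, sizeof(uint64_t));

	/* Up to 7 bytes past the NUL go into the event record, which */
	/* is why the allocation adds sizeof(u64) zeroed bytes.       */
	printf("strlen+1=%zu copied=%zu padding=%zu\n",
	       len, copied, copied - len);
	return 0;
}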
@@ -2899,9 +3163,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2899 goto got_name; 3163 goto got_name;
2900 } 3164 }
2901 } else { 3165 } else {
2902 name = arch_vma_name(mmap_event->vma); 3166 if (arch_vma_name(mmap_event->vma)) {
2903 if (name) 3167 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3168 sizeof(tmp));
2904 goto got_name; 3169 goto got_name;
3170 }
2905 3171
2906 if (!vma->vm_mm) { 3172 if (!vma->vm_mm) {
2907 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3173 name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@ -2946,8 +3212,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2946 3212
2947 mmap_event = (struct perf_mmap_event){ 3213 mmap_event = (struct perf_mmap_event){
2948 .vma = vma, 3214 .vma = vma,
3215 /* .file_name */
3216 /* .file_size */
2949 .event = { 3217 .event = {
2950 .header = { .type = PERF_EVENT_MMAP, }, 3218 .header = {
3219 .type = PERF_EVENT_MMAP,
3220 .misc = 0,
3221 /* .size */
3222 },
3223 /* .pid */
3224 /* .tid */
2951 .start = vma->vm_start, 3225 .start = vma->vm_start,
2952 .len = vma->vm_end - vma->vm_start, 3226 .len = vma->vm_end - vma->vm_start,
2953 .pgoff = vma->vm_pgoff, 3227 .pgoff = vma->vm_pgoff,
@@ -2958,49 +3232,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2958} 3232}
2959 3233
2960/* 3234/*
2961 * Log sample_period changes so that analyzing tools can re-normalize the
2962 * event flow.
2963 */
2964
2965struct freq_event {
2966 struct perf_event_header header;
2967 u64 time;
2968 u64 id;
2969 u64 period;
2970};
2971
2972static void perf_log_period(struct perf_counter *counter, u64 period)
2973{
2974 struct perf_output_handle handle;
2975 struct freq_event event;
2976 int ret;
2977
2978 if (counter->hw.sample_period == period)
2979 return;
2980
2981 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2982 return;
2983
2984 event = (struct freq_event) {
2985 .header = {
2986 .type = PERF_EVENT_PERIOD,
2987 .misc = 0,
2988 .size = sizeof(event),
2989 },
2990 .time = sched_clock(),
2991 .id = counter->id,
2992 .period = period,
2993 };
2994
2995 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2996 if (ret)
2997 return;
2998
2999 perf_output_put(&handle, event);
3000 perf_output_end(&handle);
3001}
3002
3003/*
3004 * IRQ throttle logging 3235 * IRQ throttle logging
3005 */ 3236 */
3006 3237
@@ -3013,16 +3244,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
3013 struct perf_event_header header; 3244 struct perf_event_header header;
3014 u64 time; 3245 u64 time;
3015 u64 id; 3246 u64 id;
3247 u64 stream_id;
3016 } throttle_event = { 3248 } throttle_event = {
3017 .header = { 3249 .header = {
3018 .type = PERF_EVENT_THROTTLE + 1, 3250 .type = PERF_EVENT_THROTTLE,
3019 .misc = 0, 3251 .misc = 0,
3020 .size = sizeof(throttle_event), 3252 .size = sizeof(throttle_event),
3021 }, 3253 },
3022 .time = sched_clock(), 3254 .time = sched_clock(),
3023 .id = counter->id, 3255 .id = primary_counter_id(counter),
3256 .stream_id = counter->id,
3024 }; 3257 };
3025 3258
3259 if (enable)
3260 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3261
3026 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); 3262 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3027 if (ret) 3263 if (ret)
3028 return; 3264 return;
@@ -3317,8 +3553,8 @@ out:
3317 put_cpu_var(perf_cpu_context); 3553 put_cpu_var(perf_cpu_context);
3318} 3554}
3319 3555
3320void 3556void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3557 struct pt_regs *regs, u64 addr)
3322{ 3558{
3323 struct perf_sample_data data = { 3559 struct perf_sample_data data = {
3324 .regs = regs, 3560 .regs = regs,
@@ -3470,7 +3706,7 @@ static const struct pmu perf_ops_task_clock = {
3470void perf_tpcounter_event(int event_id) 3706void perf_tpcounter_event(int event_id)
3471{ 3707{
3472 struct perf_sample_data data = { 3708 struct perf_sample_data data = {
3473 .regs = get_irq_regs(); 3709 .regs = get_irq_regs(),
3474 .addr = 0, 3710 .addr = 0,
3475 }; 3711 };
3476 3712
@@ -3486,16 +3722,12 @@ extern void ftrace_profile_disable(int);
3486 3722
3487static void tp_perf_counter_destroy(struct perf_counter *counter) 3723static void tp_perf_counter_destroy(struct perf_counter *counter)
3488{ 3724{
3489 ftrace_profile_disable(perf_event_id(&counter->attr)); 3725 ftrace_profile_disable(counter->attr.config);
3490} 3726}
3491 3727
3492static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3728static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3493{ 3729{
3494 int event_id = perf_event_id(&counter->attr); 3730 if (ftrace_profile_enable(counter->attr.config))
3495 int ret;
3496
3497 ret = ftrace_profile_enable(event_id);
3498 if (ret)
3499 return NULL; 3731 return NULL;
3500 3732
3501 counter->destroy = tp_perf_counter_destroy; 3733 counter->destroy = tp_perf_counter_destroy;
@@ -3509,9 +3741,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3509} 3741}
3510#endif 3742#endif
3511 3743
3744atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3745
3746static void sw_perf_counter_destroy(struct perf_counter *counter)
3747{
3748 u64 event = counter->attr.config;
3749
3750 WARN_ON(counter->parent);
3751
3752 atomic_dec(&perf_swcounter_enabled[event]);
3753}
3754
3512static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 3755static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3513{ 3756{
3514 const struct pmu *pmu = NULL; 3757 const struct pmu *pmu = NULL;
3758 u64 event = counter->attr.config;
3515 3759
3516 /* 3760 /*
3517 * Software counters (currently) can't in general distinguish 3761 * Software counters (currently) can't in general distinguish
@@ -3520,7 +3764,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3520 * to be kernel events, and page faults are never hypervisor 3764 * to be kernel events, and page faults are never hypervisor
3521 * events. 3765 * events.
3522 */ 3766 */
3523 switch (counter->attr.config) { 3767 switch (event) {
3524 case PERF_COUNT_SW_CPU_CLOCK: 3768 case PERF_COUNT_SW_CPU_CLOCK:
3525 pmu = &perf_ops_cpu_clock; 3769 pmu = &perf_ops_cpu_clock;
3526 3770
@@ -3541,6 +3785,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3541 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 3785 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3542 case PERF_COUNT_SW_CONTEXT_SWITCHES: 3786 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3543 case PERF_COUNT_SW_CPU_MIGRATIONS: 3787 case PERF_COUNT_SW_CPU_MIGRATIONS:
3788 if (!counter->parent) {
3789 atomic_inc(&perf_swcounter_enabled[event]);
3790 counter->destroy = sw_perf_counter_destroy;
3791 }
3544 pmu = &perf_ops_generic; 3792 pmu = &perf_ops_generic;
3545 break; 3793 break;
3546 } 3794 }
@@ -3556,6 +3804,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3556 int cpu, 3804 int cpu,
3557 struct perf_counter_context *ctx, 3805 struct perf_counter_context *ctx,
3558 struct perf_counter *group_leader, 3806 struct perf_counter *group_leader,
3807 struct perf_counter *parent_counter,
3559 gfp_t gfpflags) 3808 gfp_t gfpflags)
3560{ 3809{
3561 const struct pmu *pmu; 3810 const struct pmu *pmu;
@@ -3591,6 +3840,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3591 counter->ctx = ctx; 3840 counter->ctx = ctx;
3592 counter->oncpu = -1; 3841 counter->oncpu = -1;
3593 3842
3843 counter->parent = parent_counter;
3844
3594 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 3845 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3595 counter->id = atomic64_inc_return(&perf_counter_id); 3846 counter->id = atomic64_inc_return(&perf_counter_id);
3596 3847
@@ -3648,11 +3899,15 @@ done:
3648 3899
3649 counter->pmu = pmu; 3900 counter->pmu = pmu;
3650 3901
3651 atomic_inc(&nr_counters); 3902 if (!counter->parent) {
3652 if (counter->attr.mmap) 3903 atomic_inc(&nr_counters);
3653 atomic_inc(&nr_mmap_counters); 3904 if (counter->attr.mmap)
3654 if (counter->attr.comm) 3905 atomic_inc(&nr_mmap_counters);
3655 atomic_inc(&nr_comm_counters); 3906 if (counter->attr.comm)
3907 atomic_inc(&nr_comm_counters);
3908 if (counter->attr.task)
3909 atomic_inc(&nr_task_counters);
3910 }
3656 3911
3657 return counter; 3912 return counter;
3658} 3913}
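Here, as in the software-counter hunk a little earlier, only counters without a parent bump the enable counts (nr_counters, nr_mmap_counters, nr_comm_counters, the new nr_task_counters and perf_swcounter_enabled[]). As far as the guards show, the intent is that inherited per-child clones ride on their parent's reference instead of inflating the counts on every fork, with the destroy paths decrementing under the same !parent condition. A minimal model of that rule (hypothetical names):

#include <stdio.h>

static int nr_counters;

struct counter { const struct counter *parent; };

static void account_alloc(const struct counter *c)
{
	if (!c->parent)			/* inherited clone: no new count */
		nr_counters++;
}

static void account_free(const struct counter *c)
{
	if (!c->parent)
		nr_counters--;
}

int main(void)
{
	struct counter parent = { .parent = NULL };
	struct counter child  = { .parent = &parent };	/* per-fork clone */

	account_alloc(&parent);
	account_alloc(&child);
	printf("after alloc: %d\n", nr_counters);	/* 1 */

	account_free(&child);
	account_free(&parent);
	printf("after free:  %d\n", nr_counters);	/* back to 0 */
	return 0;
}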
@@ -3815,7 +4070,7 @@ SYSCALL_DEFINE5(perf_counter_open,
3815 } 4070 }
3816 4071
3817 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4072 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3818 GFP_KERNEL); 4073 NULL, GFP_KERNEL);
3819 ret = PTR_ERR(counter); 4074 ret = PTR_ERR(counter);
3820 if (IS_ERR(counter)) 4075 if (IS_ERR(counter))
3821 goto err_put_context; 4076 goto err_put_context;
@@ -3881,7 +4136,8 @@ inherit_counter(struct perf_counter *parent_counter,
3881 4136
3882 child_counter = perf_counter_alloc(&parent_counter->attr, 4137 child_counter = perf_counter_alloc(&parent_counter->attr,
3883 parent_counter->cpu, child_ctx, 4138 parent_counter->cpu, child_ctx,
3884 group_leader, GFP_KERNEL); 4139 group_leader, parent_counter,
4140 GFP_KERNEL);
3885 if (IS_ERR(child_counter)) 4141 if (IS_ERR(child_counter))
3886 return child_counter; 4142 return child_counter;
3887 get_ctx(child_ctx); 4143 get_ctx(child_ctx);
@@ -3904,12 +4160,6 @@ inherit_counter(struct perf_counter *parent_counter,
3904 */ 4160 */
3905 add_counter_to_ctx(child_counter, child_ctx); 4161 add_counter_to_ctx(child_counter, child_ctx);
3906 4162
3907 child_counter->parent = parent_counter;
3908 /*
3909 * inherit into child's child as well:
3910 */
3911 child_counter->attr.inherit = 1;
3912
3913 /* 4163 /*
3914 * Get a reference to the parent filp - we will fput it 4164 * Get a reference to the parent filp - we will fput it
3915 * when the child counter exits. This is safe to do because 4165 * when the child counter exits. This is safe to do because
@@ -3953,10 +4203,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3953} 4203}
3954 4204
3955static void sync_child_counter(struct perf_counter *child_counter, 4205static void sync_child_counter(struct perf_counter *child_counter,
3956 struct perf_counter *parent_counter) 4206 struct task_struct *child)
3957{ 4207{
4208 struct perf_counter *parent_counter = child_counter->parent;
3958 u64 child_val; 4209 u64 child_val;
3959 4210
4211 if (child_counter->attr.inherit_stat)
4212 perf_counter_read_event(child_counter, child);
4213
3960 child_val = atomic64_read(&child_counter->count); 4214 child_val = atomic64_read(&child_counter->count);
3961 4215
3962 /* 4216 /*
@@ -3985,7 +4239,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3985 4239
3986static void 4240static void
3987__perf_counter_exit_task(struct perf_counter *child_counter, 4241__perf_counter_exit_task(struct perf_counter *child_counter,
3988 struct perf_counter_context *child_ctx) 4242 struct perf_counter_context *child_ctx,
4243 struct task_struct *child)
3989{ 4244{
3990 struct perf_counter *parent_counter; 4245 struct perf_counter *parent_counter;
3991 4246
@@ -3999,7 +4254,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3999 * counters need to be zapped - but otherwise linger. 4254 * counters need to be zapped - but otherwise linger.
4000 */ 4255 */
4001 if (parent_counter) { 4256 if (parent_counter) {
4002 sync_child_counter(child_counter, parent_counter); 4257 sync_child_counter(child_counter, child);
4003 free_counter(child_counter); 4258 free_counter(child_counter);
4004 } 4259 }
4005} 4260}
@@ -4013,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child)
4013 struct perf_counter_context *child_ctx; 4268 struct perf_counter_context *child_ctx;
4014 unsigned long flags; 4269 unsigned long flags;
4015 4270
4016 if (likely(!child->perf_counter_ctxp)) 4271 if (likely(!child->perf_counter_ctxp)) {
4272 perf_counter_task(child, 0);
4017 return; 4273 return;
4274 }
4018 4275
4019 local_irq_save(flags); 4276 local_irq_save(flags);
4020 /* 4277 /*
@@ -4032,18 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child)
4032 * incremented the context's refcount before we do put_ctx below. 4289 * incremented the context's refcount before we do put_ctx below.
4033 */ 4290 */
4034 spin_lock(&child_ctx->lock); 4291 spin_lock(&child_ctx->lock);
4292 /*
4293 * If this context is a clone; unclone it so it can't get
4294 * swapped to another process while we're removing all
4295 * the counters from it.
4296 */
4297 unclone_ctx(child_ctx);
4298 spin_unlock_irqrestore(&child_ctx->lock, flags);
4299
4300 /*
4301 * Report the task dead after unscheduling the counters so that we
4302 * won't get any samples after PERF_EVENT_EXIT. We can however still
4303 * get a few PERF_EVENT_READ events.
4304 */
4305 perf_counter_task(child, 0);
4306
4035 child->perf_counter_ctxp = NULL; 4307 child->perf_counter_ctxp = NULL;
4036 if (child_ctx->parent_ctx) {
4037 /*
4038 * This context is a clone; unclone it so it can't get
4039 * swapped to another process while we're removing all
4040 * the counters from it.
4041 */
4042 put_ctx(child_ctx->parent_ctx);
4043 child_ctx->parent_ctx = NULL;
4044 }
4045 spin_unlock(&child_ctx->lock);
4046 local_irq_restore(flags);
4047 4308
4048 /* 4309 /*
4049 * We can recurse on the same lock type through: 4310 * We can recurse on the same lock type through:
@@ -4061,7 +4322,7 @@ void perf_counter_exit_task(struct task_struct *child)
4061again: 4322again:
4062 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4323 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4063 list_entry) 4324 list_entry)
4064 __perf_counter_exit_task(child_counter, child_ctx); 4325 __perf_counter_exit_task(child_counter, child_ctx, child);
4065 4326
4066 /* 4327 /*
4067 * If the last counter was a group counter, it will have appended all 4328 * If the last counter was a group counter, it will have appended all
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 69911b5745eb..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -117,11 +117,12 @@ int __ref profile_init(void)
117 117
118 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
119 119
120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
121 if (prof_buffer) 121 if (prof_buffer)
122 return 0; 122 return 0;
123 123
124 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
125 if (prof_buffer) 126 if (prof_buffer)
126 return 0; 127 return 0;
127 128
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61c78b2c07ba..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task)
181 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
182 * under ptrace. 182 * under ptrace.
183 */ 183 */
184 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
185 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
186 goto out; 186 goto out;
187 187
188 task_lock(task); 188 task_lock(task);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0dccfbba6d26..7717b95c2027 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1533,7 +1533,7 @@ void __init __rcu_init(void)
1533 int j; 1533 int j;
1534 struct rcu_node *rnp; 1534 struct rcu_node *rnp;
1535 1535
1536 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1536 printk(KERN_INFO "Hierarchical RCU implementation.\n");
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1546,7 +1546,6 @@ void __init __rcu_init(void)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); 1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */ 1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb); 1548 register_cpu_notifier(&rcu_nb);
1549 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1550} 1549}
1551 1550
1552module_param(blimit, int, 0); 1551module_param(blimit, int, 0);
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/sched.c b/kernel/sched.c
index 7c9098d186e6..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -493,6 +493,7 @@ struct rt_rq {
493#endif 493#endif
494#ifdef CONFIG_SMP 494#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory; 495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
496 int overloaded; 497 int overloaded;
497 struct plist_head pushable_tasks; 498 struct plist_head pushable_tasks;
498#endif 499#endif
@@ -2571,15 +2572,37 @@ static void __sched_fork(struct task_struct *p)
2571 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2572 2573
2573#ifdef CONFIG_SCHEDSTATS 2574#ifdef CONFIG_SCHEDSTATS
2574 p->se.wait_start = 0; 2575 p->se.wait_start = 0;
2575 p->se.sum_sleep_runtime = 0; 2576 p->se.wait_max = 0;
2576 p->se.sleep_start = 0; 2577 p->se.wait_count = 0;
2577 p->se.block_start = 0; 2578 p->se.wait_sum = 0;
2578 p->se.sleep_max = 0; 2579
2579 p->se.block_max = 0; 2580 p->se.sleep_start = 0;
2580 p->se.exec_max = 0; 2581 p->se.sleep_max = 0;
2581 p->se.slice_max = 0; 2582 p->se.sum_sleep_runtime = 0;
2582 p->se.wait_max = 0; 2583
2584 p->se.block_start = 0;
2585 p->se.block_max = 0;
2586 p->se.exec_max = 0;
2587 p->se.slice_max = 0;
2588
2589 p->se.nr_migrations_cold = 0;
2590 p->se.nr_failed_migrations_affine = 0;
2591 p->se.nr_failed_migrations_running = 0;
2592 p->se.nr_failed_migrations_hot = 0;
2593 p->se.nr_forced_migrations = 0;
2594 p->se.nr_forced2_migrations = 0;
2595
2596 p->se.nr_wakeups = 0;
2597 p->se.nr_wakeups_sync = 0;
2598 p->se.nr_wakeups_migrate = 0;
2599 p->se.nr_wakeups_local = 0;
2600 p->se.nr_wakeups_remote = 0;
2601 p->se.nr_wakeups_affine = 0;
2602 p->se.nr_wakeups_affine_attempts = 0;
2603 p->se.nr_wakeups_passive = 0;
2604 p->se.nr_wakeups_idle = 0;
2605
2583#endif 2606#endif
2584 2607
2585 INIT_LIST_HEAD(&p->rt.run_list); 2608 INIT_LIST_HEAD(&p->rt.run_list);
@@ -6541,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
6541 return 0; 6564 return 0;
6542} 6565}
6543 6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
6544static void __cond_resched(void) 6572static void __cond_resched(void)
6545{ 6573{
6546#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6560,8 +6588,7 @@ static void __cond_resched(void)
6560 6588
6561int __sched _cond_resched(void) 6589int __sched _cond_resched(void)
6562{ 6590{
6563 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6591 if (should_resched()) {
6564 system_state == SYSTEM_RUNNING) {
6565 __cond_resched(); 6592 __cond_resched();
6566 return 1; 6593 return 1;
6567 } 6594 }
@@ -6579,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
6579 */ 6606 */
6580int cond_resched_lock(spinlock_t *lock) 6607int cond_resched_lock(spinlock_t *lock)
6581{ 6608{
6582 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6609 int resched = should_resched();
6583 int ret = 0; 6610 int ret = 0;
6584 6611
6585 if (spin_needbreak(lock) || resched) { 6612 if (spin_needbreak(lock) || resched) {
6586 spin_unlock(lock); 6613 spin_unlock(lock);
6587 if (resched && need_resched()) 6614 if (resched)
6588 __cond_resched(); 6615 __cond_resched();
6589 else 6616 else
6590 cpu_relax(); 6617 cpu_relax();
@@ -6599,7 +6626,7 @@ int __sched cond_resched_softirq(void)
6599{ 6626{
6600 BUG_ON(!in_softirq()); 6627 BUG_ON(!in_softirq());
6601 6628
6602 if (need_resched() && system_state == SYSTEM_RUNNING) { 6629 if (should_resched()) {
6603 local_bh_enable(); 6630 local_bh_enable();
6604 __cond_resched(); 6631 __cond_resched();
6605 local_bh_disable(); 6632 local_bh_disable();
@@ -7262,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7262static void calc_global_load_remove(struct rq *rq) 7289static void calc_global_load_remove(struct rq *rq)
7263{ 7290{
7264 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7291 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7292 rq->calc_load_active = 0;
7265} 7293}
7266#endif /* CONFIG_HOTPLUG_CPU */ 7294#endif /* CONFIG_HOTPLUG_CPU */
7267 7295
@@ -7488,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7488 task_rq_unlock(rq, &flags); 7516 task_rq_unlock(rq, &flags);
7489 get_task_struct(p); 7517 get_task_struct(p);
7490 cpu_rq(cpu)->migration_thread = p; 7518 cpu_rq(cpu)->migration_thread = p;
7519 rq->calc_load_update = calc_load_update;
7491 break; 7520 break;
7492 7521
7493 case CPU_ONLINE: 7522 case CPU_ONLINE:
@@ -7498,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7498 /* Update our root-domain */ 7527 /* Update our root-domain */
7499 rq = cpu_rq(cpu); 7528 rq = cpu_rq(cpu);
7500 spin_lock_irqsave(&rq->lock, flags); 7529 spin_lock_irqsave(&rq->lock, flags);
7501 rq->calc_load_update = calc_load_update;
7502 rq->calc_load_active = 0;
7503 if (rq->rd) { 7530 if (rq->rd) {
7504 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7531 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7505 7532
@@ -9070,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9070#ifdef CONFIG_SMP 9097#ifdef CONFIG_SMP
9071 rt_rq->rt_nr_migratory = 0; 9098 rt_rq->rt_nr_migratory = 0;
9072 rt_rq->overloaded = 0; 9099 rt_rq->overloaded = 0;
9073 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9100 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9074#endif 9101#endif
9075 9102
9076 rt_rq->rt_time = 0; 9103 rt_rq->rt_time = 0;
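The cond_resched changes above fold the repeated check into a single should_resched() helper and drop the system_state == SYSTEM_RUNNING test, so _cond_resched(), cond_resched_lock() and cond_resched_softirq() now agree on when rescheduling is worthwhile. A small userspace model of the predicate follows; need_resched() and preempt_count() are stand-in stubs here, only the shape of the check is taken from the hunk.

#include <stdio.h>

/* Userspace stand-ins for the kernel predicates (illustration only). */
#define PREEMPT_ACTIVE 0x10000000
static int fake_need_resched = 1;
static unsigned int fake_preempt_count;

static int need_resched(void)           { return fake_need_resched; }
static unsigned int preempt_count(void) { return fake_preempt_count; }

/* The consolidated predicate from the hunk above. */
static int should_resched(void)
{
	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
}

int main(void)
{
	printf("plain context:  %d\n", should_resched());	/* 1 */
	fake_preempt_count |= PREEMPT_ACTIVE;
	printf("PREEMPT_ACTIVE: %d\n", should_resched());	/* 0 */
	return 0;
}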
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6c251790dde..d014efbf947a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
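The cpupri comment above describes the race being closed: vec->mask can be emptied by a concurrent update between the first scan and the cpumask_and() into lowest_mask, so the result is re-checked and the priority level is skipped if nothing survived. The same check, copy, recheck shape in a small userspace model (plain bitmask instead of a cpumask; names are placeholders, and the recheck only matters under real concurrency):

#include <stdio.h>

typedef unsigned long mask_t;	/* toy stand-in for a cpumask */

/* Returns 1 and fills *lowest when the priority level is usable,
 * 0 when the level's mask went empty under us and must be skipped. */
static int pick_level(mask_t allowed, const volatile mask_t *vec_mask,
		      mask_t *lowest)
{
	if (!(allowed & *vec_mask))	/* first read: nothing here  */
		return 0;

	*lowest = allowed & *vec_mask;	/* second read: may differ   */

	if (!*lowest)			/* emptied in between: skip  */
		return 0;

	return 1;
}

int main(void)
{
	mask_t vec = 0x2;
	mask_t lowest = 0;

	printf("usable=%d lowest=0x%lx\n", pick_level(0xf, &vec, &lowest),
	       lowest);
	return 0;
}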
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ba7fd6e9556f..652e8bdef9aa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 266 return min_vruntime;
267} 267}
268 268
269static inline int entity_before(struct sched_entity *a,
270 struct sched_entity *b)
271{
272 return (s64)(a->vruntime - b->vruntime) < 0;
273}
274
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 275static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 276{
271 return se->vruntime - cfs_rq->min_vruntime; 277 return se->vruntime - cfs_rq->min_vruntime;
@@ -605,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
605static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 611static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
606{ 612{
607#ifdef CONFIG_SCHEDSTATS 613#ifdef CONFIG_SCHEDSTATS
614 struct task_struct *tsk = NULL;
615
616 if (entity_is_task(se))
617 tsk = task_of(se);
618
608 if (se->sleep_start) { 619 if (se->sleep_start) {
609 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
610 struct task_struct *tsk = task_of(se);
611 621
612 if ((s64)delta < 0) 622 if ((s64)delta < 0)
613 delta = 0; 623 delta = 0;
@@ -618,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
618 se->sleep_start = 0; 628 se->sleep_start = 0;
619 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
620 630
621 account_scheduler_latency(tsk, delta >> 10, 1); 631 if (tsk)
632 account_scheduler_latency(tsk, delta >> 10, 1);
622 } 633 }
623 if (se->block_start) { 634 if (se->block_start) {
624 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
625 struct task_struct *tsk = task_of(se);
626 636
627 if ((s64)delta < 0) 637 if ((s64)delta < 0)
628 delta = 0; 638 delta = 0;
@@ -633,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 se->block_start = 0; 643 se->block_start = 0;
634 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
635 645
636 /* 646 if (tsk) {
637 * Blocking time is in units of nanosecs, so shift by 20 to 647 /*
638 * get a milliseconds-range estimation of the amount of 648 * Blocking time is in units of nanosecs, so shift by
639 * time that the task spent sleeping: 649 * 20 to get a milliseconds-range estimation of the
640 */ 650 * amount of time that the task spent sleeping:
641 if (unlikely(prof_on == SLEEP_PROFILING)) { 651 */
642 652 if (unlikely(prof_on == SLEEP_PROFILING)) {
643 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 653 profile_hits(SLEEP_PROFILING,
644 delta >> 20); 654 (void *)get_wchan(tsk),
655 delta >> 20);
656 }
657 account_scheduler_latency(tsk, delta >> 10, 0);
645 } 658 }
646 account_scheduler_latency(tsk, delta >> 10, 0);
647 } 659 }
648#endif 660#endif
649} 661}
@@ -687,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
687 * all of which have the same weight. 699 * all of which have the same weight.
688 */ 700 */
689 if (sched_feat(NORMALIZED_SLEEPER) && 701 if (sched_feat(NORMALIZED_SLEEPER) &&
690 task_of(se)->policy != SCHED_IDLE) 702 (!entity_is_task(se) ||
703 task_of(se)->policy != SCHED_IDLE))
691 thresh = calc_delta_fair(thresh, se); 704 thresh = calc_delta_fair(thresh, se);
692 705
693 vruntime -= thresh; 706 vruntime -= thresh;
@@ -1016,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
1016 /* 1029 /*
1017 * Already in the rightmost position? 1030 * Already in the rightmost position?
1018 */ 1031 */
1019 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1032 if (unlikely(!rightmost || entity_before(rightmost, se)))
1020 return; 1033 return;
1021 1034
1022 /* 1035 /*
@@ -1712,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1712 1725
1713 /* 'curr' will be NULL if the child belongs to a different group */ 1726 /* 'curr' will be NULL if the child belongs to a different group */
1714 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1727 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1715 curr && curr->vruntime < se->vruntime) { 1728 curr && entity_before(curr, se)) {
1716 /* 1729 /*
1717 * Upon rescheduling, sched_class::put_prev_task() will place 1730 * Upon rescheduling, sched_class::put_prev_task() will place
1718 * 'current' within the tree based on its new key value. 1731 * 'current' within the tree based on its new key value.
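entity_before() compares vruntimes through a signed 64-bit difference rather than a direct u64 less-than, which stays correct when vruntime wraps; the two call sites above (yield_task_fair and task_new_fair) previously used the raw comparison. A quick userspace check of why the signed-difference form is the right one near the wrap point:

#include <stdio.h>
#include <stdint.h>

/* Same comparison as entity_before(), on plain u64 values. */
static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	/* 'a' was set just before the counter wrapped, 'b' just after:
	 * logically a precedes b even though a > b numerically. */
	uint64_t a = UINT64_MAX - 5;
	uint64_t b = 10;

	printf("raw u64 compare: %d\n", a < b);		/* 0: wrong  */
	printf("entity_before:   %d\n", before(a, b));	/* 1: right  */
	return 0;
}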
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..3918e01994e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
10 10
11#ifdef CONFIG_RT_GROUP_SCHED 11#ifdef CONFIG_RT_GROUP_SCHED
12 12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 16{
15 return rt_rq->rq; 17 return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 24
23#else /* CONFIG_RT_GROUP_SCHED */ 25#else /* CONFIG_RT_GROUP_SCHED */
24 26
27#define rt_entity_is_task(rt_se) (1)
28
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 30{
27 return container_of(rt_rq, struct rq, rt); 31 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 77
74static void update_rt_migration(struct rt_rq *rt_rq) 78static void update_rt_migration(struct rt_rq *rt_rq)
75{ 79{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 80 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 81 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 82 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 83 rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 90
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 92{
93 if (!rt_entity_is_task(rt_se))
94 return;
95
96 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
97
98 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 99 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 100 rt_rq->rt_nr_migratory++;
91 101
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 104
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 106{
107 if (!rt_entity_is_task(rt_se))
108 return;
109
110 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
111
112 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 113 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 114 rt_rq->rt_nr_migratory--;
99 115
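The sched_rt changes track a new rt_nr_total counter, incremented only for entities that are actual tasks (rt_entity_is_task) and kept on the root rt_rq via rq_of_rt_rq(), and update_rt_migration() now declares overload when there are migratory entities and more than one runnable RT task in total. A compact model of that accounting (hypothetical structures, the rq overload flag handling simplified away):

#include <stdio.h>

struct root_rt { unsigned long nr_total, nr_migratory; int overloaded; };

struct rt_entity { int is_task; int nr_cpus_allowed; };

static void update_overload(struct root_rt *rt)
{
	rt->overloaded = rt->nr_migratory && rt->nr_total > 1;
}

static void enqueue(struct root_rt *rt, const struct rt_entity *se)
{
	if (!se->is_task)		/* group entities don't count */
		return;
	rt->nr_total++;
	if (se->nr_cpus_allowed > 1)
		rt->nr_migratory++;
	update_overload(rt);
}

int main(void)
{
	struct root_rt rt = { 0, 0, 0 };
	struct rt_entity group = { .is_task = 0, .nr_cpus_allowed = 4 };
	struct rt_entity t1    = { .is_task = 1, .nr_cpus_allowed = 4 };
	struct rt_entity t2    = { .is_task = 1, .nr_cpus_allowed = 1 };

	enqueue(&rt, &group);
	enqueue(&rt, &t1);
	printf("one task:  overloaded=%d\n", rt.overloaded);	/* 0 */
	enqueue(&rt, &t2);
	printf("two tasks: overloaded=%d\n", rt.overloaded);	/* 1 */
	return 0;
}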
diff --git a/kernel/signal.c b/kernel/signal.c
index ccf1ceedaebe..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2454 stack_t oss; 2454 stack_t oss;
2455 int error; 2455 int error;
2456 2456
2457 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2458 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2459 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2460 oss.ss_flags = sas_ss_flags(sp);
2461 }
2462 2460
2463 if (uss) { 2461 if (uss) {
2464 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2466 int ss_flags; 2464 int ss_flags;
2467 2465
2468 error = -EFAULT; 2466 error = -EFAULT;
2469 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2470 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2471 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2472 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2473 goto out; 2473 goto out;
2474 2474
2475 error = -EPERM; 2475 error = -EPERM;
@@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2501 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2502 } 2502 }
2503 2503
2504 error = 0;
2504 if (uoss) { 2505 if (uoss) {
2505 error = -EFAULT; 2506 error = -EFAULT;
2506 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2507 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2508 } 2512 }
2509 2513
2510 error = 0;
2511out: 2514out:
2512 return error; 2515 return error;
2513} 2516}
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..94188b8ecc33 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3a94905fa5d2..eb5e131a0485 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
345 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
346} 346}
347 347
348/* Tasklets */ 348/*
349 * Tasklets
350 */
349struct tasklet_head 351struct tasklet_head
350{ 352{
351 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
493 495
494EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
495 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
538 * @function: hrtimer callback function which gets called from softirq context 539 * @function: hrtimer callback function which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
496DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
497EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
498 560
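The new tasklet_hrtimer combo above lets an hrtimer callback that must run in softirq context be written as an ordinary hrtimer function: with high resolution timers active the trampoline defers it through a hi-tasklet, otherwise it is called directly. Below is a sketch of how a driver might use it, assuming only what the hunk shows, namely tasklet_hrtimer_init() and the embedded ttimer->timer, which is armed here with the stock hrtimer_start(); a dedicated start helper may also exist in the header. Module boilerplate is trimmed.

#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/kernel.h>

static struct tasklet_hrtimer poll_timer;

/* Runs in softirq context even when high resolution timers are active. */
static enum hrtimer_restart poll_fn(struct hrtimer *timer)
{
	printk(KERN_DEBUG "poll tick\n");

	/* Periodic use: push the expiry forward, then ask for a restart;
	 * the tasklet trampoline (or the hrtimer core in the non-hres
	 * case) restarts the timer at the forwarded expiry. */
	hrtimer_forward_now(timer, ktime_set(0, 10 * NSEC_PER_MSEC));
	return HRTIMER_RESTART;
}

/* Called from the driver's init path. */
static void poll_setup(void)
{
	tasklet_hrtimer_init(&poll_timer, poll_fn,
			     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_start(&poll_timer.timer, ktime_set(0, 10 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
}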
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62e4ff9968b5..98e02328c67d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = {
335 .data = &sysctl_timer_migration, 335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 336 .maxlen = sizeof(unsigned int),
337 .mode = 0644, 337 .mode = 0644,
338 .proc_handler = &proc_dointvec, 338 .proc_handler = &proc_dointvec_minmax,
339 .strategy = &sysctl_intvec,
340 .extra1 = &zero,
341 .extra2 = &one,
339 }, 342 },
340#endif 343#endif
341 { 344 {
@@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = {
744 .proc_handler = &proc_dointvec, 747 .proc_handler = &proc_dointvec,
745 }, 748 },
746 { 749 {
750 .ctl_name = CTL_UNNUMBERED,
751 .procname = "panic_on_io_nmi",
752 .data = &panic_on_io_nmi,
753 .maxlen = sizeof(int),
754 .mode = 0644,
755 .proc_handler = &proc_dointvec,
756 },
757 {
747 .ctl_name = KERN_BOOTLOADER_TYPE, 758 .ctl_name = KERN_BOOTLOADER_TYPE,
748 .procname = "bootloader_type", 759 .procname = "bootloader_type",
749 .data = &bootloader_type, 760 .data = &bootloader_type,
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..a6dcd67b041d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg)
254 spin_unlock(&clockevents_lock); 254 spin_unlock(&clockevents_lock);
255} 255}
256EXPORT_SYMBOL_GPL(clockevents_notify); 256EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 257#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 592bf584d1d2..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
513 * Check to make sure we don't switch to a non-highres capable 513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz) 514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */ 515 */
516 if (tick_oneshot_mode_active() && 516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { 517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. " 518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name); 519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..a7f07d5a6241 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
@@ -712,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
712 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
713 * to be the same thing then just return: 715 * to be the same thing then just return:
714 */ 716 */
715 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
716 return 1; 718 return 1;
717 719
718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1551f47e7669..019f380fd764 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -226,13 +226,13 @@ config BOOT_TRACER
226 the timings of the initcalls and traces key events and the identity 226 the timings of the initcalls and traces key events and the identity
227 of tasks that can cause boot delays, such as context-switches. 227 of tasks that can cause boot delays, such as context-switches.
228 228
229 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 229 Its aim is to be parsed by the scripts/bootgraph.pl tool to
230 produce pretty graphics about boot inefficiencies, giving a visual 230 produce pretty graphics about boot inefficiencies, giving a visual
231 representation of the delays during initcalls - but the raw 231 representation of the delays during initcalls - but the raw
232 /debug/tracing/trace text output is readable too. 232 /debug/tracing/trace text output is readable too.
233 233
234 You must pass in ftrace=initcall to the kernel command line 234 You must pass in initcall_debug and ftrace=initcall to the kernel
235 to enable this on bootup. 235 command line to enable this on bootup.
236 236
237config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
238 bool 238 bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..1090b0aed9ba 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3718d55fb4c3..1e1d23c26308 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -766,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
766 .stat_show = function_stat_show 768 .stat_show = function_stat_show
767}; 769};
768 770
769static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
770{ 772{
771 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
772 struct dentry *entry; 774 struct dentry *entry;
@@ -784,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
784 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
785 * we still do not free memory. 787 * we still do not free memory.
786 */ 788 */
787 kfree(stat);
788 WARN(1, 789 WARN(1,
789 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
790 cpu); 791 cpu);
@@ -811,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
811} 812}
812 813
813#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
815{ 816{
816} 817}
817#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1417,10 +1418,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417{ 1418{
1418 struct ftrace_iterator *iter = m->private; 1419 struct ftrace_iterator *iter = m->private;
1419 void *p = NULL; 1420 void *p = NULL;
1421 loff_t l;
1422
1423 if (!(iter->flags & FTRACE_ITER_HASH))
1424 *pos = 0;
1420 1425
1421 iter->flags |= FTRACE_ITER_HASH; 1426 iter->flags |= FTRACE_ITER_HASH;
1422 1427
1423 return t_hash_next(m, p, pos); 1428 iter->hidx = 0;
1429 for (l = 0; l <= *pos; ) {
1430 p = t_hash_next(m, p, &l);
1431 if (!p)
1432 break;
1433 }
1434 return p;
1424} 1435}
1425 1436
1426static int t_hash_show(struct seq_file *m, void *v) 1437static int t_hash_show(struct seq_file *m, void *v)
@@ -1467,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 iter->pg = iter->pg->next; 1478 iter->pg = iter->pg->next;
1468 iter->idx = 0; 1479 iter->idx = 0;
1469 goto retry; 1480 goto retry;
1470 } else {
1471 iter->idx = -1;
1472 } 1481 }
1473 } else { 1482 } else {
1474 rec = &iter->pg->records[iter->idx++]; 1483 rec = &iter->pg->records[iter->idx++];
@@ -1497,6 +1506,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1497{ 1506{
1498 struct ftrace_iterator *iter = m->private; 1507 struct ftrace_iterator *iter = m->private;
1499 void *p = NULL; 1508 void *p = NULL;
1509 loff_t l;
1500 1510
1501 mutex_lock(&ftrace_lock); 1511 mutex_lock(&ftrace_lock);
1502 /* 1512 /*
@@ -1508,23 +1518,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1508 if (*pos > 0) 1518 if (*pos > 0)
1509 return t_hash_start(m, pos); 1519 return t_hash_start(m, pos);
1510 iter->flags |= FTRACE_ITER_PRINTALL; 1520 iter->flags |= FTRACE_ITER_PRINTALL;
1511 (*pos)++;
1512 return iter; 1521 return iter;
1513 } 1522 }
1514 1523
1515 if (iter->flags & FTRACE_ITER_HASH) 1524 if (iter->flags & FTRACE_ITER_HASH)
1516 return t_hash_start(m, pos); 1525 return t_hash_start(m, pos);
1517 1526
1518 if (*pos > 0) { 1527 iter->pg = ftrace_pages_start;
1519 if (iter->idx < 0) 1528 iter->idx = 0;
1520 return p; 1529 for (l = 0; l <= *pos; ) {
1521 (*pos)--; 1530 p = t_next(m, p, &l);
1522 iter->idx--; 1531 if (!p)
1532 break;
1523 } 1533 }
1524 1534
1525 p = t_next(m, p, pos); 1535 if (!p && iter->flags & FTRACE_ITER_FILTER)
1526
1527 if (!p)
1528 return t_hash_start(m, pos); 1536 return t_hash_start(m, pos);
1529 1537
1530 return p; 1538 return p;
@@ -1654,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1654 1662
1655 mutex_lock(&ftrace_regex_lock); 1663 mutex_lock(&ftrace_regex_lock);
1656 if ((file->f_mode & FMODE_WRITE) && 1664 if ((file->f_mode & FMODE_WRITE) &&
1657 !(file->f_flags & O_APPEND)) 1665 (file->f_flags & O_TRUNC))
1658 ftrace_filter_reset(enable); 1666 ftrace_filter_reset(enable);
1659 1667
1660 if (file->f_mode & FMODE_READ) { 1668 if (file->f_mode & FMODE_READ) {
@@ -2500,32 +2508,31 @@ int ftrace_graph_count;
2500unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2508unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2501 2509
2502static void * 2510static void *
2503g_next(struct seq_file *m, void *v, loff_t *pos) 2511__g_next(struct seq_file *m, loff_t *pos)
2504{ 2512{
2505 unsigned long *array = m->private; 2513 unsigned long *array = m->private;
2506 int index = *pos;
2507
2508 (*pos)++;
2509 2514
2510 if (index >= ftrace_graph_count) 2515 if (*pos >= ftrace_graph_count)
2511 return NULL; 2516 return NULL;
2517 return &array[*pos];
2518}
2512 2519
2513 return &array[index]; 2520static void *
2521g_next(struct seq_file *m, void *v, loff_t *pos)
2522{
2523 (*pos)++;
2524 return __g_next(m, pos);
2514} 2525}
2515 2526
2516static void *g_start(struct seq_file *m, loff_t *pos) 2527static void *g_start(struct seq_file *m, loff_t *pos)
2517{ 2528{
2518 void *p = NULL;
2519
2520 mutex_lock(&graph_lock); 2529 mutex_lock(&graph_lock);
2521 2530
2522 /* Nothing, tell g_show to print all functions are enabled */ 2531 /* Nothing, tell g_show to print all functions are enabled */
2523 if (!ftrace_graph_count && !*pos) 2532 if (!ftrace_graph_count && !*pos)
2524 return (void *)1; 2533 return (void *)1;
2525 2534
2526 p = g_next(m, p, pos); 2535 return __g_next(m, pos);
2527
2528 return p;
2529} 2536}
2530 2537
2531static void g_stop(struct seq_file *m, void *p) 2538static void g_stop(struct seq_file *m, void *p)
@@ -2570,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2570 2577
2571 mutex_lock(&graph_lock); 2578 mutex_lock(&graph_lock);
2572 if ((file->f_mode & FMODE_WRITE) && 2579 if ((file->f_mode & FMODE_WRITE) &&
2573 !(file->f_flags & O_APPEND)) { 2580 (file->f_flags & O_TRUNC)) {
2574 ftrace_graph_count = 0; 2581 ftrace_graph_count = 0;
2575 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2582 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2576 } 2583 }
@@ -2589,6 +2596,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2589} 2596}
2590 2597
2591static int 2598static int
2599ftrace_graph_release(struct inode *inode, struct file *file)
2600{
2601 if (file->f_mode & FMODE_READ)
2602 seq_release(inode, file);
2603 return 0;
2604}
2605
2606static int
2592ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2607ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2593{ 2608{
2594 struct dyn_ftrace *rec; 2609 struct dyn_ftrace *rec;
@@ -2717,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2717} 2732}
2718 2733
2719static const struct file_operations ftrace_graph_fops = { 2734static const struct file_operations ftrace_graph_fops = {
2720 .open = ftrace_graph_open, 2735 .open = ftrace_graph_open,
2721 .read = seq_read, 2736 .read = seq_read,
2722 .write = ftrace_graph_write, 2737 .write = ftrace_graph_write,
2738 .release = ftrace_graph_release,
2723}; 2739};
2724#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2740#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2725 2741
@@ -3152,10 +3168,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3152 3168
3153 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3169 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3154 3170
3155 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3171 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3156 goto out; 3172 goto out;
3157 3173
3158 last_ftrace_enabled = ftrace_enabled; 3174 last_ftrace_enabled = !!ftrace_enabled;
3159 3175
3160 if (ftrace_enabled) { 3176 if (ftrace_enabled) {
3161 3177
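Most of the seq_file churn above (and the matching hunks in trace.c, trace_events.c, trace_printk.c and trace_stat.c below) follows one pattern: ->start() must be able to rebuild the cursor for any *pos from scratch, because seq_read() may call it again after a partial read, so the iterators now walk forward from zero instead of trusting state cached in m->private, and no longer bump *pos inside ->start(). A self-contained sketch of that pattern over a static table, wired up through seq_open() in the usual way (the demo_* names are invented; only the seq_file API is real):

#include <linux/kernel.h>
#include <linux/seq_file.h>

static const char *demo_items[] = { "alpha", "beta", "gamma" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	/* Recompute the cursor purely from *pos; nothing survives between reads. */
	if (*pos >= ARRAY_SIZE(demo_items))
		return NULL;
	return &demo_items[*pos];
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return demo_start(m, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char **)v);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};
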
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 04dac2638258..bf27bb7a63e2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1563,6 +1563,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1563 return NULL; 1563 return NULL;
1564} 1564}
1565 1565
1566#ifdef CONFIG_TRACING
1567
1566#define TRACE_RECURSIVE_DEPTH 16 1568#define TRACE_RECURSIVE_DEPTH 16
1567 1569
1568static int trace_recursive_lock(void) 1570static int trace_recursive_lock(void)
@@ -1593,6 +1595,13 @@ static void trace_recursive_unlock(void)
1593 current->trace_recursion--; 1595 current->trace_recursion--;
1594} 1596}
1595 1597
1598#else
1599
1600#define trace_recursive_lock() (0)
1601#define trace_recursive_unlock() do { } while (0)
1602
1603#endif
1604
1596static DEFINE_PER_CPU(int, rb_need_resched); 1605static DEFINE_PER_CPU(int, rb_need_resched);
1597 1606
1598/** 1607/**
@@ -3104,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3104} 3113}
3105EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3114EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3106 3115
3116#ifdef CONFIG_TRACING
3107static ssize_t 3117static ssize_t
3108rb_simple_read(struct file *filp, char __user *ubuf, 3118rb_simple_read(struct file *filp, char __user *ubuf,
3109 size_t cnt, loff_t *ppos) 3119 size_t cnt, loff_t *ppos)
@@ -3171,6 +3181,7 @@ static __init int rb_init_debugfs(void)
3171} 3181}
3172 3182
3173fs_initcall(rb_init_debugfs); 3183fs_initcall(rb_init_debugfs);
3184#endif
3174 3185
3175#ifdef CONFIG_HOTPLUG_CPU 3186#ifdef CONFIG_HOTPLUG_CPU
3176static int rb_cpu_notify(struct notifier_block *self, 3187static int rb_cpu_notify(struct notifier_block *self,
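The ring_buffer.c change is the usual config-stub idiom: the real helpers are compiled only under CONFIG_TRACING, and the #else branch supplies do-nothing fallbacks so rb_reserve_next_event() can call trace_recursive_lock()/unlock() unconditionally. The hunk uses do { } while (0) macros for its stubs; static inlines, as in the trace.h hunk further below, achieve the same with type checking. A generic sketch of the idiom (CONFIG_DEMO_FEATURE and the demo_* names are placeholders):

#ifdef CONFIG_DEMO_FEATURE

/* real implementations live in a file built only when the option is set */
extern int demo_feature_enter(void);
extern void demo_feature_exit(void);

#else

/* Stubs keep every call site free of #ifdef clutter when the feature is off. */
static inline int demo_feature_enter(void)
{
	return 0;
}

static inline void demo_feature_exit(void)
{
}

#endif /* CONFIG_DEMO_FEATURE */
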
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 076fa6f0ee48..8930e39b9d8c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -284,13 +285,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
285{ 286{
286 unsigned long buf_size; 287 unsigned long buf_size;
287 int ret;
288 288
289 if (!str) 289 if (!str)
290 return 0; 290 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 291 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 292 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 293 if (buf_size == 0)
294 return 0; 294 return 0;
295 trace_buf_size = buf_size; 295 trace_buf_size = buf_size;
296 return 1; 296 return 1;
@@ -2031,7 +2031,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2031 2031
2032 /* If this file was open for write, then erase contents */ 2032 /* If this file was open for write, then erase contents */
2033 if ((file->f_mode & FMODE_WRITE) && 2033 if ((file->f_mode & FMODE_WRITE) &&
2034 !(file->f_flags & O_APPEND)) { 2034 (file->f_flags & O_TRUNC)) {
2035 long cpu = (long) inode->i_private; 2035 long cpu = (long) inode->i_private;
2036 2036
2037 if (cpu == TRACE_PIPE_ALL_CPU) 2037 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2053,25 +2053,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2053static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2054t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2055{
2056 struct tracer *t = m->private; 2056 struct tracer *t = v;
2057 2057
2058 (*pos)++; 2058 (*pos)++;
2059 2059
2060 if (t) 2060 if (t)
2061 t = t->next; 2061 t = t->next;
2062 2062
2063 m->private = t;
2064
2065 return t; 2063 return t;
2066} 2064}
2067 2065
2068static void *t_start(struct seq_file *m, loff_t *pos) 2066static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2067{
2070 struct tracer *t = m->private; 2068 struct tracer *t;
2071 loff_t l = 0; 2069 loff_t l = 0;
2072 2070
2073 mutex_lock(&trace_types_lock); 2071 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2072 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2073 ;
2076 2074
2077 return t; 2075 return t;
@@ -2107,18 +2105,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2105
2108static int show_traces_open(struct inode *inode, struct file *file) 2106static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2107{
2110 int ret;
2111
2112 if (tracing_disabled) 2108 if (tracing_disabled)
2113 return -ENODEV; 2109 return -ENODEV;
2114 2110
2115 ret = seq_open(file, &show_traces_seq_ops); 2111 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2112}
2123 2113
2124static ssize_t 2114static ssize_t
@@ -3095,7 +3085,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3095 break; 3085 break;
3096 } 3086 }
3097 3087
3098 trace_consume(iter); 3088 if (ret != TRACE_TYPE_NO_CONSUME)
3089 trace_consume(iter);
3099 rem -= count; 3090 rem -= count;
3100 if (!find_next_entry_inc(iter)) { 3091 if (!find_next_entry_inc(iter)) {
3101 rem = 0; 3092 rem = 0;
@@ -4243,8 +4234,11 @@ static void __ftrace_dump(bool disable_tracing)
4243 iter.pos = -1; 4234 iter.pos = -1;
4244 4235
4245 if (find_next_entry_inc(&iter) != NULL) { 4236 if (find_next_entry_inc(&iter) != NULL) {
4246 print_trace_line(&iter); 4237 int ret;
4247 trace_consume(&iter); 4238
4239 ret = print_trace_line(&iter);
4240 if (ret != TRACE_TYPE_NO_CONSUME)
4241 trace_consume(&iter);
4248 } 4242 }
4249 4243
4250 trace_printk_seq(&iter.seq); 4244 trace_printk_seq(&iter.seq);
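Switching set_buf_size() from strict_strtoul() to memparse() lets the trace_buf_size= boot parameter accept size suffixes such as 2m or 512k, not just raw byte counts. A hedged sketch of an equivalent __setup handler (the demo_bytes= parameter and variable are invented; memparse() is the real lib/cmdline.c helper):

#include <linux/kernel.h>
#include <linux/init.h>

static unsigned long demo_bytes = 16384;	/* default size in bytes */

static int __init set_demo_bytes(char *str)
{
	unsigned long size;

	if (!str)
		return 0;
	/* memparse() understands plain numbers plus K/M/G (and k/m/g) suffixes. */
	size = memparse(str, &str);
	if (size == 0)
		return 0;
	demo_bytes = size;
	return 1;
}
__setup("demo_bytes=", set_demo_bytes);
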
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..3548ae5cc780 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter)
597 597
598extern struct pid *ftrace_pid_trace; 598extern struct pid *ftrace_pid_trace;
599 599
600#ifdef CONFIG_FUNCTION_TRACER
600static inline int ftrace_trace_task(struct task_struct *task) 601static inline int ftrace_trace_task(struct task_struct *task)
601{ 602{
602 if (!ftrace_pid_trace) 603 if (!ftrace_pid_trace)
@@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
604 605
605 return test_tsk_trace_trace(task); 606 return test_tsk_trace_trace(task);
606} 607}
608#else
609static inline int ftrace_trace_task(struct task_struct *task)
610{
611 return 1;
612}
613#endif
607 614
608/* 615/*
609 * trace_iterator_flags is an enumeration that defines bit 616 * trace_iterator_flags is an enumeration that defines bit
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecfe..11ba5bb4ed0a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id)
14 14
15 mutex_lock(&event_mutex); 15 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 16 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id) { 17 if (event->id == event_id && event->profile_enable) {
18 ret = event->profile_enable(event); 18 ret = event->profile_enable(event);
19 break; 19 break;
20 } 20 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5e32e375134d..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..e75276a49cf5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 300
301static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
302{ 302{
303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
303 mutex_lock(&event_mutex); 306 mutex_lock(&event_mutex);
304 if (*pos == 0) 307
305 m->private = ftrace_events.next; 308 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
307} 315}
308 316
309static void * 317static void *
@@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 340
333static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
334{ 342{
343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
335 mutex_lock(&event_mutex); 346 mutex_lock(&event_mutex);
336 if (*pos == 0) 347
337 m->private = ftrace_events.next; 348 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
339} 355}
340 356
341static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
@@ -360,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
360 const struct seq_operations *seq_ops; 376 const struct seq_operations *seq_ops;
361 377
362 if ((file->f_mode & FMODE_WRITE) && 378 if ((file->f_mode & FMODE_WRITE) &&
363 !(file->f_flags & O_APPEND)) 379 (file->f_flags & O_TRUNC))
364 ftrace_clear_events(); 380 ftrace_clear_events();
365 381
366 seq_ops = inode->i_private; 382 seq_ops = inode->i_private;
@@ -924,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
924 entry = trace_create_file("enable", 0644, call->dir, call, 940 entry = trace_create_file("enable", 0644, call->dir, call,
925 enable); 941 enable);
926 942
927 if (call->id) 943 if (call->id && call->profile_enable)
928 entry = trace_create_file("id", 0444, call->dir, call, 944 entry = trace_create_file("id", 0444, call->dir, call,
929 id); 945 id);
930 946
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 90f134764837..75ef000613c3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -302,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
302 if (count == -1) 302 if (count == -1)
303 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
304 else 304 else
305 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
306 seq_putc(m, '\n');
307 306
308 return 0; 307 return 0;
309} 308}
@@ -364,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
364 out_reg: 363 out_reg:
365 ret = register_ftrace_function_probe(glob, ops, count); 364 ret = register_ftrace_function_probe(glob, ops, count);
366 365
367 return ret; 366 return ret < 0 ? ret : 0;
368} 367}
369 368
370static struct ftrace_func_command ftrace_traceon_cmd = { 369static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d2249abafb53..420ec3487579 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
843 843
844 switch (entry->type) { 844 switch (entry->type) {
845 case TRACE_GRAPH_ENT: { 845 case TRACE_GRAPH_ENT: {
846 struct ftrace_graph_ent_entry *field; 846 /*
847 * print_graph_entry() may consume the current event,
848 * thus @field may become invalid, so we need to save it.
849 * sizeof(struct ftrace_graph_ent_entry) is very small,
850 * it can be safely saved at the stack.
851 */
852 struct ftrace_graph_ent_entry *field, saved;
847 trace_assign_type(field, entry); 853 trace_assign_type(field, entry);
848 return print_graph_entry(field, s, iter); 854 saved = *field;
855 return print_graph_entry(&saved, s, iter);
849 } 856 }
850 case TRACE_GRAPH_RET: { 857 case TRACE_GRAPH_RET: {
851 struct ftrace_graph_ret_entry *field; 858 struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
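trace_print_seq() now hands seq_write() an explicit byte count instead of NUL-terminating the buffer for seq_puts(), so it no longer scribbles a terminator into the trace_seq buffer and no longer stops early at an embedded NUL. A small show() sketch in the same length-counted style (the demo_* names are invented; seq_write() is the real helper):

#include <linux/seq_file.h>

struct demo_buf {
	char	data[128];
	int	len;		/* bytes of valid data; not NUL-terminated */
};

static int demo_show(struct seq_file *m, void *v)
{
	struct demo_buf *b = v;

	/* Emit exactly b->len bytes; no terminator is required or written. */
	seq_write(m, b->data, b->len);
	return 0;
}
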
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
182 const char *str = *fmt; 176 const char *str = *fmt;
183 int i; 177 int i;
184 178
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
186 180
187 /* 181 /*
188 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..6a2a9d484cd6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 301
302static int stack_trace_open(struct inode *inode, struct file *file) 302static int stack_trace_open(struct inode *inode, struct file *file)
303{ 303{
304 int ret; 304 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 305}
310 306
311static const struct file_operations stack_trace_fops = { 307static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 308 .open = stack_trace_open,
313 .read = seq_read, 309 .read = seq_read,
314 .llseek = seq_lseek, 310 .llseek = seq_lseek,
311 .release = seq_release,
315}; 312};
316 313
317int 314int
@@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 323 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 324
328 if (ret || !write || 325 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 326 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 327 goto out;
331 328
332 last_stack_tracer_enabled = stack_tracer_enabled; 329 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 330
334 if (stack_tracer_enabled) 331 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 332 register_ftrace_function(&trace_ops);
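This hunk and the ftrace_enable_sysctl() change above share the same !! idiom: the sysctl integer may be written as any value, so it is collapsed to 0/1 before being compared with the cached state; otherwise writing 2 after 1 would look like a transition and re-run the register/unregister path. A tiny sketch of the idiom (the demo_* names are invented):

static int demo_enabled;	/* raw value written through a sysctl handler */
static int last_demo_enabled;	/* last committed 0/1 state */

static void demo_start(void) { }	/* e.g. register_ftrace_function() */
static void demo_stop(void) { }		/* e.g. unregister_ftrace_function() */

static void demo_commit(void)
{
	/* Normalise before comparing: 1, 2, 42 ... all mean "on". */
	if (last_demo_enabled == !!demo_enabled)
		return;			/* no real on/off transition */

	last_demo_enabled = !!demo_enabled;

	if (last_demo_enabled)
		demo_start();
	else
		demo_stop();
}
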
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..aea321c82fa0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node)
73 } 73 }
74} 74}
75 75
76static void reset_stat_session(struct stat_session *session) 76static void __reset_stat_session(struct stat_session *session)
77{ 77{
78 struct rb_node *node = session->stat_root.rb_node; 78 struct rb_node *node = session->stat_root.rb_node;
79 79
@@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session)
83 session->stat_root = RB_ROOT; 83 session->stat_root = RB_ROOT;
84} 84}
85 85
86static void reset_stat_session(struct stat_session *session)
87{
88 mutex_lock(&session->stat_mutex);
89 __reset_stat_session(session);
90 mutex_unlock(&session->stat_mutex);
91}
92
86static void destroy_session(struct stat_session *session) 93static void destroy_session(struct stat_session *session)
87{ 94{
88 debugfs_remove(session->file); 95 debugfs_remove(session->file);
89 reset_stat_session(session); 96 __reset_stat_session(session);
90 mutex_destroy(&session->stat_mutex); 97 mutex_destroy(&session->stat_mutex);
91 kfree(session); 98 kfree(session);
92} 99}
@@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session)
150 int i; 157 int i;
151 158
152 mutex_lock(&session->stat_mutex); 159 mutex_lock(&session->stat_mutex);
153 reset_stat_session(session); 160 __reset_stat_session(session);
154 161
155 if (!ts->stat_cmp) 162 if (!ts->stat_cmp)
156 ts->stat_cmp = dummy_cmp; 163 ts->stat_cmp = dummy_cmp;
@@ -183,7 +190,7 @@ exit:
183 return ret; 190 return ret;
184 191
185exit_free_rbtree: 192exit_free_rbtree:
186 reset_stat_session(session); 193 __reset_stat_session(session);
187 mutex_unlock(&session->stat_mutex); 194 mutex_unlock(&session->stat_mutex);
188 return ret; 195 return ret;
189} 196}
@@ -199,17 +206,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
199 mutex_lock(&session->stat_mutex); 206 mutex_lock(&session->stat_mutex);
200 207
201 /* If we are in the beginning of the file, print the headers */ 208 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 209 if (!*pos && session->ts->stat_headers)
203 (*pos)++;
204 return SEQ_START_TOKEN; 210 return SEQ_START_TOKEN;
205 }
206 211
207 node = rb_first(&session->stat_root); 212 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 213 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node); 214 node = rb_next(node);
210 215
211 (*pos)++;
212
213 return node; 216 return node;
214} 217}
215 218
@@ -254,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = {
254static int tracing_stat_open(struct inode *inode, struct file *file) 257static int tracing_stat_open(struct inode *inode, struct file *file)
255{ 258{
256 int ret; 259 int ret;
257 260 struct seq_file *m;
258 struct stat_session *session = inode->i_private; 261 struct stat_session *session = inode->i_private;
259 262
263 ret = stat_seq_init(session);
264 if (ret)
265 return ret;
266
260 ret = seq_open(file, &trace_stat_seq_ops); 267 ret = seq_open(file, &trace_stat_seq_ops);
261 if (!ret) { 268 if (ret) {
262 struct seq_file *m = file->private_data; 269 reset_stat_session(session);
263 m->private = session; 270 return ret;
264 ret = stat_seq_init(session);
265 } 271 }
266 272
273 m = file->private_data;
274 m->private = session;
267 return ret; 275 return ret;
268} 276}
269 277
@@ -274,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
274{ 282{
275 struct stat_session *session = i->i_private; 283 struct stat_session *session = i->i_private;
276 284
277 mutex_lock(&session->stat_mutex);
278 reset_stat_session(session); 285 reset_stat_session(session);
279 mutex_unlock(&session->stat_mutex);
280 286
281 return 0; 287 return seq_release(i, f);
282} 288}
283 289
284static const struct file_operations tracing_stat_fops = { 290static const struct file_operations tracing_stat_fops = {
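tracing_stat_open() now builds its rbtree before calling seq_open(), unwinds it if seq_open() fails, and tracing_stat_release() pairs that with seq_release(), echoing the ftrace_graph_release() and stack_trace_fops .release additions earlier in the series. A skeleton of that open/release symmetry (all demo_* names are placeholders; demo_seq_ops stands in for a real seq_operations table):

#include <linux/fs.h>
#include <linux/seq_file.h>

extern const struct seq_operations demo_seq_ops;

/* hypothetical per-open state tied to the inode's private data */
static int demo_state_init(void *priv) { return 0; }
static void demo_state_free(void *priv) { }

static int demo_open(struct inode *inode, struct file *file)
{
	void *priv = inode->i_private;
	struct seq_file *m;
	int ret;

	ret = demo_state_init(priv);
	if (ret)
		return ret;

	ret = seq_open(file, &demo_seq_ops);
	if (ret) {
		demo_state_free(priv);		/* unwind on failure */
		return ret;
	}

	m = file->private_data;
	m->private = priv;
	return 0;
}

static int demo_release(struct inode *inode, struct file *file)
{
	demo_state_free(inode->i_private);
	return seq_release(inode, file);	/* pairs with seq_open() */
}

static const struct file_operations demo_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= demo_release,
};
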