author     Linus Torvalds <torvalds@linux-foundation.org>  2008-12-28 15:21:10 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-12-28 15:21:10 -0500
commit     b0f4b285d7ed174804658539129a834270f4829a (patch)
tree       be7f8dca58075aba2c6a137fcfd4d44c5c333efc /arch/x86/kernel
parent     be9c5ae4eeec2e85527e95647348b8ea4eb25128 (diff)
parent     5250d329e38cdf7580faeb9c53c17d3588d7d19c (diff)
Merge branch 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (241 commits)
  sched, trace: update trace_sched_wakeup()
  tracing/ftrace: don't trace on early stage of a secondary cpu boot, v3
  Revert "x86: disable X86_PTRACE_BTS"
  ring-buffer: prevent false positive warning
  ring-buffer: fix dangling commit race
  ftrace: enable format arguments checking
  x86, bts: memory accounting
  x86, bts: add fork and exit handling
  ftrace: introduce tracing_reset_online_cpus() helper
  tracing: fix warnings in kernel/trace/trace_sched_switch.c
  tracing: fix warning in kernel/trace/trace.c
  tracing/ring-buffer: remove unused ring_buffer size
  trace: fix task state printout
  ftrace: add not to regex on filtering functions
  trace: better use of stack_trace_enabled for boot up code
  trace: add a way to enable or disable the stack tracer
  x86: entry_64 - introduce FTRACE_ frame macro v2
  tracing/ftrace: add the printk-msg-only option
  tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp()
  x86, bts: correctly report invalid bts records
  ...

Fixed up trivial conflict in scripts/recordmcount.pl due to SH bits
being already partly merged by the SH merge.
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                    |    1
-rw-r--r--  arch/x86/kernel/apic.c                      |    3
-rw-r--r--  arch/x86/kernel/cpu/Makefile                |    5
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c  |    4
-rw-r--r--  arch/x86/kernel/cpu/intel.c                 |    4
-rw-r--r--  arch/x86/kernel/ds.c                        | 1138
-rw-r--r--  arch/x86/kernel/dumpstack.c                 |   34
-rw-r--r--  arch/x86/kernel/dumpstack.h                 |    2
-rw-r--r--  arch/x86/kernel/dumpstack_32.c              |    5
-rw-r--r--  arch/x86/kernel/dumpstack_64.c              |    7
-rw-r--r--  arch/x86/kernel/entry_32.S                  |   51
-rw-r--r--  arch/x86/kernel/entry_64.S                  |   98
-rw-r--r--  arch/x86/kernel/ftrace.c                    |  390
-rw-r--r--  arch/x86/kernel/irq_64.c                    |    3
-rw-r--r--  arch/x86/kernel/process.c                   |   16
-rw-r--r--  arch/x86/kernel/process_32.c                |   67
-rw-r--r--  arch/x86/kernel/process_64.c                |   58
-rw-r--r--  arch/x86/kernel/ptrace.c                    |  431
-rw-r--r--  arch/x86/kernel/smpboot.c                   |    2
-rw-r--r--  arch/x86/kernel/stacktrace.c                |   64
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S            |    1
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S            |    1
-rw-r--r--  arch/x86/kernel/vsyscall_64.c               |    3
23 files changed, 1465 insertions, 923 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 1f208aaee780..88dd768eab6d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -66,6 +66,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC) += io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 7397911f8478..b5229affb953 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/dmar.h>
+#include <linux/ftrace.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -790,7 +791,7 @@ static void local_apic_timer_interrupt(void)
  * [ if a single-CPU system runs an SMP kernel then we call the local
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a5c04e88777e..82db7f45e2de 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -2,6 +2,11 @@
 # Makefile for x86-compatible CPU details and quirks
 #
 
+# Don't trace early stages of a secondary CPU boot
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_common.o = -pg
+endif
+
 obj-y := intel_cacheinfo.o addon_cpuid_features.o
 obj-y += proc.o capflags.o powerflags.o common.o
 obj-y += vmware.o hypervisor.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467d..88ea02dcb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
+	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
+	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ccfd2047630c..8ea6929e974c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
 #include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/uaccess.h>
-#include <asm/ptrace.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
 
@@ -326,9 +325,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	set_cpu_cap(c, X86_FEATURE_P3);
 #endif
 
-	if (cpu_has_bts)
-		ptrace_bts_init_intel(c);
-
 	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
 		/*
 		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index d6938d9351cf..da91701a2348 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,14 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It does not do:
15 * - get_task_struct on all parameter tasks 14 * - security checking (is the caller allowed to trace the task)
16 * - current is allowed to trace parameter tasks 15 * - buffer allocation (memory accounting)
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,22 +27,69 @@
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/sched.h> 28#include <linux/sched.h>
30#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/kernel.h>
31 31
32 32
33/* 33/*
34 * The configuration for a particular DS hardware implementation. 34 * The configuration for a particular DS hardware implementation.
35 */ 35 */
36struct ds_configuration { 36struct ds_configuration {
37 /* the size of the DS structure in bytes */ 37 /* the name of the configuration */
38 unsigned char sizeof_ds; 38 const char *name;
39 /* the size of one pointer-typed field in the DS structure in bytes; 39 /* the size of one pointer-typed field in the DS structure and
40 this covers the first 8 fields related to buffer management. */ 40 in the BTS and PEBS buffers in bytes;
41 this covers the first 8 DS fields related to buffer management. */
41 unsigned char sizeof_field; 42 unsigned char sizeof_field;
42 /* the size of a BTS/PEBS record in bytes */ 43 /* the size of a BTS/PEBS record in bytes */
43 unsigned char sizeof_rec[2]; 44 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed
46 * by enum ds_feature */
47 unsigned long ctl[dsf_ctl_max];
44}; 48};
45static struct ds_configuration ds_cfg; 49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
46 50
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
52
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56
57#define BTS_CONTROL \
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
59 ds_cfg.ctl[dsf_bts_overflow])
60
61
62/*
63 * A BTS or PEBS tracer.
64 *
65 * This holds the configuration of the tracer and serves as a handle
66 * to identify tracers.
67 */
68struct ds_tracer {
69 /* the DS context (partially) owned by this tracer */
70 struct ds_context *context;
71 /* the buffer provided on ds_request() and its size in bytes */
72 void *buffer;
73 size_t size;
74};
75
76struct bts_tracer {
77 /* the common DS part */
78 struct ds_tracer ds;
79 /* the trace including the DS configuration */
80 struct bts_trace trace;
81 /* buffer overflow notification function */
82 bts_ovfl_callback_t ovfl;
83};
84
85struct pebs_tracer {
86 /* the common DS part */
87 struct ds_tracer ds;
88 /* the trace including the DS configuration */
89 struct pebs_trace trace;
90 /* buffer overflow notification function */
91 pebs_ovfl_callback_t ovfl;
92};
47 93
48/* 94/*
49 * Debug Store (DS) save area configuration (see Intel64 and IA32 95 * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -109,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
109 155
110 156
111/* 157/*
112 * Locking is done only for allocating BTS or PEBS resources and for 158 * Locking is done only for allocating BTS or PEBS resources.
113 * guarding context and buffer memory allocation.
114 *
115 * Most functions require the current task to own the ds context part
116 * they are going to access. All the locking is done when validating
117 * access to the context.
118 */ 159 */
119static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); 160static DEFINE_SPINLOCK(ds_lock);
120
121/*
122 * Validate that the current task is allowed to access the BTS/PEBS
123 * buffer of the parameter task.
124 *
125 * Returns 0, if access is granted; -Eerrno, otherwise.
126 */
127static inline int ds_validate_access(struct ds_context *context,
128 enum ds_qualifier qual)
129{
130 if (!context)
131 return -EPERM;
132
133 if (context->owner[qual] == current)
134 return 0;
135
136 return -EPERM;
137}
138 161
139 162
140/* 163/*
@@ -150,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context,
150 * >0 number of per-thread tracers 173 * >0 number of per-thread tracers
151 * <0 number of per-cpu tracers 174 * <0 number of per-cpu tracers
152 * 175 *
153 * The below functions to get and put tracers and to check the
154 * allocation type require the ds_lock to be held by the caller.
155 *
156 * Tracers essentially gives the number of ds contexts for a certain 176 * Tracers essentially gives the number of ds contexts for a certain
157 * type of allocation. 177 * type of allocation.
158 */ 178 */
159static long tracers; 179static atomic_t tracers = ATOMIC_INIT(0);
160 180
161static inline void get_tracer(struct task_struct *task) 181static inline void get_tracer(struct task_struct *task)
162{ 182{
163 tracers += (task ? 1 : -1); 183 if (task)
184 atomic_inc(&tracers);
185 else
186 atomic_dec(&tracers);
164} 187}
165 188
166static inline void put_tracer(struct task_struct *task) 189static inline void put_tracer(struct task_struct *task)
167{ 190{
168 tracers -= (task ? 1 : -1); 191 if (task)
192 atomic_dec(&tracers);
193 else
194 atomic_inc(&tracers);
169} 195}
170 196
171static inline int check_tracer(struct task_struct *task) 197static inline int check_tracer(struct task_struct *task)
172{ 198{
173 return (task ? (tracers >= 0) : (tracers <= 0)); 199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
174} 202}
175 203
176 204
@@ -183,99 +211,70 @@ static inline int check_tracer(struct task_struct *task)
183 * 211 *
184 * Contexts are use-counted. They are allocated on first access and 212 * Contexts are use-counted. They are allocated on first access and
185 * deallocated when the last user puts the context. 213 * deallocated when the last user puts the context.
186 *
187 * We distinguish between an allocating and a non-allocating get of a
188 * context:
189 * - the allocating get is used for requesting BTS/PEBS resources. It
190 * requires the caller to hold the global ds_lock.
191 * - the non-allocating get is used for all other cases. A
192 * non-existing context indicates an error. It acquires and releases
193 * the ds_lock itself for obtaining the context.
194 *
195 * A context and its DS configuration are allocated and deallocated
196 * together. A context always has a DS configuration of the
197 * appropriate size.
198 */ 214 */
199static DEFINE_PER_CPU(struct ds_context *, system_context); 215struct ds_context {
200 216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
201#define this_system_context per_cpu(system_context, smp_processor_id()) 217 unsigned char ds[MAX_SIZEOF_DS];
202 218 /* the owner of the BTS and PEBS configuration, respectively */
203/* 219 struct bts_tracer *bts_master;
204 * Returns the pointer to the parameter task's context or to the 220 struct pebs_tracer *pebs_master;
205 * system-wide context, if task is NULL. 221 /* use count */
206 * 222 unsigned long count;
207 * Increases the use count of the returned context, if not NULL. 223 /* a pointer to the context location inside the thread_struct
208 */ 224 * or the per_cpu context array */
209static inline struct ds_context *ds_get_context(struct task_struct *task) 225 struct ds_context **this;
210{ 226 /* a pointer to the task owning this context, or NULL, if the
211 struct ds_context *context; 227 * context is owned by a cpu */
212 unsigned long irq; 228 struct task_struct *task;
229};
213 230
214 spin_lock_irqsave(&ds_lock, irq); 231static DEFINE_PER_CPU(struct ds_context *, system_context_array);
215 232
216 context = (task ? task->thread.ds_ctx : this_system_context); 233#define system_context per_cpu(system_context_array, smp_processor_id())
217 if (context)
218 context->count++;
219 234
220 spin_unlock_irqrestore(&ds_lock, irq);
221
222 return context;
223}
224 235
225/* 236static inline struct ds_context *ds_get_context(struct task_struct *task)
226 * Same as ds_get_context, but allocates the context and it's DS
227 * structure, if necessary; returns NULL; if out of memory.
228 */
229static inline struct ds_context *ds_alloc_context(struct task_struct *task)
230{ 237{
231 struct ds_context **p_context = 238 struct ds_context **p_context =
232 (task ? &task->thread.ds_ctx : &this_system_context); 239 (task ? &task->thread.ds_ctx : &system_context);
233 struct ds_context *context = *p_context; 240 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL;
234 unsigned long irq; 242 unsigned long irq;
235 243
236 if (!context) { 244 /* Chances are small that we already have a context. */
237 context = kzalloc(sizeof(*context), GFP_KERNEL); 245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
238 if (!context) 246 if (!new_context)
239 return NULL; 247 return NULL;
240
241 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
242 if (!context->ds) {
243 kfree(context);
244 return NULL;
245 }
246 248
247 spin_lock_irqsave(&ds_lock, irq); 249 spin_lock_irqsave(&ds_lock, irq);
248 250
249 if (*p_context) { 251 context = *p_context;
250 kfree(context->ds); 252 if (!context) {
251 kfree(context); 253 context = new_context;
252 254
253 context = *p_context; 255 context->this = p_context;
254 } else { 256 context->task = task;
255 *p_context = context; 257 context->count = 0;
256 258
257 context->this = p_context; 259 if (task)
258 context->task = task; 260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
259 261
260 if (task) 262 if (!task || (task == current))
261 set_tsk_thread_flag(task, TIF_DS_AREA_MSR); 263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
262 264
263 if (!task || (task == current)) 265 *p_context = context;
264 wrmsrl(MSR_IA32_DS_AREA,
265 (unsigned long)context->ds);
266 }
267 spin_unlock_irqrestore(&ds_lock, irq);
268 } 266 }
269 267
270 context->count++; 268 context->count++;
271 269
270 spin_unlock_irqrestore(&ds_lock, irq);
271
272 if (context != new_context)
273 kfree(new_context);
274
272 return context; 275 return context;
273} 276}
274 277
275/*
276 * Decreases the use count of the parameter context, if not NULL.
277 * Deallocates the context, if the use count reaches zero.
278 */
279static inline void ds_put_context(struct ds_context *context) 278static inline void ds_put_context(struct ds_context *context)
280{ 279{
281 unsigned long irq; 280 unsigned long irq;
@@ -285,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context)
285 284
286 spin_lock_irqsave(&ds_lock, irq); 285 spin_lock_irqsave(&ds_lock, irq);
287 286
288 if (--context->count) 287 if (--context->count) {
289 goto out; 288 spin_unlock_irqrestore(&ds_lock, irq);
289 return;
290 }
290 291
291 *(context->this) = NULL; 292 *(context->this) = NULL;
292 293
@@ -296,135 +297,263 @@ static inline void ds_put_context(struct ds_context *context)
296 if (!context->task || (context->task == current)) 297 if (!context->task || (context->task == current))
297 wrmsrl(MSR_IA32_DS_AREA, 0); 298 wrmsrl(MSR_IA32_DS_AREA, 0);
298 299
299 put_tracer(context->task); 300 spin_unlock_irqrestore(&ds_lock, irq);
300 301
301 /* free any leftover buffers from tracers that did not
302 * deallocate them properly. */
303 kfree(context->buffer[ds_bts]);
304 kfree(context->buffer[ds_pebs]);
305 kfree(context->ds);
306 kfree(context); 302 kfree(context);
307 out:
308 spin_unlock_irqrestore(&ds_lock, irq);
309} 303}
310 304
311 305
312/* 306/*
313 * Handle a buffer overflow 307 * Call the tracer's callback on a buffer overflow.
314 * 308 *
315 * task: the task whose buffers are overflowing;
316 * NULL for a buffer overflow on the current cpu
317 * context: the ds context 309 * context: the ds context
318 * qual: the buffer type 310 * qual: the buffer type
319 */ 311 */
320static void ds_overflow(struct task_struct *task, struct ds_context *context, 312static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
321 enum ds_qualifier qual)
322{ 313{
323 if (!context) 314 switch (qual) {
324 return; 315 case ds_bts:
325 316 if (context->bts_master &&
326 if (context->callback[qual]) 317 context->bts_master->ovfl)
327 (*context->callback[qual])(task); 318 context->bts_master->ovfl(context->bts_master);
328 319 break;
329 /* todo: do some more overflow handling */ 320 case ds_pebs:
321 if (context->pebs_master &&
322 context->pebs_master->ovfl)
323 context->pebs_master->ovfl(context->pebs_master);
324 break;
325 }
330} 326}
331 327
332 328
333/* 329/*
334 * Allocate a non-pageable buffer of the parameter size. 330 * Write raw data into the BTS or PEBS buffer.
335 * Checks the memory and the locked memory rlimit.
336 * 331 *
337 * Returns the buffer, if successful; 332 * The remainder of any partially written record is zeroed out.
338 * NULL, if out of memory or rlimit exceeded.
339 * 333 *
340 * size: the requested buffer size in bytes 334 * context: the DS context
341 * pages (out): if not NULL, contains the number of pages reserved 335 * qual: the buffer type
336 * record: the data to write
337 * size: the size of the data
342 */ 338 */
343static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) 339static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size)
344{ 341{
345 unsigned long rlim, vm, pgsz; 342 int bytes_written = 0;
346 void *buffer;
347 343
348 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 344 if (!record)
345 return -EINVAL;
349 346
350 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 347 while (size) {
351 vm = current->mm->total_vm + pgsz; 348 unsigned long base, index, end, write_end, int_th;
352 if (rlim < vm) 349 unsigned long write_size, adj_write_size;
353 return NULL;
354 350
355 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 351 /*
356 vm = current->mm->locked_vm + pgsz; 352 * write as much as possible without producing an
357 if (rlim < vm) 353 * overflow interrupt.
358 return NULL; 354 *
355 * interrupt_threshold must either be
356 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum
358 *
359 * index points to a valid record.
360 */
361 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index);
363 end = ds_get(context->ds, qual, ds_absolute_maximum);
364 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
359 365
360 buffer = kzalloc(size, GFP_KERNEL); 366 write_end = min(end, int_th);
361 if (!buffer)
362 return NULL;
363 367
364 current->mm->total_vm += pgsz; 368 /* if we are already beyond the interrupt threshold,
365 current->mm->locked_vm += pgsz; 369 * we fill the entire buffer */
370 if (write_end <= index)
371 write_end = end;
366 372
367 if (pages) 373 if (write_end <= index)
368 *pages = pgsz; 374 break;
375
376 write_size = min((unsigned long) size, write_end - index);
377 memcpy((void *)index, record, write_size);
378
379 record = (const char *)record + write_size;
380 size -= write_size;
381 bytes_written += write_size;
382
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual];
369 385
370 return buffer; 386 /* zero out trailing bytes */
387 memset((char *)index + write_size, 0,
388 adj_write_size - write_size);
389 index += adj_write_size;
390
391 if (index >= end)
392 index = base;
393 ds_set(context->ds, qual, ds_index, index);
394
395 if (index >= int_th)
396 ds_overflow(context, qual);
397 }
398
399 return bytes_written;
371} 400}
372 401
373static int ds_request(struct task_struct *task, void *base, size_t size, 402
374 ds_ovfl_callback_t ovfl, enum ds_qualifier qual) 403/*
404 * Branch Trace Store (BTS) uses the following format. Different
405 * architectures vary in the size of those fields.
406 * - source linear address
407 * - destination linear address
408 * - flags
409 *
410 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode.
412 *
413 * We compute the base address for the first 8 fields based on:
414 * - the field size stored in the DS configuration
415 * - the relative field position
416 *
417 * In order to store additional information in the BTS buffer, we use
418 * a special source address to indicate that the record requires
419 * special interpretation.
420 *
421 * Netburst indicated via a bit in the flags field whether the branch
422 * was predicted; this is ignored.
423 *
424 * We use two levels of abstraction:
425 * - the raw data level defined here
426 * - an arch-independent level defined in ds.h
427 */
428
429enum bts_field {
430 bts_from,
431 bts_to,
432 bts_flags,
433
434 bts_qual = bts_from,
435 bts_jiffies = bts_to,
436 bts_pid = bts_flags,
437
438 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440};
441
442static inline unsigned long bts_get(const char *base, enum bts_field field)
375{ 443{
376 struct ds_context *context; 444 base += (ds_cfg.sizeof_field * field);
377 unsigned long buffer, adj; 445 return *(unsigned long *)base;
378 const unsigned long alignment = (1 << 3); 446}
379 unsigned long irq;
380 int error = 0;
381 447
382 if (!ds_cfg.sizeof_ds) 448static inline void bts_set(char *base, enum bts_field field, unsigned long val)
383 return -EOPNOTSUPP; 449{
450 base += (ds_cfg.sizeof_field * field);;
451 (*(unsigned long *)base) = val;
452}
384 453
385 /* we require some space to do alignment adjustments below */ 454
386 if (size < (alignment + ds_cfg.sizeof_rec[qual])) 455/*
456 * The raw BTS data is architecture dependent.
457 *
458 * For higher-level users, we give an arch-independent view.
459 * - ds.h defines struct bts_struct
460 * - bts_read translates one raw bts record into a bts_struct
461 * - bts_write translates one bts_struct into the raw format and
462 * writes it into the top of the parameter tracer's buffer.
463 *
464 * return: bytes read/written on success; -Eerrno, otherwise
465 */
466static int bts_read(struct bts_tracer *tracer, const void *at,
467 struct bts_struct *out)
468{
469 if (!tracer)
387 return -EINVAL; 470 return -EINVAL;
388 471
389 /* buffer overflow notification is not yet implemented */ 472 if (at < tracer->trace.ds.begin)
390 if (ovfl) 473 return -EINVAL;
391 return -EOPNOTSUPP;
392 474
475 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
476 return -EINVAL;
393 477
394 context = ds_alloc_context(task); 478 memset(out, 0, sizeof(*out));
395 if (!context) 479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
396 return -ENOMEM; 480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
482 out->variant.timestamp.pid = bts_get(at, bts_pid);
483 } else {
484 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from);
486 out->variant.lbr.to = bts_get(at, bts_to);
487
488 if (!out->variant.lbr.from && !out->variant.lbr.to)
489 out->qualifier = bts_invalid;
490 }
397 491
398 spin_lock_irqsave(&ds_lock, irq); 492 return ds_cfg.sizeof_rec[ds_bts];
493}
399 494
400 error = -EPERM; 495static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
401 if (!check_tracer(task)) 496{
402 goto out_unlock; 497 unsigned char raw[MAX_SIZEOF_BTS];
403 498
404 get_tracer(task); 499 if (!tracer)
500 return -EINVAL;
405 501
406 error = -EALREADY; 502 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
407 if (context->owner[qual] == current) 503 return -EOVERFLOW;
408 goto out_put_tracer;
409 error = -EPERM;
410 if (context->owner[qual] != NULL)
411 goto out_put_tracer;
412 context->owner[qual] = current;
413 504
414 spin_unlock_irqrestore(&ds_lock, irq); 505 switch (in->qualifier) {
506 case bts_invalid:
507 bts_set(raw, bts_from, 0);
508 bts_set(raw, bts_to, 0);
509 bts_set(raw, bts_flags, 0);
510 break;
511 case bts_branch:
512 bts_set(raw, bts_from, in->variant.lbr.from);
513 bts_set(raw, bts_to, in->variant.lbr.to);
514 bts_set(raw, bts_flags, 0);
515 break;
516 case bts_task_arrives:
517 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid);
521 break;
522 default:
523 return -EINVAL;
524 }
415 525
526 return ds_write(tracer->ds.context, ds_bts, raw,
527 ds_cfg.sizeof_rec[ds_bts]);
528}
416 529
417 error = -ENOMEM;
418 if (!base) {
419 base = ds_allocate_buffer(size, &context->pages[qual]);
420 if (!base)
421 goto out_release;
422 530
423 context->buffer[qual] = base; 531static void ds_write_config(struct ds_context *context,
424 } 532 struct ds_trace *cfg, enum ds_qualifier qual)
425 error = 0; 533{
534 unsigned char *ds = context->ds;
426 535
427 context->callback[qual] = ovfl; 536 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
537 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
538 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
539 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
540}
541
542static void ds_read_config(struct ds_context *context,
543 struct ds_trace *cfg, enum ds_qualifier qual)
544{
545 unsigned char *ds = context->ds;
546
547 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
548 cfg->top = (void *)ds_get(ds, qual, ds_index);
549 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
550 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
551}
552
553static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
554 void *base, size_t size, size_t ith,
555 unsigned int flags) {
556 unsigned long buffer, adj;
428 557
429 /* adjust the buffer address and size to meet alignment 558 /* adjust the buffer address and size to meet alignment
430 * constraints: 559 * constraints:
@@ -436,410 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size,
436 */ 565 */
437 buffer = (unsigned long)base; 566 buffer = (unsigned long)base;
438 567
439 adj = ALIGN(buffer, alignment) - buffer; 568 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
440 buffer += adj; 569 buffer += adj;
441 size -= adj; 570 size -= adj;
442 571
443 size /= ds_cfg.sizeof_rec[qual]; 572 trace->n = size / ds_cfg.sizeof_rec[qual];
444 size *= ds_cfg.sizeof_rec[qual]; 573 trace->size = ds_cfg.sizeof_rec[qual];
445
446 ds_set(context->ds, qual, ds_buffer_base, buffer);
447 ds_set(context->ds, qual, ds_index, buffer);
448 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
449
450 if (ovfl) {
451 /* todo: select a suitable interrupt threshold */
452 } else
453 ds_set(context->ds, qual,
454 ds_interrupt_threshold, buffer + size + 1);
455 574
456 /* we keep the context until ds_release */ 575 size = (trace->n * trace->size);
457 return error;
458
459 out_release:
460 context->owner[qual] = NULL;
461 ds_put_context(context);
462 put_tracer(task);
463 return error;
464 576
465 out_put_tracer: 577 trace->begin = (void *)buffer;
466 spin_unlock_irqrestore(&ds_lock, irq); 578 trace->top = trace->begin;
467 ds_put_context(context); 579 trace->end = (void *)(buffer + size);
468 put_tracer(task); 580 /* The value for 'no threshold' is -1, which will set the
469 return error; 581 * threshold outside of the buffer, just like we want it.
582 */
583 trace->ith = (void *)(buffer + size - ith);
470 584
471 out_unlock: 585 trace->flags = flags;
472 spin_unlock_irqrestore(&ds_lock, irq);
473 ds_put_context(context);
474 return error;
475} 586}
476 587
477int ds_request_bts(struct task_struct *task, void *base, size_t size,
478 ds_ovfl_callback_t ovfl)
479{
480 return ds_request(task, base, size, ovfl, ds_bts);
481}
482 588
483int ds_request_pebs(struct task_struct *task, void *base, size_t size, 589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
484 ds_ovfl_callback_t ovfl) 590 enum ds_qualifier qual, struct task_struct *task,
485{ 591 void *base, size_t size, size_t th, unsigned int flags)
486 return ds_request(task, base, size, ovfl, ds_pebs);
487}
488
489static int ds_release(struct task_struct *task, enum ds_qualifier qual)
490{ 592{
491 struct ds_context *context; 593 struct ds_context *context;
492 int error; 594 int error;
493 595
494 context = ds_get_context(task); 596 error = -EINVAL;
495 error = ds_validate_access(context, qual); 597 if (!base)
496 if (error < 0)
497 goto out; 598 goto out;
498 599
499 kfree(context->buffer[qual]); 600 /* we require some space to do alignment adjustments below */
500 context->buffer[qual] = NULL; 601 error = -EINVAL;
501 602 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
502 current->mm->total_vm -= context->pages[qual]; 603 goto out;
503 current->mm->locked_vm -= context->pages[qual];
504 context->pages[qual] = 0;
505 context->owner[qual] = NULL;
506
507 /*
508 * we put the context twice:
509 * once for the ds_get_context
510 * once for the corresponding ds_request
511 */
512 ds_put_context(context);
513 out:
514 ds_put_context(context);
515 return error;
516}
517 604
518int ds_release_bts(struct task_struct *task) 605 if (th != (size_t)-1) {
519{ 606 th *= ds_cfg.sizeof_rec[qual];
520 return ds_release(task, ds_bts);
521}
522 607
523int ds_release_pebs(struct task_struct *task) 608 error = -EINVAL;
524{ 609 if (size <= th)
525 return ds_release(task, ds_pebs); 610 goto out;
526} 611 }
527 612
528static int ds_get_index(struct task_struct *task, size_t *pos, 613 tracer->buffer = base;
529 enum ds_qualifier qual) 614 tracer->size = size;
530{
531 struct ds_context *context;
532 unsigned long base, index;
533 int error;
534 615
616 error = -ENOMEM;
535 context = ds_get_context(task); 617 context = ds_get_context(task);
536 error = ds_validate_access(context, qual); 618 if (!context)
537 if (error < 0)
538 goto out; 619 goto out;
620 tracer->context = context;
539 621
540 base = ds_get(context->ds, qual, ds_buffer_base); 622 ds_init_ds_trace(trace, qual, base, size, th, flags);
541 index = ds_get(context->ds, qual, ds_index);
542 623
543 error = ((index - base) / ds_cfg.sizeof_rec[qual]); 624 error = 0;
544 if (pos)
545 *pos = error;
546 out: 625 out:
547 ds_put_context(context);
548 return error; 626 return error;
549} 627}
550 628
551int ds_get_bts_index(struct task_struct *task, size_t *pos) 629struct bts_tracer *ds_request_bts(struct task_struct *task,
552{ 630 void *base, size_t size,
553 return ds_get_index(task, pos, ds_bts); 631 bts_ovfl_callback_t ovfl, size_t th,
554} 632 unsigned int flags)
555
556int ds_get_pebs_index(struct task_struct *task, size_t *pos)
557{
558 return ds_get_index(task, pos, ds_pebs);
559}
560
561static int ds_get_end(struct task_struct *task, size_t *pos,
562 enum ds_qualifier qual)
563{ 633{
564 struct ds_context *context; 634 struct bts_tracer *tracer;
565 unsigned long base, end; 635 unsigned long irq;
566 int error; 636 int error;
567 637
568 context = ds_get_context(task); 638 error = -EOPNOTSUPP;
569 error = ds_validate_access(context, qual); 639 if (!ds_cfg.ctl[dsf_bts])
570 if (error < 0)
571 goto out; 640 goto out;
572 641
573 base = ds_get(context->ds, qual, ds_buffer_base); 642 /* buffer overflow notification is not yet implemented */
574 end = ds_get(context->ds, qual, ds_absolute_maximum); 643 error = -EOPNOTSUPP;
644 if (ovfl)
645 goto out;
575 646
576 error = ((end - base) / ds_cfg.sizeof_rec[qual]); 647 error = -ENOMEM;
577 if (pos) 648 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
578 *pos = error; 649 if (!tracer)
579 out: 650 goto out;
580 ds_put_context(context); 651 tracer->ovfl = ovfl;
581 return error;
582}
583 652
584int ds_get_bts_end(struct task_struct *task, size_t *pos) 653 error = ds_request(&tracer->ds, &tracer->trace.ds,
585{ 654 ds_bts, task, base, size, th, flags);
586 return ds_get_end(task, pos, ds_bts); 655 if (error < 0)
587} 656 goto out_tracer;
588 657
589int ds_get_pebs_end(struct task_struct *task, size_t *pos)
590{
591 return ds_get_end(task, pos, ds_pebs);
592}
593 658
594static int ds_access(struct task_struct *task, size_t index, 659 spin_lock_irqsave(&ds_lock, irq);
595 const void **record, enum ds_qualifier qual)
596{
597 struct ds_context *context;
598 unsigned long base, idx;
599 int error;
600 660
601 if (!record) 661 error = -EPERM;
602 return -EINVAL; 662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
603 665
604 context = ds_get_context(task); 666 error = -EPERM;
605 error = ds_validate_access(context, qual); 667 if (tracer->ds.context->bts_master)
606 if (error < 0) 668 goto out_put_tracer;
607 goto out; 669 tracer->ds.context->bts_master = tracer;
608 670
609 base = ds_get(context->ds, qual, ds_buffer_base); 671 spin_unlock_irqrestore(&ds_lock, irq);
610 idx = base + (index * ds_cfg.sizeof_rec[qual]);
611 672
612 error = -EINVAL;
613 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
614 goto out;
615 673
616 *record = (const void *)idx; 674 tracer->trace.read = bts_read;
617 error = ds_cfg.sizeof_rec[qual]; 675 tracer->trace.write = bts_write;
618 out:
619 ds_put_context(context);
620 return error;
621}
622 676
623int ds_access_bts(struct task_struct *task, size_t index, const void **record) 677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
624{ 678 ds_resume_bts(tracer);
625 return ds_access(task, index, record, ds_bts);
626}
627 679
628int ds_access_pebs(struct task_struct *task, size_t index, const void **record) 680 return tracer;
629{ 681
630 return ds_access(task, index, record, ds_pebs); 682 out_put_tracer:
683 put_tracer(task);
684 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq);
686 ds_put_context(tracer->ds.context);
687 out_tracer:
688 kfree(tracer);
689 out:
690 return ERR_PTR(error);
631} 691}
632 692
633static int ds_write(struct task_struct *task, const void *record, size_t size, 693struct pebs_tracer *ds_request_pebs(struct task_struct *task,
634 enum ds_qualifier qual, int force) 694 void *base, size_t size,
695 pebs_ovfl_callback_t ovfl, size_t th,
696 unsigned int flags)
635{ 697{
636 struct ds_context *context; 698 struct pebs_tracer *tracer;
699 unsigned long irq;
637 int error; 700 int error;
638 701
639 if (!record) 702 /* buffer overflow notification is not yet implemented */
640 return -EINVAL; 703 error = -EOPNOTSUPP;
704 if (ovfl)
705 goto out;
641 706
642 error = -EPERM; 707 error = -ENOMEM;
643 context = ds_get_context(task); 708 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
644 if (!context) 709 if (!tracer)
645 goto out; 710 goto out;
711 tracer->ovfl = ovfl;
646 712
647 if (!force) { 713 error = ds_request(&tracer->ds, &tracer->trace.ds,
648 error = ds_validate_access(context, qual); 714 ds_pebs, task, base, size, th, flags);
649 if (error < 0) 715 if (error < 0)
650 goto out; 716 goto out_tracer;
651 }
652 717
653 error = 0; 718 spin_lock_irqsave(&ds_lock, irq);
654 while (size) {
655 unsigned long base, index, end, write_end, int_th;
656 unsigned long write_size, adj_write_size;
657 719
658 /* 720 error = -EPERM;
659 * write as much as possible without producing an 721 if (!check_tracer(task))
660 * overflow interrupt. 722 goto out_unlock;
661 * 723 get_tracer(task);
662 * interrupt_threshold must either be
663 * - bigger than absolute_maximum or
664 * - point to a record between buffer_base and absolute_maximum
665 *
666 * index points to a valid record.
667 */
668 base = ds_get(context->ds, qual, ds_buffer_base);
669 index = ds_get(context->ds, qual, ds_index);
670 end = ds_get(context->ds, qual, ds_absolute_maximum);
671 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
672 724
673 write_end = min(end, int_th); 725 error = -EPERM;
726 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer;
728 tracer->ds.context->pebs_master = tracer;
674 729
675 /* if we are already beyond the interrupt threshold, 730 spin_unlock_irqrestore(&ds_lock, irq);
676 * we fill the entire buffer */
677 if (write_end <= index)
678 write_end = end;
679 731
680 if (write_end <= index) 732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
681 goto out; 733 ds_resume_pebs(tracer);
682 734
683 write_size = min((unsigned long) size, write_end - index); 735 return tracer;
684 memcpy((void *)index, record, write_size);
685 736
686 record = (const char *)record + write_size; 737 out_put_tracer:
687 size -= write_size; 738 put_tracer(task);
688 error += write_size; 739 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq);
741 ds_put_context(tracer->ds.context);
742 out_tracer:
743 kfree(tracer);
744 out:
745 return ERR_PTR(error);
746}
689 747
690 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 748void ds_release_bts(struct bts_tracer *tracer)
691 adj_write_size *= ds_cfg.sizeof_rec[qual]; 749{
750 if (!tracer)
751 return;
692 752
693 /* zero out trailing bytes */ 753 ds_suspend_bts(tracer);
694 memset((char *)index + write_size, 0,
695 adj_write_size - write_size);
696 index += adj_write_size;
697 754
698 if (index >= end) 755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
699 index = base; 756 tracer->ds.context->bts_master = NULL;
700 ds_set(context->ds, qual, ds_index, index);
701 757
702 if (index >= int_th) 758 put_tracer(tracer->ds.context->task);
703 ds_overflow(task, context, qual); 759 ds_put_context(tracer->ds.context);
704 }
705 760
706 out: 761 kfree(tracer);
707 ds_put_context(context);
708 return error;
709} 762}
710 763
711int ds_write_bts(struct task_struct *task, const void *record, size_t size) 764void ds_suspend_bts(struct bts_tracer *tracer)
712{ 765{
713 return ds_write(task, record, size, ds_bts, /* force = */ 0); 766 struct task_struct *task;
714}
715 767
716int ds_write_pebs(struct task_struct *task, const void *record, size_t size) 768 if (!tracer)
717{ 769 return;
718 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
719}
720 770
721int ds_unchecked_write_bts(struct task_struct *task, 771 task = tracer->ds.context->task;
722 const void *record, size_t size)
723{
724 return ds_write(task, record, size, ds_bts, /* force = */ 1);
725}
726 772
727int ds_unchecked_write_pebs(struct task_struct *task, 773 if (!task || (task == current))
728 const void *record, size_t size) 774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
729{ 775
730 return ds_write(task, record, size, ds_pebs, /* force = */ 1); 776 if (task) {
777 task->thread.debugctlmsr &= ~BTS_CONTROL;
778
779 if (!task->thread.debugctlmsr)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
781 }
731} 782}
732 783
733static int ds_reset_or_clear(struct task_struct *task, 784void ds_resume_bts(struct bts_tracer *tracer)
734 enum ds_qualifier qual, int clear)
735{ 785{
736 struct ds_context *context; 786 struct task_struct *task;
737 unsigned long base, end; 787 unsigned long control;
738 int error;
739 788
740 context = ds_get_context(task); 789 if (!tracer)
741 error = ds_validate_access(context, qual); 790 return;
742 if (error < 0)
743 goto out;
744 791
745 base = ds_get(context->ds, qual, ds_buffer_base); 792 task = tracer->ds.context->task;
746 end = ds_get(context->ds, qual, ds_absolute_maximum);
747 793
748 if (clear) 794 control = ds_cfg.ctl[dsf_bts];
749 memset((void *)base, 0, end - base); 795 if (!(tracer->trace.ds.flags & BTS_KERNEL))
796 control |= ds_cfg.ctl[dsf_bts_kernel];
797 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user];
750 799
751 ds_set(context->ds, qual, ds_index, base); 800 if (task) {
801 task->thread.debugctlmsr |= control;
802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
803 }
752 804
753 error = 0; 805 if (!task || (task == current))
754 out: 806 update_debugctlmsr(get_debugctlmsr() | control);
755 ds_put_context(context);
756 return error;
757} 807}
758 808
759int ds_reset_bts(struct task_struct *task) 809void ds_release_pebs(struct pebs_tracer *tracer)
760{ 810{
761 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); 811 if (!tracer)
812 return;
813
814 ds_suspend_pebs(tracer);
815
816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
817 tracer->ds.context->pebs_master = NULL;
818
819 put_tracer(tracer->ds.context->task);
820 ds_put_context(tracer->ds.context);
821
822 kfree(tracer);
762} 823}
763 824
764int ds_reset_pebs(struct task_struct *task) 825void ds_suspend_pebs(struct pebs_tracer *tracer)
765{ 826{
766 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); 827
767} 828}
768 829
769int ds_clear_bts(struct task_struct *task) 830void ds_resume_pebs(struct pebs_tracer *tracer)
770{ 831{
771 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); 832
772} 833}
773 834
774int ds_clear_pebs(struct task_struct *task) 835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
775{ 836{
776 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); 837 if (!tracer)
838 return NULL;
839
840 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
841 return &tracer->trace;
777} 842}
778 843
779int ds_get_pebs_reset(struct task_struct *task, u64 *value) 844const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
780{ 845{
781 struct ds_context *context; 846 if (!tracer)
782 int error; 847 return NULL;
783 848
784 if (!value) 849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value =
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
852
853 return &tracer->trace;
854}
855
856int ds_reset_bts(struct bts_tracer *tracer)
857{
858 if (!tracer)
785 return -EINVAL; 859 return -EINVAL;
786 860
787 context = ds_get_context(task); 861 tracer->trace.ds.top = tracer->trace.ds.begin;
788 error = ds_validate_access(context, ds_pebs);
789 if (error < 0)
790 goto out;
791 862
792 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); 863 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
864 (unsigned long)tracer->trace.ds.top);
793 865
794 error = 0; 866 return 0;
795 out:
796 ds_put_context(context);
797 return error;
798} 867}
799 868
800int ds_set_pebs_reset(struct task_struct *task, u64 value) 869int ds_reset_pebs(struct pebs_tracer *tracer)
801{ 870{
802 struct ds_context *context; 871 if (!tracer)
803 int error; 872 return -EINVAL;
804 873
805 context = ds_get_context(task); 874 tracer->trace.ds.top = tracer->trace.ds.begin;
806 error = ds_validate_access(context, ds_pebs);
807 if (error < 0)
808 goto out;
809 875
810 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; 876 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
877 (unsigned long)tracer->trace.ds.top);
811 878
812 error = 0; 879 return 0;
813 out: 880}
814 ds_put_context(context); 881
815 return error; 882int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
883{
884 if (!tracer)
885 return -EINVAL;
886
887 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
888
889 return 0;
816} 890}
817 891
818static const struct ds_configuration ds_cfg_var = { 892static const struct ds_configuration ds_cfg_netburst = {
819 .sizeof_ds = sizeof(long) * 12, 893 .name = "netburst",
820 .sizeof_field = sizeof(long), 894 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
821 .sizeof_rec[ds_bts] = sizeof(long) * 3, 895 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6),
897
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
822#ifdef __i386__ 900#ifdef __i386__
823 .sizeof_rec[ds_pebs] = sizeof(long) * 10 901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
824#else 902#else
825 .sizeof_rec[ds_pebs] = sizeof(long) * 18 903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
826#endif 904#endif
827}; 905};
828static const struct ds_configuration ds_cfg_64 = { 906static const struct ds_configuration ds_cfg_pentium_m = {
829 .sizeof_ds = 8 * 12, 907 .name = "pentium m",
830 .sizeof_field = 8, 908 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
831 .sizeof_rec[ds_bts] = 8 * 3, 909
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
832#ifdef __i386__ 912#ifdef __i386__
833 .sizeof_rec[ds_pebs] = 8 * 10 913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
834#else 914#else
835 .sizeof_rec[ds_pebs] = 8 * 18 915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
836#endif 916#endif
837}; 917};
918static const struct ds_configuration ds_cfg_core2 = {
919 .name = "core 2",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10),
923
924 .sizeof_field = 8,
925 .sizeof_rec[ds_bts] = 8 * 3,
926 .sizeof_rec[ds_pebs] = 8 * 18,
927};
838 928
839static inline void 929static void
840ds_configure(const struct ds_configuration *cfg) 930ds_configure(const struct ds_configuration *cfg)
841{ 931{
932 memset(&ds_cfg, 0, sizeof(ds_cfg));
842 ds_cfg = *cfg; 933 ds_cfg = *cfg;
934
935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
936
937 if (!cpu_has_bts) {
938 ds_cfg.ctl[dsf_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n");
940 }
941 if (!cpu_has_pebs)
942 printk(KERN_INFO "[ds] pebs not available\n");
943
944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
843} 945}
844 946
845void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -852,10 +954,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
852 break; 954 break;
853 case 0xD: 955 case 0xD:
854 case 0xE: /* Pentium M */ 956 case 0xE: /* Pentium M */
855 ds_configure(&ds_cfg_var); 957 ds_configure(&ds_cfg_pentium_m);
856 break; 958 break;
857 default: /* Core2, Atom, ... */ 959 default: /* Core2, Atom, ... */
858 ds_configure(&ds_cfg_64); 960 ds_configure(&ds_cfg_core2);
859 break; 961 break;
860 } 962 }
861 break; 963 break;
@@ -864,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
864 case 0x0: 966 case 0x0:
865 case 0x1: 967 case 0x1:
866 case 0x2: /* Netburst */ 968 case 0x2: /* Netburst */
867 ds_configure(&ds_cfg_var); 969 ds_configure(&ds_cfg_netburst);
868 break; 970 break;
869 default: 971 default:
870 /* sorry, don't know about them */ 972 /* sorry, don't know about them */
@@ -877,12 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
877 } 979 }
878} 980}
879 981
880void ds_free(struct ds_context *context) 982/*
983 * Change the DS configuration from tracing prev to tracing next.
984 */
985void ds_switch_to(struct task_struct *prev, struct task_struct *next)
986{
987 struct ds_context *prev_ctx = prev->thread.ds_ctx;
988 struct ds_context *next_ctx = next->thread.ds_ctx;
989
990 if (prev_ctx) {
991 update_debugctlmsr(0);
992
993 if (prev_ctx->bts_master &&
994 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
995 struct bts_struct ts = {
996 .qualifier = bts_task_departs,
997 .variant.timestamp.jiffies = jiffies_64,
998 .variant.timestamp.pid = prev->pid
999 };
1000 bts_write(prev_ctx->bts_master, &ts);
1001 }
1002 }
1003
1004 if (next_ctx) {
1005 if (next_ctx->bts_master &&
1006 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1007 struct bts_struct ts = {
1008 .qualifier = bts_task_arrives,
1009 .variant.timestamp.jiffies = jiffies_64,
1010 .variant.timestamp.pid = next->pid
1011 };
1012 bts_write(next_ctx->bts_master, &ts);
1013 }
1014
1015 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1016 }
1017
1018 update_debugctlmsr(next->thread.debugctlmsr);
1019}
1020
1021void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1022{
1023 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
1024 tsk->thread.ds_ctx = NULL;
1025}
1026
1027void ds_exit_thread(struct task_struct *tsk)
881{ 1028{
882 /* This is called when the task owning the parameter context 1029 WARN_ON(tsk->thread.ds_ctx);
883 * is dying. There should not be any user of that context left
884 * to disturb us, anymore. */
885 unsigned long leftovers = context->count;
886 while (leftovers--)
887 ds_put_context(context);
888} 1030}
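
Note: the reworked ds.c above replaces the old task-based entry points with tracer handles. Below is a minimal usage sketch, assuming the declarations in <asm/ds.h> match the signatures visible in this diff (ds_request_bts(), ds_read_bts(), ds_release_bts()) and that the BTS_KERNEL/BTS_USER flags are defined there; it is an illustration only, not part of the patch.

/* Hypothetical caller of the new BTS tracer interface; not part of the patch. */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <asm/ds.h>

static struct bts_tracer *sketch_tracer;
static void *sketch_buffer;

static int bts_sketch_start(struct task_struct *task, size_t size)
{
	/* Buffer allocation (and its accounting) is now the caller's job. */
	sketch_buffer = kzalloc(size, GFP_KERNEL);
	if (!sketch_buffer)
		return -ENOMEM;

	/* No overflow callback; th == (size_t)-1 means "no interrupt threshold". */
	sketch_tracer = ds_request_bts(task, sketch_buffer, size,
				       NULL, (size_t)-1,
				       BTS_KERNEL | BTS_USER);
	if (IS_ERR(sketch_tracer)) {
		kfree(sketch_buffer);
		return PTR_ERR(sketch_tracer);
	}
	return 0;
}

static void bts_sketch_stop(void)
{
	/* ds_read_bts() refreshes trace->ds from the DS area; raw records
	 * live between trace->ds.begin and trace->ds.top, each
	 * trace->ds.size bytes, and trace->read() decodes one of them. */
	const struct bts_trace *trace = ds_read_bts(sketch_tracer);
	size_t bytes = 0;

	if (trace)
		bytes = (char *)trace->ds.top - (char *)trace->ds.begin;
	pr_info("bts sketch: %zu bytes of branch records captured\n", bytes);

	ds_release_bts(sketch_tracer);	/* suspends tracing, drops the context */
	kfree(sketch_buffer);
}

The caller keeps ownership of the buffer for the tracer's whole lifetime, which is what the "buffer allocation (memory accounting)" item in the new file header comment refers to.
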
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 5962176dfabb..6b1f6f6f8661 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -30,6 +30,37 @@ void printk_address(unsigned long address, int reliable)
 		reliable ? "" : "? ", (void *) address);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+			const struct stacktrace_ops *ops,
+			struct thread_info *tinfo, int *graph)
+{
+	struct task_struct *task = tinfo->task;
+	unsigned long ret_addr;
+	int index = task->curr_ret_stack;
+
+	if (addr != (unsigned long)return_to_handler)
+		return;
+
+	if (!task->ret_stack || index < *graph)
+		return;
+
+	index -= *graph;
+	ret_addr = task->ret_stack[index].ret;
+
+	ops->address(data, ret_addr, 1);
+
+	(*graph)++;
+}
+#else
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+			const struct stacktrace_ops *ops,
+			struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
 /*
  * x86-64 can have up to three kernel stacks:
  *  process stack
@@ -54,7 +85,7 @@ unsigned long
 print_context_stack(struct thread_info *tinfo,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data,
-		unsigned long *end)
+		unsigned long *end, int *graph)
 {
 	struct stack_frame *frame = (struct stack_frame *)bp;
 
@@ -70,6 +101,7 @@ print_context_stack(struct thread_info *tinfo,
 			} else {
 				ops->address(data, addr, bp == 0);
 			}
+			print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
 		}
 		stack++;
 	}
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 3119a801c32b..da87590b8698 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -18,7 +18,7 @@ extern unsigned long
 print_context_stack(struct thread_info *tinfo,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data,
-		unsigned long *end);
+		unsigned long *end, int *graph);
 
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 7b031b106ec8..d593cd1f58dc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -23,6 +23,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
+	int graph = 0;
+
 	if (!task)
 		task = current;
 
@@ -50,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
 		context = (struct thread_info *)
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		bp = print_context_stack(context, stack, bp, ops, data, NULL);
+		bp = print_context_stack(context, stack, bp, ops,
+					 data, NULL, &graph);
 
 		stack = (unsigned long *)context->previous_esp;
 		if (!stack)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 33ff10287a5d..c302d0707048 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -109,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
+	int graph = 0;
 
 	if (!task)
 		task = current;
@@ -149,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				break;
 
 			bp = print_context_stack(tinfo, stack, bp, ops,
-						 data, estack_end);
+						 data, estack_end, &graph);
 			ops->stack(data, "<EOE>");
 			/*
 			 * We link to the next stack via the
@@ -168,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
168 if (ops->stack(data, "IRQ") < 0) 169 if (ops->stack(data, "IRQ") < 0)
169 break; 170 break;
170 bp = print_context_stack(tinfo, stack, bp, 171 bp = print_context_stack(tinfo, stack, bp,
171 ops, data, irqstack_end); 172 ops, data, irqstack_end, &graph);
172 /* 173 /*
173 * We link to the next stack (which would be 174 * We link to the next stack (which would be
174 * the process stack normally) the last 175 * the process stack normally) the last
@@ -186,7 +187,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
186 /* 187 /*
187 * This handles the process stack: 188 * This handles the process stack:
188 */ 189 */
189 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); 190 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
190 put_cpu(); 191 put_cpu();
191} 192}
192EXPORT_SYMBOL(dump_trace); 193EXPORT_SYMBOL(dump_trace);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index fe7014176eb0..d6f0490a7391 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -954,6 +954,9 @@ ENTRY(mcount)
954END(mcount) 954END(mcount)
955 955
956ENTRY(ftrace_caller) 956ENTRY(ftrace_caller)
957 cmpl $0, function_trace_stop
958 jne ftrace_stub
959
957 pushl %eax 960 pushl %eax
958 pushl %ecx 961 pushl %ecx
959 pushl %edx 962 pushl %edx
@@ -968,6 +971,11 @@ ftrace_call:
968 popl %edx 971 popl %edx
969 popl %ecx 972 popl %ecx
970 popl %eax 973 popl %eax
974#ifdef CONFIG_FUNCTION_GRAPH_TRACER
975.globl ftrace_graph_call
976ftrace_graph_call:
977 jmp ftrace_stub
978#endif
971 979
972.globl ftrace_stub 980.globl ftrace_stub
973ftrace_stub: 981ftrace_stub:
@@ -977,8 +985,18 @@ END(ftrace_caller)
977#else /* ! CONFIG_DYNAMIC_FTRACE */ 985#else /* ! CONFIG_DYNAMIC_FTRACE */
978 986
979ENTRY(mcount) 987ENTRY(mcount)
988 cmpl $0, function_trace_stop
989 jne ftrace_stub
990
980 cmpl $ftrace_stub, ftrace_trace_function 991 cmpl $ftrace_stub, ftrace_trace_function
981 jnz trace 992 jnz trace
993#ifdef CONFIG_FUNCTION_GRAPH_TRACER
994 cmpl $ftrace_stub, ftrace_graph_return
995 jnz ftrace_graph_caller
996
997 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
998 jnz ftrace_graph_caller
999#endif
982.globl ftrace_stub 1000.globl ftrace_stub
983ftrace_stub: 1001ftrace_stub:
984 ret 1002 ret
@@ -997,12 +1015,43 @@ trace:
997 popl %edx 1015 popl %edx
998 popl %ecx 1016 popl %ecx
999 popl %eax 1017 popl %eax
1000
1001 jmp ftrace_stub 1018 jmp ftrace_stub
1002END(mcount) 1019END(mcount)
1003#endif /* CONFIG_DYNAMIC_FTRACE */ 1020#endif /* CONFIG_DYNAMIC_FTRACE */
1004#endif /* CONFIG_FUNCTION_TRACER */ 1021#endif /* CONFIG_FUNCTION_TRACER */
1005 1022
1023#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1024ENTRY(ftrace_graph_caller)
1025 cmpl $0, function_trace_stop
1026 jne ftrace_stub
1027
1028 pushl %eax
1029 pushl %ecx
1030 pushl %edx
1031 movl 0xc(%esp), %edx
1032 lea 0x4(%ebp), %eax
1033 subl $MCOUNT_INSN_SIZE, %edx
1034 call prepare_ftrace_return
1035 popl %edx
1036 popl %ecx
1037 popl %eax
1038 ret
1039END(ftrace_graph_caller)
1040
1041.globl return_to_handler
1042return_to_handler:
1043 pushl $0
1044 pushl %eax
1045 pushl %ecx
1046 pushl %edx
1047 call ftrace_return_to_handler
1048 movl %eax, 0xc(%esp)
1049 popl %edx
1050 popl %ecx
1051 popl %eax
1052 ret
1053#endif
1054
1006.section .rodata,"a" 1055.section .rodata,"a"
1007#include "syscall_table_32.S" 1056#include "syscall_table_32.S"
1008 1057
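On 32-bit the kernel is built with -mregparm=3, so %eax and %edx carry the first two C arguments. The ftrace_graph_caller stub above therefore hands prepare_ftrace_return() the address of the parent return-address slot (4(%ebp)) and the callsite ip (the mcount return address at 0xc(%esp), minus MCOUNT_INSN_SIZE). A hedged C restatement of that contract, matching the prototype added to ftrace.c further down:

/* assumed C-level equivalent of the ftrace_graph_caller assembly above */
extern void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr);

static void ftrace_graph_caller_c_view(unsigned long ebp, unsigned long mcount_ret)
{
	unsigned long *parent = (unsigned long *)(ebp + 4);	/* lea 0x4(%ebp), %eax */
	unsigned long self    = mcount_ret - MCOUNT_INSN_SIZE;	/* movl/subl into %edx */

	prepare_ftrace_return(parent, self);
}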
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3194636a4293..e28c7a987793 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -67,16 +67,10 @@ ENTRY(mcount)
67END(mcount) 67END(mcount)
68 68
69ENTRY(ftrace_caller) 69ENTRY(ftrace_caller)
70 cmpl $0, function_trace_stop
71 jne ftrace_stub
70 72
71 /* taken from glibc */ 73 MCOUNT_SAVE_FRAME
72 subq $0x38, %rsp
73 movq %rax, (%rsp)
74 movq %rcx, 8(%rsp)
75 movq %rdx, 16(%rsp)
76 movq %rsi, 24(%rsp)
77 movq %rdi, 32(%rsp)
78 movq %r8, 40(%rsp)
79 movq %r9, 48(%rsp)
80 74
81 movq 0x38(%rsp), %rdi 75 movq 0x38(%rsp), %rdi
82 movq 8(%rbp), %rsi 76 movq 8(%rbp), %rsi
@@ -86,14 +80,13 @@ ENTRY(ftrace_caller)
86ftrace_call: 80ftrace_call:
87 call ftrace_stub 81 call ftrace_stub
88 82
89 movq 48(%rsp), %r9 83 MCOUNT_RESTORE_FRAME
90 movq 40(%rsp), %r8 84
91 movq 32(%rsp), %rdi 85#ifdef CONFIG_FUNCTION_GRAPH_TRACER
92 movq 24(%rsp), %rsi 86.globl ftrace_graph_call
93 movq 16(%rsp), %rdx 87ftrace_graph_call:
94 movq 8(%rsp), %rcx 88 jmp ftrace_stub
95 movq (%rsp), %rax 89#endif
96 addq $0x38, %rsp
97 90
98.globl ftrace_stub 91.globl ftrace_stub
99ftrace_stub: 92ftrace_stub:
@@ -102,15 +95,63 @@ END(ftrace_caller)
102 95
103#else /* ! CONFIG_DYNAMIC_FTRACE */ 96#else /* ! CONFIG_DYNAMIC_FTRACE */
104ENTRY(mcount) 97ENTRY(mcount)
98 cmpl $0, function_trace_stop
99 jne ftrace_stub
100
105 cmpq $ftrace_stub, ftrace_trace_function 101 cmpq $ftrace_stub, ftrace_trace_function
106 jnz trace 102 jnz trace
103
104#ifdef CONFIG_FUNCTION_GRAPH_TRACER
105 cmpq $ftrace_stub, ftrace_graph_return
106 jnz ftrace_graph_caller
107
108 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
109 jnz ftrace_graph_caller
110#endif
111
107.globl ftrace_stub 112.globl ftrace_stub
108ftrace_stub: 113ftrace_stub:
109 retq 114 retq
110 115
111trace: 116trace:
112 /* taken from glibc */ 117 MCOUNT_SAVE_FRAME
113 subq $0x38, %rsp 118
119 movq 0x38(%rsp), %rdi
120 movq 8(%rbp), %rsi
121 subq $MCOUNT_INSN_SIZE, %rdi
122
123 call *ftrace_trace_function
124
125 MCOUNT_RESTORE_FRAME
126
127 jmp ftrace_stub
128END(mcount)
129#endif /* CONFIG_DYNAMIC_FTRACE */
130#endif /* CONFIG_FUNCTION_TRACER */
131
132#ifdef CONFIG_FUNCTION_GRAPH_TRACER
133ENTRY(ftrace_graph_caller)
134 cmpl $0, function_trace_stop
135 jne ftrace_stub
136
137 MCOUNT_SAVE_FRAME
138
139 leaq 8(%rbp), %rdi
140 movq 0x38(%rsp), %rsi
141 subq $MCOUNT_INSN_SIZE, %rsi
142
143 call prepare_ftrace_return
144
145 MCOUNT_RESTORE_FRAME
146
147 retq
148END(ftrace_graph_caller)
149
150
151.globl return_to_handler
152return_to_handler:
153 subq $80, %rsp
154
114 movq %rax, (%rsp) 155 movq %rax, (%rsp)
115 movq %rcx, 8(%rsp) 156 movq %rcx, 8(%rsp)
116 movq %rdx, 16(%rsp) 157 movq %rdx, 16(%rsp)
@@ -118,13 +159,14 @@ trace:
118 movq %rdi, 32(%rsp) 159 movq %rdi, 32(%rsp)
119 movq %r8, 40(%rsp) 160 movq %r8, 40(%rsp)
120 movq %r9, 48(%rsp) 161 movq %r9, 48(%rsp)
162 movq %r10, 56(%rsp)
163 movq %r11, 64(%rsp)
121 164
122 movq 0x38(%rsp), %rdi 165 call ftrace_return_to_handler
123 movq 8(%rbp), %rsi
124 subq $MCOUNT_INSN_SIZE, %rdi
125
126 call *ftrace_trace_function
127 166
167 movq %rax, 72(%rsp)
168 movq 64(%rsp), %r11
169 movq 56(%rsp), %r10
128 movq 48(%rsp), %r9 170 movq 48(%rsp), %r9
129 movq 40(%rsp), %r8 171 movq 40(%rsp), %r8
130 movq 32(%rsp), %rdi 172 movq 32(%rsp), %rdi
@@ -132,12 +174,10 @@ trace:
132 movq 16(%rsp), %rdx 174 movq 16(%rsp), %rdx
133 movq 8(%rsp), %rcx 175 movq 8(%rsp), %rcx
134 movq (%rsp), %rax 176 movq (%rsp), %rax
135 addq $0x38, %rsp 177 addq $72, %rsp
178 retq
179#endif
136 180
137 jmp ftrace_stub
138END(mcount)
139#endif /* CONFIG_DYNAMIC_FTRACE */
140#endif /* CONFIG_FUNCTION_TRACER */
141 181
142#ifndef CONFIG_PREEMPT 182#ifndef CONFIG_PREEMPT
143#define retint_kernel retint_restore_args 183#define retint_kernel retint_restore_args
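The open-coded register save/restore shown in the removed lines is folded into MCOUNT_SAVE_FRAME and MCOUNT_RESTORE_FRAME (presumably supplied by asm/ftrace.h). Their assumed effect, expressed as the layout of the scratch frame the old instructions built:

/*
 * Assumed layout of the 0x38-byte frame set up by MCOUNT_SAVE_FRAME, taken
 * from the open-coded sequence it replaces; offsets are from %rsp after the
 * subq $0x38, %rsp.
 */
struct mcount_saved_frame {
	unsigned long rax;	/* 0x00 */
	unsigned long rcx;	/* 0x08 */
	unsigned long rdx;	/* 0x10 */
	unsigned long rsi;	/* 0x18 */
	unsigned long rdi;	/* 0x20 */
	unsigned long r8;	/* 0x28 */
	unsigned long r9;	/* 0x30 */
};
/*
 * 0x38(%rsp), just above this frame, is the mcount return address that the
 * code above loads to compute the traced function's ip.
 */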
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9bf..1b43086b097a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
20#include <asm/ftrace.h> 21#include <asm/ftrace.h>
22#include <linux/ftrace.h>
21#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/nmi.h>
22 25
23 26
24static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; 27#ifdef CONFIG_DYNAMIC_FTRACE
25 28
26union ftrace_code_union { 29union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 30 char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
31 } __attribute__((packed)); 34 } __attribute__((packed));
32}; 35};
33 36
34
35static int ftrace_calc_offset(long ip, long addr) 37static int ftrace_calc_offset(long ip, long addr)
36{ 38{
37 return (int)(addr - ip); 39 return (int)(addr - ip);
38} 40}
39 41
40unsigned char *ftrace_nop_replace(void) 42static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
41{
42 return ftrace_nop;
43}
44
45unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{ 43{
47 static union ftrace_code_union calc; 44 static union ftrace_code_union calc;
48 45
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
56 return calc.code; 53 return calc.code;
57} 54}
58 55
59int 56/*
57 * Modifying code must take extra care. On an SMP machine, if
58 * the code being modified is also being executed on another CPU
59 * that CPU will have undefined results and possibly take a GPF.
 60 * We use kstop_machine to stop other CPUs from executing code.
61 * But this does not stop NMIs from happening. We still need
62 * to protect against that. We separate out the modification of
63 * the code to take care of this.
64 *
65 * Two buffers are added: An IP buffer and a "code" buffer.
66 *
67 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code
70 * 3) Wait for any running NMIs to finish.
71 * 4) Write the code
72 * 5) clear the flag.
73 * 6) Wait for any running NMIs to finish.
74 *
75 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write
77 * and if it is, it will write what is in the IP and "code" buffers.
78 *
79 * The trick is, it does not matter if everyone is writing the same
80 * content to the code location. Also, if a CPU is executing code
81 * it is OK to write to that code location if the contents being written
82 * are the same as what exists.
83 */
84
85static atomic_t in_nmi = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */
90
91static unsigned nmi_wait_count;
92static atomic_t nmi_update_count = ATOMIC_INIT(0);
93
94int ftrace_arch_read_dyn_info(char *buf, int size)
95{
96 int r;
97
98 r = snprintf(buf, size, "%u %u",
99 nmi_wait_count,
100 atomic_read(&nmi_update_count));
101 return r;
102}
103
104static void ftrace_mod_code(void)
105{
106 /*
107 * Yes, more than one CPU process can be writing to mod_code_status.
108 * (and the code itself)
109 * But if one were to fail, then they all should, and if one were
110 * to succeed, then they all should.
111 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE);
114}
115
116void ftrace_nmi_enter(void)
117{
118 atomic_inc(&in_nmi);
119 /* Must have in_nmi seen before reading write flag */
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code();
123 atomic_inc(&nmi_update_count);
124 }
125}
126
127void ftrace_nmi_exit(void)
128{
129 /* Finish all executions before clearing in_nmi */
130 smp_wmb();
131 atomic_dec(&in_nmi);
132}
133
134static void wait_for_nmi(void)
135{
136 int waited = 0;
137
138 while (atomic_read(&in_nmi)) {
139 waited = 1;
140 cpu_relax();
141 }
142
143 if (waited)
144 nmi_wait_count++;
145}
146
147static int
148do_ftrace_mod_code(unsigned long ip, void *new_code)
149{
150 mod_code_ip = (void *)ip;
151 mod_code_newcode = new_code;
152
153 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb();
160
161 wait_for_nmi();
162
163 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb();
165
166 ftrace_mod_code();
167
168 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb();
175
176 wait_for_nmi();
177
178 return mod_code_status;
179}
180
181
182
183
184static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
185
186static unsigned char *ftrace_nop_replace(void)
187{
188 return ftrace_nop;
189}
190
191static int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 192ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 193 unsigned char *new_code)
62{ 194{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
81 return -EINVAL; 213 return -EINVAL;
82 214
83 /* replace the text with the new text */ 215 /* replace the text with the new text */
84 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) 216 if (do_ftrace_mod_code(ip, new_code))
85 return -EPERM; 217 return -EPERM;
86 218
87 sync_core(); 219 sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
89 return 0; 221 return 0;
90} 222}
91 223
224int ftrace_make_nop(struct module *mod,
225 struct dyn_ftrace *rec, unsigned long addr)
226{
227 unsigned char *new, *old;
228 unsigned long ip = rec->ip;
229
230 old = ftrace_call_replace(ip, addr);
231 new = ftrace_nop_replace();
232
233 return ftrace_modify_code(rec->ip, old, new);
234}
235
236int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
237{
238 unsigned char *new, *old;
239 unsigned long ip = rec->ip;
240
241 old = ftrace_nop_replace();
242 new = ftrace_call_replace(ip, addr);
243
244 return ftrace_modify_code(rec->ip, old, new);
245}
246
92int ftrace_update_ftrace_func(ftrace_func_t func) 247int ftrace_update_ftrace_func(ftrace_func_t func)
93{ 248{
94 unsigned long ip = (unsigned long)(&ftrace_call); 249 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
165 320
166 return 0; 321 return 0;
167} 322}
323#endif
324
325#ifdef CONFIG_FUNCTION_GRAPH_TRACER
326
327#ifdef CONFIG_DYNAMIC_FTRACE
328extern void ftrace_graph_call(void);
329
330static int ftrace_mod_jmp(unsigned long ip,
331 int old_offset, int new_offset)
332{
333 unsigned char code[MCOUNT_INSN_SIZE];
334
335 if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
336 return -EFAULT;
337
338 if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
339 return -EINVAL;
340
341 *(int *)(&code[1]) = new_offset;
342
343 if (do_ftrace_mod_code(ip, &code))
344 return -EPERM;
345
346 return 0;
347}
348
349int ftrace_enable_ftrace_graph_caller(void)
350{
351 unsigned long ip = (unsigned long)(&ftrace_graph_call);
352 int old_offset, new_offset;
353
354 old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
355 new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
356
357 return ftrace_mod_jmp(ip, old_offset, new_offset);
358}
359
360int ftrace_disable_ftrace_graph_caller(void)
361{
362 unsigned long ip = (unsigned long)(&ftrace_graph_call);
363 int old_offset, new_offset;
364
365 old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
366 new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
367
368 return ftrace_mod_jmp(ip, old_offset, new_offset);
369}
370
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */
391
392/* Add a function return address to the trace stack on thread info.*/
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
 417/* Retrieve a function return address from the trace stack on thread info. */
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
 427 /* Might as well panic, otherwise we have nowhere to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/*
466 * Hook the return address and push it in the stack of return addrs
467 * in current thread info.
468 */
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{
471 unsigned long old;
472 unsigned long long calltime;
473 int faulted;
474 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long)
476 &return_to_handler;
477
 478 /* NMIs are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi)))
480 return;
481
482 if (unlikely(atomic_read(&current->tracing_graph_pause)))
483 return;
484
485 /*
486 * Protect against fault, even if it shouldn't
 487 * happen. This tool is too intrusive to
488 * ignore such a protection.
489 */
490 asm volatile(
491 "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
492 "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
493 " movl $0, %[faulted]\n"
494
495 ".section .fixup, \"ax\"\n"
496 "3: movl $1, %[faulted]\n"
497 ".previous\n"
498
499 _ASM_EXTABLE(1b, 3b)
500 _ASM_EXTABLE(2b, 3b)
501
502 : [parent_replaced] "=r" (parent), [old] "=r" (old),
503 [faulted] "=r" (faulted)
504 : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
505 : "memory"
506 );
507
508 if (unlikely(faulted)) {
509 ftrace_graph_stop();
510 WARN_ON(1);
511 return;
512 }
513
514 if (unlikely(!__kernel_text_address(old))) {
515 ftrace_graph_stop();
516 *parent = old;
517 WARN_ON(1);
518 return;
519 }
520
521 calltime = cpu_clock(raw_smp_processor_id());
522
523 if (push_return_trace(old, calltime,
524 self_addr, &trace.depth) == -EBUSY) {
525 *parent = old;
526 return;
527 }
528
529 trace.func = self_addr;
530
531 /* Only trace if the calling function expects to */
532 if (!ftrace_graph_entry(&trace)) {
533 current->curr_ret_stack--;
534 *parent = old;
535 }
536}
537#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
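ftrace_call_replace() and ftrace_enable/disable_ftrace_graph_caller() all build on the same arithmetic: the rel32 operand of a 5-byte call (0xe8) or jmp (0xe9) is the target minus the address of the following instruction, i.e. addr - (ip + MCOUNT_INSN_SIZE). A small stand-alone illustration of that encoding, using made-up addresses:

#include <stdio.h>
#include <string.h>

#define MCOUNT_INSN_SIZE 5	/* size of a call/jmp rel32 on x86 */

int main(void)
{
	/* hypothetical addresses, for illustration only */
	unsigned long ip   = 0xc0100100UL;	/* patch site, e.g. ftrace_graph_call */
	unsigned long addr = 0xc0123450UL;	/* target, e.g. ftrace_graph_caller   */
	int rel32 = (int)(addr - (ip + MCOUNT_INSN_SIZE));
	unsigned char insn[MCOUNT_INSN_SIZE] = { 0xe9 };	/* jmp rel32 */

	memcpy(&insn[1], &rel32, sizeof(rel32));
	printf("jmp rel32 = %d (0x%08x)\n", rel32, (unsigned int)rel32);
	return 0;
}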
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1d3d0e71b044..1df869e5bd0b 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,6 +13,7 @@
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/ftrace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io_apic.h> 18#include <asm/io_apic.h>
18#include <asm/idle.h> 19#include <asm/idle.h>
@@ -45,7 +46,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
45 * SMP cross-CPU interrupts have their own specific 46 * SMP cross-CPU interrupts have their own specific
46 * handlers). 47 * handlers).
47 */ 48 */
48asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 49asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
49{ 50{
50 struct pt_regs *old_regs = set_irq_regs(regs); 51 struct pt_regs *old_regs = set_irq_regs(regs);
51 struct irq_desc *desc; 52 struct irq_desc *desc;
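do_IRQ() is now tagged __irq_entry so that it lands in a dedicated .irqentry.text section, which the IRQENTRY_TEXT additions to the linker scripts further down collect; the function graph tracer can then distinguish interrupt entry from ordinary call nesting. The annotation is assumed to be a plain section attribute along these lines:

/* assumed definition; the real macro is provided by the ftrace headers */
#define __irq_entry __attribute__((__section__(".irqentry.text")))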
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b8f3e9dbabd7..e68bb9e30864 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/ftrace.h>
11#include <asm/system.h> 12#include <asm/system.h>
12#include <asm/apic.h> 13#include <asm/apic.h>
13 14
@@ -102,6 +103,9 @@ static inline int hlt_use_halt(void)
102void default_idle(void) 103void default_idle(void)
103{ 104{
104 if (hlt_use_halt()) { 105 if (hlt_use_halt()) {
106 struct power_trace it;
107
108 trace_power_start(&it, POWER_CSTATE, 1);
105 current_thread_info()->status &= ~TS_POLLING; 109 current_thread_info()->status &= ~TS_POLLING;
106 /* 110 /*
107 * TS_POLLING-cleared state must be visible before we 111 * TS_POLLING-cleared state must be visible before we
@@ -114,6 +118,7 @@ void default_idle(void)
114 else 118 else
115 local_irq_enable(); 119 local_irq_enable();
116 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121 trace_power_end(&it);
117 } else { 122 } else {
118 local_irq_enable(); 123 local_irq_enable();
119 /* loop is done by the caller */ 124 /* loop is done by the caller */
@@ -171,24 +176,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
171 */ 176 */
172void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 177void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
173{ 178{
179 struct power_trace it;
180
181 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
174 if (!need_resched()) { 182 if (!need_resched()) {
175 __monitor((void *)&current_thread_info()->flags, 0, 0); 183 __monitor((void *)&current_thread_info()->flags, 0, 0);
176 smp_mb(); 184 smp_mb();
177 if (!need_resched()) 185 if (!need_resched())
178 __mwait(ax, cx); 186 __mwait(ax, cx);
179 } 187 }
188 trace_power_end(&it);
180} 189}
181 190
182/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 191/* Default MONITOR/MWAIT with no hints, used for default C1 state */
183static void mwait_idle(void) 192static void mwait_idle(void)
184{ 193{
194 struct power_trace it;
185 if (!need_resched()) { 195 if (!need_resched()) {
196 trace_power_start(&it, POWER_CSTATE, 1);
186 __monitor((void *)&current_thread_info()->flags, 0, 0); 197 __monitor((void *)&current_thread_info()->flags, 0, 0);
187 smp_mb(); 198 smp_mb();
188 if (!need_resched()) 199 if (!need_resched())
189 __sti_mwait(0, 0); 200 __sti_mwait(0, 0);
190 else 201 else
191 local_irq_enable(); 202 local_irq_enable();
203 trace_power_end(&it);
192 } else 204 } else
193 local_irq_enable(); 205 local_irq_enable();
194} 206}
@@ -200,9 +212,13 @@ static void mwait_idle(void)
200 */ 212 */
201static void poll_idle(void) 213static void poll_idle(void)
202{ 214{
215 struct power_trace it;
216
217 trace_power_start(&it, POWER_CSTATE, 0);
203 local_irq_enable(); 218 local_irq_enable();
204 while (!need_resched()) 219 while (!need_resched())
205 cpu_relax(); 220 cpu_relax();
221 trace_power_end(&it);
206} 222}
207 223
208/* 224/*
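The idle routines now bracket their low-power sections with trace_power_start()/trace_power_end(), passing a stack-allocated struct power_trace plus a C-state hint. A hedged sketch of the same pattern applied to a hypothetical idle helper (struct power_trace, POWER_CSTATE and the trace_power_*() calls come from the new power tracing interface):

static void example_idle_with_power_trace(void)
{
	struct power_trace it;

	trace_power_start(&it, POWER_CSTATE, 1);	/* about to enter C1 */
	if (!need_resched())
		safe_halt();				/* sti; hlt */
	else
		local_irq_enable();
	trace_power_end(&it);
}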
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..3ba155d24884 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h> 40#include <linux/dmi.h>
41#include <linux/ftrace.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
@@ -59,6 +60,7 @@
59#include <asm/idle.h> 60#include <asm/idle.h>
60#include <asm/syscalls.h> 61#include <asm/syscalls.h>
61#include <asm/smp.h> 62#include <asm/smp.h>
63#include <asm/ds.h>
62 64
63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 65asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
64 66
@@ -250,14 +252,8 @@ void exit_thread(void)
250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 252 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
251 put_cpu(); 253 put_cpu();
252 } 254 }
253#ifdef CONFIG_X86_DS 255
254 /* Free any DS contexts that have not been properly released. */ 256 ds_exit_thread(current);
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
261} 257}
262 258
263void flush_thread(void) 259void flush_thread(void)
@@ -339,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
339 kfree(p->thread.io_bitmap_ptr); 335 kfree(p->thread.io_bitmap_ptr);
340 p->thread.io_bitmap_max = 0; 336 p->thread.io_bitmap_max = 0;
341 } 337 }
338
339 ds_copy_thread(p, current);
340
341 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
342 p->thread.debugctlmsr = 0;
343
342 return err; 344 return err;
343} 345}
344 346
@@ -419,48 +421,19 @@ int set_tsc_mode(unsigned int val)
419 return 0; 421 return 0;
420} 422}
421 423
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
451static noinline void 424static noinline void
452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 425__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
453 struct tss_struct *tss) 426 struct tss_struct *tss)
454{ 427{
455 struct thread_struct *prev, *next; 428 struct thread_struct *prev, *next;
456 unsigned long debugctl;
457 429
458 prev = &prev_p->thread; 430 prev = &prev_p->thread;
459 next = &next_p->thread; 431 next = &next_p->thread;
460 432
461 debugctl = update_debugctl(prev, next, prev->debugctlmsr); 433 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
462 434 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
463 if (next->debugctlmsr != debugctl) 435 ds_switch_to(prev_p, next_p);
436 else if (next->debugctlmsr != prev->debugctlmsr)
464 update_debugctlmsr(next->debugctlmsr); 437 update_debugctlmsr(next->debugctlmsr);
465 438
466 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 439 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -482,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
482 hard_enable_TSC(); 455 hard_enable_TSC();
483 } 456 }
484 457
485#ifdef CONFIG_X86_PTRACE_BTS
486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
488
489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
491#endif /* CONFIG_X86_PTRACE_BTS */
492
493
494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 458 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
495 /* 459 /*
496 * Disable the bitmap via an invalid offset. We still cache 460 * Disable the bitmap via an invalid offset. We still cache
@@ -548,7 +512,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
548 * the task-switch, and shows up in ret_from_fork in entry.S, 512 * the task-switch, and shows up in ret_from_fork in entry.S,
549 * for example. 513 * for example.
550 */ 514 */
551struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 515__notrace_funcgraph struct task_struct *
516__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552{ 517{
553 struct thread_struct *prev = &prev_p->thread, 518 struct thread_struct *prev = &prev_p->thread,
554 *next = &next_p->thread; 519 *next = &next_p->thread;
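__switch_to() is now annotated __notrace_funcgraph. The graph tracer parks its return hook on the current task's ret_stack, and that association would be wrong across a context switch, so the annotation is assumed to reduce to notrace only when the graph tracer is configured:

/* assumed definition (linux/ftrace.h) */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#define __notrace_funcgraph	notrace
#else
#define __notrace_funcgraph
#endif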
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b6..416fb9282f4f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h> 40#include <linux/uaccess.h>
41#include <linux/io.h> 41#include <linux/io.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include <asm/system.h> 45#include <asm/system.h>
@@ -52,6 +53,7 @@
52#include <asm/ia32.h> 53#include <asm/ia32.h>
53#include <asm/idle.h> 54#include <asm/idle.h>
54#include <asm/syscalls.h> 55#include <asm/syscalls.h>
56#include <asm/ds.h>
55 57
56asmlinkage extern void ret_from_fork(void); 58asmlinkage extern void ret_from_fork(void);
57 59
@@ -235,14 +237,8 @@ void exit_thread(void)
235 t->io_bitmap_max = 0; 237 t->io_bitmap_max = 0;
236 put_cpu(); 238 put_cpu();
237 } 239 }
238#ifdef CONFIG_X86_DS 240
239 /* Free any DS contexts that have not been properly released. */ 241 ds_exit_thread(current);
240 if (unlikely(t->ds_ctx)) {
241 /* we clear debugctl to make sure DS is not used. */
242 update_debugctlmsr(0);
243 ds_free(t->ds_ctx);
244 }
245#endif /* CONFIG_X86_DS */
246} 242}
247 243
248void flush_thread(void) 244void flush_thread(void)
@@ -372,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
372 if (err) 368 if (err)
373 goto out; 369 goto out;
374 } 370 }
371
372 ds_copy_thread(p, me);
373
374 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
375 p->thread.debugctlmsr = 0;
376
375 err = 0; 377 err = 0;
376out: 378out:
377 if (err && p->thread.io_bitmap_ptr) { 379 if (err && p->thread.io_bitmap_ptr) {
@@ -470,35 +472,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
470 struct tss_struct *tss) 472 struct tss_struct *tss)
471{ 473{
472 struct thread_struct *prev, *next; 474 struct thread_struct *prev, *next;
473 unsigned long debugctl;
474 475
475 prev = &prev_p->thread, 476 prev = &prev_p->thread,
476 next = &next_p->thread; 477 next = &next_p->thread;
477 478
478 debugctl = prev->debugctlmsr; 479 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
479 480 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
480#ifdef CONFIG_X86_DS 481 ds_switch_to(prev_p, next_p);
481 { 482 else if (next->debugctlmsr != prev->debugctlmsr)
482 unsigned long ds_prev = 0, ds_next = 0;
483
484 if (prev->ds_ctx)
485 ds_prev = (unsigned long)prev->ds_ctx->ds;
486 if (next->ds_ctx)
487 ds_next = (unsigned long)next->ds_ctx->ds;
488
489 if (ds_next != ds_prev) {
490 /*
491 * We clear debugctl to make sure DS
492 * is not in use when we change it:
493 */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsrl(MSR_IA32_DS_AREA, ds_next);
497 }
498 }
499#endif /* CONFIG_X86_DS */
500
501 if (next->debugctlmsr != debugctl)
502 update_debugctlmsr(next->debugctlmsr); 483 update_debugctlmsr(next->debugctlmsr);
503 484
504 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 485 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -533,14 +514,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
533 */ 514 */
534 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 515 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
535 } 516 }
536
537#ifdef CONFIG_X86_PTRACE_BTS
538 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
539 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
540
541 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
542 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
543#endif /* CONFIG_X86_PTRACE_BTS */
544} 517}
545 518
546/* 519/*
@@ -551,8 +524,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
551 * - could test fs/gs bitsliced 524 * - could test fs/gs bitsliced
552 * 525 *
553 * Kprobes not supported here. Set the probe on schedule instead. 526 * Kprobes not supported here. Set the probe on schedule instead.
 527 * Function graph tracer is not supported here either.
554 */ 528 */
555struct task_struct * 529__notrace_funcgraph struct task_struct *
556__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 530__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
557{ 531{
558 struct thread_struct *prev = &prev_p->thread; 532 struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 06180dff5b2e..0a5df5f82fb9 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -581,158 +581,91 @@ static int ioperm_get(struct task_struct *target,
581} 581}
582 582
583#ifdef CONFIG_X86_PTRACE_BTS 583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
640 base += (bts_cfg.sizeof_field * field);;
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index, 584static int ptrace_bts_read_record(struct task_struct *child, size_t index,
664 struct bts_struct __user *out) 585 struct bts_struct __user *out)
665{ 586{
666 struct bts_struct ret; 587 const struct bts_trace *trace;
667 const void *bts_record; 588 struct bts_struct bts;
668 size_t bts_index, bts_end; 589 const unsigned char *at;
669 int error; 590 int error;
670 591
671 error = ds_get_bts_end(child, &bts_end); 592 trace = ds_read_bts(child->bts);
672 if (error < 0) 593 if (!trace)
673 return error; 594 return -EPERM;
674
675 if (bts_end <= index)
676 return -EINVAL;
677 595
678 error = ds_get_bts_index(child, &bts_index); 596 at = trace->ds.top - ((index + 1) * trace->ds.size);
679 if (error < 0) 597 if ((void *)at < trace->ds.begin)
680 return error; 598 at += (trace->ds.n * trace->ds.size);
681 599
682 /* translate the ptrace bts index into the ds bts index */ 600 if (!trace->read)
683 bts_index += bts_end - (index + 1); 601 return -EOPNOTSUPP;
684 if (bts_end <= bts_index)
685 bts_index -= bts_end;
686 602
687 error = ds_access_bts(child, bts_index, &bts_record); 603 error = trace->read(child->bts, at, &bts);
688 if (error < 0) 604 if (error < 0)
689 return error; 605 return error;
690 606
691 ptrace_bts_translate_record(&ret, bts_record); 607 if (copy_to_user(out, &bts, sizeof(bts)))
692
693 if (copy_to_user(out, &ret, sizeof(ret)))
694 return -EFAULT; 608 return -EFAULT;
695 609
696 return sizeof(ret); 610 return sizeof(bts);
697} 611}
698 612
699static int ptrace_bts_drain(struct task_struct *child, 613static int ptrace_bts_drain(struct task_struct *child,
700 long size, 614 long size,
701 struct bts_struct __user *out) 615 struct bts_struct __user *out)
702{ 616{
703 struct bts_struct ret; 617 const struct bts_trace *trace;
704 const unsigned char *raw; 618 const unsigned char *at;
705 size_t end, i; 619 int error, drained = 0;
706 int error;
707 620
708 error = ds_get_bts_index(child, &end); 621 trace = ds_read_bts(child->bts);
709 if (error < 0) 622 if (!trace)
710 return error; 623 return -EPERM;
711 624
712 if (size < (end * sizeof(struct bts_struct))) 625 if (!trace->read)
626 return -EOPNOTSUPP;
627
628 if (size < (trace->ds.top - trace->ds.begin))
713 return -EIO; 629 return -EIO;
714 630
715 error = ds_access_bts(child, 0, (const void **)&raw); 631 for (at = trace->ds.begin; (void *)at < trace->ds.top;
716 if (error < 0) 632 out++, drained++, at += trace->ds.size) {
717 return error; 633 struct bts_struct bts;
634 int error;
718 635
719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { 636 error = trace->read(child->bts, at, &bts);
720 ptrace_bts_translate_record(&ret, raw); 637 if (error < 0)
638 return error;
721 639
722 if (copy_to_user(out, &ret, sizeof(ret))) 640 if (copy_to_user(out, &bts, sizeof(bts)))
723 return -EFAULT; 641 return -EFAULT;
724 } 642 }
725 643
726 error = ds_clear_bts(child); 644 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
645
646 error = ds_reset_bts(child->bts);
727 if (error < 0) 647 if (error < 0)
728 return error; 648 return error;
729 649
730 return end; 650 return drained;
731} 651}
732 652
733static void ptrace_bts_ovfl(struct task_struct *child) 653static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
734{ 654{
735 send_sig(child->thread.bts_ovfl_signal, child, 0); 655 child->bts_buffer = alloc_locked_buffer(size);
656 if (!child->bts_buffer)
657 return -ENOMEM;
658
659 child->bts_size = size;
660
661 return 0;
662}
663
664static void ptrace_bts_free_buffer(struct task_struct *child)
665{
666 free_locked_buffer(child->bts_buffer, child->bts_size);
667 child->bts_buffer = NULL;
668 child->bts_size = 0;
736} 669}
737 670
738static int ptrace_bts_config(struct task_struct *child, 671static int ptrace_bts_config(struct task_struct *child,
@@ -740,114 +673,86 @@ static int ptrace_bts_config(struct task_struct *child,
740 const struct ptrace_bts_config __user *ucfg) 673 const struct ptrace_bts_config __user *ucfg)
741{ 674{
742 struct ptrace_bts_config cfg; 675 struct ptrace_bts_config cfg;
743 int error = 0; 676 unsigned int flags = 0;
744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748 677
749 error = -EIO;
750 if (cfg_size < sizeof(cfg)) 678 if (cfg_size < sizeof(cfg))
751 goto errout; 679 return -EIO;
752 680
753 error = -EFAULT;
754 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 681 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
755 goto errout; 682 return -EFAULT;
756 683
757 error = -EINVAL; 684 if (child->bts) {
758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && 685 ds_release_bts(child->bts);
759 !(cfg.flags & PTRACE_BTS_O_ALLOC)) 686 child->bts = NULL;
760 goto errout; 687 }
761 688
762 if (cfg.flags & PTRACE_BTS_O_ALLOC) { 689 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
763 ds_ovfl_callback_t ovfl = NULL; 690 if (!cfg.signal)
764 unsigned int sig = 0; 691 return -EINVAL;
765 692
766 /* we ignore the error in case we were not tracing child */ 693 return -EOPNOTSUPP;
767 (void)ds_release_bts(child);
768 694
769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 695 child->thread.bts_ovfl_signal = cfg.signal;
770 if (!cfg.signal) 696 }
771 goto errout;
772 697
773 sig = cfg.signal; 698 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
774 ovfl = ptrace_bts_ovfl; 699 (cfg.size != child->bts_size)) {
775 } 700 int error;
776 701
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); 702 ptrace_bts_free_buffer(child);
778 if (error < 0)
779 goto errout;
780 703
781 child->thread.bts_ovfl_signal = sig; 704 error = ptrace_bts_allocate_buffer(child, cfg.size);
705 if (error < 0)
706 return error;
782 } 707 }
783 708
784 error = -EINVAL;
785 if (!child->thread.ds_ctx && cfg.flags)
786 goto errout;
787
788 if (cfg.flags & PTRACE_BTS_O_TRACE) 709 if (cfg.flags & PTRACE_BTS_O_TRACE)
789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask; 710 flags |= BTS_USER;
790 else
791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
792 711
793 if (cfg.flags & PTRACE_BTS_O_SCHED) 712 if (cfg.flags & PTRACE_BTS_O_SCHED)
794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 713 flags |= BTS_TIMESTAMPS;
795 else
796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
797 714
798 error = sizeof(cfg); 715 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
716 /* ovfl = */ NULL, /* th = */ (size_t)-1,
717 flags);
718 if (IS_ERR(child->bts)) {
719 int error = PTR_ERR(child->bts);
799 720
800out: 721 ptrace_bts_free_buffer(child);
801 if (child->thread.debugctlmsr) 722 child->bts = NULL;
802 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
803 else
804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
805 723
806 return error; 724 return error;
725 }
807 726
808errout: 727 return sizeof(cfg);
809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
811 goto out;
812} 728}
813 729
814static int ptrace_bts_status(struct task_struct *child, 730static int ptrace_bts_status(struct task_struct *child,
815 long cfg_size, 731 long cfg_size,
816 struct ptrace_bts_config __user *ucfg) 732 struct ptrace_bts_config __user *ucfg)
817{ 733{
734 const struct bts_trace *trace;
818 struct ptrace_bts_config cfg; 735 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
822 736
823 if (cfg_size < sizeof(cfg)) 737 if (cfg_size < sizeof(cfg))
824 return -EIO; 738 return -EIO;
825 739
826 error = ds_get_bts_end(child, &end); 740 trace = ds_read_bts(child->bts);
827 if (error < 0) 741 if (!trace)
828 return error; 742 return -EPERM;
829
830 error = ds_access_bts(child, /* index = */ 0, &base);
831 if (error < 0)
832 return error;
833
834 error = ds_access_bts(child, /* index = */ end, &max);
835 if (error < 0)
836 return error;
837 743
838 memset(&cfg, 0, sizeof(cfg)); 744 memset(&cfg, 0, sizeof(cfg));
839 cfg.size = (max - base); 745 cfg.size = trace->ds.end - trace->ds.begin;
840 cfg.signal = child->thread.bts_ovfl_signal; 746 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct); 747 cfg.bts_size = sizeof(struct bts_struct);
842 748
843 if (cfg.signal) 749 if (cfg.signal)
844 cfg.flags |= PTRACE_BTS_O_SIGNAL; 750 cfg.flags |= PTRACE_BTS_O_SIGNAL;
845 751
846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 752 if (trace->ds.flags & BTS_USER)
847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
848 cfg.flags |= PTRACE_BTS_O_TRACE; 753 cfg.flags |= PTRACE_BTS_O_TRACE;
849 754
850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 755 if (trace->ds.flags & BTS_TIMESTAMPS)
851 cfg.flags |= PTRACE_BTS_O_SCHED; 756 cfg.flags |= PTRACE_BTS_O_SCHED;
852 757
853 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 758 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -856,109 +761,77 @@ static int ptrace_bts_status(struct task_struct *child,
856 return sizeof(cfg); 761 return sizeof(cfg);
857} 762}
858 763
859static int ptrace_bts_write_record(struct task_struct *child, 764static int ptrace_bts_clear(struct task_struct *child)
860 const struct bts_struct *in)
861{ 765{
862 unsigned char bts_record[BTS_MAX_RECORD_SIZE]; 766 const struct bts_trace *trace;
863 767
864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); 768 trace = ds_read_bts(child->bts);
769 if (!trace)
770 return -EPERM;
865 771
866 memset(bts_record, 0, bts_cfg.sizeof_bts); 772 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
867 switch (in->qualifier) {
868 case BTS_INVALID:
869 break;
870 773
871 case BTS_BRANCH: 774 return ds_reset_bts(child->bts);
872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip); 775}
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
875 776
876 case BTS_TASK_ARRIVES: 777static int ptrace_bts_size(struct task_struct *child)
877 case BTS_TASK_DEPARTS: 778{
878 bts_set(bts_record, bts_from, bts_escape); 779 const struct bts_trace *trace;
879 bts_set(bts_record, bts_qual, in->qualifier);
880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
882 780
883 default: 781 trace = ds_read_bts(child->bts);
884 return -EINVAL; 782 if (!trace)
885 } 783 return -EPERM;
886 784
887 /* The writing task will be the switched-to task on a context 785 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
888 * switch. It needs to write into the switched-from task's BTS
889 * buffer. */
890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
891} 786}
892 787
893void ptrace_bts_take_timestamp(struct task_struct *tsk, 788static void ptrace_bts_fork(struct task_struct *tsk)
894 enum bts_qualifier qualifier)
895{ 789{
896 struct bts_struct rec = { 790 tsk->bts = NULL;
897 .qualifier = qualifier, 791 tsk->bts_buffer = NULL;
898 .variant.jiffies = jiffies_64 792 tsk->bts_size = 0;
899 }; 793 tsk->thread.bts_ovfl_signal = 0;
900
901 ptrace_bts_write_record(tsk, &rec);
902} 794}
903 795
904static const struct bts_configuration bts_cfg_netburst = { 796static void ptrace_bts_untrace(struct task_struct *child)
905 .sizeof_bts = sizeof(long) * 3, 797{
906 .sizeof_field = sizeof(long), 798 if (unlikely(child->bts)) {
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5) 799 ds_release_bts(child->bts);
908}; 800 child->bts = NULL;
801
802 /* We cannot update total_vm and locked_vm since
803 child's mm is already gone. But we can reclaim the
804 memory. */
805 kfree(child->bts_buffer);
806 child->bts_buffer = NULL;
807 child->bts_size = 0;
808 }
809}
909 810
910static const struct bts_configuration bts_cfg_pentium_m = { 811static void ptrace_bts_detach(struct task_struct *child)
911 .sizeof_bts = sizeof(long) * 3, 812{
912 .sizeof_field = sizeof(long), 813 if (unlikely(child->bts)) {
913 .debugctl_mask = (1<<6)|(1<<7) 814 ds_release_bts(child->bts);
914}; 815 child->bts = NULL;
915 816
916static const struct bts_configuration bts_cfg_core2 = { 817 ptrace_bts_free_buffer(child);
917 .sizeof_bts = 8 * 3, 818 }
918 .sizeof_field = 8, 819}
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9) 820#else
920}; 821static inline void ptrace_bts_fork(struct task_struct *tsk) {}
822static inline void ptrace_bts_detach(struct task_struct *child) {}
823static inline void ptrace_bts_untrace(struct task_struct *child) {}
824#endif /* CONFIG_X86_PTRACE_BTS */
921 825
922static inline void bts_configure(const struct bts_configuration *cfg) 826void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
923{ 827{
924 bts_cfg = *cfg; 828 ptrace_bts_fork(child);
925} 829}
926 830
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) 831void x86_ptrace_untrace(struct task_struct *child)
928{ 832{
929 switch (c->x86) { 833 ptrace_bts_untrace(child);
930 case 0x6:
931 switch (c->x86_model) {
932 case 0 ... 0xC:
933 /* sorry, don't know about them */
934 break;
935 case 0xD:
936 case 0xE: /* Pentium M */
937 bts_configure(&bts_cfg_pentium_m);
938 break;
939 default: /* Core2, Atom, ... */
940 bts_configure(&bts_cfg_core2);
941 break;
942 }
943 break;
944 case 0xF:
945 switch (c->x86_model) {
946 case 0x0:
947 case 0x1:
948 case 0x2: /* Netburst */
949 bts_configure(&bts_cfg_netburst);
950 break;
951 default:
952 /* sorry, don't know about them */
953 break;
954 }
955 break;
956 default:
957 /* sorry, don't know about them */
958 break;
959 }
960} 834}
961#endif /* CONFIG_X86_PTRACE_BTS */
962 835
963/* 836/*
964 * Called by kernel/ptrace.c when detaching.. 837 * Called by kernel/ptrace.c when detaching..
@@ -971,15 +844,7 @@ void ptrace_disable(struct task_struct *child)
971#ifdef TIF_SYSCALL_EMU 844#ifdef TIF_SYSCALL_EMU
972 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 845 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
973#endif 846#endif
974#ifdef CONFIG_X86_PTRACE_BTS 847 ptrace_bts_detach(child);
975 (void)ds_release_bts(child);
976
977 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
978 if (!child->thread.debugctlmsr)
979 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
980
981 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
982#endif /* CONFIG_X86_PTRACE_BTS */
983} 848}
984 849
985#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1111,7 +976,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1111 break; 976 break;
1112 977
1113 case PTRACE_BTS_SIZE: 978 case PTRACE_BTS_SIZE:
1114 ret = ds_get_bts_index(child, /* pos = */ NULL); 979 ret = ptrace_bts_size(child);
1115 break; 980 break;
1116 981
1117 case PTRACE_BTS_GET: 982 case PTRACE_BTS_GET:
@@ -1120,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1120 break; 985 break;
1121 986
1122 case PTRACE_BTS_CLEAR: 987 case PTRACE_BTS_CLEAR:
1123 ret = ds_clear_bts(child); 988 ret = ptrace_bts_clear(child);
1124 break; 989 break;
1125 990
1126 case PTRACE_BTS_DRAIN: 991 case PTRACE_BTS_DRAIN:
@@ -1383,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1383 1248
1384 case PTRACE_GET_THREAD_AREA: 1249 case PTRACE_GET_THREAD_AREA:
1385 case PTRACE_SET_THREAD_AREA: 1250 case PTRACE_SET_THREAD_AREA:
1251#ifdef CONFIG_X86_PTRACE_BTS
1252 case PTRACE_BTS_CONFIG:
1253 case PTRACE_BTS_STATUS:
1254 case PTRACE_BTS_SIZE:
1255 case PTRACE_BTS_GET:
1256 case PTRACE_BTS_CLEAR:
1257 case PTRACE_BTS_DRAIN:
1258#endif /* CONFIG_X86_PTRACE_BTS */
1386 return arch_ptrace(child, request, addr, data); 1259 return arch_ptrace(child, request, addr, data);
1387 1260
1388 default: 1261 default:
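The reworked BTS code no longer pokes at DS internals; everything goes through the opaque handle returned by ds_request_bts(): ds_read_bts() exposes the buffer bounds (ds.begin/ds.top/ds.end), the record size (ds.size) and a read() callback that decodes one raw record into a struct bts_struct. A hedged helper mirroring the iteration pattern used by ptrace_bts_drain() above:

/* illustrative only; follows the access pattern shown in the hunks above */
static int bts_for_each_record(struct task_struct *child,
			       int (*fn)(const struct bts_struct *bts, void *arg),
			       void *arg)
{
	const struct bts_trace *trace;
	const unsigned char *at;

	trace = ds_read_bts(child->bts);
	if (!trace || !trace->read)
		return -EPERM;

	for (at = trace->ds.begin; (void *)at < trace->ds.top;
	     at += trace->ds.size) {
		struct bts_struct bts;
		int error;

		error = trace->read(child->bts, at, &bts);
		if (error < 0)
			return error;

		error = fn(&bts, arg);
		if (error)
			return error;
	}

	return 0;
}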
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7a430c4d1551..f8500c969442 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -288,7 +288,7 @@ static int __cpuinitdata unsafe_smp;
288/* 288/*
289 * Activate a secondary processor. 289 * Activate a secondary processor.
290 */ 290 */
291static void __cpuinit start_secondary(void *unused) 291notrace static void __cpuinit start_secondary(void *unused)
292{ 292{
293 /* 293 /*
294 * Don't put *anything* before cpu_init(), SMP booting is too 294 * Don't put *anything* before cpu_init(), SMP booting is too
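start_secondary() is marked notrace so that the freshly booting CPU does not call into the tracer before it is fully set up. The annotation is the usual compiler attribute, assumed to be:

/* assumed definition (linux/compiler.h) */
#define notrace __attribute__((no_instrument_function))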
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..10786af95545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h>
9#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
10 11
11static void save_stack_warning(void *data, char *msg) 12static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
83 trace->entries[trace->nr_entries++] = ULONG_MAX; 84 trace->entries[trace->nr_entries++] = ULONG_MAX;
84} 85}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 86EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
87
88/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
89
90struct stack_frame {
91 const void __user *next_fp;
92 unsigned long ret_addr;
93};
94
95static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
96{
97 int ret;
98
99 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
100 return 0;
101
102 ret = 1;
103 pagefault_disable();
104 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
105 ret = 0;
106 pagefault_enable();
107
108 return ret;
109}
110
111static inline void __save_stack_trace_user(struct stack_trace *trace)
112{
113 const struct pt_regs *regs = task_pt_regs(current);
114 const void __user *fp = (const void __user *)regs->bp;
115
116 if (trace->nr_entries < trace->max_entries)
117 trace->entries[trace->nr_entries++] = regs->ip;
118
119 while (trace->nr_entries < trace->max_entries) {
120 struct stack_frame frame;
121
122 frame.next_fp = NULL;
123 frame.ret_addr = 0;
124 if (!copy_stack_frame(fp, &frame))
125 break;
126 if ((unsigned long)fp < regs->sp)
127 break;
128 if (frame.ret_addr) {
129 trace->entries[trace->nr_entries++] =
130 frame.ret_addr;
131 }
132 if (fp == frame.next_fp)
133 break;
134 fp = frame.next_fp;
135 }
136}
137
138void save_stack_trace_user(struct stack_trace *trace)
139{
140 /*
141 * Trace user stack if we are not a kernel thread
142 */
143 if (current->mm) {
144 __save_stack_trace_user(trace);
145 }
146 if (trace->nr_entries < trace->max_entries)
147 trace->entries[trace->nr_entries++] = ULONG_MAX;
148}
149
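The new user-space walker starts from regs->bp, copies one struct stack_frame per iteration with page faults disabled, and records one return address per frame until the buffer fills or the frame-pointer chain ends. A minimal usage sketch of the exported entry point (field names follow the generic struct stack_trace from <linux/stacktrace.h>):

static void example_capture_user_backtrace(void)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= ARRAY_SIZE(entries),
		.entries	= entries,
		.skip		= 0,
	};

	save_stack_trace_user(&trace);
	/*
	 * entries[0..trace.nr_entries-1] now hold regs->ip followed by the
	 * user return addresses, terminated with ULONG_MAX when room remains.
	 */
}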
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc2..82c67559dde7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
44 SCHED_TEXT 44 SCHED_TEXT
45 LOCK_TEXT 45 LOCK_TEXT
46 KPROBES_TEXT 46 KPROBES_TEXT
47 IRQENTRY_TEXT
47 *(.fixup) 48 *(.fixup)
48 *(.gnu.warning) 49 *(.gnu.warning)
49 _etext = .; /* End of text section */ 50 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405b..1a614c0e6bef 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -35,6 +35,7 @@ SECTIONS
35 SCHED_TEXT 35 SCHED_TEXT
36 LOCK_TEXT 36 LOCK_TEXT
37 KPROBES_TEXT 37 KPROBES_TEXT
38 IRQENTRY_TEXT
38 *(.fixup) 39 *(.fixup)
39 *(.gnu.warning) 40 *(.gnu.warning)
40 _etext = .; /* End of text section */ 41 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ebf2f12900f5..44153afc9067 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 17 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
18 */ 18 */
19 19
20/* Disable profiling for userspace code: */
21#define DISABLE_BRANCH_PROFILING
22
20#include <linux/time.h> 23#include <linux/time.h>
21#include <linux/init.h> 24#include <linux/init.h>
22#include <linux/kernel.h> 25#include <linux/kernel.h>