author	Ingo Molnar <mingo@elte.hu>	2006-12-06 23:37:24 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-07 11:39:36 -0500
commit	ece8a684c75df215320b4155944979e3f78c5c93 (patch)
tree	2a65e786dd70ab368c12836a39c36070c0e4974f
parent	70888bd5b70579e278d5ef1b7e1ec6a420d38b9e (diff)
[PATCH] sleep profiling
Implement prof=sleep profiling. TASK_UNINTERRUPTIBLE sleeps will be
taken as a profile hit, and every millisecond spent sleeping causes
a profile-hit for the call site that initiated the sleep.

Sample readprofile output on i386:

   306 ps2_sendbyte                   1.3973
   432 call_usermodehelper_keys       1.9548
   484 ps2_command                    0.6453
   790 __driver_attach                4.7879
  1593 msleep                        44.2500
  3976 sync_buffer                   64.1290
  4076 do_lookup                     12.4648
  8587 sync_page                    122.6714
 20820 total                          0.0067

(NOTE: architectures need to check whether get_wchan() can be called
from deep within the wakeup path.)

akpm: we need to mark more functions __sched.  lock_sock(), msleep(),
others..

akpm: the contention in do_lookup() is a surprise.  Presumably doing disk
reads for directory contents while holding i_mutex.

[akpm@osdl.org: various fixes]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--	Documentation/kernel-parameters.txt	1
-rw-r--r--	include/linux/profile.h	24
-rw-r--r--	kernel/profile.c	39
-rw-r--r--	kernel/sched.c	11
4 files changed, 65 insertions(+), 10 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8fe6b834ef27..2a40d9f6ffad 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1294,6 +1294,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			Param: "schedule" - profile schedule points.
 			Param: <number> - step/bucket size as a power of 2 for
 				statistical time based profiling.
+			Param: "sleep" - profile D-state sleeping (millisecs)
 
 	processor.max_cstate=	[HW,ACPI]
 			Limit processor to maximum C-state
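
(Not part of the patch: a minimal sketch of what the "<number>" step/bucket
parameter documented above controls. Text addresses are grouped into buckets
of 2^prof_shift bytes before being counted; the uniprocessor path in
kernel/profile.c below performs exactly this shift. The function name here
is invented for illustration.)

/* Illustrative only: map a profiled text address to a histogram bucket.
 * stext marks the start of kernel text; prof_shift is the power-of-2
 * bucket size from the profile= boot parameter. */
static unsigned long profile_index(unsigned long addr,
				   unsigned long stext,
				   unsigned long prof_shift)
{
	return (addr - stext) >> prof_shift;
}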
diff --git a/include/linux/profile.h b/include/linux/profile.h
index acce53fd38b6..5670b340c4ef 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -6,10 +6,15 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
+#include <linux/cache.h>
+
 #include <asm/errno.h>
 
+extern int prof_on __read_mostly;
+
 #define CPU_PROFILING	1
 #define SCHED_PROFILING	2
+#define SLEEP_PROFILING	3
 
 struct proc_dir_entry;
 struct pt_regs;
@@ -18,7 +23,24 @@ struct notifier_block;
 /* init basic kernel profiler */
 void __init profile_init(void);
 void profile_tick(int);
-void profile_hit(int, void *);
+
+/*
+ * Add multiple profiler hits to a given address:
+ */
+void profile_hits(int, void *ip, unsigned int nr_hits);
+
+/*
+ * Single profiler hit:
+ */
+static inline void profile_hit(int type, void *ip)
+{
+	/*
+	 * Speedup for the common (no profiling enabled) case:
+	 */
+	if (unlikely(prof_on == type))
+		profile_hits(type, ip, 1);
+}
+
 #ifdef CONFIG_PROC_FS
 void create_prof_cpu_mask(struct proc_dir_entry *);
 #else
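
(A hedged sketch of how the split API reads at a call site; these callers are
hypothetical, not part of the patch. profile_hit() compiles down to a single
compare against the __read_mostly prof_on flag when profiling is disabled,
while batch producers such as the sleep profiler call profile_hits()
directly.)

#include <linux/profile.h>

/* Hypothetical call sites, for illustration only. */
static void account_tick(void *pc)
{
	/* inline fast path: a single compare when profiling is off */
	profile_hit(CPU_PROFILING, pc);
}

static void account_sleep(void *wchan, unsigned long long slept_ns)
{
	/* batch path: one hit per ~millisecond asleep (see kernel/sched.c) */
	profile_hits(SLEEP_PROFILING, wchan, slept_ns >> 20);
}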
diff --git a/kernel/profile.c b/kernel/profile.c
index 15b012df4ff1..04fd84e8cdbe 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
 
 static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
-static int prof_on __read_mostly;
+int prof_on __read_mostly;
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
 static int __init profile_setup(char * str)
 {
 	static char __initdata schedstr[] = "schedule";
+	static char __initdata sleepstr[] = "sleep";
 	int par;
 
-	if (!strncmp(str, schedstr, strlen(schedstr))) {
+	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+		prof_on = SLEEP_PROFILING;
+		if (str[strlen(sleepstr)] == ',')
+			str += strlen(sleepstr) + 1;
+		if (get_option(&str, &par))
+			prof_shift = par;
+		printk(KERN_INFO
+			"kernel sleep profiling enabled (shift: %ld)\n",
+			prof_shift);
+	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
 		prof_on = SCHED_PROFILING;
 		if (str[strlen(schedstr)] == ',')
 			str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
  * positions to which hits are accounted during short intervals (e.g.
  * several seconds) is usually very small. Exclusion from buffer
  * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING profile_hit() may be called from process context).
+ * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
+ * process context).
  * The hash function is meant to be lightweight as opposed to strong,
  * and was vaguely inspired by ppc64 firmware-supported inverted
  * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
 	mutex_unlock(&profile_flip_mutex);
 }
 
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
 	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
 	int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
 		put_cpu();
 		return;
 	}
+	/*
+	 * We buffer the global profiler buffer into a per-CPU
+	 * queue and thus reduce the number of global (and possibly
+	 * NUMA-alien) accesses. The write-queue is self-coalescing:
+	 */
 	local_irq_save(flags);
 	do {
 		for (j = 0; j < PROFILE_GRPSZ; ++j) {
 			if (hits[i + j].pc == pc) {
-				hits[i + j].hits++;
+				hits[i + j].hits += nr_hits;
 				goto out;
 			} else if (!hits[i + j].hits) {
 				hits[i + j].pc = pc;
-				hits[i + j].hits = 1;
+				hits[i + j].hits = nr_hits;
 				goto out;
 			}
 		}
 		i = (i + secondary) & (NR_PROFILE_HIT - 1);
 	} while (i != primary);
-	atomic_inc(&prof_buffer[pc]);
+
+	/*
+	 * Add the current hit(s) and flush the write-queue out
+	 * to the global buffer:
+	 */
+	atomic_add(nr_hits, &prof_buffer[pc]);
 	for (i = 0; i < NR_PROFILE_HIT; ++i) {
 		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
 		hits[i].pc = hits[i].hits = 0;
@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
 
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
 	unsigned long pc;
 
 	if (prof_on != type || !prof_buffer)
 		return;
 	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
-	atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
+	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 }
 #endif /* !CONFIG_SMP */
 
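(The self-coalescing write-queue above is worth seeing in isolation. Below is
a small userspace miniature of the idea, assuming a tiny fixed-size table and
linear probing instead of the kernel's two-hash probe sequence; names and
constants are invented for illustration and do not match the kernel's.)

#include <stdio.h>

#define NR_SLOTS 16			/* assumed small power of two */
#define NR_BUCKETS 1024			/* assumed global histogram size */

struct hit { unsigned long pc; unsigned int hits; };

static struct hit queue[NR_SLOTS];	/* a per-CPU queue in the kernel */
static unsigned int global_buf[NR_BUCKETS];

static void queue_hit(unsigned long pc, unsigned int nr_hits)
{
	unsigned int i = pc & (NR_SLOTS - 1), start = i;

	do {
		if (queue[i].pc == pc) {	/* coalesce repeat hits */
			queue[i].hits += nr_hits;
			return;
		}
		if (!queue[i].hits) {		/* claim an empty slot */
			queue[i].pc = pc;
			queue[i].hits = nr_hits;
			return;
		}
		i = (i + 1) & (NR_SLOTS - 1);	/* linear probe */
	} while (i != start);

	/* Queue full: account the new hit(s) directly, then drain the
	 * queue to the global buffer in one pass, as the patch does. */
	global_buf[pc & (NR_BUCKETS - 1)] += nr_hits;
	for (i = 0; i < NR_SLOTS; i++) {
		global_buf[queue[i].pc & (NR_BUCKETS - 1)] += queue[i].hits;
		queue[i].pc = queue[i].hits = 0;
	}
}

int main(void)
{
	queue_hit(0x1234, 1);
	queue_hit(0x1234, 3);	/* coalesces with the first hit */
	printf("queued hits for 0x1234: %u\n",
	       queue[0x1234 & (NR_SLOTS - 1)].hits);
	return 0;
}
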
diff --git a/kernel/sched.c b/kernel/sched.c
index 343e1794233e..75a005ed4eda 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 	}
 #endif
 
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (now - p->timestamp) >> 20);
+	}
+
 	if (!rt_task(p))
 		p->prio = recalc_task_prio(p, now);
 
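
(Finally, a worked example of the ">> 20" scaling in the hunk above; this is
a standalone sketch, not kernel code. 2^20 = 1,048,576 is close to 10^6, so
shifting a nanosecond delta right by 20 bits yields an approximate
millisecond count, about 4.6% low, without a 64-bit division in the wakeup
path.)

#include <stdio.h>

int main(void)
{
	unsigned long long slept_ns = 250ULL * 1000 * 1000; /* 250 ms asleep */
	unsigned long long hits = slept_ns >> 20;	/* ns / 1048576 */

	/* prints: 250000000 ns asleep -> 238 hits (exact ms: 250) */
	printf("%llu ns asleep -> %llu hits (exact ms: %llu)\n",
	       slept_ns, hits, slept_ns / 1000000);
	return 0;
}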