diff options
author | Ingo Molnar <mingo@elte.hu> | 2006-12-06 23:37:24 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.osdl.org> | 2006-12-07 11:39:36 -0500 |
commit | ece8a684c75df215320b4155944979e3f78c5c93 (patch) | |
tree | 2a65e786dd70ab368c12836a39c36070c0e4974f | |
parent | 70888bd5b70579e278d5ef1b7e1ec6a420d38b9e (diff) |
[PATCH] sleep profiling
Implement prof=sleep profiling. TASK_UNINTERRUPTIBLE sleeps will be taken
as a profile hit, and every millisecond spent sleeping causes a profile-hit
for the call site that initiated the sleep.
Sample readprofile output on i386:
306 ps2_sendbyte 1.3973
432 call_usermodehelper_keys 1.9548
484 ps2_command 0.6453
790 __driver_attach 4.7879
1593 msleep 44.2500
3976 sync_buffer 64.1290
4076 do_lookup 12.4648
8587 sync_page 122.6714
20820 total 0.0067
(NOTE: architectures need to check whether get_wchan() can be called from
deep within the wakeup path.)
akpm: we need to mark more functions __sched. lock_sock(), msleep(), others..
akpm: the contention in do_lookup() is a surprise. Presumably doing disk
reads for directory contents while holding i_mutex.
[akpm@osdl.org: various fixes]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | Documentation/kernel-parameters.txt | 1 | ||||
-rw-r--r-- | include/linux/profile.h | 24 | ||||
-rw-r--r-- | kernel/profile.c | 39 | ||||
-rw-r--r-- | kernel/sched.c | 11 |
4 files changed, 65 insertions, 10 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8fe6b834ef27..2a40d9f6ffad 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1294,6 +1294,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1294 | Param: "schedule" - profile schedule points. | 1294 | Param: "schedule" - profile schedule points. |
1295 | Param: <number> - step/bucket size as a power of 2 for | 1295 | Param: <number> - step/bucket size as a power of 2 for |
1296 | statistical time based profiling. | 1296 | statistical time based profiling. |
1297 | Param: "sleep" - profile D-state sleeping (millisecs) | ||
1297 | 1298 | ||
1298 | processor.max_cstate= [HW,ACPI] | 1299 | processor.max_cstate= [HW,ACPI] |
1299 | Limit processor to maximum C-state | 1300 | Limit processor to maximum C-state |
diff --git a/include/linux/profile.h b/include/linux/profile.h index acce53fd38b6..5670b340c4ef 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h | |||
@@ -6,10 +6,15 @@ | |||
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/init.h> | 7 | #include <linux/init.h> |
8 | #include <linux/cpumask.h> | 8 | #include <linux/cpumask.h> |
9 | #include <linux/cache.h> | ||
10 | |||
9 | #include <asm/errno.h> | 11 | #include <asm/errno.h> |
10 | 12 | ||
13 | extern int prof_on __read_mostly; | ||
14 | |||
11 | #define CPU_PROFILING 1 | 15 | #define CPU_PROFILING 1 |
12 | #define SCHED_PROFILING 2 | 16 | #define SCHED_PROFILING 2 |
17 | #define SLEEP_PROFILING 3 | ||
13 | 18 | ||
14 | struct proc_dir_entry; | 19 | struct proc_dir_entry; |
15 | struct pt_regs; | 20 | struct pt_regs; |
@@ -18,7 +23,24 @@ struct notifier_block; | |||
18 | /* init basic kernel profiler */ | 23 | /* init basic kernel profiler */ |
19 | void __init profile_init(void); | 24 | void __init profile_init(void); |
20 | void profile_tick(int); | 25 | void profile_tick(int); |
21 | void profile_hit(int, void *); | 26 | |
27 | /* | ||
28 | * Add multiple profiler hits to a given address: | ||
29 | */ | ||
30 | void profile_hits(int, void *ip, unsigned int nr_hits); | ||
31 | |||
32 | /* | ||
33 | * Single profiler hit: | ||
34 | */ | ||
35 | static inline void profile_hit(int type, void *ip) | ||
36 | { | ||
37 | /* | ||
38 | * Speedup for the common (no profiling enabled) case: | ||
39 | */ | ||
40 | if (unlikely(prof_on == type)) | ||
41 | profile_hits(type, ip, 1); | ||
42 | } | ||
43 | |||
22 | #ifdef CONFIG_PROC_FS | 44 | #ifdef CONFIG_PROC_FS |
23 | void create_prof_cpu_mask(struct proc_dir_entry *); | 45 | void create_prof_cpu_mask(struct proc_dir_entry *); |
24 | #else | 46 | #else |
diff --git a/kernel/profile.c b/kernel/profile.c index 15b012df4ff1..04fd84e8cdbe 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; | |||
40 | 40 | ||
41 | static atomic_t *prof_buffer; | 41 | static atomic_t *prof_buffer; |
42 | static unsigned long prof_len, prof_shift; | 42 | static unsigned long prof_len, prof_shift; |
43 | static int prof_on __read_mostly; | 43 | int prof_on __read_mostly; |
44 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | 44 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; |
45 | #ifdef CONFIG_SMP | 45 | #ifdef CONFIG_SMP |
46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | 46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); |
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex); | |||
51 | static int __init profile_setup(char * str) | 51 | static int __init profile_setup(char * str) |
52 | { | 52 | { |
53 | static char __initdata schedstr[] = "schedule"; | 53 | static char __initdata schedstr[] = "schedule"; |
54 | static char __initdata sleepstr[] = "sleep"; | ||
54 | int par; | 55 | int par; |
55 | 56 | ||
56 | if (!strncmp(str, schedstr, strlen(schedstr))) { | 57 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
58 | prof_on = SLEEP_PROFILING; | ||
59 | if (str[strlen(sleepstr)] == ',') | ||
60 | str += strlen(sleepstr) + 1; | ||
61 | if (get_option(&str, &par)) | ||
62 | prof_shift = par; | ||
63 | printk(KERN_INFO | ||
64 | "kernel sleep profiling enabled (shift: %ld)\n", | ||
65 | prof_shift); | ||
66 | } else if (!strncmp(str, sleepstr, strlen(sleepstr))) { | ||
57 | prof_on = SCHED_PROFILING; | 67 | prof_on = SCHED_PROFILING; |
58 | if (str[strlen(schedstr)] == ',') | 68 | if (str[strlen(schedstr)] == ',') |
59 | str += strlen(schedstr) + 1; | 69 | str += strlen(schedstr) + 1; |
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister); | |||
204 | * positions to which hits are accounted during short intervals (e.g. | 214 | * positions to which hits are accounted during short intervals (e.g. |
205 | * several seconds) is usually very small. Exclusion from buffer | 215 | * several seconds) is usually very small. Exclusion from buffer |
206 | * flipping is provided by interrupt disablement (note that for | 216 | * flipping is provided by interrupt disablement (note that for |
207 | * SCHED_PROFILING profile_hit() may be called from process context). | 217 | * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from |
218 | * process context). | ||
208 | * The hash function is meant to be lightweight as opposed to strong, | 219 | * The hash function is meant to be lightweight as opposed to strong, |
209 | * and was vaguely inspired by ppc64 firmware-supported inverted | 220 | * and was vaguely inspired by ppc64 firmware-supported inverted |
210 | * pagetable hash functions, but uses a full hashtable full of finite | 221 | * pagetable hash functions, but uses a full hashtable full of finite |
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void) | |||
257 | mutex_unlock(&profile_flip_mutex); | 268 | mutex_unlock(&profile_flip_mutex); |
258 | } | 269 | } |
259 | 270 | ||
260 | void profile_hit(int type, void *__pc) | 271 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
261 | { | 272 | { |
262 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | 273 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; |
263 | int i, j, cpu; | 274 | int i, j, cpu; |
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc) | |||
274 | put_cpu(); | 285 | put_cpu(); |
275 | return; | 286 | return; |
276 | } | 287 | } |
288 | /* | ||
289 | * We buffer the global profiler buffer into a per-CPU | ||
290 | * queue and thus reduce the number of global (and possibly | ||
291 | * NUMA-alien) accesses. The write-queue is self-coalescing: | ||
292 | */ | ||
277 | local_irq_save(flags); | 293 | local_irq_save(flags); |
278 | do { | 294 | do { |
279 | for (j = 0; j < PROFILE_GRPSZ; ++j) { | 295 | for (j = 0; j < PROFILE_GRPSZ; ++j) { |
280 | if (hits[i + j].pc == pc) { | 296 | if (hits[i + j].pc == pc) { |
281 | hits[i + j].hits++; | 297 | hits[i + j].hits += nr_hits; |
282 | goto out; | 298 | goto out; |
283 | } else if (!hits[i + j].hits) { | 299 | } else if (!hits[i + j].hits) { |
284 | hits[i + j].pc = pc; | 300 | hits[i + j].pc = pc; |
285 | hits[i + j].hits = 1; | 301 | hits[i + j].hits = nr_hits; |
286 | goto out; | 302 | goto out; |
287 | } | 303 | } |
288 | } | 304 | } |
289 | i = (i + secondary) & (NR_PROFILE_HIT - 1); | 305 | i = (i + secondary) & (NR_PROFILE_HIT - 1); |
290 | } while (i != primary); | 306 | } while (i != primary); |
291 | atomic_inc(&prof_buffer[pc]); | 307 | |
308 | /* | ||
309 | * Add the current hit(s) and flush the write-queue out | ||
310 | * to the global buffer: | ||
311 | */ | ||
312 | atomic_add(nr_hits, &prof_buffer[pc]); | ||
292 | for (i = 0; i < NR_PROFILE_HIT; ++i) { | 313 | for (i = 0; i < NR_PROFILE_HIT; ++i) { |
293 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); | 314 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); |
294 | hits[i].pc = hits[i].hits = 0; | 315 | hits[i].pc = hits[i].hits = 0; |
@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
356 | #define profile_flip_buffers() do { } while (0) | 377 | #define profile_flip_buffers() do { } while (0) |
357 | #define profile_discard_flip_buffers() do { } while (0) | 378 | #define profile_discard_flip_buffers() do { } while (0) |
358 | 379 | ||
359 | void profile_hit(int type, void *__pc) | 380 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
360 | { | 381 | { |
361 | unsigned long pc; | 382 | unsigned long pc; |
362 | 383 | ||
363 | if (prof_on != type || !prof_buffer) | 384 | if (prof_on != type || !prof_buffer) |
364 | return; | 385 | return; |
365 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | 386 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; |
366 | atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); | 387 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
367 | } | 388 | } |
368 | #endif /* !CONFIG_SMP */ | 389 | #endif /* !CONFIG_SMP */ |
369 | 390 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 343e1794233e..75a005ed4eda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
948 | } | 948 | } |
949 | #endif | 949 | #endif |
950 | 950 | ||
951 | /* | ||
952 | * Sleep time is in units of nanosecs, so shift by 20 to get a | ||
953 | * milliseconds-range estimation of the amount of time that the task | ||
954 | * spent sleeping: | ||
955 | */ | ||
956 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
957 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
958 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | ||
959 | (now - p->timestamp) >> 20); | ||
960 | } | ||
961 | |||
951 | if (!rt_task(p)) | 962 | if (!rt_task(p)) |
952 | p->prio = recalc_task_prio(p, now); | 963 | p->prio = recalc_task_prio(p, now); |
953 | 964 | ||