diff options
| author | Ingo Molnar <mingo@elte.hu> | 2006-12-06 23:37:24 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@woody.osdl.org> | 2006-12-07 11:39:36 -0500 |
| commit | ece8a684c75df215320b4155944979e3f78c5c93 (patch) | |
| tree | 2a65e786dd70ab368c12836a39c36070c0e4974f | |
| parent | 70888bd5b70579e278d5ef1b7e1ec6a420d38b9e (diff) | |
[PATCH] sleep profiling
Implement prof=sleep profiling. TASK_UNINTERRUPTIBLE sleeps will be taken
as a profile hit, and every millisecond spent sleeping causes a profile-hit
for the call site that initiated the sleep.
Sample readprofile output on i386:
306 ps2_sendbyte 1.3973
432 call_usermodehelper_keys 1.9548
484 ps2_command 0.6453
790 __driver_attach 4.7879
1593 msleep 44.2500
3976 sync_buffer 64.1290
4076 do_lookup 12.4648
8587 sync_page 122.6714
20820 total 0.0067
(NOTE: architectures need to check whether get_wchan() can be called from
deep within the wakeup path.)
akpm: we need to mark more functions __sched. lock_sock(), msleep(), others..
akpm: the contention in do_lookup() is a surprise. Presumably doing disk
reads for directory contents while holding i_mutex.
[akpm@osdl.org: various fixes]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
| -rw-r--r-- | Documentation/kernel-parameters.txt | 1 | ||||
| -rw-r--r-- | include/linux/profile.h | 24 | ||||
| -rw-r--r-- | kernel/profile.c | 39 | ||||
| -rw-r--r-- | kernel/sched.c | 11 |
4 files changed, 65 insertions, 10 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8fe6b834ef27..2a40d9f6ffad 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
| @@ -1294,6 +1294,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
| 1294 | Param: "schedule" - profile schedule points. | 1294 | Param: "schedule" - profile schedule points. |
| 1295 | Param: <number> - step/bucket size as a power of 2 for | 1295 | Param: <number> - step/bucket size as a power of 2 for |
| 1296 | statistical time based profiling. | 1296 | statistical time based profiling. |
| 1297 | Param: "sleep" - profile D-state sleeping (millisecs) | ||
| 1297 | 1298 | ||
| 1298 | processor.max_cstate= [HW,ACPI] | 1299 | processor.max_cstate= [HW,ACPI] |
| 1299 | Limit processor to maximum C-state | 1300 | Limit processor to maximum C-state |
diff --git a/include/linux/profile.h b/include/linux/profile.h index acce53fd38b6..5670b340c4ef 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h | |||
| @@ -6,10 +6,15 @@ | |||
| 6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
| 7 | #include <linux/init.h> | 7 | #include <linux/init.h> |
| 8 | #include <linux/cpumask.h> | 8 | #include <linux/cpumask.h> |
| 9 | #include <linux/cache.h> | ||
| 10 | |||
| 9 | #include <asm/errno.h> | 11 | #include <asm/errno.h> |
| 10 | 12 | ||
| 13 | extern int prof_on __read_mostly; | ||
| 14 | |||
| 11 | #define CPU_PROFILING 1 | 15 | #define CPU_PROFILING 1 |
| 12 | #define SCHED_PROFILING 2 | 16 | #define SCHED_PROFILING 2 |
| 17 | #define SLEEP_PROFILING 3 | ||
| 13 | 18 | ||
| 14 | struct proc_dir_entry; | 19 | struct proc_dir_entry; |
| 15 | struct pt_regs; | 20 | struct pt_regs; |
| @@ -18,7 +23,24 @@ struct notifier_block; | |||
| 18 | /* init basic kernel profiler */ | 23 | /* init basic kernel profiler */ |
| 19 | void __init profile_init(void); | 24 | void __init profile_init(void); |
| 20 | void profile_tick(int); | 25 | void profile_tick(int); |
| 21 | void profile_hit(int, void *); | 26 | |
| 27 | /* | ||
| 28 | * Add multiple profiler hits to a given address: | ||
| 29 | */ | ||
| 30 | void profile_hits(int, void *ip, unsigned int nr_hits); | ||
| 31 | |||
| 32 | /* | ||
| 33 | * Single profiler hit: | ||
| 34 | */ | ||
| 35 | static inline void profile_hit(int type, void *ip) | ||
| 36 | { | ||
| 37 | /* | ||
| 38 | * Speedup for the common (no profiling enabled) case: | ||
| 39 | */ | ||
| 40 | if (unlikely(prof_on == type)) | ||
| 41 | profile_hits(type, ip, 1); | ||
| 42 | } | ||
| 43 | |||
| 22 | #ifdef CONFIG_PROC_FS | 44 | #ifdef CONFIG_PROC_FS |
| 23 | void create_prof_cpu_mask(struct proc_dir_entry *); | 45 | void create_prof_cpu_mask(struct proc_dir_entry *); |
| 24 | #else | 46 | #else |
diff --git a/kernel/profile.c b/kernel/profile.c index 15b012df4ff1..04fd84e8cdbe 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; | |||
| 40 | 40 | ||
| 41 | static atomic_t *prof_buffer; | 41 | static atomic_t *prof_buffer; |
| 42 | static unsigned long prof_len, prof_shift; | 42 | static unsigned long prof_len, prof_shift; |
| 43 | static int prof_on __read_mostly; | 43 | int prof_on __read_mostly; |
| 44 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | 44 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; |
| 45 | #ifdef CONFIG_SMP | 45 | #ifdef CONFIG_SMP |
| 46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | 46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); |
| @@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex); | |||
| 51 | static int __init profile_setup(char * str) | 51 | static int __init profile_setup(char * str) |
| 52 | { | 52 | { |
| 53 | static char __initdata schedstr[] = "schedule"; | 53 | static char __initdata schedstr[] = "schedule"; |
| 54 | static char __initdata sleepstr[] = "sleep"; | ||
| 54 | int par; | 55 | int par; |
| 55 | 56 | ||
| 56 | if (!strncmp(str, schedstr, strlen(schedstr))) { | 57 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
| 58 | prof_on = SLEEP_PROFILING; | ||
| 59 | if (str[strlen(sleepstr)] == ',') | ||
| 60 | str += strlen(sleepstr) + 1; | ||
| 61 | if (get_option(&str, &par)) | ||
| 62 | prof_shift = par; | ||
| 63 | printk(KERN_INFO | ||
| 64 | "kernel sleep profiling enabled (shift: %ld)\n", | ||
| 65 | prof_shift); | ||
| 66 | } else if (!strncmp(str, schedstr, strlen(schedstr))) { | ||
| 57 | prof_on = SCHED_PROFILING; | 67 | prof_on = SCHED_PROFILING; |
| 58 | if (str[strlen(schedstr)] == ',') | 68 | if (str[strlen(schedstr)] == ',') |
| 59 | str += strlen(schedstr) + 1; | 69 | str += strlen(schedstr) + 1; |
| @@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister); | |||
| 204 | * positions to which hits are accounted during short intervals (e.g. | 214 | * positions to which hits are accounted during short intervals (e.g. |
| 205 | * several seconds) is usually very small. Exclusion from buffer | 215 | * several seconds) is usually very small. Exclusion from buffer |
| 206 | * flipping is provided by interrupt disablement (note that for | 216 | * flipping is provided by interrupt disablement (note that for |
| 207 | * SCHED_PROFILING profile_hit() may be called from process context). | 217 | * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from |
| 218 | * process context). | ||
| 208 | * The hash function is meant to be lightweight as opposed to strong, | 219 | * The hash function is meant to be lightweight as opposed to strong, |
| 209 | * and was vaguely inspired by ppc64 firmware-supported inverted | 220 | * and was vaguely inspired by ppc64 firmware-supported inverted |
| 210 | * pagetable hash functions, but uses a full hashtable full of finite | 221 | * pagetable hash functions, but uses a full hashtable full of finite |
| @@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void) | |||
| 257 | mutex_unlock(&profile_flip_mutex); | 268 | mutex_unlock(&profile_flip_mutex); |
| 258 | } | 269 | } |
| 259 | 270 | ||
| 260 | void profile_hit(int type, void *__pc) | 271 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
| 261 | { | 272 | { |
| 262 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | 273 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; |
| 263 | int i, j, cpu; | 274 | int i, j, cpu; |
| @@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc) | |||
| 274 | put_cpu(); | 285 | put_cpu(); |
| 275 | return; | 286 | return; |
| 276 | } | 287 | } |
| 288 | /* | ||
| 289 | * We buffer the global profiler buffer into a per-CPU | ||
| 290 | * queue and thus reduce the number of global (and possibly | ||
| 291 | * NUMA-alien) accesses. The write-queue is self-coalescing: | ||
| 292 | */ | ||
| 277 | local_irq_save(flags); | 293 | local_irq_save(flags); |
| 278 | do { | 294 | do { |
| 279 | for (j = 0; j < PROFILE_GRPSZ; ++j) { | 295 | for (j = 0; j < PROFILE_GRPSZ; ++j) { |
| 280 | if (hits[i + j].pc == pc) { | 296 | if (hits[i + j].pc == pc) { |
| 281 | hits[i + j].hits++; | 297 | hits[i + j].hits += nr_hits; |
| 282 | goto out; | 298 | goto out; |
| 283 | } else if (!hits[i + j].hits) { | 299 | } else if (!hits[i + j].hits) { |
| 284 | hits[i + j].pc = pc; | 300 | hits[i + j].pc = pc; |
| 285 | hits[i + j].hits = 1; | 301 | hits[i + j].hits = nr_hits; |
| 286 | goto out; | 302 | goto out; |
| 287 | } | 303 | } |
| 288 | } | 304 | } |
| 289 | i = (i + secondary) & (NR_PROFILE_HIT - 1); | 305 | i = (i + secondary) & (NR_PROFILE_HIT - 1); |
| 290 | } while (i != primary); | 306 | } while (i != primary); |
| 291 | atomic_inc(&prof_buffer[pc]); | 307 | |
| 308 | /* | ||
| 309 | * Add the current hit(s) and flush the write-queue out | ||
| 310 | * to the global buffer: | ||
| 311 | */ | ||
| 312 | atomic_add(nr_hits, &prof_buffer[pc]); | ||
| 292 | for (i = 0; i < NR_PROFILE_HIT; ++i) { | 313 | for (i = 0; i < NR_PROFILE_HIT; ++i) { |
| 293 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); | 314 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); |
| 294 | hits[i].pc = hits[i].hits = 0; | 315 | hits[i].pc = hits[i].hits = 0; |
| @@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
| 356 | #define profile_flip_buffers() do { } while (0) | 377 | #define profile_flip_buffers() do { } while (0) |
| 357 | #define profile_discard_flip_buffers() do { } while (0) | 378 | #define profile_discard_flip_buffers() do { } while (0) |
| 358 | 379 | ||
| 359 | void profile_hit(int type, void *__pc) | 380 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
| 360 | { | 381 | { |
| 361 | unsigned long pc; | 382 | unsigned long pc; |
| 362 | 383 | ||
| 363 | if (prof_on != type || !prof_buffer) | 384 | if (prof_on != type || !prof_buffer) |
| 364 | return; | 385 | return; |
| 365 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | 386 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; |
| 366 | atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); | 387 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
| 367 | } | 388 | } |
| 368 | #endif /* !CONFIG_SMP */ | 389 | #endif /* !CONFIG_SMP */ |
| 369 | 390 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 343e1794233e..75a005ed4eda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
| 948 | } | 948 | } |
| 949 | #endif | 949 | #endif |
| 950 | 950 | ||
| 951 | /* | ||
| 952 | * Sleep time is in units of nanosecs, so shift by 20 to get a | ||
| 953 | * milliseconds-range estimation of the amount of time that the task | ||
| 954 | * spent sleeping: | ||
| 955 | */ | ||
| 956 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
| 957 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
| 958 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | ||
| 959 | (now - p->timestamp) >> 20); | ||
| 960 | } | ||
| 961 | |||
| 951 | if (!rt_task(p)) | 962 | if (!rt_task(p)) |
| 952 | p->prio = recalc_task_prio(p, now); | 963 | p->prio = recalc_task_prio(p, now); |
| 953 | 964 | ||
