[PATCH] sleep profiling

Implement prof=sleep profiling. TASK_UNINTERRUPTIBLE sleeps will be taken as a profile hit, and every millisecond spent sleeping causes a profile-hit for the call site that initiated the sleep. Sample readprofile output on i386: 306 ps2_sendbyte 1.3973 432 call_usermodehelper_keys 1.9548 484 ps2_command 0.6453 790 __driver_attach 4.7879 1593 msleep 44.2500 3976 sync_buffer 64.1290 4076 do_lookup 12.4648 8587 sync_page 122.6714 20820 total 0.0067 (NOTE: architectures need to check whether get_wchan() can be called from deep within the wakeup path.) akpm: we need to mark more functions __sched. lock_sock(), msleep(), others.. akpm: the contention in do_lookup() is a surprise. Presumably doing disk reads for directory contents while holding i_mutex. [akpm@osdl.org: various fixes] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Ingo Molnar <mingo@elte.hu> 2006-12-06 23:37:24 -0500
committer: Linus Torvalds <torvalds@woody.osdl.org> 2006-12-07 11:39:36 -0500
commit: ece8a684c75df215320b4155944979e3f78c5c93 (patch)
tree: 2a65e786dd70ab368c12836a39c36070c0e4974f /kernel
parent: 70888bd5b70579e278d5ef1b7e1ec6a420d38b9e (diff)
2 files changed, 41 insertions, 9 deletions
diff --git a/kernel/profile.c b/kernel/profile.c
index 15b012df4ff1..04fd84e8cdbe 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
 static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
-static int prof_on __read_mostly;
+int prof_on __read_mostly;
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
 static int __init profile_setup(char * str)
 {
        static char __initdata schedstr[] = "schedule";
+        static char __initdata sleepstr[] = "sleep";
        int par;
-        if (!strncmp(str, schedstr, strlen(schedstr))) {
+        if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+                prof_on = SLEEP_PROFILING;
+                if (str[strlen(sleepstr)] == ',')
+                        str += strlen(sleepstr) + 1;
+                if (get_option(&str, &par))
+                        prof_shift = par;
+                printk(KERN_INFO
+                        "kernel sleep profiling enabled (shift: %ld)\n",
+                        prof_shift);
+        } else if (!strncmp(str, sleepstr, strlen(sleepstr))) {
                prof_on = SCHED_PROFILING;
                if (str[strlen(schedstr)] == ',')
                        str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
 * positions to which hits are accounted during short intervals (e.g.
 * several seconds) is usually very small. Exclusion from buffer
 * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING profile_hit() may be called from process context).
+ * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
+ * process context).
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
        mutex_unlock(&profile_flip_mutex);
 }
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
        unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
        int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
                put_cpu();
                return;
        }
+        /*
+         * We buffer the global profiler buffer into a per-CPU
+         * queue and thus reduce the number of global (and possibly
+         * NUMA-alien) accesses. The write-queue is self-coalescing:
+         */
        local_irq_save(flags);
        do {
                for (j = 0; j < PROFILE_GRPSZ; ++j) {
                        if (hits[i + j].pc == pc) {
-                                hits[i + j].hits++;
+                                hits[i + j].hits += nr_hits;
                                goto out;
                        } else if (!hits[i + j].hits) {
                                hits[i + j].pc = pc;
-                                hits[i + j].hits = 1;
+                                hits[i + j].hits = nr_hits;
                                goto out;
                        }
                }
                i = (i + secondary) & (NR_PROFILE_HIT - 1);
        } while (i != primary);
-        atomic_inc(&prof_buffer[pc]);
+        /*
+         * Add the current hit(s) and flush the write-queue out
+         * to the global buffer:
+         */
+        atomic_add(nr_hits, &prof_buffer[pc]);
        for (i = 0; i < NR_PROFILE_HIT; ++i) {
                atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
                hits[i].pc = hits[i].hits = 0;
@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 #define profile_flip_buffers()          do { } while (0)
 #define profile_discard_flip_buffers()  do { } while (0)
-void profile_hit(int type, void *__pc)
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
        unsigned long pc;
        if (prof_on != type || !prof_buffer)
                return;
        pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
-        atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
+        atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 }
 #endif /* !CONFIG_SMP */
diff --git a/kernel/sched.c b/kernel/sched.c
index 343e1794233e..75a005ed4eda 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
        }
 #endif
+        /*
+         * Sleep time is in units of nanosecs, so shift by 20 to get a
+         * milliseconds-range estimation of the amount of time that the task
+         * spent sleeping:
+         */
+        if (unlikely(prof_on == SLEEP_PROFILING)) {
+                if (p->state == TASK_UNINTERRUPTIBLE)
+                        profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+                                     (now - p->timestamp) >> 20);
+        }
        if (!rt_task(p))
                p->prio = recalc_task_prio(p, now);
author	Ingo Molnar <mingo@elte.hu>	2006-12-06 23:37:24 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-07 11:39:36 -0500
commit	ece8a684c75df215320b4155944979e3f78c5c93 (patch)
tree	2a65e786dd70ab368c12836a39c36070c0e4974f /kernel
parent	70888bd5b70579e278d5ef1b7e1ec6a420d38b9e (diff)

diff --git a/kernel/profile.c b/kernel/profile.c index 15b012df4ff1..04fd84e8cdbe 100644 --- a/kernel/profile.c +++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (timer_hook)(struct pt_regs ) __read_mostly;
40		40
41	static atomic_t *prof_buffer;	41	static atomic_t *prof_buffer;
42	static unsigned long prof_len, prof_shift;	42	static unsigned long prof_len, prof_shift;
43	static int prof_on __read_mostly;	43	int prof_on __read_mostly;
44	static cpumask_t prof_cpu_mask = CPU_MASK_ALL;	44	static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45	#ifdef CONFIG_SMP	45	#ifdef CONFIG_SMP
46	static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);	46	static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
51	static int __init profile_setup(char * str)	51	static int __init profile_setup(char * str)
52	{	52	{
53	static char __initdata schedstr[] = "schedule";	53	static char __initdata schedstr[] = "schedule";
		54	static char __initdata sleepstr[] = "sleep";
54	int par;	55	int par;
55		56
56	if (!strncmp(str, schedstr, strlen(schedstr))) {	57	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
		58	prof_on = SLEEP_PROFILING;
		59	if (str[strlen(sleepstr)] == ',')
		60	str += strlen(sleepstr) + 1;
		61	if (get_option(&str, &par))
		62	prof_shift = par;
		63	printk(KERN_INFO
		64	"kernel sleep profiling enabled (shift: %ld)\n",
		65	prof_shift);
		66	} else if (!strncmp(str, sleepstr, strlen(sleepstr))) {
57	prof_on = SCHED_PROFILING;	67	prof_on = SCHED_PROFILING;
58	if (str[strlen(schedstr)] == ',')	68	if (str[strlen(schedstr)] == ',')
59	str += strlen(schedstr) + 1;	69	str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204	* positions to which hits are accounted during short intervals (e.g.	214	* positions to which hits are accounted during short intervals (e.g.
205	* several seconds) is usually very small. Exclusion from buffer	215	* several seconds) is usually very small. Exclusion from buffer
206	* flipping is provided by interrupt disablement (note that for	216	* flipping is provided by interrupt disablement (note that for
207	* SCHED_PROFILING profile_hit() may be called from process context).	217	* SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
		218	* process context).
208	* The hash function is meant to be lightweight as opposed to strong,	219	* The hash function is meant to be lightweight as opposed to strong,
209	* and was vaguely inspired by ppc64 firmware-supported inverted	220	* and was vaguely inspired by ppc64 firmware-supported inverted
210	* pagetable hash functions, but uses a full hashtable full of finite	221	* pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
257	mutex_unlock(&profile_flip_mutex);	268	mutex_unlock(&profile_flip_mutex);
258	}	269	}
259		270
260	void profile_hit(int type, void *__pc)	271	void profile_hits(int type, void *__pc, unsigned int nr_hits)
261	{	272	{
262	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;	273	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263	int i, j, cpu;	274	int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
274	put_cpu();	285	put_cpu();
275	return;	286	return;
276	}	287	}
		288	/*
		289	* We buffer the global profiler buffer into a per-CPU
		290	* queue and thus reduce the number of global (and possibly
		291	* NUMA-alien) accesses. The write-queue is self-coalescing:
		292	*/
277	local_irq_save(flags);	293	local_irq_save(flags);
278	do {	294	do {
279	for (j = 0; j < PROFILE_GRPSZ; ++j) {	295	for (j = 0; j < PROFILE_GRPSZ; ++j) {
280	if (hits[i + j].pc == pc) {	296	if (hits[i + j].pc == pc) {
281	hits[i + j].hits++;	297	hits[i + j].hits += nr_hits;
282	goto out;	298	goto out;
283	} else if (!hits[i + j].hits) {	299	} else if (!hits[i + j].hits) {
284	hits[i + j].pc = pc;	300	hits[i + j].pc = pc;
285	hits[i + j].hits = 1;	301	hits[i + j].hits = nr_hits;
286	goto out;	302	goto out;
287	}	303	}
288	}	304	}
289	i = (i + secondary) & (NR_PROFILE_HIT - 1);	305	i = (i + secondary) & (NR_PROFILE_HIT - 1);
290	} while (i != primary);	306	} while (i != primary);
291	atomic_inc(&prof_buffer[pc]);	307
		308	/*
		309	* Add the current hit(s) and flush the write-queue out
		310	* to the global buffer:
		311	*/
		312	atomic_add(nr_hits, &prof_buffer[pc]);
292	for (i = 0; i < NR_PROFILE_HIT; ++i) {	313	for (i = 0; i < NR_PROFILE_HIT; ++i) {
293	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);	314	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294	hits[i].pc = hits[i].hits = 0;	315	hits[i].pc = hits[i].hits = 0;
@@ -356,14 +377,14 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
356	#define profile_flip_buffers() do { } while (0)	377	#define profile_flip_buffers() do { } while (0)
357	#define profile_discard_flip_buffers() do { } while (0)	378	#define profile_discard_flip_buffers() do { } while (0)
358		379
359	void profile_hit(int type, void *__pc)	380	void profile_hits(int type, void *__pc, unsigned int nr_hits)
360	{	381	{
361	unsigned long pc;	382	unsigned long pc;
362		383
363	if (prof_on != type \|\| !prof_buffer)	384	if (prof_on != type \|\| !prof_buffer)
364	return;	385	return;
365	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;	386	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366	atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);	387	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367	}	388	}
368	#endif /* !CONFIG_SMP */	389	#endif /* !CONFIG_SMP */
369		390


diff --git a/kernel/sched.c b/kernel/sched.c index 343e1794233e..75a005ed4eda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct p, struct rq rq, int local)
948	}	948	}
949	#endif	949	#endif
950		950
		951	/*
		952	* Sleep time is in units of nanosecs, so shift by 20 to get a
		953	* milliseconds-range estimation of the amount of time that the task
		954	* spent sleeping:
		955	*/
		956	if (unlikely(prof_on == SLEEP_PROFILING)) {
		957	if (p->state == TASK_UNINTERRUPTIBLE)
		958	profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
		959	(now - p->timestamp) >> 20);
		960	}
		961
951	if (!rt_task(p))	962	if (!rt_task(p))
952	p->prio = recalc_task_prio(p, now);	963	p->prio = recalc_task_prio(p, now);
953		964