author     Nathan Zimmer <nzimmer@sgi.com>   2013-02-21 18:15:09 -0500
committer  Ingo Molnar <mingo@kernel.org>    2013-02-22 04:27:25 -0500
commit     bbbfeac92beff40eb86c7f682a7f1395f9f0ae52
tree       80a9f7d32d7a9a7898349c6e6ca88a1a7c1edf13 /kernel/sched
parent     cb152ff26717961b10d0888cd983ba284cb99cd1
sched: Fix /proc/sched_debug failure on very very large systems
On systems with 4096 cores, attempting to read /proc/sched_debug
fails because we are trying to push all the data into a single
kmalloc buffer.
The issue is that on these very large machines all the data will
not fit in 4 MB.
A better solution is to not use the single_open() mechanism but
to provide our own seq_operations and treat each cpu as an
individual record.
The output should be identical to the previous version.
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
[ Whitespace fixlet ]
[ Fix spello in comment ]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/debug.c | 90 +++++++++++++++++++++++++++++++-------
 1 file changed, 79 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c5420e..c496eb3c6459 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -269,11 +269,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	{
 		unsigned int freq = cpu_khz ? : 1;
 
-		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+		SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
 			   cpu, freq / 1000, (freq % 1000));
 	}
 #else
-	SEQ_printf(m, "\ncpu#%d\n", cpu);
+	SEQ_printf(m, "cpu#%d\n", cpu);
 #endif
 
 #define P(x)							\
@@ -330,6 +330,7 @@ do {							\
 	print_rq(m, rq, cpu);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&sched_debug_lock, flags);
+	SEQ_printf(m, "\n");
 }
 
 static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +339,10 @@ static const char *sched_tunable_scaling_names[] = {
 	"linear"
 };
 
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
 {
 	u64 ktime, sched_clk, cpu_clk;
 	unsigned long flags;
-	int cpu;
 
 	local_irq_save(flags);
 	ktime = ktime_to_ns(ktime_get());
@@ -384,33 +384,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
 
-	SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+	SEQ_printf(m, "  .%-40s: %d (%s)\n",
+		"sysctl_sched_tunable_scaling",
 		sysctl_sched_tunable_scaling,
 		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+	SEQ_printf(m, "\n");
+}
 
-	for_each_online_cpu(cpu)
-		print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+	int cpu = (unsigned long)(v - 2);
 
-	SEQ_printf(m, "\n");
+	if (cpu != -1)
+		print_cpu(m, cpu);
+	else
+		sched_debug_header(m);
 
 	return 0;
 }
 
 void sysrq_sched_debug_show(void)
 {
-	sched_debug_show(NULL, NULL);
+	int cpu;
+
+	sched_debug_header(NULL);
+	for_each_online_cpu(cpu)
+		print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+	unsigned long n = *offset;
+
+	if (n == 0)
+		return (void *) 1;
+
+	n--;
+
+	if (n > 0)
+		n = cpumask_next(n - 1, cpu_online_mask);
+	else
+		n = cpumask_first(cpu_online_mask);
+
+	*offset = n + 1;
+
+	if (n < nr_cpu_ids)
+		return (void *)(unsigned long)(n + 2);
+	return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+	.start = sched_debug_start,
+	.next = sched_debug_next,
+	.stop = sched_debug_stop,
+	.show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+	seq_release(inode, file);
+
+	return 0;
 }
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_debug_show, NULL);
+	int ret = 0;
+
+	ret = seq_open(filp, &sched_debug_sops);
+
+	return ret;
 }
 
 static const struct file_operations sched_debug_fops = {
 	.open		= sched_debug_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= sched_debug_release,
 };
 
 static int __init init_sched_debug_procfs(void)
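
For readers unfamiliar with the seq_file interface the patch switches to,
here is a minimal sketch of the same per-record pattern as a standalone
module. Everything in it is illustrative: the cpu_demo_* names and the
/proc/cpu_demo path are invented, the sketch omits the header record (the
patch reserves iterator position 1 for it), and it uses the
file_operations-based proc_create() of 2013-era kernels (since v5.6,
proc_create() takes a struct proc_ops instead). The point it demonstrates
is why the 4 MB limit goes away: the seq_file core calls ->show() once per
record and copies data out to user space as it goes, so no buffer ever has
to hold the whole file at once, only (at minimum) one record.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/cpumask.h>

/*
 * Map the file offset to an online cpu. As in the patch, the returned
 * cookie is cpu + 1 rather than cpu, because returning NULL (which cpu 0
 * would be) tells the seq_file core to stop iterating.
 */
static void *cpu_demo_start(struct seq_file *m, loff_t *pos)
{
	unsigned int cpu = cpumask_next((int)*pos - 1, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		return NULL;		/* past the last online cpu: EOF */

	*pos = cpu;			/* remember which record we are on */
	return (void *)(unsigned long)(cpu + 1);
}

static void *cpu_demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;			/* step past the cpu just shown */
	return cpu_demo_start(m, pos);
}

static void cpu_demo_stop(struct seq_file *m, void *v)
{
}

/* Called once per record; only one cpu's output is buffered at a time. */
static int cpu_demo_show(struct seq_file *m, void *v)
{
	unsigned int cpu = (unsigned long)v - 1;	/* undo the +1 encoding */

	seq_printf(m, "cpu#%u is online\n", cpu);
	return 0;
}

static const struct seq_operations cpu_demo_sops = {
	.start	= cpu_demo_start,
	.next	= cpu_demo_next,
	.stop	= cpu_demo_stop,
	.show	= cpu_demo_show,
};

static int cpu_demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &cpu_demo_sops);
}

static const struct file_operations cpu_demo_fops = {
	.open		= cpu_demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,	/* plain seq_release suffices here */
};

static int __init cpu_demo_init(void)
{
	/* /proc/cpu_demo is a made-up name for this sketch */
	return proc_create("cpu_demo", 0444, NULL, &cpu_demo_fops) ? 0 : -ENOMEM;
}

static void __exit cpu_demo_exit(void)
{
	remove_proc_entry("cpu_demo", NULL);
}

module_init(cpu_demo_init);
module_exit(cpu_demo_exit);
MODULE_LICENSE("GPL");

A note on the v - 2 arithmetic in the patch's sched_debug_show(): v is the
cookie produced by sched_debug_start(), so (unsigned long)(v - 2) yields -1
for the header cookie (1) and the cpu number for cpu cookies (cpu + 2),
which is exactly what the if (cpu != -1) test distinguishes.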