From 68bf21aa15c85d2e9b623dcda2b1ed8893275fa1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:08 -0400 Subject: ftrace: mcount call site on boot nops core This is the infrastructure to the converting the mcount call sites recorded by the __mcount_loc section into nops on boot. It also allows for using these sites to enable tracing as normal. When the __mcount_loc section is used, the "ftraced" kernel thread is disabled. This uses the current infrastructure to record the mcount call sites as well as convert them to nops. The mcount function is kept as a stub on boot up and not converted to the ftrace_record_ip function. We use the ftrace_record_ip to only record from the table. This patch does not handle modules. That comes with a later patch. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- init/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index 3820323c4c84..ded1fae965ab 100644 --- a/init/main.c +++ b/init/main.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ asmlinkage void __init start_kernel(void) acpi_early_init(); /* before LAPIC and SMP init */ + ftrace_init(); + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } -- cgit v1.2.2 From aa5d9151f745b6ee6a236a1f109118034277eb92 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 13 Sep 2008 09:36:06 -0700 Subject: tracing/fastboot: add a script to visualize the kernel boot process / time When optimizing the kernel boot time, it's very valuable to visualize what is going on at which time. In addition, with the fastboot asynchronous initcall level, it's very valuable to see which initcall gets run where and when. This patch adds a script to turn a dmesg into a SVG graph (that can be shown with tools such as InkScape, Gimp or Firefox) and a small change to the initcall code to print the PID of the thread calling the initcall (so that the script can work out the parallelism). Signed-off-by: Arjan van de Ven --- init/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index ded1fae965ab..16abba05c826 100644 --- a/init/main.c +++ b/init/main.c @@ -711,7 +711,8 @@ int do_one_initcall(initcall_t fn) int result; if (initcall_debug) { - printk("calling %pF\n", fn); + printk("calling %pF", fn); + printk(" @ %i\n", task_pid_nr(current)); t0 = ktime_get(); } -- cgit v1.2.2 From 3bf77af6e1fef1124bf71d81f9f84885f0ee0dea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Tue, 23 Sep 2008 11:38:18 +0100 Subject: tracing/ftrace: launch boot tracing after pre-smp initcalls Launch the boot tracing inside the initcall_debug area. Old printk have not been removed to keep the old way of initcall tracing for backward compatibility. [ mingo@elte.hu: resolved conflicts ] Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- init/main.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index 16abba05c826..1e39a1eab190 100644 --- a/init/main.c +++ b/init/main.c @@ -709,10 +709,12 @@ int do_one_initcall(initcall_t fn) ktime_t t0, t1, delta; char msgbuf[64]; int result; + struct boot_trace it; if (initcall_debug) { - printk("calling %pF", fn); - printk(" @ %i\n", task_pid_nr(current)); + it.caller = task_pid_nr(current); + it.func = fn; + printk("calling %pF @ %i\n", fn, it.caller); t0 = ktime_get(); } @@ -721,10 +723,11 @@ int do_one_initcall(initcall_t fn) if (initcall_debug) { t1 = ktime_get(); delta = ktime_sub(t1, t0); - - printk("initcall %pF returned %d after %Ld msecs\n", - fn, result, - (unsigned long long) delta.tv64 >> 20); + it.result = result; + it.duration = (unsigned long long) delta.tv64 >> 20; + printk("initcall %pF returned %d after %Ld msecs\n", fn, + result, it.duration); + trace_boot(&it); } msgbuf[0] = 0; @@ -859,6 +862,7 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); + start_boot_trace(); smp_init(); sched_init_smp(); -- cgit v1.2.2 From cb5ab74204a6e2579d1119bf1348eb806526b12b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 2 Oct 2008 12:59:20 +0200 Subject: tracing/fastboot: change the printing of boot tracer according to bootgraph.pl Change the boot tracer printing to make it parsable for the scripts/bootgraph.pl script. We have now to output two lines for each initcall, according to the printk in do_one_initcall() in init/main.c We need now the call's time and the return's time. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- init/main.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index 1e39a1eab190..61eb66159391 100644 --- a/init/main.c +++ b/init/main.c @@ -706,34 +706,32 @@ __setup("initcall_debug", initcall_debug_setup); int do_one_initcall(initcall_t fn) { int count = preempt_count(); - ktime_t t0, t1, delta; + ktime_t delta; char msgbuf[64]; - int result; struct boot_trace it; if (initcall_debug) { it.caller = task_pid_nr(current); it.func = fn; printk("calling %pF @ %i\n", fn, it.caller); - t0 = ktime_get(); + it.calltime = ktime_get(); } - result = fn(); + it.result = fn(); if (initcall_debug) { - t1 = ktime_get(); - delta = ktime_sub(t1, t0); - it.result = result; + it.rettime = ktime_get(); + delta = ktime_sub(it.rettime, it.calltime); it.duration = (unsigned long long) delta.tv64 >> 20; printk("initcall %pF returned %d after %Ld msecs\n", fn, - result, it.duration); + it.result, it.duration); trace_boot(&it); } msgbuf[0] = 0; - if (result && result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", result); + if (it.result && it.result != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", it.result); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -747,7 +745,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return result; + return it.result; } -- cgit v1.2.2 From 5601020feb0c3010e9e3e0131e9697ac6a06777b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 2 Oct 2008 13:26:05 +0200 Subject: tracing/fastboot: get the initcall name before it disappears After some initcall traces, some initcall names may be inconsistent. That's because these functions will disappear from the .init section and also their name from the symbols table. So we have to copy the name of the function in a buffer large enough during the trace appending. It is not costly for the ring_buffer because the number of initcall entries is commonly not really large. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- init/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index 61eb66159391..8e96a0ef17f4 100644 --- a/init/main.c +++ b/init/main.c @@ -712,7 +712,6 @@ int do_one_initcall(initcall_t fn) if (initcall_debug) { it.caller = task_pid_nr(current); - it.func = fn; printk("calling %pF @ %i\n", fn, it.caller); it.calltime = ktime_get(); } @@ -725,7 +724,7 @@ int do_one_initcall(initcall_t fn) it.duration = (unsigned long long) delta.tv64 >> 20; printk("initcall %pF returned %d after %Ld msecs\n", fn, it.result, it.duration); - trace_boot(&it); + trace_boot(&it, fn); } msgbuf[0] = 0; -- cgit v1.2.2 From 097d036a2f25eecc42435c57e010aaf4a2eed2d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 15:39:21 +0200 Subject: tracing/fastboot: only trace non-module initcalls At this time, only built-in initcalls interest us. We can't really produce a relevant graph if we include the modules initcall too. I had good results after this patch (see svg in attachment). Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- init/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index 8e96a0ef17f4..e7939de80f3e 100644 --- a/init/main.c +++ b/init/main.c @@ -886,6 +886,7 @@ static int __init kernel_init(void * unused) * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ + stop_boot_trace(); init_post(); return 0; } -- cgit v1.2.2 From ca538f6bbe583406f941f3041d40c41f9a13d1de Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Thu, 9 Oct 2008 15:23:05 -0700 Subject: tracing/fastboot: add better resolution to initcall debug/tracing Change the time resolution for initcall_debug to microseconds, from milliseconds. This is handy to determine which initcalls you want to work on for faster booting. One one of my test machines, over 90% of the initcalls are less than a millisecond and (without this patch) these are all reported as 0 msecs. Working on the 900 us ones is more important than the 4 us ones. With 'quiet' on the kernel command line, this adds no significant overhead to kernel boot time. Signed-off-by: Tim Bird Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- init/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index e7939de80f3e..b2e7ff4a5349 100644 --- a/init/main.c +++ b/init/main.c @@ -721,8 +721,8 @@ int do_one_initcall(initcall_t fn) if (initcall_debug) { it.rettime = ktime_get(); delta = ktime_sub(it.rettime, it.calltime); - it.duration = (unsigned long long) delta.tv64 >> 20; - printk("initcall %pF returned %d after %Ld msecs\n", fn, + it.duration = (unsigned long long) delta.tv64 >> 10; + printk("initcall %pF returned %d after %Ld usecs\n", fn, it.result, it.duration); trace_boot(&it, fn); } -- cgit v1.2.2 From db64fe02258f1507e13fe5212a989922323685ce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:27:03 -0700 Subject: mm: rewrite vmap layer Rewrite the vmap allocator to use rbtrees and lazy tlb flushing, and provide a fast, scalable percpu frontend for small vmaps (requires a slightly different API, though). The biggest problem with vmap is actually vunmap. Presently this requires a global kernel TLB flush, which on most architectures is a broadcast IPI to all CPUs to flush the cache. This is all done under a global lock. As the number of CPUs increases, so will the number of vunmaps a scaled workload will want to perform, and so will the cost of a global TLB flush. This gives terrible quadratic scalability characteristics. Another problem is that the entire vmap subsystem works under a single lock. It is a rwlock, but it is actually taken for write in all the fast paths, and the read locking would likely never be run concurrently anyway, so it's just pointless. This is a rewrite of vmap subsystem to solve those problems. The existing vmalloc API is implemented on top of the rewritten subsystem. The TLB flushing problem is solved by using lazy TLB unmapping. vmap addresses do not have to be flushed immediately when they are vunmapped, because the kernel will not reuse them again (would be a use-after-free) until they are reallocated. So the addresses aren't allocated again until a subsequent TLB flush. A single TLB flush then can flush multiple vunmaps from each CPU. XEN and PAT and such do not like deferred TLB flushing because they can't always handle multiple aliasing virtual addresses to a physical address. They now call vm_unmap_aliases() in order to flush any deferred mappings. That call is very expensive (well, actually not a lot more expensive than a single vunmap under the old scheme), however it should be OK if not called too often. The virtual memory extent information is stored in an rbtree rather than a linked list to improve the algorithmic scalability. There is a per-CPU allocator for small vmaps, which amortizes or avoids global locking. To use the per-CPU interface, the vm_map_ram / vm_unmap_ram interfaces must be used in place of vmap and vunmap. Vmalloc does not use these interfaces at the moment, so it will not be quite so scalable (although it will use lazy TLB flushing). As a quick test of performance, I ran a test that loops in the kernel, linearly mapping then touching then unmapping 4 pages. Different numbers of tests were run in parallel on an 4 core, 2 socket opteron. Results are in nanoseconds per map+touch+unmap. threads vanilla vmap rewrite 1 14700 2900 2 33600 3000 4 49500 2800 8 70631 2900 So with a 8 cores, the rewritten version is already 25x faster. In a slightly more realistic test (although with an older and less scalable version of the patch), I ripped the not-very-good vunmap batching code out of XFS, and implemented the large buffer mapping with vm_map_ram and vm_unmap_ram... along with a couple of other tricks, I was able to speed up a large directory workload by 20x on a 64 CPU system. I believe vmap/vunmap is actually sped up a lot more than 20x on such a system, but I'm running into other locks now. vmap is pretty well blown off the profiles. Before: 1352059 total 0.1401 798784 _write_lock 8320.6667 <- vmlist_lock 529313 default_idle 1181.5022 15242 smp_call_function 15.8771 <- vmap tlb flushing 2472 __get_vm_area_node 1.9312 <- vmap 1762 remove_vm_area 4.5885 <- vunmap 316 map_vm_area 0.2297 <- vmap 312 kfree 0.1950 300 _spin_lock 3.1250 252 sn_send_IPI_phys 0.4375 <- tlb flushing 238 vmap 0.8264 <- vmap 216 find_lock_page 0.5192 196 find_next_bit 0.3603 136 sn2_send_IPI 0.2024 130 pio_phys_write_mmr 2.0312 118 unmap_kernel_range 0.1229 After: 78406 total 0.0081 40053 default_idle 89.4040 33576 ia64_spinlock_contention 349.7500 1650 _spin_lock 17.1875 319 __reg_op 0.5538 281 _atomic_dec_and_lock 1.0977 153 mutex_unlock 1.5938 123 iget_locked 0.1671 117 xfs_dir_lookup 0.1662 117 dput 0.1406 114 xfs_iget_core 0.0268 92 xfs_da_hashname 0.1917 75 d_alloc 0.0670 68 vmap_page_range 0.0462 <- vmap 58 kmem_cache_alloc 0.0604 57 memset 0.0540 52 rb_next 0.1625 50 __copy_user 0.0208 49 bitmap_find_free_region 0.2188 <- vmap 46 ia64_sn_udelay 0.1106 45 find_inode_fast 0.1406 42 memcmp 0.2188 42 finish_task_switch 0.1094 42 __d_lookup 0.0410 40 radix_tree_lookup_slot 0.1250 37 _spin_unlock_irqrestore 0.3854 36 xfs_bmapi 0.0050 36 kmem_cache_free 0.0256 35 xfs_vn_getattr 0.0322 34 radix_tree_lookup 0.1062 33 __link_path_walk 0.0035 31 xfs_da_do_buf 0.0091 30 _xfs_buf_find 0.0204 28 find_get_page 0.0875 27 xfs_iread 0.0241 27 __strncpy_from_user 0.2812 26 _xfs_buf_initialize 0.0406 24 _xfs_buf_lookup_pages 0.0179 24 vunmap_page_range 0.0250 <- vunmap 23 find_lock_page 0.0799 22 vm_map_ram 0.0087 <- vmap 20 kfree 0.0125 19 put_page 0.0330 18 __kmalloc 0.0176 17 xfs_da_node_lookup_int 0.0086 17 _read_lock 0.0885 17 page_waitqueue 0.0664 vmap has gone from being the top 5 on the profiles and flushing the crap out of all TLBs, to using less than 1% of kernel time. [akpm@linux-foundation.org: cleanups, section fix] [akpm@linux-foundation.org: fix build on alpha] Signed-off-by: Nick Piggin Cc: Jeremy Fitzhardinge Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'init/main.c') diff --git a/init/main.c b/init/main.c index 27f6bf6108e9..4371d11721f6 100644 --- a/init/main.c +++ b/init/main.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -642,6 +643,7 @@ asmlinkage void __init start_kernel(void) initrd_start = 0; } #endif + vmalloc_init(); vfs_caches_init_early(); cpuset_init_early(); mem_init(); -- cgit v1.2.2