diff options
author | Andy Lutomirski <luto@MIT.EDU> | 2011-05-23 09:31:28 -0400 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2011-05-24 08:51:29 -0400 |
commit | 44259b1abfaa8bb819d25d41d71e8e33e25dd36a (patch) | |
tree | 6255324c0bf2fef17ec3f7d92cdc88d9bbed44a3 /arch/x86/kernel | |
parent | 0f51f2852ccf0fe38a02d340d0ba625e8e32a863 (diff) |
x86-64: Move vread_tsc into a new file with sensible options
vread_tsc is short and hot, and it's userspace code so the usual
reasons to enable -pg and turn off sibling calls don't apply.
(OK, turning off sibling calls has no effect. But it might
someday...)
As an added benefit, tsc.c is profilable now.
Signed-off-by: Andy Lutomirski <luto@mit.edu>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Borislav Petkov <bp@amd64.org>
Link: http://lkml.kernel.org/r/%3C99c6d7f5efa3ccb65b4ac6eb443e1ab7bad47d7b.1306156808.git.luto%40mit.edu%3E
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 8 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 34 | ||||
-rw-r--r-- | arch/x86/kernel/vread_tsc_64.c | 36 |
3 files changed, 41 insertions, 37 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 250806472a7e..f5abe3a245b8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) | |||
8 | 8 | ||
9 | ifdef CONFIG_FUNCTION_TRACER | 9 | ifdef CONFIG_FUNCTION_TRACER |
10 | # Do not profile debug and lowlevel utilities | 10 | # Do not profile debug and lowlevel utilities |
11 | CFLAGS_REMOVE_tsc.o = -pg | ||
12 | CFLAGS_REMOVE_rtc.o = -pg | 11 | CFLAGS_REMOVE_rtc.o = -pg |
13 | CFLAGS_REMOVE_paravirt-spinlocks.o = -pg | 12 | CFLAGS_REMOVE_paravirt-spinlocks.o = -pg |
14 | CFLAGS_REMOVE_pvclock.o = -pg | 13 | CFLAGS_REMOVE_pvclock.o = -pg |
@@ -24,13 +23,16 @@ endif | |||
24 | nostackp := $(call cc-option, -fno-stack-protector) | 23 | nostackp := $(call cc-option, -fno-stack-protector) |
25 | CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) | 24 | CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) |
26 | CFLAGS_hpet.o := $(nostackp) | 25 | CFLAGS_hpet.o := $(nostackp) |
27 | CFLAGS_tsc.o := $(nostackp) | 26 | CFLAGS_vread_tsc_64.o := $(nostackp) |
28 | CFLAGS_paravirt.o := $(nostackp) | 27 | CFLAGS_paravirt.o := $(nostackp) |
29 | GCOV_PROFILE_vsyscall_64.o := n | 28 | GCOV_PROFILE_vsyscall_64.o := n |
30 | GCOV_PROFILE_hpet.o := n | 29 | GCOV_PROFILE_hpet.o := n |
31 | GCOV_PROFILE_tsc.o := n | 30 | GCOV_PROFILE_tsc.o := n |
32 | GCOV_PROFILE_paravirt.o := n | 31 | GCOV_PROFILE_paravirt.o := n |
33 | 32 | ||
33 | # vread_tsc_64 is hot and should be fully optimized: | ||
34 | CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls | ||
35 | |||
34 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o | 36 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o |
35 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o | 37 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o |
36 | obj-y += time.o ioport.o ldt.o dumpstack.o | 38 | obj-y += time.o ioport.o ldt.o dumpstack.o |
@@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o | |||
39 | obj-y += probe_roms.o | 41 | obj-y += probe_roms.o |
40 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o | 42 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o |
41 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | 43 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o |
42 | obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o | 44 | obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o |
43 | obj-y += bootflag.o e820.o | 45 | obj-y += bootflag.o e820.o |
44 | obj-y += pci-dma.o quirks.o topology.o kdebugfs.o | 46 | obj-y += pci-dma.o quirks.o topology.o kdebugfs.o |
45 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o | 47 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 24249a5360b6..6cc6922262af 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -763,40 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs) | |||
763 | ret : clocksource_tsc.cycle_last; | 763 | ret : clocksource_tsc.cycle_last; |
764 | } | 764 | } |
765 | 765 | ||
766 | #ifdef CONFIG_X86_64 | ||
767 | static cycle_t __vsyscall_fn vread_tsc(void) | ||
768 | { | ||
769 | cycle_t ret; | ||
770 | u64 last; | ||
771 | |||
772 | /* | ||
773 | * Empirically, a fence (of type that depends on the CPU) | ||
774 | * before rdtsc is enough to ensure that rdtsc is ordered | ||
775 | * with respect to loads. The various CPU manuals are unclear | ||
776 | * as to whether rdtsc can be reordered with later loads, | ||
777 | * but no one has ever seen it happen. | ||
778 | */ | ||
779 | rdtsc_barrier(); | ||
780 | ret = (cycle_t)vget_cycles(); | ||
781 | |||
782 | last = VVAR(vsyscall_gtod_data).clock.cycle_last; | ||
783 | |||
784 | if (likely(ret >= last)) | ||
785 | return ret; | ||
786 | |||
787 | /* | ||
788 | * GCC likes to generate cmov here, but this branch is extremely | ||
789 | * predictable (it's just a function of time and the likely is | ||
790 | * very likely) and there's a data dependence, so force GCC | ||
791 | * to generate a branch instead. I don't barrier() because | ||
792 | * we don't actually need a barrier, and if this function | ||
793 | * ever gets inlined it will generate worse code. | ||
794 | */ | ||
795 | asm volatile (""); | ||
796 | return last; | ||
797 | } | ||
798 | #endif | ||
799 | |||
800 | static void resume_tsc(struct clocksource *cs) | 766 | static void resume_tsc(struct clocksource *cs) |
801 | { | 767 | { |
802 | clocksource_tsc.cycle_last = 0; | 768 | clocksource_tsc.cycle_last = 0; |
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c new file mode 100644 index 000000000000..a81aa9e9894c --- /dev/null +++ b/arch/x86/kernel/vread_tsc_64.c | |||
@@ -0,0 +1,36 @@ | |||
1 | /* This code runs in userspace. */ | ||
2 | |||
3 | #define DISABLE_BRANCH_PROFILING | ||
4 | #include <asm/vgtod.h> | ||
5 | |||
6 | notrace cycle_t __vsyscall_fn vread_tsc(void) | ||
7 | { | ||
8 | cycle_t ret; | ||
9 | u64 last; | ||
10 | |||
11 | /* | ||
12 | * Empirically, a fence (of type that depends on the CPU) | ||
13 | * before rdtsc is enough to ensure that rdtsc is ordered | ||
14 | * with respect to loads. The various CPU manuals are unclear | ||
15 | * as to whether rdtsc can be reordered with later loads, | ||
16 | * but no one has ever seen it happen. | ||
17 | */ | ||
18 | rdtsc_barrier(); | ||
19 | ret = (cycle_t)vget_cycles(); | ||
20 | |||
21 | last = VVAR(vsyscall_gtod_data).clock.cycle_last; | ||
22 | |||
23 | if (likely(ret >= last)) | ||
24 | return ret; | ||
25 | |||
26 | /* | ||
27 | * GCC likes to generate cmov here, but this branch is extremely | ||
28 | * predictable (it's just a function of time and the likely is | ||
29 | * very likely) and there's a data dependence, so force GCC | ||
30 | * to generate a branch instead. I don't barrier() because | ||
31 | * we don't actually need a barrier, and if this function | ||
32 | * ever gets inlined it will generate worse code. | ||
33 | */ | ||
34 | asm volatile (""); | ||
35 | return last; | ||
36 | } | ||