Diffstat (limited to 'arch/x86_64/kernel')
 arch/x86_64/kernel/Makefile      |   1
 arch/x86_64/kernel/apic.c        |   5
 arch/x86_64/kernel/entry.S       |  11
 arch/x86_64/kernel/io_apic.c     |  81
 arch/x86_64/kernel/kprobes.c     |   7
 arch/x86_64/kernel/module.c      |   4
 arch/x86_64/kernel/mpparse.c     |  22
 arch/x86_64/kernel/nmi.c         | 248
 arch/x86_64/kernel/pmtimer.c     | 101
 arch/x86_64/kernel/ptrace.c      |  17
 arch/x86_64/kernel/setup.c       |  30
 arch/x86_64/kernel/signal.c      |   4
 arch/x86_64/kernel/smpboot.c     | 263
 arch/x86_64/kernel/time.c        |  62
 arch/x86_64/kernel/vsyscall.c    |   5
 arch/x86_64/kernel/x8664_ksyms.c |  13
 16 files changed, 591 insertions(+), 283 deletions(-)
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 0a3318e08ab6..5ca4a4598fda 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o | |||
28 | obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o | 28 | obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o |
29 | obj-$(CONFIG_SWIOTLB) += swiotlb.o | 29 | obj-$(CONFIG_SWIOTLB) += swiotlb.o |
30 | obj-$(CONFIG_KPROBES) += kprobes.o | 30 | obj-$(CONFIG_KPROBES) += kprobes.o |
31 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o | ||
31 | 32 | ||
32 | obj-$(CONFIG_MODULES) += module.o | 33 | obj-$(CONFIG_MODULES) += module.o |
33 | 34 | ||
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 7e13545748e0..f8e6cc4fecd4 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -33,6 +33,7 @@ | |||
33 | #include <asm/mpspec.h> | 33 | #include <asm/mpspec.h> |
34 | #include <asm/pgalloc.h> | 34 | #include <asm/pgalloc.h> |
35 | #include <asm/mach_apic.h> | 35 | #include <asm/mach_apic.h> |
36 | #include <asm/nmi.h> | ||
36 | 37 | ||
37 | int apic_verbosity; | 38 | int apic_verbosity; |
38 | 39 | ||
@@ -925,7 +926,7 @@ __init int oem_force_hpet_timer(void) | |||
925 | unsigned id; | 926 | unsigned id; |
926 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); | 927 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); |
927 | 928 | ||
928 | bitmap_empty(clustermap, NUM_APIC_CLUSTERS); | 929 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); |
929 | 930 | ||
930 | for (i = 0; i < NR_CPUS; i++) { | 931 | for (i = 0; i < NR_CPUS; i++) { |
931 | id = bios_cpu_apicid[i]; | 932 | id = bios_cpu_apicid[i]; |
@@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void) | |||
1056 | nr_ioapics = 0; | 1057 | nr_ioapics = 0; |
1057 | #endif | 1058 | #endif |
1058 | setup_boot_APIC_clock(); | 1059 | setup_boot_APIC_clock(); |
1059 | 1060 | check_nmi_watchdog(); | |
1060 | return 0; | 1061 | return 0; |
1061 | } | 1062 | } |
1062 | 1063 | ||
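
The apic.c hunk above also fixes a subtle bug: bitmap_empty() only tests whether all bits in a bitmap are clear and returns the answer, so calling it for its side effect left the on-stack clustermap uninitialised; bitmap_zero() is the call that actually clears it before the cluster scan starts setting bits. A minimal user-space sketch of the two helpers' semantics (the contract only, not the kernel implementation):

#include <stdbool.h>
#include <string.h>

#define BITS_PER_LONG    (8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Test only: true when no bit is set; the map itself is left untouched. */
static bool bitmap_empty(const unsigned long *map, unsigned int nbits)
{
        unsigned int i;

        for (i = 0; i < BITS_TO_LONGS(nbits); i++)
                if (map[i])
                        return false;
        return true;
}

/* Initialiser: clears every bit, which is what oem_force_hpet_timer()
 * needs before it starts marking APIC clusters as seen. */
static void bitmap_zero(unsigned long *map, unsigned int nbits)
{
        memset(map, 0, BITS_TO_LONGS(nbits) * sizeof(long));
}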
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 1086b5fcac21..28817490fdc6 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -220,13 +220,18 @@ sysret_careful: | |||
220 | jmp sysret_check | 220 | jmp sysret_check |
221 | 221 | ||
222 | /* Handle a signal */ | 222 | /* Handle a signal */ |
223 | /* edx: work flags (arg3) */ | ||
224 | sysret_signal: | 223 | sysret_signal: |
225 | sti | 224 | sti |
225 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
226 | jz 1f | ||
227 | |||
228 | /* Really a signal */ | ||
229 | /* edx: work flags (arg3) */ | ||
226 | leaq do_notify_resume(%rip),%rax | 230 | leaq do_notify_resume(%rip),%rax |
227 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | 231 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 |
228 | xorl %esi,%esi # oldset -> arg2 | 232 | xorl %esi,%esi # oldset -> arg2 |
229 | call ptregscall_common | 233 | call ptregscall_common |
234 | 1: movl $_TIF_NEED_RESCHED,%edi | ||
230 | jmp sysret_check | 235 | jmp sysret_check |
231 | 236 | ||
232 | /* Do syscall tracing */ | 237 | /* Do syscall tracing */ |
@@ -484,6 +489,8 @@ retint_careful: | |||
484 | jmp retint_check | 489 | jmp retint_check |
485 | 490 | ||
486 | retint_signal: | 491 | retint_signal: |
492 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
493 | jz retint_swapgs | ||
487 | sti | 494 | sti |
488 | SAVE_REST | 495 | SAVE_REST |
489 | movq $-1,ORIG_RAX(%rsp) | 496 | movq $-1,ORIG_RAX(%rsp) |
@@ -492,8 +499,8 @@ retint_signal: | |||
492 | call do_notify_resume | 499 | call do_notify_resume |
493 | RESTORE_REST | 500 | RESTORE_REST |
494 | cli | 501 | cli |
502 | movl $_TIF_NEED_RESCHED,%edi | ||
495 | GET_THREAD_INFO(%rcx) | 503 | GET_THREAD_INFO(%rcx) |
496 | movl $_TIF_WORK_MASK,%edi | ||
497 | jmp retint_check | 504 | jmp retint_check |
498 | 505 | ||
499 | #ifdef CONFIG_PREEMPT | 506 | #ifdef CONFIG_PREEMPT |
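
The entry.S changes make the signal exit path cheaper and less prone to looping: do_notify_resume() is now only called when one of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME or _TIF_SINGLESTEP is actually set, and afterwards %edi is reloaded with just _TIF_NEED_RESCHED so the work-pending recheck cannot spin on bits the handler does not clear. A rough C rendering of that control flow, offered only as a sketch (WORK_SIGNAL_BITS and read_thread_flags() are stand-ins; the real path is the assembly above):

#define WORK_SIGNAL_BITS (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_SINGLESTEP)

static void exit_to_user(struct pt_regs *regs, unsigned int flags,
                         unsigned int mask /* lives in %edi */)
{
        while (flags & mask) {
                if (flags & _TIF_NEED_RESCHED) {
                        schedule();
                } else if (flags & WORK_SIGNAL_BITS) {
                        do_notify_resume(regs, NULL, flags);
                        /* only the reschedule bit is rechecked from here on,
                         * the effect of the new "movl $_TIF_NEED_RESCHED,%edi" */
                        mask = _TIF_NEED_RESCHED;
                }
                flags = read_thread_flags();    /* stand-in for GET_THREAD_INFO */
        }
}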
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 60be58617eb9..80e9b498c443 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -42,6 +42,8 @@ | |||
42 | 42 | ||
43 | int sis_apic_bug; /* not actually supported, dummy for compile */ | 43 | int sis_apic_bug; /* not actually supported, dummy for compile */ |
44 | 44 | ||
45 | static int no_timer_check; | ||
46 | |||
45 | static DEFINE_SPINLOCK(ioapic_lock); | 47 | static DEFINE_SPINLOCK(ioapic_lock); |
46 | 48 | ||
47 | /* | 49 | /* |
@@ -1601,7 +1603,7 @@ static inline void check_timer(void) | |||
1601 | * Ok, does IRQ0 through the IOAPIC work? | 1603 | * Ok, does IRQ0 through the IOAPIC work? |
1602 | */ | 1604 | */ |
1603 | unmask_IO_APIC_irq(0); | 1605 | unmask_IO_APIC_irq(0); |
1604 | if (timer_irq_works()) { | 1606 | if (!no_timer_check && timer_irq_works()) { |
1605 | nmi_watchdog_default(); | 1607 | nmi_watchdog_default(); |
1606 | if (nmi_watchdog == NMI_IO_APIC) { | 1608 | if (nmi_watchdog == NMI_IO_APIC) { |
1607 | disable_8259A_irq(0); | 1609 | disable_8259A_irq(0); |
@@ -1671,6 +1673,13 @@ static inline void check_timer(void) | |||
1671 | panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); | 1673 | panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); |
1672 | } | 1674 | } |
1673 | 1675 | ||
1676 | static int __init notimercheck(char *s) | ||
1677 | { | ||
1678 | no_timer_check = 1; | ||
1679 | return 1; | ||
1680 | } | ||
1681 | __setup("no_timer_check", notimercheck); | ||
1682 | |||
1674 | /* | 1683 | /* |
1675 | * | 1684 | * |
1676 | * IRQ's that are handled by the PIC in the MPS IOAPIC case. | 1685 | * IRQ's that are handled by the PIC in the MPS IOAPIC case. |
@@ -1804,76 +1813,6 @@ device_initcall(ioapic_init_sysfs); | |||
1804 | 1813 | ||
1805 | #define IO_APIC_MAX_ID 0xFE | 1814 | #define IO_APIC_MAX_ID 0xFE |
1806 | 1815 | ||
1807 | int __init io_apic_get_unique_id (int ioapic, int apic_id) | ||
1808 | { | ||
1809 | union IO_APIC_reg_00 reg_00; | ||
1810 | static physid_mask_t apic_id_map; | ||
1811 | unsigned long flags; | ||
1812 | int i = 0; | ||
1813 | |||
1814 | /* | ||
1815 | * The P4 platform supports up to 256 APIC IDs on two separate APIC | ||
1816 | * buses (one for LAPICs, one for IOAPICs), where predecessors only | ||
1817 | * supports up to 16 on one shared APIC bus. | ||
1818 | * | ||
1819 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | ||
1820 | * advantage of new APIC bus architecture. | ||
1821 | */ | ||
1822 | |||
1823 | if (physids_empty(apic_id_map)) | ||
1824 | apic_id_map = phys_cpu_present_map; | ||
1825 | |||
1826 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1827 | reg_00.raw = io_apic_read(ioapic, 0); | ||
1828 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1829 | |||
1830 | if (apic_id >= IO_APIC_MAX_ID) { | ||
1831 | apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " | ||
1832 | "%d\n", ioapic, apic_id, reg_00.bits.ID); | ||
1833 | apic_id = reg_00.bits.ID; | ||
1834 | } | ||
1835 | |||
1836 | /* | ||
1837 | * Every APIC in a system must have a unique ID or we get lots of nice | ||
1838 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
1839 | */ | ||
1840 | if (physid_isset(apic_id, apic_id_map)) { | ||
1841 | |||
1842 | for (i = 0; i < IO_APIC_MAX_ID; i++) { | ||
1843 | if (!physid_isset(i, apic_id_map)) | ||
1844 | break; | ||
1845 | } | ||
1846 | |||
1847 | if (i == IO_APIC_MAX_ID) | ||
1848 | panic("Max apic_id exceeded!\n"); | ||
1849 | |||
1850 | apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " | ||
1851 | "trying %d\n", ioapic, apic_id, i); | ||
1852 | |||
1853 | apic_id = i; | ||
1854 | } | ||
1855 | |||
1856 | physid_set(apic_id, apic_id_map); | ||
1857 | |||
1858 | if (reg_00.bits.ID != apic_id) { | ||
1859 | reg_00.bits.ID = apic_id; | ||
1860 | |||
1861 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1862 | io_apic_write(ioapic, 0, reg_00.raw); | ||
1863 | reg_00.raw = io_apic_read(ioapic, 0); | ||
1864 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1865 | |||
1866 | /* Sanity check */ | ||
1867 | if (reg_00.bits.ID != apic_id) | ||
1868 | panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); | ||
1869 | } | ||
1870 | |||
1871 | apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); | ||
1872 | |||
1873 | return apic_id; | ||
1874 | } | ||
1875 | |||
1876 | |||
1877 | int __init io_apic_get_version (int ioapic) | 1816 | int __init io_apic_get_version (int ioapic) |
1878 | { | 1817 | { |
1879 | union IO_APIC_reg_01 reg_01; | 1818 | union IO_APIC_reg_01 reg_01; |
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 4f2a852299b6..f77f8a0ff187 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -355,6 +355,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs) | |||
355 | *tos &= ~(TF_MASK | IF_MASK); | 355 | *tos &= ~(TF_MASK | IF_MASK); |
356 | *tos |= kprobe_old_rflags; | 356 | *tos |= kprobe_old_rflags; |
357 | break; | 357 | break; |
358 | case 0xc3: /* ret/lret */ | ||
359 | case 0xcb: | ||
360 | case 0xc2: | ||
361 | case 0xca: | ||
362 | regs->eflags &= ~TF_MASK; | ||
363 | /* rip is already adjusted, no more changes required*/ | ||
364 | return; | ||
358 | case 0xe8: /* call relative - Fix return addr */ | 365 | case 0xe8: /* call relative - Fix return addr */ |
359 | *tos = orig_rip + (*tos - copy_rip); | 366 | *tos = orig_rip + (*tos - copy_rip); |
360 | break; | 367 | break; |
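
The new opcode cases in resume_execution() handle single-stepped returns: for ret (0xc3), lret (0xcb) and their imm16 forms (0xc2, 0xca) the trap-time rip was loaded straight off the stack, so it already points at the right place and only the single-step TF flag has to go; calls, by contrast, still need their pushed return address relocated from the instruction copy back to the original. A small classification sketch of that reasoning (a hypothetical helper, not the kernel's decoder):

/* Return 0 when the trap-time rip can be used unchanged after the
 * single step, 1 when the generic fix-up path has to run. */
static int needs_rip_fixup(unsigned char opcode)
{
        switch (opcode) {
        case 0xc3:      /* ret        */
        case 0xcb:      /* lret       */
        case 0xc2:      /* ret imm16  */
        case 0xca:      /* lret imm16 */
                return 0;       /* rip came from the stack, already correct */
        default:
                return 1;       /* e.g. 0xe8 call rel32: the pushed return
                                   address refers to the copied slot and
                                   must be rewritten */
        }
}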
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
index c2ffea8845ed..bac195c74bcc 100644
--- a/arch/x86_64/kernel/module.c
+++ b/arch/x86_64/kernel/module.c
@@ -30,9 +30,12 @@ | |||
30 | 30 | ||
31 | #define DEBUGP(fmt...) | 31 | #define DEBUGP(fmt...) |
32 | 32 | ||
33 | #ifndef CONFIG_UML | ||
33 | void module_free(struct module *mod, void *module_region) | 34 | void module_free(struct module *mod, void *module_region) |
34 | { | 35 | { |
35 | vfree(module_region); | 36 | vfree(module_region); |
37 | /* FIXME: If module_region == mod->init_region, trim exception | ||
38 | table entries. */ | ||
36 | } | 39 | } |
37 | 40 | ||
38 | void *module_alloc(unsigned long size) | 41 | void *module_alloc(unsigned long size) |
@@ -51,6 +54,7 @@ void *module_alloc(unsigned long size) | |||
51 | 54 | ||
52 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); | 55 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); |
53 | } | 56 | } |
57 | #endif | ||
54 | 58 | ||
55 | /* We don't need anything special. */ | 59 | /* We don't need anything special. */ |
56 | int module_frob_arch_sections(Elf_Ehdr *hdr, | 60 | int module_frob_arch_sections(Elf_Ehdr *hdr, |
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 7ec031c6ca10..f86d9db94bfc 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -107,6 +107,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) | |||
107 | static void __init MP_processor_info (struct mpc_config_processor *m) | 107 | static void __init MP_processor_info (struct mpc_config_processor *m) |
108 | { | 108 | { |
109 | int ver; | 109 | int ver; |
110 | static int found_bsp=0; | ||
110 | 111 | ||
111 | if (!(m->mpc_cpuflag & CPU_ENABLED)) | 112 | if (!(m->mpc_cpuflag & CPU_ENABLED)) |
112 | return; | 113 | return; |
@@ -126,11 +127,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m) | |||
126 | " Processor ignored.\n", NR_CPUS); | 127 | " Processor ignored.\n", NR_CPUS); |
127 | return; | 128 | return; |
128 | } | 129 | } |
129 | if (num_processors >= maxcpus) { | ||
130 | printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." | ||
131 | " Processor ignored.\n", maxcpus); | ||
132 | return; | ||
133 | } | ||
134 | 130 | ||
135 | num_processors++; | 131 | num_processors++; |
136 | 132 | ||
@@ -150,7 +146,19 @@ static void __init MP_processor_info (struct mpc_config_processor *m) | |||
150 | ver = 0x10; | 146 | ver = 0x10; |
151 | } | 147 | } |
152 | apic_version[m->mpc_apicid] = ver; | 148 | apic_version[m->mpc_apicid] = ver; |
153 | bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; | 149 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { |
150 | /* | ||
151 | * bios_cpu_apicid is required to have processors listed | ||
152 | * in same order as logical cpu numbers. Hence the first | ||
153 | * entry is BSP, and so on. | ||
154 | */ | ||
155 | bios_cpu_apicid[0] = m->mpc_apicid; | ||
156 | x86_cpu_to_apicid[0] = m->mpc_apicid; | ||
157 | found_bsp = 1; | ||
158 | } else { | ||
159 | bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; | ||
160 | x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; | ||
161 | } | ||
154 | } | 162 | } |
155 | 163 | ||
156 | static void __init MP_bus_info (struct mpc_config_bus *m) | 164 | static void __init MP_bus_info (struct mpc_config_bus *m) |
@@ -759,7 +767,7 @@ void __init mp_register_ioapic ( | |||
759 | mp_ioapics[idx].mpc_apicaddr = address; | 767 | mp_ioapics[idx].mpc_apicaddr = address; |
760 | 768 | ||
761 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | 769 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); |
762 | mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); | 770 | mp_ioapics[idx].mpc_apicid = id; |
763 | mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); | 771 | mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); |
764 | 772 | ||
765 | /* | 773 | /* |
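
The rewritten MP_processor_info() guarantees that bios_cpu_apicid[] and x86_cpu_to_apicid[] are indexed in logical CPU order with the boot processor in slot 0, even when the MP table lists the BSP somewhere in the middle; every non-BSP entry is shifted down by found_bsp so the arrays stay dense. A standalone sketch of that indexing rule (hypothetical array, same arithmetic as the hunk above):

/* num_processors already counts this entry; found_bsp becomes 1 once
 * the boot processor has claimed slot 0. */
static void record_apicid(unsigned char apicid, int is_bsp,
                          int num_processors, int *found_bsp,
                          unsigned char *bios_cpu_apicid)
{
        if (is_bsp) {
                bios_cpu_apicid[0] = apicid;
                *found_bsp = 1;
        } else {
                /* APs seen before the BSP land in slots 1, 2, ...;
                 * once the BSP is known the offset closes the gap. */
                bios_cpu_apicid[num_processors - *found_bsp] = apicid;
        }
}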
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 61de0b34a01e..31c0f2e6ac91 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -33,6 +33,7 @@ | |||
33 | #include <asm/msr.h> | 33 | #include <asm/msr.h> |
34 | #include <asm/proto.h> | 34 | #include <asm/proto.h> |
35 | #include <asm/kdebug.h> | 35 | #include <asm/kdebug.h> |
36 | #include <asm/local.h> | ||
36 | 37 | ||
37 | /* | 38 | /* |
38 | * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: | 39 | * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: |
@@ -59,7 +60,8 @@ int panic_on_timeout; | |||
59 | 60 | ||
60 | unsigned int nmi_watchdog = NMI_DEFAULT; | 61 | unsigned int nmi_watchdog = NMI_DEFAULT; |
61 | static unsigned int nmi_hz = HZ; | 62 | static unsigned int nmi_hz = HZ; |
62 | unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ | 63 | static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ |
64 | static unsigned int nmi_p4_cccr_val; | ||
63 | 65 | ||
64 | /* Note that these events don't tick when the CPU idles. This means | 66 | /* Note that these events don't tick when the CPU idles. This means |
65 | the frequency varies with CPU load. */ | 67 | the frequency varies with CPU load. */ |
@@ -71,61 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ | |||
71 | #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 | 73 | #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 |
72 | #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING | 74 | #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING |
73 | 75 | ||
74 | #define P6_EVNTSEL0_ENABLE (1 << 22) | 76 | #define MSR_P4_MISC_ENABLE 0x1A0 |
75 | #define P6_EVNTSEL_INT (1 << 20) | 77 | #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) |
76 | #define P6_EVNTSEL_OS (1 << 17) | 78 | #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) |
77 | #define P6_EVNTSEL_USR (1 << 16) | 79 | #define MSR_P4_PERFCTR0 0x300 |
78 | #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 | 80 | #define MSR_P4_CCCR0 0x360 |
79 | #define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED | 81 | #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) |
82 | #define P4_ESCR_OS (1<<3) | ||
83 | #define P4_ESCR_USR (1<<2) | ||
84 | #define P4_CCCR_OVF_PMI0 (1<<26) | ||
85 | #define P4_CCCR_OVF_PMI1 (1<<27) | ||
86 | #define P4_CCCR_THRESHOLD(N) ((N)<<20) | ||
87 | #define P4_CCCR_COMPLEMENT (1<<19) | ||
88 | #define P4_CCCR_COMPARE (1<<18) | ||
89 | #define P4_CCCR_REQUIRED (3<<16) | ||
90 | #define P4_CCCR_ESCR_SELECT(N) ((N)<<13) | ||
91 | #define P4_CCCR_ENABLE (1<<12) | ||
92 | /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter | ||
93 | CRU_ESCR0 (with any non-null event selector) through a complemented | ||
94 | max threshold. [IA32-Vol3, Section 14.9.9] */ | ||
95 | #define MSR_P4_IQ_COUNTER0 0x30C | ||
96 | #define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) | ||
97 | #define P4_NMI_IQ_CCCR0 \ | ||
98 | (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ | ||
99 | P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) | ||
100 | |||
101 | static __init inline int nmi_known_cpu(void) | ||
102 | { | ||
103 | switch (boot_cpu_data.x86_vendor) { | ||
104 | case X86_VENDOR_AMD: | ||
105 | return boot_cpu_data.x86 == 15; | ||
106 | case X86_VENDOR_INTEL: | ||
107 | return boot_cpu_data.x86 == 15; | ||
108 | } | ||
109 | return 0; | ||
110 | } | ||
80 | 111 | ||
81 | /* Run after command line and cpu_init init, but before all other checks */ | 112 | /* Run after command line and cpu_init init, but before all other checks */ |
82 | void __init nmi_watchdog_default(void) | 113 | void __init nmi_watchdog_default(void) |
83 | { | 114 | { |
84 | if (nmi_watchdog != NMI_DEFAULT) | 115 | if (nmi_watchdog != NMI_DEFAULT) |
85 | return; | 116 | return; |
86 | 117 | if (nmi_known_cpu()) | |
87 | /* For some reason the IO APIC watchdog doesn't work on the AMD | 118 | nmi_watchdog = NMI_LOCAL_APIC; |
88 | 8111 chipset. For now switch to local APIC mode using | 119 | else |
89 | perfctr0 there. On Intel CPUs we don't have code to handle | ||
90 | the perfctr and the IO-APIC seems to work, so use that. */ | ||
91 | |||
92 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | ||
93 | nmi_watchdog = NMI_LOCAL_APIC; | ||
94 | printk(KERN_INFO | ||
95 | "Using local APIC NMI watchdog using perfctr0\n"); | ||
96 | } else { | ||
97 | printk(KERN_INFO "Using IO APIC NMI watchdog\n"); | ||
98 | nmi_watchdog = NMI_IO_APIC; | 120 | nmi_watchdog = NMI_IO_APIC; |
99 | } | ||
100 | } | 121 | } |
101 | 122 | ||
102 | /* Why is there no CPUID flag for this? */ | 123 | #ifdef CONFIG_SMP |
103 | static __init int cpu_has_lapic(void) | 124 | /* The performance counters used by NMI_LOCAL_APIC don't trigger when |
125 | * the CPU is idle. To make sure the NMI watchdog really ticks on all | ||
126 | * CPUs during the test make them busy. | ||
127 | */ | ||
128 | static __init void nmi_cpu_busy(void *data) | ||
104 | { | 129 | { |
105 | switch (boot_cpu_data.x86_vendor) { | 130 | volatile int *endflag = data; |
106 | case X86_VENDOR_INTEL: | 131 | local_irq_enable(); |
107 | case X86_VENDOR_AMD: | 132 | /* Intentionally don't use cpu_relax here. This is |
108 | return boot_cpu_data.x86 >= 6; | 133 | to make sure that the performance counter really ticks, |
109 | /* .... add more cpus here or find a different way to figure this out. */ | 134 | even if there is a simulator or similar that catches the |
110 | default: | 135 | pause instruction. On a real HT machine this is fine because |
111 | return 0; | 136 | all other CPUs are busy with "useless" delay loops and don't |
112 | } | 137 | care if they get somewhat less cycles. */ |
138 | while (*endflag == 0) | ||
139 | barrier(); | ||
113 | } | 140 | } |
141 | #endif | ||
114 | 142 | ||
115 | static int __init check_nmi_watchdog (void) | 143 | int __init check_nmi_watchdog (void) |
116 | { | 144 | { |
117 | int counts[NR_CPUS]; | 145 | volatile int endflag = 0; |
146 | int *counts; | ||
118 | int cpu; | 147 | int cpu; |
119 | 148 | ||
120 | if (nmi_watchdog == NMI_NONE) | 149 | counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); |
121 | return 0; | 150 | if (!counts) |
151 | return -1; | ||
122 | 152 | ||
123 | if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { | 153 | printk(KERN_INFO "testing NMI watchdog ... "); |
124 | nmi_watchdog = NMI_NONE; | ||
125 | return -1; | ||
126 | } | ||
127 | 154 | ||
128 | printk(KERN_INFO "Testing NMI watchdog ... "); | 155 | if (nmi_watchdog == NMI_LOCAL_APIC) |
156 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); | ||
129 | 157 | ||
130 | for (cpu = 0; cpu < NR_CPUS; cpu++) | 158 | for (cpu = 0; cpu < NR_CPUS; cpu++) |
131 | counts[cpu] = cpu_pda[cpu].__nmi_count; | 159 | counts[cpu] = cpu_pda[cpu].__nmi_count; |
@@ -133,15 +161,22 @@ static int __init check_nmi_watchdog (void) | |||
133 | mdelay((10*1000)/nmi_hz); // wait 10 ticks | 161 | mdelay((10*1000)/nmi_hz); // wait 10 ticks |
134 | 162 | ||
135 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 163 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
164 | if (!cpu_online(cpu)) | ||
165 | continue; | ||
136 | if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { | 166 | if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { |
137 | printk("CPU#%d: NMI appears to be stuck (%d)!\n", | 167 | endflag = 1; |
168 | printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", | ||
138 | cpu, | 169 | cpu, |
170 | counts[cpu], | ||
139 | cpu_pda[cpu].__nmi_count); | 171 | cpu_pda[cpu].__nmi_count); |
140 | nmi_active = 0; | 172 | nmi_active = 0; |
141 | lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; | 173 | lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; |
174 | nmi_perfctr_msr = 0; | ||
175 | kfree(counts); | ||
142 | return -1; | 176 | return -1; |
143 | } | 177 | } |
144 | } | 178 | } |
179 | endflag = 1; | ||
145 | printk("OK.\n"); | 180 | printk("OK.\n"); |
146 | 181 | ||
147 | /* now that we know it works we can reduce NMI frequency to | 182 | /* now that we know it works we can reduce NMI frequency to |
@@ -149,10 +184,9 @@ static int __init check_nmi_watchdog (void) | |||
149 | if (nmi_watchdog == NMI_LOCAL_APIC) | 184 | if (nmi_watchdog == NMI_LOCAL_APIC) |
150 | nmi_hz = 1; | 185 | nmi_hz = 1; |
151 | 186 | ||
187 | kfree(counts); | ||
152 | return 0; | 188 | return 0; |
153 | } | 189 | } |
154 | /* Have this called later during boot so counters are updating */ | ||
155 | late_initcall(check_nmi_watchdog); | ||
156 | 190 | ||
157 | int __init setup_nmi_watchdog(char *str) | 191 | int __init setup_nmi_watchdog(char *str) |
158 | { | 192 | { |
@@ -170,7 +204,7 @@ int __init setup_nmi_watchdog(char *str) | |||
170 | 204 | ||
171 | if (nmi >= NMI_INVALID) | 205 | if (nmi >= NMI_INVALID) |
172 | return 0; | 206 | return 0; |
173 | nmi_watchdog = nmi; | 207 | nmi_watchdog = nmi; |
174 | return 1; | 208 | return 1; |
175 | } | 209 | } |
176 | 210 | ||
@@ -185,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void) | |||
185 | wrmsr(MSR_K7_EVNTSEL0, 0, 0); | 219 | wrmsr(MSR_K7_EVNTSEL0, 0, 0); |
186 | break; | 220 | break; |
187 | case X86_VENDOR_INTEL: | 221 | case X86_VENDOR_INTEL: |
188 | wrmsr(MSR_IA32_EVNTSEL0, 0, 0); | 222 | if (boot_cpu_data.x86 == 15) { |
223 | wrmsr(MSR_P4_IQ_CCCR0, 0, 0); | ||
224 | wrmsr(MSR_P4_CRU_ESCR0, 0, 0); | ||
225 | } | ||
189 | break; | 226 | break; |
190 | } | 227 | } |
191 | nmi_active = -1; | 228 | nmi_active = -1; |
@@ -253,7 +290,7 @@ void enable_timer_nmi_watchdog(void) | |||
253 | 290 | ||
254 | static int nmi_pm_active; /* nmi_active before suspend */ | 291 | static int nmi_pm_active; /* nmi_active before suspend */ |
255 | 292 | ||
256 | static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) | 293 | static int lapic_nmi_suspend(struct sys_device *dev, u32 state) |
257 | { | 294 | { |
258 | nmi_pm_active = nmi_active; | 295 | nmi_pm_active = nmi_active; |
259 | disable_lapic_nmi_watchdog(); | 296 | disable_lapic_nmi_watchdog(); |
@@ -300,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs); | |||
300 | * Original code written by Keith Owens. | 337 | * Original code written by Keith Owens. |
301 | */ | 338 | */ |
302 | 339 | ||
340 | static void clear_msr_range(unsigned int base, unsigned int n) | ||
341 | { | ||
342 | unsigned int i; | ||
343 | |||
344 | for(i = 0; i < n; ++i) | ||
345 | wrmsr(base+i, 0, 0); | ||
346 | } | ||
347 | |||
303 | static void setup_k7_watchdog(void) | 348 | static void setup_k7_watchdog(void) |
304 | { | 349 | { |
305 | int i; | 350 | int i; |
306 | unsigned int evntsel; | 351 | unsigned int evntsel; |
307 | 352 | ||
308 | /* No check, so can start with slow frequency */ | ||
309 | nmi_hz = 1; | ||
310 | |||
311 | /* XXX should check these in EFER */ | ||
312 | |||
313 | nmi_perfctr_msr = MSR_K7_PERFCTR0; | 353 | nmi_perfctr_msr = MSR_K7_PERFCTR0; |
314 | 354 | ||
315 | for(i = 0; i < 4; ++i) { | 355 | for(i = 0; i < 4; ++i) { |
316 | /* Simulator may not support it */ | 356 | /* Simulator may not support it */ |
317 | if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) | 357 | if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { |
358 | nmi_perfctr_msr = 0; | ||
318 | return; | 359 | return; |
360 | } | ||
319 | wrmsrl(MSR_K7_PERFCTR0+i, 0UL); | 361 | wrmsrl(MSR_K7_PERFCTR0+i, 0UL); |
320 | } | 362 | } |
321 | 363 | ||
@@ -325,12 +367,54 @@ static void setup_k7_watchdog(void) | |||
325 | | K7_NMI_EVENT; | 367 | | K7_NMI_EVENT; |
326 | 368 | ||
327 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); | 369 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); |
328 | wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); | 370 | wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); |
329 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 371 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
330 | evntsel |= K7_EVNTSEL_ENABLE; | 372 | evntsel |= K7_EVNTSEL_ENABLE; |
331 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); | 373 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); |
332 | } | 374 | } |
333 | 375 | ||
376 | |||
377 | static int setup_p4_watchdog(void) | ||
378 | { | ||
379 | unsigned int misc_enable, dummy; | ||
380 | |||
381 | rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); | ||
382 | if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) | ||
383 | return 0; | ||
384 | |||
385 | nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; | ||
386 | nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; | ||
387 | #ifdef CONFIG_SMP | ||
388 | if (smp_num_siblings == 2) | ||
389 | nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; | ||
390 | #endif | ||
391 | |||
392 | if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) | ||
393 | clear_msr_range(0x3F1, 2); | ||
394 | /* MSR 0x3F0 seems to have a default value of 0xFC00, but current | ||
395 | docs doesn't fully define it, so leave it alone for now. */ | ||
396 | if (boot_cpu_data.x86_model >= 0x3) { | ||
397 | /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ | ||
398 | clear_msr_range(0x3A0, 26); | ||
399 | clear_msr_range(0x3BC, 3); | ||
400 | } else { | ||
401 | clear_msr_range(0x3A0, 31); | ||
402 | } | ||
403 | clear_msr_range(0x3C0, 6); | ||
404 | clear_msr_range(0x3C8, 6); | ||
405 | clear_msr_range(0x3E0, 2); | ||
406 | clear_msr_range(MSR_P4_CCCR0, 18); | ||
407 | clear_msr_range(MSR_P4_PERFCTR0, 18); | ||
408 | |||
409 | wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); | ||
410 | wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); | ||
411 | Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); | ||
412 | wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); | ||
413 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
414 | wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); | ||
415 | return 1; | ||
416 | } | ||
417 | |||
334 | void setup_apic_nmi_watchdog(void) | 418 | void setup_apic_nmi_watchdog(void) |
335 | { | 419 | { |
336 | switch (boot_cpu_data.x86_vendor) { | 420 | switch (boot_cpu_data.x86_vendor) { |
@@ -341,6 +425,13 @@ void setup_apic_nmi_watchdog(void) | |||
341 | return; | 425 | return; |
342 | setup_k7_watchdog(); | 426 | setup_k7_watchdog(); |
343 | break; | 427 | break; |
428 | case X86_VENDOR_INTEL: | ||
429 | if (boot_cpu_data.x86 != 15) | ||
430 | return; | ||
431 | if (!setup_p4_watchdog()) | ||
432 | return; | ||
433 | break; | ||
434 | |||
344 | default: | 435 | default: |
345 | return; | 436 | return; |
346 | } | 437 | } |
@@ -355,56 +446,67 @@ void setup_apic_nmi_watchdog(void) | |||
355 | * | 446 | * |
356 | * as these watchdog NMI IRQs are generated on every CPU, we only | 447 | * as these watchdog NMI IRQs are generated on every CPU, we only |
357 | * have to check the current processor. | 448 | * have to check the current processor. |
358 | * | ||
359 | * since NMIs don't listen to _any_ locks, we have to be extremely | ||
360 | * careful not to rely on unsafe variables. The printk might lock | ||
361 | * up though, so we have to break up any console locks first ... | ||
362 | * [when there will be more tty-related locks, break them up | ||
363 | * here too!] | ||
364 | */ | 449 | */ |
365 | 450 | ||
366 | static unsigned int | 451 | static DEFINE_PER_CPU(unsigned, last_irq_sum); |
367 | last_irq_sums [NR_CPUS], | 452 | static DEFINE_PER_CPU(local_t, alert_counter); |
368 | alert_counter [NR_CPUS]; | 453 | static DEFINE_PER_CPU(int, nmi_touch); |
369 | 454 | ||
370 | void touch_nmi_watchdog (void) | 455 | void touch_nmi_watchdog (void) |
371 | { | 456 | { |
372 | int i; | 457 | int i; |
373 | 458 | ||
374 | /* | 459 | /* |
375 | * Just reset the alert counters, (other CPUs might be | 460 | * Tell other CPUs to reset their alert counters. We cannot |
376 | * spinning on locks we hold): | 461 | * do it ourselves because the alert count increase is not |
462 | * atomic. | ||
377 | */ | 463 | */ |
378 | for (i = 0; i < NR_CPUS; i++) | 464 | for (i = 0; i < NR_CPUS; i++) |
379 | alert_counter[i] = 0; | 465 | per_cpu(nmi_touch, i) = 1; |
380 | } | 466 | } |
381 | 467 | ||
382 | void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) | 468 | void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) |
383 | { | 469 | { |
384 | int sum, cpu; | 470 | int sum; |
471 | int touched = 0; | ||
385 | 472 | ||
386 | cpu = safe_smp_processor_id(); | ||
387 | sum = read_pda(apic_timer_irqs); | 473 | sum = read_pda(apic_timer_irqs); |
388 | if (last_irq_sums[cpu] == sum) { | 474 | if (__get_cpu_var(nmi_touch)) { |
475 | __get_cpu_var(nmi_touch) = 0; | ||
476 | touched = 1; | ||
477 | } | ||
478 | if (!touched && __get_cpu_var(last_irq_sum) == sum) { | ||
389 | /* | 479 | /* |
390 | * Ayiee, looks like this CPU is stuck ... | 480 | * Ayiee, looks like this CPU is stuck ... |
391 | * wait a few IRQs (5 seconds) before doing the oops ... | 481 | * wait a few IRQs (5 seconds) before doing the oops ... |
392 | */ | 482 | */ |
393 | alert_counter[cpu]++; | 483 | local_inc(&__get_cpu_var(alert_counter)); |
394 | if (alert_counter[cpu] == 5*nmi_hz) { | 484 | if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { |
395 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | 485 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) |
396 | == NOTIFY_STOP) { | 486 | == NOTIFY_STOP) { |
397 | alert_counter[cpu] = 0; | 487 | local_set(&__get_cpu_var(alert_counter), 0); |
398 | return; | 488 | return; |
399 | } | 489 | } |
400 | die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); | 490 | die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); |
401 | } | 491 | } |
402 | } else { | 492 | } else { |
403 | last_irq_sums[cpu] = sum; | 493 | __get_cpu_var(last_irq_sum) = sum; |
404 | alert_counter[cpu] = 0; | 494 | local_set(&__get_cpu_var(alert_counter), 0); |
405 | } | 495 | } |
406 | if (nmi_perfctr_msr) | 496 | if (nmi_perfctr_msr) { |
497 | if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { | ||
498 | /* | ||
499 | * P4 quirks: | ||
500 | * - An overflown perfctr will assert its interrupt | ||
501 | * until the OVF flag in its CCCR is cleared. | ||
502 | * - LVTPC is masked on interrupt and must be | ||
503 | * unmasked by the LVTPC handler. | ||
504 | */ | ||
505 | wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); | ||
506 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
507 | } | ||
407 | wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); | 508 | wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); |
509 | } | ||
408 | } | 510 | } |
409 | 511 | ||
410 | static int dummy_nmi_callback(struct pt_regs * regs, int cpu) | 512 | static int dummy_nmi_callback(struct pt_regs * regs, int cpu) |
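
Both local-APIC watchdog flavours (the existing K7 path and the new P4 path) arm a performance counter with the negative of cpu_khz/nmi_hz*1000, i.e. the number of CPU cycles in one watchdog period, so the counter overflows and raises an NMI roughly nmi_hz times a second; check_nmi_watchdog() keeps the other CPUs spinning in nmi_cpu_busy() during the test precisely because these events stop ticking on an idle CPU. A tiny sketch of the reload arithmetic (user-space C, 2 GHz clock assumed for the numbers):

#include <stdio.h>

int main(void)
{
        unsigned long cpu_khz = 2000000;        /* assumption: a 2 GHz CPU     */
        unsigned int  nmi_hz  = 1;              /* one watchdog NMI per second */

        /* the same expression the patch writes into the perfctr MSRs */
        unsigned long period = cpu_khz / nmi_hz * 1000;

        printf("period = %lu cycles\n", period);        /* 2000000000 */
        printf("reload = %#lx\n", -period);             /* counts up, fires on overflow */
        return 0;
}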
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
new file mode 100644
index 000000000000..feb5f108dd26
--- /dev/null
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -0,0 +1,101 @@ | |||
1 | /* Ported over from i386 by AK, original copyright was: | ||
2 | * | ||
3 | * (C) Dominik Brodowski <linux@brodo.de> 2003 | ||
4 | * | ||
5 | * Driver to use the Power Management Timer (PMTMR) available in some | ||
6 | * southbridges as primary timing source for the Linux kernel. | ||
7 | * | ||
8 | * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, | ||
9 | * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. | ||
10 | * | ||
11 | * This file is licensed under the GPL v2. | ||
12 | * | ||
13 | * Dropped all the hardware bug workarounds for now. Hopefully they | ||
14 | * are not needed on 64bit chipsets. | ||
15 | */ | ||
16 | |||
17 | #include <linux/jiffies.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/cpumask.h> | ||
22 | #include <asm/io.h> | ||
23 | #include <asm/proto.h> | ||
24 | #include <asm/msr.h> | ||
25 | #include <asm/vsyscall.h> | ||
26 | |||
27 | /* The I/O port the PMTMR resides at. | ||
28 | * The location is detected during setup_arch(), | ||
29 | * in arch/i386/kernel/acpi/boot.c */ | ||
30 | u32 pmtmr_ioport; | ||
31 | |||
32 | /* value of the Power timer at last timer interrupt */ | ||
33 | static u32 offset_delay; | ||
34 | static u32 last_pmtmr_tick; | ||
35 | |||
36 | #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ | ||
37 | |||
38 | static inline u32 cyc2us(u32 cycles) | ||
39 | { | ||
40 | /* The Power Management Timer ticks at 3.579545 ticks per microsecond. | ||
41 | * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] | ||
42 | * | ||
43 | * Even with HZ = 100, delta is at maximum 35796 ticks, so it can | ||
44 | * easily be multiplied with 286 (=0x11E) without having to fear | ||
45 | * u32 overflows. | ||
46 | */ | ||
47 | cycles *= 286; | ||
48 | return (cycles >> 10); | ||
49 | } | ||
50 | |||
51 | int pmtimer_mark_offset(void) | ||
52 | { | ||
53 | static int first_run = 1; | ||
54 | unsigned long tsc; | ||
55 | u32 lost; | ||
56 | |||
57 | u32 tick = inl(pmtmr_ioport); | ||
58 | u32 delta; | ||
59 | |||
60 | delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); | ||
61 | |||
62 | last_pmtmr_tick = tick; | ||
63 | monotonic_base += delta * NSEC_PER_USEC; | ||
64 | |||
65 | delta += offset_delay; | ||
66 | |||
67 | lost = delta / (USEC_PER_SEC / HZ); | ||
68 | offset_delay = delta % (USEC_PER_SEC / HZ); | ||
69 | |||
70 | rdtscll(tsc); | ||
71 | vxtime.last_tsc = tsc - offset_delay * cpu_khz; | ||
72 | |||
73 | /* don't calculate delay for first run, | ||
74 | or if we've got less then a tick */ | ||
75 | if (first_run || (lost < 1)) { | ||
76 | first_run = 0; | ||
77 | offset_delay = 0; | ||
78 | } | ||
79 | |||
80 | return lost - 1; | ||
81 | } | ||
82 | |||
83 | unsigned int do_gettimeoffset_pm(void) | ||
84 | { | ||
85 | u32 now, offset, delta = 0; | ||
86 | |||
87 | offset = last_pmtmr_tick; | ||
88 | now = inl(pmtmr_ioport); | ||
89 | delta = (now - offset) & ACPI_PM_MASK; | ||
90 | |||
91 | return offset_delay + cyc2us(delta); | ||
92 | } | ||
93 | |||
94 | |||
95 | static int __init nopmtimer_setup(char *s) | ||
96 | { | ||
97 | pmtmr_ioport = 0; | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | __setup("nopmtimer", nopmtimer_setup); | ||
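
cyc2us() in the new pmtimer.c relies on the approximation 1 / 3.579545 MHz ≈ 286/1024 µs per PMTMR tick, which avoids a divide in the hot path; the comment's claims (0.024% error, no u32 overflow at HZ=100) can be verified with a few lines of arithmetic. A quick check, as a sketch (user-space C, values taken from the comment above):

#include <stdio.h>

int main(void)
{
        double exact  = 1.0 / 3.579545;         /* microseconds per tick       */
        double approx = 286.0 / 1024.0;         /* the multiply-and-shift used */
        double err    = (exact - approx) / exact * 100.0;

        /* worst-case delta between interrupts at HZ=100: one jiffy of ticks */
        unsigned int max_ticks = 3579545 / 100 + 1;

        printf("exact=%.9f approx=%.9f error=%.4f%%\n", exact, approx, err);
        printf("%u ticks * 286 = %u, comfortably below 2^32\n",
               max_ticks, max_ticks * 286);
        return 0;
}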
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index e26e86bb56fe..525f6a128a27 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -257,13 +257,13 @@ static int putreg(struct task_struct *child, | |||
257 | value &= 0xffff; | 257 | value &= 0xffff; |
258 | return 0; | 258 | return 0; |
259 | case offsetof(struct user_regs_struct,fs_base): | 259 | case offsetof(struct user_regs_struct,fs_base): |
260 | if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) | 260 | if (value >= TASK_SIZE) |
261 | return -EIO; | 261 | return -EIO; |
262 | child->thread.fs = value; | 262 | child->thread.fs = value; |
263 | return 0; | 263 | return 0; |
264 | case offsetof(struct user_regs_struct,gs_base): | 264 | case offsetof(struct user_regs_struct,gs_base): |
265 | if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) | 265 | if (value >= TASK_SIZE) |
266 | return -EIO; | 266 | return -EIO; |
267 | child->thread.gs = value; | 267 | child->thread.gs = value; |
268 | return 0; | 268 | return 0; |
269 | case offsetof(struct user_regs_struct, eflags): | 269 | case offsetof(struct user_regs_struct, eflags): |
@@ -277,6 +277,11 @@ static int putreg(struct task_struct *child, | |||
277 | return -EIO; | 277 | return -EIO; |
278 | value &= 0xffff; | 278 | value &= 0xffff; |
279 | break; | 279 | break; |
280 | case offsetof(struct user_regs_struct, rip): | ||
281 | /* Check if the new RIP address is canonical */ | ||
282 | if (value >= TASK_SIZE) | ||
283 | return -EIO; | ||
284 | break; | ||
280 | } | 285 | } |
281 | put_stack_long(child, regno - sizeof(struct pt_regs), value); | 286 | put_stack_long(child, regno - sizeof(struct pt_regs), value); |
282 | return 0; | 287 | return 0; |
@@ -375,7 +380,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data | |||
375 | break; | 380 | break; |
376 | 381 | ||
377 | switch (addr) { | 382 | switch (addr) { |
378 | case 0 ... sizeof(struct user_regs_struct): | 383 | case 0 ... sizeof(struct user_regs_struct) - sizeof(long): |
379 | tmp = getreg(child, addr); | 384 | tmp = getreg(child, addr); |
380 | break; | 385 | break; |
381 | case offsetof(struct user, u_debugreg[0]): | 386 | case offsetof(struct user, u_debugreg[0]): |
@@ -420,7 +425,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data | |||
420 | break; | 425 | break; |
421 | 426 | ||
422 | switch (addr) { | 427 | switch (addr) { |
423 | case 0 ... sizeof(struct user_regs_struct): | 428 | case 0 ... sizeof(struct user_regs_struct) - sizeof(long): |
424 | ret = putreg(child, addr, data); | 429 | ret = putreg(child, addr, data); |
425 | break; | 430 | break; |
426 | /* Disallows to set a breakpoint into the vsyscall */ | 431 | /* Disallows to set a breakpoint into the vsyscall */ |
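
The ptrace hunks close two holes: the PEEKUSR/POKEUSR register range used to accept an offset one word past the end of struct user_regs_struct, and the fs_base/gs_base (and now rip) checks only tested that bits 48-63 looked sign-extended, which still let a debugger plant kernel-half or non-canonical values. Comparing against TASK_SIZE is both simpler and stricter, since any user-half address is canonical by construction. A sketch of the two predicates (user-space C; the TASK_SIZE value here is an assumption for illustration):

#include <stdbool.h>
#include <stdint.h>

/* Assumed split for illustration; the real limit comes from the kernel
 * headers and depends on the traced task's ABI. */
#define TASK_SIZE_ASSUMED 0x0000800000000000ULL

/* Old test: top 16 bits all zero or all one. */
static bool looks_sign_extended(uint64_t addr)
{
        uint64_t top = addr >> 48;
        return top == 0 || top == 0xffff;
}

/* New test from the patch: a plain user-address bound, which also
 * guarantees the address is canonical. */
static bool is_valid_user_address(uint64_t addr)
{
        return addr < TASK_SIZE_ASSUMED;
}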
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 2129cf9ba6b2..99f038ede23c 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -719,7 +719,6 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c) | |||
719 | } | 719 | } |
720 | } | 720 | } |
721 | 721 | ||
722 | #ifdef CONFIG_SMP | ||
723 | /* | 722 | /* |
724 | * On a AMD dual core setup the lower bits of the APIC id distingush the cores. | 723 | * On a AMD dual core setup the lower bits of the APIC id distingush the cores. |
725 | * Assumes number of cores is a power of two. | 724 | * Assumes number of cores is a power of two. |
@@ -727,17 +726,26 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c) | |||
727 | static void __init amd_detect_cmp(struct cpuinfo_x86 *c) | 726 | static void __init amd_detect_cmp(struct cpuinfo_x86 *c) |
728 | { | 727 | { |
729 | #ifdef CONFIG_SMP | 728 | #ifdef CONFIG_SMP |
730 | int cpu = c->x86_apicid; | 729 | int cpu = smp_processor_id(); |
731 | int node = 0; | 730 | int node = 0; |
731 | unsigned bits; | ||
732 | if (c->x86_num_cores == 1) | 732 | if (c->x86_num_cores == 1) |
733 | return; | 733 | return; |
734 | cpu_core_id[cpu] = cpu >> hweight32(c->x86_num_cores - 1); | 734 | |
735 | bits = 0; | ||
736 | while ((1 << bits) < c->x86_num_cores) | ||
737 | bits++; | ||
738 | |||
739 | /* Low order bits define the core id (index of core in socket) */ | ||
740 | cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); | ||
741 | /* Convert the APIC ID into the socket ID */ | ||
742 | phys_proc_id[cpu] >>= bits; | ||
735 | 743 | ||
736 | #ifdef CONFIG_NUMA | 744 | #ifdef CONFIG_NUMA |
737 | /* When an ACPI SRAT table is available use the mappings from SRAT | 745 | /* When an ACPI SRAT table is available use the mappings from SRAT |
738 | instead. */ | 746 | instead. */ |
739 | if (acpi_numa <= 0) { | 747 | if (acpi_numa <= 0) { |
740 | node = cpu_core_id[cpu]; | 748 | node = phys_proc_id[cpu]; |
741 | if (!node_online(node)) | 749 | if (!node_online(node)) |
742 | node = first_node(node_online_map); | 750 | node = first_node(node_online_map); |
743 | cpu_to_node[cpu] = node; | 751 | cpu_to_node[cpu] = node; |
@@ -745,15 +753,11 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) | |||
745 | node = cpu_to_node[cpu]; | 753 | node = cpu_to_node[cpu]; |
746 | } | 754 | } |
747 | #endif | 755 | #endif |
756 | |||
748 | printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", | 757 | printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", |
749 | cpu, c->x86_num_cores, node, cpu_core_id[cpu]); | 758 | cpu, c->x86_num_cores, node, cpu_core_id[cpu]); |
750 | #endif | 759 | #endif |
751 | } | 760 | } |
752 | #else | ||
753 | static void __init amd_detect_cmp(struct cpuinfo_x86 *c) | ||
754 | { | ||
755 | } | ||
756 | #endif | ||
757 | 761 | ||
758 | static int __init init_amd(struct cpuinfo_x86 *c) | 762 | static int __init init_amd(struct cpuinfo_x86 *c) |
759 | { | 763 | { |
@@ -925,7 +929,6 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) | |||
925 | c->x86_clflush_size = 64; | 929 | c->x86_clflush_size = 64; |
926 | c->x86_cache_alignment = c->x86_clflush_size; | 930 | c->x86_cache_alignment = c->x86_clflush_size; |
927 | c->x86_num_cores = 1; | 931 | c->x86_num_cores = 1; |
928 | c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data; | ||
929 | c->extended_cpuid_level = 0; | 932 | c->extended_cpuid_level = 0; |
930 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | 933 | memset(&c->x86_capability, 0, sizeof c->x86_capability); |
931 | 934 | ||
@@ -954,11 +957,14 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) | |||
954 | } | 957 | } |
955 | if (c->x86_capability[0] & (1<<19)) | 958 | if (c->x86_capability[0] & (1<<19)) |
956 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | 959 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; |
957 | c->x86_apicid = misc >> 24; | ||
958 | } else { | 960 | } else { |
959 | /* Have CPUID level 0 only - unheard of */ | 961 | /* Have CPUID level 0 only - unheard of */ |
960 | c->x86 = 4; | 962 | c->x86 = 4; |
961 | } | 963 | } |
964 | |||
965 | #ifdef CONFIG_SMP | ||
966 | phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; | ||
967 | #endif | ||
962 | } | 968 | } |
963 | 969 | ||
964 | /* | 970 | /* |
@@ -1088,7 +1094,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) | |||
1088 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | 1094 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
1089 | 1095 | ||
1090 | /* Other (Linux-defined) */ | 1096 | /* Other (Linux-defined) */ |
1091 | "cxmmx", NULL, "cyrix_arr", "centaur_mcr", "k8c+", | 1097 | "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, |
1092 | "constant_tsc", NULL, NULL, | 1098 | "constant_tsc", NULL, NULL, |
1093 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | 1099 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
1094 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | 1100 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
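
The reworked amd_detect_cmp() splits the initial APIC id into a core index (the low log2(x86_num_cores) bits) and a socket id (the remaining high bits), instead of deriving both from the kernel's own CPU numbering; phys_proc_id[] is then shifted in place so it really is the socket number. A worked sketch of the bit split (user-space C, dual-core part assumed in the example):

#include <stdio.h>

static void split_apicid(unsigned int apicid, unsigned int num_cores,
                         unsigned int *core_id, unsigned int *socket_id)
{
        unsigned int bits = 0;

        /* same loop as the patch: smallest power of two >= num_cores */
        while ((1u << bits) < num_cores)
                bits++;

        *core_id   = apicid & ((1u << bits) - 1);       /* low bits  */
        *socket_id = apicid >> bits;                    /* high bits */
}

int main(void)
{
        unsigned int core, socket;

        split_apicid(5, 2, &core, &socket);     /* APIC id 5, 2 cores/socket */
        printf("socket %u core %u\n", socket, core);    /* socket 2, core 1  */
        return 0;
}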
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index d439ced150c6..3fdcdba0fec5 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -452,7 +452,9 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) | |||
452 | regs->rip -= 2; | 452 | regs->rip -= 2; |
453 | } | 453 | } |
454 | if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { | 454 | if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { |
455 | regs->rax = __NR_restart_syscall; | 455 | regs->rax = test_thread_flag(TIF_IA32) ? |
456 | __NR_ia32_restart_syscall : | ||
457 | __NR_restart_syscall; | ||
456 | regs->rip -= 2; | 458 | regs->rip -= 2; |
457 | } | 459 | } |
458 | } | 460 | } |
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543a..f1ec0f345941 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -56,6 +56,7 @@ | |||
56 | #include <asm/kdebug.h> | 56 | #include <asm/kdebug.h> |
57 | #include <asm/tlbflush.h> | 57 | #include <asm/tlbflush.h> |
58 | #include <asm/proto.h> | 58 | #include <asm/proto.h> |
59 | #include <asm/nmi.h> | ||
59 | 60 | ||
60 | /* Change for real CPU hotplug. Note other files need to be fixed | 61 | /* Change for real CPU hotplug. Note other files need to be fixed |
61 | first too. */ | 62 | first too. */ |
@@ -93,6 +94,7 @@ int smp_threads_ready; | |||
93 | 94 | ||
94 | cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; | 95 | cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; |
95 | cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; | 96 | cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; |
97 | EXPORT_SYMBOL(cpu_core_map); | ||
96 | 98 | ||
97 | /* | 99 | /* |
98 | * Trampoline 80x86 program as an array. | 100 | * Trampoline 80x86 program as an array. |
@@ -125,96 +127,210 @@ static void __cpuinit smp_store_cpu_info(int id) | |||
125 | 127 | ||
126 | *c = boot_cpu_data; | 128 | *c = boot_cpu_data; |
127 | identify_cpu(c); | 129 | identify_cpu(c); |
130 | print_cpu_info(c); | ||
128 | } | 131 | } |
129 | 132 | ||
130 | /* | 133 | /* |
131 | * Synchronize TSCs of CPUs | 134 | * New Funky TSC sync algorithm borrowed from IA64. |
135 | * Main advantage is that it doesn't reset the TSCs fully and | ||
136 | * in general looks more robust and it works better than my earlier | ||
137 | * attempts. I believe it was written by David Mosberger. Some minor | ||
138 | * adjustments for x86-64 by me -AK | ||
132 | * | 139 | * |
133 | * This new algorithm is less accurate than the old "zero TSCs" | 140 | * Original comment reproduced below. |
134 | * one, but we cannot zero TSCs anymore in the new hotplug CPU | 141 | * |
135 | * model. | 142 | * Synchronize TSC of the current (slave) CPU with the TSC of the |
143 | * MASTER CPU (normally the time-keeper CPU). We use a closed loop to | ||
144 | * eliminate the possibility of unaccounted-for errors (such as | ||
145 | * getting a machine check in the middle of a calibration step). The | ||
146 | * basic idea is for the slave to ask the master what itc value it has | ||
147 | * and to read its own itc before and after the master responds. Each | ||
148 | * iteration gives us three timestamps: | ||
149 | * | ||
150 | * slave master | ||
151 | * | ||
152 | * t0 ---\ | ||
153 | * ---\ | ||
154 | * ---> | ||
155 | * tm | ||
156 | * /--- | ||
157 | * /--- | ||
158 | * t1 <--- | ||
159 | * | ||
160 | * | ||
161 | * The goal is to adjust the slave's TSC such that tm falls exactly | ||
162 | * half-way between t0 and t1. If we achieve this, the clocks are | ||
163 | * synchronized provided the interconnect between the slave and the | ||
164 | * master is symmetric. Even if the interconnect were asymmetric, we | ||
165 | * would still know that the synchronization error is smaller than the | ||
166 | * roundtrip latency (t0 - t1). | ||
167 | * | ||
168 | * When the interconnect is quiet and symmetric, this lets us | ||
169 | * synchronize the TSC to within one or two cycles. However, we can | ||
170 | * only *guarantee* that the synchronization is accurate to within a | ||
171 | * round-trip time, which is typically in the range of several hundred | ||
172 | * cycles (e.g., ~500 cycles). In practice, this means that the TSCs | ||
173 | * are usually almost perfectly synchronized, but we shouldn't assume | ||
174 | * that the accuracy is much better than half a micro second or so. | ||
175 | * | ||
176 | * [there are other errors like the latency of RDTSC and of the | ||
177 | * WRMSR. These can also account to hundreds of cycles. So it's | ||
178 | * probably worse. It claims 153 cycles error on a dual Opteron, | ||
179 | * but I suspect the numbers are actually somewhat worse -AK] | ||
136 | */ | 180 | */ |
137 | 181 | ||
138 | static atomic_t __cpuinitdata tsc_flag; | 182 | #define MASTER 0 |
183 | #define SLAVE (SMP_CACHE_BYTES/8) | ||
184 | |||
185 | /* Intentionally don't use cpu_relax() while TSC synchronization | ||
186 | because we don't want to go into funky power save modi or cause | ||
187 | hypervisors to schedule us away. Going to sleep would likely affect | ||
188 | latency and low latency is the primary objective here. -AK */ | ||
189 | #define no_cpu_relax() barrier() | ||
190 | |||
139 | static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); | 191 | static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); |
140 | static unsigned long long __cpuinitdata bp_tsc, ap_tsc; | 192 | static volatile __cpuinitdata unsigned long go[SLAVE + 1]; |
193 | static int notscsync __cpuinitdata; | ||
194 | |||
195 | #undef DEBUG_TSC_SYNC | ||
141 | 196 | ||
142 | #define NR_LOOPS 5 | 197 | #define NUM_ROUNDS 64 /* magic value */ |
198 | #define NUM_ITERS 5 /* likewise */ | ||
143 | 199 | ||
144 | static void __cpuinit sync_tsc_bp_init(int init) | 200 | /* Callback on boot CPU */ |
201 | static __cpuinit void sync_master(void *arg) | ||
145 | { | 202 | { |
146 | if (init) | 203 | unsigned long flags, i; |
147 | _raw_spin_lock(&tsc_sync_lock); | 204 | |
148 | else | 205 | if (smp_processor_id() != boot_cpu_id) |
149 | _raw_spin_unlock(&tsc_sync_lock); | 206 | return; |
150 | atomic_set(&tsc_flag, 0); | 207 | |
208 | go[MASTER] = 0; | ||
209 | |||
210 | local_irq_save(flags); | ||
211 | { | ||
212 | for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { | ||
213 | while (!go[MASTER]) | ||
214 | no_cpu_relax(); | ||
215 | go[MASTER] = 0; | ||
216 | rdtscll(go[SLAVE]); | ||
217 | } | ||
218 | } | ||
219 | local_irq_restore(flags); | ||
151 | } | 220 | } |
152 | 221 | ||
153 | /* | 222 | /* |
154 | * Synchronize TSC on AP with BP. | 223 | * Return the number of cycles by which our tsc differs from the tsc |
224 | * on the master (time-keeper) CPU. A positive number indicates our | ||
225 | * tsc is ahead of the master, negative that it is behind. | ||
155 | */ | 226 | */ |
156 | static void __cpuinit __sync_tsc_ap(void) | 227 | static inline long |
228 | get_delta(long *rt, long *master) | ||
157 | { | 229 | { |
158 | if (!cpu_has_tsc) | 230 | unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; |
159 | return; | 231 | unsigned long tcenter, t0, t1, tm; |
160 | Dprintk("AP %d syncing TSC\n", smp_processor_id()); | 232 | int i; |
161 | 233 | ||
162 | while (atomic_read(&tsc_flag) != 0) | 234 | for (i = 0; i < NUM_ITERS; ++i) { |
163 | cpu_relax(); | 235 | rdtscll(t0); |
164 | atomic_inc(&tsc_flag); | 236 | go[MASTER] = 1; |
165 | mb(); | 237 | while (!(tm = go[SLAVE])) |
166 | _raw_spin_lock(&tsc_sync_lock); | 238 | no_cpu_relax(); |
167 | wrmsrl(MSR_IA32_TSC, bp_tsc); | 239 | go[SLAVE] = 0; |
168 | _raw_spin_unlock(&tsc_sync_lock); | 240 | rdtscll(t1); |
169 | rdtscll(ap_tsc); | 241 | |
170 | mb(); | 242 | if (t1 - t0 < best_t1 - best_t0) |
171 | atomic_inc(&tsc_flag); | 243 | best_t0 = t0, best_t1 = t1, best_tm = tm; |
172 | mb(); | 244 | } |
245 | |||
246 | *rt = best_t1 - best_t0; | ||
247 | *master = best_tm - best_t0; | ||
248 | |||
249 | /* average best_t0 and best_t1 without overflow: */ | ||
250 | tcenter = (best_t0/2 + best_t1/2); | ||
251 | if (best_t0 % 2 + best_t1 % 2 == 2) | ||
252 | ++tcenter; | ||
253 | return tcenter - best_tm; | ||
173 | } | 254 | } |
174 | 255 | ||
175 | static void __cpuinit sync_tsc_ap(void) | 256 | static __cpuinit void sync_tsc(void) |
176 | { | 257 | { |
177 | int i; | 258 | int i, done = 0; |
178 | for (i = 0; i < NR_LOOPS; i++) | 259 | long delta, adj, adjust_latency = 0; |
179 | __sync_tsc_ap(); | 260 | unsigned long flags, rt, master_time_stamp, bound; |
261 | #if DEBUG_TSC_SYNC | ||
262 | static struct syncdebug { | ||
263 | long rt; /* roundtrip time */ | ||
264 | long master; /* master's timestamp */ | ||
265 | long diff; /* difference between midpoint and master's timestamp */ | ||
266 | long lat; /* estimate of tsc adjustment latency */ | ||
267 | } t[NUM_ROUNDS] __cpuinitdata; | ||
268 | #endif | ||
269 | |||
270 | go[MASTER] = 1; | ||
271 | |||
272 | smp_call_function(sync_master, NULL, 1, 0); | ||
273 | |||
274 | while (go[MASTER]) /* wait for master to be ready */ | ||
275 | no_cpu_relax(); | ||
276 | |||
277 | spin_lock_irqsave(&tsc_sync_lock, flags); | ||
278 | { | ||
279 | for (i = 0; i < NUM_ROUNDS; ++i) { | ||
280 | delta = get_delta(&rt, &master_time_stamp); | ||
281 | if (delta == 0) { | ||
282 | done = 1; /* let's lock on to this... */ | ||
283 | bound = rt; | ||
284 | } | ||
285 | |||
286 | if (!done) { | ||
287 | unsigned long t; | ||
288 | if (i > 0) { | ||
289 | adjust_latency += -delta; | ||
290 | adj = -delta + adjust_latency/4; | ||
291 | } else | ||
292 | adj = -delta; | ||
293 | |||
294 | rdtscll(t); | ||
295 | wrmsrl(MSR_IA32_TSC, t + adj); | ||
296 | } | ||
297 | #if DEBUG_TSC_SYNC | ||
298 | t[i].rt = rt; | ||
299 | t[i].master = master_time_stamp; | ||
300 | t[i].diff = delta; | ||
301 | t[i].lat = adjust_latency/4; | ||
302 | #endif | ||
303 | } | ||
304 | } | ||
305 | spin_unlock_irqrestore(&tsc_sync_lock, flags); | ||
306 | |||
307 | #if DEBUG_TSC_SYNC | ||
308 | for (i = 0; i < NUM_ROUNDS; ++i) | ||
309 | printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", | ||
310 | t[i].rt, t[i].master, t[i].diff, t[i].lat); | ||
311 | #endif | ||
312 | |||
313 | printk(KERN_INFO | ||
314 | "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " | ||
315 | "maxerr %lu cycles)\n", | ||
316 | smp_processor_id(), boot_cpu_id, delta, rt); | ||
180 | } | 317 | } |
181 | 318 | ||
182 | /* | 319 | static void __cpuinit tsc_sync_wait(void) |
183 | * Synchronize TSC from BP to AP. | ||
184 | */ | ||
185 | static void __cpuinit __sync_tsc_bp(int cpu) | ||
186 | { | 320 | { |
187 | if (!cpu_has_tsc) | 321 | if (notscsync || !cpu_has_tsc) |
188 | return; | 322 | return; |
189 | 323 | printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), | |
190 | /* Wait for AP */ | 324 | boot_cpu_id); |
191 | while (atomic_read(&tsc_flag) == 0) | 325 | sync_tsc(); |
192 | cpu_relax(); | ||
193 | /* Save BPs TSC */ | ||
194 | sync_core(); | ||
195 | rdtscll(bp_tsc); | ||
196 | /* Don't do the sync core here to avoid too much latency. */ | ||
197 | mb(); | ||
198 | /* Start the AP */ | ||
199 | _raw_spin_unlock(&tsc_sync_lock); | ||
200 | /* Wait for AP again */ | ||
201 | while (atomic_read(&tsc_flag) < 2) | ||
202 | cpu_relax(); | ||
203 | rdtscl(bp_tsc); | ||
204 | barrier(); | ||
205 | } | 326 | } |
206 | 327 | ||
207 | static void __cpuinit sync_tsc_bp(int cpu) | 328 | static __init int notscsync_setup(char *s) |
208 | { | 329 | { |
209 | int i; | 330 | notscsync = 1; |
210 | for (i = 0; i < NR_LOOPS - 1; i++) { | 331 | return 0; |
211 | __sync_tsc_bp(cpu); | ||
212 | sync_tsc_bp_init(1); | ||
213 | } | ||
214 | __sync_tsc_bp(cpu); | ||
215 | printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n", | ||
216 | cpu, ap_tsc - bp_tsc); | ||
217 | } | 332 | } |
333 | __setup("notscsync", notscsync_setup); | ||
218 | 334 | ||
219 | static atomic_t init_deasserted __cpuinitdata; | 335 | static atomic_t init_deasserted __cpuinitdata; |
220 | 336 | ||
@@ -315,11 +431,6 @@ void __cpuinit start_secondary(void) | |||
315 | cpu_init(); | 431 | cpu_init(); |
316 | smp_callin(); | 432 | smp_callin(); |
317 | 433 | ||
318 | /* | ||
319 | * Synchronize the TSC with the BP | ||
320 | */ | ||
321 | sync_tsc_ap(); | ||
322 | |||
323 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ | 434 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ |
324 | barrier(); | 435 | barrier(); |
325 | 436 | ||
@@ -334,7 +445,6 @@ void __cpuinit start_secondary(void) | |||
334 | enable_8259A_irq(0); | 445 | enable_8259A_irq(0); |
335 | } | 446 | } |
336 | 447 | ||
337 | |||
338 | enable_APIC_timer(); | 448 | enable_APIC_timer(); |
339 | 449 | ||
340 | /* | 450 | /* |
@@ -343,6 +453,11 @@ void __cpuinit start_secondary(void) | |||
343 | cpu_set(smp_processor_id(), cpu_online_map); | 453 | cpu_set(smp_processor_id(), cpu_online_map); |
344 | mb(); | 454 | mb(); |
345 | 455 | ||
456 | /* Wait for TSC sync so that nothing gets scheduled before the clocks agree. | ||
457 | Interrupts are still processed in this window and may unfortunately | ||
458 | see an inconsistent time. */ | ||
456 | /* Wait for TSC sync so that nothing gets scheduled before the clocks agree. | ||
457 | Interrupts are still processed in this window and may unfortunately | ||
458 | see an inconsistent time. */ | ||
459 | tsc_sync_wait(); | ||
460 | |||
346 | cpu_idle(); | 461 | cpu_idle(); |
347 | } | 462 | } |
348 | 463 | ||
@@ -531,7 +646,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) | |||
531 | printk("failed fork for CPU %d\n", cpu); | 646 | printk("failed fork for CPU %d\n", cpu); |
532 | return PTR_ERR(idle); | 647 | return PTR_ERR(idle); |
533 | } | 648 | } |
534 | x86_cpu_to_apicid[cpu] = apicid; | ||
535 | 649 | ||
536 | cpu_pda[cpu].pcurrent = idle; | 650 | cpu_pda[cpu].pcurrent = idle; |
537 | 651 | ||
@@ -600,8 +714,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) | |||
600 | 714 | ||
601 | if (cpu_isset(cpu, cpu_callin_map)) { | 715 | if (cpu_isset(cpu, cpu_callin_map)) { |
602 | /* number CPUs logically, starting from 1 (BSP is 0) */ | 716 | /* number CPUs logically, starting from 1 (BSP is 0) */ |
603 | Dprintk("OK.\n"); | ||
604 | print_cpu_info(&cpu_data[cpu]); | ||
605 | Dprintk("CPU has booted.\n"); | 717 | Dprintk("CPU has booted.\n"); |
606 | } else { | 718 | } else { |
607 | boot_error = 1; | 719 | boot_error = 1; |
@@ -842,7 +954,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus) | |||
842 | GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); | 954 | GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); |
843 | /* Or can we switch back to PIC here? */ | 955 | /* Or can we switch back to PIC here? */ |
844 | } | 956 | } |
845 | x86_cpu_to_apicid[0] = boot_cpu_id; | ||
846 | 957 | ||
847 | /* | 958 | /* |
848 | * Now start the IO-APICs | 959 | * Now start the IO-APICs |
@@ -889,18 +1000,14 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
889 | printk("__cpu_up: bad cpu %d\n", cpu); | 1000 | printk("__cpu_up: bad cpu %d\n", cpu); |
890 | return -EINVAL; | 1001 | return -EINVAL; |
891 | } | 1002 | } |
892 | sync_tsc_bp_init(1); | ||
893 | 1003 | ||
894 | /* Boot it! */ | 1004 | /* Boot it! */ |
895 | err = do_boot_cpu(cpu, apicid); | 1005 | err = do_boot_cpu(cpu, apicid); |
896 | if (err < 0) { | 1006 | if (err < 0) { |
897 | sync_tsc_bp_init(0); | ||
898 | Dprintk("do_boot_cpu failed %d\n", err); | 1007 | Dprintk("do_boot_cpu failed %d\n", err); |
899 | return err; | 1008 | return err; |
900 | } | 1009 | } |
901 | 1010 | ||
902 | sync_tsc_bp(cpu); | ||
903 | |||
904 | /* Unleash the CPU! */ | 1011 | /* Unleash the CPU! */ |
905 | Dprintk("waiting for cpu %d\n", cpu); | 1012 | Dprintk("waiting for cpu %d\n", cpu); |
906 | 1013 | ||
@@ -923,4 +1030,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus) | |||
923 | 1030 | ||
924 | detect_siblings(); | 1031 | detect_siblings(); |
925 | time_init_gtod(); | 1032 | time_init_gtod(); |
1033 | |||
1034 | check_nmi_watchdog(); | ||
926 | } | 1035 | } |
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 0e3b60e1ac5b..735b6767c8ed 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c | |||
@@ -26,6 +26,8 @@ | |||
26 | #include <linux/sysdev.h> | 26 | #include <linux/sysdev.h> |
27 | #include <linux/bcd.h> | 27 | #include <linux/bcd.h> |
28 | #include <linux/kallsyms.h> | 28 | #include <linux/kallsyms.h> |
29 | #include <linux/acpi.h> | ||
30 | #include <acpi/achware.h> /* for PM timer frequency */ | ||
29 | #include <asm/8253pit.h> | 31 | #include <asm/8253pit.h> |
30 | #include <asm/pgtable.h> | 32 | #include <asm/pgtable.h> |
31 | #include <asm/vsyscall.h> | 33 | #include <asm/vsyscall.h> |
@@ -396,6 +398,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) | |||
396 | (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; | 398 | (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; |
397 | 399 | ||
398 | vxtime.last = offset; | 400 | vxtime.last = offset; |
401 | #ifdef CONFIG_X86_PM_TIMER | ||
402 | } else if (vxtime.mode == VXTIME_PMTMR) { | ||
403 | lost = pmtimer_mark_offset(); | ||
404 | #endif | ||
399 | } else { | 405 | } else { |
400 | offset = (((tsc - vxtime.last_tsc) * | 406 | offset = (((tsc - vxtime.last_tsc) * |
401 | vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); | 407 | vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); |
@@ -898,6 +904,13 @@ void __init time_init(void) | |||
898 | hpet_period; | 904 | hpet_period; |
899 | cpu_khz = hpet_calibrate_tsc(); | 905 | cpu_khz = hpet_calibrate_tsc(); |
900 | timename = "HPET"; | 906 | timename = "HPET"; |
907 | #ifdef CONFIG_X86_PM_TIMER | ||
908 | } else if (pmtmr_ioport) { | ||
909 | vxtime_hz = PM_TIMER_FREQUENCY; | ||
910 | timename = "PM"; | ||
911 | pit_init(); | ||
912 | cpu_khz = pit_calibrate_tsc(); | ||
913 | #endif | ||
901 | } else { | 914 | } else { |
902 | pit_init(); | 915 | pit_init(); |
903 | cpu_khz = pit_calibrate_tsc(); | 916 | cpu_khz = pit_calibrate_tsc(); |
@@ -923,35 +936,50 @@ void __init time_init(void) | |||
923 | } | 936 | } |
924 | 937 | ||
925 | /* | 938 | /* |
939 | * Make an educated guess whether the TSC is trustworthy and synchronized | ||
940 | * across all CPUs. | ||
941 | */ | ||
942 | static __init int unsynchronized_tsc(void) | ||
943 | { | ||
944 | #ifdef CONFIG_SMP | ||
945 | if (oem_force_hpet_timer()) | ||
946 | return 1; | ||
947 | /* Intel systems are normally all synchronized. Exceptions | ||
948 | are handled in the OEM check above. */ | ||
949 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
950 | return 0; | ||
951 | /* All in a single socket - should be synchronized */ | ||
952 | if (cpus_weight(cpu_core_map[0]) == num_online_cpus()) | ||
953 | return 0; | ||
954 | #endif | ||
955 | /* Assume multi-socket systems are not synchronized */ | ||
956 | return num_online_cpus() > 1; | ||
957 | } | ||
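
The heuristic just added makes its decision in a fixed order: an OEM quirk wins, then the Intel-vendor assumption, then the single-socket check, and only then the CPU count. Below is a stand-alone toy model of that ordering for a few example configurations; the struct and helper are hypothetical, whereas the real function consults oem_force_hpet_timer(), boot_cpu_data and cpu_core_map.

#include <stdio.h>

struct config {
	const char *name;
	int oem_forces_hpet;     /* e.g. IBM Summit2 */
	int intel;               /* boot CPU vendor is Intel */
	int single_socket;       /* all online CPUs share one core map */
	int online_cpus;
};

static int unsynchronized(const struct config *c)
{
	if (c->oem_forces_hpet)
		return 1;
	if (c->intel)
		return 0;        /* Intel SMP systems normally stay in sync */
	if (c->single_socket)
		return 0;        /* one package, one TSC domain */
	return c->online_cpus > 1;
}

int main(void)
{
	const struct config configs[] = {
		{ "UP box",                  0, 0, 1, 1 },
		{ "Intel 2-socket",          0, 1, 0, 2 },
		{ "AMD dual-core 1-socket",  0, 0, 1, 2 },
		{ "AMD 2-socket",            0, 0, 0, 2 },
	};

	for (unsigned i = 0; i < sizeof(configs) / sizeof(configs[0]); i++)
		printf("%-24s -> %s\n", configs[i].name,
		       unsynchronized(&configs[i]) ? "HPET/PM timer" : "TSC ok");
	return 0;
}
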
958 | |||
959 | /* | ||
926 | * Decide after all CPUs are booted what mode gettimeofday should use. | 960 | * Decide after all CPUs are booted what mode gettimeofday should use. |
927 | */ | 961 | */ |
928 | void __init time_init_gtod(void) | 962 | void __init time_init_gtod(void) |
929 | { | 963 | { |
930 | char *timetype; | 964 | char *timetype; |
931 | 965 | ||
932 | /* | 966 | if (unsynchronized_tsc()) |
933 | * AMD systems with more than one CPU don't have fully synchronized | ||
934 | * TSCs. Always use HPET gettimeofday for these, although it is slower. | ||
935 | * Intel SMP systems usually have synchronized TSCs, so use always | ||
936 | * the TSC. | ||
937 | * | ||
938 | * Exceptions: | ||
939 | * IBM Summit2 checked by oem_force_hpet_timer(). | ||
940 | * AMD dual core may also not need HPET. Check me. | ||
941 | * | ||
942 | * Can be turned off with "notsc". | ||
943 | */ | ||
944 | if (num_online_cpus() > 1 && | ||
945 | boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
946 | notsc = 1; | ||
947 | /* Some systems will want to disable TSC and use HPET. */ | ||
948 | if (oem_force_hpet_timer()) | ||
949 | notsc = 1; | 967 | notsc = 1; |
950 | if (vxtime.hpet_address && notsc) { | 968 | if (vxtime.hpet_address && notsc) { |
951 | timetype = "HPET"; | 969 | timetype = "HPET"; |
952 | vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; | 970 | vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; |
953 | vxtime.mode = VXTIME_HPET; | 971 | vxtime.mode = VXTIME_HPET; |
954 | do_gettimeoffset = do_gettimeoffset_hpet; | 972 | do_gettimeoffset = do_gettimeoffset_hpet; |
973 | #ifdef CONFIG_X86_PM_TIMER | ||
974 | /* Using PM for gettimeofday is quite slow, but we have no other | ||
975 | choice because the TSC is too unreliable on some systems. */ | ||
976 | } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { | ||
977 | timetype = "PM"; | ||
978 | do_gettimeoffset = do_gettimeoffset_pm; | ||
979 | vxtime.mode = VXTIME_PMTMR; | ||
980 | sysctl_vsyscall = 0; | ||
981 | printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); | ||
982 | #endif | ||
955 | } else { | 983 | } else { |
956 | timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; | 984 | timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; |
957 | vxtime.mode = VXTIME_TSC; | 985 | vxtime.mode = VXTIME_TSC; |
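
The new PM-timer branches above delegate the actual counter handling to pmtimer_mark_offset() and do_gettimeoffset_pm() in the new pmtimer.c, which is not shown in this section. As a sketch of the arithmetic such code needs: read the 24-bit ACPI PM counter (3.579545 MHz), take the wrapped difference since the last reading, and scale to microseconds. The helper names and the fake counter below are assumptions for illustration only, not the code in arch/x86_64/kernel/pmtimer.c.

#include <stdint.h>
#include <stdio.h>

#define PMTMR_HZ   3579545u            /* ACPI PM timer frequency, 3.579545 MHz */
#define PMTMR_MASK 0xFFFFFFu           /* 24-bit counter */

static uint32_t fake_counter;

static uint32_t read_pmtmr(void)
{
	/* Stand-in for inl(pmtmr_ioport); advances a fake counter by ~10 ms. */
	fake_counter = (fake_counter + 35795u) & PMTMR_MASK;
	return fake_counter;
}

static uint32_t last;

static uint32_t pm_elapsed_usec(void)
{
	uint32_t now = read_pmtmr();
	uint32_t delta = (now - last) & PMTMR_MASK;    /* masking handles 24-bit wrap */

	last = now;
	/* ticks -> microseconds; 64-bit intermediate avoids overflow */
	return (uint32_t)(((uint64_t)delta * 1000000u) / PMTMR_HZ);
}

int main(void)
{
	last = read_pmtmr();
	for (int i = 0; i < 3; i++)
		printf("elapsed: %u us\n", pm_elapsed_usec());
	return 0;
}
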
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b4b8dc59663a..2e5734425949 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c | |||
@@ -65,7 +65,7 @@ static force_inline void do_vgettimeofday(struct timeval * tv) | |||
65 | usec = (__xtime.tv_nsec / 1000) + | 65 | usec = (__xtime.tv_nsec / 1000) + |
66 | (__jiffies - __wall_jiffies) * (1000000 / HZ); | 66 | (__jiffies - __wall_jiffies) * (1000000 / HZ); |
67 | 67 | ||
68 | if (__vxtime.mode == VXTIME_TSC) { | 68 | if (__vxtime.mode != VXTIME_HPET) { |
69 | sync_core(); | 69 | sync_core(); |
70 | rdtscll(t); | 70 | rdtscll(t); |
71 | if (t < __vxtime.last_tsc) | 71 | if (t < __vxtime.last_tsc) |
@@ -217,8 +217,9 @@ static int __init vsyscall_init(void) | |||
217 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | 217 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); |
218 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); | 218 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); |
219 | map_vsyscall(); | 219 | map_vsyscall(); |
220 | sysctl_vsyscall = 1; | 220 | #ifdef CONFIG_SYSCTL |
221 | register_sysctl_table(kernel_root_table2, 0); | 221 | register_sysctl_table(kernel_root_table2, 0); |
222 | #endif | ||
222 | return 0; | 223 | return 0; |
223 | } | 224 | } |
224 | 225 | ||
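
Both the timer_interrupt TSC branch in time.c and the vsyscall path above turn "cycles since the last tick" into microseconds with the 32.32 fixed-point multiplier vxtime.tsc_quot. That multiplier is set up elsewhere in time.c (not in this section); the sketch below assumes it is roughly (1000 << 32) / cpu_khz, so the multiply-and-shift reduces to cycles * 1000 / cpu_khz.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cpu_khz  = 2400000;                    /* 2.4 GHz example */
	uint64_t tsc_quot = (1000ULL << 32) / cpu_khz;  /* usec per cycle, scaled by 2^32 */
	uint64_t cycles   = 1200000;                    /* cycles since the last tick */

	/* (cycles * tsc_quot) >> 32  ==  cycles * 1000 / cpu_khz  (truncated usec) */
	uint64_t usec = (cycles * tsc_quot) >> 32;

	printf("%llu cycles at %llu kHz -> %llu us\n",
	       (unsigned long long)cycles, (unsigned long long)cpu_khz,
	       (unsigned long long)usec);
	return 0;
}

For 1,200,000 cycles at 2.4 GHz this prints 499 us (the exact value 500 us, minus fixed-point truncation), which is small enough per tick that the error does not accumulate.
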
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 88626e626886..a43dedb58fa2 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c | |||
@@ -139,35 +139,23 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); | |||
139 | #undef memmove | 139 | #undef memmove |
140 | #undef memchr | 140 | #undef memchr |
141 | #undef strlen | 141 | #undef strlen |
142 | #undef strcpy | ||
143 | #undef strncmp | 142 | #undef strncmp |
144 | #undef strncpy | 143 | #undef strncpy |
145 | #undef strchr | 144 | #undef strchr |
146 | #undef strcmp | ||
147 | #undef strcpy | ||
148 | #undef strcat | ||
149 | #undef memcmp | ||
150 | 145 | ||
151 | extern void * memset(void *,int,__kernel_size_t); | 146 | extern void * memset(void *,int,__kernel_size_t); |
152 | extern size_t strlen(const char *); | 147 | extern size_t strlen(const char *); |
153 | extern void * memmove(void * dest,const void *src,size_t count); | 148 | extern void * memmove(void * dest,const void *src,size_t count); |
154 | extern char * strcpy(char * dest,const char *src); | ||
155 | extern int strcmp(const char * cs,const char * ct); | ||
156 | extern void *memchr(const void *s, int c, size_t n); | 149 | extern void *memchr(const void *s, int c, size_t n); |
157 | extern void * memcpy(void *,const void *,__kernel_size_t); | 150 | extern void * memcpy(void *,const void *,__kernel_size_t); |
158 | extern void * __memcpy(void *,const void *,__kernel_size_t); | 151 | extern void * __memcpy(void *,const void *,__kernel_size_t); |
159 | extern char * strcat(char *, const char *); | ||
160 | extern int memcmp(const void * cs,const void * ct,size_t count); | ||
161 | 152 | ||
162 | EXPORT_SYMBOL(memset); | 153 | EXPORT_SYMBOL(memset); |
163 | EXPORT_SYMBOL(strlen); | 154 | EXPORT_SYMBOL(strlen); |
164 | EXPORT_SYMBOL(memmove); | 155 | EXPORT_SYMBOL(memmove); |
165 | EXPORT_SYMBOL(strcpy); | ||
166 | EXPORT_SYMBOL(strncmp); | 156 | EXPORT_SYMBOL(strncmp); |
167 | EXPORT_SYMBOL(strncpy); | 157 | EXPORT_SYMBOL(strncpy); |
168 | EXPORT_SYMBOL(strchr); | 158 | EXPORT_SYMBOL(strchr); |
169 | EXPORT_SYMBOL(strcmp); | ||
170 | EXPORT_SYMBOL(strcat); | ||
171 | EXPORT_SYMBOL(strncat); | 159 | EXPORT_SYMBOL(strncat); |
172 | EXPORT_SYMBOL(memchr); | 160 | EXPORT_SYMBOL(memchr); |
173 | EXPORT_SYMBOL(strrchr); | 161 | EXPORT_SYMBOL(strrchr); |
@@ -175,7 +163,6 @@ EXPORT_SYMBOL(strnlen); | |||
175 | EXPORT_SYMBOL(memscan); | 163 | EXPORT_SYMBOL(memscan); |
176 | EXPORT_SYMBOL(memcpy); | 164 | EXPORT_SYMBOL(memcpy); |
177 | EXPORT_SYMBOL(__memcpy); | 165 | EXPORT_SYMBOL(__memcpy); |
178 | EXPORT_SYMBOL(memcmp); | ||
179 | 166 | ||
180 | #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM | 167 | #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM |
181 | /* prototypes are wrong, these are assembly with custom calling functions */ | 168 | /* prototypes are wrong, these are assembly with custom calling functions */ |