aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--arch/x86_64/kernel/Makefile1
-rw-r--r--arch/x86_64/kernel/apic.c5
-rw-r--r--arch/x86_64/kernel/entry.S11
-rw-r--r--arch/x86_64/kernel/io_apic.c70
-rw-r--r--arch/x86_64/kernel/mpparse.c22
-rw-r--r--arch/x86_64/kernel/nmi.c248
-rw-r--r--arch/x86_64/kernel/pmtimer.c101
-rw-r--r--arch/x86_64/kernel/ptrace.c13
-rw-r--r--arch/x86_64/kernel/setup.c17
-rw-r--r--arch/x86_64/kernel/smpboot.c262
-rw-r--r--arch/x86_64/kernel/time.c62
-rw-r--r--arch/x86_64/kernel/vsyscall.c5
12 files changed, 558 insertions, 259 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 0a3318e08ab6..5ca4a4598fda 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
28obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o 28obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
29obj-$(CONFIG_SWIOTLB) += swiotlb.o 29obj-$(CONFIG_SWIOTLB) += swiotlb.o
30obj-$(CONFIG_KPROBES) += kprobes.o 30obj-$(CONFIG_KPROBES) += kprobes.o
31obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o
31 32
32obj-$(CONFIG_MODULES) += module.o 33obj-$(CONFIG_MODULES) += module.o
33 34
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 7e13545748e0..f8e6cc4fecd4 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -33,6 +33,7 @@
33#include <asm/mpspec.h> 33#include <asm/mpspec.h>
34#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
35#include <asm/mach_apic.h> 35#include <asm/mach_apic.h>
36#include <asm/nmi.h>
36 37
37int apic_verbosity; 38int apic_verbosity;
38 39
@@ -925,7 +926,7 @@ __init int oem_force_hpet_timer(void)
925 unsigned id; 926 unsigned id;
926 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 927 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
927 928
928 bitmap_empty(clustermap, NUM_APIC_CLUSTERS); 929 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
929 930
930 for (i = 0; i < NR_CPUS; i++) { 931 for (i = 0; i < NR_CPUS; i++) {
931 id = bios_cpu_apicid[i]; 932 id = bios_cpu_apicid[i];
@@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void)
1056 nr_ioapics = 0; 1057 nr_ioapics = 0;
1057#endif 1058#endif
1058 setup_boot_APIC_clock(); 1059 setup_boot_APIC_clock();
1059 1060 check_nmi_watchdog();
1060 return 0; 1061 return 0;
1061} 1062}
1062 1063
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 1086b5fcac21..28817490fdc6 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -220,13 +220,18 @@ sysret_careful:
220 jmp sysret_check 220 jmp sysret_check
221 221
222 /* Handle a signal */ 222 /* Handle a signal */
223 /* edx: work flags (arg3) */
224sysret_signal: 223sysret_signal:
225 sti 224 sti
225 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
226 jz 1f
227
228 /* Really a signal */
229 /* edx: work flags (arg3) */
226 leaq do_notify_resume(%rip),%rax 230 leaq do_notify_resume(%rip),%rax
227 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 231 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
228 xorl %esi,%esi # oldset -> arg2 232 xorl %esi,%esi # oldset -> arg2
229 call ptregscall_common 233 call ptregscall_common
2341: movl $_TIF_NEED_RESCHED,%edi
230 jmp sysret_check 235 jmp sysret_check
231 236
232 /* Do syscall tracing */ 237 /* Do syscall tracing */
@@ -484,6 +489,8 @@ retint_careful:
484 jmp retint_check 489 jmp retint_check
485 490
486retint_signal: 491retint_signal:
492 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
493 jz retint_swapgs
487 sti 494 sti
488 SAVE_REST 495 SAVE_REST
489 movq $-1,ORIG_RAX(%rsp) 496 movq $-1,ORIG_RAX(%rsp)
@@ -492,8 +499,8 @@ retint_signal:
492 call do_notify_resume 499 call do_notify_resume
493 RESTORE_REST 500 RESTORE_REST
494 cli 501 cli
502 movl $_TIF_NEED_RESCHED,%edi
495 GET_THREAD_INFO(%rcx) 503 GET_THREAD_INFO(%rcx)
496 movl $_TIF_WORK_MASK,%edi
497 jmp retint_check 504 jmp retint_check
498 505
499#ifdef CONFIG_PREEMPT 506#ifdef CONFIG_PREEMPT
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 60be58617eb9..ac7684324954 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1804,76 +1804,6 @@ device_initcall(ioapic_init_sysfs);
1804 1804
1805#define IO_APIC_MAX_ID 0xFE 1805#define IO_APIC_MAX_ID 0xFE
1806 1806
1807int __init io_apic_get_unique_id (int ioapic, int apic_id)
1808{
1809 union IO_APIC_reg_00 reg_00;
1810 static physid_mask_t apic_id_map;
1811 unsigned long flags;
1812 int i = 0;
1813
1814 /*
1815 * The P4 platform supports up to 256 APIC IDs on two separate APIC
1816 * buses (one for LAPICs, one for IOAPICs), where predecessors only
1817 * supports up to 16 on one shared APIC bus.
1818 *
1819 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
1820 * advantage of new APIC bus architecture.
1821 */
1822
1823 if (physids_empty(apic_id_map))
1824 apic_id_map = phys_cpu_present_map;
1825
1826 spin_lock_irqsave(&ioapic_lock, flags);
1827 reg_00.raw = io_apic_read(ioapic, 0);
1828 spin_unlock_irqrestore(&ioapic_lock, flags);
1829
1830 if (apic_id >= IO_APIC_MAX_ID) {
1831 apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
1832 "%d\n", ioapic, apic_id, reg_00.bits.ID);
1833 apic_id = reg_00.bits.ID;
1834 }
1835
1836 /*
1837 * Every APIC in a system must have a unique ID or we get lots of nice
1838 * 'stuck on smp_invalidate_needed IPI wait' messages.
1839 */
1840 if (physid_isset(apic_id, apic_id_map)) {
1841
1842 for (i = 0; i < IO_APIC_MAX_ID; i++) {
1843 if (!physid_isset(i, apic_id_map))
1844 break;
1845 }
1846
1847 if (i == IO_APIC_MAX_ID)
1848 panic("Max apic_id exceeded!\n");
1849
1850 apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
1851 "trying %d\n", ioapic, apic_id, i);
1852
1853 apic_id = i;
1854 }
1855
1856 physid_set(apic_id, apic_id_map);
1857
1858 if (reg_00.bits.ID != apic_id) {
1859 reg_00.bits.ID = apic_id;
1860
1861 spin_lock_irqsave(&ioapic_lock, flags);
1862 io_apic_write(ioapic, 0, reg_00.raw);
1863 reg_00.raw = io_apic_read(ioapic, 0);
1864 spin_unlock_irqrestore(&ioapic_lock, flags);
1865
1866 /* Sanity check */
1867 if (reg_00.bits.ID != apic_id)
1868 panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
1869 }
1870
1871 apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
1872
1873 return apic_id;
1874}
1875
1876
1877int __init io_apic_get_version (int ioapic) 1807int __init io_apic_get_version (int ioapic)
1878{ 1808{
1879 union IO_APIC_reg_01 reg_01; 1809 union IO_APIC_reg_01 reg_01;
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 7ec031c6ca10..f86d9db94bfc 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -107,6 +107,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
107static void __init MP_processor_info (struct mpc_config_processor *m) 107static void __init MP_processor_info (struct mpc_config_processor *m)
108{ 108{
109 int ver; 109 int ver;
110 static int found_bsp=0;
110 111
111 if (!(m->mpc_cpuflag & CPU_ENABLED)) 112 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return; 113 return;
@@ -126,11 +127,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
126 " Processor ignored.\n", NR_CPUS); 127 " Processor ignored.\n", NR_CPUS);
127 return; 128 return;
128 } 129 }
129 if (num_processors >= maxcpus) {
130 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
131 " Processor ignored.\n", maxcpus);
132 return;
133 }
134 130
135 num_processors++; 131 num_processors++;
136 132
@@ -150,7 +146,19 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
150 ver = 0x10; 146 ver = 0x10;
151 } 147 }
152 apic_version[m->mpc_apicid] = ver; 148 apic_version[m->mpc_apicid] = ver;
153 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; 149 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
150 /*
151 * bios_cpu_apicid is required to have processors listed
152 * in same order as logical cpu numbers. Hence the first
153 * entry is BSP, and so on.
154 */
155 bios_cpu_apicid[0] = m->mpc_apicid;
156 x86_cpu_to_apicid[0] = m->mpc_apicid;
157 found_bsp = 1;
158 } else {
159 bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid;
160 x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid;
161 }
154} 162}
155 163
156static void __init MP_bus_info (struct mpc_config_bus *m) 164static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -759,7 +767,7 @@ void __init mp_register_ioapic (
759 mp_ioapics[idx].mpc_apicaddr = address; 767 mp_ioapics[idx].mpc_apicaddr = address;
760 768
761 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 769 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
762 mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); 770 mp_ioapics[idx].mpc_apicid = id;
763 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); 771 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
764 772
765 /* 773 /*
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 61de0b34a01e..31c0f2e6ac91 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -33,6 +33,7 @@
33#include <asm/msr.h> 33#include <asm/msr.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/kdebug.h> 35#include <asm/kdebug.h>
36#include <asm/local.h>
36 37
37/* 38/*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 39 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -59,7 +60,8 @@ int panic_on_timeout;
59 60
60unsigned int nmi_watchdog = NMI_DEFAULT; 61unsigned int nmi_watchdog = NMI_DEFAULT;
61static unsigned int nmi_hz = HZ; 62static unsigned int nmi_hz = HZ;
62unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ 63static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
64static unsigned int nmi_p4_cccr_val;
63 65
64/* Note that these events don't tick when the CPU idles. This means 66/* Note that these events don't tick when the CPU idles. This means
65 the frequency varies with CPU load. */ 67 the frequency varies with CPU load. */
@@ -71,61 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
71#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 73#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
72#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 74#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
73 75
74#define P6_EVNTSEL0_ENABLE (1 << 22) 76#define MSR_P4_MISC_ENABLE 0x1A0
75#define P6_EVNTSEL_INT (1 << 20) 77#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
76#define P6_EVNTSEL_OS (1 << 17) 78#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
77#define P6_EVNTSEL_USR (1 << 16) 79#define MSR_P4_PERFCTR0 0x300
78#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 80#define MSR_P4_CCCR0 0x360
79#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED 81#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
82#define P4_ESCR_OS (1<<3)
83#define P4_ESCR_USR (1<<2)
84#define P4_CCCR_OVF_PMI0 (1<<26)
85#define P4_CCCR_OVF_PMI1 (1<<27)
86#define P4_CCCR_THRESHOLD(N) ((N)<<20)
87#define P4_CCCR_COMPLEMENT (1<<19)
88#define P4_CCCR_COMPARE (1<<18)
89#define P4_CCCR_REQUIRED (3<<16)
90#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
91#define P4_CCCR_ENABLE (1<<12)
92/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
93 CRU_ESCR0 (with any non-null event selector) through a complemented
94 max threshold. [IA32-Vol3, Section 14.9.9] */
95#define MSR_P4_IQ_COUNTER0 0x30C
96#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
97#define P4_NMI_IQ_CCCR0 \
98 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
99 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
100
101static __init inline int nmi_known_cpu(void)
102{
103 switch (boot_cpu_data.x86_vendor) {
104 case X86_VENDOR_AMD:
105 return boot_cpu_data.x86 == 15;
106 case X86_VENDOR_INTEL:
107 return boot_cpu_data.x86 == 15;
108 }
109 return 0;
110}
80 111
81/* Run after command line and cpu_init init, but before all other checks */ 112/* Run after command line and cpu_init init, but before all other checks */
82void __init nmi_watchdog_default(void) 113void __init nmi_watchdog_default(void)
83{ 114{
84 if (nmi_watchdog != NMI_DEFAULT) 115 if (nmi_watchdog != NMI_DEFAULT)
85 return; 116 return;
86 117 if (nmi_known_cpu())
87 /* For some reason the IO APIC watchdog doesn't work on the AMD 118 nmi_watchdog = NMI_LOCAL_APIC;
88 8111 chipset. For now switch to local APIC mode using 119 else
89 perfctr0 there. On Intel CPUs we don't have code to handle
90 the perfctr and the IO-APIC seems to work, so use that. */
91
92 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
93 nmi_watchdog = NMI_LOCAL_APIC;
94 printk(KERN_INFO
95 "Using local APIC NMI watchdog using perfctr0\n");
96 } else {
97 printk(KERN_INFO "Using IO APIC NMI watchdog\n");
98 nmi_watchdog = NMI_IO_APIC; 120 nmi_watchdog = NMI_IO_APIC;
99 }
100} 121}
101 122
102/* Why is there no CPUID flag for this? */ 123#ifdef CONFIG_SMP
103static __init int cpu_has_lapic(void) 124/* The performance counters used by NMI_LOCAL_APIC don't trigger when
125 * the CPU is idle. To make sure the NMI watchdog really ticks on all
126 * CPUs during the test make them busy.
127 */
128static __init void nmi_cpu_busy(void *data)
104{ 129{
105 switch (boot_cpu_data.x86_vendor) { 130 volatile int *endflag = data;
106 case X86_VENDOR_INTEL: 131 local_irq_enable();
107 case X86_VENDOR_AMD: 132 /* Intentionally don't use cpu_relax here. This is
108 return boot_cpu_data.x86 >= 6; 133 to make sure that the performance counter really ticks,
109 /* .... add more cpus here or find a different way to figure this out. */ 134 even if there is a simulator or similar that catches the
110 default: 135 pause instruction. On a real HT machine this is fine because
111 return 0; 136 all other CPUs are busy with "useless" delay loops and don't
112 } 137 care if they get somewhat less cycles. */
138 while (*endflag == 0)
139 barrier();
113} 140}
141#endif
114 142
115static int __init check_nmi_watchdog (void) 143int __init check_nmi_watchdog (void)
116{ 144{
117 int counts[NR_CPUS]; 145 volatile int endflag = 0;
146 int *counts;
118 int cpu; 147 int cpu;
119 148
120 if (nmi_watchdog == NMI_NONE) 149 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
121 return 0; 150 if (!counts)
151 return -1;
122 152
123 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { 153 printk(KERN_INFO "testing NMI watchdog ... ");
124 nmi_watchdog = NMI_NONE;
125 return -1;
126 }
127 154
128 printk(KERN_INFO "Testing NMI watchdog ... "); 155 if (nmi_watchdog == NMI_LOCAL_APIC)
156 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
129 157
130 for (cpu = 0; cpu < NR_CPUS; cpu++) 158 for (cpu = 0; cpu < NR_CPUS; cpu++)
131 counts[cpu] = cpu_pda[cpu].__nmi_count; 159 counts[cpu] = cpu_pda[cpu].__nmi_count;
@@ -133,15 +161,22 @@ static int __init check_nmi_watchdog (void)
133 mdelay((10*1000)/nmi_hz); // wait 10 ticks 161 mdelay((10*1000)/nmi_hz); // wait 10 ticks
134 162
135 for (cpu = 0; cpu < NR_CPUS; cpu++) { 163 for (cpu = 0; cpu < NR_CPUS; cpu++) {
164 if (!cpu_online(cpu))
165 continue;
136 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { 166 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
137 printk("CPU#%d: NMI appears to be stuck (%d)!\n", 167 endflag = 1;
168 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
138 cpu, 169 cpu,
170 counts[cpu],
139 cpu_pda[cpu].__nmi_count); 171 cpu_pda[cpu].__nmi_count);
140 nmi_active = 0; 172 nmi_active = 0;
141 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 173 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
174 nmi_perfctr_msr = 0;
175 kfree(counts);
142 return -1; 176 return -1;
143 } 177 }
144 } 178 }
179 endflag = 1;
145 printk("OK.\n"); 180 printk("OK.\n");
146 181
147 /* now that we know it works we can reduce NMI frequency to 182 /* now that we know it works we can reduce NMI frequency to
@@ -149,10 +184,9 @@ static int __init check_nmi_watchdog (void)
149 if (nmi_watchdog == NMI_LOCAL_APIC) 184 if (nmi_watchdog == NMI_LOCAL_APIC)
150 nmi_hz = 1; 185 nmi_hz = 1;
151 186
187 kfree(counts);
152 return 0; 188 return 0;
153} 189}
154/* Have this called later during boot so counters are updating */
155late_initcall(check_nmi_watchdog);
156 190
157int __init setup_nmi_watchdog(char *str) 191int __init setup_nmi_watchdog(char *str)
158{ 192{
@@ -170,7 +204,7 @@ int __init setup_nmi_watchdog(char *str)
170 204
171 if (nmi >= NMI_INVALID) 205 if (nmi >= NMI_INVALID)
172 return 0; 206 return 0;
173 nmi_watchdog = nmi; 207 nmi_watchdog = nmi;
174 return 1; 208 return 1;
175} 209}
176 210
@@ -185,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void)
185 wrmsr(MSR_K7_EVNTSEL0, 0, 0); 219 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
186 break; 220 break;
187 case X86_VENDOR_INTEL: 221 case X86_VENDOR_INTEL:
188 wrmsr(MSR_IA32_EVNTSEL0, 0, 0); 222 if (boot_cpu_data.x86 == 15) {
223 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
224 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
225 }
189 break; 226 break;
190 } 227 }
191 nmi_active = -1; 228 nmi_active = -1;
@@ -253,7 +290,7 @@ void enable_timer_nmi_watchdog(void)
253 290
254static int nmi_pm_active; /* nmi_active before suspend */ 291static int nmi_pm_active; /* nmi_active before suspend */
255 292
256static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 293static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
257{ 294{
258 nmi_pm_active = nmi_active; 295 nmi_pm_active = nmi_active;
259 disable_lapic_nmi_watchdog(); 296 disable_lapic_nmi_watchdog();
@@ -300,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs);
300 * Original code written by Keith Owens. 337 * Original code written by Keith Owens.
301 */ 338 */
302 339
340static void clear_msr_range(unsigned int base, unsigned int n)
341{
342 unsigned int i;
343
344 for(i = 0; i < n; ++i)
345 wrmsr(base+i, 0, 0);
346}
347
303static void setup_k7_watchdog(void) 348static void setup_k7_watchdog(void)
304{ 349{
305 int i; 350 int i;
306 unsigned int evntsel; 351 unsigned int evntsel;
307 352
308 /* No check, so can start with slow frequency */
309 nmi_hz = 1;
310
311 /* XXX should check these in EFER */
312
313 nmi_perfctr_msr = MSR_K7_PERFCTR0; 353 nmi_perfctr_msr = MSR_K7_PERFCTR0;
314 354
315 for(i = 0; i < 4; ++i) { 355 for(i = 0; i < 4; ++i) {
316 /* Simulator may not support it */ 356 /* Simulator may not support it */
317 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) 357 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
358 nmi_perfctr_msr = 0;
318 return; 359 return;
360 }
319 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 361 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
320 } 362 }
321 363
@@ -325,12 +367,54 @@ static void setup_k7_watchdog(void)
325 | K7_NMI_EVENT; 367 | K7_NMI_EVENT;
326 368
327 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 369 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
328 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); 370 wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
329 apic_write(APIC_LVTPC, APIC_DM_NMI); 371 apic_write(APIC_LVTPC, APIC_DM_NMI);
330 evntsel |= K7_EVNTSEL_ENABLE; 372 evntsel |= K7_EVNTSEL_ENABLE;
331 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 373 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
332} 374}
333 375
376
377static int setup_p4_watchdog(void)
378{
379 unsigned int misc_enable, dummy;
380
381 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
382 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
383 return 0;
384
385 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
386 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
387#ifdef CONFIG_SMP
388 if (smp_num_siblings == 2)
389 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
390#endif
391
392 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
393 clear_msr_range(0x3F1, 2);
394 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
395 docs doesn't fully define it, so leave it alone for now. */
396 if (boot_cpu_data.x86_model >= 0x3) {
397 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
398 clear_msr_range(0x3A0, 26);
399 clear_msr_range(0x3BC, 3);
400 } else {
401 clear_msr_range(0x3A0, 31);
402 }
403 clear_msr_range(0x3C0, 6);
404 clear_msr_range(0x3C8, 6);
405 clear_msr_range(0x3E0, 2);
406 clear_msr_range(MSR_P4_CCCR0, 18);
407 clear_msr_range(MSR_P4_PERFCTR0, 18);
408
409 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
410 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
411 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
412 wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
413 apic_write(APIC_LVTPC, APIC_DM_NMI);
414 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
415 return 1;
416}
417
334void setup_apic_nmi_watchdog(void) 418void setup_apic_nmi_watchdog(void)
335{ 419{
336 switch (boot_cpu_data.x86_vendor) { 420 switch (boot_cpu_data.x86_vendor) {
@@ -341,6 +425,13 @@ void setup_apic_nmi_watchdog(void)
341 return; 425 return;
342 setup_k7_watchdog(); 426 setup_k7_watchdog();
343 break; 427 break;
428 case X86_VENDOR_INTEL:
429 if (boot_cpu_data.x86 != 15)
430 return;
431 if (!setup_p4_watchdog())
432 return;
433 break;
434
344 default: 435 default:
345 return; 436 return;
346 } 437 }
@@ -355,56 +446,67 @@ void setup_apic_nmi_watchdog(void)
355 * 446 *
356 * as these watchdog NMI IRQs are generated on every CPU, we only 447 * as these watchdog NMI IRQs are generated on every CPU, we only
357 * have to check the current processor. 448 * have to check the current processor.
358 *
359 * since NMIs don't listen to _any_ locks, we have to be extremely
360 * careful not to rely on unsafe variables. The printk might lock
361 * up though, so we have to break up any console locks first ...
362 * [when there will be more tty-related locks, break them up
363 * here too!]
364 */ 449 */
365 450
366static unsigned int 451static DEFINE_PER_CPU(unsigned, last_irq_sum);
367 last_irq_sums [NR_CPUS], 452static DEFINE_PER_CPU(local_t, alert_counter);
368 alert_counter [NR_CPUS]; 453static DEFINE_PER_CPU(int, nmi_touch);
369 454
370void touch_nmi_watchdog (void) 455void touch_nmi_watchdog (void)
371{ 456{
372 int i; 457 int i;
373 458
374 /* 459 /*
375 * Just reset the alert counters, (other CPUs might be 460 * Tell other CPUs to reset their alert counters. We cannot
376 * spinning on locks we hold): 461 * do it ourselves because the alert count increase is not
462 * atomic.
377 */ 463 */
378 for (i = 0; i < NR_CPUS; i++) 464 for (i = 0; i < NR_CPUS; i++)
379 alert_counter[i] = 0; 465 per_cpu(nmi_touch, i) = 1;
380} 466}
381 467
382void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) 468void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
383{ 469{
384 int sum, cpu; 470 int sum;
471 int touched = 0;
385 472
386 cpu = safe_smp_processor_id();
387 sum = read_pda(apic_timer_irqs); 473 sum = read_pda(apic_timer_irqs);
388 if (last_irq_sums[cpu] == sum) { 474 if (__get_cpu_var(nmi_touch)) {
475 __get_cpu_var(nmi_touch) = 0;
476 touched = 1;
477 }
478 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
389 /* 479 /*
390 * Ayiee, looks like this CPU is stuck ... 480 * Ayiee, looks like this CPU is stuck ...
391 * wait a few IRQs (5 seconds) before doing the oops ... 481 * wait a few IRQs (5 seconds) before doing the oops ...
392 */ 482 */
393 alert_counter[cpu]++; 483 local_inc(&__get_cpu_var(alert_counter));
394 if (alert_counter[cpu] == 5*nmi_hz) { 484 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
395 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 485 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
396 == NOTIFY_STOP) { 486 == NOTIFY_STOP) {
397 alert_counter[cpu] = 0; 487 local_set(&__get_cpu_var(alert_counter), 0);
398 return; 488 return;
399 } 489 }
400 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); 490 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
401 } 491 }
402 } else { 492 } else {
403 last_irq_sums[cpu] = sum; 493 __get_cpu_var(last_irq_sum) = sum;
404 alert_counter[cpu] = 0; 494 local_set(&__get_cpu_var(alert_counter), 0);
405 } 495 }
406 if (nmi_perfctr_msr) 496 if (nmi_perfctr_msr) {
497 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
498 /*
499 * P4 quirks:
500 * - An overflown perfctr will assert its interrupt
501 * until the OVF flag in its CCCR is cleared.
502 * - LVTPC is masked on interrupt and must be
503 * unmasked by the LVTPC handler.
504 */
505 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
506 apic_write(APIC_LVTPC, APIC_DM_NMI);
507 }
407 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); 508 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
509 }
408} 510}
409 511
410static int dummy_nmi_callback(struct pt_regs * regs, int cpu) 512static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
new file mode 100644
index 000000000000..feb5f108dd26
--- /dev/null
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -0,0 +1,101 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <asm/io.h>
23#include <asm/proto.h>
24#include <asm/msr.h>
25#include <asm/vsyscall.h>
26
27/* The I/O port the PMTMR resides at.
28 * The location is detected during setup_arch(),
29 * in arch/i386/kernel/acpi/boot.c */
30u32 pmtmr_ioport;
31
32/* value of the Power timer at last timer interrupt */
33static u32 offset_delay;
34static u32 last_pmtmr_tick;
35
36#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
37
38static inline u32 cyc2us(u32 cycles)
39{
40 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
41 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
42 *
43 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
44 * easily be multiplied with 286 (=0x11E) without having to fear
45 * u32 overflows.
46 */
47 cycles *= 286;
48 return (cycles >> 10);
49}
50
51int pmtimer_mark_offset(void)
52{
53 static int first_run = 1;
54 unsigned long tsc;
55 u32 lost;
56
57 u32 tick = inl(pmtmr_ioport);
58 u32 delta;
59
60 delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK);
61
62 last_pmtmr_tick = tick;
63 monotonic_base += delta * NSEC_PER_USEC;
64
65 delta += offset_delay;
66
67 lost = delta / (USEC_PER_SEC / HZ);
68 offset_delay = delta % (USEC_PER_SEC / HZ);
69
70 rdtscll(tsc);
71 vxtime.last_tsc = tsc - offset_delay * cpu_khz;
72
73 /* don't calculate delay for first run,
74 or if we've got less then a tick */
75 if (first_run || (lost < 1)) {
76 first_run = 0;
77 offset_delay = 0;
78 }
79
80 return lost - 1;
81}
82
83unsigned int do_gettimeoffset_pm(void)
84{
85 u32 now, offset, delta = 0;
86
87 offset = last_pmtmr_tick;
88 now = inl(pmtmr_ioport);
89 delta = (now - offset) & ACPI_PM_MASK;
90
91 return offset_delay + cyc2us(delta);
92}
93
94
95static int __init nopmtimer_setup(char *s)
96{
97 pmtmr_ioport = 0;
98 return 0;
99}
100
101__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index e26e86bb56fe..60dc9b98951d 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -257,13 +257,13 @@ static int putreg(struct task_struct *child,
257 value &= 0xffff; 257 value &= 0xffff;
258 return 0; 258 return 0;
259 case offsetof(struct user_regs_struct,fs_base): 259 case offsetof(struct user_regs_struct,fs_base):
260 if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) 260 if (value >= TASK_SIZE)
261 return -EIO; 261 return -EIO;
262 child->thread.fs = value; 262 child->thread.fs = value;
263 return 0; 263 return 0;
264 case offsetof(struct user_regs_struct,gs_base): 264 case offsetof(struct user_regs_struct,gs_base):
265 if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) 265 if (value >= TASK_SIZE)
266 return -EIO; 266 return -EIO;
267 child->thread.gs = value; 267 child->thread.gs = value;
268 return 0; 268 return 0;
269 case offsetof(struct user_regs_struct, eflags): 269 case offsetof(struct user_regs_struct, eflags):
@@ -277,6 +277,11 @@ static int putreg(struct task_struct *child,
277 return -EIO; 277 return -EIO;
278 value &= 0xffff; 278 value &= 0xffff;
279 break; 279 break;
280 case offsetof(struct user_regs_struct, rip):
281 /* Check if the new RIP address is canonical */
282 if (value >= TASK_SIZE)
283 return -EIO;
284 break;
280 } 285 }
281 put_stack_long(child, regno - sizeof(struct pt_regs), value); 286 put_stack_long(child, regno - sizeof(struct pt_regs), value);
282 return 0; 287 return 0;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 2129cf9ba6b2..b9fd0252c279 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -727,11 +727,12 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c)
727static void __init amd_detect_cmp(struct cpuinfo_x86 *c) 727static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
728{ 728{
729#ifdef CONFIG_SMP 729#ifdef CONFIG_SMP
730 int cpu = c->x86_apicid; 730 int cpu = smp_processor_id();
731 int node = 0; 731 int node = 0;
732 if (c->x86_num_cores == 1) 732 if (c->x86_num_cores == 1)
733 return; 733 return;
734 cpu_core_id[cpu] = cpu >> hweight32(c->x86_num_cores - 1); 734 /* Fix up the APIC ID following the AMD specification. */
735 cpu_core_id[cpu] >>= hweight32(c->x86_num_cores - 1);
735 736
736#ifdef CONFIG_NUMA 737#ifdef CONFIG_NUMA
737 /* When an ACPI SRAT table is available use the mappings from SRAT 738 /* When an ACPI SRAT table is available use the mappings from SRAT
@@ -745,6 +746,9 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
745 node = cpu_to_node[cpu]; 746 node = cpu_to_node[cpu];
746 } 747 }
747#endif 748#endif
749 /* For now: - better than BAD_APIC_ID at least*/
750 phys_proc_id[cpu] = cpu_core_id[cpu];
751
748 printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", 752 printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
749 cpu, c->x86_num_cores, node, cpu_core_id[cpu]); 753 cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
750#endif 754#endif
@@ -925,7 +929,6 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
925 c->x86_clflush_size = 64; 929 c->x86_clflush_size = 64;
926 c->x86_cache_alignment = c->x86_clflush_size; 930 c->x86_cache_alignment = c->x86_clflush_size;
927 c->x86_num_cores = 1; 931 c->x86_num_cores = 1;
928 c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data;
929 c->extended_cpuid_level = 0; 932 c->extended_cpuid_level = 0;
930 memset(&c->x86_capability, 0, sizeof c->x86_capability); 933 memset(&c->x86_capability, 0, sizeof c->x86_capability);
931 934
@@ -954,11 +957,15 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
954 } 957 }
955 if (c->x86_capability[0] & (1<<19)) 958 if (c->x86_capability[0] & (1<<19))
956 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 959 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
957 c->x86_apicid = misc >> 24;
958 } else { 960 } else {
959 /* Have CPUID level 0 only - unheard of */ 961 /* Have CPUID level 0 only - unheard of */
960 c->x86 = 4; 962 c->x86 = 4;
961 } 963 }
964
965#ifdef CONFIG_SMP
966 phys_proc_id[smp_processor_id()] =
967 cpu_core_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
968#endif
962} 969}
963 970
964/* 971/*
@@ -1088,7 +1095,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1088 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1095 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1089 1096
1090 /* Other (Linux-defined) */ 1097 /* Other (Linux-defined) */
1091 "cxmmx", NULL, "cyrix_arr", "centaur_mcr", "k8c+", 1098 "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
1092 "constant_tsc", NULL, NULL, 1099 "constant_tsc", NULL, NULL,
1093 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1100 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1094 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1101 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543a..5abdee1e16a5 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -56,6 +56,7 @@
56#include <asm/kdebug.h> 56#include <asm/kdebug.h>
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58#include <asm/proto.h> 58#include <asm/proto.h>
59#include <asm/nmi.h>
59 60
60/* Change for real CPU hotplug. Note other files need to be fixed 61/* Change for real CPU hotplug. Note other files need to be fixed
61 first too. */ 62 first too. */
@@ -125,96 +126,210 @@ static void __cpuinit smp_store_cpu_info(int id)
125 126
126 *c = boot_cpu_data; 127 *c = boot_cpu_data;
127 identify_cpu(c); 128 identify_cpu(c);
129 print_cpu_info(c);
128} 130}
129 131
130/* 132/*
131 * Synchronize TSCs of CPUs 133 * New Funky TSC sync algorithm borrowed from IA64.
134 * Main advantage is that it doesn't reset the TSCs fully and
135 * in general looks more robust and it works better than my earlier
136 * attempts. I believe it was written by David Mosberger. Some minor
137 * adjustments for x86-64 by me -AK
132 * 138 *
133 * This new algorithm is less accurate than the old "zero TSCs" 139 * Original comment reproduced below.
134 * one, but we cannot zero TSCs anymore in the new hotplug CPU 140 *
135 * model. 141 * Synchronize TSC of the current (slave) CPU with the TSC of the
142 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
143 * eliminate the possibility of unaccounted-for errors (such as
144 * getting a machine check in the middle of a calibration step). The
145 * basic idea is for the slave to ask the master what itc value it has
146 * and to read its own itc before and after the master responds. Each
147 * iteration gives us three timestamps:
148 *
149 * slave master
150 *
151 * t0 ---\
152 * ---\
153 * --->
154 * tm
155 * /---
156 * /---
157 * t1 <---
158 *
159 *
160 * The goal is to adjust the slave's TSC such that tm falls exactly
161 * half-way between t0 and t1. If we achieve this, the clocks are
162 * synchronized provided the interconnect between the slave and the
163 * master is symmetric. Even if the interconnect were asymmetric, we
164 * would still know that the synchronization error is smaller than the
165 * roundtrip latency (t0 - t1).
166 *
167 * When the interconnect is quiet and symmetric, this lets us
168 * synchronize the TSC to within one or two cycles. However, we can
169 * only *guarantee* that the synchronization is accurate to within a
170 * round-trip time, which is typically in the range of several hundred
171 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
172 * are usually almost perfectly synchronized, but we shouldn't assume
173 * that the accuracy is much better than half a micro second or so.
174 *
175 * [there are other errors like the latency of RDTSC and of the
176 * WRMSR. These can also account to hundreds of cycles. So it's
177 * probably worse. It claims 153 cycles error on a dual Opteron,
178 * but I suspect the numbers are actually somewhat worse -AK]
136 */ 179 */
137 180
138static atomic_t __cpuinitdata tsc_flag; 181#define MASTER 0
182#define SLAVE (SMP_CACHE_BYTES/8)
183
184/* Intentionally don't use cpu_relax() while TSC synchronization
185 because we don't want to go into funky power save modi or cause
186 hypervisors to schedule us away. Going to sleep would likely affect
187 latency and low latency is the primary objective here. -AK */
188#define no_cpu_relax() barrier()
189
139static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); 190static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
140static unsigned long long __cpuinitdata bp_tsc, ap_tsc; 191static volatile __cpuinitdata unsigned long go[SLAVE + 1];
192static int notscsync __cpuinitdata;
193
194#undef DEBUG_TSC_SYNC
141 195
142#define NR_LOOPS 5 196#define NUM_ROUNDS 64 /* magic value */
197#define NUM_ITERS 5 /* likewise */
143 198
144static void __cpuinit sync_tsc_bp_init(int init) 199/* Callback on boot CPU */
200static __cpuinit void sync_master(void *arg)
145{ 201{
146 if (init) 202 unsigned long flags, i;
147 _raw_spin_lock(&tsc_sync_lock); 203
148 else 204 if (smp_processor_id() != boot_cpu_id)
149 _raw_spin_unlock(&tsc_sync_lock); 205 return;
150 atomic_set(&tsc_flag, 0); 206
207 go[MASTER] = 0;
208
209 local_irq_save(flags);
210 {
211 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
212 while (!go[MASTER])
213 no_cpu_relax();
214 go[MASTER] = 0;
215 rdtscll(go[SLAVE]);
216 }
217 }
218 local_irq_restore(flags);
151} 219}
152 220
153/* 221/*
154 * Synchronize TSC on AP with BP. 222 * Return the number of cycles by which our tsc differs from the tsc
223 * on the master (time-keeper) CPU. A positive number indicates our
224 * tsc is ahead of the master, negative that it is behind.
155 */ 225 */
156static void __cpuinit __sync_tsc_ap(void) 226static inline long
227get_delta(long *rt, long *master)
157{ 228{
158 if (!cpu_has_tsc) 229 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
159 return; 230 unsigned long tcenter, t0, t1, tm;
160 Dprintk("AP %d syncing TSC\n", smp_processor_id()); 231 int i;
161 232
162 while (atomic_read(&tsc_flag) != 0) 233 for (i = 0; i < NUM_ITERS; ++i) {
163 cpu_relax(); 234 rdtscll(t0);
164 atomic_inc(&tsc_flag); 235 go[MASTER] = 1;
165 mb(); 236 while (!(tm = go[SLAVE]))
166 _raw_spin_lock(&tsc_sync_lock); 237 no_cpu_relax();
167 wrmsrl(MSR_IA32_TSC, bp_tsc); 238 go[SLAVE] = 0;
168 _raw_spin_unlock(&tsc_sync_lock); 239 rdtscll(t1);
169 rdtscll(ap_tsc); 240
170 mb(); 241 if (t1 - t0 < best_t1 - best_t0)
171 atomic_inc(&tsc_flag); 242 best_t0 = t0, best_t1 = t1, best_tm = tm;
172 mb(); 243 }
244
245 *rt = best_t1 - best_t0;
246 *master = best_tm - best_t0;
247
248 /* average best_t0 and best_t1 without overflow: */
249 tcenter = (best_t0/2 + best_t1/2);
250 if (best_t0 % 2 + best_t1 % 2 == 2)
251 ++tcenter;
252 return tcenter - best_tm;
173} 253}
174 254
175static void __cpuinit sync_tsc_ap(void) 255static __cpuinit void sync_tsc(void)
176{ 256{
177 int i; 257 int i, done = 0;
178 for (i = 0; i < NR_LOOPS; i++) 258 long delta, adj, adjust_latency = 0;
179 __sync_tsc_ap(); 259 unsigned long flags, rt, master_time_stamp, bound;
260#if DEBUG_TSC_SYNC
261 static struct syncdebug {
262 long rt; /* roundtrip time */
263 long master; /* master's timestamp */
264 long diff; /* difference between midpoint and master's timestamp */
265 long lat; /* estimate of tsc adjustment latency */
266 } t[NUM_ROUNDS] __cpuinitdata;
267#endif
268
269 go[MASTER] = 1;
270
271 smp_call_function(sync_master, NULL, 1, 0);
272
273 while (go[MASTER]) /* wait for master to be ready */
274 no_cpu_relax();
275
276 spin_lock_irqsave(&tsc_sync_lock, flags);
277 {
278 for (i = 0; i < NUM_ROUNDS; ++i) {
279 delta = get_delta(&rt, &master_time_stamp);
280 if (delta == 0) {
281 done = 1; /* let's lock on to this... */
282 bound = rt;
283 }
284
285 if (!done) {
286 unsigned long t;
287 if (i > 0) {
288 adjust_latency += -delta;
289 adj = -delta + adjust_latency/4;
290 } else
291 adj = -delta;
292
293 rdtscll(t);
294 wrmsrl(MSR_IA32_TSC, t + adj);
295 }
296#if DEBUG_TSC_SYNC
297 t[i].rt = rt;
298 t[i].master = master_time_stamp;
299 t[i].diff = delta;
300 t[i].lat = adjust_latency/4;
301#endif
302 }
303 }
304 spin_unlock_irqrestore(&tsc_sync_lock, flags);
305
306#if DEBUG_TSC_SYNC
307 for (i = 0; i < NUM_ROUNDS; ++i)
308 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
309 t[i].rt, t[i].master, t[i].diff, t[i].lat);
310#endif
311
312 printk(KERN_INFO
313 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
314 "maxerr %lu cycles)\n",
315 smp_processor_id(), boot_cpu_id, delta, rt);
180} 316}
181 317
182/* 318static void __cpuinit tsc_sync_wait(void)
183 * Synchronize TSC from BP to AP.
184 */
185static void __cpuinit __sync_tsc_bp(int cpu)
186{ 319{
187 if (!cpu_has_tsc) 320 if (notscsync || !cpu_has_tsc)
188 return; 321 return;
189 322 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
190 /* Wait for AP */ 323 boot_cpu_id);
191 while (atomic_read(&tsc_flag) == 0) 324 sync_tsc();
192 cpu_relax();
193 /* Save BPs TSC */
194 sync_core();
195 rdtscll(bp_tsc);
196 /* Don't do the sync core here to avoid too much latency. */
197 mb();
198 /* Start the AP */
199 _raw_spin_unlock(&tsc_sync_lock);
200 /* Wait for AP again */
201 while (atomic_read(&tsc_flag) < 2)
202 cpu_relax();
203 rdtscl(bp_tsc);
204 barrier();
205} 325}
206 326
207static void __cpuinit sync_tsc_bp(int cpu) 327static __init int notscsync_setup(char *s)
208{ 328{
209 int i; 329 notscsync = 1;
210 for (i = 0; i < NR_LOOPS - 1; i++) { 330 return 0;
211 __sync_tsc_bp(cpu);
212 sync_tsc_bp_init(1);
213 }
214 __sync_tsc_bp(cpu);
215 printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
216 cpu, ap_tsc - bp_tsc);
217} 331}
332__setup("notscsync", notscsync_setup);
218 333
219static atomic_t init_deasserted __cpuinitdata; 334static atomic_t init_deasserted __cpuinitdata;
220 335
@@ -315,11 +430,6 @@ void __cpuinit start_secondary(void)
315 cpu_init(); 430 cpu_init();
316 smp_callin(); 431 smp_callin();
317 432
318 /*
319 * Synchronize the TSC with the BP
320 */
321 sync_tsc_ap();
322
323 /* otherwise gcc will move up the smp_processor_id before the cpu_init */ 433 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
324 barrier(); 434 barrier();
325 435
@@ -334,7 +444,6 @@ void __cpuinit start_secondary(void)
334 enable_8259A_irq(0); 444 enable_8259A_irq(0);
335 } 445 }
336 446
337
338 enable_APIC_timer(); 447 enable_APIC_timer();
339 448
340 /* 449 /*
@@ -343,6 +452,11 @@ void __cpuinit start_secondary(void)
343 cpu_set(smp_processor_id(), cpu_online_map); 452 cpu_set(smp_processor_id(), cpu_online_map);
344 mb(); 453 mb();
345 454
455 /* Wait for TSC sync to not schedule things before.
456 We still process interrupts, which could see an inconsistent
457 time in that window unfortunately. */
458 tsc_sync_wait();
459
346 cpu_idle(); 460 cpu_idle();
347} 461}
348 462
@@ -531,7 +645,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
531 printk("failed fork for CPU %d\n", cpu); 645 printk("failed fork for CPU %d\n", cpu);
532 return PTR_ERR(idle); 646 return PTR_ERR(idle);
533 } 647 }
534 x86_cpu_to_apicid[cpu] = apicid;
535 648
536 cpu_pda[cpu].pcurrent = idle; 649 cpu_pda[cpu].pcurrent = idle;
537 650
@@ -600,8 +713,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
600 713
601 if (cpu_isset(cpu, cpu_callin_map)) { 714 if (cpu_isset(cpu, cpu_callin_map)) {
602 /* number CPUs logically, starting from 1 (BSP is 0) */ 715 /* number CPUs logically, starting from 1 (BSP is 0) */
603 Dprintk("OK.\n");
604 print_cpu_info(&cpu_data[cpu]);
605 Dprintk("CPU has booted.\n"); 716 Dprintk("CPU has booted.\n");
606 } else { 717 } else {
607 boot_error = 1; 718 boot_error = 1;
@@ -842,7 +953,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
842 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); 953 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
843 /* Or can we switch back to PIC here? */ 954 /* Or can we switch back to PIC here? */
844 } 955 }
845 x86_cpu_to_apicid[0] = boot_cpu_id;
846 956
847 /* 957 /*
848 * Now start the IO-APICs 958 * Now start the IO-APICs
@@ -889,18 +999,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
889 printk("__cpu_up: bad cpu %d\n", cpu); 999 printk("__cpu_up: bad cpu %d\n", cpu);
890 return -EINVAL; 1000 return -EINVAL;
891 } 1001 }
892 sync_tsc_bp_init(1);
893 1002
894 /* Boot it! */ 1003 /* Boot it! */
895 err = do_boot_cpu(cpu, apicid); 1004 err = do_boot_cpu(cpu, apicid);
896 if (err < 0) { 1005 if (err < 0) {
897 sync_tsc_bp_init(0);
898 Dprintk("do_boot_cpu failed %d\n", err); 1006 Dprintk("do_boot_cpu failed %d\n", err);
899 return err; 1007 return err;
900 } 1008 }
901 1009
902 sync_tsc_bp(cpu);
903
904 /* Unleash the CPU! */ 1010 /* Unleash the CPU! */
905 Dprintk("waiting for cpu %d\n", cpu); 1011 Dprintk("waiting for cpu %d\n", cpu);
906 1012
@@ -923,4 +1029,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus)
923 1029
924 detect_siblings(); 1030 detect_siblings();
925 time_init_gtod(); 1031 time_init_gtod();
1032
1033 check_nmi_watchdog();
926} 1034}
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 0e3b60e1ac5b..735b6767c8ed 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -26,6 +26,8 @@
26#include <linux/sysdev.h> 26#include <linux/sysdev.h>
27#include <linux/bcd.h> 27#include <linux/bcd.h>
28#include <linux/kallsyms.h> 28#include <linux/kallsyms.h>
29#include <linux/acpi.h>
30#include <acpi/achware.h> /* for PM timer frequency */
29#include <asm/8253pit.h> 31#include <asm/8253pit.h>
30#include <asm/pgtable.h> 32#include <asm/pgtable.h>
31#include <asm/vsyscall.h> 33#include <asm/vsyscall.h>
@@ -396,6 +398,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
396 (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; 398 (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
397 399
398 vxtime.last = offset; 400 vxtime.last = offset;
401#ifdef CONFIG_X86_PM_TIMER
402 } else if (vxtime.mode == VXTIME_PMTMR) {
403 lost = pmtimer_mark_offset();
404#endif
399 } else { 405 } else {
400 offset = (((tsc - vxtime.last_tsc) * 406 offset = (((tsc - vxtime.last_tsc) *
401 vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); 407 vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
@@ -898,6 +904,13 @@ void __init time_init(void)
898 hpet_period; 904 hpet_period;
899 cpu_khz = hpet_calibrate_tsc(); 905 cpu_khz = hpet_calibrate_tsc();
900 timename = "HPET"; 906 timename = "HPET";
907#ifdef CONFIG_X86_PM_TIMER
908 } else if (pmtmr_ioport) {
909 vxtime_hz = PM_TIMER_FREQUENCY;
910 timename = "PM";
911 pit_init();
912 cpu_khz = pit_calibrate_tsc();
913#endif
901 } else { 914 } else {
902 pit_init(); 915 pit_init();
903 cpu_khz = pit_calibrate_tsc(); 916 cpu_khz = pit_calibrate_tsc();
@@ -923,35 +936,50 @@ void __init time_init(void)
923} 936}
924 937
925/* 938/*
939 * Make an educated guess if the TSC is trustworthy and synchronized
940 * over all CPUs.
941 */
942static __init int unsynchronized_tsc(void)
943{
944#ifdef CONFIG_SMP
945 if (oem_force_hpet_timer())
946 return 1;
947 /* Intel systems are normally all synchronized. Exceptions
948 are handled in the OEM check above. */
949 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
950 return 0;
951 /* All in a single socket - should be synchronized */
952 if (cpus_weight(cpu_core_map[0]) == num_online_cpus())
953 return 0;
954#endif
955 /* Assume multi socket systems are not synchronized */
956 return num_online_cpus() > 1;
957}
958
959/*
926 * Decide after all CPUs are booted what mode gettimeofday should use. 960 * Decide after all CPUs are booted what mode gettimeofday should use.
927 */ 961 */
928void __init time_init_gtod(void) 962void __init time_init_gtod(void)
929{ 963{
930 char *timetype; 964 char *timetype;
931 965
932 /* 966 if (unsynchronized_tsc())
933 * AMD systems with more than one CPU don't have fully synchronized
934 * TSCs. Always use HPET gettimeofday for these, although it is slower.
935 * Intel SMP systems usually have synchronized TSCs, so use always
936 * the TSC.
937 *
938 * Exceptions:
939 * IBM Summit2 checked by oem_force_hpet_timer().
940 * AMD dual core may also not need HPET. Check me.
941 *
942 * Can be turned off with "notsc".
943 */
944 if (num_online_cpus() > 1 &&
945 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
946 notsc = 1;
947 /* Some systems will want to disable TSC and use HPET. */
948 if (oem_force_hpet_timer())
949 notsc = 1; 967 notsc = 1;
950 if (vxtime.hpet_address && notsc) { 968 if (vxtime.hpet_address && notsc) {
951 timetype = "HPET"; 969 timetype = "HPET";
952 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; 970 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
953 vxtime.mode = VXTIME_HPET; 971 vxtime.mode = VXTIME_HPET;
954 do_gettimeoffset = do_gettimeoffset_hpet; 972 do_gettimeoffset = do_gettimeoffset_hpet;
973#ifdef CONFIG_X86_PM_TIMER
974 /* Using PM for gettimeofday is quite slow, but we have no other
975 choice because the TSC is too unreliable on some systems. */
976 } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
977 timetype = "PM";
978 do_gettimeoffset = do_gettimeoffset_pm;
979 vxtime.mode = VXTIME_PMTMR;
980 sysctl_vsyscall = 0;
981 printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
982#endif
955 } else { 983 } else {
956 timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; 984 timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
957 vxtime.mode = VXTIME_TSC; 985 vxtime.mode = VXTIME_TSC;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index b4b8dc59663a..2e5734425949 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -65,7 +65,7 @@ static force_inline void do_vgettimeofday(struct timeval * tv)
65 usec = (__xtime.tv_nsec / 1000) + 65 usec = (__xtime.tv_nsec / 1000) +
66 (__jiffies - __wall_jiffies) * (1000000 / HZ); 66 (__jiffies - __wall_jiffies) * (1000000 / HZ);
67 67
68 if (__vxtime.mode == VXTIME_TSC) { 68 if (__vxtime.mode != VXTIME_HPET) {
69 sync_core(); 69 sync_core();
70 rdtscll(t); 70 rdtscll(t);
71 if (t < __vxtime.last_tsc) 71 if (t < __vxtime.last_tsc)
@@ -217,8 +217,9 @@ static int __init vsyscall_init(void)
217 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 217 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
218 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 218 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
219 map_vsyscall(); 219 map_vsyscall();
220 sysctl_vsyscall = 1; 220#ifdef CONFIG_SYSCTL
221 register_sysctl_table(kernel_root_table2, 0); 221 register_sysctl_table(kernel_root_table2, 0);
222#endif
222 return 0; 223 return 0;
223} 224}
224 225