Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--  arch/i386/kernel/Makefile | 71
-rw-r--r--  arch/i386/kernel/acpi/Makefile | 4
-rw-r--r--  arch/i386/kernel/acpi/boot.c | 908
-rw-r--r--  arch/i386/kernel/acpi/earlyquirk.c | 51
-rw-r--r--  arch/i386/kernel/acpi/sleep.c | 93
-rw-r--r--  arch/i386/kernel/acpi/wakeup.S | 318
-rw-r--r--  arch/i386/kernel/apic.c | 1278
-rw-r--r--  arch/i386/kernel/apm.c | 2428
-rw-r--r--  arch/i386/kernel/asm-offsets.c | 72
-rw-r--r--  arch/i386/kernel/bootflag.c | 99
-rw-r--r--  arch/i386/kernel/cpu/Makefile | 19
-rw-r--r--  arch/i386/kernel/cpu/amd.c | 249
-rw-r--r--  arch/i386/kernel/cpu/centaur.c | 476
-rw-r--r--  arch/i386/kernel/cpu/changelog | 63
-rw-r--r--  arch/i386/kernel/cpu/common.c | 634
-rw-r--r--  arch/i386/kernel/cpu/cpu.h | 30
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/Kconfig | 231
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/Makefile | 14
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c | 537
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c | 457
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/elanfreq.c | 312
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/gx-suspmod.c | 502
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/longhaul.c | 658
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/longhaul.h | 466
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/longrun.c | 326
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/p4-clockmod.c | 337
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/powernow-k6.c | 256
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/powernow-k7.c | 690
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/powernow-k7.h | 44
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/powernow-k8.c | 1135
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/powernow-k8.h | 176
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c | 715
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-est-common.h | 25
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-ich.c | 424
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-lib.c | 385
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-lib.h | 47
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-smi.c | 424
-rw-r--r--  arch/i386/kernel/cpu/cyrix.c | 439
-rw-r--r--  arch/i386/kernel/cpu/intel.c | 248
-rw-r--r--  arch/i386/kernel/cpu/intel_cacheinfo.c | 598
-rw-r--r--  arch/i386/kernel/cpu/mcheck/Makefile | 2
-rw-r--r--  arch/i386/kernel/cpu/mcheck/k7.c | 97
-rw-r--r--  arch/i386/kernel/cpu/mcheck/mce.c | 77
-rw-r--r--  arch/i386/kernel/cpu/mcheck/mce.h | 14
-rw-r--r--  arch/i386/kernel/cpu/mcheck/non-fatal.c | 93
-rw-r--r--  arch/i386/kernel/cpu/mcheck/p4.c | 271
-rw-r--r--  arch/i386/kernel/cpu/mcheck/p5.c | 54
-rw-r--r--  arch/i386/kernel/cpu/mcheck/p6.c | 115
-rw-r--r--  arch/i386/kernel/cpu/mcheck/winchip.c | 37
-rw-r--r--  arch/i386/kernel/cpu/mtrr/Makefile | 5
-rw-r--r--  arch/i386/kernel/cpu/mtrr/amd.c | 121
-rw-r--r--  arch/i386/kernel/cpu/mtrr/centaur.c | 223
-rw-r--r--  arch/i386/kernel/cpu/mtrr/changelog | 229
-rw-r--r--  arch/i386/kernel/cpu/mtrr/cyrix.c | 364
-rw-r--r--  arch/i386/kernel/cpu/mtrr/generic.c | 417
-rw-r--r--  arch/i386/kernel/cpu/mtrr/if.c | 374
-rw-r--r--  arch/i386/kernel/cpu/mtrr/main.c | 693
-rw-r--r--  arch/i386/kernel/cpu/mtrr/mtrr.h | 98
-rw-r--r--  arch/i386/kernel/cpu/mtrr/state.c | 78
-rw-r--r--  arch/i386/kernel/cpu/nexgen.c | 63
-rw-r--r--  arch/i386/kernel/cpu/proc.c | 149
-rw-r--r--  arch/i386/kernel/cpu/rise.c | 53
-rw-r--r--  arch/i386/kernel/cpu/transmeta.c | 107
-rw-r--r--  arch/i386/kernel/cpu/umc.c | 33
-rw-r--r--  arch/i386/kernel/cpuid.c | 246
-rw-r--r--  arch/i386/kernel/dmi_scan.c | 487
-rw-r--r--  arch/i386/kernel/doublefault.c | 65
-rw-r--r--  arch/i386/kernel/early_printk.c | 2
-rw-r--r--  arch/i386/kernel/efi.c | 635
-rw-r--r--  arch/i386/kernel/efi_stub.S | 124
-rw-r--r--  arch/i386/kernel/entry.S | 950
-rw-r--r--  arch/i386/kernel/head.S | 521
-rw-r--r--  arch/i386/kernel/i386_ksyms.c | 195
-rw-r--r--  arch/i386/kernel/i387.c | 555
-rw-r--r--  arch/i386/kernel/i8259.c | 429
-rw-r--r--  arch/i386/kernel/init_task.c | 46
-rw-r--r--  arch/i386/kernel/io_apic.c | 2545
-rw-r--r--  arch/i386/kernel/ioport.c | 147
-rw-r--r--  arch/i386/kernel/irq.c | 261
-rw-r--r--  arch/i386/kernel/kprobes.c | 385
-rw-r--r--  arch/i386/kernel/ldt.c | 255
-rw-r--r--  arch/i386/kernel/mca.c | 474
-rw-r--r--  arch/i386/kernel/microcode.c | 512
-rw-r--r--  arch/i386/kernel/module.c | 129
-rw-r--r--  arch/i386/kernel/mpparse.c | 1109
-rw-r--r--  arch/i386/kernel/msr.c | 346
-rw-r--r--  arch/i386/kernel/nmi.c | 570
-rw-r--r--  arch/i386/kernel/numaq.c | 79
-rw-r--r--  arch/i386/kernel/pci-dma.c | 147
-rw-r--r--  arch/i386/kernel/process.c | 848
-rw-r--r--  arch/i386/kernel/ptrace.c | 717
-rw-r--r--  arch/i386/kernel/quirks.c | 52
-rw-r--r--  arch/i386/kernel/reboot.c | 382
-rw-r--r--  arch/i386/kernel/scx200.c | 167
-rw-r--r--  arch/i386/kernel/semaphore.c | 297
-rw-r--r--  arch/i386/kernel/setup.c | 1535
-rw-r--r--  arch/i386/kernel/sigframe.h | 21
-rw-r--r--  arch/i386/kernel/signal.c | 665
-rw-r--r--  arch/i386/kernel/smp.c | 612
-rw-r--r--  arch/i386/kernel/smpboot.c | 1145
-rw-r--r--  arch/i386/kernel/srat.c | 456
-rw-r--r--  arch/i386/kernel/summit.c | 180
-rw-r--r--  arch/i386/kernel/sys_i386.c | 252
-rw-r--r--  arch/i386/kernel/sysenter.c | 65
-rw-r--r--  arch/i386/kernel/time.c | 476
-rw-r--r--  arch/i386/kernel/time_hpet.c | 458
-rw-r--r--  arch/i386/kernel/timers/Makefile | 9
-rw-r--r--  arch/i386/kernel/timers/common.c | 160
-rw-r--r--  arch/i386/kernel/timers/timer.c | 66
-rw-r--r--  arch/i386/kernel/timers/timer_cyclone.c | 259
-rw-r--r--  arch/i386/kernel/timers/timer_hpet.c | 191
-rw-r--r--  arch/i386/kernel/timers/timer_none.c | 39
-rw-r--r--  arch/i386/kernel/timers/timer_pit.c | 206
-rw-r--r--  arch/i386/kernel/timers/timer_pm.c | 258
-rw-r--r--  arch/i386/kernel/timers/timer_tsc.c | 560
-rw-r--r--  arch/i386/kernel/trampoline.S | 80
-rw-r--r--  arch/i386/kernel/traps.c | 1084
-rw-r--r--  arch/i386/kernel/vm86.c | 804
-rw-r--r--  arch/i386/kernel/vmlinux.lds.S | 134
-rw-r--r--  arch/i386/kernel/vsyscall-int80.S | 53
-rw-r--r--  arch/i386/kernel/vsyscall-sigreturn.S | 142
-rw-r--r--  arch/i386/kernel/vsyscall-sysenter.S | 104
-rw-r--r--  arch/i386/kernel/vsyscall.S | 15
-rw-r--r--  arch/i386/kernel/vsyscall.lds.S | 65
124 files changed, 43777 insertions(+), 0 deletions(-)
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
new file mode 100644
index 000000000000..933787a46b4c
--- /dev/null
+++ b/arch/i386/kernel/Makefile
@@ -0,0 +1,71 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head.o init_task.o vmlinux.lds
6
7obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
9 pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
10 doublefault.o quirks.o
11
12obj-y += cpu/
13obj-y += timers/
14obj-$(CONFIG_ACPI_BOOT) += acpi/
15obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
16obj-$(CONFIG_MCA) += mca.o
17obj-$(CONFIG_X86_MSR) += msr.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_MICROCODE) += microcode.o
20obj-$(CONFIG_APM) += apm.o
21obj-$(CONFIG_X86_SMP) += smp.o smpboot.o
22obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
23obj-$(CONFIG_X86_MPPARSE) += mpparse.o
24obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
25obj-$(CONFIG_X86_IO_APIC) += io_apic.o
26obj-$(CONFIG_X86_NUMAQ) += numaq.o
27obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
28obj-$(CONFIG_KPROBES) += kprobes.o
29obj-$(CONFIG_MODULES) += module.o
30obj-y += sysenter.o vsyscall.o
31obj-$(CONFIG_ACPI_SRAT) += srat.o
32obj-$(CONFIG_HPET_TIMER) += time_hpet.o
33obj-$(CONFIG_EFI) += efi.o efi_stub.o
34obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
35
36EXTRA_AFLAGS := -traditional
37
38obj-$(CONFIG_SCx200) += scx200.o
39
40# vsyscall.o contains the vsyscall DSO images as __initdata.
41# We must build both images before we can assemble it.
42# Note: kbuild does not track this dependency due to usage of .incbin
43$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so
44targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
45targets += vsyscall.lds
46
47# The DSO images are built using a special linker script.
48quiet_cmd_syscall = SYSCALL $@
49 cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
50 -Wl,-T,$(filter-out FORCE,$^) -o $@
51
52export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH)
53
54vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1
55SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags)
56SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags)
57
58$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
59$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
60 $(call if_changed,syscall)
61
62# We also create a special relocatable object that should mirror the symbol
63# table and layout of the linked DSO. With ld -R we can then refer to
64# these symbols in the kernel code rather than hand-coded addresses.
65extra-y += vsyscall-syms.o
66$(obj)/built-in.o: $(obj)/vsyscall-syms.o
67$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
68
69SYSCFLAGS_vsyscall-syms.o = -r
70$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds $(obj)/vsyscall-sysenter.o FORCE
71 $(call if_changed,syscall)
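As a rough illustration of the "ld -R" arrangement described in the comments above, kernel C code can then refer to the embedded DSO image by symbol instead of by hand-coded address. The sketch below is only illustrative: the start/end symbol names are assumed here and are actually defined by vsyscall.S and the linker script added elsewhere in this patch.

	#include <linux/string.h>	/* memcpy */

	/* Assumed to be provided by vsyscall.S via .incbin of the built DSO. */
	extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;

	static void copy_vsyscall_dso(void *dest)
	{
		/* copy the embedded DSO image to its runtime page */
		memcpy(dest, &vsyscall_sysenter_start,
		       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
	}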
diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile
new file mode 100644
index 000000000000..ee75cb286cfe
--- /dev/null
+++ b/arch/i386/kernel/acpi/Makefile
@@ -0,0 +1,4 @@
1obj-$(CONFIG_ACPI_BOOT) := boot.o
2obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
3obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
4
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
new file mode 100644
index 000000000000..9ba0b957d11f
--- /dev/null
+++ b/arch/i386/kernel/acpi/boot.c
@@ -0,0 +1,908 @@
1/*
2 * boot.c - Architecture-Specific Low-Level ACPI Boot Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 *
7 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/init.h>
27#include <linux/config.h>
28#include <linux/acpi.h>
29#include <linux/efi.h>
30#include <linux/irq.h>
31#include <linux/module.h>
32
33#include <asm/pgtable.h>
34#include <asm/io_apic.h>
35#include <asm/apic.h>
36#include <asm/io.h>
37#include <asm/irq.h>
38#include <asm/mpspec.h>
39
40#ifdef CONFIG_X86_64
41
42static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) { }
43extern void __init clustered_apic_check(void);
44static inline int ioapic_setup_disabled(void) { return 0; }
45#include <asm/proto.h>
46
47#else /* X86 */
48
49#ifdef CONFIG_X86_LOCAL_APIC
50#include <mach_apic.h>
51#include <mach_mpparse.h>
52#endif /* CONFIG_X86_LOCAL_APIC */
53
54#endif /* X86 */
55
56#define BAD_MADT_ENTRY(entry, end) ( \
57 (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
58 ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
59
60#define PREFIX "ACPI: "
61
62#ifdef CONFIG_ACPI_PCI
63int acpi_noirq __initdata; /* skip ACPI IRQ initialization */
64int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
65#else
66int acpi_noirq __initdata = 1;
67int acpi_pci_disabled __initdata = 1;
68#endif
69int acpi_ht __initdata = 1; /* enable HT */
70
71int acpi_lapic;
72int acpi_ioapic;
73int acpi_strict;
74EXPORT_SYMBOL(acpi_strict);
75
76acpi_interrupt_flags acpi_sci_flags __initdata;
77int acpi_sci_override_gsi __initdata;
78int acpi_skip_timer_override __initdata;
79
80#ifdef CONFIG_X86_LOCAL_APIC
81static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
82#endif
83
84#ifndef __HAVE_ARCH_CMPXCHG
85#warning ACPI uses CMPXCHG, i486 and later hardware
86#endif
87
88#define MAX_MADT_ENTRIES 256
89u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
90 { [0 ... MAX_MADT_ENTRIES-1] = 0xff };
91EXPORT_SYMBOL(x86_acpiid_to_apicid);
92
93/* --------------------------------------------------------------------------
94 Boot-time Configuration
95 -------------------------------------------------------------------------- */
96
97/*
98 * The default interrupt routing model is PIC (8259). This gets
 99 * overridden if IOAPICs are enumerated (below).
100 */
101enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
102
103#ifdef CONFIG_X86_64
104
105/* rely on all ACPI tables being in the direct mapping */
106char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
107{
108 if (!phys_addr || !size)
109 return NULL;
110
111 if (phys_addr < (end_pfn_map << PAGE_SHIFT))
112 return __va(phys_addr);
113
114 return NULL;
115}
116
117#else
118
119/*
120 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
121 * to map the target physical address. The problem is that set_fixmap()
122 * provides a single page, and it is possible that the page is not
123 * sufficient.
124 * By using this area, we can map up to MAX_IO_APICS pages temporarily,
125 * i.e. until the next __va_range() call.
126 *
127 * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
128 * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
129 * count idx down while incrementing the phys address.
130 */
131char *__acpi_map_table(unsigned long phys, unsigned long size)
132{
133 unsigned long base, offset, mapped_size;
134 int idx;
135
136 if (phys + size < 8*1024*1024)
137 return __va(phys);
138
139 offset = phys & (PAGE_SIZE - 1);
140 mapped_size = PAGE_SIZE - offset;
141 set_fixmap(FIX_ACPI_END, phys);
142 base = fix_to_virt(FIX_ACPI_END);
143
144 /*
145 * Most cases can be covered by the below.
146 */
147 idx = FIX_ACPI_END;
148 while (mapped_size < size) {
149 if (--idx < FIX_ACPI_BEGIN)
150 return NULL; /* cannot handle this */
151 phys += PAGE_SIZE;
152 set_fixmap(idx, phys);
153 mapped_size += PAGE_SIZE;
154 }
155
156 return ((unsigned char *) base + offset);
157}
158#endif
159
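To make the fixmap arithmetic above concrete (illustrative numbers, assuming a 4 KB PAGE_SIZE): a table at phys = 0x7fff0f00 with size = 0x300 gives offset = 0xf00, so the first fixmap page covers only 0x100 bytes of the table; the loop then maps one more page at the next fixmap index below FIX_ACPI_END, mapped_size grows to 0x1100 >= 0x300, and the function returns base + 0xf00.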
160#ifdef CONFIG_PCI_MMCONFIG
161static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
162{
163 struct acpi_table_mcfg *mcfg;
164
165 if (!phys_addr || !size)
166 return -EINVAL;
167
168 mcfg = (struct acpi_table_mcfg *) __acpi_map_table(phys_addr, size);
169 if (!mcfg) {
170 printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
171 return -ENODEV;
172 }
173
174 if (mcfg->base_reserved) {
175 printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n");
176 return -ENODEV;
177 }
178
179 pci_mmcfg_base_addr = mcfg->base_address;
180
181 return 0;
182}
183#else
184#define acpi_parse_mcfg NULL
185#endif /* !CONFIG_PCI_MMCONFIG */
186
187#ifdef CONFIG_X86_LOCAL_APIC
188static int __init
189acpi_parse_madt (
190 unsigned long phys_addr,
191 unsigned long size)
192{
193 struct acpi_table_madt *madt = NULL;
194
195 if (!phys_addr || !size)
196 return -EINVAL;
197
198 madt = (struct acpi_table_madt *) __acpi_map_table(phys_addr, size);
199 if (!madt) {
200 printk(KERN_WARNING PREFIX "Unable to map MADT\n");
201 return -ENODEV;
202 }
203
204 if (madt->lapic_address) {
205 acpi_lapic_addr = (u64) madt->lapic_address;
206
207 printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
208 madt->lapic_address);
209 }
210
211 acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
212
213 return 0;
214}
215
216
217static int __init
218acpi_parse_lapic (
219 acpi_table_entry_header *header, const unsigned long end)
220{
221 struct acpi_table_lapic *processor = NULL;
222
223 processor = (struct acpi_table_lapic*) header;
224
225 if (BAD_MADT_ENTRY(processor, end))
226 return -EINVAL;
227
228 acpi_table_print_madt_entry(header);
229
230 /* no utility in registering a disabled processor */
231 if (processor->flags.enabled == 0)
232 return 0;
233
234 x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
235
236 mp_register_lapic (
237 processor->id, /* APIC ID */
238 processor->flags.enabled); /* Enabled? */
239
240 return 0;
241}
242
243static int __init
244acpi_parse_lapic_addr_ovr (
245 acpi_table_entry_header *header, const unsigned long end)
246{
247 struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
248
249 lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr*) header;
250
251 if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
252 return -EINVAL;
253
254 acpi_lapic_addr = lapic_addr_ovr->address;
255
256 return 0;
257}
258
259static int __init
260acpi_parse_lapic_nmi (
261 acpi_table_entry_header *header, const unsigned long end)
262{
263 struct acpi_table_lapic_nmi *lapic_nmi = NULL;
264
265 lapic_nmi = (struct acpi_table_lapic_nmi*) header;
266
267 if (BAD_MADT_ENTRY(lapic_nmi, end))
268 return -EINVAL;
269
270 acpi_table_print_madt_entry(header);
271
272 if (lapic_nmi->lint != 1)
273 printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
274
275 return 0;
276}
277
278
279#endif /*CONFIG_X86_LOCAL_APIC*/
280
281#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER)
282
283static int __init
284acpi_parse_ioapic (
285 acpi_table_entry_header *header, const unsigned long end)
286{
287 struct acpi_table_ioapic *ioapic = NULL;
288
289 ioapic = (struct acpi_table_ioapic*) header;
290
291 if (BAD_MADT_ENTRY(ioapic, end))
292 return -EINVAL;
293
294 acpi_table_print_madt_entry(header);
295
296 mp_register_ioapic (
297 ioapic->id,
298 ioapic->address,
299 ioapic->global_irq_base);
300
301 return 0;
302}
303
304/*
305 * Parse Interrupt Source Override for the ACPI SCI
306 */
307static void
308acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
309{
310 if (trigger == 0) /* compatible SCI trigger is level */
311 trigger = 3;
312
313 if (polarity == 0) /* compatible SCI polarity is low */
314 polarity = 3;
315
316 /* Command-line over-ride via acpi_sci= */
317 if (acpi_sci_flags.trigger)
318 trigger = acpi_sci_flags.trigger;
319
320 if (acpi_sci_flags.polarity)
321 polarity = acpi_sci_flags.polarity;
322
323 /*
324 * mp_config_acpi_legacy_irqs() already setup IRQs < 16
325 * If GSI is < 16, this will update its flags,
326 * else it will create a new mp_irqs[] entry.
327 */
328 mp_override_legacy_irq(gsi, polarity, trigger, gsi);
329
330 /*
331 * stash over-ride to indicate we've been here
332 * and for later update of acpi_fadt
333 */
334 acpi_sci_override_gsi = gsi;
335 return;
336}
337
338static int __init
339acpi_parse_int_src_ovr (
340 acpi_table_entry_header *header, const unsigned long end)
341{
342 struct acpi_table_int_src_ovr *intsrc = NULL;
343
344 intsrc = (struct acpi_table_int_src_ovr*) header;
345
346 if (BAD_MADT_ENTRY(intsrc, end))
347 return -EINVAL;
348
349 acpi_table_print_madt_entry(header);
350
351 if (intsrc->bus_irq == acpi_fadt.sci_int) {
352 acpi_sci_ioapic_setup(intsrc->global_irq,
353 intsrc->flags.polarity, intsrc->flags.trigger);
354 return 0;
355 }
356
357 if (acpi_skip_timer_override &&
358 intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
359 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
360 return 0;
361 }
362
363 mp_override_legacy_irq (
364 intsrc->bus_irq,
365 intsrc->flags.polarity,
366 intsrc->flags.trigger,
367 intsrc->global_irq);
368
369 return 0;
370}
371
372
373static int __init
374acpi_parse_nmi_src (
375 acpi_table_entry_header *header, const unsigned long end)
376{
377 struct acpi_table_nmi_src *nmi_src = NULL;
378
379 nmi_src = (struct acpi_table_nmi_src*) header;
380
381 if (BAD_MADT_ENTRY(nmi_src, end))
382 return -EINVAL;
383
384 acpi_table_print_madt_entry(header);
385
 386	/* TBD: Support nmi_src entries? */
387
388 return 0;
389}
390
391#endif /* CONFIG_X86_IO_APIC */
392
393#ifdef CONFIG_ACPI_BUS
394
395/*
396 * acpi_pic_sci_set_trigger()
397 *
398 * use ELCR to set PIC-mode trigger type for SCI
399 *
400 * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
401 * it may require Edge Trigger -- use "acpi_sci=edge"
402 *
 403 * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
 404 * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
 405 * ELCR1 covers IRQs 0-7 (IRQ 0, 1, 2 must be 0)
 406 * ELCR2 covers IRQs 8-15 (IRQ 8, 13 must be 0)
407 */
408
409void __init
410acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
411{
412 unsigned int mask = 1 << irq;
413 unsigned int old, new;
414
415 /* Real old ELCR mask */
416 old = inb(0x4d0) | (inb(0x4d1) << 8);
417
418 /*
419 * If we use ACPI to set PCI irq's, then we should clear ELCR
420 * since we will set it correctly as we enable the PCI irq
421 * routing.
422 */
423 new = acpi_noirq ? old : 0;
424
425 /*
426 * Update SCI information in the ELCR, it isn't in the PCI
427 * routing tables..
428 */
429 switch (trigger) {
430 case 1: /* Edge - clear */
431 new &= ~mask;
432 break;
433 case 3: /* Level - set */
434 new |= mask;
435 break;
436 }
437
438 if (old == new)
439 return;
440
441 printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
442 outb(new, 0x4d0);
443 outb(new >> 8, 0x4d1);
444}
445
446
447#endif /* CONFIG_ACPI_BUS */
448
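As a worked example of the ELCR update above (illustrative values only): with the SCI routed to IRQ 9 and ACPI handling PCI IRQ routing (acpi_noirq == 0, so new starts from 0), a level trigger sets mask = 1 << 9 = 0x0200, giving new = 0x0200; the code then writes 0x00 to port 0x4d0 and 0x02 to port 0x4d1.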
449int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
450{
451#ifdef CONFIG_X86_IO_APIC
452 if (use_pci_vector() && !platform_legacy_irq(gsi))
453 *irq = IO_APIC_VECTOR(gsi);
454 else
455#endif
456 *irq = gsi;
457 return 0;
458}
459
460unsigned int acpi_register_gsi(u32 gsi, int edge_level, int active_high_low)
461{
462 unsigned int irq;
463 unsigned int plat_gsi = gsi;
464
465#ifdef CONFIG_PCI
466 /*
467 * Make sure all (legacy) PCI IRQs are set as level-triggered.
468 */
469 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
470 extern void eisa_set_level_irq(unsigned int irq);
471
472 if (edge_level == ACPI_LEVEL_SENSITIVE)
473 eisa_set_level_irq(gsi);
474 }
475#endif
476
477#ifdef CONFIG_X86_IO_APIC
478 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
479 plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low);
480 }
481#endif
482 acpi_gsi_to_irq(plat_gsi, &irq);
483 return irq;
484}
485EXPORT_SYMBOL(acpi_register_gsi);
486
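A minimal sketch of a caller of acpi_register_gsi() (hypothetical code -- the real callers live in the ACPI PCI IRQ-routing layer, not in this file; the flag constants come from the ACPI headers):

	#include <linux/acpi.h>

	/* Hypothetical: register a level-triggered, active-low GSI taken from a
	 * device's _CRS and get back the kernel IRQ number to request. */
	static unsigned int example_register_device_gsi(u32 gsi)
	{
		return acpi_register_gsi(gsi, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
	}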
487/*
488 * ACPI based hotplug support for CPU
489 */
490#ifdef CONFIG_ACPI_HOTPLUG_CPU
491int
492acpi_map_lsapic(acpi_handle handle, int *pcpu)
493{
494 /* TBD */
495 return -EINVAL;
496}
497EXPORT_SYMBOL(acpi_map_lsapic);
498
499
500int
501acpi_unmap_lsapic(int cpu)
502{
503 /* TBD */
504 return -EINVAL;
505}
506EXPORT_SYMBOL(acpi_unmap_lsapic);
507#endif /* CONFIG_ACPI_HOTPLUG_CPU */
508
509static unsigned long __init
510acpi_scan_rsdp (
511 unsigned long start,
512 unsigned long length)
513{
514 unsigned long offset = 0;
515 unsigned long sig_len = sizeof("RSD PTR ") - 1;
516
517 /*
518 * Scan all 16-byte boundaries of the physical memory region for the
519 * RSDP signature.
520 */
521 for (offset = 0; offset < length; offset += 16) {
522 if (strncmp((char *) (start + offset), "RSD PTR ", sig_len))
523 continue;
524 return (start + offset);
525 }
526
527 return 0;
528}
529
530static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
531{
532 struct acpi_table_sbf *sb;
533
534 if (!phys_addr || !size)
535 return -EINVAL;
536
537 sb = (struct acpi_table_sbf *) __acpi_map_table(phys_addr, size);
538 if (!sb) {
539 printk(KERN_WARNING PREFIX "Unable to map SBF\n");
540 return -ENODEV;
541 }
542
543 sbf_port = sb->sbf_cmos; /* Save CMOS port */
544
545 return 0;
546}
547
548
549#ifdef CONFIG_HPET_TIMER
550
551static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
552{
553 struct acpi_table_hpet *hpet_tbl;
554
555 if (!phys || !size)
556 return -EINVAL;
557
558 hpet_tbl = (struct acpi_table_hpet *) __acpi_map_table(phys, size);
559 if (!hpet_tbl) {
560 printk(KERN_WARNING PREFIX "Unable to map HPET\n");
561 return -ENODEV;
562 }
563
564 if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
565 printk(KERN_WARNING PREFIX "HPET timers must be located in "
566 "memory.\n");
567 return -1;
568 }
569
570#ifdef CONFIG_X86_64
571 vxtime.hpet_address = hpet_tbl->addr.addrl |
572 ((long) hpet_tbl->addr.addrh << 32);
573
574 printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
575 hpet_tbl->id, vxtime.hpet_address);
576#else /* X86 */
577 {
578 extern unsigned long hpet_address;
579
580 hpet_address = hpet_tbl->addr.addrl;
581 printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
582 hpet_tbl->id, hpet_address);
583 }
584#endif /* X86 */
585
586 return 0;
587}
588#else
589#define acpi_parse_hpet NULL
590#endif
591
592#ifdef CONFIG_X86_PM_TIMER
593extern u32 pmtmr_ioport;
594#endif
595
596static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
597{
598 struct fadt_descriptor_rev2 *fadt = NULL;
599
600 fadt = (struct fadt_descriptor_rev2*) __acpi_map_table(phys,size);
601 if(!fadt) {
602 printk(KERN_WARNING PREFIX "Unable to map FADT\n");
603 return 0;
604 }
605
606#ifdef CONFIG_ACPI_INTERPRETER
607 /* initialize sci_int early for INT_SRC_OVR MADT parsing */
608 acpi_fadt.sci_int = fadt->sci_int;
609#endif
610
611#ifdef CONFIG_X86_PM_TIMER
612 /* detect the location of the ACPI PM Timer */
613 if (fadt->revision >= FADT2_REVISION_ID) {
614 /* FADT rev. 2 */
615 if (fadt->xpm_tmr_blk.address_space_id != ACPI_ADR_SPACE_SYSTEM_IO)
616 return 0;
617
618 pmtmr_ioport = fadt->xpm_tmr_blk.address;
619 } else {
620 /* FADT rev. 1 */
621 pmtmr_ioport = fadt->V1_pm_tmr_blk;
622 }
623 if (pmtmr_ioport)
624 printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", pmtmr_ioport);
625#endif
626 return 0;
627}
628
629
630unsigned long __init
631acpi_find_rsdp (void)
632{
633 unsigned long rsdp_phys = 0;
634
635 if (efi_enabled) {
636 if (efi.acpi20)
637 return __pa(efi.acpi20);
638 else if (efi.acpi)
639 return __pa(efi.acpi);
640 }
641 /*
642 * Scan memory looking for the RSDP signature. First search EBDA (low
643 * memory) paragraphs and then search upper memory (E0000-FFFFF).
644 */
645 rsdp_phys = acpi_scan_rsdp (0, 0x400);
646 if (!rsdp_phys)
647 rsdp_phys = acpi_scan_rsdp (0xE0000, 0xFFFFF);
648
649 return rsdp_phys;
650}
651
652#ifdef CONFIG_X86_LOCAL_APIC
653/*
654 * Parse LAPIC entries in MADT
655 * returns 0 on success, < 0 on error
656 */
657static int __init
658acpi_parse_madt_lapic_entries(void)
659{
660 int count;
661
662 /*
663 * Note that the LAPIC address is obtained from the MADT (32-bit value)
 664	 * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
665 */
666
667 count = acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0);
668 if (count < 0) {
669 printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n");
670 return count;
671 }
672
673 mp_register_lapic_address(acpi_lapic_addr);
674
675 count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
676 MAX_APICS);
677 if (!count) {
678 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
679 /* TBD: Cleanup to allow fallback to MPS */
680 return -ENODEV;
681 }
682 else if (count < 0) {
683 printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
684 /* TBD: Cleanup to allow fallback to MPS */
685 return count;
686 }
687
688 count = acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
689 if (count < 0) {
690 printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
691 /* TBD: Cleanup to allow fallback to MPS */
692 return count;
693 }
694 return 0;
695}
696#endif /* CONFIG_X86_LOCAL_APIC */
697
698#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER)
699/*
700 * Parse IOAPIC related entries in MADT
701 * returns 0 on success, < 0 on error
702 */
703static int __init
704acpi_parse_madt_ioapic_entries(void)
705{
706 int count;
707
708 /*
709 * ACPI interpreter is required to complete interrupt setup,
710 * so if it is off, don't enumerate the io-apics with ACPI.
711 * If MPS is present, it will handle them,
712 * otherwise the system will stay in PIC mode
713 */
714 if (acpi_disabled || acpi_noirq) {
715 return -ENODEV;
716 }
717
718 /*
719 * if "noapic" boot option, don't look for IO-APICs
720 */
721 if (skip_ioapic_setup) {
722 printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
723 "due to 'noapic' option.\n");
724 return -ENODEV;
725 }
726
727 count = acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic, MAX_IO_APICS);
728 if (!count) {
729 printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
730 return -ENODEV;
731 }
732 else if (count < 0) {
733 printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
734 return count;
735 }
736
737 count = acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, NR_IRQ_VECTORS);
738 if (count < 0) {
739 printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n");
740 /* TBD: Cleanup to allow fallback to MPS */
741 return count;
742 }
743
744 /*
745 * If BIOS did not supply an INT_SRC_OVR for the SCI
746 * pretend we got one so we can set the SCI flags.
747 */
748 if (!acpi_sci_override_gsi)
749 acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
750
751 /* Fill in identity legacy mapings where no override */
752 mp_config_acpi_legacy_irqs();
753
754 count = acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, NR_IRQ_VECTORS);
755 if (count < 0) {
756 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
757 /* TBD: Cleanup to allow fallback to MPS */
758 return count;
759 }
760
761 return 0;
762}
763#else
764static inline int acpi_parse_madt_ioapic_entries(void)
765{
766 return -1;
767}
768#endif /* !(CONFIG_X86_IO_APIC && CONFIG_ACPI_INTERPRETER) */
769
770
771static void __init
772acpi_process_madt(void)
773{
774#ifdef CONFIG_X86_LOCAL_APIC
775 int count, error;
776
777 count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
778 if (count >= 1) {
779
780 /*
781 * Parse MADT LAPIC entries
782 */
783 error = acpi_parse_madt_lapic_entries();
784 if (!error) {
785 acpi_lapic = 1;
786
787 /*
788 * Parse MADT IO-APIC entries
789 */
790 error = acpi_parse_madt_ioapic_entries();
791 if (!error) {
792 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
793 acpi_irq_balance_set(NULL);
794 acpi_ioapic = 1;
795
796 smp_found_config = 1;
797 clustered_apic_check();
798 }
799 }
800 if (error == -EINVAL) {
801 /*
802 * Dell Precision Workstation 410, 610 come here.
803 */
804 printk(KERN_ERR PREFIX "Invalid BIOS MADT, disabling ACPI\n");
805 disable_acpi();
806 }
807 }
808#endif
809 return;
810}
811
812/*
813 * acpi_boot_table_init() and acpi_boot_init()
814 * called from setup_arch(), always.
815 * 1. checksums all tables
816 * 2. enumerates lapics
817 * 3. enumerates io-apics
818 *
819 * acpi_table_init() is separate to allow reading SRAT without
820 * other side effects.
821 *
822 * side effects of acpi_boot_init:
823 * acpi_lapic = 1 if LAPIC found
824 * acpi_ioapic = 1 if IOAPIC found
825 * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
826 * if acpi_blacklisted() acpi_disabled = 1;
827 * acpi_irq_model=...
828 * ...
829 *
830 * return value: (currently ignored)
831 * 0: success
832 * !0: failure
833 */
834
835int __init
836acpi_boot_table_init(void)
837{
838 int error;
839
840 /*
841 * If acpi_disabled, bail out
842 * One exception: acpi=ht continues far enough to enumerate LAPICs
843 */
844 if (acpi_disabled && !acpi_ht)
845 return 1;
846
847 /*
848 * Initialize the ACPI boot-time table parser.
849 */
850 error = acpi_table_init();
851 if (error) {
852 disable_acpi();
853 return error;
854 }
855
856#ifdef __i386__
857 check_acpi_pci();
858#endif
859
860 acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
861
862 /*
863 * blacklist may disable ACPI entirely
864 */
865 error = acpi_blacklisted();
866 if (error) {
867 extern int acpi_force;
868
869 if (acpi_force) {
870 printk(KERN_WARNING PREFIX "acpi=force override\n");
871 } else {
872 printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
873 disable_acpi();
874 return error;
875 }
876 }
877
878 return 0;
879}
880
881
882int __init acpi_boot_init(void)
883{
884 /*
885 * If acpi_disabled, bail out
886 * One exception: acpi=ht continues far enough to enumerate LAPICs
887 */
888 if (acpi_disabled && !acpi_ht)
889 return 1;
890
891 acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
892
893 /*
894 * set sci_int and PM timer address
895 */
896 acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
897
898 /*
899 * Process the Multiple APIC Description Table (MADT), if present
900 */
901 acpi_process_madt();
902
903 acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
904 acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
905
906 return 0;
907}
908
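The ordering spelled out in the comment block above corresponds roughly to the following simplified sketch; the real call sites are in setup_arch() in setup.c (added elsewhere in this patch), which interleaves other work between the two calls:

	/* Simplified sketch of the boot-time ACPI ordering; not literal setup.c code. */
	void __init example_setup_arch_acpi(void)
	{
		acpi_boot_table_init();	/* locate, checksum and blacklist-check the tables */
		acpi_boot_init();	/* parse SBF, FADT, MADT (LAPICs/IO-APICs), HPET, MCFG */
	}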
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
new file mode 100644
index 000000000000..726a5ca4b165
--- /dev/null
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -0,0 +1,51 @@
1/*
2 * Do early PCI probing for bug detection when the main PCI subsystem is
3 * not up yet.
4 */
5#include <linux/init.h>
6#include <linux/kernel.h>
7#include <linux/pci.h>
8#include <asm/pci-direct.h>
9#include <asm/acpi.h>
10
11static int __init check_bridge(int vendor, int device)
12{
13 /* According to Nvidia all timer overrides are bogus. Just ignore
14 them all. */
15 if (vendor == PCI_VENDOR_ID_NVIDIA) {
16 acpi_skip_timer_override = 1;
17 }
18 return 0;
19}
20
21void __init check_acpi_pci(void)
22{
23 int num,slot,func;
24
25 /* Assume the machine supports type 1. If not it will
26 always read ffffffff and should not have any side effect. */
27
28 /* Poor man's PCI discovery */
29 for (num = 0; num < 32; num++) {
30 for (slot = 0; slot < 32; slot++) {
31 for (func = 0; func < 8; func++) {
32 u32 class;
33 u32 vendor;
34 class = read_pci_config(num,slot,func,
35 PCI_CLASS_REVISION);
36 if (class == 0xffffffff)
37 break;
38
39 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
40 continue;
41
42 vendor = read_pci_config(num, slot, func,
43 PCI_VENDOR_ID);
44
45 if (check_bridge(vendor&0xffff, vendor >> 16))
46 return;
47 }
48
49 }
50 }
51}
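read_pci_config() is not part of this patch (it comes from asm/pci-direct.h); conceptually it performs a classic type-1 configuration-space access, roughly along the lines of the sketch below (an assumption about the mechanism, not the actual header contents):

	#include <asm/io.h>

	/* Sketch of a type-1 PCI config read: write the address to 0xcf8,
	 * then read the dword back from 0xcfc. */
	static u32 example_read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
	{
		outl(0x80000000 | (bus << 16) | (slot << 11) | (func << 8) | offset,
		     0xcf8);
		return inl(0xcfc);
	}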
diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..28bb0514bb6e
--- /dev/null
+++ b/arch/i386/kernel/acpi/sleep.c
@@ -0,0 +1,93 @@
1/*
2 * sleep.c - x86-specific ACPI sleep support.
3 *
4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
6 */
7
8#include <linux/acpi.h>
9#include <linux/bootmem.h>
10#include <asm/smp.h>
11#include <asm/tlbflush.h>
12
13/* address in low memory of the wakeup routine. */
14unsigned long acpi_wakeup_address = 0;
15unsigned long acpi_video_flags;
16extern char wakeup_start, wakeup_end;
17
18extern void zap_low_mappings(void);
19
20extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
21
22static void init_low_mapping(pgd_t *pgd, int pgd_limit)
23{
24 int pgd_ofs = 0;
25
26 while ((pgd_ofs < pgd_limit) && (pgd_ofs + USER_PTRS_PER_PGD < PTRS_PER_PGD)) {
27 set_pgd(pgd, *(pgd+USER_PTRS_PER_PGD));
28 pgd_ofs++, pgd++;
29 }
30 flush_tlb_all();
31}
32
33/**
34 * acpi_save_state_mem - save kernel state
35 *
36 * Create an identity mapped page table and copy the wakeup routine to
37 * low memory.
38 */
39int acpi_save_state_mem (void)
40{
41 if (!acpi_wakeup_address)
42 return 1;
43 init_low_mapping(swapper_pg_dir, USER_PTRS_PER_PGD);
44 memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
45 acpi_copy_wakeup_routine(acpi_wakeup_address);
46
47 return 0;
48}
49
50/*
51 * acpi_restore_state - undo effects of acpi_save_state_mem
52 */
53void acpi_restore_state_mem (void)
54{
55 zap_low_mappings();
56}
57
58/**
59 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
60 *
61 * We allocate a page from the first 1MB of memory for the wakeup
62 * routine for when we come back from a sleep state. The
63 * runtime allocator allows specification of <16MB pages, but not
64 * <1MB pages.
65 */
66void __init acpi_reserve_bootmem(void)
67{
68 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
69 printk(KERN_ERR "ACPI: Wakeup code way too big, S3 disabled.\n");
70 return;
71 }
72
73 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
74 if (!acpi_wakeup_address)
75 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
76}
77
78static int __init acpi_sleep_setup(char *str)
79{
80 while ((str != NULL) && (*str != '\0')) {
81 if (strncmp(str, "s3_bios", 7) == 0)
82 acpi_video_flags = 1;
83 if (strncmp(str, "s3_mode", 7) == 0)
84 acpi_video_flags |= 2;
85 str = strchr(str, ',');
86 if (str != NULL)
87 str += strspn(str, ", \t");
88 }
89 return 1;
90}
91
92
93__setup("acpi_sleep=", acpi_sleep_setup);
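For example, booting with "acpi_sleep=s3_bios,s3_mode" takes both branches above and leaves acpi_video_flags set to 3, so the wakeup code both calls the video BIOS and restores the saved video mode.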
diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S
new file mode 100644
index 000000000000..39d32484f6f5
--- /dev/null
+++ b/arch/i386/kernel/acpi/wakeup.S
@@ -0,0 +1,318 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/page.h>
5
6#
7# wakeup_code runs in real mode, and at unknown address (determined at run-time).
8# Therefore it must only use relative jumps/calls.
9#
10# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled
11#
12# If physical address of wakeup_code is 0x12345, BIOS should call us with
13# cs = 0x1234, eip = 0x05
14#
15
16ALIGN
17 .align 4096
18ENTRY(wakeup_start)
19wakeup_code:
20 wakeup_code_start = .
21 .code16
22
23 movw $0xb800, %ax
24 movw %ax,%fs
25 movw $0x0e00 + 'L', %fs:(0x10)
26
27 cli
28 cld
29
30 # setup data segment
31 movw %cs, %ax
32 movw %ax, %ds # Make ds:0 point to wakeup_start
33 movw %ax, %ss
34 mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
35 movw $0x0e00 + 'S', %fs:(0x12)
36
37 pushl $0 # Kill any dangerous flags
38 popfl
39
40 movl real_magic - wakeup_code, %eax
41 cmpl $0x12345678, %eax
42 jne bogus_real_magic
43
44 testl $1, video_flags - wakeup_code
45 jz 1f
46 lcall $0xc000,$3
47 movw %cs, %ax
48 movw %ax, %ds # Bios might have played with that
49 movw %ax, %ss
501:
51
52 testl $2, video_flags - wakeup_code
53 jz 1f
54 mov video_mode - wakeup_code, %ax
55 call mode_set
561:
57
58 # set up page table
59 movl $swapper_pg_dir-__PAGE_OFFSET, %eax
60 movl %eax, %cr3
61
62 testl $1, real_efer_save_restore - wakeup_code
63 jz 4f
64 # restore efer setting
65 movl real_save_efer_edx - wakeup_code, %edx
66 movl real_save_efer_eax - wakeup_code, %eax
67 mov $0xc0000080, %ecx
68 wrmsr
694:
70 # make sure %cr4 is set correctly (features, etc)
71 movl real_save_cr4 - wakeup_code, %eax
72 movl %eax, %cr4
73 movw $0xb800, %ax
74 movw %ax,%fs
75 movw $0x0e00 + 'i', %fs:(0x12)
76
77 # need a gdt
78 lgdt real_save_gdt - wakeup_code
79
80 movl real_save_cr0 - wakeup_code, %eax
81 movl %eax, %cr0
82 jmp 1f
831:
84 movw $0x0e00 + 'n', %fs:(0x14)
85
86 movl real_magic - wakeup_code, %eax
87 cmpl $0x12345678, %eax
88 jne bogus_real_magic
89
90 ljmpl $__KERNEL_CS,$wakeup_pmode_return
91
92real_save_gdt: .word 0
93 .long 0
94real_save_cr0: .long 0
95real_save_cr3: .long 0
96real_save_cr4: .long 0
97real_magic: .long 0
98video_mode: .long 0
99video_flags: .long 0
100real_efer_save_restore: .long 0
101real_save_efer_edx: .long 0
102real_save_efer_eax: .long 0
103
104bogus_real_magic:
105 movw $0x0e00 + 'B', %fs:(0x12)
106 jmp bogus_real_magic
107
108/* This code uses an extended set of video mode numbers. These include:
109 * Aliases for standard modes
110 * NORMAL_VGA (-1)
111 * EXTENDED_VGA (-2)
112 * ASK_VGA (-3)
113 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
114 * of compatibility when extending the table. These are between 0x00 and 0xff.
115 */
116#define VIDEO_FIRST_MENU 0x0000
117
118/* Standard BIOS video modes (BIOS number + 0x0100) */
119#define VIDEO_FIRST_BIOS 0x0100
120
121/* VESA BIOS video modes (VESA number + 0x0200) */
122#define VIDEO_FIRST_VESA 0x0200
123
124/* Video7 special modes (BIOS number + 0x0900) */
125#define VIDEO_FIRST_V7 0x0900
126
127# Setting of user mode (AX=mode ID) => CF=success
128mode_set:
129 movw %ax, %bx
130#if 0
131 cmpb $0xff, %ah
132 jz setalias
133
134 testb $VIDEO_RECALC>>8, %ah
135 jnz _setrec
136
137 cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
138 jnc setres
139
140 cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
141 jz setspc
142
143 cmpb $VIDEO_FIRST_V7>>8, %ah
144 jz setv7
145#endif
146
147 cmpb $VIDEO_FIRST_VESA>>8, %ah
148 jnc check_vesa
149#if 0
150 orb %ah, %ah
151 jz setmenu
152#endif
153
154 decb %ah
155# jz setbios Add bios modes later
156
157setbad: clc
158 ret
159
160check_vesa:
161 subb $VIDEO_FIRST_VESA>>8, %bh
162 orw $0x4000, %bx # Use linear frame buffer
163 movw $0x4f02, %ax # VESA BIOS mode set call
164 int $0x10
165 cmpw $0x004f, %ax # AL=4f if implemented
166 jnz _setbad # AH=0 if OK
167
168 stc
169 ret
170
171_setbad: jmp setbad
172
173 .code32
174 ALIGN
175
176.org 0x800
177wakeup_stack_begin: # Stack grows down
178
179.org 0xff0 # Just below end of page
180wakeup_stack:
181ENTRY(wakeup_end)
182
183.org 0x1000
184
185wakeup_pmode_return:
186 movw $__KERNEL_DS, %ax
187 movw %ax, %ss
188 movw %ax, %ds
189 movw %ax, %es
190 movw %ax, %fs
191 movw %ax, %gs
192 movw $0x0e00 + 'u', 0xb8016
193
194 # reload the gdt, as we need the full 32 bit address
195 lgdt saved_gdt
196 lidt saved_idt
197 lldt saved_ldt
198 ljmp $(__KERNEL_CS),$1f
1991:
200 movl %cr3, %eax
201 movl %eax, %cr3
202 wbinvd
203
204 # and restore the stack ... but you need gdt for this to work
205 movl saved_context_esp, %esp
206
207 movl %cs:saved_magic, %eax
208 cmpl $0x12345678, %eax
209 jne bogus_magic
210
211 # jump to place where we left off
212 movl saved_eip,%eax
213 jmp *%eax
214
215bogus_magic:
216 movw $0x0e00 + 'B', 0xb8018
217 jmp bogus_magic
218
219
220##
221# acpi_copy_wakeup_routine
222#
223# Copy the above routine to low memory.
224#
225# Parameters:
226# %eax: place to copy wakeup routine to
227#
228# Returned address is location of code in low memory (past data and stack)
229#
230ENTRY(acpi_copy_wakeup_routine)
231
232 sgdt saved_gdt
233 sidt saved_idt
234 sldt saved_ldt
235 str saved_tss
236
237 movl nx_enabled, %edx
238 movl %edx, real_efer_save_restore - wakeup_start (%eax)
239 testl $1, real_efer_save_restore - wakeup_start (%eax)
240 jz 2f
241 # save efer setting
242 pushl %eax
243 movl %eax, %ebx
244 mov $0xc0000080, %ecx
245 rdmsr
246 movl %edx, real_save_efer_edx - wakeup_start (%ebx)
247 movl %eax, real_save_efer_eax - wakeup_start (%ebx)
248 popl %eax
2492:
250
251 movl %cr3, %edx
252 movl %edx, real_save_cr3 - wakeup_start (%eax)
253 movl %cr4, %edx
254 movl %edx, real_save_cr4 - wakeup_start (%eax)
255 movl %cr0, %edx
256 movl %edx, real_save_cr0 - wakeup_start (%eax)
257 sgdt real_save_gdt - wakeup_start (%eax)
258
259 movl saved_videomode, %edx
260 movl %edx, video_mode - wakeup_start (%eax)
261 movl acpi_video_flags, %edx
262 movl %edx, video_flags - wakeup_start (%eax)
263 movl $0x12345678, real_magic - wakeup_start (%eax)
264 movl $0x12345678, saved_magic
265 ret
266
267.data
268ALIGN
269ENTRY(saved_magic) .long 0
270ENTRY(saved_eip) .long 0
271
272save_registers:
273 leal 4(%esp), %eax
274 movl %eax, saved_context_esp
275 movl %ebx, saved_context_ebx
276 movl %ebp, saved_context_ebp
277 movl %esi, saved_context_esi
278 movl %edi, saved_context_edi
279 pushfl ; popl saved_context_eflags
280
281 movl $ret_point, saved_eip
282 ret
283
284
285restore_registers:
286 movl saved_context_ebp, %ebp
287 movl saved_context_ebx, %ebx
288 movl saved_context_esi, %esi
289 movl saved_context_edi, %edi
290 pushl saved_context_eflags ; popfl
291 ret
292
293ENTRY(do_suspend_lowlevel)
294 call save_processor_state
295 call save_registers
296 pushl $3
297 call acpi_enter_sleep_state
298 addl $4, %esp
299 ret
300 .p2align 4,,7
301ret_point:
302 call restore_registers
303 call restore_processor_state
304 ret
305
306ENTRY(do_suspend_lowlevel_s4bios)
307 call save_processor_state
308 call save_registers
309 call acpi_enter_sleep_state_s4bios
310 ret
311
312ALIGN
313# saved registers
314saved_gdt: .long 0,0
315saved_idt: .long 0,0
316saved_ldt: .long 0
317saved_tss: .long 0
318
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
new file mode 100644
index 000000000000..35c1751ea0b0
--- /dev/null
+++ b/arch/i386/kernel/apic.c
@@ -0,0 +1,1278 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/config.h>
18#include <linux/init.h>
19
20#include <linux/mm.h>
21#include <linux/irq.h>
22#include <linux/delay.h>
23#include <linux/bootmem.h>
24#include <linux/smp_lock.h>
25#include <linux/interrupt.h>
26#include <linux/mc146818rtc.h>
27#include <linux/kernel_stat.h>
28#include <linux/sysdev.h>
29
30#include <asm/atomic.h>
31#include <asm/smp.h>
32#include <asm/mtrr.h>
33#include <asm/mpspec.h>
34#include <asm/desc.h>
35#include <asm/arch_hooks.h>
36#include <asm/hpet.h>
37
38#include <mach_apic.h>
39
40#include "io_ports.h"
41
42/*
43 * Debug level
44 */
45int apic_verbosity;
46
47
48static void apic_pm_activate(void);
49
50/*
51 * 'what should we do if we get a hw irq event on an illegal vector'.
52 * each architecture has to answer this itself.
53 */
54void ack_bad_irq(unsigned int irq)
55{
56 printk("unexpected IRQ trap at vector %02x\n", irq);
57 /*
58 * Currently unexpected vectors happen only on SMP and APIC.
59 * We _must_ ack these because every local APIC has only N
60 * irq slots per priority level, and a 'hanging, unacked' IRQ
61 * holds up an irq slot - in excessive cases (when multiple
62 * unexpected vectors occur) that might lock up the APIC
63 * completely.
64 */
65 ack_APIC_irq();
66}
67
68void __init apic_intr_init(void)
69{
70#ifdef CONFIG_SMP
71 smp_intr_init();
72#endif
73 /* self generated IPI for local APIC timer */
74 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
75
76 /* IPI vectors for APIC spurious and error interrupts */
77 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
78 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
79
80 /* thermal monitor LVT interrupt */
81#ifdef CONFIG_X86_MCE_P4THERMAL
82 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
83#endif
84}
85
86/* Using APIC to generate smp_local_timer_interrupt? */
87int using_apic_timer = 0;
88
89static DEFINE_PER_CPU(int, prof_multiplier) = 1;
90static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
91static DEFINE_PER_CPU(int, prof_counter) = 1;
92
93static int enabled_via_apicbase;
94
95void enable_NMI_through_LVT0 (void * dummy)
96{
97 unsigned int v, ver;
98
99 ver = apic_read(APIC_LVR);
100 ver = GET_APIC_VERSION(ver);
101 v = APIC_DM_NMI; /* unmask and set to NMI */
102 if (!APIC_INTEGRATED(ver)) /* 82489DX */
103 v |= APIC_LVT_LEVEL_TRIGGER;
104 apic_write_around(APIC_LVT0, v);
105}
106
107int get_physical_broadcast(void)
108{
109 unsigned int lvr, version;
110 lvr = apic_read(APIC_LVR);
111 version = GET_APIC_VERSION(lvr);
112 if (!APIC_INTEGRATED(version) || version >= 0x14)
113 return 0xff;
114 else
115 return 0xf;
116}
117
118int get_maxlvt(void)
119{
120 unsigned int v, ver, maxlvt;
121
122 v = apic_read(APIC_LVR);
123 ver = GET_APIC_VERSION(v);
124 /* 82489DXs do not report # of LVT entries. */
125 maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2;
126 return maxlvt;
127}
128
129void clear_local_APIC(void)
130{
131 int maxlvt;
132 unsigned long v;
133
134 maxlvt = get_maxlvt();
135
136 /*
137 * Masking an LVT entry on a P6 can trigger a local APIC error
138 * if the vector is zero. Mask LVTERR first to prevent this.
139 */
140 if (maxlvt >= 3) {
141 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
142 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
143 }
144 /*
145 * Careful: we have to set masks only first to deassert
146 * any level-triggered sources.
147 */
148 v = apic_read(APIC_LVTT);
149 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
150 v = apic_read(APIC_LVT0);
151 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
152 v = apic_read(APIC_LVT1);
153 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
154 if (maxlvt >= 4) {
155 v = apic_read(APIC_LVTPC);
156 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
157 }
158
159/* lets not touch this if we didn't frob it */
160#ifdef CONFIG_X86_MCE_P4THERMAL
161 if (maxlvt >= 5) {
162 v = apic_read(APIC_LVTTHMR);
163 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
164 }
165#endif
166 /*
167 * Clean APIC state for other OSs:
168 */
169 apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
170 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
171 apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
172 if (maxlvt >= 3)
173 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
174 if (maxlvt >= 4)
175 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
176
177#ifdef CONFIG_X86_MCE_P4THERMAL
178 if (maxlvt >= 5)
179 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
180#endif
181 v = GET_APIC_VERSION(apic_read(APIC_LVR));
182 if (APIC_INTEGRATED(v)) { /* !82489DX */
183 if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */
184 apic_write(APIC_ESR, 0);
185 apic_read(APIC_ESR);
186 }
187}
188
189void __init connect_bsp_APIC(void)
190{
191 if (pic_mode) {
192 /*
193 * Do not trust the local APIC being empty at bootup.
194 */
195 clear_local_APIC();
196 /*
197 * PIC mode, enable APIC mode in the IMCR, i.e.
198 * connect BSP's local APIC to INT and NMI lines.
199 */
200 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
201 "enabling APIC mode.\n");
202 outb(0x70, 0x22);
203 outb(0x01, 0x23);
204 }
205 enable_apic_mode();
206}
207
208void disconnect_bsp_APIC(void)
209{
210 if (pic_mode) {
211 /*
212 * Put the board back into PIC mode (has an effect
213 * only on certain older boards). Note that APIC
214 * interrupts, including IPIs, won't work beyond
215 * this point! The only exception are INIT IPIs.
216 */
217 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
218 "entering PIC mode.\n");
219 outb(0x70, 0x22);
220 outb(0x00, 0x23);
221 }
222}
223
224void disable_local_APIC(void)
225{
226 unsigned long value;
227
228 clear_local_APIC();
229
230 /*
231 * Disable APIC (implies clearing of registers
232 * for 82489DX!).
233 */
234 value = apic_read(APIC_SPIV);
235 value &= ~APIC_SPIV_APIC_ENABLED;
236 apic_write_around(APIC_SPIV, value);
237
238 if (enabled_via_apicbase) {
239 unsigned int l, h;
240 rdmsr(MSR_IA32_APICBASE, l, h);
241 l &= ~MSR_IA32_APICBASE_ENABLE;
242 wrmsr(MSR_IA32_APICBASE, l, h);
243 }
244}
245
246/*
247 * This is to verify that we're looking at a real local APIC.
248 * Check these against your board if the CPUs aren't getting
249 * started for no apparent reason.
250 */
251int __init verify_local_APIC(void)
252{
253 unsigned int reg0, reg1;
254
255 /*
256 * The version register is read-only in a real APIC.
257 */
258 reg0 = apic_read(APIC_LVR);
259 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
260 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
261 reg1 = apic_read(APIC_LVR);
262 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
263
264 /*
265 * The two version reads above should print the same
266 * numbers. If the second one is different, then we
267 * poke at a non-APIC.
268 */
269 if (reg1 != reg0)
270 return 0;
271
272 /*
 273	 * Check if the version looks reasonable.
274 */
275 reg1 = GET_APIC_VERSION(reg0);
276 if (reg1 == 0x00 || reg1 == 0xff)
277 return 0;
278 reg1 = get_maxlvt();
279 if (reg1 < 0x02 || reg1 == 0xff)
280 return 0;
281
282 /*
283 * The ID register is read/write in a real APIC.
284 */
285 reg0 = apic_read(APIC_ID);
286 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
287
288 /*
289 * The next two are just to see if we have sane values.
290 * They're only really relevant if we're in Virtual Wire
 291	 * compatibility mode, but most boxes aren't anymore.
292 */
293 reg0 = apic_read(APIC_LVT0);
294 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
295 reg1 = apic_read(APIC_LVT1);
296 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
297
298 return 1;
299}
300
301void __init sync_Arb_IDs(void)
302{
303 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
304 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
305 if (ver >= 0x14) /* P4 or higher */
306 return;
307 /*
308 * Wait for idle.
309 */
310 apic_wait_icr_idle();
311
312 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
313 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
314 | APIC_DM_INIT);
315}
316
317extern void __error_in_apic_c (void);
318
319/*
320 * An initial setup of the virtual wire mode.
321 */
322void __init init_bsp_APIC(void)
323{
324 unsigned long value, ver;
325
326 /*
327 * Don't do the setup now if we have a SMP BIOS as the
328 * through-I/O-APIC virtual wire mode might be active.
329 */
330 if (smp_found_config || !cpu_has_apic)
331 return;
332
333 value = apic_read(APIC_LVR);
334 ver = GET_APIC_VERSION(value);
335
336 /*
337 * Do not trust the local APIC being empty at bootup.
338 */
339 clear_local_APIC();
340
341 /*
342 * Enable APIC.
343 */
344 value = apic_read(APIC_SPIV);
345 value &= ~APIC_VECTOR_MASK;
346 value |= APIC_SPIV_APIC_ENABLED;
347
348 /* This bit is reserved on P4/Xeon and should be cleared */
349 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15))
350 value &= ~APIC_SPIV_FOCUS_DISABLED;
351 else
352 value |= APIC_SPIV_FOCUS_DISABLED;
353 value |= SPURIOUS_APIC_VECTOR;
354 apic_write_around(APIC_SPIV, value);
355
356 /*
357 * Set up the virtual wire mode.
358 */
359 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
360 value = APIC_DM_NMI;
361 if (!APIC_INTEGRATED(ver)) /* 82489DX */
362 value |= APIC_LVT_LEVEL_TRIGGER;
363 apic_write_around(APIC_LVT1, value);
364}
365
366void __init setup_local_APIC (void)
367{
368 unsigned long oldvalue, value, ver, maxlvt;
369
370 /* Pound the ESR really hard over the head with a big hammer - mbligh */
371 if (esr_disable) {
372 apic_write(APIC_ESR, 0);
373 apic_write(APIC_ESR, 0);
374 apic_write(APIC_ESR, 0);
375 apic_write(APIC_ESR, 0);
376 }
377
378 value = apic_read(APIC_LVR);
379 ver = GET_APIC_VERSION(value);
380
381 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
382 __error_in_apic_c();
383
384 /*
385 * Double-check whether this APIC is really registered.
386 */
387 if (!apic_id_registered())
388 BUG();
389
390 /*
391 * Intel recommends to set DFR, LDR and TPR before enabling
392 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
393 * document number 292116). So here it goes...
394 */
395 init_apic_ldr();
396
397 /*
398 * Set Task Priority to 'accept all'. We never change this
399 * later on.
400 */
401 value = apic_read(APIC_TASKPRI);
402 value &= ~APIC_TPRI_MASK;
403 apic_write_around(APIC_TASKPRI, value);
404
405 /*
406 * Now that we are all set up, enable the APIC
407 */
408 value = apic_read(APIC_SPIV);
409 value &= ~APIC_VECTOR_MASK;
410 /*
411 * Enable APIC
412 */
413 value |= APIC_SPIV_APIC_ENABLED;
414
415 /*
416 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
417 * certain networking cards. If high frequency interrupts are
418 * happening on a particular IOAPIC pin, plus the IOAPIC routing
419 * entry is masked/unmasked at a high rate as well then sooner or
420 * later IOAPIC line gets 'stuck', no more interrupts are received
421 * from the device. If focus CPU is disabled then the hang goes
422 * away, oh well :-(
423 *
424 * [ This bug can be reproduced easily with a level-triggered
425 * PCI Ne2000 networking cards and PII/PIII processors, dual
426 * BX chipset. ]
427 */
428 /*
429 * Actually disabling the focus CPU check just makes the hang less
 430	 * frequent as it makes the interrupt distribution model more
431 * like LRU than MRU (the short-term load is more even across CPUs).
432 * See also the comment in end_level_ioapic_irq(). --macro
433 */
434#if 1
435 /* Enable focus processor (bit==0) */
436 value &= ~APIC_SPIV_FOCUS_DISABLED;
437#else
438 /* Disable focus processor (bit==1) */
439 value |= APIC_SPIV_FOCUS_DISABLED;
440#endif
441 /*
442 * Set spurious IRQ vector
443 */
444 value |= SPURIOUS_APIC_VECTOR;
445 apic_write_around(APIC_SPIV, value);
446
447 /*
448 * Set up LVT0, LVT1:
449 *
450 * set up through-local-APIC on the BP's LINT0. This is not
 451	 * strictly necessary in pure symmetric-IO mode, but sometimes
452 * we delegate interrupts to the 8259A.
453 */
454 /*
455 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
456 */
457 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
458 if (!smp_processor_id() && (pic_mode || !value)) {
459 value = APIC_DM_EXTINT;
460 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
461 smp_processor_id());
462 } else {
463 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
464 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
465 smp_processor_id());
466 }
467 apic_write_around(APIC_LVT0, value);
468
469 /*
470 * only the BP should see the LINT1 NMI signal, obviously.
471 */
472 if (!smp_processor_id())
473 value = APIC_DM_NMI;
474 else
475 value = APIC_DM_NMI | APIC_LVT_MASKED;
476 if (!APIC_INTEGRATED(ver)) /* 82489DX */
477 value |= APIC_LVT_LEVEL_TRIGGER;
478 apic_write_around(APIC_LVT1, value);
479
480 if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
481 maxlvt = get_maxlvt();
482 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
483 apic_write(APIC_ESR, 0);
484 oldvalue = apic_read(APIC_ESR);
485
486		value = ERROR_APIC_VECTOR;      /* enables sending errors */
487 apic_write_around(APIC_LVTERR, value);
488 /*
489 * spec says clear errors after enabling vector.
490 */
491 if (maxlvt > 3)
492 apic_write(APIC_ESR, 0);
493 value = apic_read(APIC_ESR);
494 if (value != oldvalue)
495 apic_printk(APIC_VERBOSE, "ESR value before enabling "
496 "vector: 0x%08lx after: 0x%08lx\n",
497 oldvalue, value);
498 } else {
499 if (esr_disable)
500 /*
501			 * Something untraceable is creating bad interrupts on
502 * secondary quads ... for the moment, just leave the
503 * ESR disabled - we can't do anything useful with the
504 * errors anyway - mbligh
505 */
506 printk("Leaving ESR disabled.\n");
507 else
508 printk("No ESR for 82489DX.\n");
509 }
510
511 if (nmi_watchdog == NMI_LOCAL_APIC)
512 setup_apic_nmi_watchdog();
513 apic_pm_activate();
514}
515
516/*
517 * If Linux enabled the LAPIC against the BIOS default,
518 * disable it before re-entering the BIOS on shutdown.
519 * Otherwise the BIOS may get confused and not power-off.
520 */
521void lapic_shutdown(void)
522{
523 if (!cpu_has_apic || !enabled_via_apicbase)
524 return;
525
526 local_irq_disable();
527 disable_local_APIC();
528 local_irq_enable();
529}
530
531#ifdef CONFIG_PM
532
533static struct {
534 int active;
535 /* r/w apic fields */
536 unsigned int apic_id;
537 unsigned int apic_taskpri;
538 unsigned int apic_ldr;
539 unsigned int apic_dfr;
540 unsigned int apic_spiv;
541 unsigned int apic_lvtt;
542 unsigned int apic_lvtpc;
543 unsigned int apic_lvt0;
544 unsigned int apic_lvt1;
545 unsigned int apic_lvterr;
546 unsigned int apic_tmict;
547 unsigned int apic_tdcr;
548 unsigned int apic_thmr;
549} apic_pm_state;
550
551static int lapic_suspend(struct sys_device *dev, u32 state)
552{
553 unsigned long flags;
554
555 if (!apic_pm_state.active)
556 return 0;
557
558 apic_pm_state.apic_id = apic_read(APIC_ID);
559 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
560 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
561 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
562 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
563 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
564 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
565 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
566 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
567 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
568 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
569 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
570 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
571
572 local_irq_save(flags);
573 disable_local_APIC();
574 local_irq_restore(flags);
575 return 0;
576}
577
578static int lapic_resume(struct sys_device *dev)
579{
580 unsigned int l, h;
581 unsigned long flags;
582
583 if (!apic_pm_state.active)
584 return 0;
585
586 local_irq_save(flags);
587
588 /*
589 * Make sure the APICBASE points to the right address
590 *
591 * FIXME! This will be wrong if we ever support suspend on
592 * SMP! We'll need to do this as part of the CPU restore!
593 */
594 rdmsr(MSR_IA32_APICBASE, l, h);
595 l &= ~MSR_IA32_APICBASE_BASE;
596 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
597 wrmsr(MSR_IA32_APICBASE, l, h);
598
599 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
600 apic_write(APIC_ID, apic_pm_state.apic_id);
601 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
602 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
603 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
604 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
605 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
606 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
607 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
608 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
609 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
610 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
611 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
612 apic_write(APIC_ESR, 0);
613 apic_read(APIC_ESR);
614 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
615 apic_write(APIC_ESR, 0);
616 apic_read(APIC_ESR);
617 local_irq_restore(flags);
618 return 0;
619}
620
621/*
622 * This device has no shutdown method - fully functioning local APICs
623 * are needed on every CPU up until machine_halt/restart/poweroff.
624 */
625
626static struct sysdev_class lapic_sysclass = {
627 set_kset_name("lapic"),
628 .resume = lapic_resume,
629 .suspend = lapic_suspend,
630};
631
632static struct sys_device device_lapic = {
633 .id = 0,
634 .cls = &lapic_sysclass,
635};
636
637static void __init apic_pm_activate(void)
638{
639 apic_pm_state.active = 1;
640}
641
642static int __init init_lapic_sysfs(void)
643{
644 int error;
645
646 if (!cpu_has_apic)
647 return 0;
648 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
649
650 error = sysdev_class_register(&lapic_sysclass);
651 if (!error)
652 error = sysdev_register(&device_lapic);
653 return error;
654}
655device_initcall(init_lapic_sysfs);
656
657#else /* CONFIG_PM */
658
659static void apic_pm_activate(void) { }
660
661#endif /* CONFIG_PM */
662
663/*
664 * Detect and enable local APICs on non-SMP boards.
665 * Original code written by Keir Fraser.
666 */
667
668/*
669 * Knob to control our willingness to enable the local APIC.
670 */
671int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
672
673static int __init lapic_disable(char *str)
674{
675 enable_local_apic = -1;
676 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
677 return 0;
678}
679__setup("nolapic", lapic_disable);
680
681static int __init lapic_enable(char *str)
682{
683 enable_local_apic = 1;
684 return 0;
685}
686__setup("lapic", lapic_enable);
687
688static int __init apic_set_verbosity(char *str)
689{
690 if (strcmp("debug", str) == 0)
691 apic_verbosity = APIC_DEBUG;
692 else if (strcmp("verbose", str) == 0)
693 apic_verbosity = APIC_VERBOSE;
694 else
695 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
696			", use apic=verbose or apic=debug", str);
697
698 return 0;
699}
700
701__setup("apic=", apic_set_verbosity);
702
703static int __init detect_init_APIC (void)
704{
705 u32 h, l, features;
706 extern void get_cpu_vendor(struct cpuinfo_x86*);
707
708 /* Disabled by kernel option? */
709 if (enable_local_apic < 0)
710 return -1;
711
712 /* Workaround for us being called before identify_cpu(). */
713 get_cpu_vendor(&boot_cpu_data);
714
715 switch (boot_cpu_data.x86_vendor) {
716 case X86_VENDOR_AMD:
717 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
718 (boot_cpu_data.x86 == 15))
719 break;
720 goto no_apic;
721 case X86_VENDOR_INTEL:
722 if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
723 (boot_cpu_data.x86 == 5 && cpu_has_apic))
724 break;
725 goto no_apic;
726 default:
727 goto no_apic;
728 }
729
730 if (!cpu_has_apic) {
731 /*
732 * Over-ride BIOS and try to enable the local
733 * APIC only if "lapic" specified.
734 */
735 if (enable_local_apic <= 0) {
736 printk("Local APIC disabled by BIOS -- "
737 "you can enable it with \"lapic\"\n");
738 return -1;
739 }
740 /*
741 * Some BIOSes disable the local APIC in the
742 * APIC_BASE MSR. This can only be done in
743 * software for Intel P6 or later and AMD K7
744 * (Model > 1) or later.
745 */
746 rdmsr(MSR_IA32_APICBASE, l, h);
747 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
748 printk("Local APIC disabled by BIOS -- reenabling.\n");
749 l &= ~MSR_IA32_APICBASE_BASE;
750 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
751 wrmsr(MSR_IA32_APICBASE, l, h);
752 enabled_via_apicbase = 1;
753 }
754 }
755 /*
756 * The APIC feature bit should now be enabled
757 * in `cpuid'
758 */
759 features = cpuid_edx(1);
760 if (!(features & (1 << X86_FEATURE_APIC))) {
761 printk("Could not enable APIC!\n");
762 return -1;
763 }
764 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
765 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
766
767 /* The BIOS may have set up the APIC at some other address */
768 rdmsr(MSR_IA32_APICBASE, l, h);
769 if (l & MSR_IA32_APICBASE_ENABLE)
770 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
771
772 if (nmi_watchdog != NMI_NONE)
773 nmi_watchdog = NMI_LOCAL_APIC;
774
775 printk("Found and enabled local APIC!\n");
776
777 apic_pm_activate();
778
779 return 0;
780
781no_apic:
782 printk("No local APIC present or hardware disabled\n");
783 return -1;
784}
785
786void __init init_apic_mappings(void)
787{
788 unsigned long apic_phys;
789
790 /*
791 * If no local APIC can be found then set up a fake all
792 * zeroes page to simulate the local APIC and another
793 * one for the IO-APIC.
794 */
795 if (!smp_found_config && detect_init_APIC()) {
796 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
797 apic_phys = __pa(apic_phys);
798 } else
799 apic_phys = mp_lapic_addr;
800
801 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
802 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
803 apic_phys);
804
805 /*
806 * Fetch the APIC ID of the BSP in case we have a
807 * default configuration (or the MP table is broken).
808 */
809 if (boot_cpu_physical_apicid == -1U)
810 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
811
812#ifdef CONFIG_X86_IO_APIC
813 {
814 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
815 int i;
816
817 for (i = 0; i < nr_ioapics; i++) {
818 if (smp_found_config) {
819 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
820 if (!ioapic_phys) {
821 printk(KERN_ERR
822 "WARNING: bogus zero IO-APIC "
823 "address found in MPTABLE, "
824 "disabling IO/APIC support!\n");
825 smp_found_config = 0;
826 skip_ioapic_setup = 1;
827 goto fake_ioapic_page;
828 }
829 } else {
830fake_ioapic_page:
831 ioapic_phys = (unsigned long)
832 alloc_bootmem_pages(PAGE_SIZE);
833 ioapic_phys = __pa(ioapic_phys);
834 }
835 set_fixmap_nocache(idx, ioapic_phys);
836 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
837 __fix_to_virt(idx), ioapic_phys);
838 idx++;
839 }
840 }
841#endif
842}
843
844/*
845 * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts
846 * per second. We assume that the caller has already set up the local
847 * APIC.
848 *
849 * The APIC timer is not exactly in sync with the external timer chip; it
850 * closely follows bus clocks.
851 */
852
853/*
854 * The timer chip is already set up at HZ interrupts per second here,
855 * but we do not accept timer interrupts yet. We only allow the BP
856 * to calibrate.
857 */
858static unsigned int __init get_8254_timer_count(void)
859{
860 extern spinlock_t i8253_lock;
861 unsigned long flags;
862
863 unsigned int count;
864
865 spin_lock_irqsave(&i8253_lock, flags);
866
867 outb_p(0x00, PIT_MODE);
868 count = inb_p(PIT_CH0);
869 count |= inb_p(PIT_CH0) << 8;
870
871 spin_unlock_irqrestore(&i8253_lock, flags);
872
873 return count;
874}
875
876/* next tick in 8254 can be caught by catching timer wraparound */
877static void __init wait_8254_wraparound(void)
878{
879 unsigned int curr_count, prev_count;
880
881 curr_count = get_8254_timer_count();
882 do {
883 prev_count = curr_count;
884 curr_count = get_8254_timer_count();
885
886 /* workaround for broken Mercury/Neptune */
887 if (prev_count >= curr_count + 0x100)
888 curr_count = get_8254_timer_count();
889
890 } while (prev_count >= curr_count);
891}
892
893/*
894 * Default initialization for 8254 timers. If we use other timers like HPET,
895 * we override this later
896 */
897void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound;
898
899/*
900 * This function sets up the local APIC timer, with a timeout of
901 * 'clocks' APIC bus clock. During calibration we actually call
902 * this function twice on the boot CPU, once with a bogus timeout
903 * value, second time for real. The other (noncalibrating) CPUs
904 * call this function only once, with the real, calibrated value.
905 *
906 * We do reads before writes even if unnecessary, to get around the
907 * P5 APIC double write bug.
908 */
909
910#define APIC_DIVISOR 16
911
912static void __setup_APIC_LVTT(unsigned int clocks)
913{
914 unsigned int lvtt_value, tmp_value, ver;
915
916 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
917 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
918 if (!APIC_INTEGRATED(ver))
919 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
920 apic_write_around(APIC_LVTT, lvtt_value);
921
922 /*
923 * Divide PICLK by 16
924 */
925 tmp_value = apic_read(APIC_TDCR);
926 apic_write_around(APIC_TDCR, (tmp_value
927 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
928 | APIC_TDR_DIV_16);
929
930 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
931}
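/*
 * Rough worked example (illustrative assumptions: a 100 MHz bus clock and
 * HZ=100): the real calibrated 'clocks' value is then about 1,000,000 bus
 * clocks per tick, so TMICT is loaded with 1,000,000/APIC_DIVISOR = 62,500.
 * With TDCR set to divide-by-16 the counter decrements at 100 MHz / 16 =
 * 6.25 MHz and thus underflows every 10 ms, i.e. HZ times per second.
 */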
932
933static void __init setup_APIC_timer(unsigned int clocks)
934{
935 unsigned long flags;
936
937 local_irq_save(flags);
938
939 /*
940 * Wait for IRQ0's slice:
941 */
942 wait_timer_tick();
943
944 __setup_APIC_LVTT(clocks);
945
946 local_irq_restore(flags);
947}
948
949/*
950 * In this function we calibrate APIC bus clocks to the external
951 * timer. Unfortunately we cannot use jiffies and the timer irq
952 * to calibrate, since some later bootup code depends on getting
953 * the first irq? Ugh.
954 *
955 * We want to do the calibration only once since we
956 * want to have local timer irqs in sync. CPUs connected
957 * by the same APIC bus have the very same bus frequency.
958 * And we want to have irqs off anyways, no accidental
959 * APIC irq that way.
960 */
961
962static int __init calibrate_APIC_clock(void)
963{
964 unsigned long long t1 = 0, t2 = 0;
965 long tt1, tt2;
966 long result;
967 int i;
968 const int LOOPS = HZ/10;
969
970 apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n");
971
972 /*
973 * Put whatever arbitrary (but long enough) timeout
974 * value into the APIC clock, we just want to get the
975 * counter running for calibration.
976 */
977 __setup_APIC_LVTT(1000000000);
978
979 /*
980 * The timer chip counts down to zero. Let's wait
981 * for a wraparound to start exact measurement:
982 * (the current tick might have been already half done)
983 */
984
985 wait_timer_tick();
986
987 /*
988 * We wrapped around just now. Let's start:
989 */
990 if (cpu_has_tsc)
991 rdtscll(t1);
992 tt1 = apic_read(APIC_TMCCT);
993
994 /*
995	 * Let's wait LOOPS wraparounds:
996 */
997 for (i = 0; i < LOOPS; i++)
998 wait_timer_tick();
999
1000 tt2 = apic_read(APIC_TMCCT);
1001 if (cpu_has_tsc)
1002 rdtscll(t2);
1003
1004 /*
1005 * The APIC bus clock counter is 32 bits only, it
1006	 * might have overflowed, but note that we use signed
1007 * longs, thus no extra care needed.
1008 *
1009	 * underflowed to be exact, as the timer counts down ;)
1010 */
1011
1012 result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
1013
1014 if (cpu_has_tsc)
1015 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
1016 "%ld.%04ld MHz.\n",
1017 ((long)(t2-t1)/LOOPS)/(1000000/HZ),
1018 ((long)(t2-t1)/LOOPS)%(1000000/HZ));
1019
1020 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
1021 "%ld.%04ld MHz.\n",
1022 result/(1000000/HZ),
1023 result%(1000000/HZ));
1024
1025 return result;
1026}
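/*
 * To make the arithmetic above concrete (illustrative numbers only): with
 * HZ=100, LOOPS is 10 ticks. On a 100 MHz bus the counter, decrementing at
 * bus/16 = 6.25 MHz, drops by about 625,000 over those 10 ticks, so
 * result = 625,000 * APIC_DIVISOR / LOOPS = 1,000,000 bus clocks per tick
 * and the printk above reports 1,000,000 / (1000000/HZ) = "100.0000 MHz".
 */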
1027
1028static unsigned int calibration_result;
1029
1030void __init setup_boot_APIC_clock(void)
1031{
1032 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
1033 using_apic_timer = 1;
1034
1035 local_irq_disable();
1036
1037 calibration_result = calibrate_APIC_clock();
1038 /*
1039 * Now set up the timer for real.
1040 */
1041 setup_APIC_timer(calibration_result);
1042
1043 local_irq_enable();
1044}
1045
1046void __init setup_secondary_APIC_clock(void)
1047{
1048 setup_APIC_timer(calibration_result);
1049}
1050
1051void __init disable_APIC_timer(void)
1052{
1053 if (using_apic_timer) {
1054 unsigned long v;
1055
1056 v = apic_read(APIC_LVTT);
1057 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
1058 }
1059}
1060
1061void enable_APIC_timer(void)
1062{
1063 if (using_apic_timer) {
1064 unsigned long v;
1065
1066 v = apic_read(APIC_LVTT);
1067 apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
1068 }
1069}
1070
1071/*
1072 * the frequency of the profiling timer can be changed
1073 * by writing a multiplier value into /proc/profile.
1074 */
1075int setup_profiling_timer(unsigned int multiplier)
1076{
1077 int i;
1078
1079 /*
1080 * Sanity check. [at least 500 APIC cycles should be
1081 * between APIC interrupts as a rule of thumb, to avoid
1082 * irqs flooding us]
1083 */
1084 if ( (!multiplier) || (calibration_result/multiplier < 500))
1085 return -EINVAL;
1086
1087 /*
1088 * Set the new multiplier for each CPU. CPUs don't start using the
1089 * new values until the next timer interrupt in which they do process
1090 * accounting. At that time they also adjust their APIC timers
1091 * accordingly.
1092 */
1093 for (i = 0; i < NR_CPUS; ++i)
1094 per_cpu(prof_multiplier, i) = multiplier;
1095
1096 return 0;
1097}
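/*
 * Usage sketch (illustrative; the parsing itself lives in the generic
 * profiling code, not in this file): writing a multiplier to /proc/profile
 * is expected to end up here, so on an SMP box something like
 *
 *	echo 4 > /proc/profile
 *
 * would request a 4x local timer rate. With a calibration_result of roughly
 * 1,000,000 bus clocks per tick, any multiplier up to 2000 still passes the
 * 500-cycle sanity check above.
 */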
1098
1099#undef APIC_DIVISOR
1100
1101/*
1102 * Local timer interrupt handler. It does both profiling and
1103 * process statistics/rescheduling.
1104 *
1105 * We do profiling in every local tick, statistics/rescheduling
1106 * happen only every 'profiling multiplier' ticks. The default
1107 * multiplier is 1 and it can be changed by writing the new multiplier
1108 * value into /proc/profile.
1109 */
1110
1111inline void smp_local_timer_interrupt(struct pt_regs * regs)
1112{
1113 int cpu = smp_processor_id();
1114
1115 profile_tick(CPU_PROFILING, regs);
1116 if (--per_cpu(prof_counter, cpu) <= 0) {
1117 /*
1118 * The multiplier may have changed since the last time we got
1119 * to this point as a result of the user writing to
1120 * /proc/profile. In this case we need to adjust the APIC
1121 * timer accordingly.
1122 *
1123 * Interrupts are already masked off at this point.
1124 */
1125 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
1126 if (per_cpu(prof_counter, cpu) !=
1127 per_cpu(prof_old_multiplier, cpu)) {
1128 __setup_APIC_LVTT(
1129 calibration_result/
1130 per_cpu(prof_counter, cpu));
1131 per_cpu(prof_old_multiplier, cpu) =
1132 per_cpu(prof_counter, cpu);
1133 }
1134
1135#ifdef CONFIG_SMP
1136 update_process_times(user_mode(regs));
1137#endif
1138 }
1139
1140 /*
1141 * We take the 'long' return path, and there every subsystem
1142	 * grabs the appropriate locks (kernel lock/irq lock).
1143 *
1144 * we might want to decouple profiling from the 'long path',
1145 * and do the profiling totally in assembly.
1146 *
1147 * Currently this isn't too much of an issue (performance wise),
1148 * we can take more than 100K local irqs per second on a 100 MHz P5.
1149 */
1150}
1151
1152/*
1153 * Local APIC timer interrupt. This is the most natural way for doing
1154 * local interrupts, but local timer interrupts can be emulated by
1155 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1156 *
1157 * [ if a single-CPU system runs an SMP kernel then we call the local
1158 * interrupt as well. Thus we cannot inline the local irq ... ]
1159 */
1160
1161fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
1162{
1163 int cpu = smp_processor_id();
1164
1165 /*
1166 * the NMI deadlock-detector uses this.
1167 */
1168 per_cpu(irq_stat, cpu).apic_timer_irqs++;
1169
1170 /*
1171 * NOTE! We'd better ACK the irq immediately,
1172 * because timer handling can be slow.
1173 */
1174 ack_APIC_irq();
1175 /*
1176 * update_process_times() expects us to have done irq_enter().
1177	 * Besides, if we don't, timer interrupts ignore the global
1178 * interrupt lock, which is the WrongThing (tm) to do.
1179 */
1180 irq_enter();
1181 smp_local_timer_interrupt(regs);
1182 irq_exit();
1183}
1184
1185/*
1186 * This interrupt should _never_ happen with our APIC/SMP architecture
1187 */
1188fastcall void smp_spurious_interrupt(struct pt_regs *regs)
1189{
1190 unsigned long v;
1191
1192 irq_enter();
1193 /*
1194 * Check if this really is a spurious interrupt and ACK it
1195 * if it is a vectored one. Just in case...
1196 * Spurious interrupts should not be ACKed.
1197 */
1198 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1199 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1200 ack_APIC_irq();
1201
1202 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1203 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n",
1204 smp_processor_id());
1205 irq_exit();
1206}
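/*
 * Worked example of the ISR lookup above, assuming SPURIOUS_APIC_VECTOR is
 * 0xff (any value with the low nibble fully set satisfies the check in
 * setup_local_APIC()): each group of 32 vectors occupies 16 bytes of ISR
 * register space, and (0xff & ~0x1f) >> 1 = 0x70, so the read hits
 * APIC_ISR + 0x70 and the test checks bit 0xff & 0x1f = 31 of that word.
 */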
1207
1208/*
1209 * This interrupt should never happen with our APIC/SMP architecture
1210 */
1211
1212fastcall void smp_error_interrupt(struct pt_regs *regs)
1213{
1214 unsigned long v, v1;
1215
1216 irq_enter();
1217 /* First tickle the hardware, only then report what went on. -- REW */
1218 v = apic_read(APIC_ESR);
1219 apic_write(APIC_ESR, 0);
1220 v1 = apic_read(APIC_ESR);
1221 ack_APIC_irq();
1222 atomic_inc(&irq_err_count);
1223
1224 /* Here is what the APIC error bits mean:
1225 0: Send CS error
1226 1: Receive CS error
1227 2: Send accept error
1228 3: Receive accept error
1229 4: Reserved
1230 5: Send illegal vector
1231 6: Received illegal vector
1232 7: Illegal register address
1233 */
1234 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1235			smp_processor_id(), v, v1);
1236 irq_exit();
1237}
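/*
 * Example reading of the message above (illustrative): a line such as
 * "APIC error on CPU0: 40(40)" means bit 6 was set both before and after
 * the ESR was rewritten, i.e. a received illegal vector, while "08(08)"
 * would indicate a receive accept error (bit 3).
 */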
1238
1239/*
1240 * This initializes the IO-APIC and APIC hardware if this is
1241 * a UP kernel.
1242 */
1243int __init APIC_init_uniprocessor (void)
1244{
1245 if (enable_local_apic < 0)
1246 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1247
1248 if (!smp_found_config && !cpu_has_apic)
1249 return -1;
1250
1251 /*
1252 * Complain if the BIOS pretends there is one.
1253 */
1254 if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1255 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1256 boot_cpu_physical_apicid);
1257 return -1;
1258 }
1259
1260 verify_local_APIC();
1261
1262 connect_bsp_APIC();
1263
1264 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
1265
1266 setup_local_APIC();
1267
1268 if (nmi_watchdog == NMI_LOCAL_APIC)
1269 check_nmi_watchdog();
1270#ifdef CONFIG_X86_IO_APIC
1271 if (smp_found_config)
1272 if (!skip_ioapic_setup && nr_ioapics)
1273 setup_IO_APIC();
1274#endif
1275 setup_boot_APIC_clock();
1276
1277 return 0;
1278}
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
new file mode 100644
index 000000000000..45641a872550
--- /dev/null
+++ b/arch/i386/kernel/apm.c
@@ -0,0 +1,2428 @@
1/* -*- linux-c -*-
2 * APM BIOS driver for Linux
3 * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
4 *
5 * Initial development of this driver was funded by NEC Australia P/L
6 * and NEC Corporation
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2, or (at your option) any
11 * later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * October 1995, Rik Faith (faith@cs.unc.edu):
19 * Minor enhancements and updates (to the patch set) for 1.3.x
20 * Documentation
21 * January 1996, Rik Faith (faith@cs.unc.edu):
22 * Make /proc/apm easy to format (bump driver version)
23 * March 1996, Rik Faith (faith@cs.unc.edu):
24 * Prohibit APM BIOS calls unless apm_enabled.
25 * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
26 * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
27 * Version 1.0 and 1.1
28 * May 1996, Version 1.2
29 * Feb 1998, Version 1.3
30 * Feb 1998, Version 1.4
31 * Aug 1998, Version 1.5
32 * Sep 1998, Version 1.6
33 * Nov 1998, Version 1.7
34 * Jan 1999, Version 1.8
35 * Jan 1999, Version 1.9
36 * Oct 1999, Version 1.10
37 * Nov 1999, Version 1.11
38 * Jan 2000, Version 1.12
39 * Feb 2000, Version 1.13
40 * Nov 2000, Version 1.14
41 * Oct 2001, Version 1.15
42 * Jan 2002, Version 1.16
43 * Oct 2002, Version 1.16ac
44 *
45 * History:
46 * 0.6b: first version in official kernel, Linux 1.3.46
47 * 0.7: changed /proc/apm format, Linux 1.3.58
48 * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
49 * 0.9: only call bios if bios is present, Linux 1.3.72
50 * 1.0: use fixed device number, consolidate /proc/apm into this file,
51 * Linux 1.3.85
52 * 1.1: support user-space standby and suspend, power off after system
53 * halted, Linux 1.3.98
54 * 1.2: When resetting RTC after resume, take care so that the time
55 * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
56 * <jtoth@princeton.edu>); improve interaction between
57 * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
58 * 1.2a:Simple change to stop mysterious bug reports with SMP also added
59 * levels to the printk calls. APM is not defined for SMP machines.
60 *	    The new replacement for it is, but Linux doesn't yet support this.
61 * Alan Cox Linux 2.1.55
62 * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
63 * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
64 * Dean Gaudet <dgaudet@arctic.org>.
65 * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call).
68 * Stephen Rothwell
69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de>
73 * Stephen Rothwell
74 * 1.7: Modify driver's cached copy of the disabled/disengaged flags
75 * to reflect current state of APM BIOS.
76 * Chris Rankin <rankinc@bellsouth.net>
77 * Reset interrupt 0 timer to 100Hz after suspend
78 * Chad Miller <cmiller@surfsouth.com>
79 * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
80 * Richard Gooch <rgooch@atnf.csiro.au>
81 * Allow boot time disabling of APM
82 * Make boot messages far less verbose by default
83 * Make asm safer
84 * Stephen Rothwell
85 * 1.8: Add CONFIG_APM_RTC_IS_GMT
86 * Richard Gooch <rgooch@atnf.csiro.au>
87 * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
88 * remove dependency on CONFIG_PROC_FS
89 * Stephen Rothwell
90 * 1.9: Fix small typo. <laslo@wodip.opole.pl>
91 * Try to cope with BIOS's that need to have all display
92 * devices blanked and not just the first one.
93 * Ross Paterson <ross@soi.city.ac.uk>
94 *	Fix segment limit setting; it has always been wrong as
95 * the segments needed to have byte granularity.
96 * Mark a few things __init.
97 * Add hack to allow power off of SMP systems by popular request.
98 * Use CONFIG_SMP instead of __SMP__
99 * Ignore BOUNCES for three seconds.
100 * Stephen Rothwell
101 * 1.10: Fix for Thinkpad return code.
102 * Merge 2.2 and 2.3 drivers.
103 * Remove APM dependencies in arch/i386/kernel/process.c
104 * Remove APM dependencies in drivers/char/sysrq.c
105 * Reset time across standby.
106 *	Allow more initialisation on SMP.
107 * Remove CONFIG_APM_POWER_OFF and make it boot time
108 * configurable (default on).
109 * Make debug only a boot time parameter (remove APM_DEBUG).
110 * Try to blank all devices on any error.
111 * 1.11: Remove APM dependencies in drivers/char/console.c
112 * Check nr_running to detect if we are idle (from
113 * Borislav Deianov <borislav@lix.polytechnique.fr>)
114 * Fix for bioses that don't zero the top part of the
115 * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
116 * (reported by Panos Katsaloulis <teras@writeme.com>).
117 * Real mode power off patch (Walter Hofmann
118 * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
119 * 1.12: Remove CONFIG_SMP as the compiler will optimize
120 * the code away anyway (smp_num_cpus == 1 in UP)
121 * noted by Artur Skawina <skawina@geocities.com>.
122 * Make power off under SMP work again.
123 * Fix thinko with initial engaging of BIOS.
124 * Make sure power off only happens on CPU 0
125 * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
126 * Do error notification to user mode if BIOS calls fail.
127 * Move entrypoint offset fix to ...boot/setup.S
128 * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
129 * Remove smp-power-off. SMP users must now specify
130 * "apm=power-off" on the kernel command line. Suggested
131 * by Jim Avera <jima@hal.com>, modified by Alan Cox
132 * <alan@lxorguk.ukuu.org.uk>.
133 * Register the /proc/apm entry even on SMP so that
134 * scripts that check for it before doing power off
135 * work (Jim Avera <jima@hal.com>).
136 * 1.13: Changes for new pm_ interfaces (Andy Henroid
137 * <andy_henroid@yahoo.com>).
138 * Modularize the code.
139 * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr).
147 * Make power off work on SMP again (Tony Hoyle
148 * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
149 * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
150 * interval is now configurable.
151 * 1.14: Make connection version persist across module unload/load.
152 * Enable and engage power management earlier.
153 * Disengage power management on module unload.
154 * Changed to use the sysrq-register hack for registering the
155 * power off function called by magic sysrq based upon discussions
156 * in irc://irc.openprojects.net/#kernelnewbies
157 * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
158 * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
159 * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
160 * Work around byte swap bug in one of the Vaio's BIOS's
161 * (Marc Boucher <marc@mbsi.ca>).
162 * Exposed the disable flag to dmi so that we can handle known
163 * broken APM (Alan Cox <alan@redhat.com>).
164 * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
165 * calling it - instead idle. (Alan Cox <alan@redhat.com>)
166 * If an APM idle fails log it and idle sensibly
167 * 1.15: Don't queue events to clients who open the device O_WRONLY.
168 * Don't expect replies from clients who open the device O_RDONLY.
169 * (Idea from Thomas Hood)
170 * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
171 * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
172 * Notify listeners of standby or suspend events before notifying
173 * drivers. Return EBUSY to ioctl() if suspend is rejected.
174 * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
175 * Ignore first resume after we generate our own resume event
176 * after a suspend (Thomas Hood)
177 * Daemonize now gets rid of our controlling terminal (sfr).
178 * CONFIG_APM_CPU_IDLE now just affects the default value of
179 * idle_threshold (sfr).
180 * Change name of kernel apm daemon (as it no longer idles) (sfr).
181 * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
182 * make _all_ APM calls on the CPU#0. Fix unsafe sign bug.
183 * TODO: determine if its "boot CPU" or "CPU0" we want to lock to.
184 *
185 * APM 1.1 Reference:
186 *
187 * Intel Corporation, Microsoft Corporation. Advanced Power Management
188 * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from
193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
194 * available from Microsoft by calling 206.882.8080.]
195 *
196 * APM 1.2 Reference:
197 * Intel Corporation, Microsoft Corporation. Advanced Power Management
198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
199 *
200 * [This document is available from Microsoft at:
201 * http://www.microsoft.com/hwdev/busbios/amp_12.htm]
202 */
203
204#include <linux/config.h>
205#include <linux/module.h>
206
207#include <linux/poll.h>
208#include <linux/types.h>
209#include <linux/stddef.h>
210#include <linux/timer.h>
211#include <linux/fcntl.h>
212#include <linux/slab.h>
213#include <linux/stat.h>
214#include <linux/proc_fs.h>
215#include <linux/miscdevice.h>
216#include <linux/apm_bios.h>
217#include <linux/init.h>
218#include <linux/time.h>
219#include <linux/sched.h>
220#include <linux/pm.h>
221#include <linux/device.h>
222#include <linux/kernel.h>
223#include <linux/smp.h>
224#include <linux/smp_lock.h>
225#include <linux/dmi.h>
226#include <linux/suspend.h>
227
228#include <asm/system.h>
229#include <asm/uaccess.h>
230#include <asm/desc.h>
231
232#include "io_ports.h"
233
234extern spinlock_t i8253_lock;
235extern unsigned long get_cmos_time(void);
236extern void machine_real_restart(unsigned char *, int);
237
238#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
239extern int (*console_blank_hook)(int);
240#endif
241
242/*
243 * The apm_bios device is one of the misc char devices.
244 * This is its minor number.
245 */
246#define APM_MINOR_DEV 134
247
248/*
249 * See Documentation/Config.help for the configuration options.
250 *
251 * Various options can be changed at boot time as follows:
252 * (We allow underscores for compatibility with the modules code)
253 * apm=on/off enable/disable APM
254 * [no-]allow[-_]ints allow interrupts during BIOS calls
255 * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
256 * [no-]realmode[-_]power[-_]off switch to real mode before
257 * powering off
258 * [no-]debug log some debugging messages
259 * [no-]power[-_]off power off on shutdown
260 * [no-]smp Use apm even on an SMP box
261 * bounce[-_]interval=<n> number of ticks to ignore suspend
262 * bounces
263 * idle[-_]threshold=<n> System idle percentage above which to
264 * make APM BIOS idle calls. Set it to
265 * 100 to disable.
266 * idle[-_]period=<n> Period (in 1/100s of a second) over
267 * which the idle percentage is
268 * calculated.
269 */
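/*
 * Illustrative boot line only (the exact accepted syntax is defined by the
 * driver's boot-option parser, not reproduced here); combining several of
 * the options above might look roughly like:
 *
 *	apm=on,debug,idle-threshold=90,bounce-interval=300
 */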
270
271/* KNOWN PROBLEM MACHINES:
272 *
273 * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
274 * [Confirmed by TI representative]
275 * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
276 * [Confirmed by BIOS disassembly]
277 * [This may work now ...]
278 * P: Toshiba 1950S: battery life information only gets updated after resume
279 * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
280 * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
281 * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
282 * Neale Banks <neale@lowendale.com.au> December 2000
283 *
284 * Legend: U = unusable with APM patches
285 * P = partially usable with APM patches
286 */
287
288/*
289 * Define as 1 to make the driver always call the APM BIOS busy
290 * routine even if the clock was not reported as slowed by the
291 * idle routine. Otherwise, define as 0.
292 */
293#define ALWAYS_CALL_BUSY 1
294
295/*
296 * Define to make the APM BIOS calls zero all data segment registers (so
297 * that an incorrect BIOS implementation will cause a kernel panic if it
298 * tries to write to arbitrary memory).
299 */
300#define APM_ZERO_SEGS
301
302#include "apm.h"
303
304/*
305 * Define to make all _set_limit calls use 64k limits. The APM 1.1 BIOS is
306 * supposed to provide limit information that it recognizes. Many machines
307 * do this correctly, but many others do not restrict themselves to their
308 * claimed limit. When this happens, they will cause a segmentation
309 * violation in the kernel at boot time. Most BIOS's, however, will
310 * respect a 64k limit, so we use that. If you want to be pedantic and
311 * hold your BIOS to its claims, then undefine this.
312 */
313#define APM_RELAX_SEGMENTS
314
315/*
316 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
317 * This was patched by Chad Miller <cmiller@surfsouth.com>, original code by
318 * David Chen <chen@ctpa04.mit.edu>
319 */
320#undef INIT_TIMER_AFTER_SUSPEND
321
322#ifdef INIT_TIMER_AFTER_SUSPEND
323#include <linux/timex.h>
324#include <asm/io.h>
325#include <linux/delay.h>
326#endif
327
328/*
329 * Need to poll the APM BIOS every second
330 */
331#define APM_CHECK_TIMEOUT (HZ)
332
333/*
334 * Ignore suspend events for this amount of time after a resume
335 */
336#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
337
338/*
339 * Maximum number of events stored
340 */
341#define APM_MAX_EVENTS 20
342
343/*
344 * The per-file APM data
345 */
346struct apm_user {
347 int magic;
348 struct apm_user * next;
349 int suser: 1;
350 int writer: 1;
351 int reader: 1;
352 int suspend_wait: 1;
353 int suspend_result;
354 int suspends_pending;
355 int standbys_pending;
356 int suspends_read;
357 int standbys_read;
358 int event_head;
359 int event_tail;
360 apm_event_t events[APM_MAX_EVENTS];
361};
362
363/*
364 * The magic number in apm_user
365 */
366#define APM_BIOS_MAGIC 0x4101
367
368/*
369 * idle percentage above which bios idle calls are done
370 */
371#ifdef CONFIG_APM_CPU_IDLE
372#define DEFAULT_IDLE_THRESHOLD 95
373#else
374#define DEFAULT_IDLE_THRESHOLD 100
375#endif
376#define DEFAULT_IDLE_PERIOD (100 / 3)
377
378/*
379 * Local variables
380 */
381static struct {
382 unsigned long offset;
383 unsigned short segment;
384} apm_bios_entry;
385static int clock_slowed;
386static int idle_threshold = DEFAULT_IDLE_THRESHOLD;
387static int idle_period = DEFAULT_IDLE_PERIOD;
388static int set_pm_idle;
389static int suspends_pending;
390static int standbys_pending;
391static int ignore_sys_suspend;
392static int ignore_normal_resume;
393static int bounce_interval = DEFAULT_BOUNCE_INTERVAL;
394
395#ifdef CONFIG_APM_RTC_IS_GMT
396# define clock_cmos_diff 0
397# define got_clock_diff 1
398#else
399static long clock_cmos_diff;
400static int got_clock_diff;
401#endif
402static int debug;
403static int smp;
404static int apm_disabled = -1;
405#ifdef CONFIG_SMP
406static int power_off;
407#else
408static int power_off = 1;
409#endif
410#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
411static int realmode_power_off = 1;
412#else
413static int realmode_power_off;
414#endif
415static int exit_kapmd;
416static int kapmd_running;
417#ifdef CONFIG_APM_ALLOW_INTS
418static int allow_ints = 1;
419#else
420static int allow_ints;
421#endif
422static int broken_psr;
423
424static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
425static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
426static struct apm_user * user_list;
427static DEFINE_SPINLOCK(user_list_lock);
428static struct desc_struct bad_bios_desc = { 0, 0x00409200 };
429
430static char driver_version[] = "1.16ac"; /* no spaces */
431
432/*
433 * APM event names taken from the APM 1.2 specification. These are
434 * the message codes that the BIOS uses to tell us about events
435 */
436static char * apm_event_name[] = {
437 "system standby",
438 "system suspend",
439 "normal resume",
440 "critical resume",
441 "low battery",
442 "power status change",
443 "update time",
444 "critical suspend",
445 "user standby",
446 "user suspend",
447 "system standby resume",
448 "capabilities change"
449};
450#define NR_APM_EVENT_NAME \
451 (sizeof(apm_event_name) / sizeof(apm_event_name[0]))
452
453typedef struct lookup_t {
454 int key;
455 char * msg;
456} lookup_t;
457
458/*
459 * The BIOS returns a set of standard error codes in AX when the
460 * carry flag is set.
461 */
462
463static const lookup_t error_table[] = {
464/* N/A { APM_SUCCESS, "Operation succeeded" }, */
465 { APM_DISABLED, "Power management disabled" },
466 { APM_CONNECTED, "Real mode interface already connected" },
467 { APM_NOT_CONNECTED, "Interface not connected" },
468 { APM_16_CONNECTED, "16 bit interface already connected" },
469/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
470 { APM_32_CONNECTED, "32 bit interface already connected" },
471 { APM_32_UNSUPPORTED, "32 bit interface not supported" },
472 { APM_BAD_DEVICE, "Unrecognized device ID" },
473 { APM_BAD_PARAM, "Parameter out of range" },
474 { APM_NOT_ENGAGED, "Interface not engaged" },
475 { APM_BAD_FUNCTION, "Function not supported" },
476 { APM_RESUME_DISABLED, "Resume timer disabled" },
477 { APM_BAD_STATE, "Unable to enter requested state" },
478/* N/A { APM_NO_EVENTS, "No events pending" }, */
479 { APM_NO_ERROR, "BIOS did not set a return code" },
480 { APM_NOT_PRESENT, "No APM present" }
481};
482#define ERROR_COUNT (sizeof(error_table)/sizeof(lookup_t))
483
484/**
485 * apm_error - display an APM error
486 * @str: information string
487 * @err: APM BIOS return code
488 *
489 * Write a meaningful log entry to the kernel log in the event of
490 * an APM error.
491 */
492
493static void apm_error(char *str, int err)
494{
495 int i;
496
497 for (i = 0; i < ERROR_COUNT; i++)
498 if (error_table[i].key == err) break;
499 if (i < ERROR_COUNT)
500 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
501 else
502 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
503 str, err);
504}
505
506/*
507 * Lock APM functionality to physical CPU 0
508 */
509
510#ifdef CONFIG_SMP
511
512static cpumask_t apm_save_cpus(void)
513{
514 cpumask_t x = current->cpus_allowed;
515 /* Some bioses don't like being called from CPU != 0 */
516 set_cpus_allowed(current, cpumask_of_cpu(0));
517 BUG_ON(smp_processor_id() != 0);
518 return x;
519}
520
521static inline void apm_restore_cpus(cpumask_t mask)
522{
523 set_cpus_allowed(current, mask);
524}
525
526#else
527
528/*
529 * No CPU lockdown needed on a uniprocessor
530 */
531
532#define apm_save_cpus() (current->cpus_allowed)
533#define apm_restore_cpus(x) (void)(x)
534
535#endif
536
537/*
538 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
539 * apm_info.allow_ints, we are being really paranoid here! Not only
540 * are interrupts disabled, but all the segment registers (except SS)
541 * are saved and zeroed; this means that if the BIOS tries to reference
542 * any data without explicitly loading the segment registers, the kernel
543 * will fault immediately rather than have some unforeseen circumstances
544 * for the rest of the kernel. And it will be very obvious! :-) Doing
545 * this depends on CS referring to the same physical memory as DS so that
546 * DS can be zeroed before the call. Unfortunately, we can't do anything
547 * about the stack segment/pointer. Also, we tell the compiler that
548 * everything could change.
549 *
550 * Also, we KNOW that for the non error case of apm_bios_call, there
551 * is no useful data returned in the low order 8 bits of eax.
552 */
553#define APM_DO_CLI \
554 if (apm_info.allow_ints) \
555 local_irq_enable(); \
556 else \
557 local_irq_disable();
558
559#ifdef APM_ZERO_SEGS
560# define APM_DECL_SEGS \
561 unsigned int saved_fs; unsigned int saved_gs;
562# define APM_DO_SAVE_SEGS \
563 savesegment(fs, saved_fs); savesegment(gs, saved_gs)
564# define APM_DO_RESTORE_SEGS \
565 loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
566#else
567# define APM_DECL_SEGS
568# define APM_DO_SAVE_SEGS
569# define APM_DO_RESTORE_SEGS
570#endif
571
572/**
573 * apm_bios_call - Make an APM BIOS 32bit call
574 * @func: APM function to execute
575 * @ebx_in: EBX register for call entry
576 * @ecx_in: ECX register for call entry
577 * @eax: EAX register return
578 * @ebx: EBX register return
579 * @ecx: ECX register return
580 * @edx: EDX register return
581 * @esi: ESI register return
582 *
583 * Make an APM call using the 32bit protected mode interface. The
584 * caller is responsible for knowing if APM BIOS is configured and
585 * enabled. This call can disable interrupts for a long period of
586 * time on some laptops. The return value is in AH and the carry
587 * flag is loaded into AL. If there is an error, then the error
588 * code is returned in AH (bits 8-15 of eax) and this function
589 * returns non-zero.
590 */
591
592static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
593 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
594{
595 APM_DECL_SEGS
596 unsigned long flags;
597 cpumask_t cpus;
598 int cpu;
599 struct desc_struct save_desc_40;
600
601 cpus = apm_save_cpus();
602
603 cpu = get_cpu();
604 save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8];
605 per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc;
606
607 local_save_flags(flags);
608 APM_DO_CLI;
609 APM_DO_SAVE_SEGS;
610 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
611 APM_DO_RESTORE_SEGS;
612 local_irq_restore(flags);
613 per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = save_desc_40;
614 put_cpu();
615 apm_restore_cpus(cpus);
616
617 return *eax & 0xff;
618}
619
620/**
621 * apm_bios_call_simple - make a simple APM BIOS 32bit call
622 * @func: APM function to invoke
623 * @ebx_in: EBX register value for BIOS call
624 * @ecx_in: ECX register value for BIOS call
625 * @eax: EAX register on return from the BIOS call
626 *
627 *	Make a BIOS call that only returns one value, or just status.
628 * If there is an error, then the error code is returned in AH
629 * (bits 8-15 of eax) and this function returns non-zero. This is
630 * used for simpler BIOS operations. This call may hold interrupts
631 * off for a long time on some laptops.
632 */
633
634static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
635{
636 u8 error;
637 APM_DECL_SEGS
638 unsigned long flags;
639 cpumask_t cpus;
640 int cpu;
641 struct desc_struct save_desc_40;
642
643
644 cpus = apm_save_cpus();
645
646 cpu = get_cpu();
647 save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8];
648 per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc;
649
650 local_save_flags(flags);
651 APM_DO_CLI;
652 APM_DO_SAVE_SEGS;
653 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
654 APM_DO_RESTORE_SEGS;
655 local_irq_restore(flags);
656 __get_cpu_var(cpu_gdt_table)[0x40 / 8] = save_desc_40;
657 put_cpu();
658 apm_restore_cpus(cpus);
659 return error;
660}
661
662/**
663 * apm_driver_version - APM driver version
664 * @val: loaded with the APM version on return
665 *
666 * Retrieve the APM version supported by the BIOS. This is only
667 * supported for APM 1.1 or higher. An error indicates APM 1.0 is
668 * probably present.
669 *
670 * On entry val should point to a value indicating the APM driver
671 * version with the high byte being the major and the low byte the
672 *	minor number, both in BCD.
673 *
674 * On return it will hold the BIOS revision supported in the
675 * same format.
676 */
677
678static int apm_driver_version(u_short *val)
679{
680 u32 eax;
681
682 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
683 return (eax >> 8) & 0xff;
684 *val = eax;
685 return APM_SUCCESS;
686}
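/*
 * BCD example: an APM 1.2 aware driver passes *val == 0x0102; on success
 * *val is overwritten with the revision the BIOS actually supports, which
 * is why the rest of this file compares apm_info.connection_version
 * against constants such as 0x0102.
 */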
687
688/**
689 * apm_get_event - get an APM event from the BIOS
690 * @event: pointer to the event
691 * @info: point to the event information
692 *
693 *	The APM BIOS provides polled information for event
694 * reporting. The BIOS expects to be polled at least every second
695 * when events are pending. When a message is found the caller should
696 * poll until no more messages are present. However, this causes
697 * problems on some laptops where a suspend event notification is
698 * not cleared until it is acknowledged.
699 *
700 *	Additional information is returned in the info pointer, provided
701 *	that APM 1.2 is in use. If no messages are pending the value 0x80
702 * is returned (No power management events pending).
703 */
704
705static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
706{
707 u32 eax;
708 u32 ebx;
709 u32 ecx;
710 u32 dummy;
711
712 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
713 &dummy, &dummy))
714 return (eax >> 8) & 0xff;
715 *event = ebx;
716 if (apm_info.connection_version < 0x0102)
717 *info = ~0; /* indicate info not valid */
718 else
719 *info = ecx;
720 return APM_SUCCESS;
721}
722
723/**
724 * set_power_state - set the power management state
725 * @what: which items to transition
726 * @state: state to transition to
727 *
728 * Request an APM change of state for one or more system devices. The
729 * processor state must be transitioned last of all. what holds the
730 * class of device in the upper byte and the device number (0xFF for
731 * all) for the object to be transitioned.
732 *
733 * The state holds the state to transition to, which may in fact
734 * be an acceptance of a BIOS requested state change.
735 */
736
737static int set_power_state(u_short what, u_short state)
738{
739 u32 eax;
740
741 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
742 return (eax >> 8) & 0xff;
743 return APM_SUCCESS;
744}
745
746/**
747 * set_system_power_state - set system wide power state
748 * @state: which state to enter
749 *
750 * Transition the entire system into a new APM power state.
751 */
752
753static int set_system_power_state(u_short state)
754{
755 return set_power_state(APM_DEVICE_ALL, state);
756}
757
758/**
759 * apm_do_idle - perform power saving
760 *
761 * This function notifies the BIOS that the processor is (in the view
762 * of the OS) idle. It returns -1 in the event that the BIOS refuses
763 * to handle the idle request. On a success the function returns 1
764 * if the BIOS did clock slowing or 0 otherwise.
765 */
766
767static int apm_do_idle(void)
768{
769 u32 eax;
770
771 if (apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax)) {
772 static unsigned long t;
773
774 /* This always fails on some SMP boards running UP kernels.
775 * Only report the failure the first 5 times.
776 */
777 if (++t < 5)
778 {
779 printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
780 (eax >> 8) & 0xff);
781 t = jiffies;
782 }
783 return -1;
784 }
785 clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
786 return clock_slowed;
787}
788
789/**
790 * apm_do_busy - inform the BIOS the CPU is busy
791 *
792 * Request that the BIOS brings the CPU back to full performance.
793 */
794
795static void apm_do_busy(void)
796{
797 u32 dummy;
798
799 if (clock_slowed || ALWAYS_CALL_BUSY) {
800 (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
801 clock_slowed = 0;
802 }
803}
804
805/*
806 * If no process has really been interested in
807 * the CPU for some time, we want to call BIOS
808 * power management - we probably want
809 * to conserve power.
810 */
811#define IDLE_CALC_LIMIT (HZ * 100)
812#define IDLE_LEAKY_MAX 16
813
814static void (*original_pm_idle)(void);
815
816extern void default_idle(void);
817
818/**
819 * apm_cpu_idle - cpu idling for APM capable Linux
820 *
821 * This is the idling function the kernel executes when APM is available. It
822 * tries to do BIOS power management based on the average system idle time.
823 * Furthermore it calls the system default idle routine.
824 */
825
826static void apm_cpu_idle(void)
827{
828 static int use_apm_idle; /* = 0 */
829 static unsigned int last_jiffies; /* = 0 */
830 static unsigned int last_stime; /* = 0 */
831
832 int apm_idle_done = 0;
833 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
834 unsigned int bucket;
835
836recalc:
837 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
838 use_apm_idle = 0;
839 last_jiffies = jiffies;
840 last_stime = current->stime;
841 } else if (jiffies_since_last_check > idle_period) {
842 unsigned int idle_percentage;
843
844 idle_percentage = current->stime - last_stime;
845 idle_percentage *= 100;
846 idle_percentage /= jiffies_since_last_check;
847 use_apm_idle = (idle_percentage > idle_threshold);
848 if (apm_info.forbid_idle)
849 use_apm_idle = 0;
850 last_jiffies = jiffies;
851 last_stime = current->stime;
852 }
853
854 bucket = IDLE_LEAKY_MAX;
855
856 while (!need_resched()) {
857 if (use_apm_idle) {
858 unsigned int t;
859
860 t = jiffies;
861 switch (apm_do_idle()) {
862 case 0: apm_idle_done = 1;
863 if (t != jiffies) {
864 if (bucket) {
865 bucket = IDLE_LEAKY_MAX;
866 continue;
867 }
868 } else if (bucket) {
869 bucket--;
870 continue;
871 }
872 break;
873 case 1: apm_idle_done = 1;
874 break;
875 default: /* BIOS refused */
876 break;
877 }
878 }
879 if (original_pm_idle)
880 original_pm_idle();
881 else
882 default_idle();
883 jiffies_since_last_check = jiffies - last_jiffies;
884 if (jiffies_since_last_check > idle_period)
885 goto recalc;
886 }
887
888 if (apm_idle_done)
889 apm_do_busy();
890}
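/*
 * Worked example with the defaults above: idle_period is 100/3 = 33 ticks
 * and idle_threshold is 95 (with CONFIG_APM_CPU_IDLE), so the BIOS idle
 * call is only used when more than 95% of the ticks in the sampling window
 * -- all but one or two ticks out of roughly 33 -- were accounted to the
 * idle task's stime.
 */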
891
892/**
893 * apm_power_off - ask the BIOS to power off
894 *
895 * Handle the power off sequence. This is the one piece of code we
896 * will execute even on SMP machines. In order to deal with BIOS
897 * bugs we support real mode APM BIOS power off calls. We also make
898 * the SMP call on CPU0 as some systems will only honour this call
899 * on their first cpu.
900 */
901
902static void apm_power_off(void)
903{
904 unsigned char po_bios_call[] = {
905 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
906 0x8e, 0xd0, /* movw ax,ss */
907 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
908 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
909 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
910 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
911 0xcd, 0x15 /* int $0x15 */
912 };
913
914 /*
915 * This may be called on an SMP machine.
916 */
917#ifdef CONFIG_SMP
918 /* Some bioses don't like being called from CPU != 0 */
919 set_cpus_allowed(current, cpumask_of_cpu(0));
920 BUG_ON(smp_processor_id() != 0);
921#endif
922 if (apm_info.realmode_power_off)
923 {
924 (void)apm_save_cpus();
925 machine_real_restart(po_bios_call, sizeof(po_bios_call));
926 }
927 else
928 (void) set_system_power_state(APM_STATE_OFF);
929}
930
931#ifdef CONFIG_APM_DO_ENABLE
932
933/**
934 * apm_enable_power_management - enable BIOS APM power management
935 * @enable: enable yes/no
936 *
937 * Enable or disable the APM BIOS power services.
938 */
939
940static int apm_enable_power_management(int enable)
941{
942 u32 eax;
943
944 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
945 return APM_NOT_ENGAGED;
946 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
947 enable, &eax))
948 return (eax >> 8) & 0xff;
949 if (enable)
950 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
951 else
952 apm_info.bios.flags |= APM_BIOS_DISABLED;
953 return APM_SUCCESS;
954}
955#endif
956
957/**
958 * apm_get_power_status - get current power state
959 * @status: returned status
960 * @bat: battery info
961 * @life: estimated life
962 *
963 * Obtain the current power status from the APM BIOS. We return a
964 * status which gives the rough battery status, and current power
965 *	source. The bat value returned gives an estimate as a percentage
966 *	of life and a status value for the battery. The estimated life,
967 *	if reported, is a lifetime in seconds/minutes at current power
968 * consumption.
969 */
970
971static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
972{
973 u32 eax;
974 u32 ebx;
975 u32 ecx;
976 u32 edx;
977 u32 dummy;
978
979 if (apm_info.get_power_status_broken)
980 return APM_32_UNSUPPORTED;
981 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
982 &eax, &ebx, &ecx, &edx, &dummy))
983 return (eax >> 8) & 0xff;
984 *status = ebx;
985 *bat = ecx;
986 if (apm_info.get_power_status_swabinminutes) {
987 *life = swab16((u16)edx);
988 *life |= 0x8000;
989 } else
990 *life = edx;
991 return APM_SUCCESS;
992}
993
994#if 0
995static int apm_get_battery_status(u_short which, u_short *status,
996 u_short *bat, u_short *life, u_short *nbat)
997{
998 u32 eax;
999 u32 ebx;
1000 u32 ecx;
1001 u32 edx;
1002 u32 esi;
1003
1004 if (apm_info.connection_version < 0x0102) {
1005 /* pretend we only have one battery. */
1006 if (which != 1)
1007 return APM_BAD_DEVICE;
1008 *nbat = 1;
1009 return apm_get_power_status(status, bat, life);
1010 }
1011
1012 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
1013 &ebx, &ecx, &edx, &esi))
1014 return (eax >> 8) & 0xff;
1015 *status = ebx;
1016 *bat = ecx;
1017 *life = edx;
1018 *nbat = esi;
1019 return APM_SUCCESS;
1020}
1021#endif
1022
1023/**
1024 * apm_engage_power_management - enable PM on a device
1025 * @device: identity of device
1026 * @enable: on/off
1027 *
1028 *	Activate or deactivate power management on either a specific device
1029 * or the entire system (%APM_DEVICE_ALL).
1030 */
1031
1032static int apm_engage_power_management(u_short device, int enable)
1033{
1034 u32 eax;
1035
1036 if ((enable == 0) && (device == APM_DEVICE_ALL)
1037 && (apm_info.bios.flags & APM_BIOS_DISABLED))
1038 return APM_DISABLED;
1039 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
1040 return (eax >> 8) & 0xff;
1041 if (device == APM_DEVICE_ALL) {
1042 if (enable)
1043 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
1044 else
1045 apm_info.bios.flags |= APM_BIOS_DISENGAGED;
1046 }
1047 return APM_SUCCESS;
1048}
1049
1050#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1051
1052/**
1053 * apm_console_blank - blank the display
1054 * @blank: on/off
1055 *
1056 * Attempt to blank the console, first by blanking just video device
1057 * zero, and if that fails (some BIOSes don't support it) by blanking
1058 * all video devices. Typically the BIOS will do laptop backlight and
1059 * monitor powerdown for us.
1060 */
1061
1062static int apm_console_blank(int blank)
1063{
1064 int error;
1065 u_short state;
1066
1067 state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
1068 /* Blank the first display device */
1069 error = set_power_state(0x100, state);
1070 if ((error != APM_SUCCESS) && (error != APM_NO_ERROR)) {
1071 /* try to blank them all instead */
1072 error = set_power_state(0x1ff, state);
1073 if ((error != APM_SUCCESS) && (error != APM_NO_ERROR))
1074 /* try to blank device one instead */
1075 error = set_power_state(0x101, state);
1076 }
1077 if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
1078 return 1;
1079 if (error == APM_NOT_ENGAGED) {
1080 static int tried;
1081 int eng_error;
1082 if (tried++ == 0) {
1083 eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1084 if (eng_error) {
1085 apm_error("set display", error);
1086 apm_error("engage interface", eng_error);
1087 return 0;
1088 } else
1089 return apm_console_blank(blank);
1090 }
1091 }
1092 apm_error("set display", error);
1093 return 0;
1094}
1095#endif
1096
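/*
 * Per-opener event ring buffer: event_head is the slot most recently
 * written and event_tail the slot most recently read, both wrapping
 * modulo APM_MAX_EVENTS, so the queue is empty when they are equal.
 * On overflow queue_event() advances the tail as well, dropping the
 * oldest event.
 */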
1097static int queue_empty(struct apm_user *as)
1098{
1099 return as->event_head == as->event_tail;
1100}
1101
1102static apm_event_t get_queued_event(struct apm_user *as)
1103{
1104 as->event_tail = (as->event_tail + 1) % APM_MAX_EVENTS;
1105 return as->events[as->event_tail];
1106}
1107
1108static void queue_event(apm_event_t event, struct apm_user *sender)
1109{
1110 struct apm_user * as;
1111
1112 spin_lock(&user_list_lock);
1113 if (user_list == NULL)
1114 goto out;
1115 for (as = user_list; as != NULL; as = as->next) {
1116 if ((as == sender) || (!as->reader))
1117 continue;
1118 as->event_head = (as->event_head + 1) % APM_MAX_EVENTS;
1119 if (as->event_head == as->event_tail) {
1120 static int notified;
1121
1122 if (notified++ == 0)
1123 printk(KERN_ERR "apm: an event queue overflowed\n");
1124 as->event_tail = (as->event_tail + 1) % APM_MAX_EVENTS;
1125 }
1126 as->events[as->event_head] = event;
1127 if ((!as->suser) || (!as->writer))
1128 continue;
1129 switch (event) {
1130 case APM_SYS_SUSPEND:
1131 case APM_USER_SUSPEND:
1132 as->suspends_pending++;
1133 suspends_pending++;
1134 break;
1135
1136 case APM_SYS_STANDBY:
1137 case APM_USER_STANDBY:
1138 as->standbys_pending++;
1139 standbys_pending++;
1140 break;
1141 }
1142 }
1143 wake_up_interruptible(&apm_waitqueue);
1144out:
1145 spin_unlock(&user_list_lock);
1146}
1147
1148static void set_time(void)
1149{
1150 if (got_clock_diff) { /* Must know time zone in order to set clock */
1151 xtime.tv_sec = get_cmos_time() + clock_cmos_diff;
1152 xtime.tv_nsec = 0;
1153 }
1154}
1155
1156static void get_time_diff(void)
1157{
1158#ifndef CONFIG_APM_RTC_IS_GMT
1159 /*
1160 * Estimate time zone so that set_time can update the clock
1161 */
1162 clock_cmos_diff = -get_cmos_time();
1163 clock_cmos_diff += get_seconds();
1164 got_clock_diff = 1;
1165#endif
1166}
1167
1168static void reinit_timer(void)
1169{
1170#ifdef INIT_TIMER_AFTER_SUSPEND
1171 unsigned long flags;
1172 extern spinlock_t i8253_lock;
1173
1174 spin_lock_irqsave(&i8253_lock, flags);
1175 /* set the clock to 100 Hz */
1176 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1177 udelay(10);
1178 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
1179 udelay(10);
1180 outb(LATCH >> 8, PIT_CH0); /* MSB */
1181 udelay(10);
1182 spin_unlock_irqrestore(&i8253_lock, flags);
1183#endif
1184}
1185
1186static int suspend(int vetoable)
1187{
1188 int err;
1189 struct apm_user *as;
1190
1191 if (pm_send_all(PM_SUSPEND, (void *)3)) {
1192 /* Vetoed */
1193 if (vetoable) {
1194 if (apm_info.connection_version > 0x100)
1195 set_system_power_state(APM_STATE_REJECT);
1196 err = -EBUSY;
1197 ignore_sys_suspend = 0;
1198 printk(KERN_WARNING "apm: suspend was vetoed.\n");
1199 goto out;
1200 }
1201 printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
1202 }
1203
1204 device_suspend(PMSG_SUSPEND);
1205 local_irq_disable();
1206 device_power_down(PMSG_SUSPEND);
1207
1208 /* serialize with the timer interrupt */
1209 write_seqlock(&xtime_lock);
1210
1211 /* protect against access to timer chip registers */
1212 spin_lock(&i8253_lock);
1213
1214 get_time_diff();
1215 /*
1216 * Irq spinlock must be dropped around set_system_power_state.
1217 * We'll undo any timer changes due to interrupts below.
1218 */
1219 spin_unlock(&i8253_lock);
1220 write_sequnlock(&xtime_lock);
1221 local_irq_enable();
1222
1223 save_processor_state();
1224 err = set_system_power_state(APM_STATE_SUSPEND);
1225 restore_processor_state();
1226
1227 local_irq_disable();
1228 write_seqlock(&xtime_lock);
1229 spin_lock(&i8253_lock);
1230 reinit_timer();
1231 set_time();
1232 ignore_normal_resume = 1;
1233
1234 spin_unlock(&i8253_lock);
1235 write_sequnlock(&xtime_lock);
1236
1237 if (err == APM_NO_ERROR)
1238 err = APM_SUCCESS;
1239 if (err != APM_SUCCESS)
1240 apm_error("suspend", err);
1241 err = (err == APM_SUCCESS) ? 0 : -EIO;
1242 device_power_up();
1243 local_irq_enable();
1244 device_resume();
1245 pm_send_all(PM_RESUME, (void *)0);
1246 queue_event(APM_NORMAL_RESUME, NULL);
1247 out:
1248 spin_lock(&user_list_lock);
1249 for (as = user_list; as != NULL; as = as->next) {
1250 as->suspend_wait = 0;
1251 as->suspend_result = err;
1252 }
1253 spin_unlock(&user_list_lock);
1254 wake_up_interruptible(&apm_suspend_waitqueue);
1255 return err;
1256}
1257
1258static void standby(void)
1259{
1260 int err;
1261
1262 local_irq_disable();
1263 device_power_down(PMSG_SUSPEND);
1264 /* serialize with the timer interrupt */
1265 write_seqlock(&xtime_lock);
1266 /* If needed, notify drivers here */
1267 get_time_diff();
1268 write_sequnlock(&xtime_lock);
1269 local_irq_enable();
1270
1271 err = set_system_power_state(APM_STATE_STANDBY);
1272 if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
1273 apm_error("standby", err);
1274
1275 local_irq_disable();
1276 device_power_up();
1277 local_irq_enable();
1278}
1279
1280static apm_event_t get_event(void)
1281{
1282 int error;
1283 apm_event_t event;
1284 apm_eventinfo_t info;
1285
1286 static int notified;
1287
1288 /* we don't use the eventinfo */
1289 error = apm_get_event(&event, &info);
1290 if (error == APM_SUCCESS)
1291 return event;
1292
1293 if ((error != APM_NO_EVENTS) && (notified++ == 0))
1294 apm_error("get_event", error);
1295
1296 return 0;
1297}
1298
1299static void check_events(void)
1300{
1301 apm_event_t event;
1302 static unsigned long last_resume;
1303 static int ignore_bounce;
1304
1305 while ((event = get_event()) != 0) {
1306 if (debug) {
1307 if (event <= NR_APM_EVENT_NAME)
1308 printk(KERN_DEBUG "apm: received %s notify\n",
1309 apm_event_name[event - 1]);
1310 else
1311 printk(KERN_DEBUG "apm: received unknown "
1312 "event 0x%02x\n", event);
1313 }
1314 if (ignore_bounce
1315 && ((jiffies - last_resume) > bounce_interval))
1316 ignore_bounce = 0;
1317
1318 switch (event) {
1319 case APM_SYS_STANDBY:
1320 case APM_USER_STANDBY:
1321 queue_event(event, NULL);
1322 if (standbys_pending <= 0)
1323 standby();
1324 break;
1325
1326 case APM_USER_SUSPEND:
1327#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
1328 if (apm_info.connection_version > 0x100)
1329 set_system_power_state(APM_STATE_REJECT);
1330 break;
1331#endif
1332 case APM_SYS_SUSPEND:
1333 if (ignore_bounce) {
1334 if (apm_info.connection_version > 0x100)
1335 set_system_power_state(APM_STATE_REJECT);
1336 break;
1337 }
1338 /*
1339 * If we are already processing a SUSPEND,
1340 * then further SUSPEND events from the BIOS
1341 * will be ignored. We also return here to
1342 * cope with the fact that the Thinkpads keep
1343 * sending a SUSPEND event until something else
1344 * happens!
1345 */
1346 if (ignore_sys_suspend)
1347 return;
1348 ignore_sys_suspend = 1;
1349 queue_event(event, NULL);
1350 if (suspends_pending <= 0)
1351 (void) suspend(1);
1352 break;
1353
1354 case APM_NORMAL_RESUME:
1355 case APM_CRITICAL_RESUME:
1356 case APM_STANDBY_RESUME:
1357 ignore_sys_suspend = 0;
1358 last_resume = jiffies;
1359 ignore_bounce = 1;
1360 if ((event != APM_NORMAL_RESUME)
1361 || (ignore_normal_resume == 0)) {
1362 write_seqlock_irq(&xtime_lock);
1363 set_time();
1364 write_sequnlock_irq(&xtime_lock);
1365 device_resume();
1366 pm_send_all(PM_RESUME, (void *)0);
1367 queue_event(event, NULL);
1368 }
1369 ignore_normal_resume = 0;
1370 break;
1371
1372 case APM_CAPABILITY_CHANGE:
1373 case APM_LOW_BATTERY:
1374 case APM_POWER_STATUS_CHANGE:
1375 queue_event(event, NULL);
1376 /* If needed, notify drivers here */
1377 break;
1378
1379 case APM_UPDATE_TIME:
1380 write_seqlock_irq(&xtime_lock);
1381 set_time();
1382 write_sequnlock_irq(&xtime_lock);
1383 break;
1384
1385 case APM_CRITICAL_SUSPEND:
1386 /*
1387 * We are not allowed to reject a critical suspend.
1388 */
1389 (void) suspend(0);
1390 break;
1391 }
1392 }
1393}
1394
1395static void apm_event_handler(void)
1396{
1397 static int pending_count = 4;
1398 int err;
1399
1400 if ((standbys_pending > 0) || (suspends_pending > 0)) {
1401 if ((apm_info.connection_version > 0x100) &&
1402 (pending_count-- <= 0)) {
1403 pending_count = 4;
1404 if (debug)
1405 printk(KERN_DEBUG "apm: setting state busy\n");
1406 err = set_system_power_state(APM_STATE_BUSY);
1407 if (err)
1408 apm_error("busy", err);
1409 }
1410 } else
1411 pending_count = 4;
1412 check_events();
1413}
1414
1415/*
1416 * This is the APM thread main loop.
1417 */
1418
1419static void apm_mainloop(void)
1420{
1421 DECLARE_WAITQUEUE(wait, current);
1422
1423 add_wait_queue(&apm_waitqueue, &wait);
1424 set_current_state(TASK_INTERRUPTIBLE);
1425 for (;;) {
1426 schedule_timeout(APM_CHECK_TIMEOUT);
1427 if (exit_kapmd)
1428 break;
1429 /*
1430 * Ok, check all events, check for idle (and mark us sleeping
1431 * so as not to count towards the load average)..
1432 */
1433 set_current_state(TASK_INTERRUPTIBLE);
1434 apm_event_handler();
1435 }
1436 remove_wait_queue(&apm_waitqueue, &wait);
1437}
1438
1439static int check_apm_user(struct apm_user *as, const char *func)
1440{
1441 if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
1442 printk(KERN_ERR "apm: %s passed bad filp\n", func);
1443 return 1;
1444 }
1445 return 0;
1446}
1447
1448static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
1449{
1450 struct apm_user * as;
1451 int i;
1452 apm_event_t event;
1453
1454 as = fp->private_data;
1455 if (check_apm_user(as, "read"))
1456 return -EIO;
1457 if ((int)count < sizeof(apm_event_t))
1458 return -EINVAL;
1459 if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
1460 return -EAGAIN;
1461 wait_event_interruptible(apm_waitqueue, !queue_empty(as));
1462 i = count;
1463 while ((i >= sizeof(event)) && !queue_empty(as)) {
1464 event = get_queued_event(as);
1465 if (copy_to_user(buf, &event, sizeof(event))) {
1466 if (i < count)
1467 break;
1468 return -EFAULT;
1469 }
1470 switch (event) {
1471 case APM_SYS_SUSPEND:
1472 case APM_USER_SUSPEND:
1473 as->suspends_read++;
1474 break;
1475
1476 case APM_SYS_STANDBY:
1477 case APM_USER_STANDBY:
1478 as->standbys_read++;
1479 break;
1480 }
1481 buf += sizeof(event);
1482 i -= sizeof(event);
1483 }
1484 if (i < count)
1485 return count - i;
1486 if (signal_pending(current))
1487 return -ERESTARTSYS;
1488 return 0;
1489}
1490
1491static unsigned int do_poll(struct file *fp, poll_table * wait)
1492{
1493 struct apm_user * as;
1494
1495 as = fp->private_data;
1496 if (check_apm_user(as, "poll"))
1497 return 0;
1498 poll_wait(fp, &apm_waitqueue, wait);
1499 if (!queue_empty(as))
1500 return POLLIN | POLLRDNORM;
1501 return 0;
1502}
1503
1504static int do_ioctl(struct inode * inode, struct file *filp,
1505 u_int cmd, u_long arg)
1506{
1507 struct apm_user * as;
1508
1509 as = filp->private_data;
1510 if (check_apm_user(as, "ioctl"))
1511 return -EIO;
1512 if ((!as->suser) || (!as->writer))
1513 return -EPERM;
1514 switch (cmd) {
1515 case APM_IOC_STANDBY:
1516 if (as->standbys_read > 0) {
1517 as->standbys_read--;
1518 as->standbys_pending--;
1519 standbys_pending--;
1520 } else
1521 queue_event(APM_USER_STANDBY, as);
1522 if (standbys_pending <= 0)
1523 standby();
1524 break;
1525 case APM_IOC_SUSPEND:
1526 if (as->suspends_read > 0) {
1527 as->suspends_read--;
1528 as->suspends_pending--;
1529 suspends_pending--;
1530 } else
1531 queue_event(APM_USER_SUSPEND, as);
1532 if (suspends_pending <= 0) {
1533 return suspend(1);
1534 } else {
1535 as->suspend_wait = 1;
1536 wait_event_interruptible(apm_suspend_waitqueue,
1537 as->suspend_wait == 0);
1538 return as->suspend_result;
1539 }
1540 break;
1541 default:
1542 return -EINVAL;
1543 }
1544 return 0;
1545}
1546
1547static int do_release(struct inode * inode, struct file * filp)
1548{
1549 struct apm_user * as;
1550
1551 as = filp->private_data;
1552 if (check_apm_user(as, "release"))
1553 return 0;
1554 filp->private_data = NULL;
1555 if (as->standbys_pending > 0) {
1556 standbys_pending -= as->standbys_pending;
1557 if (standbys_pending <= 0)
1558 standby();
1559 }
1560 if (as->suspends_pending > 0) {
1561 suspends_pending -= as->suspends_pending;
1562 if (suspends_pending <= 0)
1563 (void) suspend(1);
1564 }
1565 spin_lock(&user_list_lock);
1566 if (user_list == as)
1567 user_list = as->next;
1568 else {
1569 struct apm_user * as1;
1570
1571 for (as1 = user_list;
1572 (as1 != NULL) && (as1->next != as);
1573 as1 = as1->next)
1574 ;
1575 if (as1 == NULL)
1576 printk(KERN_ERR "apm: filp not in user list\n");
1577 else
1578 as1->next = as->next;
1579 }
1580 spin_unlock(&user_list_lock);
1581 kfree(as);
1582 return 0;
1583}
1584
1585static int do_open(struct inode * inode, struct file * filp)
1586{
1587 struct apm_user * as;
1588
1589 as = (struct apm_user *)kmalloc(sizeof(*as), GFP_KERNEL);
1590 if (as == NULL) {
1591 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1592 sizeof(*as));
1593 return -ENOMEM;
1594 }
1595 as->magic = APM_BIOS_MAGIC;
1596 as->event_tail = as->event_head = 0;
1597 as->suspends_pending = as->standbys_pending = 0;
1598 as->suspends_read = as->standbys_read = 0;
1599 /*
1600 * XXX - this is a tiny bit broken, when we consider BSD
1601 * process accounting. If the device is opened by root, we
1602 * instantly flag that we used superuser privs. Who knows,
1603 * we might close the device immediately without doing a
1604 * privileged operation -- cevans
1605 */
1606 as->suser = capable(CAP_SYS_ADMIN);
1607 as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
1608 as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
1609 spin_lock(&user_list_lock);
1610 as->next = user_list;
1611 user_list = as;
1612 spin_unlock(&user_list_lock);
1613 filp->private_data = as;
1614 return 0;
1615}
1616
1617static int apm_get_info(char *buf, char **start, off_t fpos, int length)
1618{
1619 char * p;
1620 unsigned short bx;
1621 unsigned short cx;
1622 unsigned short dx;
1623 int error;
1624 unsigned short ac_line_status = 0xff;
1625 unsigned short battery_status = 0xff;
1626 unsigned short battery_flag = 0xff;
1627 int percentage = -1;
1628 int time_units = -1;
1629 char *units = "?";
1630
1631 p = buf;
1632
1633 if ((num_online_cpus() == 1) &&
1634 !(error = apm_get_power_status(&bx, &cx, &dx))) {
1635 ac_line_status = (bx >> 8) & 0xff;
1636 battery_status = bx & 0xff;
1637 if ((cx & 0xff) != 0xff)
1638 percentage = cx & 0xff;
1639
1640 if (apm_info.connection_version > 0x100) {
1641 battery_flag = (cx >> 8) & 0xff;
1642 if (dx != 0xffff) {
1643 units = (dx & 0x8000) ? "min" : "sec";
1644 time_units = dx & 0x7fff;
1645 }
1646 }
1647 }
1648 /* Arguments, with symbols from linux/apm_bios.h. Information is
1649 from the Get Power Status (0x0a) call unless otherwise noted.
1650
1651 0) Linux driver version (this will change if format changes)
1652 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2.
1653 2) APM flags from APM Installation Check (0x00):
1654 bit 0: APM_16_BIT_SUPPORT
1655 bit 1: APM_32_BIT_SUPPORT
1656 bit 2: APM_IDLE_SLOWS_CLOCK
1657 bit 3: APM_BIOS_DISABLED
1658 bit 4: APM_BIOS_DISENGAGED
1659 3) AC line status
1660 0x00: Off-line
1661 0x01: On-line
1662 0x02: On backup power (BIOS >= 1.1 only)
1663 0xff: Unknown
1664 4) Battery status
1665 0x00: High
1666 0x01: Low
1667 0x02: Critical
1668 0x03: Charging
1669 0x04: Selected battery not present (BIOS >= 1.2 only)
1670 0xff: Unknown
1671 5) Battery flag
1672 bit 0: High
1673 bit 1: Low
1674 bit 2: Critical
1675 bit 3: Charging
1676 bit 7: No system battery
1677 0xff: Unknown
1678 6) Remaining battery life (percentage of charge):
1679 0-100: valid
1680 -1: Unknown
1681 7) Remaining battery life (time units):
1682 Number of remaining minutes or seconds
1683 -1: Unknown
1684 8) min = minutes; sec = seconds */
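	/* An illustrative line, with made-up values, might look like:
	   1.16ac 1.2 0x03 0x01 0x03 0x09 98% -1 ?
	   i.e. driver 1.16ac, BIOS 1.2, 16- and 32-bit support, on-line AC,
	   battery charging, 98% charge, remaining time unknown. */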
1685
1686 p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1687 driver_version,
1688 (apm_info.bios.version >> 8) & 0xff,
1689 apm_info.bios.version & 0xff,
1690 apm_info.bios.flags,
1691 ac_line_status,
1692 battery_status,
1693 battery_flag,
1694 percentage,
1695 time_units,
1696 units);
1697
1698 return p - buf;
1699}
1700
1701static int apm(void *unused)
1702{
1703 unsigned short bx;
1704 unsigned short cx;
1705 unsigned short dx;
1706 int error;
1707 char * power_stat;
1708 char * bat_stat;
1709
1710 kapmd_running = 1;
1711
1712 daemonize("kapmd");
1713
1714 current->flags |= PF_NOFREEZE;
1715
1716#ifdef CONFIG_SMP
1717 /* 2002/08/01 - WT
1718 * This is to avoid random crashes at boot time during initialization
1719 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
1720 * Some bioses don't like being called from CPU != 0.
1721 * Method suggested by Ingo Molnar.
1722 */
1723 set_cpus_allowed(current, cpumask_of_cpu(0));
1724 BUG_ON(smp_processor_id() != 0);
1725#endif
1726
1727 if (apm_info.connection_version == 0) {
1728 apm_info.connection_version = apm_info.bios.version;
1729 if (apm_info.connection_version > 0x100) {
1730 /*
1731 * We only support BIOSs up to version 1.2
1732 */
1733 if (apm_info.connection_version > 0x0102)
1734 apm_info.connection_version = 0x0102;
1735 error = apm_driver_version(&apm_info.connection_version);
1736 if (error != APM_SUCCESS) {
1737 apm_error("driver version", error);
1738 /* Fall back to an APM 1.0 connection. */
1739 apm_info.connection_version = 0x100;
1740 }
1741 }
1742 }
1743
1744 if (debug)
1745 printk(KERN_INFO "apm: Connection version %d.%d\n",
1746 (apm_info.connection_version >> 8) & 0xff,
1747 apm_info.connection_version & 0xff);
1748
1749#ifdef CONFIG_APM_DO_ENABLE
1750 if (apm_info.bios.flags & APM_BIOS_DISABLED) {
1751 /*
1752 * This call causes my NEC UltraLite Versa 33/C to hang if it
1753 * is booted with PM disabled but not in the docking station.
1754 * Unfortunate ...
1755 */
1756 error = apm_enable_power_management(1);
1757 if (error) {
1758 apm_error("enable power management", error);
1759 return -1;
1760 }
1761 }
1762#endif
1763
1764 if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
1765 && (apm_info.connection_version > 0x0100)) {
1766 error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1767 if (error) {
1768 apm_error("engage power management", error);
1769 return -1;
1770 }
1771 }
1772
1773 if (debug && (num_online_cpus() == 1 || smp)) {
1774 error = apm_get_power_status(&bx, &cx, &dx);
1775 if (error)
1776 printk(KERN_INFO "apm: power status not available\n");
1777 else {
1778 switch ((bx >> 8) & 0xff) {
1779 case 0: power_stat = "off line"; break;
1780 case 1: power_stat = "on line"; break;
1781 case 2: power_stat = "on backup power"; break;
1782 default: power_stat = "unknown"; break;
1783 }
1784 switch (bx & 0xff) {
1785 case 0: bat_stat = "high"; break;
1786 case 1: bat_stat = "low"; break;
1787 case 2: bat_stat = "critical"; break;
1788 case 3: bat_stat = "charging"; break;
1789 default: bat_stat = "unknown"; break;
1790 }
1791 printk(KERN_INFO
1792 "apm: AC %s, battery status %s, battery life ",
1793 power_stat, bat_stat);
1794 if ((cx & 0xff) == 0xff)
1795 printk("unknown\n");
1796 else
1797 printk("%d%%\n", cx & 0xff);
1798 if (apm_info.connection_version > 0x100) {
1799 printk(KERN_INFO
1800 "apm: battery flag 0x%02x, battery life ",
1801 (cx >> 8) & 0xff);
1802 if (dx == 0xffff)
1803 printk("unknown\n");
1804 else
1805 printk("%d %s\n", dx & 0x7fff,
1806 (dx & 0x8000) ?
1807 "minutes" : "seconds");
1808 }
1809 }
1810 }
1811
1812 /* Install our power off handler.. */
1813 if (power_off)
1814 pm_power_off = apm_power_off;
1815
1816 if (num_online_cpus() == 1 || smp) {
1817#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1818 console_blank_hook = apm_console_blank;
1819#endif
1820 apm_mainloop();
1821#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1822 console_blank_hook = NULL;
1823#endif
1824 }
1825 kapmd_running = 0;
1826
1827 return 0;
1828}
1829
1830#ifndef MODULE
1831static int __init apm_setup(char *str)
1832{
1833 int invert;
1834
1835 while ((str != NULL) && (*str != '\0')) {
1836 if (strncmp(str, "off", 3) == 0)
1837 apm_disabled = 1;
1838 if (strncmp(str, "on", 2) == 0)
1839 apm_disabled = 0;
1840 if ((strncmp(str, "bounce-interval=", 16) == 0) ||
1841 (strncmp(str, "bounce_interval=", 16) == 0))
1842 bounce_interval = simple_strtol(str + 16, NULL, 0);
1843 if ((strncmp(str, "idle-threshold=", 15) == 0) ||
1844 (strncmp(str, "idle_threshold=", 15) == 0))
1845 idle_threshold = simple_strtol(str + 15, NULL, 0);
1846 if ((strncmp(str, "idle-period=", 12) == 0) ||
1847 (strncmp(str, "idle_period=", 12) == 0))
1848 idle_period = simple_strtol(str + 12, NULL, 0);
1849 invert = (strncmp(str, "no-", 3) == 0) ||
1850 (strncmp(str, "no_", 3) == 0);
1851 if (invert)
1852 str += 3;
1853 if (strncmp(str, "debug", 5) == 0)
1854 debug = !invert;
1855 if ((strncmp(str, "power-off", 9) == 0) ||
1856 (strncmp(str, "power_off", 9) == 0))
1857 power_off = !invert;
1858 if (strncmp(str, "smp", 3) == 0)
1859 {
1860 smp = !invert;
1861 idle_threshold = 100;
1862 }
1863 if ((strncmp(str, "allow-ints", 10) == 0) ||
1864 (strncmp(str, "allow_ints", 10) == 0))
1865 apm_info.allow_ints = !invert;
1866 if ((strncmp(str, "broken-psr", 10) == 0) ||
1867 (strncmp(str, "broken_psr", 10) == 0))
1868 apm_info.get_power_status_broken = !invert;
1869 if ((strncmp(str, "realmode-power-off", 18) == 0) ||
1870 (strncmp(str, "realmode_power_off", 18) == 0))
1871 apm_info.realmode_power_off = !invert;
1872 str = strchr(str, ',');
1873 if (str != NULL)
1874 str += strspn(str, ", \t");
1875 }
1876 return 1;
1877}
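/*
 * Illustrative usage: booting with "apm=debug,idle-threshold=90,power-off"
 * enables debug messages, makes BIOS idle calls only when the system is
 * more than 90% idle, and lets APM power the machine off.  Options are
 * comma separated, and a "no-" (or "no_") prefix inverts the boolean
 * options, e.g. "apm=no-allow-ints".
 */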
1878
1879__setup("apm=", apm_setup);
1880#endif
1881
1882static struct file_operations apm_bios_fops = {
1883 .owner = THIS_MODULE,
1884 .read = do_read,
1885 .poll = do_poll,
1886 .ioctl = do_ioctl,
1887 .open = do_open,
1888 .release = do_release,
1889};
1890
1891static struct miscdevice apm_device = {
1892 APM_MINOR_DEV,
1893 "apm_bios",
1894 &apm_bios_fops
1895};
1896
1897
1898/* Simple "print if true" callback */
1899static int __init print_if_true(struct dmi_system_id *d)
1900{
1901 printk("%s\n", d->ident);
1902 return 0;
1903}
1904
1905/*
1906 * Some BIOSes enable the PS/2 mouse (touchpad) at resume, even if it was
1907 * disabled before the suspend. Linux used to get terribly confused by that.
1908 */
1909static int __init broken_ps2_resume(struct dmi_system_id *d)
1910{
1911 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
1912 return 0;
1913}
1914
1915/* Some BIOSes have a broken protected mode poweroff and need to use real mode */
1916static int __init set_realmode_power_off(struct dmi_system_id *d)
1917{
1918 if (apm_info.realmode_power_off == 0) {
1919 apm_info.realmode_power_off = 1;
1920 printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
1921 }
1922 return 0;
1923}
1924
1925/* Some laptops require interrupts to be enabled during APM calls */
1926static int __init set_apm_ints(struct dmi_system_id *d)
1927{
1928 if (apm_info.allow_ints == 0) {
1929 apm_info.allow_ints = 1;
1930 printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
1931 }
1932 return 0;
1933}
1934
1935/* Some APM BIOSes corrupt memory or just plain do not work */
1936static int __init apm_is_horked(struct dmi_system_id *d)
1937{
1938 if (apm_info.disabled == 0) {
1939 apm_info.disabled = 1;
1940 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1941 }
1942 return 0;
1943}
1944
1945static int __init apm_is_horked_d850md(struct dmi_system_id *d)
1946{
1947 if (apm_info.disabled == 0) {
1948 apm_info.disabled = 1;
1949 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1950 printk(KERN_INFO "This bug is fixed in BIOS P15, which is available for\n");
1951 printk(KERN_INFO "download from support.intel.com\n");
1952 }
1953 return 0;
1954}
1955
1956/* Some APM BIOSes hang on APM idle calls */
1957static int __init apm_likes_to_melt(struct dmi_system_id *d)
1958{
1959 if (apm_info.forbid_idle == 0) {
1960 apm_info.forbid_idle = 1;
1961 printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
1962 }
1963 return 0;
1964}
1965
1966/*
1967 * Check for clue-free BIOS implementations that use
1968 * the following QA technique:
1969 *
1970 * [ Write BIOS Code ]<------
1971 * | ^
1972 * < Does it Compile >----N--
1973 * |Y ^
1974 * < Does it Boot Win98 >-N--
1975 * |Y
1976 * [Ship It]
1977 *
1978 * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e)
1979 * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000)
1980 */
1981static int __init broken_apm_power(struct dmi_system_id *d)
1982{
1983 apm_info.get_power_status_broken = 1;
1984 printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
1985 return 0;
1986}
1987
1988/*
1989 * This BIOS swaps the APM minute reporting bytes over (many Sony laptops
1990 * have this problem).
1991 */
1992static int __init swab_apm_power_in_minutes(struct dmi_system_id *d)
1993{
1994 apm_info.get_power_status_swabinminutes = 1;
1995 printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
1996 return 0;
1997}
1998
1999static struct dmi_system_id __initdata apm_dmi_table[] = {
2000 {
2001 print_if_true,
2002 KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
2003 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
2004 DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
2005 },
2006 { /* Handle problems with APM on the C600 */
2007 broken_ps2_resume, "Dell Latitude C600",
2008 { DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
2009 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
2010 },
2011 { /* Allow interrupts during suspend on Dell Latitude laptops*/
2012 set_apm_ints, "Dell Latitude",
2013 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2014 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
2015 },
2016 { /* APM crashes */
2017 apm_is_horked, "Dell Inspiron 2500",
2018 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2019 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2020 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
2021 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2022 },
2023 { /* Allow interrupts during suspend on Dell Inspiron laptops*/
2024 set_apm_ints, "Dell Inspiron", {
2025 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2026 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
2027 },
2028 { /* Handle problems with APM on Inspiron 5000e */
2029 broken_apm_power, "Dell Inspiron 5000e",
2030 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2031 DMI_MATCH(DMI_BIOS_VERSION, "A04"),
2032 DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
2033 },
2034 { /* Handle problems with APM on Inspiron 2500 */
2035 broken_apm_power, "Dell Inspiron 2500",
2036 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2037 DMI_MATCH(DMI_BIOS_VERSION, "A12"),
2038 DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
2039 },
2040 { /* APM crashes */
2041 apm_is_horked, "Dell Dimension 4100",
2042 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2043 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
2044 DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."),
2045 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2046 },
2047 { /* Allow interrupts during suspend on Compaq Laptops*/
2048 set_apm_ints, "Compaq 12XL125",
2049 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
2050 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
2051 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2052 DMI_MATCH(DMI_BIOS_VERSION,"4.06"), },
2053 },
2054 { /* Allow interrupts during APM or the clock goes slow */
2055 set_apm_ints, "ASUSTeK",
2056 { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
2057 DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
2058 },
2059 { /* APM blows on shutdown */
2060 apm_is_horked, "ABIT KX7-333[R]",
2061 { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
2062 DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
2063 },
2064 { /* APM crashes */
2065 apm_is_horked, "Trigem Delhi3",
2066 { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
2067 DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
2068 },
2069 { /* APM crashes */
2070 apm_is_horked, "Fujitsu-Siemens",
2071 { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
2072 DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
2073 },
2074 { /* APM crashes */
2075 apm_is_horked_d850md, "Intel D850MD",
2076 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2077 DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
2078 },
2079 { /* APM crashes */
2080 apm_is_horked, "Intel D810EMO",
2081 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2082 DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
2083 },
2084 { /* APM crashes */
2085 apm_is_horked, "Dell XPS-Z",
2086 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2087 DMI_MATCH(DMI_BIOS_VERSION, "A11"),
2088 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
2089 },
2090 { /* APM crashes */
2091 apm_is_horked, "Sharp PC-PJ/AX",
2092 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
2093 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
2094 DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"),
2095 DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), },
2096 },
2097 { /* APM crashes */
2098 apm_is_horked, "Dell Inspiron 2500",
2099 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2100 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2101 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
2102 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2103 },
2104 { /* APM idle hangs */
2105 apm_likes_to_melt, "Jabil AMD",
2106 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2107 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
2108 },
2109 { /* APM idle hangs */
2110 apm_likes_to_melt, "AMI Bios",
2111 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2112 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
2113 },
2114 { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
2115 swab_apm_power_in_minutes, "Sony VAIO",
2116 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2117 DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
2118 DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
2119 },
2120 { /* Handle problems with APM on Sony Vaio PCG-N505VX */
2121 swab_apm_power_in_minutes, "Sony VAIO",
2122 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2123 DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
2124 DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
2125 },
2126 { /* Handle problems with APM on Sony Vaio PCG-XG29 */
2127 swab_apm_power_in_minutes, "Sony VAIO",
2128 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2129 DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
2130 DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
2131 },
2132 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2133 swab_apm_power_in_minutes, "Sony VAIO",
2134 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2135 DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
2136 DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
2137 },
2138 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2139 swab_apm_power_in_minutes, "Sony VAIO",
2140 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2141 DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
2142 DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
2143 },
2144 { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
2145 swab_apm_power_in_minutes, "Sony VAIO",
2146 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2147 DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
2148 DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
2149 },
2150 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2151 swab_apm_power_in_minutes, "Sony VAIO",
2152 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2153 DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
2154 DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
2155 },
2156 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2157 swab_apm_power_in_minutes, "Sony VAIO",
2158 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2159 DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
2160 DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
2161 },
2162 { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
2163 swab_apm_power_in_minutes, "Sony VAIO",
2164 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2165 DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
2166 DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
2167 },
2168 { /* Handle problems with APM on Sony Vaio PCG-F104K */
2169 swab_apm_power_in_minutes, "Sony VAIO",
2170 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2171 DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
2172 DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
2173 },
2174
2175 { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
2176 swab_apm_power_in_minutes, "Sony VAIO",
2177 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2178 DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
2179 DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
2180 },
2181 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2182 swab_apm_power_in_minutes, "Sony VAIO",
2183 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2184 DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
2185 DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
2186 },
2187 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2188 swab_apm_power_in_minutes, "Sony VAIO",
2189 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2190 DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
2191 DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
2192 },
2193 { /* broken PM poweroff bios */
2194 set_realmode_power_off, "Award Software v4.60 PGMA",
2195 { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
2196 DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
2197 DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
2198 },
2199
2200 /* Generic per vendor APM settings */
2201
2202 { /* Allow interrupts during suspend on IBM laptops */
2203 set_apm_ints, "IBM",
2204 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
2205 },
2206
2207 { }
2208};
2209
2210/*
2211 * Just start the APM thread. We do NOT want to do APM BIOS
2212 * calls from anything but the APM thread, if for no other reason
2213 * than the fact that we don't trust the APM BIOS. This way,
2214 * most common APM BIOS problems that lead to protection errors
2215 * etc will have at least some level of being contained...
2216 *
2217 * In short, if something bad happens, at least we have a choice
2218 * of just killing the apm thread..
2219 */
2220static int __init apm_init(void)
2221{
2222 struct proc_dir_entry *apm_proc;
2223 int ret;
2224 int i;
2225
2226 dmi_check_system(apm_dmi_table);
2227
2228 if (apm_info.bios.version == 0) {
2229 printk(KERN_INFO "apm: BIOS not found.\n");
2230 return -ENODEV;
2231 }
2232 printk(KERN_INFO
2233 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
2234 ((apm_info.bios.version >> 8) & 0xff),
2235 (apm_info.bios.version & 0xff),
2236 apm_info.bios.flags,
2237 driver_version);
2238 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
2239 printk(KERN_INFO "apm: no 32 bit BIOS support\n");
2240 return -ENODEV;
2241 }
2242
2243 if (allow_ints)
2244 apm_info.allow_ints = 1;
2245 if (broken_psr)
2246 apm_info.get_power_status_broken = 1;
2247 if (realmode_power_off)
2248 apm_info.realmode_power_off = 1;
2249 /* User can override, but default is to trust DMI */
2250 if (apm_disabled != -1)
2251 apm_info.disabled = apm_disabled;
2252
2253 /*
2254 * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
2255 * but is reportedly a 1.0 BIOS.
2256 */
2257 if (apm_info.bios.version == 0x001)
2258 apm_info.bios.version = 0x100;
2259
2260 /* BIOS < 1.2 doesn't set cseg_16_len */
2261 if (apm_info.bios.version < 0x102)
2262 apm_info.bios.cseg_16_len = 0; /* 64k */
2263
2264 if (debug) {
2265 printk(KERN_INFO "apm: entry %x:%lx cseg16 %x dseg %x",
2266 apm_info.bios.cseg, apm_info.bios.offset,
2267 apm_info.bios.cseg_16, apm_info.bios.dseg);
2268 if (apm_info.bios.version > 0x100)
2269 printk(" cseg len %x, dseg len %x",
2270 apm_info.bios.cseg_len,
2271 apm_info.bios.dseg_len);
2272 if (apm_info.bios.version > 0x101)
2273 printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
2274 printk("\n");
2275 }
2276
2277 if (apm_info.disabled) {
2278 printk(KERN_NOTICE "apm: disabled on user request.\n");
2279 return -ENODEV;
2280 }
2281 if ((num_online_cpus() > 1) && !power_off && !smp) {
2282 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
2283 apm_info.disabled = 1;
2284 return -ENODEV;
2285 }
2286 if (PM_IS_ACTIVE()) {
2287 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2288 apm_info.disabled = 1;
2289 return -ENODEV;
2290 }
2291 pm_active = 1;
2292
2293 /*
2294 * Set up a segment that references the real mode segment 0x40
2295 * that extends up to the end of page zero (that we have reserved).
2296 * This is for buggy BIOSes that refer to (real mode) segment 0x40
2297 * even though they are called in protected mode.
2298 */
2299 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
2300 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
2301
2302 apm_bios_entry.offset = apm_info.bios.offset;
2303 apm_bios_entry.segment = APM_CS;
2304
2305 for (i = 0; i < NR_CPUS; i++) {
2306 set_base(per_cpu(cpu_gdt_table, i)[APM_CS >> 3],
2307 __va((unsigned long)apm_info.bios.cseg << 4));
2308 set_base(per_cpu(cpu_gdt_table, i)[APM_CS_16 >> 3],
2309 __va((unsigned long)apm_info.bios.cseg_16 << 4));
2310 set_base(per_cpu(cpu_gdt_table, i)[APM_DS >> 3],
2311 __va((unsigned long)apm_info.bios.dseg << 4));
2312#ifndef APM_RELAX_SEGMENTS
2313 if (apm_info.bios.version == 0x100) {
2314#endif
2315 /* For ASUS motherboard, Award BIOS rev 110 (and others?) */
2316 _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS >> 3], 64 * 1024 - 1);
2317 /* For some unknown machine. */
2318 _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS_16 >> 3], 64 * 1024 - 1);
2319 /* For the DEC Hinote Ultra CT475 (and others?) */
2320 _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_DS >> 3], 64 * 1024 - 1);
2321#ifndef APM_RELAX_SEGMENTS
2322 } else {
2323 _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS >> 3],
2324 (apm_info.bios.cseg_len - 1) & 0xffff);
2325 _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS_16 >> 3],
2326 (apm_info.bios.cseg_16_len - 1) & 0xffff);
2327 _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_DS >> 3],
2328 (apm_info.bios.dseg_len - 1) & 0xffff);
2329 /* workaround for broken BIOSes */
2330 if (apm_info.bios.cseg_len <= apm_info.bios.offset)
2331 _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS >> 3], 64 * 1024 -1);
2332 if (apm_info.bios.dseg_len <= 0x40) { /* 0x40 * 4kB == 64kB */
2333 /* for the BIOS that assumes granularity = 1 */
2334 per_cpu(cpu_gdt_table, i)[APM_DS >> 3].b |= 0x800000;
2335 printk(KERN_NOTICE "apm: we set the granularity of dseg.\n");
2336 }
2337 }
2338#endif
2339 }
2340
2341 apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
2342 if (apm_proc)
2343 apm_proc->owner = THIS_MODULE;
2344
2345 ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD);
2346 if (ret < 0) {
2347 printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n");
2348 return -ENOMEM;
2349 }
2350
2351 if (num_online_cpus() > 1 && !smp) {
2352 printk(KERN_NOTICE
2353 "apm: disabled - APM is not SMP safe (power off active).\n");
2354 return 0;
2355 }
2356
2357 misc_register(&apm_device);
2358
2359 if (HZ != 100)
2360 idle_period = (idle_period * HZ) / 100;
2361 if (idle_threshold < 100) {
2362 original_pm_idle = pm_idle;
2363 pm_idle = apm_cpu_idle;
2364 set_pm_idle = 1;
2365 }
2366
2367 return 0;
2368}
2369
2370static void __exit apm_exit(void)
2371{
2372 int error;
2373
2374 if (set_pm_idle) {
2375 pm_idle = original_pm_idle;
2376 /*
2377 * We are about to unload the current idle thread pm callback
2378 * (pm_idle), Wait for all processors to update cached/local
2379 * copies of pm_idle before proceeding.
2380 */
2381 cpu_idle_wait();
2382 }
2383 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
2384 && (apm_info.connection_version > 0x0100)) {
2385 error = apm_engage_power_management(APM_DEVICE_ALL, 0);
2386 if (error)
2387 apm_error("disengage power management", error);
2388 }
2389 misc_deregister(&apm_device);
2390 remove_proc_entry("apm", NULL);
2391 if (power_off)
2392 pm_power_off = NULL;
2393 exit_kapmd = 1;
2394 while (kapmd_running)
2395 schedule();
2396 pm_active = 0;
2397}
2398
2399module_init(apm_init);
2400module_exit(apm_exit);
2401
2402MODULE_AUTHOR("Stephen Rothwell");
2403MODULE_DESCRIPTION("Advanced Power Management");
2404MODULE_LICENSE("GPL");
2405module_param(debug, bool, 0644);
2406MODULE_PARM_DESC(debug, "Enable debug mode");
2407module_param(power_off, bool, 0444);
2408MODULE_PARM_DESC(power_off, "Enable power off");
2409module_param(bounce_interval, int, 0444);
2410MODULE_PARM_DESC(bounce_interval,
2411 "Set the number of ticks to ignore suspend bounces");
2412module_param(allow_ints, bool, 0444);
2413MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
2414module_param(broken_psr, bool, 0444);
2415MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
2416module_param(realmode_power_off, bool, 0444);
2417MODULE_PARM_DESC(realmode_power_off,
2418 "Switch to real mode before powering off");
2419module_param(idle_threshold, int, 0444);
2420MODULE_PARM_DESC(idle_threshold,
2421 "System idle percentage above which to make APM BIOS idle calls");
2422module_param(idle_period, int, 0444);
2423MODULE_PARM_DESC(idle_period,
2424 "Period (in sec/100) over which to calculate the idle percentage");
2425module_param(smp, bool, 0444);
2426MODULE_PARM_DESC(smp,
2427 "Set this to enable APM use on an SMP platform. Use with caution on older systems");
2428MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
new file mode 100644
index 000000000000..36d66e2077d0
--- /dev/null
+++ b/arch/i386/kernel/asm-offsets.c
@@ -0,0 +1,72 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/sched.h>
8#include <linux/signal.h>
9#include <linux/personality.h>
10#include <linux/suspend.h>
11#include <asm/ucontext.h>
12#include "sigframe.h"
13#include <asm/fixmap.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16
17#define DEFINE(sym, val) \
18 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
19
20#define BLANK() asm volatile("\n->" : : )
21
22#define OFFSET(sym, str, mem) \
23 DEFINE(sym, offsetof(struct str, mem));
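/*
 * Illustrative only: each DEFINE()/OFFSET() emits a marker line such as
 * "->TI_flags $8 offsetof(struct thread_info, flags)" into the compiler's
 * assembly output, which the build post-processes into "#define TI_flags 8"
 * style lines usable from assembly.  The value 8 is made up; the real
 * numbers depend on the structure layout.
 */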
24
25void foo(void)
26{
27 OFFSET(SIGCONTEXT_eax, sigcontext, eax);
28 OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
29 OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
30 OFFSET(SIGCONTEXT_edx, sigcontext, edx);
31 OFFSET(SIGCONTEXT_esi, sigcontext, esi);
32 OFFSET(SIGCONTEXT_edi, sigcontext, edi);
33 OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
34 OFFSET(SIGCONTEXT_esp, sigcontext, esp);
35 OFFSET(SIGCONTEXT_eip, sigcontext, eip);
36 BLANK();
37
38 OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
39 OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
40 OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
41 OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
42 OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
43 OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
44 OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
45 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
46 BLANK();
47
48 OFFSET(TI_task, thread_info, task);
49 OFFSET(TI_exec_domain, thread_info, exec_domain);
50 OFFSET(TI_flags, thread_info, flags);
51 OFFSET(TI_status, thread_info, status);
52 OFFSET(TI_cpu, thread_info, cpu);
53 OFFSET(TI_preempt_count, thread_info, preempt_count);
54 OFFSET(TI_addr_limit, thread_info, addr_limit);
55 OFFSET(TI_restart_block, thread_info, restart_block);
56 BLANK();
57
58 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
59 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
60 BLANK();
61
62 OFFSET(pbe_address, pbe, address);
63 OFFSET(pbe_orig_address, pbe, orig_address);
64 OFFSET(pbe_next, pbe, next);
65
66 /* Offset from the sysenter stack to tss.esp0 */
67 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
68 sizeof(struct tss_struct));
69
70 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
71 DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
72}
diff --git a/arch/i386/kernel/bootflag.c b/arch/i386/kernel/bootflag.c
new file mode 100644
index 000000000000..4c30ed01f4e1
--- /dev/null
+++ b/arch/i386/kernel/bootflag.c
@@ -0,0 +1,99 @@
1/*
2 * Implement 'Simple Boot Flag Specification 2.0'
3 */
4
5
6#include <linux/config.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/string.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/acpi.h>
14#include <asm/io.h>
15
16#include <linux/mc146818rtc.h>
17
18
19#define SBF_RESERVED (0x78)
20#define SBF_PNPOS (1<<0)
21#define SBF_BOOTING (1<<1)
22#define SBF_DIAG (1<<2)
23#define SBF_PARITY (1<<7)
24
25
26int sbf_port __initdata = -1; /* set via acpi_boot_init() */
27
28
29static int __init parity(u8 v)
30{
31 int x = 0;
32 int i;
33
34 for(i=0;i<8;i++)
35 {
36 x^=(v&1);
37 v>>=1;
38 }
39 return x;
40}
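/*
 * For example, parity(0x03) == 0 (two bits set) while parity(0x07) == 1
 * (three bits set); sbf_write() below uses this to force odd parity in
 * the boot flag byte.
 */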
41
42static void __init sbf_write(u8 v)
43{
44 unsigned long flags;
45 if(sbf_port != -1)
46 {
47 v &= ~SBF_PARITY;
48 if(!parity(v))
49 v|=SBF_PARITY;
50
51 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
52
53 spin_lock_irqsave(&rtc_lock, flags);
54 CMOS_WRITE(v, sbf_port);
55 spin_unlock_irqrestore(&rtc_lock, flags);
56 }
57}
58
59static u8 __init sbf_read(void)
60{
61 u8 v;
62 unsigned long flags;
63 if(sbf_port == -1)
64 return 0;
65 spin_lock_irqsave(&rtc_lock, flags);
66 v = CMOS_READ(sbf_port);
67 spin_unlock_irqrestore(&rtc_lock, flags);
68 return v;
69}
70
71static int __init sbf_value_valid(u8 v)
72{
73 if(v&SBF_RESERVED) /* Reserved bits */
74 return 0;
75 if(!parity(v))
76 return 0;
77 return 1;
78}
79
80static int __init sbf_init(void)
81{
82 u8 v;
83 if(sbf_port == -1)
84 return 0;
85 v = sbf_read();
86 if(!sbf_value_valid(v))
87 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
88
89 v &= ~SBF_RESERVED;
90 v &= ~SBF_BOOTING;
91 v &= ~SBF_DIAG;
92#if defined(CONFIG_ISAPNP)
93 v |= SBF_PNPOS;
94#endif
95 sbf_write(v);
96 return 0;
97}
98
99module_init(sbf_init);
diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile
new file mode 100644
index 000000000000..010aecfffbc1
--- /dev/null
+++ b/arch/i386/kernel/cpu/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for x86-compatible CPU details and quirks
3#
4
5obj-y := common.o proc.o
6
7obj-y += amd.o
8obj-y += cyrix.o
9obj-y += centaur.o
10obj-y += transmeta.o
11obj-y += intel.o intel_cacheinfo.o
12obj-y += rise.o
13obj-y += nexgen.o
14obj-y += umc.o
15
16obj-$(CONFIG_X86_MCE) += mcheck/
17
18obj-$(CONFIG_MTRR) += mtrr/
19obj-$(CONFIG_CPU_FREQ) += cpufreq/
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
new file mode 100644
index 000000000000..ae94585d0445
--- /dev/null
+++ b/arch/i386/kernel/cpu/amd.c
@@ -0,0 +1,249 @@
1#include <linux/init.h>
2#include <linux/bitops.h>
3#include <linux/mm.h>
4#include <asm/io.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9/*
10 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
11 * misexecution of code under Linux. Owners of such processors should
12 * contact AMD for precise details and a CPU swap.
13 *
14 * See http://www.multimania.com/poulot/k6bug.html
15 * http://www.amd.com/K6/k6docs/revgd.html
16 *
17 * The following test is erm.. interesting. AMD neglected to up
18 * the chip setting when fixing the bug but they also tweaked some
19 * performance at the same time..
20 */
21
22extern void vide(void);
23__asm__(".align 4\nvide: ret");
24
25static void __init init_amd(struct cpuinfo_x86 *c)
26{
27 u32 l, h;
28 int mbytes = num_physpages >> (20-PAGE_SHIFT);
29 int r;
30
31 /*
32 * FIXME: We should handle the K5 here. Set up the write
33 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
34 * no bus pipeline)
35 */
36
37 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
38 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
39 clear_bit(0*32+31, c->x86_capability);
40
41 r = get_model_name(c);
42
43 switch(c->x86)
44 {
45 case 4:
46 /*
47 * General Systems BIOSen alias the cpu frequency registers
48 * of the Elan at 0x000df000. Unfortunately, one of the Linux
49 * drivers subsequently pokes it, and changes the CPU speed.
50 * Workaround : Remove the unneeded alias.
51 */
52#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
53#define CBAR_ENB (0x80000000)
54#define CBAR_KEY (0X000000CB)
55 if (c->x86_model==9 || c->x86_model == 10) {
56 if (inl (CBAR) & CBAR_ENB)
57 outl (0 | CBAR_KEY, CBAR);
58 }
59 break;
60 case 5:
61 if( c->x86_model < 6 )
62 {
63 /* Based on AMD doc 20734R - June 2000 */
64 if ( c->x86_model == 0 ) {
65 clear_bit(X86_FEATURE_APIC, c->x86_capability);
66 set_bit(X86_FEATURE_PGE, c->x86_capability);
67 }
68 break;
69 }
70
71 if ( c->x86_model == 6 && c->x86_mask == 1 ) {
72 const int K6_BUG_LOOP = 1000000;
73 int n;
74 void (*f_vide)(void);
75 unsigned long d, d2;
76
77 printk(KERN_INFO "AMD K6 stepping B detected - ");
78
79 /*
80 * It looks like AMD fixed the 2.6.2 bug and improved indirect
81 * calls at the same time.
82 */
83
84 n = K6_BUG_LOOP;
85 f_vide = vide;
86 rdtscl(d);
87 while (n--)
88 f_vide();
89 rdtscl(d2);
90 d = d2-d;
91
92 /* Knock these two lines out if it debugs out ok */
93 printk(KERN_INFO "AMD K6 stepping B detected - ");
94 /* -- cut here -- */
95 if (d > 20*K6_BUG_LOOP)
96 printk("system stability may be impaired when more than 32 MB are used.\n");
97 else
98 printk("probably OK (after B9730xxxx).\n");
99 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
100 }
101
102 /* K6 with old style WHCR */
103 if (c->x86_model < 8 ||
104 (c->x86_model== 8 && c->x86_mask < 8)) {
105 /* We can only write allocate on the low 508Mb */
106 if(mbytes>508)
107 mbytes=508;
108
109 rdmsr(MSR_K6_WHCR, l, h);
110 if ((l&0x0000FFFF)==0) {
111 unsigned long flags;
112 l=(1<<0)|((mbytes/4)<<1);
113 local_irq_save(flags);
114 wbinvd();
115 wrmsr(MSR_K6_WHCR, l, h);
116 local_irq_restore(flags);
117 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
118 mbytes);
119 }
120 break;
121 }
122
123 if ((c->x86_model == 8 && c->x86_mask >7) ||
124 c->x86_model == 9 || c->x86_model == 13) {
125 /* The more serious chips .. */
126
127 if(mbytes>4092)
128 mbytes=4092;
129
130 rdmsr(MSR_K6_WHCR, l, h);
131 if ((l&0xFFFF0000)==0) {
132 unsigned long flags;
133 l=((mbytes>>2)<<22)|(1<<16);
134 local_irq_save(flags);
135 wbinvd();
136 wrmsr(MSR_K6_WHCR, l, h);
137 local_irq_restore(flags);
138 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
139 mbytes);
140 }
141
142 /* Set MTRR capability flag if appropriate */
143 if (c->x86_model == 13 || c->x86_model == 9 ||
144 (c->x86_model == 8 && c->x86_mask >= 8))
145 set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
146 break;
147 }
148 break;
149
150 case 6: /* An Athlon/Duron */
151
152 /* Bit 15 of the Athlon-specific MSR 15 needs to be 0
153 * to enable SSE on Palomino/Morgan/Barton CPUs.
154 * If the BIOS didn't enable it already, enable it here.
155 */
156 if (c->x86_model >= 6 && c->x86_model <= 10) {
157 if (!cpu_has(c, X86_FEATURE_XMM)) {
158 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
159 rdmsr(MSR_K7_HWCR, l, h);
160 l &= ~0x00008000;
161 wrmsr(MSR_K7_HWCR, l, h);
162 set_bit(X86_FEATURE_XMM, c->x86_capability);
163 }
164 }
165
166 /* It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx,
168 * as per AMD technical note 27212 0.2.
169 */
170 if ((c->x86_model == 8 && c->x86_mask>=1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178 break;
179 }
180
181 switch (c->x86) {
182 case 15:
183 set_bit(X86_FEATURE_K8, c->x86_capability);
184 break;
185 case 6:
186 set_bit(X86_FEATURE_K7, c->x86_capability);
187 break;
188 }
189
190 display_cacheinfo(c);
191 detect_ht(c);
192
193#ifdef CONFIG_X86_HT
194 /* AMD dual core looks like HT but isn't really. Hide it from the
195 scheduler. This works around problems with the domain scheduler.
196 Also probably gives slightly better scheduling and disables
197 SMT nice which is harmful on dual core.
198 TBD tune the domain scheduler for dual core. */
199 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
200 smp_num_siblings = 1;
201#endif
202
203 if (cpuid_eax(0x80000000) >= 0x80000008) {
204 c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
205 if (c->x86_num_cores & (c->x86_num_cores - 1))
206 c->x86_num_cores = 1;
207 }
208}
209
210static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
211{
212 /* AMD errata T13 (order #21922) */
213 if ((c->x86 == 6)) {
214 if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */
215 size = 64;
216 if (c->x86_model == 4 &&
217 (c->x86_mask==0 || c->x86_mask==1)) /* Tbird rev A1/A2 */
218 size = 256;
219 }
220 return size;
221}
222
223static struct cpu_dev amd_cpu_dev __initdata = {
224 .c_vendor = "AMD",
225 .c_ident = { "AuthenticAMD" },
226 .c_models = {
227 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
228 {
229 [3] = "486 DX/2",
230 [7] = "486 DX/2-WB",
231 [8] = "486 DX/4",
232 [9] = "486 DX/4-WB",
233 [14] = "Am5x86-WT",
234 [15] = "Am5x86-WB"
235 }
236 },
237 },
238 .c_init = init_amd,
239 .c_identify = generic_identify,
240 .c_size_cache = amd_size_cache,
241};
242
243int __init amd_init_cpu(void)
244{
245 cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
246 return 0;
247}
248
249//early_arch_initcall(amd_init_cpu);
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
new file mode 100644
index 000000000000..394814e57672
--- /dev/null
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -0,0 +1,476 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/bitops.h>
4#include <asm/processor.h>
5#include <asm/msr.h>
6#include <asm/e820.h>
7#include "cpu.h"
8
9#ifdef CONFIG_X86_OOSTORE
10
11static u32 __init power2(u32 x)
12{
13 u32 s=1;
14 while(s<=x)
15 s<<=1;
16 return s>>=1;
17}
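/*
 * For example, power2(0x5000) == 0x4000: the largest power of two that
 * does not exceed the argument (0 is returned for x == 0).
 */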
18
19
20/*
21 * Set up an actual MCR
22 */
23
24static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key)
25{
26 u32 lo, hi;
27
28 hi = base & ~0xFFF;
29 lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
30 lo &= ~0xFFF; /* Remove the ctrl value bits */
31 lo |= key; /* Attribute we wish to set */
32 wrmsr(reg+MSR_IDT_MCR0, lo, hi);
33 mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
34}
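/*
 * Illustrative only: centaur_mcr_insert(0, 0, 32*1024*1024, 31) would
 * program MCR0 to cover the first 32MB with key 31 (full write combining,
 * weak write ordering): hi = 0 and lo = ~(32MB - 1) with the low 12
 * control bits cleared and the key ORed in.
 */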
35
36/*
37 * Figure what we can cover with MCR's
38 *
39 * Shortcut: We know you can't put 4Gig of RAM on a winchip
40 */
41
42static u32 __init ramtop(void) /* 16388 */
43{
44 int i;
45 u32 top = 0;
46 u32 clip = 0xFFFFFFFFUL;
47
48 for (i = 0; i < e820.nr_map; i++) {
49 unsigned long start, end;
50
51 if (e820.map[i].addr > 0xFFFFFFFFUL)
52 continue;
53 /*
54 * Don't MCR over reserved space. Ignore the ISA hole -
55 * we frob around that catastrophe already.
56 */
57
58 if (e820.map[i].type == E820_RESERVED)
59 {
60 if(e820.map[i].addr >= 0x100000UL && e820.map[i].addr < clip)
61 clip = e820.map[i].addr;
62 continue;
63 }
64 start = e820.map[i].addr;
65 end = e820.map[i].addr + e820.map[i].size;
66 if (start >= end)
67 continue;
68 if (end > top)
69 top = end;
70 }
71 /* Everything below 'top' should be RAM except for the ISA hole.
72 Because of the limited MCRs we want to map NV/ACPI into our
73 MCR range for gunk in RAM.
74
75 Clip might cause us to MCR insufficient RAM, but that is an
76 acceptable failure mode and should only bite obscure boxes with
77 a VESA hole at 15Mb.
78
79 The second case where Clip sometimes kicks in is when the EBDA is
80 marked as reserved. Again we fail safe with reasonable results.
81 */
82
83 if(top>clip)
84 top=clip;
85
86 return top;
87}
88
89/*
90 * Compute a set of MCR's to give maximum coverage
91 */
92
93static int __init centaur_mcr_compute(int nr, int key)
94{
95 u32 mem = ramtop();
96 u32 root = power2(mem);
97 u32 base = root;
98 u32 top = root;
99 u32 floor = 0;
100 int ct = 0;
101
102 while(ct<nr)
103 {
104 u32 fspace = 0;
105
106 /*
107 * Find the largest block we will fill going upwards
108 */
109
110 u32 high = power2(mem-top);
111
112 /*
113 * Find the largest block we will fill going downwards
114 */
115
116 u32 low = base/2;
117
118 /*
119 * Don't fill below 1Mb going downwards as there
120 * is an ISA hole in the way.
121 */
122
123 if(base <= 1024*1024)
124 low = 0;
125
126 /*
127 * See how much space we could cover by filling below
128 * the ISA hole
129 */
130
131 if(floor == 0)
132 fspace = 512*1024;
133 else if(floor ==512*1024)
134 fspace = 128*1024;
135
136 /* And forget ROM space */
137
138 /*
139 * Now install the largest coverage we get
140 */
141
142 if(fspace > high && fspace > low)
143 {
144 centaur_mcr_insert(ct, floor, fspace, key);
145 floor += fspace;
146 }
147 else if(high > low)
148 {
149 centaur_mcr_insert(ct, top, high, key);
150 top += high;
151 }
152 else if(low > 0)
153 {
154 base -= low;
155 centaur_mcr_insert(ct, base, low, key);
156 }
157 else break;
158 ct++;
159 }
160 /*
161 * We loaded ct values. We now need to set the mask. The caller
162 * must do this bit.
163 */
164
165 return ct;
166}
167
168static void __init centaur_create_optimal_mcr(void)
169{
170 int i;
171 /*
172 * Allocate up to 6 mcrs to mark as much of ram as possible
173 * as write combining and weak write ordered.
174 *
175 * To experiment with: Linux never uses stack operations for
176 * mmio spaces so we could globally enable stack operation wc
177 *
178 * Load the registers with type 31 - full write combining, all
179 * writes weakly ordered.
180 */
181 int used = centaur_mcr_compute(6, 31);
182
183 /*
184 * Wipe unused MCRs
185 */
186
187 for(i=used;i<8;i++)
188 wrmsr(MSR_IDT_MCR0+i, 0, 0);
189}
190
191static void __init winchip2_create_optimal_mcr(void)
192{
193 u32 lo, hi;
194 int i;
195
196 /*
197 * Allocate up to 6 mcrs to mark as much of ram as possible
198 * as write combining, weak store ordered.
199 *
200 * Load the registers with type 25
201 * 8 - weak write ordering
202 * 16 - weak read ordering
203 * 1 - write combining
204 */
205
206 int used = centaur_mcr_compute(6, 25);
207
208 /*
209 * Mark the registers we are using.
210 */
211
212 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
213 for(i=0;i<used;i++)
214 lo|=1<<(9+i);
215 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
216
217 /*
218 * Wipe unused MCRs
219 */
220
221 for(i=used;i<8;i++)
222 wrmsr(MSR_IDT_MCR0+i, 0, 0);
223}
224
225/*
226 * Handle the MCR key on the Winchip 2.
227 */
228
229static void __init winchip2_unprotect_mcr(void)
230{
231 u32 lo, hi;
232 u32 key;
233
234 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
235 lo&=~0x1C0; /* blank bits 8-6 */
236 key = (lo>>17) & 7;
237 lo |= key<<6; /* replace with unlock key */
238 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
239}
240
241static void __init winchip2_protect_mcr(void)
242{
243 u32 lo, hi;
244
245 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
246 lo&=~0x1C0; /* blank bits 8-6 */
247 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
248}
249#endif /* CONFIG_X86_OOSTORE */
250
251#define ACE_PRESENT (1 << 6)
252#define ACE_ENABLED (1 << 7)
253#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
254
255#define RNG_PRESENT (1 << 2)
256#define RNG_ENABLED (1 << 3)
257#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
258
259static void __init init_c3(struct cpuinfo_x86 *c)
260{
261 u32 lo, hi;
262
263 /* Test for Centaur Extended Feature Flags presence */
264 if (cpuid_eax(0xC0000000) >= 0xC0000001) {
265 u32 tmp = cpuid_edx(0xC0000001);
266
267 /* enable ACE unit, if present and disabled */
268 if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
269 rdmsr (MSR_VIA_FCR, lo, hi);
270 lo |= ACE_FCR; /* enable ACE unit */
271 wrmsr (MSR_VIA_FCR, lo, hi);
272 printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
273 }
274
275 /* enable RNG unit, if present and disabled */
276 if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
277 rdmsr (MSR_VIA_RNG, lo, hi);
278 lo |= RNG_ENABLE; /* enable RNG unit */
279 wrmsr (MSR_VIA_RNG, lo, hi);
280 printk(KERN_INFO "CPU: Enabled h/w RNG\n");
281 }
282
283 /* store Centaur Extended Feature Flags as
284 * word 5 of the CPU capability bit array
285 */
286 c->x86_capability[5] = cpuid_edx(0xC0000001);
287 }
288
289	/* Cyrix III family needs CX8 & PGE explicitly enabled. */
290 if (c->x86_model >=6 && c->x86_model <= 9) {
291 rdmsr (MSR_VIA_FCR, lo, hi);
292 lo |= (1<<1 | 1<<7);
293 wrmsr (MSR_VIA_FCR, lo, hi);
294 set_bit(X86_FEATURE_CX8, c->x86_capability);
295 }
296
297	/* Before Nehemiah, the C3s had 3DNow! */
298 if (c->x86_model >=6 && c->x86_model <9)
299 set_bit(X86_FEATURE_3DNOW, c->x86_capability);
300
301 get_model_name(c);
302 display_cacheinfo(c);
303}
304
305static void __init init_centaur(struct cpuinfo_x86 *c)
306{
307 enum {
308 ECX8=1<<1,
309 EIERRINT=1<<2,
310 DPM=1<<3,
311 DMCE=1<<4,
312 DSTPCLK=1<<5,
313 ELINEAR=1<<6,
314 DSMC=1<<7,
315 DTLOCK=1<<8,
316 EDCTLB=1<<8,
317 EMMX=1<<9,
318 DPDC=1<<11,
319 EBRPRED=1<<12,
320 DIC=1<<13,
321 DDC=1<<14,
322 DNA=1<<15,
323 ERETSTK=1<<16,
324 E2MMX=1<<19,
325 EAMD3D=1<<20,
326 };
327
328 char *name;
329 u32 fcr_set=0;
330 u32 fcr_clr=0;
331 u32 lo,hi,newlo;
332 u32 aa,bb,cc,dd;
333
334	/* Bit 31 in normal CPUID is used for a nonstandard 3DNow! ID;
335	   3DNow! is identified by bit 31 in extended CPUID (1*32+31) anyway */
336 clear_bit(0*32+31, c->x86_capability);
337
338 switch (c->x86) {
339
340 case 5:
341 switch(c->x86_model) {
342 case 4:
343 name="C6";
344 fcr_set=ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
345 fcr_clr=DPDC;
346 printk(KERN_NOTICE "Disabling bugged TSC.\n");
347 clear_bit(X86_FEATURE_TSC, c->x86_capability);
348#ifdef CONFIG_X86_OOSTORE
349 centaur_create_optimal_mcr();
350 /* Enable
351 write combining on non-stack, non-string
352 write combining on string, all types
353 weak write ordering
354
355 The C6 original lacks weak read order
356
357 Note 0x120 is write only on Winchip 1 */
358
359 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
360#endif
361 break;
362 case 8:
363 switch(c->x86_mask) {
364 default:
365 name="2";
366 break;
367 case 7 ... 9:
368 name="2A";
369 break;
370 case 10 ... 15:
371 name="2B";
372 break;
373 }
374 fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
375 fcr_clr=DPDC;
376#ifdef CONFIG_X86_OOSTORE
377 winchip2_unprotect_mcr();
378 winchip2_create_optimal_mcr();
379 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
380 /* Enable
381 write combining on non-stack, non-string
382 write combining on string, all types
383 weak write ordering
384 */
385 lo|=31;
386 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
387 winchip2_protect_mcr();
388#endif
389 break;
390 case 9:
391 name="3";
392 fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
393 fcr_clr=DPDC;
394#ifdef CONFIG_X86_OOSTORE
395 winchip2_unprotect_mcr();
396 winchip2_create_optimal_mcr();
397 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
398 /* Enable
399 write combining on non-stack, non-string
400 write combining on string, all types
401 weak write ordering
402 */
403 lo|=31;
404 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
405 winchip2_protect_mcr();
406#endif
407 break;
408 case 10:
409 name="4";
410 /* no info on the WC4 yet */
411 break;
412 default:
413 name="??";
414 }
415
416 rdmsr(MSR_IDT_FCR1, lo, hi);
417 newlo=(lo|fcr_set) & (~fcr_clr);
418
419 if (newlo!=lo) {
420 printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", lo, newlo );
421 wrmsr(MSR_IDT_FCR1, newlo, hi );
422 } else {
423 printk(KERN_INFO "Centaur FCR is 0x%X\n",lo);
424 }
425 /* Emulate MTRRs using Centaur's MCR. */
426 set_bit(X86_FEATURE_CENTAUR_MCR, c->x86_capability);
427 /* Report CX8 */
428 set_bit(X86_FEATURE_CX8, c->x86_capability);
429 /* Set 3DNow! on Winchip 2 and above. */
430 if (c->x86_model >=8)
431 set_bit(X86_FEATURE_3DNOW, c->x86_capability);
432 /* See if we can find out some more. */
433 if ( cpuid_eax(0x80000000) >= 0x80000005 ) {
434 /* Yes, we can. */
435 cpuid(0x80000005,&aa,&bb,&cc,&dd);
436 /* Add L1 data and code cache sizes. */
437 c->x86_cache_size = (cc>>24)+(dd>>24);
438 }
439 sprintf( c->x86_model_id, "WinChip %s", name );
440 break;
441
442 case 6:
443 init_c3(c);
444 break;
445 }
446}
447
448static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
449{
450 /* VIA C3 CPUs (670-68F) need further shifting. */
451 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
452 size >>= 8;
453
454 /* VIA also screwed up Nehemiah stepping 1, and made
455 it return '65KB' instead of '64KB'
456 - Note, it seems this may only be in engineering samples. */
457 if ((c->x86==6) && (c->x86_model==9) && (c->x86_mask==1) && (size==65))
458 size -=1;
459
460 return size;
461}
462
463static struct cpu_dev centaur_cpu_dev __initdata = {
464 .c_vendor = "Centaur",
465 .c_ident = { "CentaurHauls" },
466 .c_init = init_centaur,
467 .c_size_cache = centaur_size_cache,
468};
469
470int __init centaur_init_cpu(void)
471{
472 cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
473 return 0;
474}
475
476//early_arch_initcall(centaur_init_cpu);
diff --git a/arch/i386/kernel/cpu/changelog b/arch/i386/kernel/cpu/changelog
new file mode 100644
index 000000000000..cef76b80a710
--- /dev/null
+++ b/arch/i386/kernel/cpu/changelog
@@ -0,0 +1,63 @@
1/*
2 * Enhanced CPU type detection by Mike Jagdis, Patrick St. Jean
3 * and Martin Mares, November 1997.
4 *
5 * Force Cyrix 6x86(MX) and M II processors to report MTRR capability
6 * and Cyrix "coma bug" recognition by
7 * Zoltán Böszörményi <zboszor@mail.externet.hu> February 1999.
8 *
9 * Force Centaur C6 processors to report MTRR capability.
10 * Bart Hartgers <bart@etpmod.phys.tue.nl>, May 1999.
11 *
12 * Intel Mobile Pentium II detection fix. Sean Gilley, June 1999.
13 *
14 * IDT Winchip tweaks, misc clean ups.
15 * Dave Jones <davej@suse.de>, August 1999
16 *
17 * Better detection of Centaur/IDT WinChip models.
18 * Bart Hartgers <bart@etpmod.phys.tue.nl>, August 1999.
19 *
20 * Cleaned up cache-detection code
21 * Dave Jones <davej@suse.de>, October 1999
22 *
23 * Added proper L2 cache detection for Coppermine
24 * Dragan Stancevic <visitor@valinux.com>, October 1999
25 *
26 * Added the original array for capability flags but forgot to credit
27 * myself :) (~1998) Fixed/cleaned up some cpu_model_info and other stuff
28 * Jauder Ho <jauderho@carumba.com>, January 2000
29 *
30 * Detection for Celeron coppermine, identify_cpu() overhauled,
31 * and a few other clean ups.
32 * Dave Jones <davej@suse.de>, April 2000
33 *
34 * Pentium III FXSR, SSE support
35 * General FPU state handling cleanups
36 * Gareth Hughes <gareth@valinux.com>, May 2000
37 *
38 * Added proper Cascades CPU and L2 cache detection for Cascades
39 * and 8-way type cache happy bunch from Intel:^)
40 * Dragan Stancevic <visitor@valinux.com>, May 2000
41 *
42 * Forward port AMD Duron errata T13 from 2.2.17pre
43 * Dave Jones <davej@suse.de>, August 2000
44 *
45 * Forward port lots of fixes/improvements from 2.2.18pre
46 * Cyrix III, Pentium IV support.
47 * Dave Jones <davej@suse.de>, October 2000
48 *
49 * Massive cleanup of CPU detection and bug handling;
50 * Transmeta CPU detection,
51 * H. Peter Anvin <hpa@zytor.com>, November 2000
52 *
53 * VIA C3 Support.
54 * Dave Jones <davej@suse.de>, March 2001
55 *
56 * AMD Athlon/Duron/Thunderbird bluesmoke support.
57 * Dave Jones <davej@suse.de>, April 2001.
58 *
59 * CacheSize bug workaround updates for AMD, Intel & VIA Cyrix.
60 * Dave Jones <davej@suse.de>, September, October 2001.
61 *
62 */
63
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
new file mode 100644
index 000000000000..ebd5d8247faa
--- /dev/null
+++ b/arch/i386/kernel/cpu/common.c
@@ -0,0 +1,634 @@
1#include <linux/init.h>
2#include <linux/string.h>
3#include <linux/delay.h>
4#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <asm/semaphore.h>
8#include <asm/processor.h>
9#include <asm/i387.h>
10#include <asm/msr.h>
11#include <asm/io.h>
12#include <asm/mmu_context.h>
13#ifdef CONFIG_X86_LOCAL_APIC
14#include <asm/mpspec.h>
15#include <asm/apic.h>
16#include <mach_apic.h>
17#endif
18
19#include "cpu.h"
20
21DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
22EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
23
24DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
25EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
26
27static int cachesize_override __initdata = -1;
28static int disable_x86_fxsr __initdata = 0;
29static int disable_x86_serial_nr __initdata = 1;
30
31struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
32
33extern void mcheck_init(struct cpuinfo_x86 *c);
34
35extern int disable_pse;
36
37static void default_init(struct cpuinfo_x86 * c)
38{
39 /* Not much we can do here... */
40 /* Check if at least it has cpuid */
41 if (c->cpuid_level == -1) {
42 /* No cpuid. It must be an ancient CPU */
43 if (c->x86 == 4)
44 strcpy(c->x86_model_id, "486");
45 else if (c->x86 == 3)
46 strcpy(c->x86_model_id, "386");
47 }
48}
49
50static struct cpu_dev default_cpu = {
51 .c_init = default_init,
52};
53static struct cpu_dev * this_cpu = &default_cpu;
54
55static int __init cachesize_setup(char *str)
56{
57 get_option (&str, &cachesize_override);
58 return 1;
59}
60__setup("cachesize=", cachesize_setup);
61
62int __init get_model_name(struct cpuinfo_x86 *c)
63{
64 unsigned int *v;
65 char *p, *q;
66
67 if (cpuid_eax(0x80000000) < 0x80000004)
68 return 0;
69
70 v = (unsigned int *) c->x86_model_id;
71 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
72 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
73 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
74 c->x86_model_id[48] = 0;
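	/*
	 * CPUID leaves 0x80000002..0x80000004 each return 16 bytes of the
	 * brand string in EAX/EBX/ECX/EDX, so the three calls above fill
	 * all 48 bytes of x86_model_id, which the line above then
	 * terminates.
	 */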
75
76 /* Intel chips right-justify this string for some dumb reason;
77 undo that brain damage */
78 p = q = &c->x86_model_id[0];
79 while ( *p == ' ' )
80 p++;
81 if ( p != q ) {
82 while ( *p )
83 *q++ = *p++;
84 while ( q <= &c->x86_model_id[48] )
85 *q++ = '\0'; /* Zero-pad the rest */
86 }
87
88 return 1;
89}
90
91
92void __init display_cacheinfo(struct cpuinfo_x86 *c)
93{
94 unsigned int n, dummy, ecx, edx, l2size;
95
96 n = cpuid_eax(0x80000000);
97
98 if (n >= 0x80000005) {
99 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
100 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
101 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
102 c->x86_cache_size=(ecx>>24)+(edx>>24);
103 }
104
105	if (n < 0x80000006)	/* Some chips just have a large L1. */
106 return;
107
108 ecx = cpuid_ecx(0x80000006);
109 l2size = ecx >> 16;
110
111 /* do processor-specific cache resizing */
112 if (this_cpu->c_size_cache)
113 l2size = this_cpu->c_size_cache(c,l2size);
114
115 /* Allow user to override all this if necessary. */
116 if (cachesize_override != -1)
117 l2size = cachesize_override;
118
119 if ( l2size == 0 )
120 return; /* Again, no L2 cache is possible */
121
122 c->x86_cache_size = l2size;
123
124 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
125 l2size, ecx & 0xFF);
126}
127
128/* Naming convention should be: <Name> [(<Codename>)] */
129/* This table is only used if init_<vendor>() below doesn't set the model name; */
130/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
131
132/* Look up CPU names by table lookup. */
133static char __init *table_lookup_model(struct cpuinfo_x86 *c)
134{
135 struct cpu_model_info *info;
136
137 if ( c->x86_model >= 16 )
138 return NULL; /* Range check */
139
140 if (!this_cpu)
141 return NULL;
142
143 info = this_cpu->c_models;
144
145 while (info && info->family) {
146 if (info->family == c->x86)
147 return info->model_names[c->x86_model];
148 info++;
149 }
150 return NULL; /* Not found */
151}
152
153
154void __init get_cpu_vendor(struct cpuinfo_x86 *c, int early)
155{
156 char *v = c->x86_vendor_id;
157 int i;
158
159 for (i = 0; i < X86_VENDOR_NUM; i++) {
160 if (cpu_devs[i]) {
161 if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
162 (cpu_devs[i]->c_ident[1] &&
163 !strcmp(v,cpu_devs[i]->c_ident[1]))) {
164 c->x86_vendor = i;
165 if (!early)
166 this_cpu = cpu_devs[i];
167 break;
168 }
169 }
170 }
171}
172
173
174static int __init x86_fxsr_setup(char * s)
175{
176 disable_x86_fxsr = 1;
177 return 1;
178}
179__setup("nofxsr", x86_fxsr_setup);
180
181
182/* Standard macro to see if a specific flag is changeable */
183static inline int flag_is_changeable_p(u32 flag)
184{
185 u32 f1, f2;
186
187 asm("pushfl\n\t"
188 "pushfl\n\t"
189 "popl %0\n\t"
190 "movl %0,%1\n\t"
191 "xorl %2,%0\n\t"
192 "pushl %0\n\t"
193 "popfl\n\t"
194 "pushfl\n\t"
195 "popl %0\n\t"
196 "popfl\n\t"
197 : "=&r" (f1), "=&r" (f2)
198 : "ir" (flag));
199
200 return ((f1^f2) & flag) != 0;
201}
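
/*
 * For reference: X86_EFLAGS_ID (bit 21) can only be toggled on CPUs that
 * implement CPUID, which is what have_cpuid_p() below relies on, and
 * X86_EFLAGS_AC (bit 18) can only be toggled on a 486 or later, which is
 * how identify_cpu() tells a 386 from a 486 when CPUID is absent.
 */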
202
203
204/* Probe for the CPUID instruction */
205static int __init have_cpuid_p(void)
206{
207 return flag_is_changeable_p(X86_EFLAGS_ID);
208}
209
210/* Do minimum CPU detection early.
211 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
212 The others are not touched to avoid unwanted side effects. */
213static void __init early_cpu_detect(void)
214{
215 struct cpuinfo_x86 *c = &boot_cpu_data;
216
217 c->x86_cache_alignment = 32;
218
219 if (!have_cpuid_p())
220 return;
221
222 /* Get vendor name */
223 cpuid(0x00000000, &c->cpuid_level,
224 (int *)&c->x86_vendor_id[0],
225 (int *)&c->x86_vendor_id[8],
226 (int *)&c->x86_vendor_id[4]);
227
228 get_cpu_vendor(c, 1);
229
230 c->x86 = 4;
231 if (c->cpuid_level >= 0x00000001) {
232 u32 junk, tfms, cap0, misc;
233 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
234 c->x86 = (tfms >> 8) & 15;
235 c->x86_model = (tfms >> 4) & 15;
236 if (c->x86 == 0xf) {
237 c->x86 += (tfms >> 20) & 0xff;
238 c->x86_model += ((tfms >> 16) & 0xF) << 4;
239 }
240 c->x86_mask = tfms & 15;
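		/*
		 * Worked example (a plausible Pentium 4 signature, for
		 * illustration only): tfms == 0x00000F29 decodes to family
		 * 0xF, model 2, stepping 9; since the family nibble is 0xF
		 * the extended fields are added in, both zero here, leaving
		 * x86 == 15 and x86_model == 2.
		 */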
241 if (cap0 & (1<<19))
242 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
243 }
244
245 early_intel_workaround(c);
246}
247
248void __init generic_identify(struct cpuinfo_x86 * c)
249{
250 u32 tfms, xlvl;
251 int junk;
252
253 if (have_cpuid_p()) {
254 /* Get vendor name */
255 cpuid(0x00000000, &c->cpuid_level,
256 (int *)&c->x86_vendor_id[0],
257 (int *)&c->x86_vendor_id[8],
258 (int *)&c->x86_vendor_id[4]);
259
260 get_cpu_vendor(c, 0);
261 /* Initialize the standard set of capabilities */
262 /* Note that the vendor-specific code below might override */
263
264 /* Intel-defined flags: level 0x00000001 */
265 if ( c->cpuid_level >= 0x00000001 ) {
266 u32 capability, excap;
267 cpuid(0x00000001, &tfms, &junk, &excap, &capability);
268 c->x86_capability[0] = capability;
269 c->x86_capability[4] = excap;
270 c->x86 = (tfms >> 8) & 15;
271 c->x86_model = (tfms >> 4) & 15;
272 if (c->x86 == 0xf) {
273 c->x86 += (tfms >> 20) & 0xff;
274 c->x86_model += ((tfms >> 16) & 0xF) << 4;
275 }
276 c->x86_mask = tfms & 15;
277 } else {
278 /* Have CPUID level 0 only - unheard of */
279 c->x86 = 4;
280 }
281
282 /* AMD-defined flags: level 0x80000001 */
283 xlvl = cpuid_eax(0x80000000);
284 if ( (xlvl & 0xffff0000) == 0x80000000 ) {
285 if ( xlvl >= 0x80000001 ) {
286 c->x86_capability[1] = cpuid_edx(0x80000001);
287 c->x86_capability[6] = cpuid_ecx(0x80000001);
288 }
289 if ( xlvl >= 0x80000004 )
290 get_model_name(c); /* Default name */
291 }
292 }
293}
294
295static void __init squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
296{
297 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
298 /* Disable processor serial number */
299 unsigned long lo,hi;
300 rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
301 lo |= 0x200000;
302 wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
303 printk(KERN_NOTICE "CPU serial number disabled.\n");
304 clear_bit(X86_FEATURE_PN, c->x86_capability);
305
306 /* Disabling the serial number may affect the cpuid level */
307 c->cpuid_level = cpuid_eax(0);
308 }
309}
310
311static int __init x86_serial_nr_setup(char *s)
312{
313 disable_x86_serial_nr = 0;
314 return 1;
315}
316__setup("serialnumber", x86_serial_nr_setup);
317
318
319
320/*
321 * This does the hard work of actually picking apart the CPU stuff...
322 */
323void __init identify_cpu(struct cpuinfo_x86 *c)
324{
325 int i;
326
327 c->loops_per_jiffy = loops_per_jiffy;
328 c->x86_cache_size = -1;
329 c->x86_vendor = X86_VENDOR_UNKNOWN;
330 c->cpuid_level = -1; /* CPUID not detected */
331 c->x86_model = c->x86_mask = 0; /* So far unknown... */
332 c->x86_vendor_id[0] = '\0'; /* Unset */
333 c->x86_model_id[0] = '\0'; /* Unset */
334 c->x86_num_cores = 1;
335 memset(&c->x86_capability, 0, sizeof c->x86_capability);
336
337 if (!have_cpuid_p()) {
338 /* First of all, decide if this is a 486 or higher */
339 /* It's a 486 if we can modify the AC flag */
340 if ( flag_is_changeable_p(X86_EFLAGS_AC) )
341 c->x86 = 4;
342 else
343 c->x86 = 3;
344 }
345
346 generic_identify(c);
347
348 printk(KERN_DEBUG "CPU: After generic identify, caps:");
349 for (i = 0; i < NCAPINTS; i++)
350 printk(" %08lx", c->x86_capability[i]);
351 printk("\n");
352
353 if (this_cpu->c_identify) {
354 this_cpu->c_identify(c);
355
356 printk(KERN_DEBUG "CPU: After vendor identify, caps:");
357 for (i = 0; i < NCAPINTS; i++)
358 printk(" %08lx", c->x86_capability[i]);
359 printk("\n");
360 }
361
362 /*
363 * Vendor-specific initialization. In this section we
364 * canonicalize the feature flags, meaning if there are
365 * features a certain CPU supports which CPUID doesn't
366 * tell us, CPUID claiming incorrect flags, or other bugs,
367 * we handle them here.
368 *
369 * At the end of this section, c->x86_capability better
370 * indicate the features this CPU genuinely supports!
371 */
372 if (this_cpu->c_init)
373 this_cpu->c_init(c);
374
375 /* Disable the PN if appropriate */
376 squash_the_stupid_serial_number(c);
377
378 /*
379 * The vendor-specific functions might have changed features. Now
380 * we do "generic changes."
381 */
382
383 /* TSC disabled? */
384 if ( tsc_disable )
385 clear_bit(X86_FEATURE_TSC, c->x86_capability);
386
387 /* FXSR disabled? */
388 if (disable_x86_fxsr) {
389 clear_bit(X86_FEATURE_FXSR, c->x86_capability);
390 clear_bit(X86_FEATURE_XMM, c->x86_capability);
391 }
392
393 if (disable_pse)
394 clear_bit(X86_FEATURE_PSE, c->x86_capability);
395
396 /* If the model name is still unset, do table lookup. */
397 if ( !c->x86_model_id[0] ) {
398 char *p;
399 p = table_lookup_model(c);
400 if ( p )
401 strcpy(c->x86_model_id, p);
402 else
403 /* Last resort... */
404 sprintf(c->x86_model_id, "%02x/%02x",
405 c->x86_vendor, c->x86_model);
406 }
407
408 /* Now the feature flags better reflect actual CPU features! */
409
410 printk(KERN_DEBUG "CPU: After all inits, caps:");
411 for (i = 0; i < NCAPINTS; i++)
412 printk(" %08lx", c->x86_capability[i]);
413 printk("\n");
414
415 /*
416 * On SMP, boot_cpu_data holds the common feature set between
417 * all CPUs; so make sure that we indicate which features are
418 * common between the CPUs. The first time this routine gets
419 * executed, c == &boot_cpu_data.
420 */
421 if ( c != &boot_cpu_data ) {
422 /* AND the already accumulated flags with these */
423 for ( i = 0 ; i < NCAPINTS ; i++ )
424 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
425 }
426
427 /* Init Machine Check Exception if available. */
428#ifdef CONFIG_X86_MCE
429 mcheck_init(c);
430#endif
431}
432
433#ifdef CONFIG_X86_HT
434void __init detect_ht(struct cpuinfo_x86 *c)
435{
436 u32 eax, ebx, ecx, edx;
437 int index_lsb, index_msb, tmp;
438 int cpu = smp_processor_id();
439
440 if (!cpu_has(c, X86_FEATURE_HT))
441 return;
442
443 cpuid(1, &eax, &ebx, &ecx, &edx);
444 smp_num_siblings = (ebx & 0xff0000) >> 16;
445
446 if (smp_num_siblings == 1) {
447 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
448 } else if (smp_num_siblings > 1 ) {
449 index_lsb = 0;
450 index_msb = 31;
451
452 if (smp_num_siblings > NR_CPUS) {
453			printk(KERN_WARNING "CPU: Unsupported number of siblings: %d\n", smp_num_siblings);
454 smp_num_siblings = 1;
455 return;
456 }
457 tmp = smp_num_siblings;
458 while ((tmp & 1) == 0) {
459 tmp >>=1 ;
460 index_lsb++;
461 }
462 tmp = smp_num_siblings;
463 while ((tmp & 0x80000000 ) == 0) {
464 tmp <<=1 ;
465 index_msb--;
466 }
467 if (index_lsb != index_msb )
468 index_msb++;
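		/*
		 * index_msb is now ceil(log2(smp_num_siblings)), e.g. 1 for
		 * 2 siblings and 2 for 3 or 4 siblings, so phys_pkg_id()
		 * below derives the physical package ID by dropping that
		 * many low-order sibling bits of the APIC ID (on the
		 * default APIC model).
		 */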
469 phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
470
471 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
472 phys_proc_id[cpu]);
473 }
474}
475#endif
476
477void __init print_cpu_info(struct cpuinfo_x86 *c)
478{
479 char *vendor = NULL;
480
481 if (c->x86_vendor < X86_VENDOR_NUM)
482 vendor = this_cpu->c_vendor;
483 else if (c->cpuid_level >= 0)
484 vendor = c->x86_vendor_id;
485
486 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
487 printk("%s ", vendor);
488
489 if (!c->x86_model_id[0])
490 printk("%d86", c->x86);
491 else
492 printk("%s", c->x86_model_id);
493
494 if (c->x86_mask || c->cpuid_level >= 0)
495 printk(" stepping %02x\n", c->x86_mask);
496 else
497 printk("\n");
498}
499
500cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
501
502/* This is hacky. :)
503 * We're emulating future behavior.
504 * In the future, the cpu-specific init functions will be called implicitly
505 * via the magic of initcalls.
506 * They will insert themselves into the cpu_devs structure.
507 * Then, when cpu_init() is called, we can just iterate over that array.
508 */
509
510extern int intel_cpu_init(void);
511extern int cyrix_init_cpu(void);
512extern int nsc_init_cpu(void);
513extern int amd_init_cpu(void);
514extern int centaur_init_cpu(void);
515extern int transmeta_init_cpu(void);
516extern int rise_init_cpu(void);
517extern int nexgen_init_cpu(void);
518extern int umc_init_cpu(void);
519
520void __init early_cpu_init(void)
521{
522 intel_cpu_init();
523 cyrix_init_cpu();
524 nsc_init_cpu();
525 amd_init_cpu();
526 centaur_init_cpu();
527 transmeta_init_cpu();
528 rise_init_cpu();
529 nexgen_init_cpu();
530 umc_init_cpu();
531 early_cpu_detect();
532
533#ifdef CONFIG_DEBUG_PAGEALLOC
534 /* pse is not compatible with on-the-fly unmapping,
535 * disable it even if the cpus claim to support it.
536 */
537 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
538 disable_pse = 1;
539#endif
540}
541/*
542 * cpu_init() initializes state that is per-CPU. Some data is already
543 * initialized (naturally) in the bootstrap process, such as the GDT
544 * and IDT. We reload them nevertheless, this function acts as a
545 * 'CPU state barrier', nothing should get across.
546 */
547void __init cpu_init (void)
548{
549 int cpu = smp_processor_id();
550 struct tss_struct * t = &per_cpu(init_tss, cpu);
551 struct thread_struct *thread = &current->thread;
552 __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
553
554 if (cpu_test_and_set(cpu, cpu_initialized)) {
555 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
556 for (;;) local_irq_enable();
557 }
558 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
559
560 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
561 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
562 if (tsc_disable && cpu_has_tsc) {
563 printk(KERN_NOTICE "Disabling TSC...\n");
564 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
565 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
566 set_in_cr4(X86_CR4_TSD);
567 }
568
569 /*
570 * Initialize the per-CPU GDT with the boot GDT,
571 * and set up the GDT descriptor:
572 */
573 memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table,
574 GDT_SIZE);
575
576 /* Set up GDT entry for 16bit stack */
577 *(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |=
578 ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
579 ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
580 (CPU_16BIT_STACK_SIZE - 1);
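	/*
	 * Descriptor layout refresher: in the 8-byte GDT entry, base[23:0]
	 * occupies bits 16-39, base[31:24] occupies bits 56-63, and bits
	 * 0-15 hold the low 16 bits of the limit; the three OR terms above
	 * drop the per-CPU stack address and size into exactly those slots,
	 * while the access and granularity bits are kept from the boot GDT
	 * entry copied above.
	 */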
581
582 cpu_gdt_descr[cpu].size = GDT_SIZE - 1;
583 cpu_gdt_descr[cpu].address =
584 (unsigned long)&per_cpu(cpu_gdt_table, cpu);
585
586 /*
587 * Set up the per-thread TLS descriptor cache:
588 */
589 memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu),
590 GDT_ENTRY_TLS_ENTRIES * 8);
591
592 __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu]));
593 __asm__ __volatile__("lidt %0" : : "m" (idt_descr));
594
595 /*
596 * Delete NT
597 */
598 __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
599
600 /*
601 * Set up and load the per-CPU TSS and LDT
602 */
603 atomic_inc(&init_mm.mm_count);
604 current->active_mm = &init_mm;
605 if (current->mm)
606 BUG();
607 enter_lazy_tlb(&init_mm, current);
608
609 load_esp0(t, thread);
610 set_tss_desc(cpu,t);
611 load_TR_desc();
612 load_LDT(&init_mm.context);
613
614 /* Set up doublefault TSS pointer in the GDT */
615 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
616
617 /* Clear %fs and %gs. */
618 asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
619
620 /* Clear all 6 debug registers: */
621
622#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
623
624 CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
625
626#undef CD
627
628 /*
629 * Force FPU initialization:
630 */
631 current_thread_info()->status = 0;
632 clear_used_math();
633 mxcsr_feature_mask_init();
634}
diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h
new file mode 100644
index 000000000000..5a1d4f163e84
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpu.h
@@ -0,0 +1,30 @@
1
2struct cpu_model_info {
3 int vendor;
4 int family;
5 char *model_names[16];
6};
7
8/* attempt to consolidate cpu attributes */
9struct cpu_dev {
10 char * c_vendor;
11
12 /* some have two possibilities for cpuid string */
13 char * c_ident[2];
14
15 struct cpu_model_info c_models[4];
16
17 void (*c_init)(struct cpuinfo_x86 * c);
18 void (*c_identify)(struct cpuinfo_x86 * c);
19 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
20};
21
22extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
23
24extern int get_model_name(struct cpuinfo_x86 *c);
25extern void display_cacheinfo(struct cpuinfo_x86 *c);
26
27extern void generic_identify(struct cpuinfo_x86 * c);
28
29extern void early_intel_workaround(struct cpuinfo_x86 *c);
30
diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig
new file mode 100644
index 000000000000..f25ffd74235c
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/Kconfig
@@ -0,0 +1,231 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE
16 depends on ACPI_PROCESSOR
17 help
18 This driver adds a CPUFreq driver which utilizes the ACPI
19 Processor Performance States.
20
21 For details, take a look at <file:Documentation/cpu-freq/>.
22
23 If in doubt, say N.
24
25config ELAN_CPUFREQ
26 tristate "AMD Elan"
27 select CPU_FREQ_TABLE
28 depends on X86_ELAN
29 ---help---
30 This adds the CPUFreq driver for AMD Elan SC400 and SC410
31 processors.
32
33 You need to specify the processor maximum speed as boot
34 parameter: elanfreq=maxspeed (in kHz) or as module
35 parameter "max_freq".
36
37 For details, take a look at <file:Documentation/cpu-freq/>.
38
39 If in doubt, say N.
40
41config X86_POWERNOW_K6
42 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
43 select CPU_FREQ_TABLE
44 help
45 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
46 AMD K6-3+ processors.
47
48 For details, take a look at <file:Documentation/cpu-freq/>.
49
50 If in doubt, say N.
51
52config X86_POWERNOW_K7
53 tristate "AMD Mobile Athlon/Duron PowerNow!"
54 select CPU_FREQ_TABLE
55 help
56	  This adds the CPUFreq driver for mobile AMD K7 (Athlon/Duron) processors.
57
58 For details, take a look at <file:Documentation/cpu-freq/>.
59
60 If in doubt, say N.
61
62config X86_POWERNOW_K7_ACPI
63 bool
64 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
65 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
66 default y
67
68config X86_POWERNOW_K8
69 tristate "AMD Opteron/Athlon64 PowerNow!"
70 select CPU_FREQ_TABLE
71 depends on EXPERIMENTAL
72 help
73	  This adds the CPUFreq driver for AMD Opteron/Athlon64 processors.
74
75 For details, take a look at <file:Documentation/cpu-freq/>.
76
77 If in doubt, say N.
78
79config X86_POWERNOW_K8_ACPI
80 bool
81 depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
82 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
83 default y
84
85config X86_GX_SUSPMOD
86 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
87 help
88 This add the CPUFreq driver for NatSemi Geode processors which
89 support suspend modulation.
90
91 For details, take a look at <file:Documentation/cpu-freq/>.
92
93 If in doubt, say N.
94
95config X86_SPEEDSTEP_CENTRINO
96 tristate "Intel Enhanced SpeedStep"
97 select CPU_FREQ_TABLE
98 select X86_SPEEDSTEP_CENTRINO_TABLE if (!X86_SPEEDSTEP_CENTRINO_ACPI)
99 help
100 This adds the CPUFreq driver for Enhanced SpeedStep enabled
101 mobile CPUs. This means Intel Pentium M (Centrino) CPUs. However,
102 you also need to say Y to "Use ACPI tables to decode..." below
103 [which might imply enabling ACPI] if you want to use this driver
104 on non-Banias CPUs.
105
106 For details, take a look at <file:Documentation/cpu-freq/>.
107
108 If in doubt, say N.
109
110config X86_SPEEDSTEP_CENTRINO_ACPI
111 bool "Use ACPI tables to decode valid frequency/voltage pairs"
112 depends on X86_SPEEDSTEP_CENTRINO && ACPI_PROCESSOR
113 depends on !(X86_SPEEDSTEP_CENTRINO = y && ACPI_PROCESSOR = m)
114 default y
115 help
116 Use primarily the information provided in the BIOS ACPI tables
117 to determine valid CPU frequency and voltage pairings. It is
118 required for the driver to work on non-Banias CPUs.
119
120 If in doubt, say Y.
121
122config X86_SPEEDSTEP_CENTRINO_TABLE
123 bool "Built-in tables for Banias CPUs"
124 depends on X86_SPEEDSTEP_CENTRINO
125 default y
126 help
127 Use built-in tables for Banias CPUs if ACPI encoding
128 is not available.
129
130 If in doubt, say N.
131
132config X86_SPEEDSTEP_ICH
133 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
134 select CPU_FREQ_TABLE
135 help
136 This adds the CPUFreq driver for certain mobile Intel Pentium III
137 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
138 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
139 ICH3 or ICH4 southbridge.
140
141 For details, take a look at <file:Documentation/cpu-freq/>.
142
143 If in doubt, say N.
144
145config X86_SPEEDSTEP_SMI
146 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
147 select CPU_FREQ_TABLE
148 depends on EXPERIMENTAL
149 help
150 This adds the CPUFreq driver for certain mobile Intel Pentium III
151	  (Coppermine) and all mobile Intel Pentium III-M (Tualatin) processors
152	  on systems which have an Intel 440BX/ZX/MX southbridge.
153
154 For details, take a look at <file:Documentation/cpu-freq/>.
155
156 If in doubt, say N.
157
158config X86_P4_CLOCKMOD
159 tristate "Intel Pentium 4 clock modulation"
160 select CPU_FREQ_TABLE
161 help
162 This adds the CPUFreq driver for Intel Pentium 4 / XEON
163 processors.
164
165 For details, take a look at <file:Documentation/cpu-freq/>.
166
167 If in doubt, say N.
168
169config X86_CPUFREQ_NFORCE2
170 tristate "nVidia nForce2 FSB changing"
171 depends on EXPERIMENTAL
172 help
173 This adds the CPUFreq driver for FSB changing on nVidia nForce2
174 platforms.
175
176 For details, take a look at <file:Documentation/cpu-freq/>.
177
178 If in doubt, say N.
179
180config X86_LONGRUN
181 tristate "Transmeta LongRun"
182 help
183 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
184 which support LongRun.
185
186 For details, take a look at <file:Documentation/cpu-freq/>.
187
188 If in doubt, say N.
189
190config X86_LONGHAUL
191 tristate "VIA Cyrix III Longhaul"
192 select CPU_FREQ_TABLE
193 help
194 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
195 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
196 processors.
197
198 For details, take a look at <file:Documentation/cpu-freq/>.
199
200 If in doubt, say N.
201
202comment "shared options"
203
204config X86_ACPI_CPUFREQ_PROC_INTF
205 bool "/proc/acpi/processor/../performance interface (deprecated)"
206 depends on PROC_FS
207 depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI
208 help
209 This enables the deprecated /proc/acpi/processor/../performance
210 interface. While it is helpful for debugging, the generic,
211 cross-architecture cpufreq interfaces should be used.
212
213 If in doubt, say N.
214
215config X86_SPEEDSTEP_LIB
216 tristate
217 default X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD
218
219config X86_SPEEDSTEP_RELAXED_CAP_CHECK
220 bool "Relaxed speedstep capability checks"
221 depends on (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
222 help
223	  Don't perform all of the checks that would normally be done on a
224	  SpeedStep-capable system. Some ancient or strange systems, though
225	  SpeedStep-capable, don't always indicate that they are. This
226 option lets the probing code bypass some of those checks if the
227 parameter "relaxed_check=1" is passed to the module.
228
229endif # CPU_FREQ
230
231endmenu
diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile
new file mode 100644
index 000000000000..a922e97aeedd
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/Makefile
@@ -0,0 +1,14 @@
1obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
5obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
6obj-$(CONFIG_X86_LONGRUN) += longrun.o
7obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
8obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
9obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
10obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
11obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
12obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
13obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
14obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
new file mode 100644
index 000000000000..963e17aa205d
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -0,0 +1,537 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.3 $)
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License along
21 * with this program; if not, write to the Free Software Foundation, Inc.,
22 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
23 *
24 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 */
26
27#include <linux/config.h>
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <asm/io.h>
35#include <asm/delay.h>
36#include <asm/uaccess.h>
37
38#include <linux/acpi.h>
39#include <acpi/processor.h>
40
41#include "speedstep-est-common.h"
42
43#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
44
45MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
46MODULE_DESCRIPTION("ACPI Processor P-States Driver");
47MODULE_LICENSE("GPL");
48
49
50struct cpufreq_acpi_io {
51 struct acpi_processor_performance acpi_data;
52 struct cpufreq_frequency_table *freq_table;
53 unsigned int resume;
54};
55
56static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS];
57
58static struct cpufreq_driver acpi_cpufreq_driver;
59
60static int
61acpi_processor_write_port(
62 u16 port,
63 u8 bit_width,
64 u32 value)
65{
66 if (bit_width <= 8) {
67 outb(value, port);
68 } else if (bit_width <= 16) {
69 outw(value, port);
70 } else if (bit_width <= 32) {
71 outl(value, port);
72 } else {
73 return -ENODEV;
74 }
75 return 0;
76}
77
78static int
79acpi_processor_read_port(
80 u16 port,
81 u8 bit_width,
82 u32 *ret)
83{
84 *ret = 0;
85 if (bit_width <= 8) {
86 *ret = inb(port);
87 } else if (bit_width <= 16) {
88 *ret = inw(port);
89 } else if (bit_width <= 32) {
90 *ret = inl(port);
91 } else {
92 return -ENODEV;
93 }
94 return 0;
95}
96
97static int
98acpi_processor_set_performance (
99 struct cpufreq_acpi_io *data,
100 unsigned int cpu,
101 int state)
102{
103 u16 port = 0;
104 u8 bit_width = 0;
105 int ret = 0;
106 u32 value = 0;
107 int i = 0;
108 struct cpufreq_freqs cpufreq_freqs;
109 cpumask_t saved_mask;
110 int retval;
111
112 dprintk("acpi_processor_set_performance\n");
113
114 /*
115 * TBD: Use something other than set_cpus_allowed.
116	 * set_cpus_allowed() is a bit racy with respect to any other
117	 * set_cpus_allowed() call for this process.
118 */
119 saved_mask = current->cpus_allowed;
120 set_cpus_allowed(current, cpumask_of_cpu(cpu));
121	if (smp_processor_id() != cpu) {
122		retval = -EAGAIN;
		goto migrate_end;
123	}
124
125 if (state == data->acpi_data.state) {
126 if (unlikely(data->resume)) {
127 dprintk("Called after resume, resetting to P%d\n", state);
128 data->resume = 0;
129 } else {
130 dprintk("Already at target state (P%d)\n", state);
131 retval = 0;
132 goto migrate_end;
133 }
134 }
135
136 dprintk("Transitioning from P%d to P%d\n",
137 data->acpi_data.state, state);
138
139 /* cpufreq frequency struct */
140 cpufreq_freqs.cpu = cpu;
141 cpufreq_freqs.old = data->freq_table[data->acpi_data.state].frequency;
142 cpufreq_freqs.new = data->freq_table[state].frequency;
143
144 /* notify cpufreq */
145 cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE);
146
147 /*
148 * First we write the target state's 'control' value to the
149 * control_register.
150 */
151
152 port = data->acpi_data.control_register.address;
153 bit_width = data->acpi_data.control_register.bit_width;
154 value = (u32) data->acpi_data.states[state].control;
155
156 dprintk("Writing 0x%08x to port 0x%04x\n", value, port);
157
158 ret = acpi_processor_write_port(port, bit_width, value);
159 if (ret) {
160 dprintk("Invalid port width 0x%04x\n", bit_width);
161 retval = ret;
162 goto migrate_end;
163 }
164
165 /*
166 * Then we read the 'status_register' and compare the value with the
167 * target state's 'status' to make sure the transition was successful.
168 * Note that we'll poll for up to 1ms (100 cycles of 10us) before
169 * giving up.
170 */
171
172 port = data->acpi_data.status_register.address;
173 bit_width = data->acpi_data.status_register.bit_width;
174
175 dprintk("Looking for 0x%08x from port 0x%04x\n",
176 (u32) data->acpi_data.states[state].status, port);
177
178 for (i=0; i<100; i++) {
179 ret = acpi_processor_read_port(port, bit_width, &value);
180 if (ret) {
181 dprintk("Invalid port width 0x%04x\n", bit_width);
182 retval = ret;
183 goto migrate_end;
184 }
185 if (value == (u32) data->acpi_data.states[state].status)
186 break;
187 udelay(10);
188 }
189
190 /* notify cpufreq */
191 cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
192
193 if (value != (u32) data->acpi_data.states[state].status) {
194 unsigned int tmp = cpufreq_freqs.new;
195 cpufreq_freqs.new = cpufreq_freqs.old;
196 cpufreq_freqs.old = tmp;
197 cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE);
198 cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
199 printk(KERN_WARNING "acpi-cpufreq: Transition failed\n");
200 retval = -ENODEV;
201 goto migrate_end;
202 }
203
204 dprintk("Transition successful after %d microseconds\n", i * 10);
205
206 data->acpi_data.state = state;
207
208 retval = 0;
209migrate_end:
210 set_cpus_allowed(current, saved_mask);
211 return (retval);
212}
213
214
215static int
216acpi_cpufreq_target (
217 struct cpufreq_policy *policy,
218 unsigned int target_freq,
219 unsigned int relation)
220{
221 struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
222 unsigned int next_state = 0;
223 unsigned int result = 0;
224
225 dprintk("acpi_cpufreq_setpolicy\n");
226
227 result = cpufreq_frequency_table_target(policy,
228 data->freq_table,
229 target_freq,
230 relation,
231 &next_state);
232 if (result)
233 return (result);
234
235 result = acpi_processor_set_performance (data, policy->cpu, next_state);
236
237 return (result);
238}
239
240
241static int
242acpi_cpufreq_verify (
243 struct cpufreq_policy *policy)
244{
245 unsigned int result = 0;
246 struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
247
248 dprintk("acpi_cpufreq_verify\n");
249
250 result = cpufreq_frequency_table_verify(policy,
251 data->freq_table);
252
253 return (result);
254}
255
256
257static unsigned long
258acpi_cpufreq_guess_freq (
259 struct cpufreq_acpi_io *data,
260 unsigned int cpu)
261{
262 if (cpu_khz) {
263 /* search the closest match to cpu_khz */
264 unsigned int i;
265 unsigned long freq;
266 unsigned long freqn = data->acpi_data.states[0].core_frequency * 1000;
267
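		/*
		 * _PSS states are normally listed highest frequency first,
		 * so pick the first state whose midpoint with the next one
		 * lies below cpu_khz; e.g. (made-up numbers) with states of
		 * 2000000, 1500000 and 1000000 kHz and cpu_khz == 1400000
		 * the loop settles on the 1500000 kHz entry.
		 */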
268 for (i=0; i < (data->acpi_data.state_count - 1); i++) {
269 freq = freqn;
270 freqn = data->acpi_data.states[i+1].core_frequency * 1000;
271 if ((2 * cpu_khz) > (freqn + freq)) {
272 data->acpi_data.state = i;
273 return (freq);
274 }
275 }
276 data->acpi_data.state = data->acpi_data.state_count - 1;
277 return (freqn);
278	} else {
279		/* assume CPU is at P0... */
280		data->acpi_data.state = 0;
	}

281	return data->acpi_data.states[0].core_frequency * 1000;
282
283}
284
285
286/*
287 * acpi_processor_cpu_init_pdc_est - let BIOS know about the SMP capabilities
288 * of this driver
289 * @perf: processor-specific acpi_processor_performance struct
290 * @cpu: CPU being initialized
291 *
292 * To avoid issues with legacy OSes, some BIOSes need to be informed of
293 * the SMP capabilities of the OS P-state driver. Here we set the bits in
294 * _PDC accordingly, for Enhanced SpeedStep. The actual call to _PDC is
295 * done in drivers/acpi/processor.c
296 */
297static void
298acpi_processor_cpu_init_pdc_est(
299 struct acpi_processor_performance *perf,
300 unsigned int cpu,
301 struct acpi_object_list *obj_list
302 )
303{
304 union acpi_object *obj;
305 u32 *buf;
306 struct cpuinfo_x86 *c = cpu_data + cpu;
307 dprintk("acpi_processor_cpu_init_pdc_est\n");
308
309 if (!cpu_has(c, X86_FEATURE_EST))
310 return;
311
312 /* Initialize pdc. It will be used later. */
313 if (!obj_list)
314 return;
315
316 if (!(obj_list->count && obj_list->pointer))
317 return;
318
319 obj = obj_list->pointer;
320 if ((obj->buffer.length == 12) && obj->buffer.pointer) {
321 buf = (u32 *)obj->buffer.pointer;
322 buf[0] = ACPI_PDC_REVISION_ID;
323 buf[1] = 1;
324 buf[2] = ACPI_PDC_EST_CAPABILITY_SMP;
325 perf->pdc = obj_list;
326 }
327 return;
328}
329
330
331/* CPU specific PDC initialization */
332static void
333acpi_processor_cpu_init_pdc(
334 struct acpi_processor_performance *perf,
335 unsigned int cpu,
336 struct acpi_object_list *obj_list
337 )
338{
339 struct cpuinfo_x86 *c = cpu_data + cpu;
340 dprintk("acpi_processor_cpu_init_pdc\n");
341 perf->pdc = NULL;
342 if (cpu_has(c, X86_FEATURE_EST))
343 acpi_processor_cpu_init_pdc_est(perf, cpu, obj_list);
344 return;
345}
346
347
348static int
349acpi_cpufreq_cpu_init (
350 struct cpufreq_policy *policy)
351{
352 unsigned int i;
353 unsigned int cpu = policy->cpu;
354 struct cpufreq_acpi_io *data;
355 unsigned int result = 0;
356
357 union acpi_object arg0 = {ACPI_TYPE_BUFFER};
358 u32 arg0_buf[3];
359 struct acpi_object_list arg_list = {1, &arg0};
360
361 dprintk("acpi_cpufreq_cpu_init\n");
362 /* setup arg_list for _PDC settings */
363 arg0.buffer.length = 12;
364 arg0.buffer.pointer = (u8 *) arg0_buf;
365
366 data = kmalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL);
367 if (!data)
368 return (-ENOMEM);
369 memset(data, 0, sizeof(struct cpufreq_acpi_io));
370
371 acpi_io_data[cpu] = data;
372
373 acpi_processor_cpu_init_pdc(&data->acpi_data, cpu, &arg_list);
374 result = acpi_processor_register_performance(&data->acpi_data, cpu);
375 data->acpi_data.pdc = NULL;
376
377 if (result)
378 goto err_free;
379
380 if (is_const_loops_cpu(cpu)) {
381 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
382 }
383
384 /* capability check */
385 if (data->acpi_data.state_count <= 1) {
386 dprintk("No P-States\n");
387 result = -ENODEV;
388 goto err_unreg;
389 }
390 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_SYSTEM_IO) ||
391 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_SYSTEM_IO)) {
392 dprintk("Unsupported address space [%d, %d]\n",
393 (u32) (data->acpi_data.control_register.space_id),
394 (u32) (data->acpi_data.status_register.space_id));
395 result = -ENODEV;
396 goto err_unreg;
397 }
398
399 /* alloc freq_table */
400 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * (data->acpi_data.state_count + 1), GFP_KERNEL);
401 if (!data->freq_table) {
402 result = -ENOMEM;
403 goto err_unreg;
404 }
405
406 /* detect transition latency */
407 policy->cpuinfo.transition_latency = 0;
408 for (i=0; i<data->acpi_data.state_count; i++) {
409 if ((data->acpi_data.states[i].transition_latency * 1000) > policy->cpuinfo.transition_latency)
410 policy->cpuinfo.transition_latency = data->acpi_data.states[i].transition_latency * 1000;
411 }
412 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
413
414 /* The current speed is unknown and not detectable by ACPI... */
415 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
416
417 /* table init */
418 for (i=0; i<=data->acpi_data.state_count; i++)
419 {
420 data->freq_table[i].index = i;
421 if (i<data->acpi_data.state_count)
422 data->freq_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000;
423 else
424 data->freq_table[i].frequency = CPUFREQ_TABLE_END;
425 }
426
427 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
428 if (result) {
429 goto err_freqfree;
430 }
431
432 /* notify BIOS that we exist */
433 acpi_processor_notify_smm(THIS_MODULE);
434
435 printk(KERN_INFO "acpi-cpufreq: CPU%u - ACPI performance management activated.\n",
436 cpu);
437 for (i = 0; i < data->acpi_data.state_count; i++)
438 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
439 (i == data->acpi_data.state?'*':' '), i,
440 (u32) data->acpi_data.states[i].core_frequency,
441 (u32) data->acpi_data.states[i].power,
442 (u32) data->acpi_data.states[i].transition_latency);
443
444 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
445 return (result);
446
447 err_freqfree:
448 kfree(data->freq_table);
449 err_unreg:
450 acpi_processor_unregister_performance(&data->acpi_data, cpu);
451 err_free:
452 kfree(data);
453 acpi_io_data[cpu] = NULL;
454
455 return (result);
456}
457
458
459static int
460acpi_cpufreq_cpu_exit (
461 struct cpufreq_policy *policy)
462{
463 struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
464
465
466 dprintk("acpi_cpufreq_cpu_exit\n");
467
468 if (data) {
469 cpufreq_frequency_table_put_attr(policy->cpu);
470 acpi_io_data[policy->cpu] = NULL;
471 acpi_processor_unregister_performance(&data->acpi_data, policy->cpu);
472 kfree(data);
473 }
474
475 return (0);
476}
477
478static int
479acpi_cpufreq_resume (
480 struct cpufreq_policy *policy)
481{
482 struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
483
484
485 dprintk("acpi_cpufreq_resume\n");
486
487 data->resume = 1;
488
489 return (0);
490}
491
492
493static struct freq_attr* acpi_cpufreq_attr[] = {
494 &cpufreq_freq_attr_scaling_available_freqs,
495 NULL,
496};
497
498static struct cpufreq_driver acpi_cpufreq_driver = {
499 .verify = acpi_cpufreq_verify,
500 .target = acpi_cpufreq_target,
501 .init = acpi_cpufreq_cpu_init,
502 .exit = acpi_cpufreq_cpu_exit,
503 .resume = acpi_cpufreq_resume,
504 .name = "acpi-cpufreq",
505 .owner = THIS_MODULE,
506 .attr = acpi_cpufreq_attr,
507};
508
509
510static int __init
511acpi_cpufreq_init (void)
512{
513 int result = 0;
514
515 dprintk("acpi_cpufreq_init\n");
516
517 result = cpufreq_register_driver(&acpi_cpufreq_driver);
518
519 return (result);
520}
521
522
523static void __exit
524acpi_cpufreq_exit (void)
525{
526 dprintk("acpi_cpufreq_exit\n");
527
528 cpufreq_unregister_driver(&acpi_cpufreq_driver);
529
530 return;
531}
532
533
534late_initcall(acpi_cpufreq_init);
535module_exit(acpi_cpufreq_exit);
536
537MODULE_ALIAS("acpi");
diff --git a/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c
new file mode 100644
index 000000000000..04a405345203
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -0,0 +1,457 @@
1/*
2 * (C) 2004 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29//#define NFORCE2_DELAY 10
30
31/* nforce2_chipset:
32 * FSB is changed using the chipset
33 */
34static struct pci_dev *nforce2_chipset_dev;
35
36/* fid:
37 * multiplier * 10
38 */
39static int fid = 0;
40
41/* min_fsb, max_fsb:
42 * minimum and maximum FSB (= FSB at boot time)
43 */
44static int min_fsb = 0;
45static int max_fsb = 0;
46
47MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
48MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
49MODULE_LICENSE("GPL");
50
51module_param(fid, int, 0444);
52module_param(min_fsb, int, 0444);
53
54MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
55MODULE_PARM_DESC(min_fsb,
56 "Minimum FSB to use, if not defined: current FSB - 50");
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
59
60/*
61 * nforce2_calc_fsb - calculate FSB
62 * @pll: PLL value
63 *
64 * Calculates FSB from PLL value
65 */
66static int nforce2_calc_fsb(int pll)
67{
68 unsigned char mul, div;
69
70 mul = (pll >> 8) & 0xff;
71 div = pll & 0xff;
72
73 if (div > 0)
74 return NFORCE2_XTAL * mul / div;
75
76 return 0;
77}
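
/*
 * Rough worked example (values picked for illustration): the PLL word packs
 * the multiplier into bits 8-15 and the divider into bits 0-7 on top of the
 * fixed 25 MHz crystal, so NFORCE2_PLL(0x20, 0x04) == 0x00102004 and decodes
 * back to 25 * 32 / 4 = 200 MHz FSB.
 */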
78
79/*
80 * nforce2_calc_pll - calculate PLL value
81 * @fsb: FSB
82 *
83 * Calculate PLL value for given FSB
84 */
85static int nforce2_calc_pll(unsigned int fsb)
86{
87 unsigned char xmul, xdiv;
88 unsigned char mul = 0, div = 0;
89 int tried = 0;
90
91 /* Try to calculate multiplier and divider up to 4 times */
92 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
93 for (xdiv = 1; xdiv <= 0x80; xdiv++)
94 for (xmul = 1; xmul <= 0xfe; xmul++)
95 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
96 fsb + tried) {
97 mul = xmul;
98 div = xdiv;
99 }
100 tried++;
101 }
102
103 if ((mul == 0) || (div == 0))
104 return -1;
105
106 return NFORCE2_PLL(mul, div);
107}
108
109/*
110 * nforce2_write_pll - write PLL value to chipset
111 * @pll: PLL value
112 *
113 * Writes new FSB PLL value to chipset
114 */
115static void nforce2_write_pll(int pll)
116{
117 int temp;
118
119 /* Set the pll addr. to 0x00 */
120 temp = 0x00;
121 pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, temp);
122
123 /* Now write the value in all 64 registers */
124 for (temp = 0; temp <= 0x3f; temp++) {
125 pci_write_config_dword(nforce2_chipset_dev,
126 NFORCE2_PLLREG, pll);
127 }
128
129 return;
130}
131
132/*
133 * nforce2_fsb_read - Read FSB
134 *
135 * Read FSB from chipset
136 * If bootfsb != 0, return FSB at boot-time
137 */
138static unsigned int nforce2_fsb_read(int bootfsb)
139{
140 struct pci_dev *nforce2_sub5;
141 u32 fsb, temp = 0;
142
143
144 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
145 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
146 0x01EF,
147 PCI_ANY_ID,
148 PCI_ANY_ID,
149 NULL);
150
151 if (!nforce2_sub5)
152 return 0;
153
154	pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
155	fsb /= 1000000;
	pci_dev_put(nforce2_sub5);	/* drop the reference taken by pci_get_subsys() */
156
157 /* Check if PLL register is already set */
158 pci_read_config_byte(nforce2_chipset_dev,
159 NFORCE2_PLLENABLE, (u8 *)&temp);
160
161	if (bootfsb || !temp)
162 return fsb;
163
164 /* Use PLL register FSB value */
165 pci_read_config_dword(nforce2_chipset_dev,
166 NFORCE2_PLLREG, &temp);
167 fsb = nforce2_calc_fsb(temp);
168
169 return fsb;
170}
171
172/*
173 * nforce2_set_fsb - set new FSB
174 * @fsb: New FSB
175 *
176 * Sets new FSB
177 */
178static int nforce2_set_fsb(unsigned int fsb)
179{
180 u32 pll, temp = 0;
181 unsigned int tfsb;
182 int diff;
183
184 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
185 printk(KERN_ERR "cpufreq: FSB %d is out of range!\n", fsb);
186 return -EINVAL;
187 }
188
189 tfsb = nforce2_fsb_read(0);
190 if (!tfsb) {
191 printk(KERN_ERR "cpufreq: Error while reading the FSB\n");
192 return -EINVAL;
193 }
194
195 /* First write? Then set actual value */
196 pci_read_config_byte(nforce2_chipset_dev,
197 NFORCE2_PLLENABLE, (u8 *)&temp);
198 if (!temp) {
199 pll = nforce2_calc_pll(tfsb);
200
201			if ((int)pll < 0)	/* nforce2_calc_pll() returns -1 on error */
202 return -EINVAL;
203
204 nforce2_write_pll(pll);
205 }
206
207 /* Enable write access */
208 temp = 0x01;
209 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8)temp);
210
211 diff = tfsb - fsb;
212
213 if (!diff)
214 return 0;
215
216 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
217 if (diff < 0)
218 tfsb++;
219 else
220 tfsb--;
221
222 /* Calculate the PLL reg. value */
223 if ((pll = nforce2_calc_pll(tfsb)) == -1)
224 return -EINVAL;
225
226 nforce2_write_pll(pll);
227#ifdef NFORCE2_DELAY
228 mdelay(NFORCE2_DELAY);
229#endif
230 }
231
232 temp = 0x40;
233 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLADR, (u8)temp);
234
235 return 0;
236}
237
238/**
239 * nforce2_get - get the CPU frequency
240 * @cpu: CPU number
241 *
242 * Returns the CPU frequency
243 */
244static unsigned int nforce2_get(unsigned int cpu)
245{
246 if (cpu)
247 return 0;
248 return nforce2_fsb_read(0) * fid * 100;
249}
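Because fid holds the multiplier times ten, the conversion above folds the two scalings into one step: fsb_mhz * (fid / 10) * 1000 kHz is the same as fsb_mhz * fid * 100. A quick sketch of that arithmetic (the 166 MHz FSB and 11.5x multiplier are made-up example values):

#include <stdio.h>

int main(void)
{
	unsigned int fsb = 166;	/* FSB in MHz (example value) */
	unsigned int fid = 115;	/* multiplier * 10, i.e. 11.5x (example value) */

	/* same formula as nforce2_get(); prints "1909000 kHz" */
	printf("%u kHz\n", fsb * fid * 100);
	return 0;
}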
250
251/**
252 * nforce2_target - set a new CPUFreq policy
253 * @policy: new policy
254 * @target_freq: the target frequency
255 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
256 *
257 * Sets a new CPUFreq policy.
258 */
259static int nforce2_target(struct cpufreq_policy *policy,
260 unsigned int target_freq, unsigned int relation)
261{
262// unsigned long flags;
263 struct cpufreq_freqs freqs;
264 unsigned int target_fsb;
265
266 if ((target_freq > policy->max) || (target_freq < policy->min))
267 return -EINVAL;
268
269 target_fsb = target_freq / (fid * 100);
270
271 freqs.old = nforce2_get(policy->cpu);
272 freqs.new = target_fsb * fid * 100;
273	freqs.cpu = 0;		/* Only one CPU on nForce2 platforms */
274
275 if (freqs.old == freqs.new)
276 return 0;
277
278 dprintk(KERN_INFO "cpufreq: Old CPU frequency %d kHz, new %d kHz\n",
279 freqs.old, freqs.new);
280
281 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
282
283 /* Disable IRQs */
284 //local_irq_save(flags);
285
286 if (nforce2_set_fsb(target_fsb) < 0)
287 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
288 target_fsb);
289 else
290 dprintk(KERN_INFO "cpufreq: Changed FSB successfully to %d\n",
291 target_fsb);
292
293 /* Enable IRQs */
294 //local_irq_restore(flags);
295
296 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
297
298 return 0;
299}
300
301/**
302 * nforce2_verify - verifies a new CPUFreq policy
303 * @policy: new policy
304 */
305static int nforce2_verify(struct cpufreq_policy *policy)
306{
307 unsigned int fsb_pol_max;
308
309 fsb_pol_max = policy->max / (fid * 100);
310
311 if (policy->min < (fsb_pol_max * fid * 100))
312 policy->max = (fsb_pol_max + 1) * fid * 100;
313
314 cpufreq_verify_within_limits(policy,
315 policy->cpuinfo.min_freq,
316 policy->cpuinfo.max_freq);
317 return 0;
318}
319
320static int nforce2_cpu_init(struct cpufreq_policy *policy)
321{
322 unsigned int fsb;
323 unsigned int rfid;
324
325 /* capability check */
326 if (policy->cpu != 0)
327 return -ENODEV;
328
329 /* Get current FSB */
330 fsb = nforce2_fsb_read(0);
331
332 if (!fsb)
333 return -EIO;
334
335 /* FIX: Get FID from CPU */
336 if (!fid) {
337 if (!cpu_khz) {
338 printk(KERN_WARNING
339 "cpufreq: cpu_khz not set, can't calculate multiplier!\n");
340 return -ENODEV;
341 }
342
343 fid = cpu_khz / (fsb * 100);
344 rfid = fid % 5;
345
346 if (rfid) {
347 if (rfid > 2)
348 fid += 5 - rfid;
349 else
350 fid -= rfid;
351 }
352 }
353
354 printk(KERN_INFO "cpufreq: FSB currently at %i MHz, FID %d.%d\n", fsb,
355 fid / 10, fid % 10);
356
357 /* Set maximum FSB to FSB at boot time */
358 max_fsb = nforce2_fsb_read(1);
359
360	if (!max_fsb)
361 return -EIO;
362
363 if (!min_fsb)
364 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
365
366 if (min_fsb < NFORCE2_MIN_FSB)
367 min_fsb = NFORCE2_MIN_FSB;
368
369 /* cpuinfo and default policy values */
370 policy->cpuinfo.min_freq = min_fsb * fid * 100;
371 policy->cpuinfo.max_freq = max_fsb * fid * 100;
372 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
373 policy->cur = nforce2_get(policy->cpu);
374 policy->min = policy->cpuinfo.min_freq;
375 policy->max = policy->cpuinfo.max_freq;
376 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
377
378 return 0;
379}
380
381static int nforce2_cpu_exit(struct cpufreq_policy *policy)
382{
383 return 0;
384}
385
386static struct cpufreq_driver nforce2_driver = {
387 .name = "nforce2",
388 .verify = nforce2_verify,
389 .target = nforce2_target,
390 .get = nforce2_get,
391 .init = nforce2_cpu_init,
392 .exit = nforce2_cpu_exit,
393 .owner = THIS_MODULE,
394};
395
396/**
397 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
398 *
399 * Detects nForce2 A2 and C1 stepping
400 *
401 */
402static int nforce2_detect_chipset(void)
403{
404 u8 revision;
405
406 nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
407 PCI_DEVICE_ID_NVIDIA_NFORCE2,
408 PCI_ANY_ID,
409 PCI_ANY_ID,
410 NULL);
411
412 if (nforce2_chipset_dev == NULL)
413 return -ENODEV;
414
415 pci_read_config_byte(nforce2_chipset_dev, PCI_REVISION_ID, &revision);
416
417 printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n",
418 revision);
419 printk(KERN_INFO
420	       "cpufreq: FSB changing may be unstable and can lead to crashes and data loss.\n");
421
422 return 0;
423}
424
425/**
426 * nforce2_init - initializes the nForce2 CPUFreq driver
427 *
428 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
429 * devices, -EINVAL on problems during initialization, and zero on
430 * success.
431 */
432static int __init nforce2_init(void)
433{
434 /* TODO: do we need to detect the processor? */
435
436 /* detect chipset */
437 if (nforce2_detect_chipset()) {
438 printk(KERN_ERR "cpufreq: No nForce2 chipset.\n");
439 return -ENODEV;
440 }
441
442 return cpufreq_register_driver(&nforce2_driver);
443}
444
445/**
446 * nforce2_exit - unregisters cpufreq module
447 *
448 * Unregisters nForce2 FSB change support.
449 */
450static void __exit nforce2_exit(void)
451{
452 cpufreq_unregister_driver(&nforce2_driver);
453}
454
455module_init(nforce2_init);
456module_exit(nforce2_exit);
457
diff --git a/arch/i386/kernel/cpu/cpufreq/elanfreq.c b/arch/i386/kernel/cpu/cpufreq/elanfreq.c
new file mode 100644
index 000000000000..3f7caa4ae6d6
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/elanfreq.c
@@ -0,0 +1,312 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/slab.h>
24#include <linux/delay.h>
25#include <linux/cpufreq.h>
26
27#include <asm/msr.h>
28#include <asm/timex.h>
29#include <asm/io.h>
30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
33
34/* Module parameter */
35static int max_freq;
36
37struct s_elan_multiplier {
38 int clock; /* frequency in kHz */
39 int val40h; /* PMU Force Mode register */
40 int val80h; /* CPU Clock Speed Register */
41};
42
43/*
44 * It is important that the frequencies
45 * are listed in ascending order here!
46 */
47struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08},
51 {8000, 0x00, 0x00},
52 {16000, 0x00, 0x02},
53 {33000, 0x00, 0x04},
54 {66000, 0x01, 0x04},
55 {99000, 0x01, 0x05}
56};
57
58static struct cpufreq_frequency_table elanfreq_table[] = {
59 {0, 1000},
60 {1, 2000},
61 {2, 4000},
62 {3, 8000},
63 {4, 16000},
64 {5, 33000},
65 {6, 66000},
66 {7, 99000},
67 {0, CPUFREQ_TABLE_END},
68};
69
70
71/**
72 * elanfreq_get_cpu_frequency: determine current cpu speed
73 *
74 * Finds out at which frequency the CPU of the Elan SOC runs
75 * at the moment. Frequencies from 1 to 33 MHz are generated
76 * the normal way; 66 and 99 MHz are called "Hyperspeed Mode"
77 * and have the rest of the chip running with 33 MHz.
78 */
79
80static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
81{
82 u8 clockspeed_reg; /* Clock Speed Register */
83
84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable();
88
89 if ((clockspeed_reg & 0xE0) == 0xE0) { return 0; }
90
91 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
92 if ((clockspeed_reg & 0xE0) == 0xC0) {
93 if ((clockspeed_reg & 0x01) == 0) {
94 return 66000;
95 } else {
96 return 99000;
97 }
98 }
99
100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0)
102 return 33000;
103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000);
105}
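The decode above maps the top three bits of the CPU Clock Speed Register to a power of two in MHz, with 33, 66 and 99 MHz special-cased. A standalone sketch of the same decode, fed a few sample register values (the sample values are illustrative, not read from hardware):

#include <stdio.h>

/* same decode as elanfreq_get_cpu_frequency(), minus the port I/O */
static unsigned int decode(unsigned char reg)
{
	if ((reg & 0xE0) == 0xE0)
		return 0;
	if ((reg & 0xE0) == 0xC0)		/* Hyperspeed Mode */
		return (reg & 0x01) ? 99000 : 66000;
	if ((reg & 0xE0) == 0xA0)		/* 33 MHz is not 32 MHz... */
		return 33000;
	return (1 << ((reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
	unsigned char samples[] = { 0x00, 0x60, 0xA0, 0xC0, 0xC1 };
	unsigned int i;

	/* prints 1000, 8000, 33000, 66000 and 99000 kHz in turn */
	for (i = 0; i < sizeof(samples); i++)
		printf("reg 0x%02x -> %u kHz\n", samples[i], decode(samples[i]));
	return 0;
}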
106
107
108/**
109 * elanfreq_set_cpu_state: Change the CPU core frequency
110 * @state: index into the elan_multiplier table
112 *
113 * This function takes an index into the elan_multiplier table and
114 * changes the CPU frequency accordingly. Note that the state has to
115 * be checked by elanfreq_verify() for correctness!
116 *
117 * There is no return value.
118 */
119
120static void elanfreq_set_cpu_state (unsigned int state) {
121
122 struct cpufreq_freqs freqs;
123
124 freqs.old = elanfreq_get_cpu_frequency(0);
125 freqs.new = elan_multiplier[state].clock;
126 freqs.cpu = 0; /* elanfreq.c is UP only driver */
127
128 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
129
130 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",elan_multiplier[state].clock);
131
132
133 /*
134 * Access to the Elan's internal registers is indexed via
135 * 0x22: Chip Setup & Control Register Index Register (CSCI)
136 * 0x23: Chip Setup & Control Register Data Register (CSCD)
137 *
138 */
139
140 /*
141 * 0x40 is the Power Management Unit's Force Mode Register.
142	 * Bit 6 enables Hyperspeed Mode (66/99 MHz core frequency)
143 */
144
145 local_irq_disable();
146 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */
147 outb_p(0x00,REG_CSCDR);
148 local_irq_enable(); /* wait till internal pipelines and */
149 udelay(1000); /* buffers have cleaned up */
150
151 local_irq_disable();
152
153 /* now, set the CPU clock speed register (0x80) */
154 outb_p(0x80,REG_CSCIR);
155 outb_p(elan_multiplier[state].val80h,REG_CSCDR);
156
157 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
158 outb_p(0x40,REG_CSCIR);
159 outb_p(elan_multiplier[state].val40h,REG_CSCDR);
160 udelay(10000);
161 local_irq_enable();
162
163 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
164}
165
166
167/**
168 * elanfreq_verify: test if frequency range is valid
169 * @policy: the policy to validate
170 *
171 * This function checks if a given frequency range in kHz is valid
172 * for the hardware supported by the driver.
173 */
174
175static int elanfreq_verify (struct cpufreq_policy *policy)
176{
177 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
178}
179
180static int elanfreq_target (struct cpufreq_policy *policy,
181 unsigned int target_freq,
182 unsigned int relation)
183{
184 unsigned int newstate = 0;
185
186 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], target_freq, relation, &newstate))
187 return -EINVAL;
188
189 elanfreq_set_cpu_state(newstate);
190
191 return 0;
192}
193
194
195/*
196 * Module init and exit code
197 */
198
199static int elanfreq_cpu_init(struct cpufreq_policy *policy)
200{
201 struct cpuinfo_x86 *c = cpu_data;
202 unsigned int i;
203 int result;
204
205 /* capability check */
206 if ((c->x86_vendor != X86_VENDOR_AMD) ||
207 (c->x86 != 4) || (c->x86_model!=10))
208 return -ENODEV;
209
210 /* max freq */
211 if (!max_freq)
212 max_freq = elanfreq_get_cpu_frequency(0);
213
214 /* table init */
215 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
216 if (elanfreq_table[i].frequency > max_freq)
217 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
218 }
219
220 /* cpuinfo and default policy values */
221 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
222 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
223 policy->cur = elanfreq_get_cpu_frequency(0);
224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result)
227 return (result);
228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230
231 return 0;
232}
233
234
235static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
236{
237 cpufreq_frequency_table_put_attr(policy->cpu);
238 return 0;
239}
240
241
242#ifndef MODULE
243/**
244 * elanfreq_setup - elanfreq command line parameter parsing
245 *
246 * elanfreq command line parameter. Use:
247 * elanfreq=66000
248 * to set the maximum CPU frequency to 66 MHz. Note that in
249 * case you do not give this boot parameter, the maximum
250 * frequency will fall back to _current_ CPU frequency which
251 * might be lower. If you build this as a module, use the
252 * max_freq module parameter instead.
253 */
254static int __init elanfreq_setup(char *str)
255{
256 max_freq = simple_strtoul(str, &str, 0);
257 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
258 return 1;
259}
260__setup("elanfreq=", elanfreq_setup);
261#endif
262
263
264static struct freq_attr* elanfreq_attr[] = {
265 &cpufreq_freq_attr_scaling_available_freqs,
266 NULL,
267};
268
269
270static struct cpufreq_driver elanfreq_driver = {
271 .get = elanfreq_get_cpu_frequency,
272 .verify = elanfreq_verify,
273 .target = elanfreq_target,
274 .init = elanfreq_cpu_init,
275 .exit = elanfreq_cpu_exit,
276 .name = "elanfreq",
277 .owner = THIS_MODULE,
278 .attr = elanfreq_attr,
279};
280
281
282static int __init elanfreq_init(void)
283{
284 struct cpuinfo_x86 *c = cpu_data;
285
286 /* Test if we have the right hardware */
287 if ((c->x86_vendor != X86_VENDOR_AMD) ||
288 (c->x86 != 4) || (c->x86_model!=10))
289 {
290 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
291 return -ENODEV;
292 }
293
294 return cpufreq_register_driver(&elanfreq_driver);
295}
296
297
298static void __exit elanfreq_exit(void)
299{
300 cpufreq_unregister_driver(&elanfreq_driver);
301}
302
303
304module_param (max_freq, int, 0444);
305
306MODULE_LICENSE("GPL");
307MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
308MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
309
310module_init(elanfreq_init);
311module_exit(elanfreq_exit);
312
diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
new file mode 100644
index 000000000000..1a49adb1f4a6
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
@@ -0,0 +1,502 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
15 *  Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
19 * CPU frequency control on the NatSemi Geode GX1/GXLV processor and CS55x0
20 * is based on Suspend Modulation.
21 *
22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
23 * to the CPU (GX1/GXLV) for configurable durations. When SUSP# is
24 * asserted, the CPU enters an idle state. GX1 stops its core clock when
25 * SUSP# is asserted, so power consumption is reduced.
26 *
27 * Suspend Modulation's OFF/ON durations are configurable
28 * with 'Suspend Modulation OFF Count Register'
29 * and 'Suspend Modulation ON Count Register'.
30 * These registers are 8-bit counters that represent the number of
31 * 32us intervals for which the SUSP# pin is asserted (ON)/de-asserted (OFF)
32 * to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
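To make the duty-cycle formula in the header above concrete: halving a 200 MHz part with the default 255-interval window gives an OFF count of 127 and an ON count of 128, i.e. an effective speed of roughly 99.6 MHz. A standalone sketch of that calculation (the 200 MHz stock speed and 100 MHz target are example numbers only):

#include <stdio.h>

int main(void)
{
	int stock_freq = 200000;	/* full speed in kHz (example value) */
	int khz = 100000;		/* requested speed in kHz (example value) */
	int duration = 255;		/* window size in 32us intervals */

	int off = khz * duration / stock_freq;		/* SUSP# de-asserted */
	int on = duration - off;			/* SUSP# asserted */
	int eff = stock_freq * off / (off + on);	/* resulting F_eff */

	/* prints "off=127 on=128 -> 99607 kHz" */
	printf("off=%d on=%d -> %d kHz\n", off, on, eff);
	return 0;
}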
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <asm/processor.h>
83#include <asm/errno.h>
84
85/* PCI config registers, all at F0 */
86#define PCI_PMER1 0x80 /* power management enable register 1 */
87#define PCI_PMER2 0x81 /* power management enable register 2 */
88#define PCI_PMER3 0x82 /* power management enable register 3 */
89#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
90#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
91#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
92#define PCI_MODON 0x95 /* suspend modulation ON counter register */
93#define PCI_SUSCFG 0x96 /* suspend configuration register */
94
95/* PMER1 bits */
96#define GPM (1<<0) /* global power management */
97#define GIT (1<<1) /* globally enable PM device idle timers */
98#define GTR (1<<2) /* globally enable IO traps */
99#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
100#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
101
102/* SUSCFG bits */
103#define SUSMOD (1<<0) /* enable/disable suspend modulation */
104/* the bits below are supported only by the CS5530 (rev. 1.2 and later) and CS5530A */
105#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
106 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
107#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
108/* the bits below are supported only by the CS5530A */
109#define PWRSVE_ISA (1<<3) /* stop ISA clock */
110#define PWRSVE (1<<4) /* active idle */
111
112struct gxfreq_params {
113 u8 on_duration;
114 u8 off_duration;
115 u8 pci_suscfg;
116 u8 pci_pmer1;
117 u8 pci_pmer2;
118 u8 pci_rev;
119 struct pci_dev *cs55x0;
120};
121
122static struct gxfreq_params *gx_params;
123static int stock_freq;
124
125/* PCI bus clock in kHz - defaults to 30000 (30 MHz) if cpu_khz is not available */
126static int pci_busclk = 0;
127module_param (pci_busclk, int, 0444);
128
129/* maximum duration for which the cpu may be suspended
130 * (32us * MAX_DURATION). If no parameter is given, this defaults
131 * to 255.
132 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
133 * is suspended -- processing power is just 0.39% of what it used to be,
134 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
135static int max_duration = 255;
136module_param (max_duration, int, 0444);
137
138/* For the default policy, we want at least some processing power
139 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
140 */
141#define POLICY_MIN_DIV 20
142
143
144#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "gx-suspmod", msg)
145
146/**
147 * we can detect a core multiplier from dir0_lsb
148 * from GX1 datasheet p.56,
149 * MULT[3:0]:
150 * 0000 = SYSCLK multiplied by 4 (test only)
151 * 0001 = SYSCLK multiplied by 10
152 * 0010 = SYSCLK multiplied by 4
153 * 0011 = SYSCLK multiplied by 6
154 * 0100 = SYSCLK multiplied by 9
155 * 0101 = SYSCLK multiplied by 5
156 * 0110 = SYSCLK multiplied by 7
157 * 0111 = SYSCLK multiplied by 8
158 * of 33.3MHz
159 **/
160static int gx_freq_mult[16] = {
161 4, 10, 4, 6, 9, 5, 7, 8,
162 0, 0, 0, 0, 0, 0, 0, 0
163};
164
165
166/****************************************************************
167 * Low Level chipset interface *
168 ****************************************************************/
169static struct pci_device_id gx_chipset_tbl[] __initdata = {
170 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, PCI_ANY_ID, PCI_ANY_ID },
171 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, PCI_ANY_ID, PCI_ANY_ID },
172 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, PCI_ANY_ID, PCI_ANY_ID },
173 { 0, },
174};
175
176/**
177 * gx_detect_chipset:
178 *
179 **/
180static __init struct pci_dev *gx_detect_chipset(void)
181{
182 struct pci_dev *gx_pci = NULL;
183
184 /* check if CPU is a MediaGX or a Geode. */
185 if ((current_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
186 (current_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
187 dprintk("error: no MediaGX/Geode processor found!\n");
188 return NULL;
189 }
190
191 /* detect which companion chip is used */
192 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
193 if ((pci_match_device (gx_chipset_tbl, gx_pci)) != NULL) {
194 return gx_pci;
195 }
196 }
197
198 dprintk("error: no supported chipset found!\n");
199 return NULL;
200}
201
202/**
203 * gx_get_cpuspeed:
204 *
205 * Finds out the effective frequency at which the Cyrix MediaGX/NatSemi Geode CPU currently runs.
206 */
207static unsigned int gx_get_cpuspeed(unsigned int cpu)
208{
209 if ((gx_params->pci_suscfg & SUSMOD) == 0)
210 return stock_freq;
211
212 return (stock_freq * gx_params->off_duration)
213 / (gx_params->on_duration + gx_params->off_duration);
214}
215
216/**
217 * gx_validate_speed:
218 * find the closest achievable frequency to @khz and the matching on/off durations
219 *
220**/
221
222static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, u8 *off_duration)
223{
224 unsigned int i;
225 u8 tmp_on, tmp_off;
226 int old_tmp_freq = stock_freq;
227 int tmp_freq;
228
229 *off_duration=1;
230 *on_duration=0;
231
232 for (i=max_duration; i>0; i--) {
233 tmp_off = ((khz * i) / stock_freq) & 0xff;
234 tmp_on = i - tmp_off;
235 tmp_freq = (stock_freq * tmp_off) / i;
236 /* if this relation is closer to khz, use this. If it's equal,
237 * prefer it, too - lower latency */
238 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
239 *on_duration = tmp_on;
240 *off_duration = tmp_off;
241 old_tmp_freq = tmp_freq;
242 }
243 }
244
245 return old_tmp_freq;
246}
247
248
249/**
250 * gx_set_cpuspeed:
251 * set cpu speed in khz.
252 **/
253
254static void gx_set_cpuspeed(unsigned int khz)
255{
256 u8 suscfg, pmer1;
257 unsigned int new_khz;
258 unsigned long flags;
259 struct cpufreq_freqs freqs;
260
261
262 freqs.cpu = 0;
263 freqs.old = gx_get_cpuspeed(0);
264
265 new_khz = gx_validate_speed(khz, &gx_params->on_duration, &gx_params->off_duration);
266
267 freqs.new = new_khz;
268
269 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
270 local_irq_save(flags);
271
272	if (new_khz != stock_freq) {	/* the new speed being 100% of the CPU speed is a special case */
273 switch (gx_params->cs55x0->device) {
274 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
275 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
276 /* FIXME: need to test other values -- Zwane,Miura */
277 pci_write_config_byte(gx_params->cs55x0, PCI_IRQTC, 4); /* typical 2 to 4ms */
278 pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */
279 pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1);
280
281 if (gx_params->pci_rev < 0x10) { /* CS5530(rev 1.2, 1.3) */
282 suscfg = gx_params->pci_suscfg | SUSMOD;
283 } else { /* CS5530A,B.. */
284 suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE;
285 }
286 break;
287 case PCI_DEVICE_ID_CYRIX_5520:
288 case PCI_DEVICE_ID_CYRIX_5510:
289 suscfg = gx_params->pci_suscfg | SUSMOD;
290 break;
291 default:
292 local_irq_restore(flags);
293			dprintk("fatal: tried to set an unknown chipset.\n");
294 return;
295 }
296 } else {
297 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
298 gx_params->off_duration = 0;
299 gx_params->on_duration = 0;
300		dprintk("suspend modulation disabled: cpu runs at 100 percent speed.\n");
301 }
302
303 pci_write_config_byte(gx_params->cs55x0, PCI_MODOFF, gx_params->off_duration);
304 pci_write_config_byte(gx_params->cs55x0, PCI_MODON, gx_params->on_duration);
305
306 pci_write_config_byte(gx_params->cs55x0, PCI_SUSCFG, suscfg);
307 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
308
309 local_irq_restore(flags);
310
311 gx_params->pci_suscfg = suscfg;
312
313 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
314
315 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
316 gx_params->on_duration * 32, gx_params->off_duration * 32);
317 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
318}
319
320/****************************************************************
321 * High level functions *
322 ****************************************************************/
323
324/*
325 * cpufreq_gx_verify: test if frequency range is valid
326 *
327 * This function checks if a given frequency range in kHz is valid
328 * for the hardware supported by the driver.
329 */
330
331static int cpufreq_gx_verify(struct cpufreq_policy *policy)
332{
333 unsigned int tmp_freq = 0;
334 u8 tmp1, tmp2;
335
336 if (!stock_freq || !policy)
337 return -EINVAL;
338
339 policy->cpu = 0;
340 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
341
342 /* it needs to be assured that at least one supported frequency is
343 * within policy->min and policy->max. If it is not, policy->max
344	 * needs to be increased until one frequency is supported.
345 * policy->min may not be decreased, though. This way we guarantee a
346 * specific processing capacity.
347 */
348 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
349 if (tmp_freq < policy->min)
350 tmp_freq += stock_freq / max_duration;
351 policy->min = tmp_freq;
352 if (policy->min > policy->max)
353 policy->max = tmp_freq;
354 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
355 if (tmp_freq > policy->max)
356 tmp_freq -= stock_freq / max_duration;
357 policy->max = tmp_freq;
358 if (policy->max < policy->min)
359 policy->max = policy->min;
360 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
361
362 return 0;
363}
364
365/*
366 * cpufreq_gx_target:
367 *
368 */
369static int cpufreq_gx_target(struct cpufreq_policy *policy,
370 unsigned int target_freq,
371 unsigned int relation)
372{
373 u8 tmp1, tmp2;
374 unsigned int tmp_freq;
375
376 if (!stock_freq || !policy)
377 return -EINVAL;
378
379 policy->cpu = 0;
380
381 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
382 while (tmp_freq < policy->min) {
383 tmp_freq += stock_freq / max_duration;
384 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
385 }
386 while (tmp_freq > policy->max) {
387 tmp_freq -= stock_freq / max_duration;
388 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
389 }
390
391 gx_set_cpuspeed(tmp_freq);
392
393 return 0;
394}
395
396static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
397{
398 unsigned int maxfreq, curfreq;
399
400 if (!policy || policy->cpu != 0)
401 return -ENODEV;
402
403 /* determine maximum frequency */
404 if (pci_busclk) {
405 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
406 } else if (cpu_khz) {
407 maxfreq = cpu_khz;
408 } else {
409 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
410 }
411 stock_freq = maxfreq;
412 curfreq = gx_get_cpuspeed(0);
413
414	dprintk("cpu max frequency is %d kHz.\n", maxfreq);
415	dprintk("cpu current frequency is %d kHz.\n", curfreq);
416
417 /* setup basic struct for cpufreq API */
418 policy->cpu = 0;
419
420 if (max_duration < POLICY_MIN_DIV)
421 policy->min = maxfreq / max_duration;
422 else
423 policy->min = maxfreq / POLICY_MIN_DIV;
424 policy->max = maxfreq;
425 policy->cur = curfreq;
426 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
427 policy->cpuinfo.min_freq = maxfreq / max_duration;
428 policy->cpuinfo.max_freq = maxfreq;
429 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
430
431 return 0;
432}
433
434/*
435 * gx_suspmod_driver:
436 * cpufreq driver structure for MediaGX/Geode suspend modulation
437 */
438static struct cpufreq_driver gx_suspmod_driver = {
439 .get = gx_get_cpuspeed,
440 .verify = cpufreq_gx_verify,
441 .target = cpufreq_gx_target,
442 .init = cpufreq_gx_cpu_init,
443 .name = "gx-suspmod",
444 .owner = THIS_MODULE,
445};
446
447static int __init cpufreq_gx_init(void)
448{
449 int ret;
450 struct gxfreq_params *params;
451 struct pci_dev *gx_pci;
452 u32 class_rev;
453
454 /* Test if we have the right hardware */
455 if ((gx_pci = gx_detect_chipset()) == NULL)
456 return -ENODEV;
457
458 /* check whether module parameters are sane */
459 if (max_duration > 0xff)
460 max_duration = 0xff;
461
462 dprintk("geode suspend modulation available.\n");
463
464 params = kmalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
465 if (params == NULL)
466 return -ENOMEM;
467 memset(params, 0, sizeof(struct gxfreq_params));
468
469 params->cs55x0 = gx_pci;
470 gx_params = params;
471
472 /* keep cs55x0 configurations */
473 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
474 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
475 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
476 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
477 pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration));
478 pci_read_config_dword(params->cs55x0, PCI_CLASS_REVISION, &class_rev);
479	params->pci_rev = class_rev & 0xff;	/* low byte is the chip revision ID */
480
481 if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) {
482 kfree(params);
483 return ret; /* register error! */
484 }
485
486 return 0;
487}
488
489static void __exit cpufreq_gx_exit(void)
490{
491 cpufreq_unregister_driver(&gx_suspmod_driver);
492 pci_dev_put(gx_params->cs55x0);
493 kfree(gx_params);
494}
495
496MODULE_AUTHOR ("Hiroshi Miura <miura@da-cha.org>");
497MODULE_DESCRIPTION ("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
498MODULE_LICENSE ("GPL");
499
500module_init(cpufreq_gx_init);
501module_exit(cpufreq_gx_exit);
502
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
new file mode 100644
index 000000000000..ab0f9f5aac11
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -0,0 +1,658 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
8 * VIA have currently 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is the same as v1, but adds voltage scaling.
12 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
13 * Voltage scaling support has currently been disabled in this driver
14 * until we have code that gets it right.
15 * Version 3 of longhaul got renamed to Powersaver and redesigned
16 * to use the POWERSAVER MSR at 0x110a.
17 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
18 * It's pretty much the same, feature-wise, as longhaul v2, though
19 * there is provision for scaling the FSB too; this doesn't work
20 * too well in practice, so we don't even try to use it.
21 *
22 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
23 */
24
25#include <linux/kernel.h>
26#include <linux/module.h>
27#include <linux/moduleparam.h>
28#include <linux/init.h>
29#include <linux/cpufreq.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32
33#include <asm/msr.h>
34#include <asm/timex.h>
35#include <asm/io.h>
36
37#include "longhaul.h"
38
39#define PFX "longhaul: "
40
41#define TYPE_LONGHAUL_V1 1
42#define TYPE_LONGHAUL_V2 2
43#define TYPE_POWERSAVER 3
44
45#define CPU_SAMUEL 1
46#define CPU_SAMUEL2 2
47#define CPU_EZRA 3
48#define CPU_EZRA_T 4
49#define CPU_NEHEMIAH 5
50
51static int cpu_model;
52static unsigned int numscales=16, numvscales;
53static unsigned int fsb;
54static int minvid, maxvid;
55static unsigned int minmult, maxmult;
56static int can_scale_voltage;
57static int vrmrev;
58
59/* Module parameters */
60static int dont_scale_voltage;
61
62
63#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
64
65
66#define __hlt() __asm__ __volatile__("hlt": : :"memory")
67
68/* Clock ratios multiplied by 10 */
69static int clock_ratio[32];
70static int eblcr_table[32];
71static int voltage_table[32];
72static unsigned int highest_speed, lowest_speed; /* kHz */
73static int longhaul_version;
74static struct cpufreq_frequency_table *longhaul_table;
75
76#ifdef CONFIG_CPU_FREQ_DEBUG
77static char speedbuffer[8];
78
79static char *print_speed(int speed)
80{
81 if (speed > 1000) {
82 if (speed%1000 == 0)
83 sprintf (speedbuffer, "%dGHz", speed/1000);
84 else
85 sprintf (speedbuffer, "%d.%dGHz", speed/1000, (speed%1000)/100);
86 } else
87 sprintf (speedbuffer, "%dMHz", speed);
88
89 return speedbuffer;
90}
91#endif
92
93
94static unsigned int calc_speed(int mult)
95{
96 int khz;
97 khz = (mult/10)*fsb;
98 if (mult%10)
99 khz += fsb/2;
100 khz *= 1000;
101 return khz;
102}
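calc_speed() stays in integer math: the whole multiplier steps contribute (mult/10)*fsb MHz and a half step adds fsb/2, so a 13.5x multiplier on a 133 MHz bus comes out as 1,795,000 kHz. A quick standalone sketch (the multiplier and FSB are example values):

#include <stdio.h>

/* same math as calc_speed() above, with fsb passed in explicitly */
static unsigned int calc_speed(int mult, int fsb)
{
	int khz = (mult / 10) * fsb;

	if (mult % 10)
		khz += fsb / 2;
	return khz * 1000;
}

int main(void)
{
	/* 13.5x on a 133 MHz FSB; prints "1795000 kHz" */
	printf("%u kHz\n", calc_speed(135, 133));
	return 0;
}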
103
104
105static int longhaul_get_cpu_mult(void)
106{
107 unsigned long invalue=0,lo, hi;
108
109 rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi);
110 invalue = (lo & (1<<22|1<<23|1<<24|1<<25)) >>22;
111 if (longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) {
112 if (lo & (1<<27))
113 invalue+=16;
114 }
115 return eblcr_table[invalue];
116}
117
118
119static void do_powersaver(union msr_longhaul *longhaul,
120 unsigned int clock_ratio_index)
121{
122 int version;
123
124 switch (cpu_model) {
125 case CPU_EZRA_T:
126 version = 3;
127 break;
128 case CPU_NEHEMIAH:
129 version = 0xf;
130 break;
131 default:
132 return;
133 }
134
135 rdmsrl(MSR_VIA_LONGHAUL, longhaul->val);
136 longhaul->bits.SoftBusRatio = clock_ratio_index & 0xf;
137 longhaul->bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
138 longhaul->bits.EnableSoftBusRatio = 1;
139 longhaul->bits.RevisionKey = 0;
140 local_irq_disable();
141 wrmsrl(MSR_VIA_LONGHAUL, longhaul->val);
142 local_irq_enable();
143 __hlt();
144
145 rdmsrl(MSR_VIA_LONGHAUL, longhaul->val);
146 longhaul->bits.EnableSoftBusRatio = 0;
147 longhaul->bits.RevisionKey = version;
148 local_irq_disable();
149 wrmsrl(MSR_VIA_LONGHAUL, longhaul->val);
150 local_irq_enable();
151}
152
153/**
154 * longhaul_setstate()
155 * @clock_ratio_index : bitpattern of the new multiplier.
156 *
157 * Sets a new clock ratio.
158 */
159
160static void longhaul_setstate(unsigned int clock_ratio_index)
161{
162 int speed, mult;
163 struct cpufreq_freqs freqs;
164 union msr_longhaul longhaul;
165 union msr_bcr2 bcr2;
166 static unsigned int old_ratio=-1;
167
168 if (old_ratio == clock_ratio_index)
169 return;
170 old_ratio = clock_ratio_index;
171
172 mult = clock_ratio[clock_ratio_index];
173 if (mult == -1)
174 return;
175
176 speed = calc_speed(mult);
177 if ((speed > highest_speed) || (speed < lowest_speed))
178 return;
179
180 freqs.old = calc_speed(longhaul_get_cpu_mult());
181 freqs.new = speed;
182 freqs.cpu = 0; /* longhaul.c is UP only driver */
183
184 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
185
186 dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
187 fsb, mult/10, mult%10, print_speed(speed/1000));
188
189 switch (longhaul_version) {
190
191 /*
192 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
193 * Software controlled multipliers only.
194 *
195 * *NB* Until we get voltage scaling working v1 & v2 are the same code.
196 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5b] and Ezra [C5C]
197 */
198 case TYPE_LONGHAUL_V1:
199 case TYPE_LONGHAUL_V2:
200 rdmsrl (MSR_VIA_BCR2, bcr2.val);
201 /* Enable software clock multiplier */
202 bcr2.bits.ESOFTBF = 1;
203 bcr2.bits.CLOCKMUL = clock_ratio_index;
204 local_irq_disable();
205 wrmsrl (MSR_VIA_BCR2, bcr2.val);
206 local_irq_enable();
207
208 __hlt();
209
210 /* Disable software clock multiplier */
211 rdmsrl (MSR_VIA_BCR2, bcr2.val);
212 bcr2.bits.ESOFTBF = 0;
213 local_irq_disable();
214 wrmsrl (MSR_VIA_BCR2, bcr2.val);
215 local_irq_enable();
216 break;
217
218 /*
219 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
220 * We can scale voltage with this too, but that's currently
221 * disabled until we come up with a decent 'match freq to voltage'
222 * algorithm.
223 * When we add voltage scaling, we will also need to do the
224 * voltage/freq setting in order depending on the direction
225 * of scaling (like we do in powernow-k7.c)
226 * Nehemiah can do FSB scaling too, but this has never been proven
227 * to work in practice.
228 */
229 case TYPE_POWERSAVER:
230 do_powersaver(&longhaul, clock_ratio_index);
231 break;
232 }
233
234 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
235}
236
237/*
238 * Centaur decided to make life a little more tricky.
239 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
240 * Samuel2 and above have to try and guess what the FSB is.
241 * We do this by assuming we booted at the maximum multiplier, and interpolating
242 * between that value multiplied by possible FSBs and cpu_mhz which
243 * was calculated at boot time. Really ugly, but no other way to do this.
244 */
245
246#define ROUNDING 0xf
247
248static int _guess(int guess)
249{
250 int target;
251
252 target = ((maxmult/10)*guess);
253 if (maxmult%10 != 0)
254 target += (guess/2);
255 target += ROUNDING/2;
256 target &= ~ROUNDING;
257 return target;
258}
259
260
261static int guess_fsb(void)
262{
263 int speed = (cpu_khz/1000);
264 int i;
265 int speeds[3] = { 66, 100, 133 };
266
267 speed += ROUNDING/2;
268 speed &= ~ROUNDING;
269
270 for (i=0; i<3; i++) {
271 if (_guess(speeds[i]) == speed)
272 return speeds[i];
273 }
274 return 0;
275}
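As a concrete case of the guessing scheme described above: a part that booted at 6.0x with cpu_khz around 800000 rounds to 800 on the 16 MHz grid, and only the 133 MHz candidate (6 * 133 = 798, which also rounds to 800) matches, so 133 is returned. A standalone sketch of the same interpolation (the 6.0x multiplier and 800 MHz speed are example values):

#include <stdio.h>

#define ROUNDING 0xf

/* mirrors _guess() above */
static int guess(int maxmult, int fsb_guess)
{
	int target = (maxmult / 10) * fsb_guess;

	if (maxmult % 10)
		target += fsb_guess / 2;
	target += ROUNDING / 2;
	target &= ~ROUNDING;
	return target;
}

int main(void)
{
	int maxmult = 60;		/* booted at 6.0x (example value) */
	int speed = 800000 / 1000;	/* cpu_khz of ~800 MHz (example value) */
	int speeds[3] = { 66, 100, 133 };
	int i;

	speed += ROUNDING / 2;
	speed &= ~ROUNDING;

	for (i = 0; i < 3; i++)
		if (guess(maxmult, speeds[i]) == speed)
			printf("guessed FSB: %d MHz\n", speeds[i]);	/* prints 133 */
	return 0;
}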
276
277
278static int __init longhaul_get_ranges(void)
279{
280 unsigned long invalue;
281 unsigned int multipliers[32]= {
282 50,30,40,100,55,35,45,95,90,70,80,60,120,75,85,65,
283 -1,110,120,-1,135,115,125,105,130,150,160,140,-1,155,-1,145 };
284 unsigned int j, k = 0;
285 union msr_longhaul longhaul;
286 unsigned long lo, hi;
287 unsigned int eblcr_fsb_table_v1[] = { 66, 133, 100, -1 };
288 unsigned int eblcr_fsb_table_v2[] = { 133, 100, -1, 66 };
289
290 switch (longhaul_version) {
291 case TYPE_LONGHAUL_V1:
292 case TYPE_LONGHAUL_V2:
293 /* Ugh, Longhaul v1 didn't have the min/max MSRs.
294 Assume min=3.0x & max = whatever we booted at. */
295 minmult = 30;
296 maxmult = longhaul_get_cpu_mult();
297 rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi);
298 invalue = (lo & (1<<18|1<<19)) >>18;
299 if (cpu_model==CPU_SAMUEL || cpu_model==CPU_SAMUEL2)
300 fsb = eblcr_fsb_table_v1[invalue];
301 else
302 fsb = guess_fsb();
303 break;
304
305 case TYPE_POWERSAVER:
306 /* Ezra-T */
307 if (cpu_model==CPU_EZRA_T) {
308 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
309 invalue = longhaul.bits.MaxMHzBR;
310 if (longhaul.bits.MaxMHzBR4)
311 invalue += 16;
312 maxmult=multipliers[invalue];
313
314 invalue = longhaul.bits.MinMHzBR;
315 if (longhaul.bits.MinMHzBR4 == 1)
316 minmult = 30;
317 else
318 minmult = multipliers[invalue];
319 fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB];
320 break;
321 }
322
323 /* Nehemiah */
324 if (cpu_model==CPU_NEHEMIAH) {
325 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
326
327 /*
328 * TODO: This code works, but raises a lot of questions.
329	 *   - Some Nehemiahs seem to have broken Min/MaxMHzBR's.
330	 *     We get around this by using a hardcoded multiplier of 4.0x
331	 *     for the minimum speed, and the speed we booted up at for the max.
332	 *     This is done in longhaul_get_cpu_mult() by reading the EBLCR register.
333	 *   - According to some VIA documentation, EBLCR is only present
334	 *     in pre-Nehemiah C3s. How this still works is a mystery.
335	 *     We're possibly using something undocumented and unsupported,
336	 *     but it works, so we don't grumble.
337 */
338 minmult=40;
339 maxmult=longhaul_get_cpu_mult();
340
341			/* Starting with the 1.2GHz parts, there's a 200MHz bus. */
342 if ((cpu_khz/1000) > 1200)
343 fsb = 200;
344 else
345 fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB];
346 break;
347 }
348 }
349
350 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
351 minmult/10, minmult%10, maxmult/10, maxmult%10);
352
353 if (fsb == -1) {
354 printk (KERN_INFO PFX "Invalid (reserved) FSB!\n");
355 return -EINVAL;
356 }
357
358 highest_speed = calc_speed(maxmult);
359 lowest_speed = calc_speed(minmult);
360 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
361 print_speed(lowest_speed/1000),
362 print_speed(highest_speed/1000));
363
364 if (lowest_speed == highest_speed) {
365		printk (KERN_INFO PFX "highest speed == lowest speed, aborting.\n");
366 return -EINVAL;
367 }
368 if (lowest_speed > highest_speed) {
369		printk (KERN_INFO PFX "nonsense! lowest speed (%d) is greater than highest (%d), aborting!\n",
370 lowest_speed, highest_speed);
371 return -EINVAL;
372 }
373
374 longhaul_table = kmalloc((numscales + 1) * sizeof(struct cpufreq_frequency_table), GFP_KERNEL);
375 if(!longhaul_table)
376 return -ENOMEM;
377
378 for (j=0; j < numscales; j++) {
379 unsigned int ratio;
380 ratio = clock_ratio[j];
381 if (ratio == -1)
382 continue;
383 if (ratio > maxmult || ratio < minmult)
384 continue;
385 longhaul_table[k].frequency = calc_speed(ratio);
386 longhaul_table[k].index = j;
387 k++;
388 }
389
390 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
391 if (!k) {
392 kfree (longhaul_table);
393 return -EINVAL;
394 }
395
396 return 0;
397}
398
399
400static void __init longhaul_setup_voltagescaling(void)
401{
402 union msr_longhaul longhaul;
403
404 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
405
406 if (!(longhaul.bits.RevisionID & 1))
407 return;
408
409 minvid = longhaul.bits.MinimumVID;
410 maxvid = longhaul.bits.MaximumVID;
411 vrmrev = longhaul.bits.VRMRev;
412
413 if (minvid == 0 || maxvid == 0) {
414 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
415 "Voltage scaling disabled.\n",
416 minvid/1000, minvid%1000, maxvid/1000, maxvid%1000);
417 return;
418 }
419
420 if (minvid == maxvid) {
421 printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are "
422 "both %d.%03d. Voltage scaling disabled\n",
423 maxvid/1000, maxvid%1000);
424 return;
425 }
426
427 if (vrmrev==0) {
428 dprintk ("VRM 8.5 \n");
429 memcpy (voltage_table, vrm85scales, sizeof(voltage_table));
430 numvscales = (voltage_table[maxvid]-voltage_table[minvid])/25;
431 } else {
432 dprintk ("Mobile VRM \n");
433 memcpy (voltage_table, mobilevrmscales, sizeof(voltage_table));
434 numvscales = (voltage_table[maxvid]-voltage_table[minvid])/5;
435 }
436
437 /* Current voltage isn't readable at first, so we need to
438 set it to a known value. The spec says to use maxvid */
439 longhaul.bits.RevisionKey = longhaul.bits.RevisionID; /* FIXME: This is bad. */
440 longhaul.bits.EnableSoftVID = 1;
441 longhaul.bits.SoftVID = maxvid;
442 wrmsrl (MSR_VIA_LONGHAUL, longhaul.val);
443
444 minvid = voltage_table[minvid];
445 maxvid = voltage_table[maxvid];
446
447 dprintk ("Min VID=%d.%03d Max VID=%d.%03d, %d possible voltage scales\n",
448 maxvid/1000, maxvid%1000, minvid/1000, minvid%1000, numvscales);
449
450 can_scale_voltage = 1;
451}
452
453
454static int longhaul_verify(struct cpufreq_policy *policy)
455{
456 return cpufreq_frequency_table_verify(policy, longhaul_table);
457}
458
459
460static int longhaul_target(struct cpufreq_policy *policy,
461 unsigned int target_freq, unsigned int relation)
462{
463 unsigned int table_index = 0;
464 unsigned int new_clock_ratio = 0;
465
466 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index))
467 return -EINVAL;
468
469 new_clock_ratio = longhaul_table[table_index].index & 0xFF;
470
471 longhaul_setstate(new_clock_ratio);
472
473 return 0;
474}
475
476
477static unsigned int longhaul_get(unsigned int cpu)
478{
479 if (cpu)
480 return 0;
481 return calc_speed(longhaul_get_cpu_mult());
482}
483
484
485static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
486{
487 struct cpuinfo_x86 *c = cpu_data;
488 char *cpuname=NULL;
489 int ret;
490
491 switch (c->x86_model) {
492 case 6:
493 cpu_model = CPU_SAMUEL;
494 cpuname = "C3 'Samuel' [C5A]";
495 longhaul_version = TYPE_LONGHAUL_V1;
496 memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio));
497 memcpy (eblcr_table, samuel1_eblcr, sizeof(samuel1_eblcr));
498 break;
499
500 case 7:
501 longhaul_version = TYPE_LONGHAUL_V1;
502 switch (c->x86_mask) {
503 case 0:
504 cpu_model = CPU_SAMUEL2;
505 cpuname = "C3 'Samuel 2' [C5B]";
506 /* Note, this is not a typo, early Samuel2's had Samuel1 ratios. */
507 memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio));
508 memcpy (eblcr_table, samuel2_eblcr, sizeof(samuel2_eblcr));
509 break;
510 case 1 ... 15:
511 if (c->x86_mask < 8) {
512 cpu_model = CPU_SAMUEL2;
513 cpuname = "C3 'Samuel 2' [C5B]";
514 } else {
515 cpu_model = CPU_EZRA;
516 cpuname = "C3 'Ezra' [C5C]";
517 }
518 memcpy (clock_ratio, ezra_clock_ratio, sizeof(ezra_clock_ratio));
519 memcpy (eblcr_table, ezra_eblcr, sizeof(ezra_eblcr));
520 break;
521 }
522 break;
523
524 case 8:
525 cpu_model = CPU_EZRA_T;
526 cpuname = "C3 'Ezra-T' [C5M]";
527 longhaul_version = TYPE_POWERSAVER;
528 numscales=32;
529 memcpy (clock_ratio, ezrat_clock_ratio, sizeof(ezrat_clock_ratio));
530 memcpy (eblcr_table, ezrat_eblcr, sizeof(ezrat_eblcr));
531 break;
532
533 case 9:
534 cpu_model = CPU_NEHEMIAH;
535 longhaul_version = TYPE_POWERSAVER;
536 numscales=32;
537 switch (c->x86_mask) {
538 case 0 ... 1:
539 cpuname = "C3 'Nehemiah A' [C5N]";
540 memcpy (clock_ratio, nehemiah_a_clock_ratio, sizeof(nehemiah_a_clock_ratio));
541 memcpy (eblcr_table, nehemiah_a_eblcr, sizeof(nehemiah_a_eblcr));
542 break;
543 case 2 ... 4:
544 cpuname = "C3 'Nehemiah B' [C5N]";
545 memcpy (clock_ratio, nehemiah_b_clock_ratio, sizeof(nehemiah_b_clock_ratio));
546 memcpy (eblcr_table, nehemiah_b_eblcr, sizeof(nehemiah_b_eblcr));
547 break;
548 case 5 ... 15:
549 cpuname = "C3 'Nehemiah C' [C5N]";
550 memcpy (clock_ratio, nehemiah_c_clock_ratio, sizeof(nehemiah_c_clock_ratio));
551 memcpy (eblcr_table, nehemiah_c_eblcr, sizeof(nehemiah_c_eblcr));
552 break;
553 }
554 break;
555
556 default:
557 cpuname = "Unknown";
558 break;
559 }
560
561 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
562 switch (longhaul_version) {
563 case TYPE_LONGHAUL_V1:
564 case TYPE_LONGHAUL_V2:
565 printk ("Longhaul v%d supported.\n", longhaul_version);
566 break;
567 case TYPE_POWERSAVER:
568 printk ("Powersaver supported.\n");
569 break;
570 };
571
572 ret = longhaul_get_ranges();
573 if (ret != 0)
574 return ret;
575
576 if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) &&
577 (dont_scale_voltage==0))
578 longhaul_setup_voltagescaling();
579
580 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
581 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
582 policy->cur = calc_speed(longhaul_get_cpu_mult());
583
584 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
585 if (ret)
586 return ret;
587
588 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
589
590 return 0;
591}
592
593static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
594{
595 cpufreq_frequency_table_put_attr(policy->cpu);
596 return 0;
597}
598
599static struct freq_attr* longhaul_attr[] = {
600 &cpufreq_freq_attr_scaling_available_freqs,
601 NULL,
602};
603
604static struct cpufreq_driver longhaul_driver = {
605 .verify = longhaul_verify,
606 .target = longhaul_target,
607 .get = longhaul_get,
608 .init = longhaul_cpu_init,
609 .exit = __devexit_p(longhaul_cpu_exit),
610 .name = "longhaul",
611 .owner = THIS_MODULE,
612 .attr = longhaul_attr,
613};
614
615
616static int __init longhaul_init(void)
617{
618 struct cpuinfo_x86 *c = cpu_data;
619
620 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
621 return -ENODEV;
622
623 switch (c->x86_model) {
624 case 6 ... 9:
625 return cpufreq_register_driver(&longhaul_driver);
626 default:
627 printk (KERN_INFO PFX "Unknown VIA CPU. Contact davej@codemonkey.org.uk\n");
628 }
629
630 return -ENODEV;
631}
632
633
634static void __exit longhaul_exit(void)
635{
636 int i=0;
637
638 for (i=0; i < numscales; i++) {
639 if (clock_ratio[i] == maxmult) {
640 longhaul_setstate(i);
641 break;
642 }
643 }
644
645 cpufreq_unregister_driver(&longhaul_driver);
646 kfree(longhaul_table);
647}
648
649module_param (dont_scale_voltage, int, 0644);
650MODULE_PARM_DESC(dont_scale_voltage, "Don't scale voltage of processor");
651
652MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
653MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
654MODULE_LICENSE ("GPL");
655
656module_init(longhaul_init);
657module_exit(longhaul_exit);
658
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h
new file mode 100644
index 000000000000..2a495c162ec7
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h
@@ -0,0 +1,466 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12		unsigned Reserved:19,	// 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
28			Reserved:3,	// 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr ones specify the ratio read from the CPU.
53 * The clock_ratio ones specify what to write to the CPU.
54 */
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static int __initdata samuel1_clock_ratio[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static int __initdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static int __initdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static int __initdata ezra_clock_ratio[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static int __initdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static int __initdata ezrat_clock_ratio[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 120, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static int __initdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah */
237
238static int __initdata nehemiah_a_clock_ratio[32] = {
239 100, /* 0000 -> 10.0x */
240 160, /* 0001 -> 16.0x */
241 -1, /* 0010 -> RESERVED */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 -1, /* 0110 -> RESERVED */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 100, /* 0000 -> 10.0x */
256 -1, /* 0001 -> RESERVED */
257 120, /* 0010 -> 12.0x */
258 90, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 120, /* 1111 -> 12.0x */
271};
272
273static int __initdata nehemiah_b_clock_ratio[32] = {
274 100, /* 0000 -> 10.0x */
275 160, /* 0001 -> 16.0x */
276 -1, /* 0010 -> RESERVED */
277 90, /* 0011 -> 9.0x */
278 95, /* 0100 -> 9.5x */
279 -1, /* 0101 -> RESERVED */
280 -1, /* 0110 -> RESERVED */
281 55, /* 0111 -> 5.5x */
282 60, /* 1000 -> 6.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 50, /* 1011 -> 5.0x */
286 65, /* 1100 -> 6.5x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 120, /* 1111 -> 12.0x */
290 100, /* 0000 -> 10.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 90, /* 0011 -> 9.0x */
294 105, /* 0100 -> 10.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 135, /* 0111 -> 13.5x */
298 140, /* 1000 -> 14.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 130, /* 1011 -> 13.0x */
302 145, /* 1100 -> 14.5x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 120, /* 1111 -> 12.0x */
306};
307
308static int __initdata nehemiah_c_clock_ratio[32] = {
309 100, /* 0000 -> 10.0x */
310 160, /* 0001 -> 16.0x */
311	40,  /* 0010 -> 4.0x */
312 90, /* 0011 -> 9.0x */
313 95, /* 0100 -> 9.5x */
314 -1, /* 0101 -> RESERVED */
315	45,  /* 0110 -> 4.5x */
316 55, /* 0111 -> 5.5x */
317 60, /* 1000 -> 6.0x */
318 70, /* 1001 -> 7.0x */
319 80, /* 1010 -> 8.0x */
320 50, /* 1011 -> 5.0x */
321 65, /* 1100 -> 6.5x */
322 75, /* 1101 -> 7.5x */
323 85, /* 1110 -> 8.5x */
324 120, /* 1111 -> 12.0x */
325 100, /* 0000 -> 10.0x */
326 110, /* 0001 -> 11.0x */
327 120, /* 0010 -> 12.0x */
328 90, /* 0011 -> 9.0x */
329 105, /* 0100 -> 10.5x */
330 115, /* 0101 -> 11.5x */
331 125, /* 0110 -> 12.5x */
332 135, /* 0111 -> 13.5x */
333 140, /* 1000 -> 14.0x */
334 150, /* 1001 -> 15.0x */
335 160, /* 1010 -> 16.0x */
336 130, /* 1011 -> 13.0x */
337 145, /* 1100 -> 14.5x */
338 155, /* 1101 -> 15.5x */
339 -1, /* 1110 -> RESERVED (13.0x) */
340 120, /* 1111 -> 12.0x */
341};
342
343static int __initdata nehemiah_a_eblcr[32] = {
344 50, /* 0000 -> 5.0x */
345 160, /* 0001 -> 16.0x */
346 -1, /* 0010 -> RESERVED */
347 100, /* 0011 -> 10.0x */
348 55, /* 0100 -> 5.5x */
349 -1, /* 0101 -> RESERVED */
350 -1, /* 0110 -> RESERVED */
351 95, /* 0111 -> 9.5x */
352 90, /* 1000 -> 9.0x */
353 70, /* 1001 -> 7.0x */
354 80, /* 1010 -> 8.0x */
355 60, /* 1011 -> 6.0x */
356 120, /* 1100 -> 12.0x */
357 75, /* 1101 -> 7.5x */
358 85, /* 1110 -> 8.5x */
359 65, /* 1111 -> 6.5x */
360 90, /* 0000 -> 9.0x */
361 -1, /* 0001 -> RESERVED */
362 120, /* 0010 -> 12.0x */
363 100, /* 0011 -> 10.0x */
364 135, /* 0100 -> 13.5x */
365 115, /* 0101 -> 11.5x */
366 125, /* 0110 -> 12.5x */
367 105, /* 0111 -> 10.5x */
368 130, /* 1000 -> 13.0x */
369 150, /* 1001 -> 15.0x */
370 160, /* 1010 -> 16.0x */
371 140, /* 1011 -> 14.0x */
372 120, /* 1100 -> 12.0x */
373 155, /* 1101 -> 15.5x */
374 -1, /* 1110 -> RESERVED (13.0x) */
375 145 /* 1111 -> 14.5x */
376 /* end of table */
377};
378static int __initdata nehemiah_b_eblcr[32] = {
379 50, /* 0000 -> 5.0x */
380 160, /* 0001 -> 16.0x */
381 -1, /* 0010 -> RESERVED */
382 100, /* 0011 -> 10.0x */
383 55, /* 0100 -> 5.5x */
384 -1, /* 0101 -> RESERVED */
385 -1, /* 0110 -> RESERVED */
386 95, /* 0111 -> 9.5x */
387 90, /* 1000 -> 9.0x */
388 70, /* 1001 -> 7.0x */
389 80, /* 1010 -> 8.0x */
390 60, /* 1011 -> 6.0x */
391 120, /* 1100 -> 12.0x */
392 75, /* 1101 -> 7.5x */
393 85, /* 1110 -> 8.5x */
394 65, /* 1111 -> 6.5x */
395 90, /* 0000 -> 9.0x */
396 110, /* 0001 -> 11.0x */
397 120, /* 0010 -> 12.0x */
398 100, /* 0011 -> 10.0x */
399 135, /* 0100 -> 13.5x */
400 115, /* 0101 -> 11.5x */
401 125, /* 0110 -> 12.5x */
402 105, /* 0111 -> 10.5x */
403 130, /* 1000 -> 13.0x */
404 150, /* 1001 -> 15.0x */
405 160, /* 1010 -> 16.0x */
406 140, /* 1011 -> 14.0x */
407 120, /* 1100 -> 12.0x */
408 155, /* 1101 -> 15.5x */
409 -1, /* 1110 -> RESERVED (13.0x) */
410 145 /* 1111 -> 14.5x */
411 /* end of table */
412};
413static int __initdata nehemiah_c_eblcr[32] = {
414 50, /* 0000 -> 5.0x */
415 160, /* 0001 -> 16.0x */
416	40,  /* 0010 -> 4.0x */
417 100, /* 0011 -> 10.0x */
418 55, /* 0100 -> 5.5x */
419 -1, /* 0101 -> RESERVED */
420	45,  /* 0110 -> 4.5x */
421 95, /* 0111 -> 9.5x */
422 90, /* 1000 -> 9.0x */
423 70, /* 1001 -> 7.0x */
424 80, /* 1010 -> 8.0x */
425 60, /* 1011 -> 6.0x */
426 120, /* 1100 -> 12.0x */
427 75, /* 1101 -> 7.5x */
428 85, /* 1110 -> 8.5x */
429 65, /* 1111 -> 6.5x */
430 90, /* 0000 -> 9.0x */
431 110, /* 0001 -> 11.0x */
432 120, /* 0010 -> 12.0x */
433 100, /* 0011 -> 10.0x */
434 135, /* 0100 -> 13.5x */
435 115, /* 0101 -> 11.5x */
436 125, /* 0110 -> 12.5x */
437 105, /* 0111 -> 10.5x */
438 130, /* 1000 -> 13.0x */
439 150, /* 1001 -> 15.0x */
440 160, /* 1010 -> 16.0x */
441 140, /* 1011 -> 14.0x */
442 120, /* 1100 -> 12.0x */
443 155, /* 1101 -> 15.5x */
444 -1, /* 1110 -> RESERVED (13.0x) */
445 145 /* 1111 -> 14.5x */
446 /* end of table */
447};
448
449/*
450 * Voltage scales. Div/Mod by 1000 to get actual voltage.
451 * Which scale to use depends on the VRM type in use.
452 */
453static int __initdata vrm85scales[32] = {
454 1250, 1200, 1150, 1100, 1050, 1800, 1750, 1700,
455 1650, 1600, 1550, 1500, 1450, 1400, 1350, 1300,
456 1275, 1225, 1175, 1125, 1075, 1825, 1775, 1725,
457 1675, 1625, 1575, 1525, 1475, 1425, 1375, 1325,
458};
459
460static int __initdata mobilevrmscales[32] = {
461 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
462 1600, 1550, 1500, 1450, 1500, 1350, 1300, -1,
463 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
464 1075, 1050, 1025, 1000, 975, 950, 925, -1,
465};
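/*
 * Editorial note (illustrative, not part of the original tables): each entry
 * encodes millivolts, so vrm85scales[0] = 1250 corresponds to 1.250 V and
 * mobilevrmscales[0] = 2000 to 2.000 V; entries of -1 mark VID codes with no
 * valid voltage for that VRM type.
 */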
466
diff --git a/arch/i386/kernel/cpu/cpufreq/longrun.c b/arch/i386/kernel/cpu/cpufreq/longrun.c
new file mode 100644
index 000000000000..e3868de4dc2e
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/longrun.c
@@ -0,0 +1,326 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17#include <asm/timex.h>
18
19#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq are needed for the conversion of cpufreq kHz
25 * values into percentage values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
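/*
 * Illustrative example (editorial, assumed values): with longrun_low_freq =
 * 300000 kHz and longrun_high_freq = 600000 kHz, a current frequency of
 * 450000 kHz corresponds to
 *     performance_pctg = (450000 - 300000) / (600000 - 300000) = 50%.
 */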
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __init longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if ( longrun_high_freq <= longrun_low_freq ) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if ( longrun_high_freq <= longrun_low_freq ) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
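/*
 * Worked example for the conversion above (editorial, assumed values): with
 * longrun_low_freq = 300000 kHz and longrun_high_freq = 600000 kHz, a policy
 * of min = 400000 and max = 700000 kHz gives
 *     pctg_lo = 100000 / 3000 = 33 and pctg_hi = 400000 / 3000 = 133,
 * which is clamped to 100 before being written to the CTRL MSR.
 */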
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return (eax * 1000);
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequency
160 * @low_freq: pointer where the lowest frequency (in kHz) is stored
161 * @high_freq: pointer where the highest frequency (in kHz) is stored
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = cpu_data;
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq);
200
201 if (*low_freq > *high_freq)
202 *low_freq = *high_freq;
203 return 0;
204 }
205
206 /* set the upper border to the value determined during TSC init */
207 *high_freq = (cpu_khz / 1000);
208 *high_freq = *high_freq * 1000;
209 dprintk("high frequency is %u kHz\n", *high_freq);
210
211 /* get current borders */
212 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
213 save_lo = msr_lo & 0x0000007F;
214 save_hi = msr_hi & 0x0000007F;
215
216 /* if current perf_pctg is larger than 90%, we need to decrease the
217 * upper limit to make the calculation more accurate.
218 */
219 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
220	/* try decreasing in 10% steps, some processors react only
221	 * to certain boundary values */
222 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -=10) {
223 /* set to 0 to try_hi perf_pctg */
224 msr_lo &= 0xFFFFFF80;
225 msr_hi &= 0xFFFFFF80;
226 msr_lo |= 0;
227 msr_hi |= try_hi;
228 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
229
230 /* read out current core MHz and current perf_pctg */
231 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
232
233 /* restore values */
234 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
235 }
236 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
237
238 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
239	 * equals
240 * low_freq * ( 1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
241 *
242	 * high_freq * perf_pctg is stored temporarily into "ebx".
243 */
244 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
245
246 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
247 return -EIO;
248
249	edx = ((eax - ebx) * 100) / (100 - ecx);
250 *low_freq = edx * 1000; /* back to kHz */
251
252 dprintk("low frequency is %u kHz\n", *low_freq);
253
254 if (*low_freq > *high_freq)
255 *low_freq = *high_freq;
256
257 return 0;
258}
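/*
 * Worked example of the fallback calculation above (editorial, assumed
 * values): with cpu_khz = 600000 the upper border is 600000 kHz; if CPUID
 * 0x80860007 then reports eax = 450 MHz at ecx = 50%, ebx becomes
 * (600 * 50) / 100 = 300, so edx = ((450 - 300) * 100) / (100 - 50) = 300
 * and *low_freq = 300000 kHz.
 */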
259
260
261static int __init longrun_cpu_init(struct cpufreq_policy *policy)
262{
263 int result = 0;
264
265 /* capability check */
266 if (policy->cpu != 0)
267 return -ENODEV;
268
269 /* detect low and high frequency */
270 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
271 if (result)
272 return result;
273
274 /* cpuinfo and default policy values */
275 policy->cpuinfo.min_freq = longrun_low_freq;
276 policy->cpuinfo.max_freq = longrun_high_freq;
277 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
278 longrun_get_policy(policy);
279
280 return 0;
281}
282
283
284static struct cpufreq_driver longrun_driver = {
285 .flags = CPUFREQ_CONST_LOOPS,
286 .verify = longrun_verify_policy,
287 .setpolicy = longrun_set_policy,
288 .get = longrun_get,
289 .init = longrun_cpu_init,
290 .name = "longrun",
291 .owner = THIS_MODULE,
292};
293
294
295/**
296 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
297 *
298 * Initializes the LongRun support.
299 */
300static int __init longrun_init(void)
301{
302 struct cpuinfo_x86 *c = cpu_data;
303
304 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
305 !cpu_has(c, X86_FEATURE_LONGRUN))
306 return -ENODEV;
307
308 return cpufreq_register_driver(&longrun_driver);
309}
310
311
312/**
313 * longrun_exit - unregisters LongRun support
314 */
315static void __exit longrun_exit(void)
316{
317 cpufreq_unregister_driver(&longrun_driver);
318}
319
320
321MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
322MODULE_DESCRIPTION ("LongRun driver for Transmeta Crusoe and Efficeon processors.");
323MODULE_LICENSE ("GPL");
324
325module_init(longrun_init);
326module_exit(longrun_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
new file mode 100644
index 000000000000..aa622d52c6e5
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
@@ -0,0 +1,337 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/config.h>
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/init.h>
27#include <linux/smp.h>
28#include <linux/cpufreq.h>
29#include <linux/slab.h>
30#include <linux/cpumask.h>
31
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/timex.h>
35
36#include "speedstep-lib.h"
37
38#define PFX "p4-clockmod: "
39#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "p4-clockmod", msg)
40
41/*
42 * Duty Cycle (3 bits); note that DC_DISABLE is not specified in the
43 * Intel docs, it is just used here to mean disable
44 */
45enum {
46 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
47 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
48};
49
50#define DC_ENTRIES 8
51
52
53static int has_N44_O17_errata[NR_CPUS];
54static unsigned int stock_freq;
55static struct cpufreq_driver p4clockmod_driver;
56static unsigned int cpufreq_p4_get(unsigned int cpu);
57
58static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
59{
60 u32 l, h;
61
62 if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV))
63 return -EINVAL;
64
65 rdmsr(MSR_IA32_THERM_STATUS, l, h);
66
67 if (l & 0x01)
68 dprintk("CPU#%d currently thermal throttled\n", cpu);
69
70 if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT))
71 newstate = DC_38PT;
72
73 rdmsr(MSR_IA32_THERM_CONTROL, l, h);
74 if (newstate == DC_DISABLE) {
75 dprintk("CPU#%d disabling modulation\n", cpu);
76 wrmsr(MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
77 } else {
78 dprintk("CPU#%d setting duty cycle to %d%%\n",
79 cpu, ((125 * newstate) / 10));
80 /* bits 63 - 5 : reserved
81 * bit 4 : enable/disable
82 * bits 3-1 : duty cycle
83 * bit 0 : reserved
84 */
85 l = (l & ~14);
86 l = l | (1<<4) | ((newstate & 0x7)<<1);
87 wrmsr(MSR_IA32_THERM_CONTROL, l, h);
88 }
89
90 return 0;
91}
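/*
 * Illustrative encoding example (editorial): requesting DC_50PT (newstate = 4)
 * prints a duty cycle of (125 * 4) / 10 = 50% and programs
 * MSR_IA32_THERM_CONTROL with bit 4 set and bits 3:1 = 4, i.e. the low word
 * becomes (l & ~0xE) | (1 << 4) | (4 << 1).
 */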
92
93
94static struct cpufreq_frequency_table p4clockmod_table[] = {
95 {DC_RESV, CPUFREQ_ENTRY_INVALID},
96 {DC_DFLT, 0},
97 {DC_25PT, 0},
98 {DC_38PT, 0},
99 {DC_50PT, 0},
100 {DC_64PT, 0},
101 {DC_75PT, 0},
102 {DC_88PT, 0},
103 {DC_DISABLE, 0},
104 {DC_RESV, CPUFREQ_TABLE_END},
105};
106
107
108static int cpufreq_p4_target(struct cpufreq_policy *policy,
109 unsigned int target_freq,
110 unsigned int relation)
111{
112 unsigned int newstate = DC_RESV;
113 struct cpufreq_freqs freqs;
114 cpumask_t cpus_allowed;
115 int i;
116
117 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate))
118 return -EINVAL;
119
120 freqs.old = cpufreq_p4_get(policy->cpu);
121 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
122
123 if (freqs.new == freqs.old)
124 return 0;
125
126 /* notifiers */
127 for_each_cpu_mask(i, policy->cpus) {
128 freqs.cpu = i;
129 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
130 }
131
132 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
133 * Developer's Manual, Volume 3
134 */
135 cpus_allowed = current->cpus_allowed;
136
137 for_each_cpu_mask(i, policy->cpus) {
138 cpumask_t this_cpu = cpumask_of_cpu(i);
139
140 set_cpus_allowed(current, this_cpu);
141 BUG_ON(smp_processor_id() != i);
142
143 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
144 }
145 set_cpus_allowed(current, cpus_allowed);
146
147 /* notifiers */
148 for_each_cpu_mask(i, policy->cpus) {
149 freqs.cpu = i;
150 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
151 }
152
153 return 0;
154}
155
156
157static int cpufreq_p4_verify(struct cpufreq_policy *policy)
158{
159 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
160}
161
162
163static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
164{
165 if ((c->x86 == 0x06) && (c->x86_model == 0x09)) {
166 /* Pentium M (Banias) */
167 printk(KERN_WARNING PFX "Warning: Pentium M detected. "
168 "The speedstep_centrino module offers voltage scaling"
169	       " in addition to frequency scaling. You should use "
170 "that instead of p4-clockmod, if possible.\n");
171 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM);
172 }
173
174 if ((c->x86 == 0x06) && (c->x86_model == 0x0D)) {
175 /* Pentium M (Dothan) */
176 printk(KERN_WARNING PFX "Warning: Pentium M detected. "
177 "The speedstep_centrino module offers voltage scaling"
178	       " in addition to frequency scaling. You should use "
179 "that instead of p4-clockmod, if possible.\n");
180	/* on P-4s, the TSC runs with constant frequency independent of whether
181 * throttling is active or not. */
182 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
183 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM);
184 }
185
186 if (c->x86 != 0xF) {
187 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <linux@brodo.de>\n");
188 return 0;
189 }
190
191	/* on P-4s, the TSC runs with constant frequency independent of whether
192 * throttling is active or not. */
193 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
194
195 if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) {
196 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
197 "The speedstep-ich or acpi cpufreq modules offer "
198	       "voltage scaling in addition to frequency scaling. "
199 "You should use either one instead of p4-clockmod, "
200 "if possible.\n");
201 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M);
202 }
203
204 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4D);
205}
206
207
208
209static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
210{
211 struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
212 int cpuid = 0;
213 unsigned int i;
214
215#ifdef CONFIG_SMP
216 policy->cpus = cpu_sibling_map[policy->cpu];
217#endif
218
219 /* Errata workaround */
220 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
221 switch (cpuid) {
222 case 0x0f07:
223 case 0x0f0a:
224 case 0x0f11:
225 case 0x0f12:
226 has_N44_O17_errata[policy->cpu] = 1;
227 dprintk("has errata -- disabling low frequencies\n");
228 }
229
230 /* get max frequency */
231 stock_freq = cpufreq_p4_get_frequency(c);
232 if (!stock_freq)
233 return -EINVAL;
234
235 /* table init */
236 for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
237 if ((i<2) && (has_N44_O17_errata[policy->cpu]))
238 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
239 else
240 p4clockmod_table[i].frequency = (stock_freq * i)/8;
241 }
242 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
243
244 /* cpuinfo and default policy values */
245 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
246 policy->cpuinfo.transition_latency = 1000000; /* assumed */
247 policy->cur = stock_freq;
248
249 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
250}
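/*
 * Table example (editorial, assumed stock_freq): with stock_freq = 2400000 kHz
 * the loop above fills entries i = 1..8 with (2400000 * i) / 8 kHz, i.e.
 * 300000, 600000, ... 2400000; on CPUs with the N44/O17 errata the lowest
 * duty-cycle entry is marked CPUFREQ_ENTRY_INVALID instead.
 */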
251
252
253static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
254{
255 cpufreq_frequency_table_put_attr(policy->cpu);
256 return 0;
257}
258
259static unsigned int cpufreq_p4_get(unsigned int cpu)
260{
261 cpumask_t cpus_allowed;
262 u32 l, h;
263
264 cpus_allowed = current->cpus_allowed;
265
266 set_cpus_allowed(current, cpumask_of_cpu(cpu));
267 BUG_ON(smp_processor_id() != cpu);
268
269 rdmsr(MSR_IA32_THERM_CONTROL, l, h);
270
271 set_cpus_allowed(current, cpus_allowed);
272
273 if (l & 0x10) {
274 l = l >> 1;
275 l &= 0x7;
276 } else
277 l = DC_DISABLE;
278
279 if (l != DC_DISABLE)
280 return (stock_freq * l / 8);
281
282 return stock_freq;
283}
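/*
 * Decoding example (editorial): if the low MSR word reads 0x18, bit 4 is set,
 * so modulation is active and the duty-cycle field is (0x18 >> 1) & 0x7 = 4,
 * giving a reported frequency of stock_freq * 4 / 8, i.e. half of stock_freq.
 */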
284
285static struct freq_attr* p4clockmod_attr[] = {
286 &cpufreq_freq_attr_scaling_available_freqs,
287 NULL,
288};
289
290static struct cpufreq_driver p4clockmod_driver = {
291 .verify = cpufreq_p4_verify,
292 .target = cpufreq_p4_target,
293 .init = cpufreq_p4_cpu_init,
294 .exit = cpufreq_p4_cpu_exit,
295 .get = cpufreq_p4_get,
296 .name = "p4-clockmod",
297 .owner = THIS_MODULE,
298 .attr = p4clockmod_attr,
299};
300
301
302static int __init cpufreq_p4_init(void)
303{
304 struct cpuinfo_x86 *c = cpu_data;
305 int ret;
306
307 /*
308 * THERM_CONTROL is architectural for IA32 now, so
309 * we can rely on the capability checks
310 */
311 if (c->x86_vendor != X86_VENDOR_INTEL)
312 return -ENODEV;
313
314 if (!test_bit(X86_FEATURE_ACPI, c->x86_capability) ||
315 !test_bit(X86_FEATURE_ACC, c->x86_capability))
316 return -ENODEV;
317
318 ret = cpufreq_register_driver(&p4clockmod_driver);
319 if (!ret)
320 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock Modulation available\n");
321
322 return (ret);
323}
324
325
326static void __exit cpufreq_p4_exit(void)
327{
328 cpufreq_unregister_driver(&p4clockmod_driver);
329}
330
331
332MODULE_AUTHOR ("Zwane Mwaikambo <zwane@commfireservices.com>");
333MODULE_DESCRIPTION ("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
334MODULE_LICENSE ("GPL");
335
336late_initcall(cpufreq_p4_init);
337module_exit(cpufreq_p4_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k6.c b/arch/i386/kernel/cpu/cpufreq/powernow-k6.c
new file mode 100644
index 000000000000..222f8cfe3c57
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k6.c
@@ -0,0 +1,256 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/init.h>
13#include <linux/cpufreq.h>
14#include <linux/ioport.h>
15#include <linux/slab.h>
16
17#include <asm/msr.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20
21
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
23 as it is unused */
24
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
46 * Returns the current setting of the frequency multiplier. Core clock
47 * speed is frequency of the Front-Side Bus multiplied with this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
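/*
 * Decoding example (editorial): if the value read from the PowerNow! port has
 * bits 7:5 equal to 001b, (invalue >> 5) & 7 selects clock_ratio[1], whose
 * index of 50 stands for a 5.0x multiplier.
 */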
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state (unsigned int best_i)
71{
72 unsigned long outvalue=0, invalue=0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR "cpufreq: invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
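/*
 * Encoding example (editorial): for best_i = 1 (the 5.0x entry) the BVC word
 * becomes (1 << 12) | (1 << 10) | (1 << 9) | (1 << 5) = 0x1620, plus the low
 * four bits preserved from the current port value.
 */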
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_target - sets a new CPUFreq policy
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
124 *
125 * sets a new CPUFreq policy
126 */
127static int powernow_k6_target (struct cpufreq_policy *policy,
128 unsigned int target_freq,
129 unsigned int relation)
130{
131 unsigned int newstate = 0;
132
133 if (cpufreq_frequency_table_target(policy, &clock_ratio[0], target_freq, relation, &newstate))
134 return -EINVAL;
135
136 powernow_k6_set_state(newstate);
137
138 return 0;
139}
140
141
142static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
143{
144 unsigned int i;
145 int result;
146
147 if (policy->cpu != 0)
148 return -ENODEV;
149
150 /* get frequencies */
151 max_multiplier = powernow_k6_get_cpu_multiplier();
152 busfreq = cpu_khz / max_multiplier;
153
154 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else
159 clock_ratio[i].frequency = busfreq * clock_ratio[i].index;
160 }
161
162 /* cpuinfo and default policy values */
163 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
164 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
165 policy->cur = busfreq * max_multiplier;
166
167 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
168 if (result)
169 return (result);
170
171 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
172
173 return 0;
174}
175
176
177static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
178{
179 unsigned int i;
180 for (i=0; i<8; i++) {
181		if (clock_ratio[i].index == max_multiplier)
182 powernow_k6_set_state(i);
183 }
184 cpufreq_frequency_table_put_attr(policy->cpu);
185 return 0;
186}
187
188static unsigned int powernow_k6_get(unsigned int cpu)
189{
190 return busfreq * powernow_k6_get_cpu_multiplier();
191}
192
193static struct freq_attr* powernow_k6_attr[] = {
194 &cpufreq_freq_attr_scaling_available_freqs,
195 NULL,
196};
197
198static struct cpufreq_driver powernow_k6_driver = {
199 .verify = powernow_k6_verify,
200 .target = powernow_k6_target,
201 .init = powernow_k6_cpu_init,
202 .exit = powernow_k6_cpu_exit,
203 .get = powernow_k6_get,
204 .name = "powernow-k6",
205 .owner = THIS_MODULE,
206 .attr = powernow_k6_attr,
207};
208
209
210/**
211 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
212 *
213 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
214 * devices, -EINVAL or -ENOMEM on problems during initialization, and zero
215 * on success.
216 */
217static int __init powernow_k6_init(void)
218{
219 struct cpuinfo_x86 *c = cpu_data;
220
221 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
222 ((c->x86_model != 12) && (c->x86_model != 13)))
223 return -ENODEV;
224
225 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
226 printk("cpufreq: PowerNow IOPORT region already used.\n");
227 return -EIO;
228 }
229
230 if (cpufreq_register_driver(&powernow_k6_driver)) {
231 release_region (POWERNOW_IOPORT, 16);
232 return -EINVAL;
233 }
234
235 return 0;
236}
237
238
239/**
240 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
241 *
242 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
243 */
244static void __exit powernow_k6_exit(void)
245{
246 cpufreq_unregister_driver(&powernow_k6_driver);
247 release_region (POWERNOW_IOPORT, 16);
248}
249
250
251MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
252MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
253MODULE_LICENSE ("GPL");
254
255module_init(powernow_k6_init);
256module_exit(powernow_k6_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c
new file mode 100644
index 000000000000..913f652623d9
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c
@@ -0,0 +1,690 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5: Processor may fail to execute a FID/VID change in presence of interrupt.
10 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
11 * Errata 15: Processors with half frequency multipliers may hang upon wakeup from disconnect.
12 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
13 */
14
15#include <linux/config.h>
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/moduleparam.h>
19#include <linux/init.h>
20#include <linux/cpufreq.h>
21#include <linux/slab.h>
22#include <linux/string.h>
23#include <linux/dmi.h>
24
25#include <asm/msr.h>
26#include <asm/timex.h>
27#include <asm/io.h>
28#include <asm/system.h>
29
30#ifdef CONFIG_X86_POWERNOW_K7_ACPI
31#include <linux/acpi.h>
32#include <acpi/processor.h>
33#endif
34
35#include "powernow-k7.h"
36
37#define PFX "powernow: "
38
39
40struct psb_s {
41 u8 signature[10];
42 u8 tableversion;
43 u8 flags;
44 u16 settlingtime;
45 u8 reserved1;
46 u8 numpst;
47};
48
49struct pst_s {
50 u32 cpuid;
51 u8 fsbspeed;
52 u8 maxfid;
53 u8 startvid;
54 u8 numpstates;
55};
56
57#ifdef CONFIG_X86_POWERNOW_K7_ACPI
58union powernow_acpi_control_t {
59 struct {
60 unsigned long fid:5,
61 vid:5,
62 sgtc:20,
63 res1:2;
64 } bits;
65 unsigned long val;
66};
67#endif
68
69#ifdef CONFIG_CPU_FREQ_DEBUG
70/* divide by 1000 to get VCore voltage in V. */
71static int mobile_vid_table[32] = {
72 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
73 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
74 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
75 1075, 1050, 1025, 1000, 975, 950, 925, 0,
76};
77#endif
78
79/* divide by 10 to get FID. */
80static int fid_codes[32] = {
81 110, 115, 120, 125, 50, 55, 60, 65,
82 70, 75, 80, 85, 90, 95, 100, 105,
83 30, 190, 40, 200, 130, 135, 140, 210,
84 150, 225, 160, 165, 170, 180, -1, -1,
85};
86
87/* This parameter is used to force the ACPI method instead of the legacy one
88 * for configuration purposes.
89 */
90
91static int acpi_force;
92
93static struct cpufreq_frequency_table *powernow_table;
94
95static unsigned int can_scale_bus;
96static unsigned int can_scale_vid;
97static unsigned int minimum_speed=-1;
98static unsigned int maximum_speed;
99static unsigned int number_scales;
100static unsigned int fsb;
101static unsigned int latency;
102static char have_a0;
103
104#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k7", msg)
105
106static int check_fsb(unsigned int fsbspeed)
107{
108 int delta;
109 unsigned int f = fsb / 1000;
110
111 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
112 return (delta < 5);
113}
114
115static int check_powernow(void)
116{
117 struct cpuinfo_x86 *c = cpu_data;
118 unsigned int maxei, eax, ebx, ecx, edx;
119
120 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) {
121#ifdef MODULE
122 printk (KERN_INFO PFX "This module only works with AMD K7 CPUs\n");
123#endif
124 return 0;
125 }
126
127 /* Get maximum capabilities */
128 maxei = cpuid_eax (0x80000000);
129 if (maxei < 0x80000007) { /* Any powernow info ? */
130#ifdef MODULE
131 printk (KERN_INFO PFX "No powernow capabilities detected\n");
132#endif
133 return 0;
134 }
135
136 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
137 printk (KERN_INFO PFX "K7 660[A0] core detected, enabling errata workarounds\n");
138 have_a0 = 1;
139 }
140
141 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
142
143 /* Check we can actually do something before we say anything.*/
144 if (!(edx & (1 << 1 | 1 << 2)))
145 return 0;
146
147 printk (KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
148
149 if (edx & 1 << 1) {
150 printk ("frequency");
151 can_scale_bus=1;
152 }
153
154 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
155 printk (" and ");
156
157 if (edx & 1 << 2) {
158 printk ("voltage");
159 can_scale_vid=1;
160 }
161
162 printk (".\n");
163 return 1;
164}
165
166
167static int get_ranges (unsigned char *pst)
168{
169 unsigned int j;
170 unsigned int speed;
171 u8 fid, vid;
172
173 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL);
174 if (!powernow_table)
175 return -ENOMEM;
176 memset(powernow_table, 0, (sizeof(struct cpufreq_frequency_table) * (number_scales + 1)));
177
178 for (j=0 ; j < number_scales; j++) {
179 fid = *pst++;
180
181 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
182 powernow_table[j].index = fid; /* lower 8 bits */
183
184 speed = powernow_table[j].frequency;
185
186 if ((fid_codes[fid] % 10)==5) {
187#ifdef CONFIG_X86_POWERNOW_K7_ACPI
188 if (have_a0 == 1)
189 powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID;
190#endif
191 }
192
193 if (speed < minimum_speed)
194 minimum_speed = speed;
195 if (speed > maximum_speed)
196 maximum_speed = speed;
197
198 vid = *pst++;
199 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
200
201 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) "
202 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
203 fid_codes[fid] % 10, speed/1000, vid,
204 mobile_vid_table[vid]/1000,
205 mobile_vid_table[vid]%1000);
206 }
207 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
208 powernow_table[number_scales].index = 0;
209
210 return 0;
211}
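/*
 * Worked example (editorial, assumed values): with fsb = 133333 kHz and a PST
 * entry of fid = 0, vid = 4, fid_codes[0] = 110 (11.0x) gives a table
 * frequency of (133333 * 110) / 10 = 1466663 kHz and an index of 0x0400
 * (vid in the upper byte, fid in the lower byte).
 */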
212
213
214static void change_FID(int fid)
215{
216 union msr_fidvidctl fidvidctl;
217
218 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
219 if (fidvidctl.bits.FID != fid) {
220 fidvidctl.bits.SGTC = latency;
221 fidvidctl.bits.FID = fid;
222 fidvidctl.bits.VIDC = 0;
223 fidvidctl.bits.FIDC = 1;
224 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
225 }
226}
227
228
229static void change_VID(int vid)
230{
231 union msr_fidvidctl fidvidctl;
232
233 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
234 if (fidvidctl.bits.VID != vid) {
235 fidvidctl.bits.SGTC = latency;
236 fidvidctl.bits.VID = vid;
237 fidvidctl.bits.FIDC = 0;
238 fidvidctl.bits.VIDC = 1;
239 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
240 }
241}
242
243
244static void change_speed (unsigned int index)
245{
246 u8 fid, vid;
247 struct cpufreq_freqs freqs;
248 union msr_fidvidstatus fidvidstatus;
249 int cfid;
250
251 /* fid are the lower 8 bits of the index we stored into
252 * the cpufreq frequency table in powernow_decode_bios,
253 * vid are the upper 8 bits.
254 */
255
256 fid = powernow_table[index].index & 0xFF;
257 vid = (powernow_table[index].index & 0xFF00) >> 8;
258
259 freqs.cpu = 0;
260
261 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
262 cfid = fidvidstatus.bits.CFID;
263 freqs.old = fsb * fid_codes[cfid] / 10;
264
265 freqs.new = powernow_table[index].frequency;
266
267 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
268
269 /* Now do the magic poking into the MSRs. */
270
271 if (have_a0 == 1) /* A0 errata 5 */
272 local_irq_disable();
273
274 if (freqs.old > freqs.new) {
275 /* Going down, so change FID first */
276 change_FID(fid);
277 change_VID(vid);
278 } else {
279 /* Going up, so change VID first */
280 change_VID(vid);
281 change_FID(fid);
282 }
283
284
285 if (have_a0 == 1)
286 local_irq_enable();
287
288 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
289}
290
291
292#ifdef CONFIG_X86_POWERNOW_K7_ACPI
293
294static struct acpi_processor_performance *acpi_processor_perf;
295
296static int powernow_acpi_init(void)
297{
298 int i;
299 int retval = 0;
300 union powernow_acpi_control_t pc;
301
302 if (acpi_processor_perf != NULL && powernow_table != NULL) {
303 retval = -EINVAL;
304 goto err0;
305 }
306
307 acpi_processor_perf = kmalloc(sizeof(struct acpi_processor_performance),
308 GFP_KERNEL);
309
310 if (!acpi_processor_perf) {
311 retval = -ENOMEM;
312 goto err0;
313 }
314
315 memset(acpi_processor_perf, 0, sizeof(struct acpi_processor_performance));
316
317 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
318 retval = -EIO;
319 goto err1;
320 }
321
322 if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
323 retval = -ENODEV;
324 goto err2;
325 }
326
327 if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
328 retval = -ENODEV;
329 goto err2;
330 }
331
332 number_scales = acpi_processor_perf->state_count;
333
334 if (number_scales < 2) {
335 retval = -ENODEV;
336 goto err2;
337 }
338
339 powernow_table = kmalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL);
340 if (!powernow_table) {
341 retval = -ENOMEM;
342 goto err2;
343 }
344
345 memset(powernow_table, 0, ((number_scales + 1) * sizeof(struct cpufreq_frequency_table)));
346
347 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
348 for (i = 0; i < number_scales; i++) {
349 u8 fid, vid;
350 unsigned int speed;
351
352 pc.val = (unsigned long) acpi_processor_perf->states[i].control;
353 dprintk ("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
354 i,
355 (u32) acpi_processor_perf->states[i].core_frequency,
356 (u32) acpi_processor_perf->states[i].power,
357 (u32) acpi_processor_perf->states[i].transition_latency,
358 (u32) acpi_processor_perf->states[i].control,
359 pc.bits.sgtc);
360
361 vid = pc.bits.vid;
362 fid = pc.bits.fid;
363
364 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
365 powernow_table[i].index = fid; /* lower 8 bits */
366 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
367
368 speed = powernow_table[i].frequency;
369
370 if ((fid_codes[fid] % 10)==5) {
371 if (have_a0 == 1)
372 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
373 }
374
375 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) "
376 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
377 fid_codes[fid] % 10, speed/1000, vid,
378 mobile_vid_table[vid]/1000,
379 mobile_vid_table[vid]%1000);
380
381 if (latency < pc.bits.sgtc)
382 latency = pc.bits.sgtc;
383
384 if (speed < minimum_speed)
385 minimum_speed = speed;
386 if (speed > maximum_speed)
387 maximum_speed = speed;
388 }
389
390 powernow_table[i].frequency = CPUFREQ_TABLE_END;
391 powernow_table[i].index = 0;
392
393 /* notify BIOS that we exist */
394 acpi_processor_notify_smm(THIS_MODULE);
395
396 return 0;
397
398err2:
399 acpi_processor_unregister_performance(acpi_processor_perf, 0);
400err1:
401 kfree(acpi_processor_perf);
402err0:
403	printk(KERN_WARNING PFX "ACPI perflib cannot be used on this platform\n");
404 acpi_processor_perf = NULL;
405 return retval;
406}
407#else
408static int powernow_acpi_init(void)
409{
410	printk(KERN_INFO PFX "no ACPI processor support found."
411		" Please recompile your kernel with ACPI processor support\n");
412 return -EINVAL;
413}
414#endif
415
416static int powernow_decode_bios (int maxfid, int startvid)
417{
418 struct psb_s *psb;
419 struct pst_s *pst;
420 unsigned int i, j;
421 unsigned char *p;
422 unsigned int etuple;
423 unsigned int ret;
424
425 etuple = cpuid_eax(0x80000001);
426
427 for (i=0xC0000; i < 0xffff0 ; i+=16) {
428
429 p = phys_to_virt(i);
430
431 if (memcmp(p, "AMDK7PNOW!", 10) == 0){
432 dprintk ("Found PSB header at %p\n", p);
433 psb = (struct psb_s *) p;
434 dprintk ("Table version: 0x%x\n", psb->tableversion);
435 if (psb->tableversion != 0x12) {
436 printk (KERN_INFO PFX "Sorry, only v1.2 tables supported right now\n");
437 return -ENODEV;
438 }
439
440 dprintk ("Flags: 0x%x\n", psb->flags);
441 if ((psb->flags & 1)==0) {
442 dprintk ("Mobile voltage regulator\n");
443 } else {
444 dprintk ("Desktop voltage regulator\n");
445 }
446
447 latency = psb->settlingtime;
448 if (latency < 100) {
449 printk (KERN_INFO PFX "BIOS set settling time to %d microseconds."
450				" Should be at least 100. Correcting.\n", latency);
451 latency = 100;
452 }
453 dprintk ("Settling Time: %d microseconds.\n", psb->settlingtime);
454 dprintk ("Has %d PST tables. (Only dumping ones relevant to this CPU).\n", psb->numpst);
455
456 p += sizeof (struct psb_s);
457
458 pst = (struct pst_s *) p;
459
460 for (i = 0 ; i <psb->numpst; i++) {
461 pst = (struct pst_s *) p;
462 number_scales = pst->numpstates;
463
464 if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) &&
465 (maxfid==pst->maxfid) && (startvid==pst->startvid))
466 {
467 dprintk ("PST:%d (@%p)\n", i, pst);
468 dprintk (" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
469 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
470
471 ret = get_ranges ((char *) pst + sizeof (struct pst_s));
472 return ret;
473
474 } else {
475 p = (char *) pst + sizeof (struct pst_s);
476 for (j=0 ; j < number_scales; j++)
477 p+=2;
478 }
479 }
480 printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple);
481 printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n");
482
483 return -EINVAL;
484 }
485 p++;
486 }
487
488 return -ENODEV;
489}
490
491
492static int powernow_target (struct cpufreq_policy *policy,
493 unsigned int target_freq,
494 unsigned int relation)
495{
496 unsigned int newstate;
497
498 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, relation, &newstate))
499 return -EINVAL;
500
501 change_speed(newstate);
502
503 return 0;
504}
505
506
507static int powernow_verify (struct cpufreq_policy *policy)
508{
509 return cpufreq_frequency_table_verify(policy, powernow_table);
510}
511
512/*
513 * We use the fact that the bus frequency is somehow
514 * a multiple of 100000/3 kHz, and compute SGTC according
515 * to this multiple.
516 * That way we match more closely how AMD intends this to work,
517 * and get the same kind of behaviour already tested under
518 * the "well-known" other OS.
519 */
520static int __init fixup_sgtc(void)
521{
522 unsigned int sgtc;
523 unsigned int m;
524
525 m = fsb / 3333;
526 if ((m % 10) >= 5)
527 m += 5;
528
529 m /= 10;
530
531 sgtc = 100 * m * latency;
532 sgtc = sgtc / 3;
533 if (sgtc > 0xfffff) {
534 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
535 sgtc = 0xfffff;
536 }
537 return sgtc;
538}
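/*
 * Worked example (editorial, assumed values): for fsb = 133333 kHz and
 * latency = 100, m = 133333 / 3333 = 40, which rounds to 4 after the
 * divide by 10, so sgtc = (100 * 4 * 100) / 3 = 13333.
 */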
539
540static unsigned int powernow_get(unsigned int cpu)
541{
542 union msr_fidvidstatus fidvidstatus;
543 unsigned int cfid;
544
545 if (cpu)
546 return 0;
547 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
548 cfid = fidvidstatus.bits.CFID;
549
550 return (fsb * fid_codes[cfid] / 10);
551}
552
553
554static int __init acer_cpufreq_pst(struct dmi_system_id *d)
555{
556 printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident);
557 printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n");
558 printk(KERN_WARNING "cpufreq scaling has been disabled as a result of this.\n");
559 return 0;
560}
561
562/*
563 * Some Athlon laptops have really broken PST tables.
564 * A BIOS update is all that can save them.
565 * Mention this, and disable cpufreq.
566 */
567static struct dmi_system_id __initdata powernow_dmi_table[] = {
568 {
569 .callback = acer_cpufreq_pst,
570 .ident = "Acer Aspire",
571 .matches = {
572 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
573 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
574 },
575 },
576 { }
577};
578
579static int __init powernow_cpu_init (struct cpufreq_policy *policy)
580{
581 union msr_fidvidstatus fidvidstatus;
582 int result;
583
584 if (policy->cpu != 0)
585 return -ENODEV;
586
587 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
588
589 /* A K7 with powernow technology is set to max frequency by BIOS */
590 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.MFID];
591 if (!fsb) {
592 printk(KERN_WARNING PFX "can not determine bus frequency\n");
593 return -EINVAL;
594 }
595 dprintk("FSB: %3d.%03d MHz\n", fsb/1000, fsb%1000);
596
597 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
598 printk (KERN_INFO PFX "PSB/PST known to be broken. Trying ACPI instead\n");
599 result = powernow_acpi_init();
600 } else {
601 result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID);
602 if (result) {
603 printk (KERN_INFO PFX "Trying ACPI perflib\n");
604 maximum_speed = 0;
605 minimum_speed = -1;
606 latency = 0;
607 result = powernow_acpi_init();
608 if (result) {
609 printk (KERN_INFO PFX "ACPI and legacy methods failed\n");
610 printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.shtml\n");
611 }
612 } else {
613 /* SGTC use the bus clock as timer */
614 latency = fixup_sgtc();
615 printk(KERN_INFO PFX "SGTC: %d\n", latency);
616 }
617 }
618
619 if (result)
620 return result;
621
622 printk (KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
623 minimum_speed/1000, maximum_speed/1000);
624
625 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
626
627 policy->cpuinfo.transition_latency = cpufreq_scale(2000000UL, fsb, latency);
628
629 policy->cur = powernow_get(0);
630
631 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
632
633 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
634}
635
636static int powernow_cpu_exit (struct cpufreq_policy *policy) {
637 cpufreq_frequency_table_put_attr(policy->cpu);
638
639#ifdef CONFIG_X86_POWERNOW_K7_ACPI
640 if (acpi_processor_perf) {
641 acpi_processor_unregister_performance(acpi_processor_perf, 0);
642 kfree(acpi_processor_perf);
643 }
644#endif
645
646 if (powernow_table)
647 kfree(powernow_table);
648
649 return 0;
650}
651
652static struct freq_attr* powernow_table_attr[] = {
653 &cpufreq_freq_attr_scaling_available_freqs,
654 NULL,
655};
656
657static struct cpufreq_driver powernow_driver = {
658 .verify = powernow_verify,
659 .target = powernow_target,
660 .get = powernow_get,
661 .init = powernow_cpu_init,
662 .exit = powernow_cpu_exit,
663 .name = "powernow-k7",
664 .owner = THIS_MODULE,
665 .attr = powernow_table_attr,
666};
667
668static int __init powernow_init (void)
669{
670 if (check_powernow()==0)
671 return -ENODEV;
672 return cpufreq_register_driver(&powernow_driver);
673}
674
675
676static void __exit powernow_exit (void)
677{
678 cpufreq_unregister_driver(&powernow_driver);
679}
680
681module_param(acpi_force, int, 0444);
682MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
683
684MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
685MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
686MODULE_LICENSE ("GPL");
687
688late_initcall(powernow_init);
689module_exit(powernow_exit);
690
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.h b/arch/i386/kernel/cpu/cpufreq/powernow-k7.h
new file mode 100644
index 000000000000..f8a63b3664e3
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.h
@@ -0,0 +1,44 @@
1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * AMD-specific information
8 *
9 */
10
11union msr_fidvidctl {
12 struct {
13 unsigned FID:5, // 4:0
14 reserved1:3, // 7:5
15 VID:5, // 12:8
16 reserved2:3, // 15:13
17 FIDC:1, // 16
18 VIDC:1, // 17
19 reserved3:2, // 19:18
20 FIDCHGRATIO:1, // 20
21		reserved4:11,	// 31:21
22		SGTC:20,	// 51:32
23 reserved5:12; // 63:52
24 } bits;
25 unsigned long long val;
26};
27
28union msr_fidvidstatus {
29 struct {
30 unsigned CFID:5, // 4:0
31 reserved1:3, // 7:5
32 SFID:5, // 12:8
33 reserved2:3, // 15:13
34 MFID:5, // 20:16
35 reserved3:11, // 31:21
36 CVID:5, // 36:32
37 reserved4:3, // 39:37
38 SVID:5, // 44:40
39 reserved5:3, // 47:45
40 MVID:5, // 52:48
41 reserved6:11; // 63:53
42 } bits;
43 unsigned long long val;
44};
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
new file mode 100644
index 000000000000..a65ff7e32e5d
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -0,0 +1,1135 @@
1/*
2 * (c) 2003, 2004 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : paul.devriendt@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, and others.
18 * Processor information obtained from Chapter 9 (Power and Thermal Management)
19 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
20 * Opteron Processors" available for download from www.amd.com
21 *
22 * Tables for specific CPUs can be inferred from
23 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
24 */
25
26#include <linux/kernel.h>
27#include <linux/smp.h>
28#include <linux/module.h>
29#include <linux/init.h>
30#include <linux/cpufreq.h>
31#include <linux/slab.h>
32#include <linux/string.h>
33
34#include <asm/msr.h>
35#include <asm/io.h>
36#include <asm/delay.h>
37
38#ifdef CONFIG_X86_POWERNOW_K8_ACPI
39#include <linux/acpi.h>
40#include <acpi/processor.h>
41#endif
42
43#define PFX "powernow-k8: "
44#define BFX PFX "BIOS error: "
45#define VERSION "version 1.00.09e"
46#include "powernow-k8.h"
47
48/* serialize freq changes */
49static DECLARE_MUTEX(fidvid_sem);
50
51static struct powernow_k8_data *powernow_data[NR_CPUS];
52
53/* Return a frequency in MHz, given an input fid */
54static u32 find_freq_from_fid(u32 fid)
55{
56 return 800 + (fid * 100);
57}
58
59/* Return a frequency in KHz, given an input fid */
60static u32 find_khz_freq_from_fid(u32 fid)
61{
62 return 1000 * find_freq_from_fid(fid);
63}
64
65/* Return a voltage in millivolts, given an input vid */
66static u32 find_millivolts_from_vid(struct powernow_k8_data *data, u32 vid)
67{
68 return 1550-vid*25;
69}
70
71/* Return the vco fid for an input fid
72 *
73 * Each "low" fid has a corresponding "high" fid, and you can get to a "low" fid
74 * only from its corresponding "high" fid. This returns the "high" fid corresponding
75 * to a given "low" one.
76 */
77static u32 convert_fid_to_vco_fid(u32 fid)
78{
79 if (fid < HI_FID_TABLE_BOTTOM) {
80 return 8 + (2 * fid);
81 } else {
82 return fid;
83 }
84}
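/*
 * Illustrative values for the helpers above (editorial): fid = 4 maps to
 * 800 + 4 * 100 = 1200 MHz (1200000 kHz) and vid = 6 to 1550 - 6 * 25 =
 * 1400 mV; assuming a fid of 2 lies below HI_FID_TABLE_BOTTOM, its vco fid
 * would be 8 + 2 * 2 = 12.
 */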
85
86/*
87 * Return 1 if the pending bit is set. Unless we just instructed the processor
88 * to transition to a new state, seeing this bit set is really bad news.
89 */
90static int pending_bit_stuck(void)
91{
92 u32 lo, hi;
93
94 rdmsr(MSR_FIDVID_STATUS, lo, hi);
95 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
96}
97
98/*
99 * Update the global current fid / vid values from the status msr.
100 * Returns 1 on error.
101 */
102static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
103{
104 u32 lo, hi;
105 u32 i = 0;
106
107 lo = MSR_S_LO_CHANGE_PENDING;
108 while (lo & MSR_S_LO_CHANGE_PENDING) {
109 if (i++ > 0x1000000) {
110 printk(KERN_ERR PFX "detected change pending stuck\n");
111 return 1;
112 }
113 rdmsr(MSR_FIDVID_STATUS, lo, hi);
114 }
115
116 data->currvid = hi & MSR_S_HI_CURRENT_VID;
117 data->currfid = lo & MSR_S_LO_CURRENT_FID;
118
119 return 0;
120}
121
122/* the isochronous relief time */
123static void count_off_irt(struct powernow_k8_data *data)
124{
125 udelay((1 << data->irt) * 10);
126 return;
127}
128
129/* the voltage stabilization time */
130static void count_off_vst(struct powernow_k8_data *data)
131{
132 udelay(data->vstable * VST_UNITS_20US);
133 return;
134}
135
136/* need to init the control msr to a safe value (for each cpu) */
137static void fidvid_msr_init(void)
138{
139 u32 lo, hi;
140 u8 fid, vid;
141
142 rdmsr(MSR_FIDVID_STATUS, lo, hi);
143 vid = hi & MSR_S_HI_CURRENT_VID;
144 fid = lo & MSR_S_LO_CURRENT_FID;
145 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
146 hi = MSR_C_HI_STP_GNT_BENIGN;
147 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
148 wrmsr(MSR_FIDVID_CTL, lo, hi);
149}
150
151
152/* write the new fid value along with the other control fields to the msr */
153static int write_new_fid(struct powernow_k8_data *data, u32 fid)
154{
155 u32 lo;
156 u32 savevid = data->currvid;
157
158 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
159 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
160 return 1;
161 }
162
163 lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
164
165 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
166 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
167
168 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
169
170 if (query_current_values_with_pending_wait(data))
171 return 1;
172
173 count_off_irt(data);
174
175 if (savevid != data->currvid) {
176 printk(KERN_ERR PFX "vid change on fid trans, old 0x%x, new 0x%x\n",
177 savevid, data->currvid);
178 return 1;
179 }
180
181 if (fid != data->currfid) {
182 printk(KERN_ERR PFX "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
183 data->currfid);
184 return 1;
185 }
186
187 return 0;
188}
189
190/* Write a new vid to the hardware */
191static int write_new_vid(struct powernow_k8_data *data, u32 vid)
192{
193 u32 lo;
194 u32 savefid = data->currfid;
195
196 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
197 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
198 return 1;
199 }
200
201 lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
202
203 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
204 vid, lo, STOP_GRANT_5NS);
205
206 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
207
208 if (query_current_values_with_pending_wait(data))
209 return 1;
210
211 if (savefid != data->currfid) {
212 printk(KERN_ERR PFX "fid changed on vid trans, old 0x%x new 0x%x\n",
213 savefid, data->currfid);
214 return 1;
215 }
216
217 if (vid != data->currvid) {
218 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, curr 0x%x\n", vid,
219 data->currvid);
220 return 1;
221 }
222
223 return 0;
224}
225
226/*
227 * Decrease the vid toward reqvid, stepping down by at most 'step' at a time.
228 * Decreasing vid codes represent increasing voltages:
229 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of 0x1f is off.
230 */
231static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step)
232{
233 if ((data->currvid - reqvid) > step)
234 reqvid = data->currvid - step;
235
236 if (write_new_vid(data, reqvid))
237 return 1;
238
239 count_off_vst(data);
240
241 return 0;
242}
243
244/* Change the fid and vid, by the 3 phases. */
245static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid)
246{
247 if (core_voltage_pre_transition(data, reqvid))
248 return 1;
249
250 if (core_frequency_transition(data, reqfid))
251 return 1;
252
253 if (core_voltage_post_transition(data, reqvid))
254 return 1;
255
256 if (query_current_values_with_pending_wait(data))
257 return 1;
258
259 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
260 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, curr 0x%x 0x%x\n",
261 smp_processor_id(),
262 reqfid, reqvid, data->currfid, data->currvid);
263 return 1;
264 }
265
266 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
267 smp_processor_id(), data->currfid, data->currvid);
268
269 return 0;
270}
271
272/* Phase 1 - core voltage transition ... setup voltage */
273static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid)
274{
275 u32 rvosteps = data->rvo;
276 u32 savefid = data->currfid;
277
278 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n",
279 smp_processor_id(),
280 data->currfid, data->currvid, reqvid, data->rvo);
281
282 while (data->currvid > reqvid) {
283 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
284 data->currvid, reqvid);
285 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
286 return 1;
287 }
288
289 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) {
290 if (data->currvid == 0) {
291 rvosteps = 0;
292 } else {
293 dprintk("ph1: changing vid for rvo, req 0x%x\n",
294 data->currvid - 1);
295 if (decrease_vid_code_by_step(data, data->currvid - 1, 1))
296 return 1;
297 rvosteps--;
298 }
299 }
300
301 if (query_current_values_with_pending_wait(data))
302 return 1;
303
304 if (savefid != data->currfid) {
305 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", data->currfid);
306 return 1;
307 }
308
309 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
310 data->currfid, data->currvid);
311
312 return 0;
313}
314
315/* Phase 2 - core frequency transition */
316static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
317{
318 u32 vcoreqfid, vcocurrfid, vcofiddiff, savevid = data->currvid;
319
320 if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
321 printk(KERN_ERR PFX "ph2: illegal lo-lo transition 0x%x 0x%x\n",
322 reqfid, data->currfid);
323 return 1;
324 }
325
326 if (data->currfid == reqfid) {
327 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid);
328 return 0;
329 }
330
331 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n",
332 smp_processor_id(),
333 data->currfid, data->currvid, reqfid);
334
335 vcoreqfid = convert_fid_to_vco_fid(reqfid);
336 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
337 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
338 : vcoreqfid - vcocurrfid;
339
340 while (vcofiddiff > 2) {
341 if (reqfid > data->currfid) {
342 if (data->currfid > LO_FID_TABLE_TOP) {
343 if (write_new_fid(data, data->currfid + 2)) {
344 return 1;
345 }
346 } else {
347 if (write_new_fid
348 (data, 2 + convert_fid_to_vco_fid(data->currfid))) {
349 return 1;
350 }
351 }
352 } else {
353 if (write_new_fid(data, data->currfid - 2))
354 return 1;
355 }
356
357 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
358 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
359 : vcoreqfid - vcocurrfid;
360 }
361
362 if (write_new_fid(data, reqfid))
363 return 1;
364
365 if (query_current_values_with_pending_wait(data))
366 return 1;
367
368 if (data->currfid != reqfid) {
369 printk(KERN_ERR PFX
370 "ph2: mismatch, failed fid transition, curr 0x%x, req 0x%x\n",
371 data->currfid, reqfid);
372 return 1;
373 }
374
375 if (savevid != data->currvid) {
376 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
377 savevid, data->currvid);
378 return 1;
379 }
380
381 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
382 data->currfid, data->currvid);
383
384 return 0;
385}
386
387/* Phase 3 - core voltage transition flow ... jump to the final vid. */
388static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid)
389{
390 u32 savefid = data->currfid;
391 u32 savereqvid = reqvid;
392
393 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
394 smp_processor_id(),
395 data->currfid, data->currvid);
396
397 if (reqvid != data->currvid) {
398 if (write_new_vid(data, reqvid))
399 return 1;
400
401 if (savefid != data->currfid) {
402 printk(KERN_ERR PFX
403 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
404 savefid, data->currfid);
405 return 1;
406 }
407
408 if (data->currvid != reqvid) {
409 printk(KERN_ERR PFX
410 "ph3: failed vid transition\n, req 0x%x, curr 0x%x",
411 reqvid, data->currvid);
412 return 1;
413 }
414 }
415
416 if (query_current_values_with_pending_wait(data))
417 return 1;
418
419 if (savereqvid != data->currvid) {
420 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
421 return 1;
422 }
423
424 if (savefid != data->currfid) {
425 dprintk("ph3 failed, currfid changed 0x%x\n",
426 data->currfid);
427 return 1;
428 }
429
430 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
431 data->currfid, data->currvid);
432
433 return 0;
434}
435
436static int check_supported_cpu(unsigned int cpu)
437{
438 cpumask_t oldmask = CPU_MASK_ALL;
439 u32 eax, ebx, ecx, edx;
440 unsigned int rc = 0;
441
442 oldmask = current->cpus_allowed;
443 set_cpus_allowed(current, cpumask_of_cpu(cpu));
444 schedule();
445
446 if (smp_processor_id() != cpu) {
447 printk(KERN_ERR "limiting to cpu %u failed\n", cpu);
448 goto out;
449 }
450
451 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
452 goto out;
453
454 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
455 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
456 ((eax & CPUID_XFAM) != CPUID_XFAM_K8) ||
457 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_E)) {
458 printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax);
459 goto out;
460 }
461
462 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
463 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
464 printk(KERN_INFO PFX
465 "No frequency change capabilities detected\n");
466 goto out;
467 }
468
469 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
470 if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) {
471 printk(KERN_INFO PFX "Power state transitions not supported\n");
472 goto out;
473 }
474
475 rc = 1;
476
477out:
478 set_cpus_allowed(current, oldmask);
479 schedule();
480 return rc;
481
482}
483
484static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
485{
486 unsigned int j;
487 u8 lastfid = 0xff;
488
489 for (j = 0; j < data->numps; j++) {
490 if (pst[j].vid > LEAST_VID) {
491 printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid);
492 return -EINVAL;
493 }
494 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
495 printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j);
496 return -ENODEV;
497 }
498 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
499 printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j);
500 return -ENODEV;
501 }
502 if ((pst[j].fid > MAX_FID)
503 || (pst[j].fid & 1)
504 || (j && (pst[j].fid < HI_FID_TABLE_BOTTOM))) {
505 /* Only first fid is allowed to be in "low" range */
506 printk(KERN_ERR PFX "two low fids - %d : 0x%x\n", j, pst[j].fid);
507 return -EINVAL;
508 }
509 if (pst[j].fid < lastfid)
510 lastfid = pst[j].fid;
511 }
512 if (lastfid & 1) {
513 printk(KERN_ERR PFX "lastfid invalid\n");
514 return -EINVAL;
515 }
516 if (lastfid > LO_FID_TABLE_TOP)
517 printk(KERN_INFO PFX "first fid not from lo freq table\n");
518
519 return 0;
520}
521
522static void print_basics(struct powernow_k8_data *data)
523{
524 int j;
525 for (j = 0; j < data->numps; j++) {
526 if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID)
527 printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x (%d mV)\n", j,
528 data->powernow_table[j].index & 0xff,
529 data->powernow_table[j].frequency/1000,
530 data->powernow_table[j].index >> 8,
531 find_millivolts_from_vid(data, data->powernow_table[j].index >> 8));
532 }
533 if (data->batps)
534 printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps);
535}
536
537static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
538{
539 struct cpufreq_frequency_table *powernow_table;
540 unsigned int j;
541
542 if (data->batps) { /* use ACPI support to get full speed on mains power */
543 printk(KERN_WARNING PFX "Only %d pstates usable (use the ACPI driver for full range)\n", data->batps);
544 data->numps = data->batps;
545 }
546
547 for ( j=1; j<data->numps; j++ ) {
548 if (pst[j-1].fid >= pst[j].fid) {
549 printk(KERN_ERR PFX "PST out of sequence\n");
550 return -EINVAL;
551 }
552 }
553
554 if (data->numps < 2) {
555 printk(KERN_ERR PFX "no p states to transition\n");
556 return -ENODEV;
557 }
558
559 if (check_pst_table(data, pst, maxvid))
560 return -EINVAL;
561
562 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
563 * (data->numps + 1)), GFP_KERNEL);
564 if (!powernow_table) {
565 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
566 return -ENOMEM;
567 }
568
569 for (j = 0; j < data->numps; j++) {
570 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
571 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
572 powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid);
573 }
574 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
575 powernow_table[data->numps].index = 0;
576
577 if (query_current_values_with_pending_wait(data)) {
578 kfree(powernow_table);
579 return -EIO;
580 }
581
582 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
583 data->powernow_table = powernow_table;
584 print_basics(data);
585
586 for (j = 0; j < data->numps; j++)
587 if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid))
588 return 0;
589
590 dprintk("currfid/vid do not match PST, ignoring\n");
591 return 0;
592}
593
594/* Find and validate the PSB/PST table in BIOS. */
595static int find_psb_table(struct powernow_k8_data *data)
596{
597 struct psb_s *psb;
598 unsigned int i;
599 u32 mvs;
600 u8 maxvid;
601 u32 cpst = 0;
602 u32 thiscpuid;
603
604 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
605 /* Scan BIOS looking for the signature. */
606 /* It can not be at ffff0 - it is too big. */
607
608 psb = phys_to_virt(i);
609 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
610 continue;
611
612 dprintk("found PSB header at 0x%p\n", psb);
613
614 dprintk("table vers: 0x%x\n", psb->tableversion);
615 if (psb->tableversion != PSB_VERSION_1_4) {
616 printk(KERN_INFO BFX "PSB table is not v1.4\n");
617 return -ENODEV;
618 }
619
620 dprintk("flags: 0x%x\n", psb->flags1);
621 if (psb->flags1) {
622 printk(KERN_ERR BFX "unknown flags\n");
623 return -ENODEV;
624 }
625
626 data->vstable = psb->vstable;
627 dprintk("voltage stabilization time: %d(*20us)\n", data->vstable);
628
629 dprintk("flags2: 0x%x\n", psb->flags2);
630 data->rvo = psb->flags2 & 3;
631 data->irt = ((psb->flags2) >> 2) & 3;
632 mvs = ((psb->flags2) >> 4) & 3;
633 data->vidmvs = 1 << mvs;
634 data->batps = ((psb->flags2) >> 6) & 3;
635
636 dprintk("ramp voltage offset: %d\n", data->rvo);
637 dprintk("isochronous relief time: %d\n", data->irt);
638 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
639
640 dprintk("numpst: 0x%x\n", psb->num_tables);
641 cpst = psb->num_tables;
642 if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0) ){
643 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
644 if ((thiscpuid == 0x00000fc0) || (thiscpuid == 0x00000fe0) ) {
645 cpst = 1;
646 }
647 }
648 if (cpst != 1) {
649 printk(KERN_ERR BFX "numpst must be 1\n");
650 return -ENODEV;
651 }
652
653 data->plllock = psb->plllocktime;
654 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
655 dprintk("maxfid: 0x%x\n", psb->maxfid);
656 dprintk("maxvid: 0x%x\n", psb->maxvid);
657 maxvid = psb->maxvid;
658
659 data->numps = psb->numps;
660 dprintk("numpstates: 0x%x\n", data->numps);
661 return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid);
662 }
663 /*
664 * If you see this message, complain to BIOS manufacturer. If
665 * he tells you "we do not support Linux" or some similar
666 * nonsense, remember that Windows 2000 uses the same legacy
667 * mechanism that the old Linux PSB driver uses. Tell them it
668 * is broken with Windows 2000.
669 *
670 * The reference to the AMD documentation is chapter 9 in the
671 * BIOS and Kernel Developer's Guide, which is available on
672 * www.amd.com
673 */
674 printk(KERN_ERR PFX "BIOS error - no PSB\n");
675 return -ENODEV;
676}
677
678#ifdef CONFIG_X86_POWERNOW_K8_ACPI
679static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
680{
681 if (!data->acpi_data.state_count)
682 return;
683
684 data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK;
685 data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK;
686 data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
687 data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK);
688 data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK;
689}
690
691static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
692{
693 int i;
694 int cntlofreq = 0;
695 struct cpufreq_frequency_table *powernow_table;
696
697 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
698 dprintk("register performance failed\n");
699 return -EIO;
700 }
701
702 /* verify the data contained in the ACPI structures */
703 if (data->acpi_data.state_count <= 1) {
704 dprintk("No ACPI P-States\n");
705 goto err_out;
706 }
707
708 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
709 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
710 dprintk("Invalid control/status registers (%x - %x)\n",
711 data->acpi_data.control_register.space_id,
712 data->acpi_data.status_register.space_id);
713 goto err_out;
714 }
715
716 /* fill in data->powernow_table */
717 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
718 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
719 if (!powernow_table) {
720 dprintk("powernow_table memory alloc failure\n");
721 goto err_out;
722 }
723
724 for (i = 0; i < data->acpi_data.state_count; i++) {
725 u32 fid = data->acpi_data.states[i].control & FID_MASK;
726 u32 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK;
727
728 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
729
730 powernow_table[i].index = fid; /* lower 8 bits */
731 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
732 powernow_table[i].frequency = find_khz_freq_from_fid(fid);
733
734 /* verify frequency is OK */
735 if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) ||
736 (powernow_table[i].frequency < (MIN_FREQ * 1000))) {
737 dprintk("invalid freq %u kHz, ignoring\n", powernow_table[i].frequency);
738 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
739 continue;
740 }
741
742 /* verify voltage is OK - BIOSs are using "off" to indicate invalid */
743 if (vid == 0x1f) {
744 dprintk("invalid vid %u, ignoring\n", vid);
745 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
746 continue;
747 }
748
749 if (fid < HI_FID_TABLE_BOTTOM) {
750 if (cntlofreq) {
751 /* if both entries are the same, ignore this
752 * one...
753 */
754 if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) ||
755 (powernow_table[i].index != powernow_table[cntlofreq].index)) {
756 printk(KERN_ERR PFX "Too many lo freq table entries\n");
757 goto err_out_mem;
758 }
759
760 dprintk("double low frequency table entry, ignoring it.\n");
761 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
762 continue;
763 } else
764 cntlofreq = i;
765 }
766
767 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
768 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
769 powernow_table[i].frequency,
770 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
771 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
772 continue;
773 }
774 }
775
776 powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
777 powernow_table[data->acpi_data.state_count].index = 0;
778 data->powernow_table = powernow_table;
779
780 /* fill in data */
781 data->numps = data->acpi_data.state_count;
782 print_basics(data);
783 powernow_k8_acpi_pst_values(data, 0);
784
785 /* notify BIOS that we exist */
786 acpi_processor_notify_smm(THIS_MODULE);
787
788 return 0;
789
790err_out_mem:
791 kfree(powernow_table);
792
793err_out:
794 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
795
796 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
797 data->acpi_data.state_count = 0;
798
799 return -ENODEV;
800}
801
802static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
803{
804 if (data->acpi_data.state_count)
805 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
806}
807
808#else
809static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
810static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
811static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
812#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
813
814/* Take a frequency, and issue the fid/vid transition command */
815static int transition_frequency(struct powernow_k8_data *data, unsigned int index)
816{
817 u32 fid;
818 u32 vid;
819 int res;
820 struct cpufreq_freqs freqs;
821
822 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
823
824 /* fid are the lower 8 bits of the index we stored into
825 * the cpufreq frequency table in find_psb_table, vid are
826 * the upper 8 bits.
827 */
828
829 fid = data->powernow_table[index].index & 0xFF;
830 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
831
832 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
833
834 if (query_current_values_with_pending_wait(data))
835 return 1;
836
837 if ((data->currvid == vid) && (data->currfid == fid)) {
838 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
839 fid, vid);
840 return 0;
841 }
842
843 if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
844 printk("ignoring illegal change in lo freq table-%x to 0x%x\n",
845 data->currfid, fid);
846 return 1;
847 }
848
849 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
850 smp_processor_id(), fid, vid);
851
852 freqs.cpu = data->cpu;
853
854 freqs.old = find_khz_freq_from_fid(data->currfid);
855 freqs.new = find_khz_freq_from_fid(fid);
856 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
857
858 down(&fidvid_sem);
859 res = transition_fid_vid(data, fid, vid);
860 up(&fidvid_sem);
861
862 freqs.new = find_khz_freq_from_fid(data->currfid);
863 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
864
865 return res;
866}
867
868/* Driver entry point to switch to the target frequency */
869static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
870{
871 cpumask_t oldmask = CPU_MASK_ALL;
872 struct powernow_k8_data *data = powernow_data[pol->cpu];
873 u32 checkfid = data->currfid;
874 u32 checkvid = data->currvid;
875 unsigned int newstate;
876 int ret = -EIO;
877
878 /* only run on specific CPU from here on */
879 oldmask = current->cpus_allowed;
880 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
881 schedule();
882
883 if (smp_processor_id() != pol->cpu) {
884 printk(KERN_ERR "limiting to cpu %u failed\n", pol->cpu);
885 goto err_out;
886 }
887
888 if (pending_bit_stuck()) {
889 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
890 goto err_out;
891 }
892
893 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
894 pol->cpu, targfreq, pol->min, pol->max, relation);
895
896 if (query_current_values_with_pending_wait(data)) {
897 ret = -EIO;
898 goto err_out;
899 }
900
901 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
902 data->currfid, data->currvid);
903
904 if ((checkvid != data->currvid) || (checkfid != data->currfid)) {
905 printk(KERN_ERR PFX
906 "error - out of sync, fid 0x%x 0x%x, vid 0x%x 0x%x\n",
907 checkfid, data->currfid, checkvid, data->currvid);
908 }
909
910 if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate))
911 goto err_out;
912
913 powernow_k8_acpi_pst_values(data, newstate);
914
915 if (transition_frequency(data, newstate)) {
916 printk(KERN_ERR PFX "transition frequency failed\n");
917 ret = 1;
918 goto err_out;
919 }
920
921 pol->cur = find_khz_freq_from_fid(data->currfid);
922 ret = 0;
923
924err_out:
925 set_cpus_allowed(current, oldmask);
926 schedule();
927
928 return ret;
929}
930
931/* Driver entry point to verify the policy and range of frequencies */
932static int powernowk8_verify(struct cpufreq_policy *pol)
933{
934 struct powernow_k8_data *data = powernow_data[pol->cpu];
935
936 return cpufreq_frequency_table_verify(pol, data->powernow_table);
937}
938
939/* per CPU init entry point to the driver */
940static int __init powernowk8_cpu_init(struct cpufreq_policy *pol)
941{
942 struct powernow_k8_data *data;
943 cpumask_t oldmask = CPU_MASK_ALL;
944 int rc;
945
946 if (!check_supported_cpu(pol->cpu))
947 return -ENODEV;
948
949 data = kmalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
950 if (!data) {
951 printk(KERN_ERR PFX "unable to alloc powernow_k8_data\n");
952 return -ENOMEM;
953 }
954 memset(data,0,sizeof(struct powernow_k8_data));
955
956 data->cpu = pol->cpu;
957
958 if (powernow_k8_cpu_init_acpi(data)) {
959 /*
960 * Use the PSB BIOS structure. This is only available on
961 * UP (uniprocessor) systems, and is deprecated by AMD.
962 */
963
964 if ((num_online_cpus() != 1) || (num_possible_cpus() != 1)) {
965 printk(KERN_INFO PFX "MP systems not supported by PSB BIOS structure\n");
966 kfree(data);
967 return -ENODEV;
968 }
969 if (pol->cpu != 0) {
970 printk(KERN_ERR PFX "init not cpu 0\n");
971 kfree(data);
972 return -ENODEV;
973 }
974 rc = find_psb_table(data);
975 if (rc) {
976 kfree(data);
977 return -ENODEV;
978 }
979 }
980
981 /* only run on specific CPU from here on */
982 oldmask = current->cpus_allowed;
983 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
984 schedule();
985
986 if (smp_processor_id() != pol->cpu) {
987 printk(KERN_ERR "limiting to cpu %u failed\n", pol->cpu);
988 goto err_out;
989 }
990
991 if (pending_bit_stuck()) {
992 printk(KERN_ERR PFX "failing init, change pending bit set\n");
993 goto err_out;
994 }
995
996 if (query_current_values_with_pending_wait(data))
997 goto err_out;
998
999 fidvid_msr_init();
1000
1001 /* run on any CPU again */
1002 set_cpus_allowed(current, oldmask);
1003 schedule();
1004
1005 pol->governor = CPUFREQ_DEFAULT_GOVERNOR;
1006
1007 /* Take a crude guess here.
1008 * That guess was in microseconds, so multiply by 1000 */
1009 pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US)
1010 + (3 * (1 << data->irt) * 10)) * 1000;
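	/*
	 * Illustrative arithmetic for the guess above, using values chosen
	 * purely for illustration (not taken from any particular BIOS):
	 * with rvo = 0, vstable = 5 (i.e. 100 us) and irt = 1 this evaluates
	 * to ((0 + 8) * 5 * 20 + 3 * 2 * 10) * 1000 = 860000 ns, about 0.9 ms.
	 */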
1011
1012 pol->cur = find_khz_freq_from_fid(data->currfid);
1013 dprintk("policy current frequency %d kHz\n", pol->cur);
1014
1015 /* min/max the cpu is capable of */
1016 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1017 printk(KERN_ERR PFX "invalid powernow_table\n");
1018 powernow_k8_cpu_exit_acpi(data);
1019 kfree(data->powernow_table);
1020 kfree(data);
1021 return -EINVAL;
1022 }
1023
1024 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1025
1026 printk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1027 data->currfid, data->currvid);
1028
1029 powernow_data[pol->cpu] = data;
1030
1031 return 0;
1032
1033err_out:
1034 set_cpus_allowed(current, oldmask);
1035 schedule();
1036 powernow_k8_cpu_exit_acpi(data);
1037
1038 kfree(data);
1039 return -ENODEV;
1040}
1041
1042static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1043{
1044 struct powernow_k8_data *data = powernow_data[pol->cpu];
1045
1046 if (!data)
1047 return -EINVAL;
1048
1049 powernow_k8_cpu_exit_acpi(data);
1050
1051 cpufreq_frequency_table_put_attr(pol->cpu);
1052
1053 kfree(data->powernow_table);
1054 kfree(data);
1055
1056 return 0;
1057}
1058
1059static unsigned int powernowk8_get (unsigned int cpu)
1060{
1061 struct powernow_k8_data *data = powernow_data[cpu];
1062 cpumask_t oldmask = current->cpus_allowed;
1063 unsigned int khz = 0;
1064
1065 set_cpus_allowed(current, cpumask_of_cpu(cpu));
1066 if (smp_processor_id() != cpu) {
1067 printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu);
1068 set_cpus_allowed(current, oldmask);
1069 return 0;
1070 }
1071 preempt_disable();
1072
1073 if (query_current_values_with_pending_wait(data))
1074 goto out;
1075
1076 khz = find_khz_freq_from_fid(data->currfid);
1077
1078 out:
1079 preempt_enable_no_resched();
1080 set_cpus_allowed(current, oldmask);
1081
1082 return khz;
1083}
1084
1085static struct freq_attr* powernow_k8_attr[] = {
1086 &cpufreq_freq_attr_scaling_available_freqs,
1087 NULL,
1088};
1089
1090static struct cpufreq_driver cpufreq_amd64_driver = {
1091 .verify = powernowk8_verify,
1092 .target = powernowk8_target,
1093 .init = powernowk8_cpu_init,
1094 .exit = __devexit_p(powernowk8_cpu_exit),
1095 .get = powernowk8_get,
1096 .name = "powernow-k8",
1097 .owner = THIS_MODULE,
1098 .attr = powernow_k8_attr,
1099};
1100
1101/* driver entry point for init */
1102static int __init powernowk8_init(void)
1103{
1104 unsigned int i, supported_cpus = 0;
1105
1106 for (i=0; i<NR_CPUS; i++) {
1107 if (!cpu_online(i))
1108 continue;
1109 if (check_supported_cpu(i))
1110 supported_cpus++;
1111 }
1112
1113 if (supported_cpus == num_online_cpus()) {
1114 printk(KERN_INFO PFX "Found %d AMD Athlon 64 / Opteron processors (" VERSION ")\n",
1115 supported_cpus);
1116 return cpufreq_register_driver(&cpufreq_amd64_driver);
1117 }
1118
1119 return -ENODEV;
1120}
1121
1122/* driver entry point for term */
1123static void __exit powernowk8_exit(void)
1124{
1125 dprintk("exit\n");
1126
1127 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1128}
1129
1130MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com>");
1131MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1132MODULE_LICENSE("GPL");
1133
1134late_initcall(powernowk8_init);
1135module_exit(powernowk8_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h
new file mode 100644
index 000000000000..63ebc8470f52
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h
@@ -0,0 +1,176 @@
1/*
2 * (c) 2003, 2004 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8struct powernow_k8_data {
9 unsigned int cpu;
10
11 u32 numps; /* number of p-states */
12 u32 batps; /* number of p-states supported on battery */
13
14 /* these values are constant when the PSB is used to determine
15 * vid/fid pairings, but are modified during the ->target() call
16 * when ACPI is used */
17 u32 rvo; /* ramp voltage offset */
18 u32 irt; /* isochronous relief time */
19 u32 vidmvs; /* usable value calculated from mvs */
20 u32 vstable; /* voltage stabilization time, units 20 us */
21 u32 plllock; /* pll lock time, units 1 us */
22
23 /* keep track of the current fid / vid */
24 u32 currvid, currfid;
25
26 /* the powernow_table includes all frequency and vid/fid pairings:
27 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
28 * frequency is in kHz */
29 struct cpufreq_frequency_table *powernow_table;
30
31#ifdef CONFIG_X86_POWERNOW_K8_ACPI
32 /* the acpi table needs to be kept. it's only available if ACPI was
33 * used to determine valid frequency/vid/fid states */
34 struct acpi_processor_performance acpi_data;
35#endif
36};
37
38
39/* processor's cpuid instruction support */
40#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
41#define CPUID_XFAM 0x0ff00000 /* extended family */
42#define CPUID_XFAM_K8 0
43#define CPUID_XMOD 0x000f0000 /* extended model */
44#define CPUID_XMOD_REV_E 0x00020000
45#define CPUID_USE_XFAM_XMOD 0x00000f00
46#define CPUID_GET_MAX_CAPABILITIES 0x80000000
47#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
48#define P_STATE_TRANSITION_CAPABLE 6
49
50/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
51/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
52/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
53/* the register number is placed in ecx, and the data is returned in edx:eax. */
54
55#define MSR_FIDVID_CTL 0xc0010041
56#define MSR_FIDVID_STATUS 0xc0010042
57
58/* Field definitions within the FID VID Low Control MSR : */
59#define MSR_C_LO_INIT_FID_VID 0x00010000
60#define MSR_C_LO_NEW_VID 0x00001f00
61#define MSR_C_LO_NEW_FID 0x0000002f
62#define MSR_C_LO_VID_SHIFT 8
63
64/* Field definitions within the FID VID High Control MSR : */
65#define MSR_C_HI_STP_GNT_TO 0x000fffff
66
67/* Field definitions within the FID VID Low Status MSR : */
68#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
69#define MSR_S_LO_MAX_RAMP_VID 0x1f000000
70#define MSR_S_LO_MAX_FID 0x003f0000
71#define MSR_S_LO_START_FID 0x00003f00
72#define MSR_S_LO_CURRENT_FID 0x0000003f
73
74/* Field definitions within the FID VID High Status MSR : */
75#define MSR_S_HI_MAX_WORKING_VID 0x001f0000
76#define MSR_S_HI_START_VID 0x00001f00
77#define MSR_S_HI_CURRENT_VID 0x0000001f
78#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
79
80/*
81 * There are restrictions that the frequencies have to follow:
82 * - only 1 entry in the low fid table ( <=1.4GHz )
83 * - lowest entry in the high fid table must be >= 2 * the entry in the
84 * low fid table
85 * - lowest entry in the high fid table must be <= 200 MHz + 2 * the entry
86 * in the low fid table
87 * - the parts can only step at 200 MHz intervals, so 1.9 GHz is never valid
88 * - lowest frequency must be >= interprocessor hypertransport link speed
89 * (only applies to MP systems obviously)
90 */
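/*
 * How these restrictions line up with the fid encoding (derived from
 * find_freq_from_fid() in powernow-k8.c, i.e. 800 MHz + fid * 100 MHz):
 * LO_FID_TABLE_TOP (6) corresponds to 1400 MHz, HI_FID_TABLE_BOTTOM (8)
 * to 1600 MHz, and since fids are always even, one fid step of 2 is one
 * 200 MHz frequency step.
 */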
91
92/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
93#define LO_FID_TABLE_TOP 6 /* fid values marking the boundary */
94#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
95
96#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
97#define HI_VCOFREQ_TABLE_BOTTOM 1600
98
99#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
100
101#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
102#define LEAST_VID 0x1e /* Lowest (numerically highest) useful vid value */
103
104#define MIN_FREQ 800 /* Min and max freqs, per spec */
105#define MAX_FREQ 5000
106
107#define INVALID_FID_MASK 0xffffffc1 /* not a valid fid if these bits are set */
108#define INVALID_VID_MASK 0xffffffe0 /* not a valid vid if these bits are set */
109
110#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
111
112#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
113
114#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
115#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
116
117/*
118 * Most values of interest are encoded in a single field of the _PSS
119 * entries: the "control" value.
120 */
121
122#define IRT_SHIFT 30
123#define RVO_SHIFT 28
124#define PLL_L_SHIFT 20
125#define MVS_SHIFT 18
126#define VST_SHIFT 11
127#define VID_SHIFT 6
128#define IRT_MASK 3
129#define RVO_MASK 3
130#define PLL_L_MASK 0x7f
131#define MVS_MASK 3
132#define VST_MASK 0x7f
133#define VID_MASK 0x1f
134#define FID_MASK 0x3f
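/*
 * Sketch of how a _PSS "control" value decomposes with the shifts and
 * masks above; this mirrors powernow_k8_acpi_pst_values() and the ACPI
 * table-fill loop in powernow-k8.c ('control' stands for any
 * data->acpi_data.states[i].control value):
 *
 *	fid     = control & FID_MASK;
 *	vid     = (control >> VID_SHIFT) & VID_MASK;
 *	vstable = (control >> VST_SHIFT) & VST_MASK;
 *	mvs     = (control >> MVS_SHIFT) & MVS_MASK;   (vidmvs = 1 << mvs)
 *	plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
 *	rvo     = (control >> RVO_SHIFT) & RVO_MASK;
 *	irt     = (control >> IRT_SHIFT) & IRT_MASK;
 */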
135
136
137/*
138 * Version 1.4 of the PSB table. This table is constructed by the BIOS and is
139 * there to tell the OS's power management driver which VIDs and FIDs are
140 * supported by this particular processor.
141 * If the data in the PSB / PST is wrong, then this driver will program the
142 * wrong values into hardware, which is very likely to lead to a crash.
143 */
144
145#define PSB_ID_STRING "AMDK7PNOW!"
146#define PSB_ID_STRING_LEN 10
147
148#define PSB_VERSION_1_4 0x14
149
150struct psb_s {
151 u8 signature[10];
152 u8 tableversion;
153 u8 flags1;
154 u16 vstable;
155 u8 flags2;
156 u8 num_tables;
157 u32 cpuid;
158 u8 plllocktime;
159 u8 maxfid;
160 u8 maxvid;
161 u8 numps;
162};
163
164/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
165struct pst_s {
166 u8 fid;
167 u8 vid;
168};
169
170#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
171
172static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid);
173static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
174static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
175
176static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
new file mode 100644
index 000000000000..07d5612dc00f
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -0,0 +1,715 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Despite the "SpeedStep" in the name, this is almost entirely unlike
6 * traditional SpeedStep.
7 *
8 * Modelled on speedstep.c
9 *
10 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
11 *
12 * WARNING WARNING WARNING
13 *
14 * This driver manipulates the PERF_CTL MSR, which is only somewhat
15 * documented. While it seems to work on my laptop, it has not been
16 * tested anywhere else, and it may not work for you, do strange
17 * things or simply crash.
18 */
19
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/cpufreq.h>
24#include <linux/config.h>
25#include <linux/delay.h>
26#include <linux/compiler.h>
27
28#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
29#include <linux/acpi.h>
30#include <acpi/processor.h>
31#endif
32
33#include <asm/msr.h>
34#include <asm/processor.h>
35#include <asm/cpufeature.h>
36
37#include "speedstep-est-common.h"
38
39#define PFX "speedstep-centrino: "
40#define MAINTAINER "Jeremy Fitzhardinge <jeremy@goop.org>"
41
42#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
43
44
45struct cpu_id
46{
47 __u8 x86; /* CPU family */
48 __u8 x86_model; /* model */
49 __u8 x86_mask; /* stepping */
50};
51
52enum {
53 CPU_BANIAS,
54 CPU_DOTHAN_A1,
55 CPU_DOTHAN_A2,
56 CPU_DOTHAN_B0,
57};
58
59static const struct cpu_id cpu_ids[] = {
60 [CPU_BANIAS] = { 6, 9, 5 },
61 [CPU_DOTHAN_A1] = { 6, 13, 1 },
62 [CPU_DOTHAN_A2] = { 6, 13, 2 },
63 [CPU_DOTHAN_B0] = { 6, 13, 6 },
64};
65#define N_IDS (sizeof(cpu_ids)/sizeof(cpu_ids[0]))
66
67struct cpu_model
68{
69 const struct cpu_id *cpu_id;
70 const char *model_name;
71 unsigned max_freq; /* max clock in kHz */
72
73 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
74};
75static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x);
76
77/* Operating points for current CPU */
78static struct cpu_model *centrino_model[NR_CPUS];
79static const struct cpu_id *centrino_cpu[NR_CPUS];
80
81static struct cpufreq_driver centrino_driver;
82
83#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
84
85/* Computes the correct form for IA32_PERF_CTL MSR for a particular
86 frequency/voltage operating point; frequency in MHz, volts in mV.
87 This is stored as "index" in the structure. */
88#define OP(mhz, mv) \
89 { \
90 .frequency = (mhz) * 1000, \
91 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
92 }
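/*
 * Worked example of the encoding above (arithmetic only, derived from the
 * OP() macro itself): OP(600, 844) yields .frequency = 600000 kHz and
 * .index = ((600/100) << 8) | ((844 - 700) / 16) = 0x0609, so the upper
 * byte carries the frequency in units of 100 MHz (which is what
 * extract_clock() reads back for Banias) and the low byte the voltage code.
 */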
93
94/*
95 * These voltage tables were derived from the Intel Pentium M
96 * datasheet, document 25261202.pdf, Table 5. I have verified they
97 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
98 * M.
99 */
100
101/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
102static struct cpufreq_frequency_table banias_900[] =
103{
104 OP(600, 844),
105 OP(800, 988),
106 OP(900, 1004),
107 { .frequency = CPUFREQ_TABLE_END }
108};
109
110/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
111static struct cpufreq_frequency_table banias_1000[] =
112{
113 OP(600, 844),
114 OP(800, 972),
115 OP(900, 988),
116 OP(1000, 1004),
117 { .frequency = CPUFREQ_TABLE_END }
118};
119
120/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
121static struct cpufreq_frequency_table banias_1100[] =
122{
123 OP( 600, 956),
124 OP( 800, 1020),
125 OP( 900, 1100),
126 OP(1000, 1164),
127 OP(1100, 1180),
128 { .frequency = CPUFREQ_TABLE_END }
129};
130
131
132/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
133static struct cpufreq_frequency_table banias_1200[] =
134{
135 OP( 600, 956),
136 OP( 800, 1004),
137 OP( 900, 1020),
138 OP(1000, 1100),
139 OP(1100, 1164),
140 OP(1200, 1180),
141 { .frequency = CPUFREQ_TABLE_END }
142};
143
144/* Intel Pentium M processor 1.30GHz (Banias) */
145static struct cpufreq_frequency_table banias_1300[] =
146{
147 OP( 600, 956),
148 OP( 800, 1260),
149 OP(1000, 1292),
150 OP(1200, 1356),
151 OP(1300, 1388),
152 { .frequency = CPUFREQ_TABLE_END }
153};
154
155/* Intel Pentium M processor 1.40GHz (Banias) */
156static struct cpufreq_frequency_table banias_1400[] =
157{
158 OP( 600, 956),
159 OP( 800, 1180),
160 OP(1000, 1308),
161 OP(1200, 1436),
162 OP(1400, 1484),
163 { .frequency = CPUFREQ_TABLE_END }
164};
165
166/* Intel Pentium M processor 1.50GHz (Banias) */
167static struct cpufreq_frequency_table banias_1500[] =
168{
169 OP( 600, 956),
170 OP( 800, 1116),
171 OP(1000, 1228),
172 OP(1200, 1356),
173 OP(1400, 1452),
174 OP(1500, 1484),
175 { .frequency = CPUFREQ_TABLE_END }
176};
177
178/* Intel Pentium M processor 1.60GHz (Banias) */
179static struct cpufreq_frequency_table banias_1600[] =
180{
181 OP( 600, 956),
182 OP( 800, 1036),
183 OP(1000, 1164),
184 OP(1200, 1276),
185 OP(1400, 1420),
186 OP(1600, 1484),
187 { .frequency = CPUFREQ_TABLE_END }
188};
189
190/* Intel Pentium M processor 1.70GHz (Banias) */
191static struct cpufreq_frequency_table banias_1700[] =
192{
193 OP( 600, 956),
194 OP( 800, 1004),
195 OP(1000, 1116),
196 OP(1200, 1228),
197 OP(1400, 1308),
198 OP(1700, 1484),
199 { .frequency = CPUFREQ_TABLE_END }
200};
201#undef OP
202
203#define _BANIAS(cpuid, max, name) \
204{ .cpu_id = cpuid, \
205 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
206 .max_freq = (max)*1000, \
207 .op_points = banias_##max, \
208}
209#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
210
211/* CPU models, their operating frequency range, and freq/voltage
212 operating points */
213static struct cpu_model models[] =
214{
215 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
216 BANIAS(1000),
217 BANIAS(1100),
218 BANIAS(1200),
219 BANIAS(1300),
220 BANIAS(1400),
221 BANIAS(1500),
222 BANIAS(1600),
223 BANIAS(1700),
224
225 /* NULL model_name is a wildcard */
226 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
227 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
228 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
229
230 { NULL, }
231};
232#undef _BANIAS
233#undef BANIAS
234
235static int centrino_cpu_init_table(struct cpufreq_policy *policy)
236{
237 struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
238 struct cpu_model *model;
239
240 for(model = models; model->cpu_id != NULL; model++)
241 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
242 (model->model_name == NULL ||
243 strcmp(cpu->x86_model_id, model->model_name) == 0))
244 break;
245
246 if (model->cpu_id == NULL) {
247 /* No match at all */
248 dprintk(KERN_INFO PFX "no support for CPU model \"%s\": "
249 "send /proc/cpuinfo to " MAINTAINER "\n",
250 cpu->x86_model_id);
251 return -ENOENT;
252 }
253
254 if (model->op_points == NULL) {
255 /* Matched a non-match */
256 dprintk(KERN_INFO PFX "no table support for CPU model \"%s\"\n",
257 cpu->x86_model_id);
258#ifndef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
259 dprintk(KERN_INFO PFX "try compiling with CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI enabled\n");
260#endif
261 return -ENOENT;
262 }
263
264 centrino_model[policy->cpu] = model;
265
266 dprintk("found \"%s\": max frequency: %dkHz\n",
267 model->model_name, model->max_freq);
268
269 return 0;
270}
271
272#else
273static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; }
274#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
275
276static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x)
277{
278 if ((c->x86 == x->x86) &&
279 (c->x86_model == x->x86_model) &&
280 (c->x86_mask == x->x86_mask))
281 return 1;
282 return 0;
283}
284
285/* To be called only after centrino_model is initialized */
286static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
287{
288 int i;
289
290 /*
291 * Extract clock in kHz from PERF_CTL value
292 * for centrino, as some DSDTs are buggy.
293 * Ideally, this can be done using the acpi_data structure.
294 */
295 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) ||
296 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) ||
297 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) {
298 msr = (msr >> 8) & 0xff;
299 return msr * 100000;
300 }
301
302 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points))
303 return 0;
304
305 msr &= 0xffff;
306 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) {
307 if (msr == centrino_model[cpu]->op_points[i].index)
308 return centrino_model[cpu]->op_points[i].frequency;
309 }
310 if (failsafe)
311 return centrino_model[cpu]->op_points[i-1].frequency;
312 else
313 return 0;
314}
315
316/* Return the current CPU frequency in kHz */
317static unsigned int get_cur_freq(unsigned int cpu)
318{
319 unsigned l, h;
320 unsigned clock_freq;
321 cpumask_t saved_mask;
322
323 saved_mask = current->cpus_allowed;
324 set_cpus_allowed(current, cpumask_of_cpu(cpu));
325 if (smp_processor_id() != cpu)
326 return 0;
327
328 rdmsr(MSR_IA32_PERF_STATUS, l, h);
329 clock_freq = extract_clock(l, cpu, 0);
330
331 if (unlikely(clock_freq == 0)) {
332 /*
333 * On some CPUs, we can see transient MSR values (which are
334 * not present in _PSS), while CPU is doing some automatic
335 * P-state transition (like TM2). Get the last freq set
336 * in PERF_CTL.
337 */
338 rdmsr(MSR_IA32_PERF_CTL, l, h);
339 clock_freq = extract_clock(l, cpu, 1);
340 }
341
342 set_cpus_allowed(current, saved_mask);
343 return clock_freq;
344}
345
346
347#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
348
349static struct acpi_processor_performance p;
350
351/*
352 * centrino_cpu_init_acpi - register with ACPI P-States library
353 *
354 * Register with the ACPI P-States library (part of drivers/acpi/processor.c)
355 * in order to determine correct frequency and voltage pairings by reading
356 * the _PSS of the ACPI DSDT or SSDT tables.
357 */
358static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
359{
360 union acpi_object arg0 = {ACPI_TYPE_BUFFER};
361 u32 arg0_buf[3];
362 struct acpi_object_list arg_list = {1, &arg0};
363 unsigned long cur_freq;
364 int result = 0, i;
365 unsigned int cpu = policy->cpu;
366
367 /* _PDC settings */
368 arg0.buffer.length = 12;
369 arg0.buffer.pointer = (u8 *) arg0_buf;
370 arg0_buf[0] = ACPI_PDC_REVISION_ID;
371 arg0_buf[1] = 1;
372 arg0_buf[2] = ACPI_PDC_EST_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_MSR;
373
374 p.pdc = &arg_list;
375
376 /* register with ACPI core */
377 if (acpi_processor_register_performance(&p, cpu)) {
378 dprintk(KERN_INFO PFX "obtaining ACPI data failed\n");
379 return -EIO;
380 }
381
382 /* verify the acpi_data */
383 if (p.state_count <= 1) {
384 dprintk("No P-States\n");
385 result = -ENODEV;
386 goto err_unreg;
387 }
388
389 if ((p.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
390 (p.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
391 dprintk("Invalid control/status registers (%x - %x)\n",
392 p.control_register.space_id, p.status_register.space_id);
393 result = -EIO;
394 goto err_unreg;
395 }
396
397 for (i=0; i<p.state_count; i++) {
398 if (p.states[i].control != p.states[i].status) {
399 dprintk("Different control (%x) and status values (%x)\n",
400 p.states[i].control, p.states[i].status);
401 result = -EINVAL;
402 goto err_unreg;
403 }
404
405 if (!p.states[i].core_frequency) {
406 dprintk("Zero core frequency for state %u\n", i);
407 result = -EINVAL;
408 goto err_unreg;
409 }
410
411 if (p.states[i].core_frequency > p.states[0].core_frequency) {
412 dprintk("P%u has larger frequency (%u) than P0 (%u), skipping\n", i,
413 p.states[i].core_frequency, p.states[0].core_frequency);
414 p.states[i].core_frequency = 0;
415 continue;
416 }
417 }
418
419 centrino_model[cpu] = kmalloc(sizeof(struct cpu_model), GFP_KERNEL);
420 if (!centrino_model[cpu]) {
421 result = -ENOMEM;
422 goto err_unreg;
423 }
424 memset(centrino_model[cpu], 0, sizeof(struct cpu_model));
425
426 centrino_model[cpu]->model_name=NULL;
427 centrino_model[cpu]->max_freq = p.states[0].core_frequency * 1000;
428 centrino_model[cpu]->op_points = kmalloc(sizeof(struct cpufreq_frequency_table) *
429 (p.state_count + 1), GFP_KERNEL);
430 if (!centrino_model[cpu]->op_points) {
431 result = -ENOMEM;
432 goto err_kfree;
433 }
434
435 for (i=0; i<p.state_count; i++) {
436 centrino_model[cpu]->op_points[i].index = p.states[i].control;
437 centrino_model[cpu]->op_points[i].frequency = p.states[i].core_frequency * 1000;
438 dprintk("adding state %i with frequency %u and control value %04x\n",
439 i, centrino_model[cpu]->op_points[i].frequency, centrino_model[cpu]->op_points[i].index);
440 }
441 centrino_model[cpu]->op_points[p.state_count].frequency = CPUFREQ_TABLE_END;
442
443 cur_freq = get_cur_freq(cpu);
444
445 for (i=0; i<p.state_count; i++) {
446 if (!p.states[i].core_frequency) {
447 dprintk("skipping state %u\n", i);
448 centrino_model[cpu]->op_points[i].frequency = CPUFREQ_ENTRY_INVALID;
449 continue;
450 }
451
452 if (extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0) !=
453 (centrino_model[cpu]->op_points[i].frequency)) {
454 dprintk("Invalid encoded frequency (%u vs. %u)\n",
455 extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0),
456 centrino_model[cpu]->op_points[i].frequency);
457 result = -EINVAL;
458 goto err_kfree_all;
459 }
460
461 if (cur_freq == centrino_model[cpu]->op_points[i].frequency)
462 p.state = i;
463 }
464
465 /* notify BIOS that we exist */
466 acpi_processor_notify_smm(THIS_MODULE);
467
468 return 0;
469
470 err_kfree_all:
471 kfree(centrino_model[cpu]->op_points);
472 err_kfree:
473 kfree(centrino_model[cpu]);
474 err_unreg:
475 acpi_processor_unregister_performance(&p, cpu);
476 dprintk(KERN_INFO PFX "invalid ACPI data\n");
477 return (result);
478}
479#else
480static inline int centrino_cpu_init_acpi(struct cpufreq_policy *policy) { return -ENODEV; }
481#endif
482
483static int centrino_cpu_init(struct cpufreq_policy *policy)
484{
485 struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
486 unsigned freq;
487 unsigned l, h;
488 int ret;
489 int i;
490
491 /* Only Intel makes Enhanced Speedstep-capable CPUs */
492 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST))
493 return -ENODEV;
494
495 for (i = 0; i < N_IDS; i++)
496 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
497 break;
498
499 if (i != N_IDS)
500 centrino_cpu[policy->cpu] = &cpu_ids[i];
501
502 if (is_const_loops_cpu(policy->cpu)) {
503 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
504 }
505
506 if (centrino_cpu_init_acpi(policy)) {
507 if (policy->cpu != 0)
508 return -ENODEV;
509
510 if (!centrino_cpu[policy->cpu]) {
511 dprintk(KERN_INFO PFX "found unsupported CPU with "
512 "Enhanced SpeedStep: send /proc/cpuinfo to "
513 MAINTAINER "\n");
514 return -ENODEV;
515 }
516
517 if (centrino_cpu_init_table(policy)) {
518 return -ENODEV;
519 }
520 }
521
522 /* Check to see if Enhanced SpeedStep is enabled, and try to
523 enable it if not. */
524 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
525
526 if (!(l & (1<<16))) {
527 l |= (1<<16);
528 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
529 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
530
531 /* check to see if it stuck */
532 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
533 if (!(l & (1<<16))) {
534 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n");
535 return -ENODEV;
536 }
537 }
538
539 freq = get_cur_freq(policy->cpu);
540
541 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
542 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */
543 policy->cur = freq;
544
545 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
546
547 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points);
548 if (ret)
549 return (ret);
550
551 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu);
552
553 return 0;
554}
555
556static int centrino_cpu_exit(struct cpufreq_policy *policy)
557{
558 unsigned int cpu = policy->cpu;
559
560 if (!centrino_model[cpu])
561 return -ENODEV;
562
563 cpufreq_frequency_table_put_attr(cpu);
564
565#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
566 if (!centrino_model[cpu]->model_name) {
567 dprintk("unregistering and freeing ACPI data\n");
568 acpi_processor_unregister_performance(&p, cpu);
569 kfree(centrino_model[cpu]->op_points);
570 kfree(centrino_model[cpu]);
571 }
572#endif
573
574 centrino_model[cpu] = NULL;
575
576 return 0;
577}
578
579/**
580 * centrino_verify - verifies a new CPUFreq policy
581 * @policy: new policy
582 *
583 * The limit must be within this model's frequency range, with at least
584 * one border included.
585 */
586static int centrino_verify (struct cpufreq_policy *policy)
587{
588 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points);
589}
590
591/**
592 * centrino_target - set a new CPUFreq policy
593 * @policy: new policy
594 * @target_freq: the target frequency
595 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
596 *
597 * Sets a new CPUFreq policy.
598 */
599static int centrino_target (struct cpufreq_policy *policy,
600 unsigned int target_freq,
601 unsigned int relation)
602{
603 unsigned int newstate = 0;
604 unsigned int msr, oldmsr, h, cpu = policy->cpu;
605 struct cpufreq_freqs freqs;
606 cpumask_t saved_mask;
607 int retval;
608
609 if (centrino_model[cpu] == NULL)
610 return -ENODEV;
611
612 /*
613 * Support for SMP systems.
614 * Make sure we are running on the CPU that wants to change frequency
615 */
616 saved_mask = current->cpus_allowed;
617 set_cpus_allowed(current, policy->cpus);
618 if (!cpu_isset(smp_processor_id(), policy->cpus)) {
619 dprintk("couldn't limit to CPUs in this domain\n");
620 return(-EAGAIN);
621 }
622
623 if (cpufreq_frequency_table_target(policy, centrino_model[cpu]->op_points, target_freq,
624 relation, &newstate)) {
625 retval = -EINVAL;
626 goto migrate_end;
627 }
628
629 msr = centrino_model[cpu]->op_points[newstate].index;
630 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
631
632 if (msr == (oldmsr & 0xffff)) {
633 retval = 0;
634 dprintk("no change needed - msr was and needs to be %x\n", oldmsr);
635 goto migrate_end;
636 }
637
638 freqs.cpu = cpu;
639 freqs.old = extract_clock(oldmsr, cpu, 0);
640 freqs.new = extract_clock(msr, cpu, 0);
641
642 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
643 target_freq, freqs.old, freqs.new, msr);
644
645 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
646
647 /* all but 16 LSB are "reserved", so treat them with
648 care */
649 oldmsr &= ~0xffff;
650 msr &= 0xffff;
651 oldmsr |= msr;
652
653 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
654
655 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
656
657 retval = 0;
658migrate_end:
659 set_cpus_allowed(current, saved_mask);
660 return (retval);
661}
662
663static struct freq_attr* centrino_attr[] = {
664 &cpufreq_freq_attr_scaling_available_freqs,
665 NULL,
666};
667
668static struct cpufreq_driver centrino_driver = {
669 .name = "centrino", /* should be speedstep-centrino,
670 but there's a 16 char limit */
671 .init = centrino_cpu_init,
672 .exit = centrino_cpu_exit,
673 .verify = centrino_verify,
674 .target = centrino_target,
675 .get = get_cur_freq,
676 .attr = centrino_attr,
677 .owner = THIS_MODULE,
678};
679
680
681/**
682 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
683 *
684 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
685 * unsupported devices, -ENOENT if there's no voltage table for this
686 * particular CPU model, -EINVAL on problems during initialization,
687 * and zero on success.
688 *
689 * This is quite picky. Not only does the CPU have to advertise the
690 * "est" flag in the cpuid capability flags, but we also look for a
691 * specific CPU model and stepping, and we need to have the exact model
692 * name in our voltage tables. That is, be paranoid about not releasing
693 * someone's valuable magic smoke.
694 */
695static int __init centrino_init(void)
696{
697 struct cpuinfo_x86 *cpu = cpu_data;
698
699 if (!cpu_has(cpu, X86_FEATURE_EST))
700 return -ENODEV;
701
702 return cpufreq_register_driver(&centrino_driver);
703}
704
705static void __exit centrino_exit(void)
706{
707 cpufreq_unregister_driver(&centrino_driver);
708}
709
710MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
711MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
712MODULE_LICENSE ("GPL");
713
714late_initcall(centrino_init);
715module_exit(centrino_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-est-common.h b/arch/i386/kernel/cpu/cpufreq/speedstep-est-common.h
new file mode 100644
index 000000000000..5ce995c9d866
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-est-common.h
@@ -0,0 +1,25 @@
1/*
2 * Routines common for drivers handling Enhanced Speedstep Technology
3 * Copyright (C) 2004 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2 -- see
6 * COPYING for details.
7 */
8
9static inline int is_const_loops_cpu(unsigned int cpu)
10{
11 struct cpuinfo_x86 *c = cpu_data + cpu;
12
13 if (c->x86_vendor != X86_VENDOR_INTEL || !cpu_has(c, X86_FEATURE_EST))
14 return 0;
15
16 /*
17 * on P-4s, the TSC runs with constant frequency independent of cpu freq
18 * when we use EST
19 */
20 if (c->x86 == 0xf)
21 return 1;
22
23 return 0;
24}
25
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c
new file mode 100644
index 000000000000..5b7d18a06afa
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c
@@ -0,0 +1,424 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/slab.h>
27
28#include "speedstep-lib.h"
29
30
31/* speedstep_chipset:
32 * It is necessary to know which chipset is used. As accesses to
33 * this device occur at various places in this module, we need a
34 * static struct pci_dev * pointing to that device.
35 */
36static struct pci_dev *speedstep_chipset_dev;
37
38
39/* speedstep_processor
40 */
41static unsigned int speedstep_processor = 0;
42
43
44/*
45 * There are only two frequency states for each processor. Values
46 * are in kHz for the time being.
47 */
48static struct cpufreq_frequency_table speedstep_freqs[] = {
49 {SPEEDSTEP_HIGH, 0},
50 {SPEEDSTEP_LOW, 0},
51 {0, CPUFREQ_TABLE_END},
52};
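/* Both .frequency fields above start out as 0; they are filled in during
 * per-CPU initialisation, once the actual low and high speeds have been
 * detected. */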
53
54
55#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-ich", msg)
56
57
58/**
59 * speedstep_set_state - set the SpeedStep state
60 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
61 *
62 * Tries to change the SpeedStep state.
63 */
64static void speedstep_set_state (unsigned int state)
65{
66 u32 pmbase;
67 u8 pm2_blk;
68 u8 value;
69 unsigned long flags;
70
71 if (!speedstep_chipset_dev || (state > 0x1))
72 return;
73
74 /* get PMBASE */
75 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
76 if (!(pmbase & 0x01)) {
77 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
78 return;
79 }
80
81 pmbase &= 0xFFFFFFFE;
82 if (!pmbase) {
83 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
84 return;
85 }
86
87 /* Disable IRQs */
88 local_irq_save(flags);
89
90 /* read state */
91 value = inb(pmbase + 0x50);
92
93 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
94
95 /* write new state */
96 value &= 0xFE;
97 value |= state;
98
99 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
100
101 /* Disable bus master arbitration */
102 pm2_blk = inb(pmbase + 0x20);
103 pm2_blk |= 0x01;
104 outb(pm2_blk, (pmbase + 0x20));
105
106 /* Actual transition */
107 outb(value, (pmbase + 0x50));
108
109 /* Restore bus master arbitration */
110 pm2_blk &= 0xfe;
111 outb(pm2_blk, (pmbase + 0x20));
112
113 /* check if transition was successful */
114 value = inb(pmbase + 0x50);
115
116 /* Enable IRQs */
117 local_irq_restore(flags);
118
119 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
120
121 if (state == (value & 0x1)) {
122 dprintk("change to %u MHz succeeded\n", (speedstep_get_processor_frequency(speedstep_processor) / 1000));
123 } else {
124 printk (KERN_ERR "cpufreq: change failed - I/O error\n");
125 }
126
127 return;
128}
129
130
131/**
132 * speedstep_activate - activate SpeedStep control in the chipset
133 *
134 * Tries to activate the SpeedStep status and control registers.
135 * Returns -EINVAL on an unsupported chipset, and zero on success.
136 */
137static int speedstep_activate (void)
138{
139 u16 value = 0;
140
141 if (!speedstep_chipset_dev)
142 return -EINVAL;
143
144 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
145 if (!(value & 0x08)) {
146 value |= 0x08;
147 dprintk("activating SpeedStep (TM) registers\n");
148 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
149 }
150
151 return 0;
152}
153
154
155/**
156 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
157 *
158 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
159 * the LPC bridge / PM module which contains all power-management
160 * functions. Returns 2, 3 or 4 for the detected ICHx-M generation,
161 * or zero on failure.
162 */
163static unsigned int speedstep_detect_chipset (void)
164{
165 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
166 PCI_DEVICE_ID_INTEL_82801DB_12,
167 PCI_ANY_ID,
168 PCI_ANY_ID,
169 NULL);
170 if (speedstep_chipset_dev)
171 return 4; /* 4-M */
172
173 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
174 PCI_DEVICE_ID_INTEL_82801CA_12,
175 PCI_ANY_ID,
176 PCI_ANY_ID,
177 NULL);
178 if (speedstep_chipset_dev)
179 return 3; /* 3-M */
180
181
182 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
183 PCI_DEVICE_ID_INTEL_82801BA_10,
184 PCI_ANY_ID,
185 PCI_ANY_ID,
186 NULL);
187 if (speedstep_chipset_dev) {
188		/* speedstep.c causes lockups on Dell Inspiron 8000 and
189		 * 8100 systems, which use a pretty old revision of the 82815
190		 * host bridge. Abort on these systems.
191 */
192 static struct pci_dev *hostbridge;
193 u8 rev = 0;
194
195 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
196 PCI_DEVICE_ID_INTEL_82815_MC,
197 PCI_ANY_ID,
198 PCI_ANY_ID,
199 NULL);
200
201 if (!hostbridge)
202 return 2; /* 2-M */
203
204 pci_read_config_byte(hostbridge, PCI_REVISION_ID, &rev);
205 if (rev < 5) {
206 dprintk("hostbridge does not support speedstep\n");
207 speedstep_chipset_dev = NULL;
208 pci_dev_put(hostbridge);
209 return 0;
210 }
211
212 pci_dev_put(hostbridge);
213 return 2; /* 2-M */
214 }
215
216 return 0;
217}
218
219static unsigned int _speedstep_get(cpumask_t cpus)
220{
221 unsigned int speed;
222 cpumask_t cpus_allowed;
223
224 cpus_allowed = current->cpus_allowed;
225 set_cpus_allowed(current, cpus);
226 speed = speedstep_get_processor_frequency(speedstep_processor);
227 set_cpus_allowed(current, cpus_allowed);
228 dprintk("detected %u kHz as current frequency\n", speed);
229 return speed;
230}
231
232static unsigned int speedstep_get(unsigned int cpu)
233{
234 return _speedstep_get(cpumask_of_cpu(cpu));
235}
236
237/**
238 * speedstep_target - set a new CPUFreq policy
239 * @policy: new policy
240 * @target_freq: the target frequency
241 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
242 *
243 * Sets a new CPUFreq policy.
244 */
245static int speedstep_target (struct cpufreq_policy *policy,
246 unsigned int target_freq,
247 unsigned int relation)
248{
249 unsigned int newstate = 0;
250 struct cpufreq_freqs freqs;
251 cpumask_t cpus_allowed;
252 int i;
253
254 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
255 return -EINVAL;
256
257 freqs.old = _speedstep_get(policy->cpus);
258 freqs.new = speedstep_freqs[newstate].frequency;
259 freqs.cpu = policy->cpu;
260
261 dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
262
263 /* no transition necessary */
264 if (freqs.old == freqs.new)
265 return 0;
266
267 cpus_allowed = current->cpus_allowed;
268
269 for_each_cpu_mask(i, policy->cpus) {
270 freqs.cpu = i;
271 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
272 }
273
274 /* switch to physical CPU where state is to be changed */
275 set_cpus_allowed(current, policy->cpus);
276
277 speedstep_set_state(newstate);
278
279 /* allow to be run on all CPUs */
280 set_cpus_allowed(current, cpus_allowed);
281
282 for_each_cpu_mask(i, policy->cpus) {
283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
285 }
286
287 return 0;
288}
289
290
291/**
292 * speedstep_verify - verifies a new CPUFreq policy
293 * @policy: new policy
294 *
295 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
296 * at least one border included.
297 */
298static int speedstep_verify (struct cpufreq_policy *policy)
299{
300 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
301}
302
303
304static int speedstep_cpu_init(struct cpufreq_policy *policy)
305{
306 int result = 0;
307 unsigned int speed;
308 cpumask_t cpus_allowed;
309
310 /* only run on CPU to be set, or on its sibling */
311#ifdef CONFIG_SMP
312 policy->cpus = cpu_sibling_map[policy->cpu];
313#endif
314
315 cpus_allowed = current->cpus_allowed;
316 set_cpus_allowed(current, policy->cpus);
317
318 /* detect low and high frequency */
319 result = speedstep_get_freqs(speedstep_processor,
320 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
321 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
322 &speedstep_set_state);
323 set_cpus_allowed(current, cpus_allowed);
324 if (result)
325 return result;
326
327 /* get current speed setting */
328 speed = _speedstep_get(policy->cpus);
329 if (!speed)
330 return -EIO;
331
332 dprintk("currently at %s speed setting - %i MHz\n",
333 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
334 (speed / 1000));
335
336 /* cpuinfo and default policy values */
337 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
338 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
339 policy->cur = speed;
340
341 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
342 if (result)
343 return (result);
344
345 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
346
347 return 0;
348}
349
350
351static int speedstep_cpu_exit(struct cpufreq_policy *policy)
352{
353 cpufreq_frequency_table_put_attr(policy->cpu);
354 return 0;
355}
356
357static struct freq_attr* speedstep_attr[] = {
358 &cpufreq_freq_attr_scaling_available_freqs,
359 NULL,
360};
361
362
363static struct cpufreq_driver speedstep_driver = {
364 .name = "speedstep-ich",
365 .verify = speedstep_verify,
366 .target = speedstep_target,
367 .init = speedstep_cpu_init,
368 .exit = speedstep_cpu_exit,
369 .get = speedstep_get,
370 .owner = THIS_MODULE,
371 .attr = speedstep_attr,
372};
373
374
375/**
376 * speedstep_init - initializes the SpeedStep CPUFreq driver
377 *
378 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
379 * devices, -EINVAL on problems during initialization, and zero on
380 * success.
381 */
382static int __init speedstep_init(void)
383{
384 /* detect processor */
385 speedstep_processor = speedstep_detect_processor();
386 if (!speedstep_processor) {
387 dprintk("Intel(R) SpeedStep(TM) capable processor not found\n");
388 return -ENODEV;
389 }
390
391 /* detect chipset */
392 if (!speedstep_detect_chipset()) {
393 dprintk("Intel(R) SpeedStep(TM) for this chipset not (yet) available.\n");
394 return -ENODEV;
395 }
396
397 /* activate speedstep support */
398 if (speedstep_activate()) {
399 pci_dev_put(speedstep_chipset_dev);
400 return -EINVAL;
401 }
402
403 return cpufreq_register_driver(&speedstep_driver);
404}
405
406
407/**
408 * speedstep_exit - unregisters SpeedStep support
409 *
410 * Unregisters SpeedStep support.
411 */
412static void __exit speedstep_exit(void)
413{
414 pci_dev_put(speedstep_chipset_dev);
415 cpufreq_unregister_driver(&speedstep_driver);
416}
417
418
419MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
420MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
421MODULE_LICENSE ("GPL");
422
423module_init(speedstep_init);
424module_exit(speedstep_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c
new file mode 100644
index 000000000000..8ba430a9c3a2
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c
@@ -0,0 +1,385 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16#include <linux/pci.h>
17#include <linux/slab.h>
18
19#include <asm/msr.h>
20#include "speedstep-lib.h"
21
22#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-lib", msg)
23
24#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
25static int relaxed_check = 0;
26#else
27#define relaxed_check 0
28#endif
29
30/*********************************************************************
31 * GET PROCESSOR CORE SPEED IN KHZ *
32 *********************************************************************/
33
34static unsigned int pentium3_get_frequency (unsigned int processor)
35{
36 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
37 struct {
38 unsigned int ratio; /* Frequency Multiplier (x10) */
39 u8 bitmap; /* power on configuration bits
40 [27, 25:22] (in MSR 0x2a) */
41 } msr_decode_mult [] = {
42 { 30, 0x01 },
43 { 35, 0x05 },
44 { 40, 0x02 },
45 { 45, 0x06 },
46 { 50, 0x00 },
47 { 55, 0x04 },
48 { 60, 0x0b },
49 { 65, 0x0f },
50 { 70, 0x09 },
51 { 75, 0x0d },
52 { 80, 0x0a },
53 { 85, 0x26 },
54 { 90, 0x20 },
55 { 100, 0x2b },
56 { 0, 0xff } /* error or unknown value */
57 };
58
59 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
60 struct {
61 unsigned int value; /* Front Side Bus speed in MHz */
62 u8 bitmap; /* power on configuration bits [18: 19]
63 (in MSR 0x2a) */
64 } msr_decode_fsb [] = {
65 { 66, 0x0 },
66 { 100, 0x2 },
67 { 133, 0x1 },
68 { 0, 0xff}
69 };
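	/* worked example: FSB bitmap 0x2 selects a 100 MHz bus and multiplier
	 * bitmap 0x0b a 6.0x ratio (stored as 60), so the function reports
	 * 60 * 100 * 100 = 600000 kHz */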
70
71 u32 msr_lo, msr_tmp;
72 int i = 0, j = 0;
73
74 /* read MSR 0x2a - we only need the low 32 bits */
75 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
76 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
77 msr_tmp = msr_lo;
78
79 /* decode the FSB */
80 msr_tmp &= 0x00c0000;
81 msr_tmp >>= 18;
82 while (msr_tmp != msr_decode_fsb[i].bitmap) {
83 if (msr_decode_fsb[i].bitmap == 0xff)
84 return 0;
85 i++;
86 }
87
88 /* decode the multiplier */
89 if (processor == SPEEDSTEP_PROCESSOR_PIII_C_EARLY) {
90 dprintk("workaround for early PIIIs\n");
91 msr_lo &= 0x03c00000;
92 } else
93 msr_lo &= 0x0bc00000;
94 msr_lo >>= 22;
95 while (msr_lo != msr_decode_mult[j].bitmap) {
96 if (msr_decode_mult[j].bitmap == 0xff)
97 return 0;
98 j++;
99 }
100
101 dprintk("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
102
103 return (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100);
104}
105
106
107static unsigned int pentiumM_get_frequency(void)
108{
109 u32 msr_lo, msr_tmp;
110
111 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
112 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
113
114 /* see table B-2 of 24547212.pdf */
115 if (msr_lo & 0x00040000) {
116 printk(KERN_DEBUG "speedstep-lib: PM - invalid FSB: 0x%x 0x%x\n", msr_lo, msr_tmp);
117 return 0;
118 }
119
120 msr_tmp = (msr_lo >> 22) & 0x1f;
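	/* msr_tmp now holds the bus ratio; e.g. a value of 13 gives
	 * 13 * 100 * 1000 = 1300000 kHz */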
121 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000));
122
123 return (msr_tmp * 100 * 1000);
124}
125
126
127static unsigned int pentium4_get_frequency(void)
128{
129 struct cpuinfo_x86 *c = &boot_cpu_data;
130 u32 msr_lo, msr_hi, mult;
131 unsigned int fsb = 0;
132
133 rdmsr(0x2c, msr_lo, msr_hi);
134
135 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
136
137 /* decode the FSB: see IA-32 Intel (C) Architecture Software
138	 * Developer's Manual, Volume 3: System Programming Guide,
139 * revision #12 in Table B-1: MSRs in the Pentium 4 and
140 * Intel Xeon Processors, on page B-4 and B-5.
141 */
142 if (c->x86_model < 2)
143 fsb = 100 * 1000;
144 else {
145 u8 fsb_code = (msr_lo >> 16) & 0x7;
146 switch (fsb_code) {
147 case 0:
148 fsb = 100 * 1000;
149 break;
150 case 1:
151 fsb = 13333 * 10;
152 break;
153 case 2:
154 fsb = 200 * 1000;
155 break;
156 }
157 }
158
159 if (!fsb)
160 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n");
161
162 /* Multiplier. */
163 if (c->x86_model < 2)
164 mult = msr_lo >> 27;
165 else
166 mult = msr_lo >> 24;
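	/* e.g. fsb_code 1 gives fsb = 133330 kHz; with mult = 18 that is
	 * 18 * 133330 = 2399940 kHz, i.e. roughly a 2.4 GHz part */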
167
168 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult));
169
170 return (fsb * mult);
171}
172
173
174unsigned int speedstep_get_processor_frequency(unsigned int processor)
175{
176 switch (processor) {
177 case SPEEDSTEP_PROCESSOR_PM:
178 return pentiumM_get_frequency();
179 case SPEEDSTEP_PROCESSOR_P4D:
180 case SPEEDSTEP_PROCESSOR_P4M:
181 return pentium4_get_frequency();
182 case SPEEDSTEP_PROCESSOR_PIII_T:
183 case SPEEDSTEP_PROCESSOR_PIII_C:
184 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
185 return pentium3_get_frequency(processor);
186 default:
187 return 0;
188 };
189 return 0;
190}
191EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency);
192
193
194/*********************************************************************
195 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
196 *********************************************************************/
197
198unsigned int speedstep_detect_processor (void)
199{
200 struct cpuinfo_x86 *c = cpu_data;
201 u32 ebx, msr_lo, msr_hi;
202
203 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
204
205 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
206 ((c->x86 != 6) && (c->x86 != 0xF)))
207 return 0;
208
209 if (c->x86 == 0xF) {
210 /* Intel Mobile Pentium 4-M
211 * or Intel Mobile Pentium 4 with 533 MHz FSB */
212 if (c->x86_model != 2)
213 return 0;
214
215 ebx = cpuid_ebx(0x00000001);
216 ebx &= 0x000000FF;
217
218 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
219
220 switch (c->x86_mask) {
221 case 4:
222 /*
223 * B-stepping [M-P4-M]
224 * sample has ebx = 0x0f, production has 0x0e.
225 */
226 if ((ebx == 0x0e) || (ebx == 0x0f))
227 return SPEEDSTEP_PROCESSOR_P4M;
228 break;
229 case 7:
230 /*
231 * C-stepping [M-P4-M]
232 * needs to have ebx=0x0e, else it's a celeron:
233 * cf. 25130917.pdf / page 7, footnote 5 even
234 * though 25072120.pdf / page 7 doesn't say
235 * samples are only of B-stepping...
236 */
237 if (ebx == 0x0e)
238 return SPEEDSTEP_PROCESSOR_P4M;
239 break;
240 case 9:
241 /*
242 * D-stepping [M-P4-M or M-P4/533]
243 *
244 * this is totally strange: CPUID 0x0F29 is
245 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
246 * The latter need to be sorted out as they don't
247 * support speedstep.
248 * Celerons with CPUID 0x0F29 may have either
249 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
250 * specific.
251 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
252 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
253 * also, M-P4M HTs have ebx=0x8, too
254 * For now, they are distinguished by the model_id string
255 */
256 if ((ebx == 0x0e) || (strstr(c->x86_model_id,"Mobile Intel(R) Pentium(R) 4") != NULL))
257 return SPEEDSTEP_PROCESSOR_P4M;
258 break;
259 default:
260 break;
261 }
262 return 0;
263 }
264
265 switch (c->x86_model) {
266 case 0x0B: /* Intel PIII [Tualatin] */
267 /* cpuid_ebx(1) is 0x04 for desktop PIII,
268 0x06 for mobile PIII-M */
269 ebx = cpuid_ebx(0x00000001);
270 dprintk("ebx is %x\n", ebx);
271
272 ebx &= 0x000000FF;
273
274 if (ebx != 0x06)
275 return 0;
276
277 /* So far all PIII-M processors support SpeedStep. See
278 * Intel's 24540640.pdf of June 2003
279 */
280
281 return SPEEDSTEP_PROCESSOR_PIII_T;
282
283 case 0x08: /* Intel PIII [Coppermine] */
284
285 /* all mobile PIII Coppermines have FSB 100 MHz
286 * ==> sort out a few desktop PIIIs. */
287 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
288 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi);
289 msr_lo &= 0x00c0000;
290 if (msr_lo != 0x0080000)
291 return 0;
292
293 /*
294	 * If the processor is a mobile version,
295	 * the platform ID has bit 50 set;
296	 * it has SpeedStep technology if either
297	 * bit 56 or 57 is set.
298 */
299 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
300 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi);
301 if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
302 if (c->x86_mask == 0x01) {
303 dprintk("early PIII version\n");
304 return SPEEDSTEP_PROCESSOR_PIII_C_EARLY;
305 } else
306 return SPEEDSTEP_PROCESSOR_PIII_C;
307 }
308
309 default:
310 return 0;
311 }
312}
313EXPORT_SYMBOL_GPL(speedstep_detect_processor);
314
315
316/*********************************************************************
317 * DETECT SPEEDSTEP SPEEDS *
318 *********************************************************************/
319
320unsigned int speedstep_get_freqs(unsigned int processor,
321 unsigned int *low_speed,
322 unsigned int *high_speed,
323 void (*set_state) (unsigned int state))
324{
325 unsigned int prev_speed;
326 unsigned int ret = 0;
327 unsigned long flags;
328
329 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
330 return -EINVAL;
331
332 dprintk("trying to determine both speeds\n");
333
334 /* get current speed */
335 prev_speed = speedstep_get_processor_frequency(processor);
336 if (!prev_speed)
337 return -EIO;
338
339	dprintk("previous speed is %u\n", prev_speed);
340
341 local_irq_save(flags);
342
343 /* switch to low state */
344 set_state(SPEEDSTEP_LOW);
345 *low_speed = speedstep_get_processor_frequency(processor);
346 if (!*low_speed) {
347 ret = -EIO;
348 goto out;
349 }
350
351	dprintk("low speed is %u\n", *low_speed);
352
353 /* switch to high state */
354 set_state(SPEEDSTEP_HIGH);
355 *high_speed = speedstep_get_processor_frequency(processor);
356 if (!*high_speed) {
357 ret = -EIO;
358 goto out;
359 }
360
361	dprintk("high speed is %u\n", *high_speed);
362
363 if (*low_speed == *high_speed) {
364 ret = -ENODEV;
365 goto out;
366 }
367
368 /* switch to previous state, if necessary */
369 if (*high_speed != prev_speed)
370 set_state(SPEEDSTEP_LOW);
371
372 out:
373 local_irq_restore(flags);
374 return (ret);
375}
376EXPORT_SYMBOL_GPL(speedstep_get_freqs);
377
378#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
379module_param(relaxed_check, int, 0444);
380MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability.");
381#endif
382
383MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
384MODULE_DESCRIPTION ("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
385MODULE_LICENSE ("GPL");
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h
new file mode 100644
index 000000000000..261a2c9b7f6b
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h
@@ -0,0 +1,47 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14
15#define SPEEDSTEP_PROCESSOR_PIII_C_EARLY 0x00000001 /* Coppermine core */
16#define SPEEDSTEP_PROCESSOR_PIII_C 0x00000002 /* Coppermine core */
17#define SPEEDSTEP_PROCESSOR_PIII_T 0x00000003 /* Tualatin core */
18#define SPEEDSTEP_PROCESSOR_P4M 0x00000004 /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_processor_frequency() call. */
23#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */
24#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */
25
26/* speedstep states -- only two of them */
27
28#define SPEEDSTEP_HIGH 0x00000000
29#define SPEEDSTEP_LOW 0x00000001
30
31
32/* detect a speedstep-capable processor */
33extern unsigned int speedstep_detect_processor (void);
34
35/* detect the current speed (in kHz) of the processor */
36extern unsigned int speedstep_get_processor_frequency(unsigned int processor);
37
38
39/* detect the low and high speeds of the processor. The callback
40 * set_state's argument is either SPEEDSTEP_HIGH or SPEEDSTEP_LOW;
41 * the state is switched directly, so no cpufreq_notify_transition
42 * calls are initiated during the detection.
43 */
44extern unsigned int speedstep_get_freqs(unsigned int processor,
45 unsigned int *low_speed,
46 unsigned int *high_speed,
47 void (*set_state) (unsigned int state));
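
/* Typical use from a driver's per-CPU init routine (cf. speedstep-ich.c),
 * where speedstep_freqs[] and speedstep_set_state() are supplied by the
 * calling driver:
 *
 *	result = speedstep_get_freqs(speedstep_processor,
 *				     &speedstep_freqs[SPEEDSTEP_LOW].frequency,
 *				     &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
 *				     &speedstep_set_state);
 */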
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c b/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c
new file mode 100644
index 000000000000..79440b3f087e
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c
@@ -0,0 +1,424 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/pci.h>
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <asm/ist.h>
24
25#include "speedstep-lib.h"
26
27/* speedstep system management interface port/command.
28 *
29 * These parameters are obtained from the IST-SMI BIOS call.
30 * If the user supplies them as module parameters, those values are used instead.
31 *
32 */
33static int smi_port = 0;
34static int smi_cmd = 0;
35static unsigned int smi_sig = 0;
36
37/* info about the processor */
38static unsigned int speedstep_processor = 0;
39
40/*
41 * There are only two frequency states for each processor. Values
42 * are in kHz for the time being.
43 */
44static struct cpufreq_frequency_table speedstep_freqs[] = {
45 {SPEEDSTEP_HIGH, 0},
46 {SPEEDSTEP_LOW, 0},
47 {0, CPUFREQ_TABLE_END},
48};
49
50#define GET_SPEEDSTEP_OWNER 0
51#define GET_SPEEDSTEP_STATE 1
52#define SET_SPEEDSTEP_STATE 2
53#define GET_SPEEDSTEP_FREQS 4
54
55/* how often shall the SMI call be tried if it failed, e.g. because
56 * of DMA activity going on? */
57#define SMI_TRIES 5
58
59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership
63 */
64static int speedstep_smi_ownership (void)
65{
66 u32 command, result, magic;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port);
74
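	/* SMI call convention used here: the command word goes in %eax, the
	 * function code in %ebx and the physical address of the magic string
	 * in %esi; the OUT of %al to the SMI port in %dx triggers the SMI,
	 * and the result is returned in %edi. */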
75 __asm__ __volatile__(
76 "out %%al, (%%dx)\n"
77 : "=D" (result)
78 : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic)
79 );
80
81 dprintk("result is %x\n", result);
82
83 return result;
84}
85
86/**
87 * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
88 * @low: the low frequency value is placed here
89 * @high: the high frequency value is placed here
90 *
91 * Only available on later SpeedStep-enabled systems, returns false results or
92 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
93 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
94 */
95static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
96{
97 u32 command, result = 0, edi, high_mhz, low_mhz;
98 u32 state=0;
99 u32 function = GET_SPEEDSTEP_FREQS;
100
101 if (!(ist_info.event & 0xFFFF)) {
102		dprintk("bug #1422 -- can't read freqs from BIOS\n");
103 return -ENODEV;
104 }
105
106 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
107
108 dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port);
109
110 __asm__ __volatile__("movl $0, %%edi\n"
111 "out %%al, (%%dx)\n"
112 : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi)
113 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
114 );
115
116 dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz);
117
118 /* abort if results are obviously incorrect... */
119 if ((high_mhz + low_mhz) < 600)
120 return -EINVAL;
121
122 *high = high_mhz * 1000;
123 *low = low_mhz * 1000;
124
125 return result;
126}
127
128/**
129 * speedstep_get_state - read the current SpeedStep state
130 *
131 * Returns the current processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH).
132 */
133static int speedstep_get_state (void)
134{
135 u32 function=GET_SPEEDSTEP_STATE;
136 u32 result, state, edi, command;
137
138 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
139
140 dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port);
141
142 __asm__ __volatile__("movl $0, %%edi\n"
143 "out %%al, (%%dx)\n"
144 : "=a" (result), "=b" (state), "=D" (edi)
145 : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0)
146 );
147
148 dprintk("state is %x, result is %x\n", state, result);
149
150 return (state & 1);
151}
152
153
154/**
155 * speedstep_set_state - set the SpeedStep state
156 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
157 *
158 */
159static void speedstep_set_state (unsigned int state)
160{
161 unsigned int result = 0, command, new_state;
162 unsigned long flags;
163 unsigned int function=SET_SPEEDSTEP_STATE;
164 unsigned int retry = 0;
165
166 if (state > 0x1)
167 return;
168
169 /* Disable IRQs */
170 local_irq_save(flags);
171
172 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
173
174 dprintk("trying to set frequency to state %u with command %x at port %x\n", state, command, smi_port);
175
176 do {
177 if (retry) {
178 dprintk("retry %u, previous result %u, waiting...\n", retry, result);
179 mdelay(retry * 50);
180 }
181 retry++;
182 __asm__ __volatile__(
183 "movl $0, %%edi\n"
184 "out %%al, (%%dx)\n"
185 : "=b" (new_state), "=D" (result)
186 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
187 );
188 } while ((new_state != state) && (retry <= SMI_TRIES));
189
190 /* enable IRQs */
191 local_irq_restore(flags);
192
193 if (new_state == state) {
194 dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result);
195 } else {
196 printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result);
197 }
198
199 return;
200}
201
202
203/**
204 * speedstep_target - set a new CPUFreq policy
205 * @policy: new policy
206 * @target_freq: the target frequency
207 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
208 *
209 * Sets a new CPUFreq policy/freq.
210 */
211static int speedstep_target (struct cpufreq_policy *policy,
212 unsigned int target_freq, unsigned int relation)
213{
214 unsigned int newstate = 0;
215 struct cpufreq_freqs freqs;
216
217 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
218 return -EINVAL;
219
220 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
221 freqs.new = speedstep_freqs[newstate].frequency;
222	freqs.cpu = 0; /* speedstep-smi is a UP-only driver */
223
224 if (freqs.old == freqs.new)
225 return 0;
226
227 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
228 speedstep_set_state(newstate);
229 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
230
231 return 0;
232}
233
234
235/**
236 * speedstep_verify - verifies a new CPUFreq policy
237 * @policy: new policy
238 *
239 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
240 * at least one border included.
241 */
242static int speedstep_verify (struct cpufreq_policy *policy)
243{
244 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
245}
246
247
248static int speedstep_cpu_init(struct cpufreq_policy *policy)
249{
250 int result;
251 unsigned int speed,state;
252
253 /* capability check */
254 if (policy->cpu != 0)
255 return -ENODEV;
256
257 result = speedstep_smi_ownership();
258 if (result) {
259		dprintk("failed to acquire ownership of the SMI interface.\n");
260 return -EINVAL;
261 }
262
263 /* detect low and high frequency */
264 result = speedstep_smi_get_freqs(&speedstep_freqs[SPEEDSTEP_LOW].frequency,
265 &speedstep_freqs[SPEEDSTEP_HIGH].frequency);
266 if (result) {
267		/* fall back to the speedstep-lib.c detection mechanism: try both states out */
268 dprintk("could not detect low and high frequencies by SMI call.\n");
269 result = speedstep_get_freqs(speedstep_processor,
270 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
271 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
272 &speedstep_set_state);
273
274 if (result) {
275 dprintk("could not detect two different speeds -- aborting.\n");
276 return result;
277 } else
278 dprintk("workaround worked.\n");
279 }
280
281 /* get current speed setting */
282 state = speedstep_get_state();
283 speed = speedstep_freqs[state].frequency;
284
285 dprintk("currently at %s speed setting - %i MHz\n",
286 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
287 (speed / 1000));
288
289 /* cpuinfo and default policy values */
290 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
291 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
292 policy->cur = speed;
293
294 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
295 if (result)
296 return (result);
297
298 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
299
300 return 0;
301}
302
303static int speedstep_cpu_exit(struct cpufreq_policy *policy)
304{
305 cpufreq_frequency_table_put_attr(policy->cpu);
306 return 0;
307}
308
309static unsigned int speedstep_get(unsigned int cpu)
310{
311 if (cpu)
312 return -ENODEV;
313 return speedstep_get_processor_frequency(speedstep_processor);
314}
315
316
317static int speedstep_resume(struct cpufreq_policy *policy)
318{
319 int result = speedstep_smi_ownership();
320
321 if (result)
322		dprintk("failed to re-acquire ownership of the SMI interface.\n");
323
324 return result;
325}
326
327static struct freq_attr* speedstep_attr[] = {
328 &cpufreq_freq_attr_scaling_available_freqs,
329 NULL,
330};
331
332static struct cpufreq_driver speedstep_driver = {
333 .name = "speedstep-smi",
334 .verify = speedstep_verify,
335 .target = speedstep_target,
336 .init = speedstep_cpu_init,
337 .exit = speedstep_cpu_exit,
338 .get = speedstep_get,
339 .resume = speedstep_resume,
340 .owner = THIS_MODULE,
341 .attr = speedstep_attr,
342};
343
344/**
345 * speedstep_init - initializes the SpeedStep CPUFreq driver
346 *
347 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
348 * BIOS, -EINVAL on problems during initialization, and zero on
349 * success.
350 */
351static int __init speedstep_init(void)
352{
353 speedstep_processor = speedstep_detect_processor();
354
355 switch (speedstep_processor) {
356 case SPEEDSTEP_PROCESSOR_PIII_T:
357 case SPEEDSTEP_PROCESSOR_PIII_C:
358 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
359 break;
360 default:
361 speedstep_processor = 0;
362 }
363
364 if (!speedstep_processor) {
365 dprintk ("No supported Intel CPU detected.\n");
366 return -ENODEV;
367 }
368
369 dprintk("signature:0x%.8lx, command:0x%.8lx, event:0x%.8lx, perf_level:0x%.8lx.\n",
370 ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level);
371
372
373 /* Error if no IST-SMI BIOS or no PARM
374 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
375 if ((ist_info.signature != 0x47534943) && (
376 (smi_port == 0) || (smi_cmd == 0)))
377 return -ENODEV;
378
379 if (smi_sig == 1)
380 smi_sig = 0x47534943;
381 else
382 smi_sig = ist_info.signature;
383
384	/* set up smi_port from the module parameter or the BIOS */
385 if ((smi_port > 0xff) || (smi_port < 0)) {
386 return -EINVAL;
387 } else if (smi_port == 0) {
388 smi_port = ist_info.command & 0xff;
389 }
390
391 if ((smi_cmd > 0xff) || (smi_cmd < 0)) {
392 return -EINVAL;
393 } else if (smi_cmd == 0) {
394 smi_cmd = (ist_info.command >> 16) & 0xff;
395 }
396
397 return cpufreq_register_driver(&speedstep_driver);
398}
399
400
401/**
402 * speedstep_exit - unregisters SpeedStep support
403 *
404 * Unregisters SpeedStep support.
405 */
406static void __exit speedstep_exit(void)
407{
408 cpufreq_unregister_driver(&speedstep_driver);
409}
410
411module_param(smi_port, int, 0444);
412module_param(smi_cmd, int, 0444);
413module_param(smi_sig, uint, 0444);
414
415MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value -- Intel's default setting is 0xb2");
416MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value -- Intel's default setting is 0x82");
417MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the SMI interface.");
418
419MODULE_AUTHOR ("Hiroshi Miura");
420MODULE_DESCRIPTION ("Speedstep driver for IST applet SMI interface.");
421MODULE_LICENSE ("GPL");
422
423module_init(speedstep_init);
424module_exit(speedstep_exit);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
new file mode 100644
index 000000000000..ba4b01138c8f
--- /dev/null
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -0,0 +1,439 @@
1#include <linux/init.h>
2#include <linux/bitops.h>
3#include <linux/delay.h>
4#include <linux/pci.h>
5#include <asm/dma.h>
6#include <asm/io.h>
7#include <asm/processor.h>
8#include <asm/timer.h>
9
10#include "cpu.h"
11
12/*
13 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info about the CPU
14 */
15static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
16{
17 unsigned char ccr2, ccr3;
18 unsigned long flags;
19
20 /* we test for DEVID by checking whether CCR3 is writable */
21 local_irq_save(flags);
22 ccr3 = getCx86(CX86_CCR3);
23 setCx86(CX86_CCR3, ccr3 ^ 0x80);
24 getCx86(0xc0); /* dummy to change bus */
25
26 if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */
27 ccr2 = getCx86(CX86_CCR2);
28 setCx86(CX86_CCR2, ccr2 ^ 0x04);
29 getCx86(0xc0); /* dummy */
30
31 if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */
32 *dir0 = 0xfd;
33 else { /* Cx486S A step */
34 setCx86(CX86_CCR2, ccr2);
35 *dir0 = 0xfe;
36 }
37 }
38 else {
39 setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
40
41 /* read DIR0 and DIR1 CPU registers */
42 *dir0 = getCx86(CX86_DIR0);
43 *dir1 = getCx86(CX86_DIR1);
44 }
45 local_irq_restore(flags);
46}
47
48/*
49 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
50 * order to identify the Cyrix CPU model after we're out of setup.c
51 *
52 * Actually since bugs.h doesn't even reference this perhaps someone should
53 * fix the documentation ???
54 */
55static unsigned char Cx86_dir0_msb __initdata = 0;
56
57static char Cx86_model[][9] __initdata = {
58 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
59 "M II ", "Unknown"
60};
61static char Cx486_name[][5] __initdata = {
62 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
63 "SRx2", "DRx2"
64};
65static char Cx486S_name[][4] __initdata = {
66 "S", "S2", "Se", "S2e"
67};
68static char Cx486D_name[][4] __initdata = {
69 "DX", "DX2", "?", "?", "?", "DX4"
70};
71static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock";
72static char cyrix_model_mult1[] __initdata = "12??43";
73static char cyrix_model_mult2[] __initdata = "12233445";
74
75/*
76 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
77 * BIOSes for compatibility with DOS games. This makes the udelay loop
78 * work correctly, and improves performance.
79 *
80 * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
81 */
82
83extern void calibrate_delay(void) __init;
84
85static void __init check_cx686_slop(struct cpuinfo_x86 *c)
86{
87 unsigned long flags;
88
89 if (Cx86_dir0_msb == 3) {
90 unsigned char ccr3, ccr5;
91
92 local_irq_save(flags);
93 ccr3 = getCx86(CX86_CCR3);
94 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
95 ccr5 = getCx86(CX86_CCR5);
96 if (ccr5 & 2)
97 setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */
98 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
99 local_irq_restore(flags);
100
101 if (ccr5 & 2) { /* possible wrong calibration done */
102 printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
103 calibrate_delay();
104 c->loops_per_jiffy = loops_per_jiffy;
105 }
106 }
107}
108
109
110static void __init set_cx86_reorder(void)
111{
112 u8 ccr3;
113
114 printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
115 ccr3 = getCx86(CX86_CCR3);
116 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN  */
117
118 /* Load/Store Serialize to mem access disable (=reorder it)  */
119 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
120 /* set load/store serialize from 1GB to 4GB */
121 ccr3 |= 0xe0;
122 setCx86(CX86_CCR3, ccr3);
123}
124
125static void __init set_cx86_memwb(void)
126{
127 u32 cr0;
128
129 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
130
131 /* CCR2 bit 2: unlock NW bit */
132 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
133 /* set 'Not Write-through' */
134 cr0 = 0x20000000;
135 __asm__("movl %%cr0,%%eax\n\t"
136 "orl %0,%%eax\n\t"
137 "movl %%eax,%%cr0\n"
138 : : "r" (cr0)
139 :"ax");
140 /* CCR2 bit 2: lock NW bit and set WT1 */
141 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
142}
143
144static void __init set_cx86_inc(void)
145{
146 unsigned char ccr3;
147
148 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
149
150 ccr3 = getCx86(CX86_CCR3);
151 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN  */
152 /* PCR1 -- Performance Control */
153 /* Incrementor on, whatever that is */
154 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
155 /* PCR0 -- Performance Control */
156 /* Incrementor Margin 10 */
157 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
158 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
159}
160
161/*
162 * Configure later MediaGX and/or Geode processor.
163 */
164
165static void __init geode_configure(void)
166{
167 unsigned long flags;
168 u8 ccr3, ccr4;
169 local_irq_save(flags);
170
171 /* Suspend on halt power saving and enable #SUSP pin */
172 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
173
174 ccr3 = getCx86(CX86_CCR3);
175 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */
176
177 ccr4 = getCx86(CX86_CCR4);
178 ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */
179
180 setCx86(CX86_CCR3, ccr3);
181
182 set_cx86_memwb();
183 set_cx86_reorder();
184 set_cx86_inc();
185
186 local_irq_restore(flags);
187}
188
189
190#ifdef CONFIG_PCI
191static struct pci_device_id cyrix_55x0[] = {
192 { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
193 { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
194 { },
195};
196#endif
197
198static void __init init_cyrix(struct cpuinfo_x86 *c)
199{
200 unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
201 char *buf = c->x86_model_id;
202 const char *p = NULL;
203
204 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
205 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
206 clear_bit(0*32+31, c->x86_capability);
207
208 /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
209 if ( test_bit(1*32+24, c->x86_capability) ) {
210 clear_bit(1*32+24, c->x86_capability);
211 set_bit(X86_FEATURE_CXMMX, c->x86_capability);
212 }
213
214 do_cyrix_devid(&dir0, &dir1);
215
216 check_cx686_slop(c);
217
218 Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */
219 dir0_lsn = dir0 & 0xf; /* model or clock multiplier */
220
221 /* common case step number/rev -- exceptions handled below */
222 c->x86_model = (dir1 >> 4) + 1;
223 c->x86_mask = dir1 & 0xf;
224
225 /* Now cook; the original recipe is by Channing Corn, from Cyrix.
226 * We do the same thing for each generation: we work out
227 * the model, multiplier and stepping. Black magic included,
228 * to make the silicon step/rev numbers match the printed ones.
229 */
230
231 switch (dir0_msn) {
232 unsigned char tmp;
233
234 case 0: /* Cx486SLC/DLC/SRx/DRx */
235 p = Cx486_name[dir0_lsn & 7];
236 break;
237
238 case 1: /* Cx486S/DX/DX2/DX4 */
239 p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5]
240 : Cx486S_name[dir0_lsn & 3];
241 break;
242
243 case 2: /* 5x86 */
244 Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
245 p = Cx86_cb+2;
246 break;
247
248 case 3: /* 6x86/6x86L */
249 Cx86_cb[1] = ' ';
250 Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
251 if (dir1 > 0x21) { /* 686L */
252 Cx86_cb[0] = 'L';
253 p = Cx86_cb;
254 (c->x86_model)++;
255 } else /* 686 */
256 p = Cx86_cb+1;
257 /* Emulate MTRRs using Cyrix's ARRs. */
258 set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
259 /* 6x86's contain this bug */
260 c->coma_bug = 1;
261 break;
262
263 case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
264#ifdef CONFIG_PCI
265 /* It isn't really a PCI quirk directly, but the cure is the
266 same. The MediaGX has deep magic SMM stuff that handles the
267	   SB emulation. It throws away the FIFO on disable_dma(), which
268 is wrong and ruins the audio.
269
270 Bug2: VSA1 has a wrap bug so that using maximum sized DMA
271 causes bad things. According to NatSemi VSA2 has another
272 bug to do with 'hlt'. I've not seen any boards using VSA2
273 and X doesn't seem to support it either so who cares 8).
274 VSA1 we work around however.
275 */
276
277 printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
278 isa_dma_bridge_buggy = 2;
279#endif
280		c->x86_cache_size = 16;	/* Yep, 16K integrated cache, that's it */
281
282 /*
283 * The 5510/5520 companion chips have a funky PIT.
284 */
285 if (pci_dev_present(cyrix_55x0))
286 pit_latch_buggy = 1;
287
288		/* GXm supports extended cpuid levels a la AMD */
289 if (c->cpuid_level == 2) {
290 /* Enable cxMMX extensions (GX1 Datasheet 54) */
291 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
292
293 /* GXlv/GXm/GX1 */
294 if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63)
295 geode_configure();
296 get_model_name(c); /* get CPU marketing name */
297 return;
298 }
299 else { /* MediaGX */
300 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
301 p = Cx86_cb+2;
302 c->x86_model = (dir1 & 0x20) ? 1 : 2;
303 }
304 break;
305
306 case 5: /* 6x86MX/M II */
307 if (dir1 > 7)
308 {
309 dir0_msn++; /* M II */
310 /* Enable MMX extensions (App note 108) */
311 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
312 }
313 else
314 {
315 c->coma_bug = 1; /* 6x86MX, it has the bug. */
316 }
317 tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
318 Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
319 p = Cx86_cb+tmp;
320 if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
321 (c->x86_model)++;
322 /* Emulate MTRRs using Cyrix's ARRs. */
323 set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
324 break;
325
326 case 0xf: /* Cyrix 486 without DEVID registers */
327 switch (dir0_lsn) {
328 case 0xd: /* either a 486SLC or DLC w/o DEVID */
329 dir0_msn = 0;
330 p = Cx486_name[(c->hard_math) ? 1 : 0];
331 break;
332
333 case 0xe: /* a 486S A step */
334 dir0_msn = 0;
335 p = Cx486S_name[0];
336 break;
337 }
338 break;
339
340 default: /* unknown (shouldn't happen, we know everyone ;-) */
341 dir0_msn = 7;
342 break;
343 }
344 strcpy(buf, Cx86_model[dir0_msn & 7]);
345 if (p) strcat(buf, p);
346 return;
347}
348
349/*
350 * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
351 * by the fact that they preserve the flags across the division of 5/2.
352 * PII and PPro exhibit this behavior too, but they have cpuid available.
353 */
354
355/*
356 * Perform the Cyrix 5/2 test. A Cyrix won't change
357 * the flags, while other 486 chips will.
358 */
359static inline int test_cyrix_52div(void)
360{
361 unsigned int test;
362
363 __asm__ __volatile__(
364 "sahf\n\t" /* clear flags (%eax = 0x0005) */
365 "div %b2\n\t" /* divide 5 by 2 */
366 "lahf" /* store flags into %ah */
367 : "=a" (test)
368 : "0" (5), "q" (2)
369 : "cc");
370
371 /* AH is 0x02 on Cyrix after the divide.. */
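	/* (0x02 means only the always-set EFLAGS bit 1 is present in AH,
	 * i.e. SF/ZF/AF/PF/CF are still clear and the divide left the
	 * flags alone) */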
372 return (unsigned char) (test >> 8) == 0x02;
373}
374
375static void cyrix_identify(struct cpuinfo_x86 * c)
376{
377 /* Detect Cyrix with disabled CPUID */
378 if ( c->x86 == 4 && test_cyrix_52div() ) {
379 unsigned char dir0, dir1;
380
381 strcpy(c->x86_vendor_id, "CyrixInstead");
382 c->x86_vendor = X86_VENDOR_CYRIX;
383
384 /* Actually enable cpuid on the older cyrix */
385
386 /* Retrieve CPU revisions */
387
388 do_cyrix_devid(&dir0, &dir1);
389
390 dir0>>=4;
391
392 /* Check it is an affected model */
393
394 if (dir0 == 5 || dir0 == 3)
395 {
396 unsigned char ccr3, ccr4;
397 unsigned long flags;
398 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
399 local_irq_save(flags);
400 ccr3 = getCx86(CX86_CCR3);
401 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
402 ccr4 = getCx86(CX86_CCR4);
403 setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */
404 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
405 local_irq_restore(flags);
406 }
407 }
408 generic_identify(c);
409}
410
411static struct cpu_dev cyrix_cpu_dev __initdata = {
412 .c_vendor = "Cyrix",
413 .c_ident = { "CyrixInstead" },
414 .c_init = init_cyrix,
415 .c_identify = cyrix_identify,
416};
417
418int __init cyrix_init_cpu(void)
419{
420 cpu_devs[X86_VENDOR_CYRIX] = &cyrix_cpu_dev;
421 return 0;
422}
423
424//early_arch_initcall(cyrix_init_cpu);
425
426static struct cpu_dev nsc_cpu_dev __initdata = {
427 .c_vendor = "NSC",
428 .c_ident = { "Geode by NSC" },
429 .c_init = init_cyrix,
430 .c_identify = generic_identify,
431};
432
433int __init nsc_init_cpu(void)
434{
435 cpu_devs[X86_VENDOR_NSC] = &nsc_cpu_dev;
436 return 0;
437}
438
439//early_arch_initcall(nsc_init_cpu);
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
new file mode 100644
index 000000000000..b8d847b850dc
--- /dev/null
+++ b/arch/i386/kernel/cpu/intel.c
@@ -0,0 +1,248 @@
1#include <linux/config.h>
2#include <linux/init.h>
3#include <linux/kernel.h>
4
5#include <linux/string.h>
6#include <linux/bitops.h>
7#include <linux/smp.h>
8#include <linux/thread_info.h>
9
10#include <asm/processor.h>
11#include <asm/msr.h>
12#include <asm/uaccess.h>
13
14#include "cpu.h"
15
16#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h>
18#include <asm/apic.h>
19#include <mach_apic.h>
20#endif
21
22extern int trap_init_f00f_bug(void);
23
24#ifdef CONFIG_X86_INTEL_USERCOPY
25/*
26 * Alignment at which movsl is preferred for bulk memory copies.
27 */
28struct movsl_mask movsl_mask;
29#endif
30
31void __init early_intel_workaround(struct cpuinfo_x86 *c)
32{
33 if (c->x86_vendor != X86_VENDOR_INTEL)
34 return;
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38}
39
40/*
41 * Early probe support logic for ppro memory erratum #50
42 *
43 * This is called before we do cpu ident work
44 */
45
46int __init ppro_with_ram_bug(void)
47{
48 /* Uses data from early_cpu_detect now */
49 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
50 boot_cpu_data.x86 == 6 &&
51 boot_cpu_data.x86_model == 1 &&
52 boot_cpu_data.x86_mask < 8) {
53 printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
54 return 1;
55 }
56 return 0;
57}
58
59
60/*
61 * P4 Xeon errata 037 workaround.
62 * Hardware prefetcher may cause stale data to be loaded into the cache.
63 */
64static void __init Intel_errata_workarounds(struct cpuinfo_x86 *c)
65{
66 unsigned long lo, hi;
67
68 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
69 rdmsr (MSR_IA32_MISC_ENABLE, lo, hi);
70 if ((lo & (1<<9)) == 0) {
71 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
72 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
73 lo |= (1<<9); /* Disable hw prefetching */
74 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
75 }
76 }
77}
78
79
80static void __init init_intel(struct cpuinfo_x86 *c)
81{
82 unsigned int l2 = 0;
83 char *p = NULL;
84
85#ifdef CONFIG_X86_F00F_BUG
86 /*
87 * All current models of Pentium and Pentium with MMX technology CPUs
88 * have the F0 0F bug, which lets nonprivileged users lock up the system.
89 * Note that the workaround only should be initialized once...
90 */
91 c->f00f_bug = 0;
92 if ( c->x86 == 5 ) {
93 static int f00f_workaround_enabled = 0;
94
95 c->f00f_bug = 1;
96 if ( !f00f_workaround_enabled ) {
97 trap_init_f00f_bug();
98 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
99 f00f_workaround_enabled = 1;
100 }
101 }
102#endif
103
104 select_idle_routine(c);
105 l2 = init_intel_cacheinfo(c);
106
107 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
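	/* 0x633 packs family 6, model 3, stepping 3 in the same
	 * (family << 8 | model << 4 | stepping) layout as the test below */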
108 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
109 clear_bit(X86_FEATURE_SEP, c->x86_capability);
110
111 /* Names for the Pentium II/Celeron processors
112 detectable only by also checking the cache size.
113 Dixon is NOT a Celeron. */
114 if (c->x86 == 6) {
115 switch (c->x86_model) {
116 case 5:
117 if (c->x86_mask == 0) {
118 if (l2 == 0)
119 p = "Celeron (Covington)";
120 else if (l2 == 256)
121 p = "Mobile Pentium II (Dixon)";
122 }
123 break;
124
125 case 6:
126 if (l2 == 128)
127 p = "Celeron (Mendocino)";
128 else if (c->x86_mask == 0 || c->x86_mask == 5)
129 p = "Celeron-A";
130 break;
131
132 case 8:
133 if (l2 == 128)
134 p = "Celeron (Coppermine)";
135 break;
136 }
137 }
138
139 if ( p )
140 strcpy(c->x86_model_id, p);
141
142 detect_ht(c);
143
144 /* Work around errata */
145 Intel_errata_workarounds(c);
146
147#ifdef CONFIG_X86_INTEL_USERCOPY
148 /*
149 * Set up the preferred alignment for movsl bulk memory moves
150 */
151 switch (c->x86) {
152 case 4: /* 486: untested */
153 break;
154 case 5: /* Old Pentia: untested */
155 break;
156 case 6: /* PII/PIII only like movsl with 8-byte alignment */
157 movsl_mask.mask = 7;
158 break;
159 case 15: /* P4 is OK down to 8-byte alignment */
160 movsl_mask.mask = 7;
161 break;
162 }
163#endif
164
165 if (c->x86 == 15)
166 set_bit(X86_FEATURE_P4, c->x86_capability);
167 if (c->x86 == 6)
168 set_bit(X86_FEATURE_P3, c->x86_capability);
169}
170
171
172static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
173{
174 /* Intel PIII Tualatin. This comes in two flavours.
175 * One has 256kb of cache, the other 512. We have no way
176 * to determine which, so we use a boottime override
177 * for the 512kb model, and assume 256 otherwise.
178 */
179 if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
180 size = 256;
181 return size;
182}
183
184static struct cpu_dev intel_cpu_dev __initdata = {
185 .c_vendor = "Intel",
186 .c_ident = { "GenuineIntel" },
187 .c_models = {
188 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
189 {
190 [0] = "486 DX-25/33",
191 [1] = "486 DX-50",
192 [2] = "486 SX",
193 [3] = "486 DX/2",
194 [4] = "486 SL",
195 [5] = "486 SX/2",
196 [7] = "486 DX/2-WB",
197 [8] = "486 DX/4",
198 [9] = "486 DX/4-WB"
199 }
200 },
201 { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
202 {
203 [0] = "Pentium 60/66 A-step",
204 [1] = "Pentium 60/66",
205 [2] = "Pentium 75 - 200",
206 [3] = "OverDrive PODP5V83",
207 [4] = "Pentium MMX",
208 [7] = "Mobile Pentium 75 - 200",
209 [8] = "Mobile Pentium MMX"
210 }
211 },
212 { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
213 {
214 [0] = "Pentium Pro A-step",
215 [1] = "Pentium Pro",
216 [3] = "Pentium II (Klamath)",
217 [4] = "Pentium II (Deschutes)",
218 [5] = "Pentium II (Deschutes)",
219 [6] = "Mobile Pentium II",
220 [7] = "Pentium III (Katmai)",
221 [8] = "Pentium III (Coppermine)",
222 [10] = "Pentium III (Cascades)",
223 [11] = "Pentium III (Tualatin)",
224 }
225 },
226 { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
227 {
228 [0] = "Pentium 4 (Unknown)",
229 [1] = "Pentium 4 (Willamette)",
230 [2] = "Pentium 4 (Northwood)",
231 [4] = "Pentium 4 (Foster)",
232 [5] = "Pentium 4 (Foster)",
233 }
234 },
235 },
236 .c_init = init_intel,
237 .c_identify = generic_identify,
238 .c_size_cache = intel_size_cache,
239};
240
241__init int intel_cpu_init(void)
242{
243 cpu_devs[X86_VENDOR_INTEL] = &intel_cpu_dev;
244 return 0;
245}
246
247// arch_initcall(intel_cpu_init);
248
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
new file mode 100644
index 000000000000..aeb5b4ef8c8b
--- /dev/null
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -0,0 +1,598 @@
1/*
 2 * Routines to identify caches on Intel CPUs.
3 *
4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 */
7
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/device.h>
11#include <linux/compiler.h>
12#include <linux/cpu.h>
13
14#include <asm/processor.h>
15#include <asm/smp.h>
16
17#define LVL_1_INST 1
18#define LVL_1_DATA 2
19#define LVL_2 3
20#define LVL_3 4
21#define LVL_TRACE 5
22
23struct _cache_table
24{
25 unsigned char descriptor;
26 char cache_type;
27 short size;
28};
29
30/* all the cache descriptor types we care about (no TLB entries) */
31static struct _cache_table cache_table[] __initdata =
32{
33 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
34 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
35 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
36 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
37 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
38 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
39 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */
40 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */
41 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
42 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
43 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
44 { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
45 { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
46 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
47 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
48 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
49 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */
50 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */
51 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
52 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
53 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
54 { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
55 { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
56 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
57 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
58 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */
59 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
60 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
61 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
62 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
63 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */
64 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
65 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
66 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
67 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */
68 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */
69 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
70 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */
71 { 0x00, 0, 0}
72};
73
74
75enum _cache_type
76{
77 CACHE_TYPE_NULL = 0,
78 CACHE_TYPE_DATA = 1,
79 CACHE_TYPE_INST = 2,
80 CACHE_TYPE_UNIFIED = 3
81};
82
83union _cpuid4_leaf_eax {
84 struct {
85 enum _cache_type type:5;
86 unsigned int level:3;
87 unsigned int is_self_initializing:1;
88 unsigned int is_fully_associative:1;
89 unsigned int reserved:4;
90 unsigned int num_threads_sharing:12;
91 unsigned int num_cores_on_die:6;
92 } split;
93 u32 full;
94};
95
96union _cpuid4_leaf_ebx {
97 struct {
98 unsigned int coherency_line_size:12;
99 unsigned int physical_line_partition:10;
100 unsigned int ways_of_associativity:10;
101 } split;
102 u32 full;
103};
104
105union _cpuid4_leaf_ecx {
106 struct {
107 unsigned int number_of_sets:32;
108 } split;
109 u32 full;
110};
111
112struct _cpuid4_info {
113 union _cpuid4_leaf_eax eax;
114 union _cpuid4_leaf_ebx ebx;
115 union _cpuid4_leaf_ecx ecx;
116 unsigned long size;
117 cpumask_t shared_cpu_map;
118};
119
120#define MAX_CACHE_LEAVES 4
121static unsigned short __devinitdata num_cache_leaves;
122
123static int __devinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
124{
125 unsigned int eax, ebx, ecx, edx;
126 union _cpuid4_leaf_eax cache_eax;
127
128 cpuid_count(4, index, &eax, &ebx, &ecx, &edx);
129 cache_eax.full = eax;
130 if (cache_eax.split.type == CACHE_TYPE_NULL)
131 return -1;
132
133 this_leaf->eax.full = eax;
134 this_leaf->ebx.full = ebx;
135 this_leaf->ecx.full = ecx;
136 this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) *
137 (this_leaf->ebx.split.coherency_line_size + 1) *
138 (this_leaf->ebx.split.physical_line_partition + 1) *
139 (this_leaf->ebx.split.ways_of_associativity + 1);
140 return 0;
141}
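
/*
 * Worked example of the size arithmetic above (illustrative field values,
 * not taken from any particular CPU): number_of_sets = 63,
 * coherency_line_size = 63, physical_line_partition = 0 and
 * ways_of_associativity = 7 give (63+1) * (63+1) * (0+1) * (7+1)
 * = 32768 bytes, i.e. a 32K cache.
 */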
142
143static int __init find_num_cache_leaves(void)
144{
145 unsigned int eax, ebx, ecx, edx;
146 union _cpuid4_leaf_eax cache_eax;
147 int i;
148 int retval;
149
150 retval = MAX_CACHE_LEAVES;
151 /* Do cpuid(4) loop to find out num_cache_leaves */
152 for (i = 0; i < MAX_CACHE_LEAVES; i++) {
153 cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
154 cache_eax.full = eax;
155 if (cache_eax.split.type == CACHE_TYPE_NULL) {
156 retval = i;
157 break;
158 }
159 }
160 return retval;
161}
162
163unsigned int __init init_intel_cacheinfo(struct cpuinfo_x86 *c)
164{
165 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
166 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
167 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
168
169 if (c->cpuid_level > 4) {
170 static int is_initialized;
171
172 if (is_initialized == 0) {
173 /* Init num_cache_leaves from boot CPU */
174 num_cache_leaves = find_num_cache_leaves();
175 is_initialized++;
176 }
177
178 /*
179	 * Whenever possible, use the cpuid(4) deterministic cache
180	 * parameters leaf to find the cache details
181 */
182 for (i = 0; i < num_cache_leaves; i++) {
183 struct _cpuid4_info this_leaf;
184
185 int retval;
186
187 retval = cpuid4_cache_lookup(i, &this_leaf);
188 if (retval >= 0) {
189 switch(this_leaf.eax.split.level) {
190 case 1:
191 if (this_leaf.eax.split.type ==
192 CACHE_TYPE_DATA)
193 new_l1d = this_leaf.size/1024;
194 else if (this_leaf.eax.split.type ==
195 CACHE_TYPE_INST)
196 new_l1i = this_leaf.size/1024;
197 break;
198 case 2:
199 new_l2 = this_leaf.size/1024;
200 break;
201 case 3:
202 new_l3 = this_leaf.size/1024;
203 break;
204 default:
205 break;
206 }
207 }
208 }
209 }
210 if (c->cpuid_level > 1) {
211 /* supports eax=2 call */
212 int i, j, n;
213 int regs[4];
214 unsigned char *dp = (unsigned char *)regs;
215
216 /* Number of times to iterate */
217 n = cpuid_eax(2) & 0xFF;
218
219 for ( i = 0 ; i < n ; i++ ) {
220 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
221
222 /* If bit 31 is set, this is an unknown format */
223 for ( j = 0 ; j < 3 ; j++ ) {
224 if ( regs[j] < 0 ) regs[j] = 0;
225 }
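			/* regs[j] < 0 is just a signed test of bit 31: a set
			 * bit 31 means the register carries no valid descriptor
			 * bytes, so it is zeroed and ignored by the lookup below. */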
226
227 /* Byte 0 is level count, not a descriptor */
228 for ( j = 1 ; j < 16 ; j++ ) {
229 unsigned char des = dp[j];
230 unsigned char k = 0;
231
232 /* look up this descriptor in the table */
233 while (cache_table[k].descriptor != 0)
234 {
235 if (cache_table[k].descriptor == des) {
236 switch (cache_table[k].cache_type) {
237 case LVL_1_INST:
238 l1i += cache_table[k].size;
239 break;
240 case LVL_1_DATA:
241 l1d += cache_table[k].size;
242 break;
243 case LVL_2:
244 l2 += cache_table[k].size;
245 break;
246 case LVL_3:
247 l3 += cache_table[k].size;
248 break;
249 case LVL_TRACE:
250 trace += cache_table[k].size;
251 break;
252 }
253
254 break;
255 }
256
257 k++;
258 }
259 }
260 }
261
262 if (new_l1d)
263 l1d = new_l1d;
264
265 if (new_l1i)
266 l1i = new_l1i;
267
268 if (new_l2)
269 l2 = new_l2;
270
271 if (new_l3)
272 l3 = new_l3;
273
274 if ( trace )
275 printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
276 else if ( l1i )
277 printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
278 if ( l1d )
279 printk(", L1 D cache: %dK\n", l1d);
280 else
281 printk("\n");
282 if ( l2 )
283 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
284 if ( l3 )
285 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
286
287 /*
288 * This assumes the L3 cache is shared; it typically lives in
289	 * the northbridge. The L1 caches are included in the L2
290	 * cache, and so should not be counted for the purpose of
291 * SMP switching weights.
292 */
293 c->x86_cache_size = l2 ? l2 : (l1i+l1d);
294 }
295
296 return l2;
297}
298
299/* pointer to _cpuid4_info array (for each cache leaf) */
300static struct _cpuid4_info *cpuid4_info[NR_CPUS];
301#define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y]))
302
303#ifdef CONFIG_SMP
304static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
305{
306 struct _cpuid4_info *this_leaf;
307 unsigned long num_threads_sharing;
308
309 this_leaf = CPUID4_INFO_IDX(cpu, index);
310 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
311
312 if (num_threads_sharing == 1)
313 cpu_set(cpu, this_leaf->shared_cpu_map);
314#ifdef CONFIG_X86_HT
315 else if (num_threads_sharing == smp_num_siblings)
316 this_leaf->shared_cpu_map = cpu_sibling_map[cpu];
317#endif
318 else
319 printk(KERN_INFO "Number of CPUs sharing cache didn't match "
320 "any known set of CPUs\n");
321}
322#else
323static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {}
324#endif
325
326static void free_cache_attributes(unsigned int cpu)
327{
328 kfree(cpuid4_info[cpu]);
329 cpuid4_info[cpu] = NULL;
330}
331
332static int __devinit detect_cache_attributes(unsigned int cpu)
333{
334 struct _cpuid4_info *this_leaf;
335 unsigned long j;
336 int retval;
337
338 if (num_cache_leaves == 0)
339 return -ENOENT;
340
341 cpuid4_info[cpu] = kmalloc(
342 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
343 if (unlikely(cpuid4_info[cpu] == NULL))
344 return -ENOMEM;
345 memset(cpuid4_info[cpu], 0,
346 sizeof(struct _cpuid4_info) * num_cache_leaves);
347
348 /* Do cpuid and store the results */
349 for (j = 0; j < num_cache_leaves; j++) {
350 this_leaf = CPUID4_INFO_IDX(cpu, j);
351 retval = cpuid4_cache_lookup(j, this_leaf);
352 if (unlikely(retval < 0))
353 goto err_out;
354 cache_shared_cpu_map_setup(cpu, j);
355 }
356 return 0;
357
358err_out:
359 free_cache_attributes(cpu);
360 return -ENOMEM;
361}
362
363#ifdef CONFIG_SYSFS
364
365#include <linux/kobject.h>
366#include <linux/sysfs.h>
367
368extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
369
370/* pointer to kobject for cpuX/cache */
371static struct kobject * cache_kobject[NR_CPUS];
372
373struct _index_kobject {
374 struct kobject kobj;
375 unsigned int cpu;
376 unsigned short index;
377};
378
379/* pointer to array of kobjects for cpuX/cache/indexY */
380static struct _index_kobject *index_kobject[NR_CPUS];
381#define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y]))
382
383#define show_one_plus(file_name, object, val) \
384static ssize_t show_##file_name \
385 (struct _cpuid4_info *this_leaf, char *buf) \
386{ \
387 return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \
388}
389
390show_one_plus(level, eax.split.level, 0);
391show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1);
392show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
393show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
394show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
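
/* The "+ 1" offsets above undo the minus-one encoding of the cpuid(4)
 * ways/partitions/line-size/sets fields, mirroring the arithmetic in
 * cpuid4_cache_lookup(); the cache level is reported as-is, hence "+ 0". */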
395
396static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
397{
398 return sprintf (buf, "%luK\n", this_leaf->size / 1024);
399}
400
401static ssize_t show_shared_cpu_map(struct _cpuid4_info *this_leaf, char *buf)
402{
403 char mask_str[NR_CPUS];
404 cpumask_scnprintf(mask_str, NR_CPUS, this_leaf->shared_cpu_map);
405 return sprintf(buf, "%s\n", mask_str);
406}
407
408static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
409 switch(this_leaf->eax.split.type) {
410 case CACHE_TYPE_DATA:
411 return sprintf(buf, "Data\n");
412 break;
413 case CACHE_TYPE_INST:
414 return sprintf(buf, "Instruction\n");
415 break;
416 case CACHE_TYPE_UNIFIED:
417 return sprintf(buf, "Unified\n");
418 break;
419 default:
420 return sprintf(buf, "Unknown\n");
421 break;
422 }
423}
424
425struct _cache_attr {
426 struct attribute attr;
427 ssize_t (*show)(struct _cpuid4_info *, char *);
428 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
429};
430
431#define define_one_ro(_name) \
432static struct _cache_attr _name = \
433 __ATTR(_name, 0444, show_##_name, NULL)
434
435define_one_ro(level);
436define_one_ro(type);
437define_one_ro(coherency_line_size);
438define_one_ro(physical_line_partition);
439define_one_ro(ways_of_associativity);
440define_one_ro(number_of_sets);
441define_one_ro(size);
442define_one_ro(shared_cpu_map);
443
444static struct attribute * default_attrs[] = {
445 &type.attr,
446 &level.attr,
447 &coherency_line_size.attr,
448 &physical_line_partition.attr,
449 &ways_of_associativity.attr,
450 &number_of_sets.attr,
451 &size.attr,
452 &shared_cpu_map.attr,
453 NULL
454};
455
456#define to_object(k) container_of(k, struct _index_kobject, kobj)
457#define to_attr(a) container_of(a, struct _cache_attr, attr)
458
459static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
460{
461 struct _cache_attr *fattr = to_attr(attr);
462 struct _index_kobject *this_leaf = to_object(kobj);
463 ssize_t ret;
464
465 ret = fattr->show ?
466 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
467 buf) :
468 0;
469 return ret;
470}
471
472static ssize_t store(struct kobject * kobj, struct attribute * attr,
473 const char * buf, size_t count)
474{
475 return 0;
476}
477
478static struct sysfs_ops sysfs_ops = {
479 .show = show,
480 .store = store,
481};
482
483static struct kobj_type ktype_cache = {
484 .sysfs_ops = &sysfs_ops,
485 .default_attrs = default_attrs,
486};
487
488static struct kobj_type ktype_percpu_entry = {
489 .sysfs_ops = &sysfs_ops,
490};
491
492static void cpuid4_cache_sysfs_exit(unsigned int cpu)
493{
494 kfree(cache_kobject[cpu]);
495 kfree(index_kobject[cpu]);
496 cache_kobject[cpu] = NULL;
497 index_kobject[cpu] = NULL;
498 free_cache_attributes(cpu);
499}
500
501static int __devinit cpuid4_cache_sysfs_init(unsigned int cpu)
502{
503
504 if (num_cache_leaves == 0)
505 return -ENOENT;
506
507 detect_cache_attributes(cpu);
508 if (cpuid4_info[cpu] == NULL)
509 return -ENOENT;
510
511 /* Allocate all required memory */
512 cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL);
513 if (unlikely(cache_kobject[cpu] == NULL))
514 goto err_out;
515 memset(cache_kobject[cpu], 0, sizeof(struct kobject));
516
517 index_kobject[cpu] = kmalloc(
518 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
519 if (unlikely(index_kobject[cpu] == NULL))
520 goto err_out;
521 memset(index_kobject[cpu], 0,
522 sizeof(struct _index_kobject) * num_cache_leaves);
523
524 return 0;
525
526err_out:
527 cpuid4_cache_sysfs_exit(cpu);
528 return -ENOMEM;
529}
530
531/* Add/Remove cache interface for CPU device */
532static int __devinit cache_add_dev(struct sys_device * sys_dev)
533{
534 unsigned int cpu = sys_dev->id;
535 unsigned long i, j;
536 struct _index_kobject *this_object;
537 int retval = 0;
538
539 retval = cpuid4_cache_sysfs_init(cpu);
540 if (unlikely(retval < 0))
541 return retval;
542
543 cache_kobject[cpu]->parent = &sys_dev->kobj;
544 kobject_set_name(cache_kobject[cpu], "%s", "cache");
545 cache_kobject[cpu]->ktype = &ktype_percpu_entry;
546 retval = kobject_register(cache_kobject[cpu]);
547
548 for (i = 0; i < num_cache_leaves; i++) {
549 this_object = INDEX_KOBJECT_PTR(cpu,i);
550 this_object->cpu = cpu;
551 this_object->index = i;
552 this_object->kobj.parent = cache_kobject[cpu];
553 kobject_set_name(&(this_object->kobj), "index%1lu", i);
554 this_object->kobj.ktype = &ktype_cache;
555 retval = kobject_register(&(this_object->kobj));
556 if (unlikely(retval)) {
557 for (j = 0; j < i; j++) {
558 kobject_unregister(
559 &(INDEX_KOBJECT_PTR(cpu,j)->kobj));
560 }
561 kobject_unregister(cache_kobject[cpu]);
562 cpuid4_cache_sysfs_exit(cpu);
563 break;
564 }
565 }
566 return retval;
567}
568
569static int __devexit cache_remove_dev(struct sys_device * sys_dev)
570{
571 unsigned int cpu = sys_dev->id;
572 unsigned long i;
573
574 for (i = 0; i < num_cache_leaves; i++)
575 kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
576 kobject_unregister(cache_kobject[cpu]);
577 cpuid4_cache_sysfs_exit(cpu);
578 return 0;
579}
580
581static struct sysdev_driver cache_sysdev_driver = {
582 .add = cache_add_dev,
583 .remove = __devexit_p(cache_remove_dev),
584};
585
586/* Register/Unregister the cpu_cache driver */
587static int __devinit cache_register_driver(void)
588{
589 if (num_cache_leaves == 0)
590 return 0;
591
592 return sysdev_driver_register(&cpu_sysdev_class,&cache_sysdev_driver);
593}
594
595device_initcall(cache_register_driver);
596
597#endif
598
diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile
new file mode 100644
index 000000000000..30808f3d6715
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/Makefile
@@ -0,0 +1,2 @@
1obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o
2obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
new file mode 100644
index 000000000000..8df52e86c4d2
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -0,0 +1,97 @@
1/*
2 * Athlon/Hammer specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/config.h>
10#include <linux/irq.h>
11#include <linux/interrupt.h>
12#include <linux/smp.h>
13
14#include <asm/processor.h>
15#include <asm/system.h>
16#include <asm/msr.h>
17
18#include "mce.h"
19
20/* Machine Check Handler For AMD Athlon/Duron */
21static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
22{
23 int recover=1;
24 u32 alow, ahigh, high, low;
25 u32 mcgstl, mcgsth;
26 int i;
27
28 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
29 if (mcgstl & (1<<0)) /* Recoverable ? */
30 recover=0;
31
32 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
33 smp_processor_id(), mcgsth, mcgstl);
34
35 for (i=1; i<nr_mce_banks; i++) {
36 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
37 if (high&(1<<31)) {
38 if (high & (1<<29))
39 recover |= 1;
40 if (high & (1<<25))
41 recover |= 2;
42 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
43 high &= ~(1<<31);
44 if (high & (1<<27)) {
45 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 printk ("[%08x%08x]", ahigh, alow);
47 }
48 if (high & (1<<26)) {
49 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 printk (" at %08x%08x", ahigh, alow);
51 }
52 printk ("\n");
53 /* Clear it */
54 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
55 /* Serialize */
56 wmb();
57 add_taint(TAINT_MACHINE_CHECK);
58 }
59 }
60
61 if (recover&2)
62 panic ("CPU context corrupt");
63 if (recover&1)
64 panic ("Unable to continue");
65 printk (KERN_EMERG "Attempting to continue.\n");
66 mcgstl &= ~(1<<2);
67 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
68}
69
70
71/* AMD K7 machine check is Intel like */
72void __init amd_mcheck_init(struct cpuinfo_x86 *c)
73{
74 u32 l, h;
75 int i;
76
77 machine_check_vector = k7_machine_check;
78 wmb();
79
80 printk (KERN_INFO "Intel machine check architecture supported.\n");
81 rdmsr (MSR_IA32_MCG_CAP, l, h);
82 if (l & (1<<8)) /* Control register present ? */
83 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
84 nr_mce_banks = l & 0xff;
85
86	/* Clear status for MC index 0 separately; we don't touch CTL,
87	 * as some Athlons cause spurious MCEs when it's enabled. */
88 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
89 for (i=1; i<nr_mce_banks; i++) {
90 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
91 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
92 }
93
94 set_in_cr4 (X86_CR4_MCE);
95 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
96 smp_processor_id());
97}
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..bf6d1aefafc0
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,77 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/smp.h>
12#include <linux/thread_info.h>
13
14#include <asm/processor.h>
15#include <asm/system.h>
16
17#include "mce.h"
18
19int mce_disabled __initdata = 0;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void __init mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled==1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 if (c->x86==6 || c->x86==15)
42 amd_mcheck_init(c);
43 break;
44
45 case X86_VENDOR_INTEL:
46 if (c->x86==5)
47 intel_p5_mcheck_init(c);
48 if (c->x86==6)
49 intel_p6_mcheck_init(c);
50 if (c->x86==15)
51 intel_p4_mcheck_init(c);
52 break;
53
54 case X86_VENDOR_CENTAUR:
55 if (c->x86==5)
56 winchip_mcheck_init(c);
57 break;
58
59 default:
60 break;
61 }
62}
63
64static int __init mcheck_disable(char *str)
65{
66 mce_disabled = 1;
67 return 0;
68}
69
70static int __init mcheck_enable(char *str)
71{
72 mce_disabled = -1;
73 return 0;
74}
75
76__setup("nomce", mcheck_disable);
77__setup("mce", mcheck_enable);
diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h
new file mode 100644
index 000000000000..dc2416dfef15
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -0,0 +1,14 @@
1#include <linux/init.h>
2
3void amd_mcheck_init(struct cpuinfo_x86 *c);
4void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
5void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
7void winchip_mcheck_init(struct cpuinfo_x86 *c);
8
9/* Call the installed machine check handler for this CPU setup. */
10extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
11
12extern int mce_disabled __initdata;
13extern int nr_mce_banks;
14
diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c
new file mode 100644
index 000000000000..7864ddfccf07
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c
@@ -0,0 +1,93 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/config.h>
15#include <linux/irq.h>
16#include <linux/workqueue.h>
17#include <linux/interrupt.h>
18#include <linux/smp.h>
19#include <linux/module.h>
20
21#include <asm/processor.h>
22#include <asm/system.h>
23#include <asm/msr.h>
24
25#include "mce.h"
26
27static int firstbank;
28
29#define MCE_RATE 15*HZ /* timer rate is 15s */
30
31static void mce_checkregs (void *info)
32{
33 u32 low, high;
34 int i;
35
36 for (i=firstbank; i<nr_mce_banks; i++) {
37 rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
38
39 if (high & (1<<31)) {
40 printk(KERN_INFO "MCE: The hardware reports a non "
41 "fatal, correctable incident occurred on "
42 "CPU %d.\n",
43 smp_processor_id());
44 printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
45
46 /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
47 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
48
49 /* Serialize */
50 wmb();
51 add_taint(TAINT_MACHINE_CHECK);
52 }
53 }
54}
55
56static void mce_work_fn(void *data);
57static DECLARE_WORK(mce_work, mce_work_fn, NULL);
58
59static void mce_work_fn(void *data)
60{
61 on_each_cpu(mce_checkregs, NULL, 1, 1);
62 schedule_delayed_work(&mce_work, MCE_RATE);
63}
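
/*
 * Note that the work item above re-queues itself, so after the initial
 * schedule_delayed_work() in init_nonfatal_mce_checker() the banks are
 * polled on every CPU once per MCE_RATE interval.
 */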
64
65static int __init init_nonfatal_mce_checker(void)
66{
67 struct cpuinfo_x86 *c = &boot_cpu_data;
68
69 /* Check for MCE support */
70 if (!cpu_has(c, X86_FEATURE_MCE))
71 return -ENODEV;
72
73 /* Check for PPro style MCA */
74 if (!cpu_has(c, X86_FEATURE_MCA))
75 return -ENODEV;
76
77 /* Some Athlons misbehave when we frob bank 0 */
78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
79 boot_cpu_data.x86 == 6)
80 firstbank = 1;
81 else
82 firstbank = 0;
83
84 /*
85 * Check for non-fatal errors every MCE_RATE s
86 */
87 schedule_delayed_work(&mce_work, MCE_RATE);
88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89 return 0;
90}
91module_init(init_nonfatal_mce_checker);
92
93MODULE_LICENSE("GPL");
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
new file mode 100644
index 000000000000..8b16ceb929b4
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -0,0 +1,271 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/config.h>
9#include <linux/irq.h>
10#include <linux/interrupt.h>
11#include <linux/smp.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/msr.h>
16#include <asm/apic.h>
17
18#include "mce.h"
19
20/* as supported by the P4/Xeon family */
21struct intel_mce_extended_msrs {
22 u32 eax;
23 u32 ebx;
24 u32 ecx;
25 u32 edx;
26 u32 esi;
27 u32 edi;
28 u32 ebp;
29 u32 esp;
30 u32 eflags;
31 u32 eip;
32 /* u32 *reserved[]; */
33};
34
35static int mce_num_extended_msrs = 0;
36
37
38#ifdef CONFIG_X86_MCE_P4THERMAL
39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK);
44}
45
46/* P4/Xeon Thermal transition interrupt handler */
47static void intel_thermal_interrupt(struct pt_regs *regs)
48{
49 u32 l, h;
50 unsigned int cpu = smp_processor_id();
51 static unsigned long next[NR_CPUS];
52
53 ack_APIC_irq();
54
55 if (time_after(next[cpu], jiffies))
56 return;
57
58 next[cpu] = jiffies + HZ*5;
59 rdmsr(MSR_IA32_THERM_STATUS, l, h);
60 if (l & 0x1) {
61 printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
62 printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
63 cpu);
64 add_taint(TAINT_MACHINE_CHECK);
65 } else {
66 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
67 }
68}
69
70/* Thermal interrupt handler for this CPU setup */
71static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
72
73fastcall void smp_thermal_interrupt(struct pt_regs *regs)
74{
75 irq_enter();
76 vendor_thermal_interrupt(regs);
77 irq_exit();
78}
79
80/* P4/Xeon Thermal regulation detect and init */
81static void __init intel_init_thermal(struct cpuinfo_x86 *c)
82{
83 u32 l, h;
84 unsigned int cpu = smp_processor_id();
85
86 /* Thermal monitoring */
87 if (!cpu_has(c, X86_FEATURE_ACPI))
88 return; /* -ENODEV */
89
90 /* Clock modulation */
91 if (!cpu_has(c, X86_FEATURE_ACC))
92 return; /* -ENODEV */
93
94	/* first check if it's enabled already, in which case there might
95 * be some SMM goo which handles it, so we can't even put a handler
96 * since it might be delivered via SMI already -zwanem.
97 */
98 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
99 h = apic_read(APIC_LVTTHMR);
100 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
101 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
102 cpu);
103 return; /* -EBUSY */
104 }
105
106	/* check whether a vector is already installed (it may be temporarily masked) */
107 if (h & APIC_VECTOR_MASK) {
108 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
109 "installed\n",
110 cpu, (h & APIC_VECTOR_MASK));
111 return; /* -EBUSY */
112 }
113
114 /* The temperature transition interrupt handler setup */
115 h = THERMAL_APIC_VECTOR; /* our delivery vector */
116 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
117 apic_write_around(APIC_LVTTHMR, h);
118
119 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
120 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
121
122 /* ok we're good to go... */
123 vendor_thermal_interrupt = intel_thermal_interrupt;
124
125 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
126 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
127
128 l = apic_read (APIC_LVTTHMR);
129 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
130 printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
131 return;
132}
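
/*
 * In short, the sequence above is: bail out if the thermal monitor or clock
 * modulation features are missing, or if SMM/BIOS already owns the thermal
 * LVT; otherwise install our vector masked, set the two low enable bits in
 * THERM_INTERRUPT, turn the thermal monitor on via MISC_ENABLE bit 3, and
 * only then unmask the LVT entry.
 */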
133#endif /* CONFIG_X86_MCE_P4THERMAL */
134
135
136/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
137static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
138{
139 u32 h;
140
141 if (mce_num_extended_msrs == 0)
142 goto done;
143
144 rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
145 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
146 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
147 rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
148 rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
149 rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
150 rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
151 rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
152 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
153 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
154
155 /* can we rely on kmalloc to do a dynamic
156 * allocation for the reserved registers?
157 */
158done:
159 return mce_num_extended_msrs;
160}
161
162static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
163{
164 int recover=1;
165 u32 alow, ahigh, high, low;
166 u32 mcgstl, mcgsth;
167 int i;
168 struct intel_mce_extended_msrs dbg;
169
170 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
171 if (mcgstl & (1<<0)) /* Recoverable ? */
172 recover=0;
173
174 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
175 smp_processor_id(), mcgsth, mcgstl);
176
177 if (intel_get_extended_msrs(&dbg)) {
178 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
179 smp_processor_id(), dbg.eip, dbg.eflags);
180 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
181 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
182 printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
183 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
184 }
185
186 for (i=0; i<nr_mce_banks; i++) {
187 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
188 if (high & (1<<31)) {
189 if (high & (1<<29))
190 recover |= 1;
191 if (high & (1<<25))
192 recover |= 2;
193 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
194 high &= ~(1<<31);
195 if (high & (1<<27)) {
196 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
197 printk ("[%08x%08x]", ahigh, alow);
198 }
199 if (high & (1<<26)) {
200 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
201 printk (" at %08x%08x", ahigh, alow);
202 }
203 printk ("\n");
204 }
205 }
206
207 if (recover & 2)
208 panic ("CPU context corrupt");
209 if (recover & 1)
210 panic ("Unable to continue");
211
212 printk(KERN_EMERG "Attempting to continue.\n");
213 /*
214 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
215	 * recoverable/continuable. This will allow the BIOS to look at the MSRs
216 * for errors if the OS could not log the error.
217 */
218 for (i=0; i<nr_mce_banks; i++) {
219 u32 msr;
220 msr = MSR_IA32_MC0_STATUS+i*4;
221 rdmsr (msr, low, high);
222 if (high&(1<<31)) {
223 /* Clear it */
224 wrmsr(msr, 0UL, 0UL);
225 /* Serialize */
226 wmb();
227 add_taint(TAINT_MACHINE_CHECK);
228 }
229 }
230 mcgstl &= ~(1<<2);
231 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
232}
233
234
235void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c)
236{
237 u32 l, h;
238 int i;
239
240 machine_check_vector = intel_machine_check;
241 wmb();
242
243 printk (KERN_INFO "Intel machine check architecture supported.\n");
244 rdmsr (MSR_IA32_MCG_CAP, l, h);
245 if (l & (1<<8)) /* Control register present ? */
246 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
247 nr_mce_banks = l & 0xff;
248
249 for (i=0; i<nr_mce_banks; i++) {
250 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
251 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
252 }
253
254 set_in_cr4 (X86_CR4_MCE);
255 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
256 smp_processor_id());
257
258 /* Check for P4/Xeon extended MCE MSRs */
259 rdmsr (MSR_IA32_MCG_CAP, l, h);
260 if (l & (1<<9)) {/* MCG_EXT_P */
261 mce_num_extended_msrs = (l >> 16) & 0xff;
262 printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
263 " available\n",
264 smp_processor_id(), mce_num_extended_msrs);
265
266#ifdef CONFIG_X86_MCE_P4THERMAL
267 /* Check for P4/Xeon Thermal monitor */
268 intel_init_thermal(c);
269#endif
270 }
271}
diff --git a/arch/i386/kernel/cpu/mcheck/p5.c b/arch/i386/kernel/cpu/mcheck/p5.c
new file mode 100644
index 000000000000..c45a1b485c80
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/p5.c
@@ -0,0 +1,54 @@
1/*
2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/irq.h>
10#include <linux/interrupt.h>
11#include <linux/smp.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/msr.h>
16
17#include "mce.h"
18
19/* Machine check handler for Pentium class Intel */
20static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code)
21{
22 u32 loaddr, hi, lotype;
23 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
24 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
25 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
26 if(lotype&(1<<5))
27 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
28 add_taint(TAINT_MACHINE_CHECK);
29}
30
31/* Set up machine check reporting for processors with Intel style MCE */
32void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c)
33{
34 u32 l, h;
35
36	/* Check for MCE support */
37 if( !cpu_has(c, X86_FEATURE_MCE) )
38 return;
39
40	/* Default P5 to off as it's often misconnected */
41 if(mce_disabled != -1)
42 return;
43 machine_check_vector = pentium_machine_check;
44 wmb();
45
46 /* Read registers before enabling */
47 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
48 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
49 printk(KERN_INFO "Intel old style machine check architecture supported.\n");
50
51 /* Enable MCE */
52 set_in_cr4(X86_CR4_MCE);
53 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
54}
diff --git a/arch/i386/kernel/cpu/mcheck/p6.c b/arch/i386/kernel/cpu/mcheck/p6.c
new file mode 100644
index 000000000000..46640f8c2494
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/p6.c
@@ -0,0 +1,115 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/irq.h>
10#include <linux/interrupt.h>
11#include <linux/smp.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/msr.h>
16
17#include "mce.h"
18
19/* Machine Check Handler For PII/PIII */
20static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
21{
22 int recover=1;
23 u32 alow, ahigh, high, low;
24 u32 mcgstl, mcgsth;
25 int i;
26
27 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
28 if (mcgstl & (1<<0)) /* Recoverable ? */
29 recover=0;
30
31 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
32 smp_processor_id(), mcgsth, mcgstl);
33
34 for (i=0; i<nr_mce_banks; i++) {
35 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
36 if (high & (1<<31)) {
37 if (high & (1<<29))
38 recover |= 1;
39 if (high & (1<<25))
40 recover |= 2;
41 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
42 high &= ~(1<<31);
43 if (high & (1<<27)) {
44 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
45 printk ("[%08x%08x]", ahigh, alow);
46 }
47 if (high & (1<<26)) {
48 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
49 printk (" at %08x%08x", ahigh, alow);
50 }
51 printk ("\n");
52 }
53 }
54
55 if (recover & 2)
56 panic ("CPU context corrupt");
57 if (recover & 1)
58 panic ("Unable to continue");
59
60 printk (KERN_EMERG "Attempting to continue.\n");
61 /*
62 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
63	 * recoverable/continuable. This will allow the BIOS to look at the MSRs
64 * for errors if the OS could not log the error.
65 */
66 for (i=0; i<nr_mce_banks; i++) {
67 unsigned int msr;
68 msr = MSR_IA32_MC0_STATUS+i*4;
69 rdmsr (msr,low, high);
70 if (high & (1<<31)) {
71 /* Clear it */
72 wrmsr (msr, 0UL, 0UL);
73 /* Serialize */
74 wmb();
75 add_taint(TAINT_MACHINE_CHECK);
76 }
77 }
78 mcgstl &= ~(1<<2);
79 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
80}
81
82/* Set up machine check reporting for processors with Intel style MCE */
83void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c)
84{
85 u32 l, h;
86 int i;
87
88 /* Check for MCE support */
89 if (!cpu_has(c, X86_FEATURE_MCE))
90 return;
91
92 /* Check for PPro style MCA */
93 if (!cpu_has(c, X86_FEATURE_MCA))
94 return;
95
96 /* Ok machine check is available */
97 machine_check_vector = intel_machine_check;
98 wmb();
99
100 printk (KERN_INFO "Intel machine check architecture supported.\n");
101 rdmsr (MSR_IA32_MCG_CAP, l, h);
102 if (l & (1<<8)) /* Control register present ? */
103 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
104 nr_mce_banks = l & 0xff;
105
106 /* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */
107 for (i=1; i<nr_mce_banks; i++) {
108 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
109 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
110 }
111
112 set_in_cr4 (X86_CR4_MCE);
113 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
114 smp_processor_id());
115}
diff --git a/arch/i386/kernel/cpu/mcheck/winchip.c b/arch/i386/kernel/cpu/mcheck/winchip.c
new file mode 100644
index 000000000000..753fa7acb984
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/winchip.c
@@ -0,0 +1,37 @@
1/*
2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/irq.h>
10#include <linux/interrupt.h>
11
12#include <asm/processor.h>
13#include <asm/system.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18/* Machine check handler for WinChip C6 */
19static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code)
20{
21 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
22 add_taint(TAINT_MACHINE_CHECK);
23}
24
25/* Set up machine check reporting on the Winchip C6 series */
26void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
27{
28 u32 lo, hi;
29 machine_check_vector = winchip_machine_check;
30 wmb();
31 rdmsr(MSR_IDT_FCR1, lo, hi);
32	lo |= (1<<2);	/* Enable EIERRINT (int 18 MCE) */
33	lo &= ~(1<<4);	/* Enable MCE */
34 wrmsr(MSR_IDT_FCR1, lo, hi);
35 set_in_cr4(X86_CR4_MCE);
36 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
37}
diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
new file mode 100644
index 000000000000..a25b701ab84e
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -0,0 +1,5 @@
1obj-y := main.o if.o generic.o state.o
2obj-y += amd.o
3obj-y += cyrix.o
4obj-y += centaur.o
5
diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
new file mode 100644
index 000000000000..1a1e04b6fd00
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -0,0 +1,121 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5
6#include "mtrr.h"
7
8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned int *size, mtrr_type * type)
11{
12 unsigned long low, high;
13
14 rdmsr(MSR_K6_UWCCR, low, high);
15 /* Upper dword is region 1, lower is region 0 */
16 if (reg == 1)
17 low = high;
18	/* The base is 128K aligned; mask off the type and size bits below it */
19 *base = (low & 0xFFFE0000) >> PAGE_SHIFT;
20 *type = 0;
21 if (low & 1)
22 *type = MTRR_TYPE_UNCACHABLE;
23 if (low & 2)
24 *type = MTRR_TYPE_WRCOMB;
25 if (!(low & 3)) {
26 *size = 0;
27 return;
28 }
29 /*
30 * This needs a little explaining. The size is stored as an
31 * inverted mask of bits of 128K granularity 15 bits long offset
32 * 2 bits
33 *
34 * So to get a size we do invert the mask and add 1 to the lowest
35 * mask bit (4 as its 2 bits in). This gives us a size we then shift
36 * to turn into 128K blocks
37 *
38 * eg 111 1111 1111 1100 is 512K
39 *
40 * invert 000 0000 0000 0011
41 * +1 000 0000 0000 0100
42 * *128K ...
43 */
44 low = (~low) & 0x1FFFC;
45 *size = (low + 4) << (15 - PAGE_SHIFT);
46 return;
47}
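
/*
 * Worked example of the decode above (illustrative register contents, 4K
 * pages): for a 512K region the stored mask field is 0x1FFF0, so
 * (~low) & 0x1FFFC = 0xC, plus 4 gives 0x10, and 0x10 << (15 - PAGE_SHIFT)
 * is 128 pages, i.e. 512K.
 */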
48
49static void amd_set_mtrr(unsigned int reg, unsigned long base,
50 unsigned long size, mtrr_type type)
51/* [SUMMARY] Set variable MTRR register on the local CPU.
52 <reg> The register to set.
53 <base> The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region.
56 Note: any serialisation across CPUs is the caller's responsibility;
57 this function only touches the local CPU's register.
58 [RETURNS] Nothing.
59*/
60{
61 u32 regs[2];
62
63 /*
64 * Low is MTRR0 , High MTRR 1
65 */
66 rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
67 /*
68 * Blank to disable
69 */
70 if (size == 0)
71 regs[reg] = 0;
72 else
73 /* Set the register to the base, the type (off by one) and an
74 inverted bitmask of the size The size is the only odd
75 bit. We are fed say 512K We invert this and we get 111 1111
76 1111 1011 but if you subtract one and invert you get the
77 desired 111 1111 1111 1100 mask
78
79 But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */
80 regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
81 | (base << PAGE_SHIFT) | (type + 1);
82
83 /*
84	 * The writeback rule is quite specific. See the manual. It is:
85	 * disable local interrupts, write back the cache, then set the MTRR
86 */
87 wbinvd();
88 wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
89}
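
/*
 * Illustrative encode using the formula above (hypothetical region, 4K
 * pages, taking MTRR_TYPE_WRCOMB as 1): a 512K write-combining region at
 * 1GB has size = 128 pages and base = 0x40000 pages, so
 * regs[reg] = (-128 >> 3 & 0x1FFFC) | (0x40000 << 12) | (1 + 1)
 *           = 0x1FFF0 | 0x40000000 | 0x2 = 0x4001FFF2.
 */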
90
91static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
92{
93 /* Apply the K6 block alignment and size rules
94 In order
95 o Uncached or gathering only
96 o 128K or bigger block
97 o Power of 2 block
98 o base suitably aligned to the power
99 */
100 if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
101 || (size & ~(size - 1)) - size || (base & (size - 1)))
102 return -EINVAL;
103 return 0;
104}
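
/*
 * Note on the check above: (size & ~(size - 1)) isolates the lowest set bit
 * of size, so it equals size only for a power of two; the subtraction is
 * therefore non-zero, and the region rejected, for any other size.
 */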
105
106static struct mtrr_ops amd_mtrr_ops = {
107 .vendor = X86_VENDOR_AMD,
108 .set = amd_set_mtrr,
109 .get = amd_get_mtrr,
110 .get_free_region = generic_get_free_region,
111 .validate_add_page = amd_validate_add_page,
112 .have_wrcomb = positive_have_wrcomb,
113};
114
115int __init amd_init_mtrr(void)
116{
117 set_mtrr_ops(&amd_mtrr_ops);
118 return 0;
119}
120
121//arch_initcall(amd_mtrr_init);
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
new file mode 100644
index 000000000000..33f00ac314ef
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -0,0 +1,223 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5#include "mtrr.h"
6
7static struct {
8 unsigned long high;
9 unsigned long low;
10} centaur_mcr[8];
11
12static u8 centaur_mcr_reserved;
13static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
14
15/*
16 * Report boot time MCR setups
17 */
18
19static int
20centaur_get_free_region(unsigned long base, unsigned long size)
21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region.
24 [RETURNS] The index of the region on success, else -ENOSPC if none are free.
25*/
26{
27 int i, max;
28 mtrr_type ltype;
29 unsigned long lbase;
30 unsigned int lsize;
31
32 max = num_var_ranges;
33 for (i = 0; i < max; ++i) {
34 if (centaur_mcr_reserved & (1 << i))
35 continue;
36 mtrr_if->get(i, &lbase, &lsize, &ltype);
37 if (lsize == 0)
38 return i;
39 }
40 return -ENOSPC;
41}
42
43void
44mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
45{
46 centaur_mcr[mcr].low = lo;
47 centaur_mcr[mcr].high = hi;
48}
49
50static void
51centaur_get_mcr(unsigned int reg, unsigned long *base,
52 unsigned int *size, mtrr_type * type)
53{
54 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
55 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
56 *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */
57 if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
58 *type = MTRR_TYPE_UNCACHABLE;
59 if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
60 *type = MTRR_TYPE_WRBACK;
61 if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
62 *type = MTRR_TYPE_WRBACK;
63
64}
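
/*
 * Illustrative decode (hypothetical MCR contents, 4K pages): centaur_set_mcr()
 * below stores -size << PAGE_SHIFT in the low word, so for a 512K region
 * low & 0xfffff000 is 0xFFF80000, and -(0xFFF80000) >> PAGE_SHIFT is 128
 * pages, i.e. 512K again.
 */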
65
66static void centaur_set_mcr(unsigned int reg, unsigned long base,
67 unsigned long size, mtrr_type type)
68{
69 unsigned long low, high;
70
71 if (size == 0) {
72 /* Disable */
73 high = low = 0;
74 } else {
75 high = base << PAGE_SHIFT;
76 if (centaur_mcr_type == 0)
77 low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */
78 else {
79 if (type == MTRR_TYPE_UNCACHABLE)
80 low = -size << PAGE_SHIFT | 0x02; /* NC */
81 else
82 low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */
83 }
84 }
85 centaur_mcr[reg].high = high;
86 centaur_mcr[reg].low = low;
87 wrmsr(MSR_IDT_MCR0 + reg, low, high);
88}
89
90#if 0
91/*
92 * Initialise the later (saner) Winchip MCR variant. In this version
93 * the BIOS can pass us the registers it has used (but not their values)
94 * and the control register is read/write
95 */
96
97static void __init
98centaur_mcr1_init(void)
99{
100 unsigned i;
101 u32 lo, hi;
102
103	/* Unfortunately, MCRs are read-only, so there is no way to
104	 * find out what the BIOS might have done.
105 */
106
107 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
108 if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */
109 lo &= ~0x1C0; /* clear key */
110 lo |= 0x040; /* set key to 1 */
111 wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */
112 }
113
114 centaur_mcr_type = 1;
115
116 /*
117 * Clear any unconfigured MCR's.
118 */
119
120 for (i = 0; i < 8; ++i) {
121 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) {
122 if (!(lo & (1 << (9 + i))))
123 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
124 else
125 /*
126 * If the BIOS set up an MCR we cannot see it
127 * but we don't wish to obliterate it
128 */
129 centaur_mcr_reserved |= (1 << i);
130 }
131 }
132 /*
133 * Throw the main write-combining switch...
134 * However if OOSTORE is enabled then people have already done far
135 * cleverer things and we should behave.
136 */
137
138 lo |= 15; /* Write combine enables */
139 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
140}
141
142/*
143 * Initialise the original WinChip, which has read-only MCR registers,
144 * no used-register bitmask for the BIOS to pass on, and write-only control
145 */
146
147static void __init
148centaur_mcr0_init(void)
149{
150 unsigned i;
151
152	/* Unfortunately, MCRs are read-only, so there is no way to
153	 * find out what the BIOS might have done.
154 */
155
156	/* Clear any unconfigured MCRs.
157 * This way we are sure that the centaur_mcr array contains the actual
158 * values. The disadvantage is that any BIOS tweaks are thus undone.
159 *
160 */
161 for (i = 0; i < 8; ++i) {
162 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0)
163 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
164 }
165
166 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */
167}
168
169/*
170 * Initialise Winchip series MCR registers
171 */
172
173static void __init
174centaur_mcr_init(void)
175{
176 struct set_mtrr_context ctxt;
177
178 set_mtrr_prepare_save(&ctxt);
179 set_mtrr_cache_disable(&ctxt);
180
181 if (boot_cpu_data.x86_model == 4)
182 centaur_mcr0_init();
183 else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9)
184 centaur_mcr1_init();
185
186 set_mtrr_done(&ctxt);
187}
188#endif
189
190static int centaur_validate_add_page(unsigned long base,
191 unsigned long size, unsigned int type)
192{
193 /*
194 * FIXME: Winchip2 supports uncached
195 */
196 if (type != MTRR_TYPE_WRCOMB &&
197 (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
198 printk(KERN_WARNING
199 "mtrr: only write-combining%s supported\n",
200 centaur_mcr_type ? " and uncacheable are"
201 : " is");
202 return -EINVAL;
203 }
204 return 0;
205}
206
207static struct mtrr_ops centaur_mtrr_ops = {
208 .vendor = X86_VENDOR_CENTAUR,
209// .init = centaur_mcr_init,
210 .set = centaur_set_mcr,
211 .get = centaur_get_mcr,
212 .get_free_region = centaur_get_free_region,
213 .validate_add_page = centaur_validate_add_page,
214 .have_wrcomb = positive_have_wrcomb,
215};
216
217int __init centaur_init_mtrr(void)
218{
219 set_mtrr_ops(&centaur_mtrr_ops);
220 return 0;
221}
222
223//arch_initcall(centaur_init_mtrr);
diff --git a/arch/i386/kernel/cpu/mtrr/changelog b/arch/i386/kernel/cpu/mtrr/changelog
new file mode 100644
index 000000000000..af1368535955
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/changelog
@@ -0,0 +1,229 @@
1 ChangeLog
2
3 Prehistory Martin Tischhäuser <martin@ikcbarka.fzk.de>
4 Initial register-setting code (from proform-1.0).
5 19971216 Richard Gooch <rgooch@atnf.csiro.au>
6 Original version for /proc/mtrr interface, SMP-safe.
7 v1.0
8 19971217 Richard Gooch <rgooch@atnf.csiro.au>
9 Bug fix for ioctls()'s.
10 Added sample code in Documentation/mtrr.txt
11 v1.1
12 19971218 Richard Gooch <rgooch@atnf.csiro.au>
13 Disallow overlapping regions.
14 19971219 Jens Maurer <jmaurer@menuett.rhein-main.de>
15 Register-setting fixups.
16 v1.2
17 19971222 Richard Gooch <rgooch@atnf.csiro.au>
18 Fixups for kernel 2.1.75.
19 v1.3
20 19971229 David Wragg <dpw@doc.ic.ac.uk>
21 Register-setting fixups and conformity with Intel conventions.
22 19971229 Richard Gooch <rgooch@atnf.csiro.au>
23 Cosmetic changes and wrote this ChangeLog ;-)
24 19980106 Richard Gooch <rgooch@atnf.csiro.au>
25 Fixups for kernel 2.1.78.
26 v1.4
27 19980119 David Wragg <dpw@doc.ic.ac.uk>
28 Included passive-release enable code (elsewhere in PCI setup).
29 v1.5
30 19980131 Richard Gooch <rgooch@atnf.csiro.au>
31 Replaced global kernel lock with private spinlock.
32 v1.6
33 19980201 Richard Gooch <rgooch@atnf.csiro.au>
34 Added wait for other CPUs to complete changes.
35 v1.7
36 19980202 Richard Gooch <rgooch@atnf.csiro.au>
37 Bug fix in definition of <set_mtrr> for UP.
38 v1.8
39 19980319 Richard Gooch <rgooch@atnf.csiro.au>
40 Fixups for kernel 2.1.90.
41 19980323 Richard Gooch <rgooch@atnf.csiro.au>
42 Move SMP BIOS fixup before secondary CPUs call <calibrate_delay>
43 v1.9
44 19980325 Richard Gooch <rgooch@atnf.csiro.au>
45 Fixed test for overlapping regions: confused by adjacent regions
46 19980326 Richard Gooch <rgooch@atnf.csiro.au>
47 Added wbinvd in <set_mtrr_prepare>.
48 19980401 Richard Gooch <rgooch@atnf.csiro.au>
49 Bug fix for non-SMP compilation.
50 19980418 David Wragg <dpw@doc.ic.ac.uk>
51 Fixed-MTRR synchronisation for SMP and use atomic operations
52 instead of spinlocks.
53 19980418 Richard Gooch <rgooch@atnf.csiro.au>
54 Differentiate different MTRR register classes for BIOS fixup.
55 v1.10
56 19980419 David Wragg <dpw@doc.ic.ac.uk>
57 Bug fix in variable MTRR synchronisation.
58 v1.11
59 19980419 Richard Gooch <rgooch@atnf.csiro.au>
60 Fixups for kernel 2.1.97.
61 v1.12
62 19980421 Richard Gooch <rgooch@atnf.csiro.au>
63 Safer synchronisation across CPUs when changing MTRRs.
64 v1.13
65 19980423 Richard Gooch <rgooch@atnf.csiro.au>
66 Bugfix for SMP systems without MTRR support.
67 v1.14
68 19980427 Richard Gooch <rgooch@atnf.csiro.au>
69 Trap calls to <mtrr_add> and <mtrr_del> on non-MTRR machines.
70 v1.15
71 19980427 Richard Gooch <rgooch@atnf.csiro.au>
72 Use atomic bitops for setting SMP change mask.
73 v1.16
74 19980428 Richard Gooch <rgooch@atnf.csiro.au>
75 Removed spurious diagnostic message.
76 v1.17
77 19980429 Richard Gooch <rgooch@atnf.csiro.au>
78 Moved register-setting macros into this file.
79 Moved setup code from init/main.c to i386-specific areas.
80 v1.18
81 19980502 Richard Gooch <rgooch@atnf.csiro.au>
82 Moved MTRR detection outside conditionals in <mtrr_init>.
83 v1.19
84 19980502 Richard Gooch <rgooch@atnf.csiro.au>
85 Documentation improvement: mention Pentium II and AGP.
86 v1.20
87 19980521 Richard Gooch <rgooch@atnf.csiro.au>
88 Only manipulate interrupt enable flag on local CPU.
89 Allow enclosed uncachable regions.
90 v1.21
91 19980611 Richard Gooch <rgooch@atnf.csiro.au>
92 Always define <main_lock>.
93 v1.22
94 19980901 Richard Gooch <rgooch@atnf.csiro.au>
95 Removed module support in order to tidy up code.
96 Added sanity check for <mtrr_add>/<mtrr_del> before <mtrr_init>.
97 Created addition queue for prior to SMP commence.
98 v1.23
99 19980902 Richard Gooch <rgooch@atnf.csiro.au>
100 Ported patch to kernel 2.1.120-pre3.
101 v1.24
102 19980910 Richard Gooch <rgooch@atnf.csiro.au>
103 Removed sanity checks and addition queue: Linus prefers an OOPS.
104 v1.25
105 19981001 Richard Gooch <rgooch@atnf.csiro.au>
106 Fixed harmless compiler warning in include/asm-i386/mtrr.h
107 Fixed version numbering and history for v1.23 -> v1.24.
108 v1.26
109 19990118 Richard Gooch <rgooch@atnf.csiro.au>
110 Added devfs support.
111 v1.27
112 19990123 Richard Gooch <rgooch@atnf.csiro.au>
113 Changed locking to spin with reschedule.
114 Made use of new <smp_call_function>.
115 v1.28
116 19990201 Zoltán Böszörményi <zboszor@mail.externet.hu>
117 Extended the driver to be able to use Cyrix style ARRs.
118 19990204 Richard Gooch <rgooch@atnf.csiro.au>
119 Restructured Cyrix support.
120 v1.29
121 19990204 Zoltán Böszörményi <zboszor@mail.externet.hu>
122 Refined ARR support: enable MAPEN in set_mtrr_prepare()
123 and disable MAPEN in set_mtrr_done().
124 19990205 Richard Gooch <rgooch@atnf.csiro.au>
125 Minor cleanups.
126 v1.30
127 19990208 Zoltán Böszörményi <zboszor@mail.externet.hu>
128 Protect plain 6x86s (and other processors without the
129 Page Global Enable feature) against accessing CR4 in
130 set_mtrr_prepare() and set_mtrr_done().
131 19990210 Richard Gooch <rgooch@atnf.csiro.au>
132 Turned <set_mtrr_up> and <get_mtrr> into function pointers.
133 v1.31
134 19990212 Zoltán Böszörményi <zboszor@mail.externet.hu>
135 Major rewrite of cyrix_arr_init(): do not touch ARRs,
136 leave them as the BIOS have set them up.
137 Enable usage of all 8 ARRs.
138 Avoid multiplications by 3 everywhere and other
139 code clean ups/speed ups.
140 19990213 Zoltán Böszörményi <zboszor@mail.externet.hu>
141 Set up other Cyrix processors identical to the boot cpu.
142 Since Cyrix don't support Intel APIC, this is l'art pour l'art.
143 Weigh ARRs by size:
144 If size <= 32M is given, set up ARR# we were given.
145 If size > 32M is given, set up ARR7 only if it is free,
146 fail otherwise.
147 19990214 Zoltán Böszörményi <zboszor@mail.externet.hu>
148	Also check for size >= 256K if we are to set up ARR7;
149	mtrr_add() returns the value it gets from set_mtrr().
150 19990218 Zoltán Böszörményi <zboszor@mail.externet.hu>
151 Remove Cyrix "coma bug" workaround from here.
152 Moved to linux/arch/i386/kernel/setup.c and
153 linux/include/asm-i386/bugs.h
154 19990228 Richard Gooch <rgooch@atnf.csiro.au>
155 Added MTRRIOC_KILL_ENTRY ioctl(2)
156 Trap for counter underflow in <mtrr_file_del>.
157 Trap for 4 MiB aligned regions for PPro, stepping <= 7.
158 19990301 Richard Gooch <rgooch@atnf.csiro.au>
159 Created <get_free_region> hook.
160 19990305 Richard Gooch <rgooch@atnf.csiro.au>
161 Temporarily disable AMD support now MTRR capability flag is set.
162 v1.32
163 19990308 Zoltán Böszörményi <zboszor@mail.externet.hu>
164 Adjust my changes (19990212-19990218) to Richard Gooch's
165	latest changes (19990228-19990305).
166 v1.33
167 19990309 Richard Gooch <rgooch@atnf.csiro.au>
168 Fixed typo in <printk> message.
169 19990310 Richard Gooch <rgooch@atnf.csiro.au>
170 Support K6-II/III based on Alan Cox's <alan@redhat.com> patches.
171 v1.34
172 19990511 Bart Hartgers <bart@etpmod.phys.tue.nl>
173 Support Centaur C6 MCR's.
174 19990512 Richard Gooch <rgooch@atnf.csiro.au>
175 Minor cleanups.
176 v1.35
177 19990707 Zoltán Böszörményi <zboszor@mail.externet.hu>
178 Check whether ARR3 is protected in cyrix_get_free_region()
179 and mtrr_del(). The code won't attempt to delete or change it
180 from now on if the BIOS protected ARR3. It silently skips ARR3
181 in cyrix_get_free_region() or returns with an error code from
182 mtrr_del().
183 19990711 Zoltán Böszörményi <zboszor@mail.externet.hu>
184 Reset some bits in the CCRs in cyrix_arr_init() to disable SMM
185 if ARR3 isn't protected. This is needed because if SMM is active
186 and ARR3 isn't protected then deleting and setting ARR3 again
187 may lock up the processor. With SMM entirely disabled, it does
188 not happen.
189 19990812 Zoltán Böszörményi <zboszor@mail.externet.hu>
190	Rearrange switch() statements so the driver accommodates
191 the fact that the AMD Athlon handles its MTRRs the same way
192 as Intel does.
193 19990814 Zoltán Böszörményi <zboszor@mail.externet.hu>
194 Double check for Intel in mtrr_add()'s big switch() because
195 that revision check is only valid for Intel CPUs.
196 19990819 Alan Cox <alan@redhat.com>
197	Tested Zoltan's changes on a pre-production Athlon - 100%
198 success.
199 19991008 Manfred Spraul <manfreds@colorfullife.com>
200	Replaced spin_lock_reschedule() with a normal semaphore.
201 v1.36
202 20000221 Richard Gooch <rgooch@atnf.csiro.au>
203 Compile fix if procfs and devfs not enabled.
204 Formatting changes.
205 v1.37
206 20001109 H. Peter Anvin <hpa@zytor.com>
207 Use the new centralized CPU feature detects.
208
209 v1.38
210 20010309 Dave Jones <davej@suse.de>
211 Add support for Cyrix III.
212
213 v1.39
214 20010312 Dave Jones <davej@suse.de>
215 Ugh, I broke AMD support.
216 Reworked fix by Troels Walsted Hansen <troels@thule.no>
217
218 v1.40
219 20010327 Dave Jones <davej@suse.de>
220 Adapted Cyrix III support to include VIA C3.
221
222 v2.0
223 20020306 Patrick Mochel <mochel@osdl.org>
224 Split mtrr.c -> mtrr/*.c
225 Converted to Linux Kernel Coding Style
226 Fixed several minor nits in form
227 Moved some SMP-only functions out, so they can be used
228 for power management in the future.
229 TODO: Fix user interface cruft.
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
new file mode 100644
index 000000000000..933b0dd62f48
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -0,0 +1,364 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5#include <asm/io.h>
6#include "mtrr.h"
7
8int arr3_protected;
9
10static void
11cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned int *size, mtrr_type * type)
13{
14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift;
16
17 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
18
19 /* Save flags and disable interrupts */
20 local_irq_save(flags);
21
22 ccr3 = getCx86(CX86_CCR3);
23 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
24 ((unsigned char *) base)[3] = getCx86(arr);
25 ((unsigned char *) base)[2] = getCx86(arr + 1);
26 ((unsigned char *) base)[1] = getCx86(arr + 2);
27 rcr = getCx86(CX86_RCR_BASE + reg);
28 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
29
30 /* Enable interrupts if it was enabled previously */
31 local_irq_restore(flags);
32 shift = ((unsigned char *) base)[1] & 0x0f;
33 *base >>= PAGE_SHIFT;
34
35 /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
36 * Note: shift==0xf means 4G, this is unsupported.
37 */
38 if (shift)
39 *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1);
40 else
41 *size = 0;
42
43 /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */
44 if (reg < 7) {
45 switch (rcr) {
46 case 1:
47 *type = MTRR_TYPE_UNCACHABLE;
48 break;
49 case 8:
50 *type = MTRR_TYPE_WRBACK;
51 break;
52 case 9:
53 *type = MTRR_TYPE_WRCOMB;
54 break;
55 case 24:
56 default:
57 *type = MTRR_TYPE_WRTHROUGH;
58 break;
59 }
60 } else {
61 switch (rcr) {
62 case 0:
63 *type = MTRR_TYPE_UNCACHABLE;
64 break;
65 case 8:
66 *type = MTRR_TYPE_WRCOMB;
67 break;
68 case 9:
69 *type = MTRR_TYPE_WRBACK;
70 break;
71 case 25:
72 default:
73 *type = MTRR_TYPE_WRTHROUGH;
74 break;
75 }
76 }
77}
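/*
 * Worked example of the decoding in cyrix_get_arr() above (illustrative
 * note, not part of the original file): the size field is a power-of-two
 * shift, in units of one 4K page for ARR0-ARR6 and 0x40 pages (256K) for
 * ARR7.  A shift of 3 on ARR0 therefore decodes to 1 << 2 = 4 pages = 16K,
 * while a shift of 9 on ARR7 decodes to 0x40 << 8 = 0x4000 pages = 64M.
 * A shift of 0xf on ARR7 would mean 4G, which this driver does not support.
 */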
78
79static int
80cyrix_get_free_region(unsigned long base, unsigned long size)
81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region.
84   [RETURNS] The index of the region on success, else a negative error code.
85*/
86{
87 int i;
88 mtrr_type ltype;
89 unsigned long lbase;
90 unsigned int lsize;
91
92 /* If we are to set up a region >32M then look at ARR7 immediately */
93 if (size > 0x2000) {
94 cyrix_get_arr(7, &lbase, &lsize, &ltype);
95 if (lsize == 0)
96 return 7;
97 /* Else try ARR0-ARR6 first */
98 } else {
99 for (i = 0; i < 7; i++) {
100 cyrix_get_arr(i, &lbase, &lsize, &ltype);
101 if ((i == 3) && arr3_protected)
102 continue;
103 if (lsize == 0)
104 return i;
105 }
106		/* ARR0-ARR6 aren't free; try ARR7, but the region size must be at least 256K */
107 cyrix_get_arr(i, &lbase, &lsize, &ltype);
108 if ((lsize == 0) && (size >= 0x40))
109 return i;
110 }
111 return -ENOSPC;
112}
113
114static u32 cr4 = 0;
115static u32 ccr3;
116
117static void prepare_set(void)
118{
119 u32 cr0;
120
121 /* Save value of CR4 and clear Page Global Enable (bit 7) */
122 if ( cpu_has_pge ) {
123 cr4 = read_cr4();
124 write_cr4(cr4 & (unsigned char) ~(1 << 7));
125 }
126
127 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
128 a side-effect */
129 cr0 = read_cr0() | 0x40000000;
130 wbinvd();
131 write_cr0(cr0);
132 wbinvd();
133
134	/* Cyrix ARRs - everything else was excluded at the top */
135 ccr3 = getCx86(CX86_CCR3);
136
137	/* enable MAPEN */
138 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
139
140}
141
142static void post_set(void)
143{
144 /* Flush caches and TLBs */
145 wbinvd();
146
147 /* Cyrix ARRs - everything else was excluded at the top */
148 setCx86(CX86_CCR3, ccr3);
149
150 /* Enable caches */
151 write_cr0(read_cr0() & 0xbfffffff);
152
153 /* Restore value of CR4 */
154 if ( cpu_has_pge )
155 write_cr4(cr4);
156}
157
158static void cyrix_set_arr(unsigned int reg, unsigned long base,
159 unsigned long size, mtrr_type type)
160{
161 unsigned char arr, arr_type, arr_size;
162
163 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
164
165 /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */
166 if (reg >= 7)
167 size >>= 6;
168
169 size &= 0x7fff; /* make sure arr_size <= 14 */
170 for (arr_size = 0; size; arr_size++, size >>= 1) ;
171
172 if (reg < 7) {
173 switch (type) {
174 case MTRR_TYPE_UNCACHABLE:
175 arr_type = 1;
176 break;
177 case MTRR_TYPE_WRCOMB:
178 arr_type = 9;
179 break;
180 case MTRR_TYPE_WRTHROUGH:
181 arr_type = 24;
182 break;
183 default:
184 arr_type = 8;
185 break;
186 }
187 } else {
188 switch (type) {
189 case MTRR_TYPE_UNCACHABLE:
190 arr_type = 0;
191 break;
192 case MTRR_TYPE_WRCOMB:
193 arr_type = 8;
194 break;
195 case MTRR_TYPE_WRTHROUGH:
196 arr_type = 25;
197 break;
198 default:
199 arr_type = 9;
200 break;
201 }
202 }
203
204 prepare_set();
205
206 base <<= PAGE_SHIFT;
207 setCx86(arr, ((unsigned char *) &base)[3]);
208 setCx86(arr + 1, ((unsigned char *) &base)[2]);
209 setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size);
210 setCx86(CX86_RCR_BASE + reg, arr_type);
211
212 post_set();
213}
214
215typedef struct {
216 unsigned long base;
217 unsigned int size;
218 mtrr_type type;
219} arr_state_t;
220
221static arr_state_t arr_state[8] __initdata = {
222 {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
223 {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
224};
225
226static unsigned char ccr_state[7] __initdata = { 0, 0, 0, 0, 0, 0, 0 };
227
228static void cyrix_set_all(void)
229{
230 int i;
231
232 prepare_set();
233
234 /* the CCRs are not contiguous */
235 for (i = 0; i < 4; i++)
236 setCx86(CX86_CCR0 + i, ccr_state[i]);
237 for (; i < 7; i++)
238 setCx86(CX86_CCR4 + i, ccr_state[i]);
239 for (i = 0; i < 8; i++)
240 cyrix_set_arr(i, arr_state[i].base,
241 arr_state[i].size, arr_state[i].type);
242
243 post_set();
244}
245
246#if 0
247/*
248 * On Cyrix 6x86(MX) and M II the ARR3 is special: it is tied to
249 * SMM (System Management Mode). So we need the following:
250 * Check whether SMI_LOCK (CCR3 bit 0) is set
251 * if it is set, write a warning message: ARR3 cannot be changed!
252 * (it cannot be changed until the next processor reset)
253 * if it is reset, then we can change it, set all the needed bits:
254 * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset)
255 * - disable access to SMM memory (CCR1 bit 2 reset)
256 * - disable SMM mode (CCR1 bit 1 reset)
257 * - disable write protection of ARR3 (CCR6 bit 1 reset)
258 * - (maybe) disable ARR3
259 * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set)
260 */
261static void __init
262cyrix_arr_init(void)
263{
264 struct set_mtrr_context ctxt;
265 unsigned char ccr[7];
266 int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 };
267#ifdef CONFIG_SMP
268 int i;
269#endif
270
271 /* flush cache and enable MAPEN */
272 set_mtrr_prepare_save(&ctxt);
273 set_mtrr_cache_disable(&ctxt);
274
275 /* Save all CCRs locally */
276 ccr[0] = getCx86(CX86_CCR0);
277 ccr[1] = getCx86(CX86_CCR1);
278 ccr[2] = getCx86(CX86_CCR2);
279 ccr[3] = ctxt.ccr3;
280 ccr[4] = getCx86(CX86_CCR4);
281 ccr[5] = getCx86(CX86_CCR5);
282 ccr[6] = getCx86(CX86_CCR6);
283
284 if (ccr[3] & 1) {
285 ccrc[3] = 1;
286 arr3_protected = 1;
287 } else {
288 /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and
289 * access to SMM memory through ARR3 (bit 7).
290 */
291 if (ccr[1] & 0x80) {
292 ccr[1] &= 0x7f;
293 ccrc[1] |= 0x80;
294 }
295 if (ccr[1] & 0x04) {
296 ccr[1] &= 0xfb;
297 ccrc[1] |= 0x04;
298 }
299 if (ccr[1] & 0x02) {
300 ccr[1] &= 0xfd;
301 ccrc[1] |= 0x02;
302 }
303 arr3_protected = 0;
304 if (ccr[6] & 0x02) {
305 ccr[6] &= 0xfd;
306 ccrc[6] = 1; /* Disable write protection of ARR3 */
307 setCx86(CX86_CCR6, ccr[6]);
308 }
309 /* Disable ARR3. This is safe now that we disabled SMM. */
310 /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */
311 }
312 /* If we changed CCR1 in memory, change it in the processor, too. */
313 if (ccrc[1])
314 setCx86(CX86_CCR1, ccr[1]);
315
316 /* Enable ARR usage by the processor */
317 if (!(ccr[5] & 0x20)) {
318 ccr[5] |= 0x20;
319 ccrc[5] = 1;
320 setCx86(CX86_CCR5, ccr[5]);
321 }
322#ifdef CONFIG_SMP
323 for (i = 0; i < 7; i++)
324 ccr_state[i] = ccr[i];
325 for (i = 0; i < 8; i++)
326 cyrix_get_arr(i,
327 &arr_state[i].base, &arr_state[i].size,
328 &arr_state[i].type);
329#endif
330
331 set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */
332
333 if (ccrc[5])
334 printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n");
335 if (ccrc[3])
336 printk(KERN_INFO "mtrr: ARR3 cannot be changed\n");
337/*
338 if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n");
339 if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n");
340 if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n");
341*/
342 if (ccrc[6])
343 printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n");
344}
345#endif
346
347static struct mtrr_ops cyrix_mtrr_ops = {
348 .vendor = X86_VENDOR_CYRIX,
349// .init = cyrix_arr_init,
350 .set_all = cyrix_set_all,
351 .set = cyrix_set_arr,
352 .get = cyrix_get_arr,
353 .get_free_region = cyrix_get_free_region,
354 .validate_add_page = generic_validate_add_page,
355 .have_wrcomb = positive_have_wrcomb,
356};
357
358int __init cyrix_init_mtrr(void)
359{
360 set_mtrr_ops(&cyrix_mtrr_ops);
361 return 0;
362}
363
364//arch_initcall(cyrix_init_mtrr);
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
new file mode 100644
index 000000000000..a4cce454d09b
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -0,0 +1,417 @@
1/* This only handles 32 bit MTRRs on 32 bit hosts. This is strictly wrong
2   because MTRRs can span up to 40 bits (36 bits on most modern x86) */
3#include <linux/init.h>
4#include <linux/slab.h>
5#include <linux/mm.h>
6#include <asm/io.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9#include <asm/system.h>
10#include <asm/cpufeature.h>
11#include <asm/tlbflush.h>
12#include "mtrr.h"
13
14struct mtrr_state {
15 struct mtrr_var_range *var_ranges;
16 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
17 unsigned char enabled;
18 mtrr_type def_type;
19};
20
21static unsigned long smp_changes_mask;
22static struct mtrr_state mtrr_state = {};
23
24/* Get the MSR pair relating to a var range */
25static void __init
26get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
27{
28 rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
29 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
30}
31
32static void __init
33get_fixed_ranges(mtrr_type * frs)
34{
35 unsigned int *p = (unsigned int *) frs;
36 int i;
37
38 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
39
40 for (i = 0; i < 2; i++)
41 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
42 for (i = 0; i < 8; i++)
43 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
44}
45
46/* Grab all of the MTRR state for this CPU into *state */
47void __init get_mtrr_state(void)
48{
49 unsigned int i;
50 struct mtrr_var_range *vrs;
51 unsigned lo, dummy;
52
53 if (!mtrr_state.var_ranges) {
54 mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
55 GFP_KERNEL);
56 if (!mtrr_state.var_ranges)
57 return;
58 }
59 vrs = mtrr_state.var_ranges;
60
61 for (i = 0; i < num_var_ranges; i++)
62 get_mtrr_var_range(i, &vrs[i]);
63 get_fixed_ranges(mtrr_state.fixed_ranges);
64
65 rdmsr(MTRRdefType_MSR, lo, dummy);
66 mtrr_state.def_type = (lo & 0xff);
67 mtrr_state.enabled = (lo & 0xc00) >> 10;
68}
69
70/* Free resources associated with a struct mtrr_state */
71void __init finalize_mtrr_state(void)
72{
73 if (mtrr_state.var_ranges)
74 kfree(mtrr_state.var_ranges);
75 mtrr_state.var_ranges = NULL;
76}
77
78/* Some BIOSes are broken and don't set all MTRRs the same! */
79void __init mtrr_state_warn(void)
80{
81 unsigned long mask = smp_changes_mask;
82
83 if (!mask)
84 return;
85 if (mask & MTRR_CHANGE_MASK_FIXED)
86 printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n");
87 if (mask & MTRR_CHANGE_MASK_VARIABLE)
88 printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n");
89 if (mask & MTRR_CHANGE_MASK_DEFTYPE)
90 printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n");
91	printk(KERN_INFO "mtrr: probably your BIOS does not set up all CPUs.\n");
92 printk(KERN_INFO "mtrr: corrected configuration.\n");
93}
94
95/* This doesn't attempt to pass an error out to MTRR users
96   because it's quite complicated in some cases and probably not
97   worth it: the best error handling here is to ignore the failure. */
98void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
99{
100 if (wrmsr_safe(msr, a, b) < 0)
101 printk(KERN_ERR
102 "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
103 smp_processor_id(), msr, a, b);
104}
105
106int generic_get_free_region(unsigned long base, unsigned long size)
107/* [SUMMARY] Get a free MTRR.
108 <base> The starting (base) address of the region.
109 <size> The size (in bytes) of the region.
110   [RETURNS] The index of the region on success, else a negative error code.
111*/
112{
113 int i, max;
114 mtrr_type ltype;
115 unsigned long lbase;
116 unsigned lsize;
117
118 max = num_var_ranges;
119 for (i = 0; i < max; ++i) {
120 mtrr_if->get(i, &lbase, &lsize, &ltype);
121 if (lsize == 0)
122 return i;
123 }
124 return -ENOSPC;
125}
126
127void generic_get_mtrr(unsigned int reg, unsigned long *base,
128 unsigned int *size, mtrr_type * type)
129{
130 unsigned int mask_lo, mask_hi, base_lo, base_hi;
131
132 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
133 if ((mask_lo & 0x800) == 0) {
134 /* Invalid (i.e. free) range */
135 *base = 0;
136 *size = 0;
137 *type = 0;
138 return;
139 }
140
141 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
142
143 /* Work out the shifted address mask. */
144 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT)
145 | mask_lo >> PAGE_SHIFT;
146
147 /* This works correctly if size is a power of two, i.e. a
148 contiguous range. */
149 *size = -mask_lo;
150 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
151 *type = base_lo & 0xff;
152}
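/*
 * Illustrative example of the mask arithmetic in generic_get_mtrr() above
 * (assuming a 36 bit physical address space, i.e. size_or_mask ==
 * 0xff000000): a 256MB region has MTRRphysMask bits 35:28 set, so
 * mask_hi == 0xf and mask_lo == 0xf0000800 (valid bit included).  The
 * shifted mask then becomes
 *	0xff000000 | (0xf << 20) | (0xf0000800 >> 12) == 0xffff0000
 * and *size = -0xffff0000 == 0x10000 pages == 256MB, as expected.
 */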
153
154static int set_fixed_ranges(mtrr_type * frs)
155{
156 unsigned int *p = (unsigned int *) frs;
157 int changed = FALSE;
158 int i;
159 unsigned int lo, hi;
160
161 rdmsr(MTRRfix64K_00000_MSR, lo, hi);
162 if (p[0] != lo || p[1] != hi) {
163 mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
164 changed = TRUE;
165 }
166
167 for (i = 0; i < 2; i++) {
168 rdmsr(MTRRfix16K_80000_MSR + i, lo, hi);
169 if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) {
170 mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2],
171 p[3 + i * 2]);
172 changed = TRUE;
173 }
174 }
175
176 for (i = 0; i < 8; i++) {
177 rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi);
178 if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) {
179 mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2],
180 p[7 + i * 2]);
181 changed = TRUE;
182 }
183 }
184 return changed;
185}
186
187/* Set the MSR pair relating to a var range. Returns TRUE if
188 changes are made */
189static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
190{
191 unsigned int lo, hi;
192 int changed = FALSE;
193
194 rdmsr(MTRRphysBase_MSR(index), lo, hi);
195 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
196 || (vr->base_hi & 0xfUL) != (hi & 0xfUL)) {
197 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
198 changed = TRUE;
199 }
200
201 rdmsr(MTRRphysMask_MSR(index), lo, hi);
202
203 if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
204 || (vr->mask_hi & 0xfUL) != (hi & 0xfUL)) {
205 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
206 changed = TRUE;
207 }
208 return changed;
209}
210
211static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
212/* [SUMMARY] Set the MTRR state for this CPU.
213   <deftype_lo> Low word of the MTRRdefType MSR saved in prepare_set().
214   <deftype_hi> High word of the MTRRdefType MSR saved in prepare_set().
215   [NOTE] The CPU must already be in a safe state for MTRR changes.
216   [RETURNS] 0 if no changes were made, else a mask indicating what was changed.
217*/
218{
219 unsigned int i;
220 unsigned long change_mask = 0;
221
222 for (i = 0; i < num_var_ranges; i++)
223 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
224 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
225
226 if (set_fixed_ranges(mtrr_state.fixed_ranges))
227 change_mask |= MTRR_CHANGE_MASK_FIXED;
228
229	/* post_set() restores the old value of MTRRdefType,
230	   so to set it we fiddle with the saved value */
231 if ((deftype_lo & 0xff) != mtrr_state.def_type
232 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
233 deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10);
234 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
235 }
236
237 return change_mask;
238}
239
240
241static unsigned long cr4 = 0;
242static u32 deftype_lo, deftype_hi;
243static DEFINE_SPINLOCK(set_atomicity_lock);
244
245/*
246 * Since we are disabling the cache, don't allow any interrupts; they
247 * would run extremely slowly and would only increase the pain.  The caller must
248 * ensure that local interrupts are disabled and are reenabled after post_set()
249 * has been called.
250 */
251
252static void prepare_set(void)
253{
254 unsigned long cr0;
255
256 /* Note that this is not ideal, since the cache is only flushed/disabled
257 for this CPU while the MTRRs are changed, but changing this requires
258 more invasive changes to the way the kernel boots */
259
260 spin_lock(&set_atomicity_lock);
261
262 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
263 cr0 = read_cr0() | 0x40000000; /* set CD flag */
264 write_cr0(cr0);
265 wbinvd();
266
267 /* Save value of CR4 and clear Page Global Enable (bit 7) */
268 if ( cpu_has_pge ) {
269 cr4 = read_cr4();
270 write_cr4(cr4 & ~X86_CR4_PGE);
271 }
272
273 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
274 __flush_tlb();
275
276 /* Save MTRR state */
277 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
278
279 /* Disable MTRRs, and set the default type to uncached */
280 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi);
281}
282
283static void post_set(void)
284{
285 /* Flush TLBs (no need to flush caches - they are disabled) */
286 __flush_tlb();
287
288 /* Intel (P6) standard MTRRs */
289 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
290
291 /* Enable caches */
292 write_cr0(read_cr0() & 0xbfffffff);
293
294 /* Restore value of CR4 */
295 if ( cpu_has_pge )
296 write_cr4(cr4);
297 spin_unlock(&set_atomicity_lock);
298}
299
300static void generic_set_all(void)
301{
302 unsigned long mask, count;
303 unsigned long flags;
304
305 local_irq_save(flags);
306 prepare_set();
307
308 /* Actually set the state */
309 mask = set_mtrr_state(deftype_lo,deftype_hi);
310
311 post_set();
312 local_irq_restore(flags);
313
314 /* Use the atomic bitops to update the global mask */
315 for (count = 0; count < sizeof mask * 8; ++count) {
316 if (mask & 0x01)
317 set_bit(count, &smp_changes_mask);
318 mask >>= 1;
319 }
320
321}
322
323static void generic_set_mtrr(unsigned int reg, unsigned long base,
324 unsigned long size, mtrr_type type)
325/* [SUMMARY] Set variable MTRR register on the local CPU.
326 <reg> The register to set.
327 <base> The base address of the region.
328 <size> The size of the region. If this is 0 the region is disabled.
329 <type> The type of the region.
330   [NOTE] The change is made safely here: interrupts are disabled and the
331   update is bracketed by prepare_set()/post_set().
332 [RETURNS] Nothing.
333*/
334{
335 unsigned long flags;
336
337 local_irq_save(flags);
338 prepare_set();
339
340 if (size == 0) {
341 /* The invalid bit is kept in the mask, so we simply clear the
342 relevant mask register to disable a range. */
343 mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
344 } else {
345 mtrr_wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type,
346 (base & size_and_mask) >> (32 - PAGE_SHIFT));
347 mtrr_wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800,
348 (-size & size_and_mask) >> (32 - PAGE_SHIFT));
349 }
350
351 post_set();
352 local_irq_restore(flags);
353}
354
355int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
356{
357 unsigned long lbase, last;
358
359 /* For Intel PPro stepping <= 7, must be 4 MiB aligned
360 and not touch 0x70000000->0x7003FFFF */
361 if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
362 boot_cpu_data.x86_model == 1 &&
363 boot_cpu_data.x86_mask <= 7) {
364 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
365 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
366 return -EINVAL;
367 }
368 if (!(base + size < 0x70000000 || base > 0x7003FFFF) &&
369 (type == MTRR_TYPE_WRCOMB
370 || type == MTRR_TYPE_WRBACK)) {
371 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
372 return -EINVAL;
373 }
374 }
375
376 if (base + size < 0x100) {
377 printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
378 base, size);
379 return -EINVAL;
380 }
381 /* Check upper bits of base and last are equal and lower bits are 0
382 for base and 1 for last */
383 last = base + size - 1;
384 for (lbase = base; !(lbase & 1) && (last & 1);
385 lbase = lbase >> 1, last = last >> 1) ;
386 if (lbase != last) {
387 printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n",
388 base, size);
389 return -EINVAL;
390 }
391 return 0;
392}
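/*
 * Example of the alignment walk in generic_validate_add_page() above
 * (illustration only): base == 0x100 pages with size == 0x100 pages gives
 * last == 0x1ff; the loop shifts both right eight times, ends with
 * lbase == last == 1 and the region is accepted.  base == 0x180 with the
 * same size gives last == 0x27f; after seven shifts lbase == 3 but
 * last == 4, so the request is rejected as not aligned on a size boundary.
 */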
393
394
395static int generic_have_wrcomb(void)
396{
397 unsigned long config, dummy;
398 rdmsr(MTRRcap_MSR, config, dummy);
399 return (config & (1 << 10));
400}
401
402int positive_have_wrcomb(void)
403{
404 return 1;
405}
406
407/* Generic (Intel-style) MTRR operations, used whenever the CPU advertises
408   the standard MTRR feature flag. */
409struct mtrr_ops generic_mtrr_ops = {
410 .use_intel_if = 1,
411 .set_all = generic_set_all,
412 .get = generic_get_mtrr,
413 .get_free_region = generic_get_free_region,
414 .set = generic_set_mtrr,
415 .validate_add_page = generic_validate_add_page,
416 .have_wrcomb = generic_have_wrcomb,
417};
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
new file mode 100644
index 000000000000..1923e0aed26a
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -0,0 +1,374 @@
1#include <linux/init.h>
2#include <linux/proc_fs.h>
3#include <linux/ctype.h>
4#include <linux/module.h>
5#include <linux/seq_file.h>
6#include <asm/uaccess.h>
7
8#define LINE_SIZE 80
9
10#include <asm/mtrr.h>
11#include "mtrr.h"
12
13/* RED-PEN: this is accessed without any locking */
14extern unsigned int *usage_table;
15
16
17#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
18
19static char *mtrr_strings[MTRR_NUM_TYPES] =
20{
21 "uncachable", /* 0 */
22 "write-combining", /* 1 */
23 "?", /* 2 */
24 "?", /* 3 */
25 "write-through", /* 4 */
26 "write-protect", /* 5 */
27 "write-back", /* 6 */
28};
29
30char *mtrr_attrib_to_str(int x)
31{
32 return (x <= 6) ? mtrr_strings[x] : "?";
33}
34
35#ifdef CONFIG_PROC_FS
36
37static int
38mtrr_file_add(unsigned long base, unsigned long size,
39 unsigned int type, char increment, struct file *file, int page)
40{
41 int reg, max;
42 unsigned int *fcount = FILE_FCOUNT(file);
43
44 max = num_var_ranges;
45 if (fcount == NULL) {
46 fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL);
47 if (!fcount)
48 return -ENOMEM;
49 memset(fcount, 0, max * sizeof *fcount);
50 FILE_FCOUNT(file) = fcount;
51 }
52 if (!page) {
53 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
54 return -EINVAL;
55 base >>= PAGE_SHIFT;
56 size >>= PAGE_SHIFT;
57 }
58 reg = mtrr_add_page(base, size, type, 1);
59 if (reg >= 0)
60 ++fcount[reg];
61 return reg;
62}
63
64static int
65mtrr_file_del(unsigned long base, unsigned long size,
66 struct file *file, int page)
67{
68 int reg;
69 unsigned int *fcount = FILE_FCOUNT(file);
70
71 if (!page) {
72 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
73 return -EINVAL;
74 base >>= PAGE_SHIFT;
75 size >>= PAGE_SHIFT;
76 }
77 reg = mtrr_del_page(-1, base, size);
78 if (reg < 0)
79 return reg;
80 if (fcount == NULL)
81 return reg;
82 if (fcount[reg] < 1)
83 return -EINVAL;
84 --fcount[reg];
85 return reg;
86}
87
88/* RED-PEN: seq_file can seek now. this is ignored. */
89static ssize_t
90mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
91/* Format of control line:
92 "base=%Lx size=%Lx type=%s" OR:
93 "disable=%d"
94*/
95{
96 int i, err;
97 unsigned long reg;
98 unsigned long long base, size;
99 char *ptr;
100 char line[LINE_SIZE];
101 size_t linelen;
102
103 if (!capable(CAP_SYS_ADMIN))
104 return -EPERM;
105 if (!len)
106 return -EINVAL;
107 memset(line, 0, LINE_SIZE);
108 if (len > LINE_SIZE)
109 len = LINE_SIZE;
110 if (copy_from_user(line, buf, len - 1))
111 return -EFAULT;
112 linelen = strlen(line);
113 ptr = line + linelen - 1;
114 if (linelen && *ptr == '\n')
115 *ptr = '\0';
116 if (!strncmp(line, "disable=", 8)) {
117 reg = simple_strtoul(line + 8, &ptr, 0);
118 err = mtrr_del_page(reg, 0, 0);
119 if (err < 0)
120 return err;
121 return len;
122 }
123 if (strncmp(line, "base=", 5))
124 return -EINVAL;
125 base = simple_strtoull(line + 5, &ptr, 0);
126 for (; isspace(*ptr); ++ptr) ;
127 if (strncmp(ptr, "size=", 5))
128 return -EINVAL;
129 size = simple_strtoull(ptr + 5, &ptr, 0);
130 if ((base & 0xfff) || (size & 0xfff))
131 return -EINVAL;
132 for (; isspace(*ptr); ++ptr) ;
133 if (strncmp(ptr, "type=", 5))
134 return -EINVAL;
135 ptr += 5;
136 for (; isspace(*ptr); ++ptr) ;
137 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
138 if (strcmp(ptr, mtrr_strings[i]))
139 continue;
140 base >>= PAGE_SHIFT;
141 size >>= PAGE_SHIFT;
142 err =
143 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
144 1);
145 if (err < 0)
146 return err;
147 return len;
148 }
149 return -EINVAL;
150}
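/*
 * A minimal userspace sketch of the control-line format handled above (the
 * frame buffer address 0xf8000000 and the 4MB size are assumptions chosen
 * for illustration, not values taken from this file):
 *
 *	static const char cmd[] =
 *		"base=0xf8000000 size=0x400000 type=write-combining\n";
 *	int fd = open("/proc/mtrr", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, cmd, sizeof(cmd) - 1);
 *		close(fd);
 *	}
 *
 * Base and size must be multiples of 4K and the type string must match one
 * of the entries in mtrr_strings[].  Writing "disable=N" removes register N.
 */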
151
152static int
153mtrr_ioctl(struct inode *inode, struct file *file,
154 unsigned int cmd, unsigned long __arg)
155{
156 int err;
157 mtrr_type type;
158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg;
161
162 switch (cmd) {
163 default:
164 return -ENOTTY;
165 case MTRRIOC_ADD_ENTRY:
166 if (!capable(CAP_SYS_ADMIN))
167 return -EPERM;
168 if (copy_from_user(&sentry, arg, sizeof sentry))
169 return -EFAULT;
170 err =
171 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
172 file, 0);
173 if (err < 0)
174 return err;
175 break;
176 case MTRRIOC_SET_ENTRY:
177 if (!capable(CAP_SYS_ADMIN))
178 return -EPERM;
179 if (copy_from_user(&sentry, arg, sizeof sentry))
180 return -EFAULT;
181 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
182 if (err < 0)
183 return err;
184 break;
185 case MTRRIOC_DEL_ENTRY:
186 if (!capable(CAP_SYS_ADMIN))
187 return -EPERM;
188 if (copy_from_user(&sentry, arg, sizeof sentry))
189 return -EFAULT;
190 err = mtrr_file_del(sentry.base, sentry.size, file, 0);
191 if (err < 0)
192 return err;
193 break;
194 case MTRRIOC_KILL_ENTRY:
195 if (!capable(CAP_SYS_ADMIN))
196 return -EPERM;
197 if (copy_from_user(&sentry, arg, sizeof sentry))
198 return -EFAULT;
199 err = mtrr_del(-1, sentry.base, sentry.size);
200 if (err < 0)
201 return err;
202 break;
203 case MTRRIOC_GET_ENTRY:
204 if (copy_from_user(&gentry, arg, sizeof gentry))
205 return -EFAULT;
206 if (gentry.regnum >= num_var_ranges)
207 return -EINVAL;
208 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type);
209
210 /* Hide entries that go above 4GB */
211 if (gentry.base + gentry.size > 0x100000
212 || gentry.size == 0x100000)
213 gentry.base = gentry.size = gentry.type = 0;
214 else {
215 gentry.base <<= PAGE_SHIFT;
216 gentry.size <<= PAGE_SHIFT;
217 gentry.type = type;
218 }
219
220 if (copy_to_user(arg, &gentry, sizeof gentry))
221 return -EFAULT;
222 break;
223 case MTRRIOC_ADD_PAGE_ENTRY:
224 if (!capable(CAP_SYS_ADMIN))
225 return -EPERM;
226 if (copy_from_user(&sentry, arg, sizeof sentry))
227 return -EFAULT;
228 err =
229 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
230 file, 1);
231 if (err < 0)
232 return err;
233 break;
234 case MTRRIOC_SET_PAGE_ENTRY:
235 if (!capable(CAP_SYS_ADMIN))
236 return -EPERM;
237 if (copy_from_user(&sentry, arg, sizeof sentry))
238 return -EFAULT;
239 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
240 if (err < 0)
241 return err;
242 break;
243 case MTRRIOC_DEL_PAGE_ENTRY:
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246 if (copy_from_user(&sentry, arg, sizeof sentry))
247 return -EFAULT;
248 err = mtrr_file_del(sentry.base, sentry.size, file, 1);
249 if (err < 0)
250 return err;
251 break;
252 case MTRRIOC_KILL_PAGE_ENTRY:
253 if (!capable(CAP_SYS_ADMIN))
254 return -EPERM;
255 if (copy_from_user(&sentry, arg, sizeof sentry))
256 return -EFAULT;
257 err = mtrr_del_page(-1, sentry.base, sentry.size);
258 if (err < 0)
259 return err;
260 break;
261 case MTRRIOC_GET_PAGE_ENTRY:
262 if (copy_from_user(&gentry, arg, sizeof gentry))
263 return -EFAULT;
264 if (gentry.regnum >= num_var_ranges)
265 return -EINVAL;
266 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type);
267 gentry.type = type;
268
269 if (copy_to_user(arg, &gentry, sizeof gentry))
270 return -EFAULT;
271 break;
272 }
273 return 0;
274}
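/*
 * Illustrative userspace use of the ioctl interface above (the base address
 * and size below are made-up example values):
 *
 *	struct mtrr_sentry sentry;
 *	int fd = open("/proc/mtrr", O_RDWR);
 *
 *	sentry.base = 0xf8000000;
 *	sentry.size = 0x400000;
 *	sentry.type = MTRR_TYPE_WRCOMB;
 *	if (fd < 0 || ioctl(fd, MTRRIOC_ADD_ENTRY, &sentry) == -1)
 *		perror("mtrr");
 *
 * MTRRIOC_DEL_ENTRY with the same sentry releases the region again; the
 * *_PAGE_ENTRY variants take base and size in 4K pages instead of bytes.
 */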
275
276static int
277mtrr_close(struct inode *ino, struct file *file)
278{
279 int i, max;
280 unsigned int *fcount = FILE_FCOUNT(file);
281
282 if (fcount != NULL) {
283 max = num_var_ranges;
284 for (i = 0; i < max; ++i) {
285 while (fcount[i] > 0) {
286 mtrr_del(i, 0, 0);
287 --fcount[i];
288 }
289 }
290 kfree(fcount);
291 FILE_FCOUNT(file) = NULL;
292 }
293 return single_release(ino, file);
294}
295
296static int mtrr_seq_show(struct seq_file *seq, void *offset);
297
298static int mtrr_open(struct inode *inode, struct file *file)
299{
300 if (!mtrr_if)
301 return -EIO;
302 if (!mtrr_if->get)
303 return -ENXIO;
304 return single_open(file, mtrr_seq_show, NULL);
305}
306
307static struct file_operations mtrr_fops = {
308 .owner = THIS_MODULE,
309 .open = mtrr_open,
310 .read = seq_read,
311 .llseek = seq_lseek,
312 .write = mtrr_write,
313 .ioctl = mtrr_ioctl,
314 .release = mtrr_close,
315};
316
317
318static struct proc_dir_entry *proc_root_mtrr;
319
320
321static int mtrr_seq_show(struct seq_file *seq, void *offset)
322{
323 char factor;
324 int i, max, len;
325 mtrr_type type;
326 unsigned long base;
327 unsigned int size;
328
329 len = 0;
330 max = num_var_ranges;
331 for (i = 0; i < max; i++) {
332 mtrr_if->get(i, &base, &size, &type);
333 if (size == 0)
334 usage_table[i] = 0;
335 else {
336 if (size < (0x100000 >> PAGE_SHIFT)) {
337 /* less than 1MB */
338 factor = 'K';
339 size <<= PAGE_SHIFT - 10;
340 } else {
341 factor = 'M';
342 size >>= 20 - PAGE_SHIFT;
343 }
344 /* RED-PEN: base can be > 32bit */
345 len += seq_printf(seq,
346 "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n",
347 i, base, base >> (20 - PAGE_SHIFT), size, factor,
348 mtrr_attrib_to_str(type), usage_table[i]);
349 }
350 }
351 return 0;
352}
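/*
 * With the format string above, a typical /proc/mtrr line looks like this
 * (example values only):
 *
 *	reg00: base=0x00000000 (   0MB), size= 512MB: write-back, count=1
 */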
353
354static int __init mtrr_if_init(void)
355{
356 struct cpuinfo_x86 *c = &boot_cpu_data;
357
358 if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
359 (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
360 (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
361 (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
362 return -ENODEV;
363
364 proc_root_mtrr =
365 create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root);
366 if (proc_root_mtrr) {
367 proc_root_mtrr->owner = THIS_MODULE;
368 proc_root_mtrr->proc_fops = &mtrr_fops;
369 }
370 return 0;
371}
372
373arch_initcall(mtrr_if_init);
374#endif /* CONFIG_PROC_FS */
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
new file mode 100644
index 000000000000..8f67b490a7fd
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -0,0 +1,693 @@
1/* Generic MTRR (Memory Type Range Register) driver.
2
3 Copyright (C) 1997-2000 Richard Gooch
4 Copyright (c) 2002 Patrick Mochel
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public
17 License along with this library; if not, write to the Free
18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20 Richard Gooch may be reached by email at rgooch@atnf.csiro.au
21 The postal address is:
22 Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
23
24 Source: "Pentium Pro Family Developer's Manual, Volume 3:
25 Operating System Writer's Guide" (Intel document number 242692),
26 section 11.11.7
27
28 This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
29 on 6-7 March 2002.
30 Source: Intel Architecture Software Developers Manual, Volume 3:
31 System Programming Guide; Section 9.11. (1997 edition - PPro).
32*/
33
34#include <linux/module.h>
35#include <linux/init.h>
36#include <linux/pci.h>
37#include <linux/smp.h>
38#include <linux/cpu.h>
39
40#include <asm/mtrr.h>
41
42#include <asm/uaccess.h>
43#include <asm/processor.h>
44#include <asm/msr.h>
45#include "mtrr.h"
46
47#define MTRR_VERSION "2.0 (20020519)"
48
49u32 num_var_ranges = 0;
50
51unsigned int *usage_table;
52static DECLARE_MUTEX(main_lock);
53
54u32 size_or_mask, size_and_mask;
55
56static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
57
58struct mtrr_ops * mtrr_if = NULL;
59
60static void set_mtrr(unsigned int reg, unsigned long base,
61 unsigned long size, mtrr_type type);
62
63extern int arr3_protected;
64
65void set_mtrr_ops(struct mtrr_ops * ops)
66{
67 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
68 mtrr_ops[ops->vendor] = ops;
69}
70
71/* Returns non-zero if we have the write-combining memory type */
72static int have_wrcomb(void)
73{
74 struct pci_dev *dev;
75
76 if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) {
77		/* ServerWorks LE chipsets have problems with write-combining.
78		   Don't allow it and leave room for other chipsets to be tagged */
79 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
80 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
81 printk(KERN_INFO "mtrr: Serverworks LE detected. Write-combining disabled.\n");
82 pci_dev_put(dev);
83 return 0;
84 }
85		/* Intel 450NX errata #23: non-ascending cacheline evictions to
86		   write-combining memory may result in data corruption */
87 if (dev->vendor == PCI_VENDOR_ID_INTEL &&
88 dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
89 printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
90 pci_dev_put(dev);
91 return 0;
92 }
93 pci_dev_put(dev);
94 }
95 return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0);
96}
97
98/* This function determines the number of variable MTRRs and records it in num_var_ranges */
99static void __init set_num_var_ranges(void)
100{
101 unsigned long config = 0, dummy;
102
103 if (use_intel()) {
104 rdmsr(MTRRcap_MSR, config, dummy);
105 } else if (is_cpu(AMD))
106 config = 2;
107 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
108 config = 8;
109 num_var_ranges = config & 0xff;
110}
111
112static void __init init_table(void)
113{
114 int i, max;
115
116 max = num_var_ranges;
117 if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
118 == NULL) {
119 printk(KERN_ERR "mtrr: could not allocate\n");
120 return;
121 }
122 for (i = 0; i < max; i++)
123 usage_table[i] = 1;
124}
125
126struct set_mtrr_data {
127 atomic_t count;
128 atomic_t gate;
129 unsigned long smp_base;
130 unsigned long smp_size;
131 unsigned int smp_reg;
132 mtrr_type smp_type;
133};
134
135#ifdef CONFIG_SMP
136
137static void ipi_handler(void *info)
138/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
139 [RETURNS] Nothing.
140*/
141{
142 struct set_mtrr_data *data = info;
143 unsigned long flags;
144
145 local_irq_save(flags);
146
147 atomic_dec(&data->count);
148 while(!atomic_read(&data->gate))
149 cpu_relax();
150
151 /* The master has cleared me to execute */
152 if (data->smp_reg != ~0U)
153 mtrr_if->set(data->smp_reg, data->smp_base,
154 data->smp_size, data->smp_type);
155 else
156 mtrr_if->set_all();
157
158 atomic_dec(&data->count);
159 while(atomic_read(&data->gate))
160 cpu_relax();
161
162 atomic_dec(&data->count);
163 local_irq_restore(flags);
164}
165
166#endif
167
168/**
169 * set_mtrr - update mtrrs on all processors
170 * @reg: mtrr in question
171 * @base: mtrr base
172 * @size: mtrr size
173 * @type: mtrr type
174 *
175 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
176 *
177 * 1. Send IPI to do the following:
178 * 2. Disable Interrupts
179 * 3. Wait for all procs to do so
180 * 4. Enter no-fill cache mode
181 * 5. Flush caches
182 * 6. Clear PGE bit
183 * 7. Flush all TLBs
184 * 8. Disable all range registers
185 * 9. Update the MTRRs
186 * 10. Enable all range registers
187 * 11. Flush all TLBs and caches again
188 * 12. Enter normal cache mode and reenable caching
189 * 13. Set PGE
190 * 14. Wait for buddies to catch up
191 * 15. Enable interrupts.
192 *
193 * What does that mean for us? Well, first we set data.count to the number
194 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
195 * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
196 * Meanwhile, they are waiting for that flag to be set. Once it's set, each
197 * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it
198 * differently, so we call mtrr_if->set() callback and let them take care of it.
199 * When they're done, they again decrement data->count and wait for data.gate to
200 * be reset.
201 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag.
202 * Everyone then enables interrupts and we all continue on.
203 *
204 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
205 * becomes nops.
206 */
207static void set_mtrr(unsigned int reg, unsigned long base,
208 unsigned long size, mtrr_type type)
209{
210 struct set_mtrr_data data;
211 unsigned long flags;
212
213 data.smp_reg = reg;
214 data.smp_base = base;
215 data.smp_size = size;
216 data.smp_type = type;
217 atomic_set(&data.count, num_booting_cpus() - 1);
218 atomic_set(&data.gate,0);
219
220 /* Start the ball rolling on other CPUs */
221 if (smp_call_function(ipi_handler, &data, 1, 0) != 0)
222 panic("mtrr: timed out waiting for other CPUs\n");
223
224 local_irq_save(flags);
225
226 while(atomic_read(&data.count))
227 cpu_relax();
228
229 /* ok, reset count and toggle gate */
230 atomic_set(&data.count, num_booting_cpus() - 1);
231 atomic_set(&data.gate,1);
232
233 /* do our MTRR business */
234
235 /* HACK!
236 * We use this same function to initialize the mtrrs on boot.
237 * The state of the boot cpu's mtrrs has been saved, and we want
238	 * to replicate it across all the APs.
239	 * If we're doing that, @reg is set to ~0U (see init_other_cpus()).
240 */
241 if (reg != ~0U)
242 mtrr_if->set(reg,base,size,type);
243
244 /* wait for the others */
245 while(atomic_read(&data.count))
246 cpu_relax();
247
248 atomic_set(&data.count, num_booting_cpus() - 1);
249 atomic_set(&data.gate,0);
250
251 /*
252 * Wait here for everyone to have seen the gate change
253 * So we're the last ones to touch 'data'
254 */
255 while(atomic_read(&data.count))
256 cpu_relax();
257
258 local_irq_restore(flags);
259}
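/*
 * A compressed view of the rendezvous implemented by set_mtrr() and
 * ipi_handler() above, for one other CPU (illustration only, not part of
 * the original file):
 *
 *	master (set_mtrr)			other CPU (ipi_handler)
 *	count = 1, gate = 0
 *	smp_call_function()		->	irqs off, count-- (now 0)
 *	wait for count == 0
 *	count = 1, gate = 1		->	sees gate set, programs MTRRs,
 *						count-- (now 0)
 *	programs its own MTRRs
 *	wait for count == 0
 *	count = 1, gate = 0		->	sees gate clear, count-- (now 0)
 *	wait for count == 0
 *	irqs restored on both sides
 */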
260
261/**
262 * mtrr_add_page - Add a memory type region
263 * @base: Physical base address of region in pages (4 KB)
264 * @size: Physical size of region in pages (4 KB)
265 * @type: Type of MTRR desired
266 * @increment: If this is true do usage counting on the region
267 *
268 * Memory type region registers control the caching on newer Intel and
269 * non-Intel processors. This function allows drivers to request that an
270 * MTRR be added. The details and hardware specifics of each processor's
271 * implementation are hidden from the caller, but nevertheless the
272 * caller should expect to need to provide a power of two size on an
273 * equivalent power of two boundary.
274 *
275 * If the region cannot be added, either because all regions are in use
276 * or the CPU cannot support it, a negative value is returned. On success
277 * the register number for this entry is returned, but should be treated
278 * as a cookie only.
279 *
280 * On a multiprocessor machine the changes are made to all processors.
281 * This is required on x86 by the Intel processors.
282 *
283 * The available types are
284 *
285 * %MTRR_TYPE_UNCACHABLE - No caching
286 *
287 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
288 *
289 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
290 *
291 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
292 *
293 * BUGS: Needs a quiet flag for the cases where drivers do not mind
294 * failures and do not wish system log messages to be sent.
295 */
296
297int mtrr_add_page(unsigned long base, unsigned long size,
298 unsigned int type, char increment)
299{
300 int i;
301 mtrr_type ltype;
302 unsigned long lbase;
303 unsigned int lsize;
304 int error;
305
306 if (!mtrr_if)
307 return -ENXIO;
308
309 if ((error = mtrr_if->validate_add_page(base,size,type)))
310 return error;
311
312 if (type >= MTRR_NUM_TYPES) {
313 printk(KERN_WARNING "mtrr: type: %u invalid\n", type);
314 return -EINVAL;
315 }
316
317 /* If the type is WC, check that this processor supports it */
318 if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
319 printk(KERN_WARNING
320 "mtrr: your processor doesn't support write-combining\n");
321 return -ENOSYS;
322 }
323
324 if (base & size_or_mask || size & size_or_mask) {
325 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
326 return -EINVAL;
327 }
328
329 error = -EINVAL;
330
331 /* Search for existing MTRR */
332 down(&main_lock);
333 for (i = 0; i < num_var_ranges; ++i) {
334 mtrr_if->get(i, &lbase, &lsize, &ltype);
335 if (base >= lbase + lsize)
336 continue;
337 if ((base < lbase) && (base + size <= lbase))
338 continue;
339 /* At this point we know there is some kind of overlap/enclosure */
340 if ((base < lbase) || (base + size > lbase + lsize)) {
341 printk(KERN_WARNING
342 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
343 " 0x%lx000,0x%x000\n", base, size, lbase,
344 lsize);
345 goto out;
346 }
347 /* New region is enclosed by an existing region */
348 if (ltype != type) {
349 if (type == MTRR_TYPE_UNCACHABLE)
350 continue;
351 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
352 base, size, mtrr_attrib_to_str(ltype),
353 mtrr_attrib_to_str(type));
354 goto out;
355 }
356 if (increment)
357 ++usage_table[i];
358 error = i;
359 goto out;
360 }
361 /* Search for an empty MTRR */
362 i = mtrr_if->get_free_region(base, size);
363 if (i >= 0) {
364 set_mtrr(i, base, size, type);
365 usage_table[i] = 1;
366 } else
367 printk(KERN_INFO "mtrr: no more MTRRs available\n");
368 error = i;
369 out:
370 up(&main_lock);
371 return error;
372}
373
374/**
375 * mtrr_add - Add a memory type region
376 * @base: Physical base address of region
377 * @size: Physical size of region
378 * @type: Type of MTRR desired
379 * @increment: If this is true do usage counting on the region
380 *
381 * Memory type region registers control the caching on newer Intel and
382 * non-Intel processors. This function allows drivers to request that an
383 * MTRR be added. The details and hardware specifics of each processor's
384 * implementation are hidden from the caller, but nevertheless the
385 * caller should expect to need to provide a power of two size on an
386 * equivalent power of two boundary.
387 *
388 * If the region cannot be added, either because all regions are in use
389 * or the CPU cannot support it, a negative value is returned. On success
390 * the register number for this entry is returned, but should be treated
391 * as a cookie only.
392 *
393 * On a multiprocessor machine the changes are made to all processors.
394 * This is required on x86 by the Intel processors.
395 *
396 * The available types are
397 *
398 * %MTRR_TYPE_UNCACHABLE - No caching
399 *
400 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
401 *
402 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
403 *
404 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
405 *
406 * BUGS: Needs a quiet flag for the cases where drivers do not mind
407 * failures and do not wish system log messages to be sent.
408 */
409
410int
411mtrr_add(unsigned long base, unsigned long size, unsigned int type,
412 char increment)
413{
414 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
415 printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n");
416 printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
417 return -EINVAL;
418 }
419 return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
420 increment);
421}
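/*
 * Minimal driver-side usage sketch of mtrr_add() above and mtrr_del()
 * below (the frame buffer address and size are assumptions chosen for
 * illustration):
 *
 *	int reg = mtrr_add(0xf8000000, 0x400000, MTRR_TYPE_WRCOMB, 1);
 *	if (reg < 0)
 *		printk(KERN_WARNING "could not enable write-combining\n");
 *	...
 *	if (reg >= 0)
 *		mtrr_del(reg, 0xf8000000, 0x400000);
 *
 * The return value is only a cookie to hand back to mtrr_del(); don't
 * interpret the register number beyond that.
 */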
422
423/**
424 * mtrr_del_page - delete a memory type region
425 * @reg: Register returned by mtrr_add
426 * @base: Physical base address
427 * @size: Size of region
428 *
429 * If register is supplied then base and size are ignored. This is
430 * how drivers should call it.
431 *
432 * Releases an MTRR region. If the usage count drops to zero the
433 * register is freed and the region returns to default state.
434 * On success the register number is returned; on failure a negative
435 * error code is returned.
436 */
437
438int mtrr_del_page(int reg, unsigned long base, unsigned long size)
439{
440 int i, max;
441 mtrr_type ltype;
442 unsigned long lbase;
443 unsigned int lsize;
444 int error = -EINVAL;
445
446 if (!mtrr_if)
447 return -ENXIO;
448
449 max = num_var_ranges;
450 down(&main_lock);
451 if (reg < 0) {
452 /* Search for existing MTRR */
453 for (i = 0; i < max; ++i) {
454 mtrr_if->get(i, &lbase, &lsize, &ltype);
455 if (lbase == base && lsize == size) {
456 reg = i;
457 break;
458 }
459 }
460 if (reg < 0) {
461 printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
462 size);
463 goto out;
464 }
465 }
466 if (reg >= max) {
467 printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
468 goto out;
469 }
470 if (is_cpu(CYRIX) && !use_intel()) {
471 if ((reg == 3) && arr3_protected) {
472 printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
473 goto out;
474 }
475 }
476 mtrr_if->get(reg, &lbase, &lsize, &ltype);
477 if (lsize < 1) {
478 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
479 goto out;
480 }
481 if (usage_table[reg] < 1) {
482 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
483 goto out;
484 }
485 if (--usage_table[reg] < 1)
486 set_mtrr(reg, 0, 0, 0);
487 error = reg;
488 out:
489 up(&main_lock);
490 return error;
491}
492/**
493 * mtrr_del - delete a memory type region
494 * @reg: Register returned by mtrr_add
495 * @base: Physical base address
496 * @size: Size of region
497 *
498 * If register is supplied then base and size are ignored. This is
499 * how drivers should call it.
500 *
501 * Releases an MTRR region. If the usage count drops to zero the
502 * register is freed and the region returns to default state.
503 * On success the register number is returned; on failure a negative
504 * error code is returned.
505 */
506
507int
508mtrr_del(int reg, unsigned long base, unsigned long size)
509{
510 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
511 printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n");
512 printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
513 return -EINVAL;
514 }
515 return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
516}
517
518EXPORT_SYMBOL(mtrr_add);
519EXPORT_SYMBOL(mtrr_del);
520
521/* HACK ALERT!
522 * These should be called implicitly, but we can't do that until all the initcall
523 * stuff is done...
524 */
525extern void amd_init_mtrr(void);
526extern void cyrix_init_mtrr(void);
527extern void centaur_init_mtrr(void);
528
529static void __init init_ifs(void)
530{
531 amd_init_mtrr();
532 cyrix_init_mtrr();
533 centaur_init_mtrr();
534}
535
536static void __init init_other_cpus(void)
537{
538 if (use_intel())
539 get_mtrr_state();
540
541 /* bring up the other processors */
542 set_mtrr(~0U,0,0,0);
543
544 if (use_intel()) {
545 finalize_mtrr_state();
546 mtrr_state_warn();
547 }
548}
549
550
551struct mtrr_value {
552 mtrr_type ltype;
553 unsigned long lbase;
554 unsigned int lsize;
555};
556
557static struct mtrr_value * mtrr_state;
558
559static int mtrr_save(struct sys_device * sysdev, u32 state)
560{
561 int i;
562 int size = num_var_ranges * sizeof(struct mtrr_value);
563
564 mtrr_state = kmalloc(size,GFP_ATOMIC);
565 if (mtrr_state)
566 memset(mtrr_state,0,size);
567 else
568 return -ENOMEM;
569
570 for (i = 0; i < num_var_ranges; i++) {
571 mtrr_if->get(i,
572 &mtrr_state[i].lbase,
573 &mtrr_state[i].lsize,
574 &mtrr_state[i].ltype);
575 }
576 return 0;
577}
578
579static int mtrr_restore(struct sys_device * sysdev)
580{
581 int i;
582
583 for (i = 0; i < num_var_ranges; i++) {
584 if (mtrr_state[i].lsize)
585 set_mtrr(i,
586 mtrr_state[i].lbase,
587 mtrr_state[i].lsize,
588 mtrr_state[i].ltype);
589 }
590 kfree(mtrr_state);
591 return 0;
592}
593
594
595
596static struct sysdev_driver mtrr_sysdev_driver = {
597 .suspend = mtrr_save,
598 .resume = mtrr_restore,
599};
600
601
602/**
603 * mtrr_init - initialize mtrrs on the boot CPU
604 *
605 * This needs to be called early; before any of the other CPUs are
606 * initialized (i.e. before smp_init()).
607 *
608 */
609static int __init mtrr_init(void)
610{
611 init_ifs();
612
613 if (cpu_has_mtrr) {
614 mtrr_if = &generic_mtrr_ops;
615 size_or_mask = 0xff000000; /* 36 bits */
616 size_and_mask = 0x00f00000;
617
618 switch (boot_cpu_data.x86_vendor) {
619 case X86_VENDOR_AMD:
620 /* The original Athlon docs said that
621 total addressable memory is 44 bits wide.
622 It was not really clear whether its MTRRs
623 follow this or not. (Read: 44 or 36 bits).
624 However, "x86-64_overview.pdf" explicitly
625 states that "previous implementations support
626 36 bit MTRRs" and also provides a way to
627 query the width (in bits) of the physical
628 addressable memory on the Hammer family.
629 */
630 if (boot_cpu_data.x86 == 15
631 && (cpuid_eax(0x80000000) >= 0x80000008)) {
632 u32 phys_addr;
633 phys_addr = cpuid_eax(0x80000008) & 0xff;
634 size_or_mask =
635 ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
636 size_and_mask = ~size_or_mask & 0xfff00000;
637 }
638 /* Athlon MTRRs use an Intel-compatible interface for
639 * getting and setting */
640 break;
641 case X86_VENDOR_CENTAUR:
642 if (boot_cpu_data.x86 == 6) {
643			/* VIA Cyrix family has Intel-style MTRRs, but doesn't support PAE */
644 size_or_mask = 0xfff00000; /* 32 bits */
645 size_and_mask = 0;
646 }
647 break;
648
649 default:
650 break;
651 }
652 } else {
653 switch (boot_cpu_data.x86_vendor) {
654 case X86_VENDOR_AMD:
655 if (cpu_has_k6_mtrr) {
656 /* Pre-Athlon (K6) AMD CPU MTRRs */
657 mtrr_if = mtrr_ops[X86_VENDOR_AMD];
658 size_or_mask = 0xfff00000; /* 32 bits */
659 size_and_mask = 0;
660 }
661 break;
662 case X86_VENDOR_CENTAUR:
663 if (cpu_has_centaur_mcr) {
664 mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
665 size_or_mask = 0xfff00000; /* 32 bits */
666 size_and_mask = 0;
667 }
668 break;
669 case X86_VENDOR_CYRIX:
670 if (cpu_has_cyrix_arr) {
671 mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
672 size_or_mask = 0xfff00000; /* 32 bits */
673 size_and_mask = 0;
674 }
675 break;
676 default:
677 break;
678 }
679 }
680 printk(KERN_INFO "mtrr: v%s\n",MTRR_VERSION);
681
682 if (mtrr_if) {
683 set_num_var_ranges();
684 init_table();
685 init_other_cpus();
686
687 return sysdev_driver_register(&cpu_sysdev_class,
688 &mtrr_sysdev_driver);
689 }
690 return -ENXIO;
691}
692
693subsys_initcall(mtrr_init);
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
new file mode 100644
index 000000000000..de1351245599
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -0,0 +1,98 @@
1/*
2 * local mtrr defines.
3 */
4
5#ifndef TRUE
6#define TRUE 1
7#define FALSE 0
8#endif
9
10#define MTRRcap_MSR 0x0fe
11#define MTRRdefType_MSR 0x2ff
12
13#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
14#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
15
16#define NUM_FIXED_RANGES 88
17#define MTRRfix64K_00000_MSR 0x250
18#define MTRRfix16K_80000_MSR 0x258
19#define MTRRfix16K_A0000_MSR 0x259
20#define MTRRfix4K_C0000_MSR 0x268
21#define MTRRfix4K_C8000_MSR 0x269
22#define MTRRfix4K_D0000_MSR 0x26a
23#define MTRRfix4K_D8000_MSR 0x26b
24#define MTRRfix4K_E0000_MSR 0x26c
25#define MTRRfix4K_E8000_MSR 0x26d
26#define MTRRfix4K_F0000_MSR 0x26e
27#define MTRRfix4K_F8000_MSR 0x26f
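/*
 * Note on the fixed-range count: the eleven MSRs above each hold eight
 * byte-sized type fields (one 64K register, two 16K registers and eight
 * 4K registers), which gives NUM_FIXED_RANGES == 8 * 11 == 88.
 */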
28
29#define MTRR_CHANGE_MASK_FIXED 0x01
30#define MTRR_CHANGE_MASK_VARIABLE 0x02
31#define MTRR_CHANGE_MASK_DEFTYPE 0x04
32
33/* In the Intel processor's MTRR interface, the MTRR type is always held in
34 an 8 bit field: */
35typedef u8 mtrr_type;
36
37struct mtrr_ops {
38 u32 vendor;
39 u32 use_intel_if;
40// void (*init)(void);
41 void (*set)(unsigned int reg, unsigned long base,
42 unsigned long size, mtrr_type type);
43 void (*set_all)(void);
44
45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned int *size, mtrr_type * type);
47 int (*get_free_region) (unsigned long base, unsigned long size);
48
49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type);
51 int (*have_wrcomb)(void);
52};
53
54extern int generic_get_free_region(unsigned long base, unsigned long size);
55extern int generic_validate_add_page(unsigned long base, unsigned long size,
56 unsigned int type);
57
58extern struct mtrr_ops generic_mtrr_ops;
59
60extern int positive_have_wrcomb(void);
61
62/* library functions for processor-specific routines */
63struct set_mtrr_context {
64 unsigned long flags;
65 unsigned long deftype_lo;
66 unsigned long deftype_hi;
67 unsigned long cr4val;
68 unsigned long ccr3;
69};
70
71struct mtrr_var_range {
72 unsigned long base_lo;
73 unsigned long base_hi;
74 unsigned long mask_lo;
75 unsigned long mask_hi;
76};
77
78void set_mtrr_done(struct set_mtrr_context *ctxt);
79void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
80void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
81
82void get_mtrr_state(void);
83
84extern void set_mtrr_ops(struct mtrr_ops * ops);
85
86extern u32 size_or_mask, size_and_mask;
87extern struct mtrr_ops * mtrr_if;
88
89#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
90#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
91
92extern unsigned int num_var_ranges;
93
94void finalize_mtrr_state(void);
95void mtrr_state_warn(void);
96char *mtrr_attrib_to_str(int x);
97void mtrr_wrmsr(unsigned, unsigned, unsigned);
98
diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c
new file mode 100644
index 000000000000..f62ecd15811a
--- /dev/null
+++ b/arch/i386/kernel/cpu/mtrr/state.c
@@ -0,0 +1,78 @@
1#include <linux/mm.h>
2#include <linux/init.h>
3#include <asm/io.h>
4#include <asm/mtrr.h>
5#include <asm/msr.h>
6#include "mtrr.h"
7
8
9/* Put the processor into a state where MTRRs can be safely set */
10void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
11{
12 unsigned int cr0;
13
14 /* Disable interrupts locally */
15 local_irq_save(ctxt->flags);
16
17 if (use_intel() || is_cpu(CYRIX)) {
18
19 /* Save value of CR4 and clear Page Global Enable (bit 7) */
20 if ( cpu_has_pge ) {
21 ctxt->cr4val = read_cr4();
22 write_cr4(ctxt->cr4val & (unsigned char) ~(1 << 7));
23 }
24
25 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
26 a side-effect */
27 cr0 = read_cr0() | 0x40000000;
28 wbinvd();
29 write_cr0(cr0);
30 wbinvd();
31
32 if (use_intel())
33 /* Save MTRR state */
34 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
35 else
36		/* Cyrix ARRs - everything else was excluded at the top */
37 ctxt->ccr3 = getCx86(CX86_CCR3);
38 }
39}
40
41void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
42{
43 if (use_intel())
44 /* Disable MTRRs, and set the default type to uncached */
45 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
46 ctxt->deftype_hi);
47 else if (is_cpu(CYRIX))
48		/* Cyrix ARRs - everything else was excluded at the top */
49 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
50}
51
52/* Restore the processor after a set_mtrr_prepare */
53void set_mtrr_done(struct set_mtrr_context *ctxt)
54{
55 if (use_intel() || is_cpu(CYRIX)) {
56
57 /* Flush caches and TLBs */
58 wbinvd();
59
60 /* Restore MTRRdefType */
61 if (use_intel())
62 /* Intel (P6) standard MTRRs */
63 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
64 else
65 /* Cyrix ARRs - everything else was excluded at the top */
66 setCx86(CX86_CCR3, ctxt->ccr3);
67
68 /* Enable caches */
69 write_cr0(read_cr0() & 0xbfffffff);
70
71 /* Restore value of CR4 */
72 if ( cpu_has_pge )
73 write_cr4(ctxt->cr4val);
74 }
75 /* Re-enable interrupts locally (if enabled previously) */
76 local_irq_restore(ctxt->flags);
77}
78
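For orientation, here is a sketch of the call sequence these three helpers are designed for (an editor's illustration, not part of this patch; example_set_mtrr is a hypothetical name standing in for the vendor-specific set() routines elsewhere in this directory):

	static void example_set_mtrr(unsigned int reg, unsigned long base,
				     unsigned long size, mtrr_type type)
	{
		struct set_mtrr_context ctxt;

		set_mtrr_prepare_save(&ctxt);	/* irqs off, caches flushed and disabled */
		set_mtrr_cache_disable(&ctxt);	/* MTRRs (or Cyrix ARRs) switched off */

		/* ... program range register 'reg' with base/size/type here ... */

		set_mtrr_done(&ctxt);		/* restore MTRR state, caches and irqs */
	}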
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
new file mode 100644
index 000000000000..30898a260a5c
--- /dev/null
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -0,0 +1,63 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/string.h>
4#include <asm/processor.h>
5
6#include "cpu.h"
7
8/*
9 * Detect a NexGen CPU running without BIOS hypercode new enough
10 * to have CPUID. (Thanks to Herbert Oppmann)
11 */
12
13static int __init deep_magic_nexgen_probe(void)
14{
15 int ret;
16
17 __asm__ __volatile__ (
18 " movw $0x5555, %%ax\n"
19 " xorw %%dx,%%dx\n"
20 " movw $2, %%cx\n"
21 " divw %%cx\n"
22 " movl $0, %%eax\n"
23 " jnz 1f\n"
24 " movl $1, %%eax\n"
25 "1:\n"
26 : "=a" (ret) : : "cx", "dx" );
27 return ret;
28}
29
30static void __init init_nexgen(struct cpuinfo_x86 * c)
31{
32 c->x86_cache_size = 256; /* A few had 1 MB... */
33}
34
35static void __init nexgen_identify(struct cpuinfo_x86 * c)
36{
37 /* Detect NexGen with old hypercode */
38 if ( deep_magic_nexgen_probe() ) {
39 strcpy(c->x86_vendor_id, "NexGenDriven");
40 }
41 generic_identify(c);
42}
43
44static struct cpu_dev nexgen_cpu_dev __initdata = {
45 .c_vendor = "Nexgen",
46 .c_ident = { "NexGenDriven" },
47 .c_models = {
48 { .vendor = X86_VENDOR_NEXGEN,
49 .family = 5,
50 .model_names = { [1] = "Nx586" }
51 },
52 },
53 .c_init = init_nexgen,
54 .c_identify = nexgen_identify,
55};
56
57int __init nexgen_init_cpu(void)
58{
59 cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
60 return 0;
61}
62
63//early_arch_initcall(nexgen_init_cpu);
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
new file mode 100644
index 000000000000..c8d83fdc237a
--- /dev/null
+++ b/arch/i386/kernel/cpu/proc.c
@@ -0,0 +1,149 @@
1#include <linux/smp.h>
2#include <linux/timex.h>
3#include <linux/string.h>
4#include <asm/semaphore.h>
5#include <linux/seq_file.h>
6
7/*
8 * Get CPU information for use by the procfs.
9 */
10static int show_cpuinfo(struct seq_file *m, void *v)
11{
12 /*
13 * These flag bits must match the definitions in <asm/cpufeature.h>.
14 * NULL means this bit is undefined or reserved; either way it doesn't
15 * have meaning as far as Linux is concerned. Note that it's important
16 * to realize there is a difference between this table and CPUID -- if
17 * applications want to get the raw CPUID data, they should access
18 * /dev/cpu/<cpu_nr>/cpuid instead.
19 */
20 static char *x86_cap_flags[] = {
21 /* Intel-defined */
22 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
23 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
24 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
25 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
26
27 /* AMD-defined */
28 "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
29 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
31 NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow",
32
33 /* Transmeta-defined */
34 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
37 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38
39 /* Other (Linux-defined) */
40 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
41 NULL, NULL, NULL, NULL,
42 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
45
46 /* Intel-defined (#2) */
47 "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
48 "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
49 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
50 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
51
52 /* VIA/Cyrix/Centaur-defined */
53 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
54 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
57
58 /* AMD-defined (#2) */
59 "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL,
60 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
61 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 };
64 struct cpuinfo_x86 *c = v;
65 int i, n = c - cpu_data;
66 int fpu_exception;
67
68#ifdef CONFIG_SMP
69 if (!cpu_online(n))
70 return 0;
71#endif
72 seq_printf(m, "processor\t: %d\n"
73 "vendor_id\t: %s\n"
74 "cpu family\t: %d\n"
75 "model\t\t: %d\n"
76 "model name\t: %s\n",
77 n,
78 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
79 c->x86,
80 c->x86_model,
81 c->x86_model_id[0] ? c->x86_model_id : "unknown");
82
83 if (c->x86_mask || c->cpuid_level >= 0)
84 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
85 else
86 seq_printf(m, "stepping\t: unknown\n");
87
88 if ( cpu_has(c, X86_FEATURE_TSC) ) {
89 seq_printf(m, "cpu MHz\t\t: %lu.%03lu\n",
90 cpu_khz / 1000, (cpu_khz % 1000));
91 }
92
93 /* Cache size */
94 if (c->x86_cache_size >= 0)
95 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
96#ifdef CONFIG_X86_HT
97 seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]);
98 seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings);
99#endif
100
101 /* We use exception 16 if we have hardware math and we've either seen it or the CPU claims it is internal */
102 fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
103 seq_printf(m, "fdiv_bug\t: %s\n"
104 "hlt_bug\t\t: %s\n"
105 "f00f_bug\t: %s\n"
106 "coma_bug\t: %s\n"
107 "fpu\t\t: %s\n"
108 "fpu_exception\t: %s\n"
109 "cpuid level\t: %d\n"
110 "wp\t\t: %s\n"
111 "flags\t\t:",
112 c->fdiv_bug ? "yes" : "no",
113 c->hlt_works_ok ? "no" : "yes",
114 c->f00f_bug ? "yes" : "no",
115 c->coma_bug ? "yes" : "no",
116 c->hard_math ? "yes" : "no",
117 fpu_exception ? "yes" : "no",
118 c->cpuid_level,
119 c->wp_works_ok ? "yes" : "no");
120
121 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
122 if ( test_bit(i, c->x86_capability) &&
123 x86_cap_flags[i] != NULL )
124 seq_printf(m, " %s", x86_cap_flags[i]);
125
126 seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n",
127 c->loops_per_jiffy/(500000/HZ),
128 (c->loops_per_jiffy/(5000/HZ)) % 100);
129 return 0;
130}
131
132static void *c_start(struct seq_file *m, loff_t *pos)
133{
134 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
135}
136static void *c_next(struct seq_file *m, void *v, loff_t *pos)
137{
138 ++*pos;
139 return c_start(m, pos);
140}
141static void c_stop(struct seq_file *m, void *v)
142{
143}
144struct seq_operations cpuinfo_op = {
145 .start = c_start,
146 .next = c_next,
147 .stop = c_stop,
148 .show = show_cpuinfo,
149};
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
new file mode 100644
index 000000000000..8602425628ca
--- /dev/null
+++ b/arch/i386/kernel/cpu/rise.c
@@ -0,0 +1,53 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/bitops.h>
4#include <asm/processor.h>
5
6#include "cpu.h"
7
8static void __init init_rise(struct cpuinfo_x86 *c)
9{
10 printk("CPU: Rise iDragon");
11 if (c->x86_model > 2)
12 printk(" II");
13 printk("\n");
14
15 /* Unhide possibly hidden capability flags
16 The mp6 iDragon family don't have MSRs.
17 We switch on extra features with this cpuid weirdness: */
18 __asm__ (
19 "movl $0x6363452a, %%eax\n\t"
20 "movl $0x3231206c, %%ecx\n\t"
21 "movl $0x2a32313a, %%edx\n\t"
22 "cpuid\n\t"
23 "movl $0x63634523, %%eax\n\t"
24 "movl $0x32315f6c, %%ecx\n\t"
25 "movl $0x2333313a, %%edx\n\t"
26 "cpuid\n\t" : : : "eax", "ebx", "ecx", "edx"
27 );
28 set_bit(X86_FEATURE_CX8, c->x86_capability);
29}
30
31static struct cpu_dev rise_cpu_dev __initdata = {
32 .c_vendor = "Rise",
33 .c_ident = { "RiseRiseRise" },
34 .c_models = {
35 { .vendor = X86_VENDOR_RISE, .family = 5, .model_names =
36 {
37 [0] = "iDragon",
38 [2] = "iDragon",
39 [8] = "iDragon II",
40 [9] = "iDragon II"
41 }
42 },
43 },
44 .c_init = init_rise,
45};
46
47int __init rise_init_cpu(void)
48{
49 cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev;
50 return 0;
51}
52
53//early_arch_initcall(rise_init_cpu);
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
new file mode 100644
index 000000000000..f57e5ee94943
--- /dev/null
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -0,0 +1,107 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h>
4#include <asm/msr.h>
5#include "cpu.h"
6
7static void __init init_transmeta(struct cpuinfo_x86 *c)
8{
9 unsigned int cap_mask, uk, max, dummy;
10 unsigned int cms_rev1, cms_rev2;
11 unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev;
12 char cpu_info[65];
13
14 get_model_name(c); /* Same as AMD/Cyrix */
15 display_cacheinfo(c);
16
17 /* Print CMS and CPU revision */
18 max = cpuid_eax(0x80860000);
19 cpu_rev = 0;
20 if ( max >= 0x80860001 ) {
21 cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
22 if (cpu_rev != 0x02000000) {
23 printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
24 (cpu_rev >> 24) & 0xff,
25 (cpu_rev >> 16) & 0xff,
26 (cpu_rev >> 8) & 0xff,
27 cpu_rev & 0xff,
28 cpu_freq);
29 }
30 }
31 if ( max >= 0x80860002 ) {
32 cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
33 if (cpu_rev == 0x02000000) {
34 printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
35 new_cpu_rev, cpu_freq);
36 }
37 printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
38 (cms_rev1 >> 24) & 0xff,
39 (cms_rev1 >> 16) & 0xff,
40 (cms_rev1 >> 8) & 0xff,
41 cms_rev1 & 0xff,
42 cms_rev2);
43 }
44 if ( max >= 0x80860006 ) {
45 cpuid(0x80860003,
46 (void *)&cpu_info[0],
47 (void *)&cpu_info[4],
48 (void *)&cpu_info[8],
49 (void *)&cpu_info[12]);
50 cpuid(0x80860004,
51 (void *)&cpu_info[16],
52 (void *)&cpu_info[20],
53 (void *)&cpu_info[24],
54 (void *)&cpu_info[28]);
55 cpuid(0x80860005,
56 (void *)&cpu_info[32],
57 (void *)&cpu_info[36],
58 (void *)&cpu_info[40],
59 (void *)&cpu_info[44]);
60 cpuid(0x80860006,
61 (void *)&cpu_info[48],
62 (void *)&cpu_info[52],
63 (void *)&cpu_info[56],
64 (void *)&cpu_info[60]);
65 cpu_info[64] = '\0';
66 printk(KERN_INFO "CPU: %s\n", cpu_info);
67 }
68
69 /* Unhide possibly hidden capability flags */
70 rdmsr(0x80860004, cap_mask, uk);
71 wrmsr(0x80860004, ~0, uk);
72 c->x86_capability[0] = cpuid_edx(0x00000001);
73 wrmsr(0x80860004, cap_mask, uk);
74
75 /* If we can run i686 user-space code, call us an i686 */
76#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)
77 if ( c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686 )
78 c->x86 = 6;
79}
80
81static void transmeta_identify(struct cpuinfo_x86 * c)
82{
83 u32 xlvl;
84 generic_identify(c);
85
86 /* Transmeta-defined flags: level 0x80860001 */
87 xlvl = cpuid_eax(0x80860000);
88 if ( (xlvl & 0xffff0000) == 0x80860000 ) {
89 if ( xlvl >= 0x80860001 )
90 c->x86_capability[2] = cpuid_edx(0x80860001);
91 }
92}
93
94static struct cpu_dev transmeta_cpu_dev __initdata = {
95 .c_vendor = "Transmeta",
96 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
97 .c_init = init_transmeta,
98 .c_identify = transmeta_identify,
99};
100
101int __init transmeta_init_cpu(void)
102{
103 cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
104 return 0;
105}
106
107//early_arch_initcall(transmeta_init_cpu);
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
new file mode 100644
index 000000000000..264fcad559d5
--- /dev/null
+++ b/arch/i386/kernel/cpu/umc.c
@@ -0,0 +1,33 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h>
4#include "cpu.h"
5
6/* UMC chips appear to be either 386 or 486, so no special init takes place.
7 */
8static void __init init_umc(struct cpuinfo_x86 * c)
9{
10
11}
12
13static struct cpu_dev umc_cpu_dev __initdata = {
14 .c_vendor = "UMC",
15 .c_ident = { "UMC UMC UMC" },
16 .c_models = {
17 { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
18 {
19 [1] = "U5D",
20 [2] = "U5S",
21 }
22 },
23 },
24 .c_init = init_umc,
25};
26
27int __init umc_init_cpu(void)
28{
29 cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
30 return 0;
31}
32
33//early_arch_initcall(umc_init_cpu);
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
new file mode 100644
index 000000000000..2e2756345bb2
--- /dev/null
+++ b/arch/i386/kernel/cpuid.c
@@ -0,0 +1,246 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * cpuid.c
15 *
16 * x86 CPUID access device
17 *
18 * This device is accessed by lseek() to the appropriate CPUID level
19 * and then read in chunks of 16 bytes. A larger size means multiple
20 * reads of consecutive levels.
21 *
22 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
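As an illustrative aside (an editor's sketch, not part of this patch; it assumes the /dev/cpu/0/cpuid node has been created), the access pattern described in the comment above looks like this from user space:

	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		uint32_t regs[4];	/* one 16-byte record: EAX, EBX, ECX, EDX */
		int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

		if (fd < 0)
			return 1;
		lseek(fd, 0, SEEK_SET);		/* select CPUID level 0 */
		if (read(fd, regs, sizeof(regs)) != sizeof(regs)) {
			close(fd);
			return 1;
		}
		printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
		       regs[0], regs[1], regs[2], regs[3]);
		close(fd);
		return 0;
	}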
25
26#include <linux/module.h>
27#include <linux/config.h>
28
29#include <linux/types.h>
30#include <linux/errno.h>
31#include <linux/fcntl.h>
32#include <linux/init.h>
33#include <linux/poll.h>
34#include <linux/smp.h>
35#include <linux/major.h>
36#include <linux/fs.h>
37#include <linux/smp_lock.h>
38#include <linux/fs.h>
39#include <linux/device.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
42
43#include <asm/processor.h>
44#include <asm/msr.h>
45#include <asm/uaccess.h>
46#include <asm/system.h>
47
48static struct class_simple *cpuid_class;
49
50#ifdef CONFIG_SMP
51
52struct cpuid_command {
53 int cpu;
54 u32 reg;
55 u32 *data;
56};
57
58static void cpuid_smp_cpuid(void *cmd_block)
59{
60 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
61
62 if (cmd->cpu == smp_processor_id())
63 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
64 &cmd->data[3]);
65}
66
67static inline void do_cpuid(int cpu, u32 reg, u32 * data)
68{
69 struct cpuid_command cmd;
70
71 preempt_disable();
72 if (cpu == smp_processor_id()) {
73 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
74 } else {
75 cmd.cpu = cpu;
76 cmd.reg = reg;
77 cmd.data = data;
78
79 smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1);
80 }
81 preempt_enable();
82}
83#else /* ! CONFIG_SMP */
84
85static inline void do_cpuid(int cpu, u32 reg, u32 * data)
86{
87 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
88}
89
90#endif /* ! CONFIG_SMP */
91
92static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
93{
94 loff_t ret;
95
96 lock_kernel();
97
98 switch (orig) {
99 case 0:
100 file->f_pos = offset;
101 ret = file->f_pos;
102 break;
103 case 1:
104 file->f_pos += offset;
105 ret = file->f_pos;
106 break;
107 default:
108 ret = -EINVAL;
109 }
110
111 unlock_kernel();
112 return ret;
113}
114
115static ssize_t cpuid_read(struct file *file, char __user *buf,
116 size_t count, loff_t * ppos)
117{
118 char __user *tmp = buf;
119 u32 data[4];
120 size_t rv;
121 u32 reg = *ppos;
122 int cpu = iminor(file->f_dentry->d_inode);
123
124 if (count % 16)
125 return -EINVAL; /* Invalid chunk size */
126
127 for (rv = 0; count; count -= 16) {
128 do_cpuid(cpu, reg, data);
129 if (copy_to_user(tmp, &data, 16))
130 return -EFAULT;
131 tmp += 16;
132 *ppos = reg++;
133 }
134
135 return tmp - buf;
136}
137
138static int cpuid_open(struct inode *inode, struct file *file)
139{
140 unsigned int cpu = iminor(file->f_dentry->d_inode);
141 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
142
143 if (cpu >= NR_CPUS || !cpu_online(cpu))
144 return -ENXIO; /* No such CPU */
145 if (c->cpuid_level < 0)
146 return -EIO; /* CPUID not supported */
147
148 return 0;
149}
150
151/*
152 * File operations we support
153 */
154static struct file_operations cpuid_fops = {
155 .owner = THIS_MODULE,
156 .llseek = cpuid_seek,
157 .read = cpuid_read,
158 .open = cpuid_open,
159};
160
161static int cpuid_class_simple_device_add(int i)
162{
163 int err = 0;
164 struct class_device *class_err;
165
166 class_err = class_simple_device_add(cpuid_class, MKDEV(CPUID_MAJOR, i), NULL, "cpu%d",i);
167 if (IS_ERR(class_err))
168 err = PTR_ERR(class_err);
169 return err;
170}
171
172static int __devinit cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
173{
174 unsigned int cpu = (unsigned long)hcpu;
175
176 switch (action) {
177 case CPU_ONLINE:
178 cpuid_class_simple_device_add(cpu);
179 break;
180 case CPU_DEAD:
181 class_simple_device_remove(MKDEV(CPUID_MAJOR, cpu));
182 break;
183 }
184 return NOTIFY_OK;
185}
186
187static struct notifier_block cpuid_class_cpu_notifier =
188{
189 .notifier_call = cpuid_class_cpu_callback,
190};
191
192static int __init cpuid_init(void)
193{
194 int i, err = 0;
195 i = 0;
196
197 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
198 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
199 CPUID_MAJOR);
200 err = -EBUSY;
201 goto out;
202 }
203 cpuid_class = class_simple_create(THIS_MODULE, "cpuid");
204 if (IS_ERR(cpuid_class)) {
205 err = PTR_ERR(cpuid_class);
206 goto out_chrdev;
207 }
208 for_each_online_cpu(i) {
209 err = cpuid_class_simple_device_add(i);
210 if (err != 0)
211 goto out_class;
212 }
213 register_cpu_notifier(&cpuid_class_cpu_notifier);
214
215 err = 0;
216 goto out;
217
218out_class:
219 i = 0;
220 for_each_online_cpu(i) {
221 class_simple_device_remove(MKDEV(CPUID_MAJOR, i));
222 }
223 class_simple_destroy(cpuid_class);
224out_chrdev:
225 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
226out:
227 return err;
228}
229
230static void __exit cpuid_exit(void)
231{
232 int cpu = 0;
233
234 for_each_online_cpu(cpu)
235 class_simple_device_remove(MKDEV(CPUID_MAJOR, cpu));
236 class_simple_destroy(cpuid_class);
237 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
238 unregister_cpu_notifier(&cpuid_class_cpu_notifier);
239}
240
241module_init(cpuid_init);
242module_exit(cpuid_exit);
243
244MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
245MODULE_DESCRIPTION("x86 generic CPUID driver");
246MODULE_LICENSE("GPL");
diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
new file mode 100644
index 000000000000..6ed7e28f306c
--- /dev/null
+++ b/arch/i386/kernel/dmi_scan.c
@@ -0,0 +1,487 @@
1#include <linux/types.h>
2#include <linux/kernel.h>
3#include <linux/string.h>
4#include <linux/init.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/acpi.h>
8#include <asm/io.h>
9#include <linux/pm.h>
10#include <asm/system.h>
11#include <linux/dmi.h>
12#include <linux/bootmem.h>
13
14
15struct dmi_header
16{
17 u8 type;
18 u8 length;
19 u16 handle;
20};
21
22#undef DMI_DEBUG
23
24#ifdef DMI_DEBUG
25#define dmi_printk(x) printk x
26#else
27#define dmi_printk(x)
28#endif
29
30static char * __init dmi_string(struct dmi_header *dm, u8 s)
31{
32 u8 *bp=(u8 *)dm;
33 bp+=dm->length;
34 if(!s)
35 return "";
36 s--;
37 while(s>0 && *bp)
38 {
39 bp+=strlen(bp);
40 bp++;
41 s--;
42 }
43 return bp;
44}
45
46/*
47 * We have to be cautious here. We have seen BIOSes with DMI pointers
48 * pointing to completely the wrong place, for example.
49 */
50
51static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dmi_header *))
52{
53 u8 *buf;
54 struct dmi_header *dm;
55 u8 *data;
56 int i=0;
57
58 buf = bt_ioremap(base, len);
59 if(buf==NULL)
60 return -1;
61
62 data = buf;
63
64 /*
65 * Stop when we see all the items the table claimed to have
66 * OR we run off the end of the table (also happens)
67 */
68
69 while(i<num && data-buf+sizeof(struct dmi_header)<=len)
70 {
71 dm=(struct dmi_header *)data;
72 /*
73	 * We want to know the total length (formatted area and strings)
74 * before decoding to make sure we won't run off the table in
75 * dmi_decode or dmi_string
76 */
77 data+=dm->length;
78 while(data-buf<len-1 && (data[0] || data[1]))
79 data++;
80 if(data-buf<len-1)
81 decode(dm);
82 data+=2;
83 i++;
84 }
85 bt_iounmap(buf, len);
86 return 0;
87}
88
89
90inline static int __init dmi_checksum(u8 *buf)
91{
92 u8 sum=0;
93 int a;
94
95 for(a=0; a<15; a++)
96 sum+=buf[a];
97 return (sum==0);
98}
99
100static int __init dmi_iterate(void (*decode)(struct dmi_header *))
101{
102 u8 buf[15];
103 char __iomem *p, *q;
104
105 /*
106 * no iounmap() for that ioremap(); it would be a no-op, but it's
107 * so early in setup that sucker gets confused into doing what
108 * it shouldn't if we actually call it.
109 */
110 p = ioremap(0xF0000, 0x10000);
111 if (p == NULL)
112 return -1;
113 for (q = p; q < p + 0x10000; q += 16) {
114 memcpy_fromio(buf, q, 15);
115 if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf))
116 {
117 u16 num=buf[13]<<8|buf[12];
118 u16 len=buf[7]<<8|buf[6];
119 u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8];
120
121 /*
122 * DMI version 0.0 means that the real version is taken from
123 * the SMBIOS version, which we don't know at this point.
124 */
125 if(buf[14]!=0)
126 printk(KERN_INFO "DMI %d.%d present.\n",
127 buf[14]>>4, buf[14]&0x0F);
128 else
129 printk(KERN_INFO "DMI present.\n");
130 dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n",
131 num, len));
132 dmi_printk((KERN_INFO "DMI table at 0x%08X.\n",
133 base));
134 if(dmi_table(base,len, num, decode)==0)
135 return 0;
136 }
137 }
138 return -1;
139}
140
141static char *dmi_ident[DMI_STRING_MAX];
142
143/*
144 * Save a DMI string
145 */
146
147static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string)
148{
149 char *d = (char*)dm;
150 char *p = dmi_string(dm, d[string]);
151 if(p==NULL || *p == 0)
152 return;
153 if (dmi_ident[slot])
154 return;
155 dmi_ident[slot] = alloc_bootmem(strlen(p)+1);
156 if(dmi_ident[slot])
157 strcpy(dmi_ident[slot], p);
158 else
159 printk(KERN_ERR "dmi_save_ident: out of memory.\n");
160}
161
162/*
163 * Ugly compatibility crap.
164 */
165#define dmi_blacklist dmi_system_id
166#define NO_MATCH { DMI_NONE, NULL}
167#define MATCH DMI_MATCH
168
169/*
170 * Toshiba keyboard likes to repeat keys when they are not repeated.
171 */
172
173static __init int broken_toshiba_keyboard(struct dmi_blacklist *d)
174{
175 printk(KERN_WARNING "Toshiba with broken keyboard detected. If your keyboard sometimes generates 3 keypresses instead of one, see http://davyd.ucc.asn.au/projects/toshiba/README\n");
176 return 0;
177}
178
179
180#ifdef CONFIG_ACPI_SLEEP
181static __init int reset_videomode_after_s3(struct dmi_blacklist *d)
182{
183 /* See acpi_wakeup.S */
184 extern long acpi_video_flags;
185 acpi_video_flags |= 2;
186 return 0;
187}
188#endif
189
190
191#ifdef CONFIG_ACPI_BOOT
192extern int acpi_force;
193
194static __init __attribute__((unused)) int dmi_disable_acpi(struct dmi_blacklist *d)
195{
196 if (!acpi_force) {
197 printk(KERN_NOTICE "%s detected: acpi off\n",d->ident);
198 disable_acpi();
199 } else {
200 printk(KERN_NOTICE
201 "Warning: DMI blacklist says broken, but acpi forced\n");
202 }
203 return 0;
204}
205
206/*
207 * Limit ACPI to CPU enumeration for HT
208 */
209static __init __attribute__((unused)) int force_acpi_ht(struct dmi_blacklist *d)
210{
211 if (!acpi_force) {
212 printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident);
213 disable_acpi();
214 acpi_ht = 1;
215 } else {
216 printk(KERN_NOTICE
217 "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
218 }
219 return 0;
220}
221#endif
222
223#ifdef CONFIG_ACPI_PCI
224static __init int disable_acpi_irq(struct dmi_blacklist *d)
225{
226 if (!acpi_force) {
227 printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
228 d->ident);
229 acpi_noirq_set();
230 }
231 return 0;
232}
233static __init int disable_acpi_pci(struct dmi_blacklist *d)
234{
235 if (!acpi_force) {
236 printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
237 d->ident);
238 acpi_disable_pci();
239 }
240 return 0;
241}
242#endif
243
244/*
245 * Process the DMI blacklists
246 */
247
248
249/*
250 * This will be expanded over time to force things like the APM
251 * interrupt mask settings according to the laptop
252 */
253
254static __initdata struct dmi_blacklist dmi_blacklist[]={
255
256 { broken_toshiba_keyboard, "Toshiba Satellite 4030cdt", { /* Keyboard generates spurious repeats */
257 MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
258 NO_MATCH, NO_MATCH, NO_MATCH
259 } },
260#ifdef CONFIG_ACPI_SLEEP
261 { reset_videomode_after_s3, "Toshiba Satellite 4030cdt", { /* Reset video mode after returning from ACPI S3 sleep */
262 MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
263 NO_MATCH, NO_MATCH, NO_MATCH
264 } },
265#endif
266
267#ifdef CONFIG_ACPI_BOOT
268 /*
269 * If your system is blacklisted here, but you find that acpi=force
270 * works for you, please contact acpi-devel@sourceforge.net
271 */
272
273 /*
274 * Boxes that need ACPI disabled
275 */
276
277 { dmi_disable_acpi, "IBM Thinkpad", {
278 MATCH(DMI_BOARD_VENDOR, "IBM"),
279 MATCH(DMI_BOARD_NAME, "2629H1G"),
280 NO_MATCH, NO_MATCH }},
281
282 /*
283 * Boxes that need acpi=ht
284 */
285
286 { force_acpi_ht, "FSC Primergy T850", {
287 MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
288 MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
289 NO_MATCH, NO_MATCH }},
290
291 { force_acpi_ht, "DELL GX240", {
292 MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
293 MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
294 NO_MATCH, NO_MATCH }},
295
296 { force_acpi_ht, "HP VISUALIZE NT Workstation", {
297 MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
298 MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
299 NO_MATCH, NO_MATCH }},
300
301 { force_acpi_ht, "Compaq Workstation W8000", {
302 MATCH(DMI_SYS_VENDOR, "Compaq"),
303 MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
304 NO_MATCH, NO_MATCH }},
305
306 { force_acpi_ht, "ASUS P4B266", {
307 MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
308 MATCH(DMI_BOARD_NAME, "P4B266"),
309 NO_MATCH, NO_MATCH }},
310
311 { force_acpi_ht, "ASUS P2B-DS", {
312 MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
313 MATCH(DMI_BOARD_NAME, "P2B-DS"),
314 NO_MATCH, NO_MATCH }},
315
316 { force_acpi_ht, "ASUS CUR-DLS", {
317 MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
318 MATCH(DMI_BOARD_NAME, "CUR-DLS"),
319 NO_MATCH, NO_MATCH }},
320
321 { force_acpi_ht, "ABIT i440BX-W83977", {
322 MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
323 MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
324 NO_MATCH, NO_MATCH }},
325
326 { force_acpi_ht, "IBM Bladecenter", {
327 MATCH(DMI_BOARD_VENDOR, "IBM"),
328 MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
329 NO_MATCH, NO_MATCH }},
330
331 { force_acpi_ht, "IBM eServer xSeries 360", {
332 MATCH(DMI_BOARD_VENDOR, "IBM"),
333 MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
334 NO_MATCH, NO_MATCH }},
335
336 { force_acpi_ht, "IBM eserver xSeries 330", {
337 MATCH(DMI_BOARD_VENDOR, "IBM"),
338 MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
339 NO_MATCH, NO_MATCH }},
340
341 { force_acpi_ht, "IBM eserver xSeries 440", {
342 MATCH(DMI_BOARD_VENDOR, "IBM"),
343 MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
344 NO_MATCH, NO_MATCH }},
345
346#endif // CONFIG_ACPI_BOOT
347
348#ifdef CONFIG_ACPI_PCI
349 /*
350 * Boxes that need ACPI PCI IRQ routing disabled
351 */
352
353 { disable_acpi_irq, "ASUS A7V", {
354 MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
355 MATCH(DMI_BOARD_NAME, "<A7V>"),
356 /* newer BIOS, Revision 1011, does work */
357 MATCH(DMI_BIOS_VERSION, "ASUS A7V ACPI BIOS Revision 1007"),
358 NO_MATCH }},
359
360 /*
361 * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
362 */
363 { disable_acpi_pci, "ASUS PR-DLS", { /* _BBN 0 bug */
364 MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
365 MATCH(DMI_BOARD_NAME, "PR-DLS"),
366 MATCH(DMI_BIOS_VERSION, "ASUS PR-DLS ACPI BIOS Revision 1010"),
367 MATCH(DMI_BIOS_DATE, "03/21/2003") }},
368
369 { disable_acpi_pci, "Acer TravelMate 36x Laptop", {
370 MATCH(DMI_SYS_VENDOR, "Acer"),
371 MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
372 NO_MATCH, NO_MATCH
373 } },
374
375#endif
376
377 { NULL, }
378};
379
380/*
381 * Process a DMI table entry. Right now all we care about are the BIOS
382 * and machine entries. For 2.5 we should pull the smbus controller info
383 * out of here.
384 */
385
386static void __init dmi_decode(struct dmi_header *dm)
387{
388#ifdef DMI_DEBUG
389 u8 *data = (u8 *)dm;
390#endif
391
392 switch(dm->type)
393 {
394 case 0:
395 dmi_printk(("BIOS Vendor: %s\n",
396 dmi_string(dm, data[4])));
397 dmi_save_ident(dm, DMI_BIOS_VENDOR, 4);
398 dmi_printk(("BIOS Version: %s\n",
399 dmi_string(dm, data[5])));
400 dmi_save_ident(dm, DMI_BIOS_VERSION, 5);
401 dmi_printk(("BIOS Release: %s\n",
402 dmi_string(dm, data[8])));
403 dmi_save_ident(dm, DMI_BIOS_DATE, 8);
404 break;
405 case 1:
406 dmi_printk(("System Vendor: %s\n",
407 dmi_string(dm, data[4])));
408 dmi_save_ident(dm, DMI_SYS_VENDOR, 4);
409 dmi_printk(("Product Name: %s\n",
410 dmi_string(dm, data[5])));
411 dmi_save_ident(dm, DMI_PRODUCT_NAME, 5);
412 dmi_printk(("Version: %s\n",
413 dmi_string(dm, data[6])));
414 dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6);
415 dmi_printk(("Serial Number: %s\n",
416 dmi_string(dm, data[7])));
417 break;
418 case 2:
419 dmi_printk(("Board Vendor: %s\n",
420 dmi_string(dm, data[4])));
421 dmi_save_ident(dm, DMI_BOARD_VENDOR, 4);
422 dmi_printk(("Board Name: %s\n",
423 dmi_string(dm, data[5])));
424 dmi_save_ident(dm, DMI_BOARD_NAME, 5);
425 dmi_printk(("Board Version: %s\n",
426 dmi_string(dm, data[6])));
427 dmi_save_ident(dm, DMI_BOARD_VERSION, 6);
428 break;
429 }
430}
431
432void __init dmi_scan_machine(void)
433{
434 int err = dmi_iterate(dmi_decode);
435 if(err == 0)
436 dmi_check_system(dmi_blacklist);
437 else
438 printk(KERN_INFO "DMI not present.\n");
439}
440
441
442/**
443 * dmi_check_system - check system DMI data
444 * @list: array of dmi_system_id structures to match against
445 *
446 * Walk the blacklist table running matching functions until someone
447 * returns non zero or we hit the end. Callback function is called for
448 *	each successful match. Returns the number of matches.
449 */
450int dmi_check_system(struct dmi_system_id *list)
451{
452 int i, count = 0;
453 struct dmi_system_id *d = list;
454
455 while (d->ident) {
456 for (i = 0; i < ARRAY_SIZE(d->matches); i++) {
457 int s = d->matches[i].slot;
458 if (s == DMI_NONE)
459 continue;
460 if (dmi_ident[s] && strstr(dmi_ident[s], d->matches[i].substr))
461 continue;
462 /* No match */
463 goto fail;
464 }
465 if (d->callback && d->callback(d))
466 break;
467 count++;
468fail: d++;
469 }
470
471 return count;
472}
473
474EXPORT_SYMBOL(dmi_check_system);
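A hedged usage sketch of the interface documented above (illustrative only and not part of this patch; example_quirk, example_table and the matched strings are hypothetical). A caller declares a NULL-terminated dmi_system_id table and lets dmi_check_system() invoke the callback for each entry whose matches all succeed:

	static int example_quirk(struct dmi_system_id *d)
	{
		printk(KERN_INFO "%s detected, applying example quirk\n", d->ident);
		return 0;	/* a non-zero return stops the scan early */
	}

	static struct dmi_system_id example_table[] = {
		{ example_quirk, "Example Board", {
			DMI_MATCH(DMI_BOARD_VENDOR, "Example Vendor"),
			DMI_MATCH(DMI_BOARD_NAME, "Example Board"),
		} },
		{ NULL, }
	};

	/* e.g. from an init path:  dmi_check_system(example_table); */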
475
476/**
477 * dmi_get_system_info - return DMI data value
478 *	@field: data index (see enum dmi_field)
479 *
480 * Returns one DMI data value, can be used to perform
481 * complex DMI data checks.
482 */
483char * dmi_get_system_info(int field)
484{
485 return dmi_ident[field];
486}
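A one-line usage sketch for the accessor above (illustrative; the returned pointer may be NULL if the string was never saved during dmi_scan_machine()):

	char *board = dmi_get_system_info(DMI_BOARD_NAME);

	if (board && strstr(board, "Example"))
		printk(KERN_INFO "running on an Example board\n");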
487
diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c
new file mode 100644
index 000000000000..789af3e9fb1f
--- /dev/null
+++ b/arch/i386/kernel/doublefault.c
@@ -0,0 +1,65 @@
1#include <linux/mm.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h>
5#include <linux/fs.h>
6
7#include <asm/uaccess.h>
8#include <asm/pgtable.h>
9#include <asm/processor.h>
10#include <asm/desc.h>
11
12#define DOUBLEFAULT_STACKSIZE (1024)
13static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
14#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
15
16#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + 0x1000000)
17
18static void doublefault_fn(void)
19{
20 struct Xgt_desc_struct gdt_desc = {0, 0};
21 unsigned long gdt, tss;
22
23 __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory");
24 gdt = gdt_desc.address;
25
26 printk("double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
27
28 if (ptr_ok(gdt)) {
29 gdt += GDT_ENTRY_TSS << 3;
30 tss = *(u16 *)(gdt+2);
31 tss += *(u8 *)(gdt+4) << 16;
32 tss += *(u8 *)(gdt+7) << 24;
33 printk("double fault, tss at %08lx\n", tss);
34
35 if (ptr_ok(tss)) {
36 struct tss_struct *t = (struct tss_struct *)tss;
37
38 printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp);
39
40 printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
41 t->eax, t->ebx, t->ecx, t->edx);
42 printk("esi = %08lx, edi = %08lx\n",
43 t->esi, t->edi);
44 }
45 }
46
47 for (;;) /* nothing */;
48}
49
50struct tss_struct doublefault_tss __cacheline_aligned = {
51 .esp0 = STACK_START,
52 .ss0 = __KERNEL_DS,
53 .ldt = 0,
54 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
55
56 .eip = (unsigned long) doublefault_fn,
57 .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
58 .esp = STACK_START,
59 .es = __USER_DS,
60 .cs = __KERNEL_CS,
61 .ss = __KERNEL_DS,
62 .ds = __USER_DS,
63
64 .__cr3 = __pa(swapper_pg_dir)
65};
diff --git a/arch/i386/kernel/early_printk.c b/arch/i386/kernel/early_printk.c
new file mode 100644
index 000000000000..92f812ba275c
--- /dev/null
+++ b/arch/i386/kernel/early_printk.c
@@ -0,0 +1,2 @@
1
2#include "../../x86_64/kernel/early_printk.c"
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
new file mode 100644
index 000000000000..9e5e0d8bd36e
--- /dev/null
+++ b/arch/i386/kernel/efi.c
@@ -0,0 +1,635 @@
1/*
2 * Extensible Firmware Interface
3 *
4 * Based on Extensible Firmware Interface Specification version 1.0
5 *
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 * Copyright (C) 1999-2002 Hewlett-Packard Co.
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 *
12 * All EFI Runtime Services are not implemented yet as EFI only
13 * supports physical mode addressing on SoftSDV. This is to be fixed
14 * in a future version. --drummond 1999-07-20
15 *
16 * Implemented EFI runtime services and virtual mode calls. --davidm
17 *
18 * Goutham Rao: <goutham.rao@intel.com>
19 * Skip non-WB memory and ignore empty memory ranges.
20 */
21
22#include <linux/config.h>
23#include <linux/kernel.h>
24#include <linux/init.h>
25#include <linux/mm.h>
26#include <linux/types.h>
27#include <linux/time.h>
28#include <linux/spinlock.h>
29#include <linux/bootmem.h>
30#include <linux/ioport.h>
31#include <linux/module.h>
32#include <linux/efi.h>
33
34#include <asm/setup.h>
35#include <asm/io.h>
36#include <asm/page.h>
37#include <asm/pgtable.h>
38#include <asm/processor.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
41
42#define EFI_DEBUG 0
43#define PFX "EFI: "
44
45extern efi_status_t asmlinkage efi_call_phys(void *, ...);
46
47struct efi efi;
48EXPORT_SYMBOL(efi);
49static struct efi efi_phys __initdata;
50struct efi_memory_map memmap __initdata;
51
52/*
53 * We require an early boot_ioremap mapping mechanism initially
54 */
55extern void * boot_ioremap(unsigned long, unsigned long);
56
57/*
58 * To call the EFI runtime services in physical addressing mode we need a
59 * prelog/epilog around each invocation to disable interrupts, to claim the
60 * EFI runtime service handler exclusively, and to mirror the kernel mapping
61 * into low memory space, say 0 - 3G.
62 */
63
64static unsigned long efi_rt_eflags;
65static DEFINE_SPINLOCK(efi_rt_lock);
66static pgd_t efi_bak_pg_dir_pointer[2];
67
68static void efi_call_phys_prelog(void)
69{
70 unsigned long cr4;
71 unsigned long temp;
72
73 spin_lock(&efi_rt_lock);
74 local_irq_save(efi_rt_eflags);
75
76 /*
77 * If I don't have PSE, I should just duplicate two entries in page
78 * directory. If I have PSE, I just need to duplicate one entry in
79 * page directory.
80 */
81 __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4));
82
83 if (cr4 & X86_CR4_PSE) {
84 efi_bak_pg_dir_pointer[0].pgd =
85 swapper_pg_dir[pgd_index(0)].pgd;
86 swapper_pg_dir[0].pgd =
87 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
88 } else {
89 efi_bak_pg_dir_pointer[0].pgd =
90 swapper_pg_dir[pgd_index(0)].pgd;
91 efi_bak_pg_dir_pointer[1].pgd =
92 swapper_pg_dir[pgd_index(0x400000)].pgd;
93 swapper_pg_dir[pgd_index(0)].pgd =
94 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
95 temp = PAGE_OFFSET + 0x400000;
96 swapper_pg_dir[pgd_index(0x400000)].pgd =
97 swapper_pg_dir[pgd_index(temp)].pgd;
98 }
99
100 /*
101 * After the lock is released, the original page table is restored.
102 */
103 local_flush_tlb();
104
105 cpu_gdt_descr[0].address = __pa(cpu_gdt_descr[0].address);
106 __asm__ __volatile__("lgdt %0":"=m"
107 (*(struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0])));
108}
109
110static void efi_call_phys_epilog(void)
111{
112 unsigned long cr4;
113
114 cpu_gdt_descr[0].address =
115 (unsigned long) __va(cpu_gdt_descr[0].address);
116 __asm__ __volatile__("lgdt %0":"=m"(cpu_gdt_descr));
117 __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4));
118
119 if (cr4 & X86_CR4_PSE) {
120 swapper_pg_dir[pgd_index(0)].pgd =
121 efi_bak_pg_dir_pointer[0].pgd;
122 } else {
123 swapper_pg_dir[pgd_index(0)].pgd =
124 efi_bak_pg_dir_pointer[0].pgd;
125 swapper_pg_dir[pgd_index(0x400000)].pgd =
126 efi_bak_pg_dir_pointer[1].pgd;
127 }
128
129 /*
130 * After the lock is released, the original page table is restored.
131 */
132 local_flush_tlb();
133
134 local_irq_restore(efi_rt_eflags);
135 spin_unlock(&efi_rt_lock);
136}
137
138static efi_status_t
139phys_efi_set_virtual_address_map(unsigned long memory_map_size,
140 unsigned long descriptor_size,
141 u32 descriptor_version,
142 efi_memory_desc_t *virtual_map)
143{
144 efi_status_t status;
145
146 efi_call_phys_prelog();
147 status = efi_call_phys(efi_phys.set_virtual_address_map,
148 memory_map_size, descriptor_size,
149 descriptor_version, virtual_map);
150 efi_call_phys_epilog();
151 return status;
152}
153
154static efi_status_t
155phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
156{
157 efi_status_t status;
158
159 efi_call_phys_prelog();
160 status = efi_call_phys(efi_phys.get_time, tm, tc);
161 efi_call_phys_epilog();
162 return status;
163}
164
165inline int efi_set_rtc_mmss(unsigned long nowtime)
166{
167 int real_seconds, real_minutes;
168 efi_status_t status;
169 efi_time_t eft;
170 efi_time_cap_t cap;
171
172 spin_lock(&efi_rt_lock);
173 status = efi.get_time(&eft, &cap);
174 spin_unlock(&efi_rt_lock);
175 if (status != EFI_SUCCESS)
176 panic("Ooops, efitime: can't read time!\n");
177 real_seconds = nowtime % 60;
178 real_minutes = nowtime / 60;
179
180 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
181 real_minutes += 30;
182 real_minutes %= 60;
183
184 eft.minute = real_minutes;
185 eft.second = real_seconds;
186
187 if (status != EFI_SUCCESS) {
188 printk("Ooops: efitime: can't read time!\n");
189 return -1;
190 }
191 return 0;
192}
193/*
194 * This should only be used during kernel init and before runtime
195 * services have been remapped, therefore, we'll need to call in physical
196 * mode. Note, this call isn't used later, so mark it __init.
197 */
198inline unsigned long __init efi_get_time(void)
199{
200 efi_status_t status;
201 efi_time_t eft;
202 efi_time_cap_t cap;
203
204 status = phys_efi_get_time(&eft, &cap);
205 if (status != EFI_SUCCESS)
206 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
207
208 return mktime(eft.year, eft.month, eft.day, eft.hour,
209 eft.minute, eft.second);
210}
211
212int is_available_memory(efi_memory_desc_t * md)
213{
214 if (!(md->attribute & EFI_MEMORY_WB))
215 return 0;
216
217 switch (md->type) {
218 case EFI_LOADER_CODE:
219 case EFI_LOADER_DATA:
220 case EFI_BOOT_SERVICES_CODE:
221 case EFI_BOOT_SERVICES_DATA:
222 case EFI_CONVENTIONAL_MEMORY:
223 return 1;
224 }
225 return 0;
226}
227
228/*
229 * We need to map the EFI memory map again after paging_init().
230 */
231void __init efi_map_memmap(void)
232{
233 memmap.map = NULL;
234
235 memmap.map = (efi_memory_desc_t *)
236 bt_ioremap((unsigned long) memmap.phys_map,
237 (memmap.nr_map * sizeof(efi_memory_desc_t)));
238
239 if (memmap.map == NULL)
240 printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
241}
242
243#if EFI_DEBUG
244static void __init print_efi_memmap(void)
245{
246 efi_memory_desc_t *md;
247 int i;
248
249 for (i = 0; i < memmap.nr_map; i++) {
250 md = &memmap.map[i];
251 printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
252 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
253 i, md->type, md->attribute, md->phys_addr,
254 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
255 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
256 }
257}
258#endif /* EFI_DEBUG */
259
260/*
261 * Walks the EFI memory map and calls CALLBACK once for each EFI
262 * memory descriptor that has memory that is available for kernel use.
263 */
264void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
265{
266 int prev_valid = 0;
267 struct range {
268 unsigned long start;
269 unsigned long end;
270 } prev, curr;
271 efi_memory_desc_t *md;
272 unsigned long start, end;
273 int i;
274
275 for (i = 0; i < memmap.nr_map; i++) {
276 md = &memmap.map[i];
277
278 if ((md->num_pages == 0) || (!is_available_memory(md)))
279 continue;
280
281 curr.start = md->phys_addr;
282 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
283
284 if (!prev_valid) {
285 prev = curr;
286 prev_valid = 1;
287 } else {
288 if (curr.start < prev.start)
289 printk(KERN_INFO PFX "Unordered memory map\n");
290 if (prev.end == curr.start)
291 prev.end = curr.end;
292 else {
293 start =
294 (unsigned long) (PAGE_ALIGN(prev.start));
295 end = (unsigned long) (prev.end & PAGE_MASK);
296 if ((end > start)
297 && (*callback) (start, end, arg) < 0)
298 return;
299 prev = curr;
300 }
301 }
302 }
303 if (prev_valid) {
304 start = (unsigned long) PAGE_ALIGN(prev.start);
305 end = (unsigned long) (prev.end & PAGE_MASK);
306 if (end > start)
307 (*callback) (start, end, arg);
308 }
309}
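As an illustration of the callback interface above (an editor's sketch, not part of this patch; count_pages and the calling snippet are hypothetical), a callback of type efi_freemem_callback_t receives each coalesced, page-aligned free range and can, for example, accumulate a page count:

	static int __init count_pages(unsigned long start, unsigned long end, void *arg)
	{
		unsigned long *total = arg;

		*total += (end - start) >> PAGE_SHIFT;
		return 0;	/* a negative return value stops the walk */
	}

	/*
	 * Typical caller:
	 *	unsigned long pages = 0;
	 *	efi_memmap_walk(count_pages, &pages);
	 */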
310
311void __init efi_init(void)
312{
313 efi_config_table_t *config_tables;
314 efi_runtime_services_t *runtime;
315 efi_char16_t *c16;
316 char vendor[100] = "unknown";
317 unsigned long num_config_tables;
318 int i = 0;
319
320 memset(&efi, 0, sizeof(efi) );
321 memset(&efi_phys, 0, sizeof(efi_phys));
322
323 efi_phys.systab = EFI_SYSTAB;
324 memmap.phys_map = EFI_MEMMAP;
325 memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
326 memmap.desc_version = EFI_MEMDESC_VERSION;
327
328 efi.systab = (efi_system_table_t *)
329 boot_ioremap((unsigned long) efi_phys.systab,
330 sizeof(efi_system_table_t));
331 /*
332 * Verify the EFI Table
333 */
334 if (efi.systab == NULL)
335 printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
336 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
337 printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
338 if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0)
339 printk(KERN_ERR PFX
340 "Warning: EFI system table major version mismatch: "
341 "got %d.%02d, expected %d.%02d\n",
342 efi.systab->hdr.revision >> 16,
343 efi.systab->hdr.revision & 0xffff,
344 EFI_SYSTEM_TABLE_REVISION >> 16,
345 EFI_SYSTEM_TABLE_REVISION & 0xffff);
346 /*
347 * Grab some details from the system table
348 */
349 num_config_tables = efi.systab->nr_tables;
350 config_tables = (efi_config_table_t *)efi.systab->tables;
351 runtime = efi.systab->runtime;
352
353 /*
354 * Show what we know for posterity
355 */
356 c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
357 if (c16) {
358 for (i = 0; i < sizeof(vendor) && *c16; ++i)
359 vendor[i] = *c16++;
360 vendor[i] = '\0';
361 } else
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363
364 printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
365 efi.systab->hdr.revision >> 16,
366 efi.systab->hdr.revision & 0xffff, vendor);
367
368 /*
369 * Let's see what config tables the firmware passed to us.
370 */
371 config_tables = (efi_config_table_t *)
372 boot_ioremap((unsigned long) config_tables,
373 num_config_tables * sizeof(efi_config_table_t));
374
375 if (config_tables == NULL)
376 printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
377
378 for (i = 0; i < num_config_tables; i++) {
379 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
380 efi.mps = (void *)config_tables[i].table;
381 printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
382 } else
383 if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
384 efi.acpi20 = __va(config_tables[i].table);
385 printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
386 } else
387 if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
388 efi.acpi = __va(config_tables[i].table);
389 printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
390 } else
391 if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
392 efi.smbios = (void *) config_tables[i].table;
393 printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
394 } else
395 if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
396 efi.hcdp = (void *)config_tables[i].table;
397 printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
398 } else
399 if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
400 efi.uga = (void *)config_tables[i].table;
401 printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
402 }
403 }
404 printk("\n");
405
406 /*
407 * Check out the runtime services table. We need to map
408 * the runtime services table so that we can grab the physical
409 * address of several of the EFI runtime functions, needed to
410 * set the firmware into virtual mode.
411 */
412
413 runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
414 runtime,
415 sizeof(efi_runtime_services_t));
416 if (runtime != NULL) {
417 /*
418 * We will only need *early* access to the following
419 * two EFI runtime services before set_virtual_address_map
420 * is invoked.
421 */
422 efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
423 efi_phys.set_virtual_address_map =
424 (efi_set_virtual_address_map_t *)
425 runtime->set_virtual_address_map;
426 } else
427 printk(KERN_ERR PFX "Could not map the runtime service table!\n");
428
429 /* Map the EFI memory map for use until paging_init() */
430
431 memmap.map = (efi_memory_desc_t *)
432 boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
433
434 if (memmap.map == NULL)
435 printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
436
437 if (EFI_MEMDESC_SIZE != sizeof(efi_memory_desc_t)) {
438 printk(KERN_WARNING PFX "Warning! Kernel-defined memdesc doesn't "
439 "match the one from EFI!\n");
440 }
441#if EFI_DEBUG
442 print_efi_memmap();
443#endif
444}
445
446/*
447 * This function will switch the EFI runtime services to virtual mode.
448 * Essentially, look through the EFI memmap and map every region that
449 * has the runtime attribute bit set in its memory descriptor and update
450 * that memory descriptor with the virtual address obtained from ioremap().
451 * This enables the runtime services to be called without having to
452 * thunk back into physical mode for every invocation.
453 */
454
455void __init efi_enter_virtual_mode(void)
456{
457 efi_memory_desc_t *md;
458 efi_status_t status;
459 int i;
460
461 efi.systab = NULL;
462
463 for (i = 0; i < memmap.nr_map; i++) {
464 md = &memmap.map[i];
465
466 if (md->attribute & EFI_MEMORY_RUNTIME) {
467 md->virt_addr =
468 (unsigned long)ioremap(md->phys_addr,
469 md->num_pages << EFI_PAGE_SHIFT);
470 if (!(unsigned long)md->virt_addr) {
471 printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
472 (unsigned long)md->phys_addr);
473 }
474
475 if (((unsigned long)md->phys_addr <=
476 (unsigned long)efi_phys.systab) &&
477 ((unsigned long)efi_phys.systab <
478 md->phys_addr +
479 ((unsigned long)md->num_pages <<
480 EFI_PAGE_SHIFT))) {
481 unsigned long addr;
482
483 addr = md->virt_addr - md->phys_addr +
484 (unsigned long)efi_phys.systab;
485 efi.systab = (efi_system_table_t *)addr;
486 }
487 }
488 }
489
490 if (!efi.systab)
491 BUG();
492
493 status = phys_efi_set_virtual_address_map(
494 sizeof(efi_memory_desc_t) * memmap.nr_map,
495 sizeof(efi_memory_desc_t),
496 memmap.desc_version,
497 memmap.phys_map);
498
499 if (status != EFI_SUCCESS) {
500 printk (KERN_ALERT "You are screwed! "
501 "Unable to switch EFI into virtual mode "
502 "(status=%lx)\n", status);
503 panic("EFI call to SetVirtualAddressMap() failed!");
504 }
505
506 /*
507 * Now that EFI is in virtual mode, update the function
508 * pointers in the runtime service table to the new virtual addresses.
509 */
510
511 efi.get_time = (efi_get_time_t *) efi.systab->runtime->get_time;
512 efi.set_time = (efi_set_time_t *) efi.systab->runtime->set_time;
513 efi.get_wakeup_time = (efi_get_wakeup_time_t *)
514 efi.systab->runtime->get_wakeup_time;
515 efi.set_wakeup_time = (efi_set_wakeup_time_t *)
516 efi.systab->runtime->set_wakeup_time;
517 efi.get_variable = (efi_get_variable_t *)
518 efi.systab->runtime->get_variable;
519 efi.get_next_variable = (efi_get_next_variable_t *)
520 efi.systab->runtime->get_next_variable;
521 efi.set_variable = (efi_set_variable_t *)
522 efi.systab->runtime->set_variable;
523 efi.get_next_high_mono_count = (efi_get_next_high_mono_count_t *)
524 efi.systab->runtime->get_next_high_mono_count;
525 efi.reset_system = (efi_reset_system_t *)
526 efi.systab->runtime->reset_system;
527}
528
529void __init
530efi_initialize_iomem_resources(struct resource *code_resource,
531 struct resource *data_resource)
532{
533 struct resource *res;
534 efi_memory_desc_t *md;
535 int i;
536
537 for (i = 0; i < memmap.nr_map; i++) {
538 md = &memmap.map[i];
539
540 if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
541 0x100000000ULL)
542 continue;
543 res = alloc_bootmem_low(sizeof(struct resource));
544 switch (md->type) {
545 case EFI_RESERVED_TYPE:
546 res->name = "Reserved Memory";
547 break;
548 case EFI_LOADER_CODE:
549 res->name = "Loader Code";
550 break;
551 case EFI_LOADER_DATA:
552 res->name = "Loader Data";
553 break;
554 case EFI_BOOT_SERVICES_DATA:
555 res->name = "BootServices Data";
556 break;
557 case EFI_BOOT_SERVICES_CODE:
558 res->name = "BootServices Code";
559 break;
560 case EFI_RUNTIME_SERVICES_CODE:
561 res->name = "Runtime Service Code";
562 break;
563 case EFI_RUNTIME_SERVICES_DATA:
564 res->name = "Runtime Service Data";
565 break;
566 case EFI_CONVENTIONAL_MEMORY:
567 res->name = "Conventional Memory";
568 break;
569 case EFI_UNUSABLE_MEMORY:
570 res->name = "Unusable Memory";
571 break;
572 case EFI_ACPI_RECLAIM_MEMORY:
573 res->name = "ACPI Reclaim";
574 break;
575 case EFI_ACPI_MEMORY_NVS:
576 res->name = "ACPI NVS";
577 break;
578 case EFI_MEMORY_MAPPED_IO:
579 res->name = "Memory Mapped IO";
580 break;
581 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
582 res->name = "Memory Mapped IO Port Space";
583 break;
584 default:
585 res->name = "Reserved";
586 break;
587 }
588 res->start = md->phys_addr;
589 res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
590 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
591 if (request_resource(&iomem_resource, res) < 0)
592 printk(KERN_ERR PFX "Failed to allocate res %s : 0x%lx-0x%lx\n",
593 res->name, res->start, res->end);
594 /*
595 * We don't know which region contains kernel data so we try
596 * it repeatedly and let the resource manager test it.
597 */
598 if (md->type == EFI_CONVENTIONAL_MEMORY) {
599 request_resource(res, code_resource);
600 request_resource(res, data_resource);
601 }
602 }
603}
604
605/*
606 * Convenience functions to obtain memory types and attributes
607 */
608
609u32 efi_mem_type(unsigned long phys_addr)
610{
611 efi_memory_desc_t *md;
612 int i;
613
614 for (i = 0; i < memmap.nr_map; i++) {
615 md = &memmap.map[i];
616 if ((md->phys_addr <= phys_addr) && (phys_addr <
617 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
618 return md->type;
619 }
620 return 0;
621}
622
623u64 efi_mem_attributes(unsigned long phys_addr)
624{
625 efi_memory_desc_t *md;
626 int i;
627
628 for (i = 0; i < memmap.nr_map; i++) {
629 md = &memmap.map[i];
630 if ((md->phys_addr <= phys_addr) && (phys_addr <
631 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
632 return md->attribute;
633 }
634 return 0;
635}
diff --git a/arch/i386/kernel/efi_stub.S b/arch/i386/kernel/efi_stub.S
new file mode 100644
index 000000000000..08c0312d9b6c
--- /dev/null
+++ b/arch/i386/kernel/efi_stub.S
@@ -0,0 +1,124 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off.
6 */
7
8#include <linux/config.h>
9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/pgtable.h>
12
13/*
14 * efi_call_phys(void *, ...) is a function with variable parameters.
15 * All callers of this function ensure that all the parameters are 4 bytes.
16 */
17
18/*
19 * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
20 * So we'd better save all of them at the beginning of this function and restore
21 * at the end no matter how many we use, because we can not assure EFI runtime
22 * service functions will comply with gcc calling convention, too.
23 */
24
25.text
26ENTRY(efi_call_phys)
27 /*
28 * 0. The function can only be called in Linux kernel. So CS has been
29 * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
30 * the values of these registers are the same. And, the corresponding
31 * GDT entries are identical. So I will do nothing about segment reg
32 * and GDT, but change GDT base register in prelog and epilog.
33 */
34
35 /*
36 * 1. We are currently running with EIP = <physical address> + PAGE_OFFSET.
37 * To switch smoothly from virtual mode to flat mode, the mapping of
38 * low virtual memory has been set up in the prologue and
39 * epilogue.
40 */
41 movl $1f, %edx
42 subl $__PAGE_OFFSET, %edx
43 jmp *%edx
441:
45
46 /*
47 * 2. On top of the stack is now the return
48 * address in the caller of efi_call_phys(), then parameter 1,
49 * parameter 2, ..., param n. To make things easy, we save the return
50 * address of efi_call_phys in a global variable.
51 */
52 popl %edx
53 movl %edx, saved_return_addr
54 /* get the function pointer into ECX*/
55 popl %ecx
56 movl %ecx, efi_rt_function_ptr
57 movl $2f, %edx
58 subl $__PAGE_OFFSET, %edx
59 pushl %edx
60
61 /*
62 * 3. Clear PG bit in %CR0.
63 */
64 movl %cr0, %edx
65 andl $0x7fffffff, %edx
66 movl %edx, %cr0
67 jmp 1f
681:
69
70 /*
71 * 4. Adjust stack pointer.
72 */
73 subl $__PAGE_OFFSET, %esp
74
75 /*
76 * 5. Call the physical function.
77 */
78 jmp *%ecx
79
802:
81 /*
82 * 6. After the EFI runtime service returns, control comes back to the
83 * following instruction, so readjust the stack pointer first.
84 */
85 addl $__PAGE_OFFSET, %esp
86
87 /*
88 * 7. Restore PG bit
89 */
90 movl %cr0, %edx
91 orl $0x80000000, %edx
92 movl %edx, %cr0
93 jmp 1f
941:
95 /*
96 * 8. Now switch back from flat mode to virtual mode by
97 * adding PAGE_OFFSET to EIP.
98 */
99 movl $1f, %edx
100 jmp *%edx
1011:
102
103 /*
104 * 9. Balance the stack. Because EAX contains the return value,
105 * we must not clobber it.
106 */
107 leal efi_rt_function_ptr, %edx
108 movl (%edx), %ecx
109 pushl %ecx
110
111 /*
112 * 10. Push the saved return address onto the stack and return.
113 */
114 leal saved_return_addr, %edx
115 movl (%edx), %ecx
116 pushl %ecx
117 ret
118.previous
119
120.data
121saved_return_addr:
122 .long 0
123efi_rt_function_ptr:
124 .long 0
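A hedged sketch (not part of the patch) of the C side of this stub: the prototype is variadic, every argument is 32 bits wide, and callers pass the physical address of the runtime service as the first argument. The wrapper efi_get_time_phys below is an invented name, purely illustrative:

#include <linux/efi.h>

extern efi_status_t efi_call_phys(void *fn, ...);	/* the stub above */

/* Illustrative wrapper: call a GetTime service via its physical address. */
static efi_status_t efi_get_time_phys(void *get_time_fn, efi_time_t *tm)
{
	return efi_call_phys(get_time_fn, tm, NULL);
}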
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
new file mode 100644
index 000000000000..1e45ff292bc9
--- /dev/null
+++ b/arch/i386/kernel/entry.S
@@ -0,0 +1,950 @@
1/*
2 * linux/arch/i386/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * entry.S contains the system-call and fault low-level handling routines.
9 * This also contains the timer-interrupt handler, as well as all interrupts
10 * and faults that can result in a task-switch.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after a timer-interrupt and after each system call.
14 *
15 * I changed all the .align's to 4 (16 byte alignment), as that's faster
16 * on a 486.
17 *
18 * Stack layout in 'ret_from_system_call':
19 * ptrace needs to have all regs on the stack.
20 * if the order here is changed, it needs to be
21 * updated in fork.c:copy_process, signal.c:do_signal,
22 * ptrace.c and ptrace.h
23 *
24 * 0(%esp) - %ebx
25 * 4(%esp) - %ecx
26 * 8(%esp) - %edx
27 * C(%esp) - %esi
28 * 10(%esp) - %edi
29 * 14(%esp) - %ebp
30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds
32 * 20(%esp) - %es
33 * 24(%esp) - orig_eax
34 * 28(%esp) - %eip
35 * 2C(%esp) - %cs
36 * 30(%esp) - %eflags
37 * 34(%esp) - %oldesp
38 * 38(%esp) - %oldss
39 *
40 * "current" is in register %ebx during any slow entries.
41 */
42
43#include <linux/config.h>
44#include <linux/linkage.h>
45#include <asm/thread_info.h>
46#include <asm/errno.h>
47#include <asm/segment.h>
48#include <asm/smp.h>
49#include <asm/page.h>
50#include <asm/desc.h>
51#include "irq_vectors.h"
52
53#define nr_syscalls ((syscall_table_size)/4)
54
55EBX = 0x00
56ECX = 0x04
57EDX = 0x08
58ESI = 0x0C
59EDI = 0x10
60EBP = 0x14
61EAX = 0x18
62DS = 0x1C
63ES = 0x20
64ORIG_EAX = 0x24
65EIP = 0x28
66CS = 0x2C
67EFLAGS = 0x30
68OLDESP = 0x34
69OLDSS = 0x38
70
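These offsets mirror the fields of struct pt_regs as pushed by SAVE_ALL below; as a C-level sketch of the same layout (cf. include/asm-i386/ptrace.h), each field offset corresponds to one of the constants above:

struct pt_regs {
	long ebx;	/* 0x00 */
	long ecx;	/* 0x04 */
	long edx;	/* 0x08 */
	long esi;	/* 0x0C */
	long edi;	/* 0x10 */
	long ebp;	/* 0x14 */
	long eax;	/* 0x18 */
	int  xds;	/* 0x1C */
	int  xes;	/* 0x20 */
	long orig_eax;	/* 0x24 */
	long eip;	/* 0x28 */
	int  xcs;	/* 0x2C */
	long eflags;	/* 0x30 */
	long esp;	/* 0x34 */
	int  xss;	/* 0x38 */
};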
71CF_MASK = 0x00000001
72TF_MASK = 0x00000100
73IF_MASK = 0x00000200
74DF_MASK = 0x00000400
75NT_MASK = 0x00004000
76VM_MASK = 0x00020000
77
78#ifdef CONFIG_PREEMPT
79#define preempt_stop cli
80#else
81#define preempt_stop
82#define resume_kernel restore_nocheck
83#endif
84
85#define SAVE_ALL \
86 cld; \
87 pushl %es; \
88 pushl %ds; \
89 pushl %eax; \
90 pushl %ebp; \
91 pushl %edi; \
92 pushl %esi; \
93 pushl %edx; \
94 pushl %ecx; \
95 pushl %ebx; \
96 movl $(__USER_DS), %edx; \
97 movl %edx, %ds; \
98 movl %edx, %es;
99
100#define RESTORE_INT_REGS \
101 popl %ebx; \
102 popl %ecx; \
103 popl %edx; \
104 popl %esi; \
105 popl %edi; \
106 popl %ebp; \
107 popl %eax
108
109#define RESTORE_REGS \
110 RESTORE_INT_REGS; \
1111: popl %ds; \
1122: popl %es; \
113.section .fixup,"ax"; \
1143: movl $0,(%esp); \
115 jmp 1b; \
1164: movl $0,(%esp); \
117 jmp 2b; \
118.previous; \
119.section __ex_table,"a";\
120 .align 4; \
121 .long 1b,3b; \
122 .long 2b,4b; \
123.previous
124
125
126ENTRY(ret_from_fork)
127 pushl %eax
128 call schedule_tail
129 GET_THREAD_INFO(%ebp)
130 popl %eax
131 jmp syscall_exit
132
133/*
134 * Return to user mode is not as complex as all this looks,
135 * but we want the default path for a system call return to
136 * go as quickly as possible which is why some of this is
137 * less clear than it otherwise should be.
138 */
139
140 # userspace resumption stub bypassing syscall exit tracing
141 ALIGN
142ret_from_exception:
143 preempt_stop
144ret_from_intr:
145 GET_THREAD_INFO(%ebp)
146 movl EFLAGS(%esp), %eax # mix EFLAGS and CS
147 movb CS(%esp), %al
148 testl $(VM_MASK | 3), %eax
149 jz resume_kernel
150ENTRY(resume_userspace)
151 cli # make sure we don't miss an interrupt
152 # setting need_resched or sigpending
153 # between sampling and the iret
154 movl TI_flags(%ebp), %ecx
155 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
156 # int/exception return?
157 jne work_pending
158 jmp restore_all
159
160#ifdef CONFIG_PREEMPT
161ENTRY(resume_kernel)
162 cli
163 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
164 jnz restore_nocheck
165need_resched:
166 movl TI_flags(%ebp), %ecx # need_resched set ?
167 testb $_TIF_NEED_RESCHED, %cl
168 jz restore_all
169 testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
170 jz restore_all
171 call preempt_schedule_irq
172 jmp need_resched
173#endif
174
175/* SYSENTER_RETURN points to after the "sysenter" instruction in
176 the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
177
178 # sysenter call handler stub
179ENTRY(sysenter_entry)
180 movl TSS_sysenter_esp0(%esp),%esp
181sysenter_past_esp:
182 sti
183 pushl $(__USER_DS)
184 pushl %ebp
185 pushfl
186 pushl $(__USER_CS)
187 pushl $SYSENTER_RETURN
188
189/*
190 * Load the potential sixth argument from user stack.
191 * Careful about security.
192 */
193 cmpl $__PAGE_OFFSET-3,%ebp
194 jae syscall_fault
1951: movl (%ebp),%ebp
196.section __ex_table,"a"
197 .align 4
198 .long 1b,syscall_fault
199.previous
200
201 pushl %eax
202 SAVE_ALL
203 GET_THREAD_INFO(%ebp)
204
205 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
206 testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
207 jnz syscall_trace_entry
208 cmpl $(nr_syscalls), %eax
209 jae syscall_badsys
210 call *sys_call_table(,%eax,4)
211 movl %eax,EAX(%esp)
212 cli
213 movl TI_flags(%ebp), %ecx
214 testw $_TIF_ALLWORK_MASK, %cx
215 jne syscall_exit_work
216/* if something modifies registers it must also disable sysexit */
217 movl EIP(%esp), %edx
218 movl OLDESP(%esp), %ecx
219 xorl %ebp,%ebp
220 sti
221 sysexit
222
223
224 # system call handler stub
225ENTRY(system_call)
226 pushl %eax # save orig_eax
227 SAVE_ALL
228 GET_THREAD_INFO(%ebp)
229 # system call tracing in operation
230 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
231 testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
232 jnz syscall_trace_entry
233 cmpl $(nr_syscalls), %eax
234 jae syscall_badsys
235syscall_call:
236 call *sys_call_table(,%eax,4)
237 movl %eax,EAX(%esp) # store the return value
238syscall_exit:
239 cli # make sure we don't miss an interrupt
240 # setting need_resched or sigpending
241 # between sampling and the iret
242 movl TI_flags(%ebp), %ecx
243 testw $_TIF_ALLWORK_MASK, %cx # current->work
244 jne syscall_exit_work
245
246restore_all:
247 movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
248 movb OLDSS(%esp), %ah
249 movb CS(%esp), %al
250 andl $(VM_MASK | (4 << 8) | 3), %eax
251 cmpl $((4 << 8) | 3), %eax
252 je ldt_ss # returning to user-space with LDT SS
253restore_nocheck:
254 RESTORE_REGS
255 addl $4, %esp
2561: iret
257.section .fixup,"ax"
258iret_exc:
259 sti
260 movl $__USER_DS, %edx
261 movl %edx, %ds
262 movl %edx, %es
263 movl $11,%eax
264 call do_exit
265.previous
266.section __ex_table,"a"
267 .align 4
268 .long 1b,iret_exc
269.previous
270
271ldt_ss:
272 larl OLDSS(%esp), %eax
273 jnz restore_nocheck
274 testl $0x00400000, %eax # returning to 32bit stack?
275	jnz restore_nocheck		# all right, normal return
276 /* If returning to userspace with 16bit stack,
277 * try to fix the higher word of ESP, as the CPU
278 * won't restore it.
279 * This is an "official" bug of all the x86-compatible
280 * CPUs, which we can try to work around to make
281 * dosemu and wine happy. */
282 subl $8, %esp # reserve space for switch16 pointer
283 cli
284 movl %esp, %eax
285 /* Set up the 16bit stack frame with switch32 pointer on top,
286 * and a switch16 pointer on top of the current frame. */
287 call setup_x86_bogus_stack
288 RESTORE_REGS
289 lss 20+4(%esp), %esp # switch to 16bit stack
2901: iret
291.section __ex_table,"a"
292 .align 4
293 .long 1b,iret_exc
294.previous
295
296 # perform work that needs to be done immediately before resumption
297 ALIGN
298work_pending:
299 testb $_TIF_NEED_RESCHED, %cl
300 jz work_notifysig
301work_resched:
302 call schedule
303 cli # make sure we don't miss an interrupt
304 # setting need_resched or sigpending
305 # between sampling and the iret
306 movl TI_flags(%ebp), %ecx
307 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
308 # than syscall tracing?
309 jz restore_all
310 testb $_TIF_NEED_RESCHED, %cl
311 jnz work_resched
312
313work_notifysig: # deal with pending signals and
314 # notify-resume requests
315 testl $VM_MASK, EFLAGS(%esp)
316 movl %esp, %eax
317 jne work_notifysig_v86 # returning to kernel-space or
318 # vm86-space
319 xorl %edx, %edx
320 call do_notify_resume
321 jmp restore_all
322
323 ALIGN
324work_notifysig_v86:
325 pushl %ecx # save ti_flags for do_notify_resume
326 call save_v86_state # %eax contains pt_regs pointer
327 popl %ecx
328 movl %eax, %esp
329 xorl %edx, %edx
330 call do_notify_resume
331 jmp restore_all
332
333 # perform syscall exit tracing
334 ALIGN
335syscall_trace_entry:
336 movl $-ENOSYS,EAX(%esp)
337 movl %esp, %eax
338 xorl %edx,%edx
339 call do_syscall_trace
340 movl ORIG_EAX(%esp), %eax
341 cmpl $(nr_syscalls), %eax
342 jnae syscall_call
343 jmp syscall_exit
344
345 # perform syscall exit tracing
346 ALIGN
347syscall_exit_work:
348 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
349 jz work_pending
350 sti # could let do_syscall_trace() call
351 # schedule() instead
352 movl %esp, %eax
353 movl $1, %edx
354 call do_syscall_trace
355 jmp resume_userspace
356
357 ALIGN
358syscall_fault:
359 pushl %eax # save orig_eax
360 SAVE_ALL
361 GET_THREAD_INFO(%ebp)
362 movl $-EFAULT,EAX(%esp)
363 jmp resume_userspace
364
365 ALIGN
366syscall_badsys:
367 movl $-ENOSYS,EAX(%esp)
368 jmp resume_userspace
369
370#define FIXUP_ESPFIX_STACK \
371 movl %esp, %eax; \
372 /* switch to 32bit stack using the pointer on top of 16bit stack */ \
373 lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
374 /* copy data from 16bit stack to 32bit stack */ \
375 call fixup_x86_bogus_stack; \
376 /* put ESP to the proper location */ \
377 movl %eax, %esp;
378#define UNWIND_ESPFIX_STACK \
379 pushl %eax; \
380 movl %ss, %eax; \
381 /* see if on 16bit stack */ \
382 cmpw $__ESPFIX_SS, %ax; \
383 jne 28f; \
384 movl $__KERNEL_DS, %edx; \
385 movl %edx, %ds; \
386 movl %edx, %es; \
387 /* switch to 32bit stack */ \
388 FIXUP_ESPFIX_STACK \
38928: popl %eax;
390
391/*
392 * Build the entry stubs and pointer table with
393 * some assembler magic.
394 */
395.data
396ENTRY(interrupt)
397.text
398
399vector=0
400ENTRY(irq_entries_start)
401.rept NR_IRQS
402 ALIGN
4031: pushl $vector-256
404 jmp common_interrupt
405.data
406 .long 1b
407.text
408vector=vector+1
409.endr
410
411 ALIGN
412common_interrupt:
413 SAVE_ALL
414 movl %esp,%eax
415 call do_IRQ
416 jmp ret_from_intr
417
418#define BUILD_INTERRUPT(name, nr) \
419ENTRY(name) \
420 pushl $nr-256; \
421 SAVE_ALL \
422 movl %esp,%eax; \
423 call smp_/**/name; \
424 jmp ret_from_intr;
425
426/* The include is where all of the SMP etc. interrupts come from */
427#include "entry_arch.h"
428
429ENTRY(divide_error)
430 pushl $0 # no error code
431 pushl $do_divide_error
432 ALIGN
433error_code:
434 pushl %ds
435 pushl %eax
436 xorl %eax, %eax
437 pushl %ebp
438 pushl %edi
439 pushl %esi
440 pushl %edx
441 decl %eax # eax = -1
442 pushl %ecx
443 pushl %ebx
444 cld
445 pushl %es
446 UNWIND_ESPFIX_STACK
447 popl %ecx
448 movl ES(%esp), %edi # get the function address
449 movl ORIG_EAX(%esp), %edx # get the error code
450 movl %eax, ORIG_EAX(%esp)
451 movl %ecx, ES(%esp)
452 movl $(__USER_DS), %ecx
453 movl %ecx, %ds
454 movl %ecx, %es
455 movl %esp,%eax # pt_regs pointer
456 call *%edi
457 jmp ret_from_exception
458
459ENTRY(coprocessor_error)
460 pushl $0
461 pushl $do_coprocessor_error
462 jmp error_code
463
464ENTRY(simd_coprocessor_error)
465 pushl $0
466 pushl $do_simd_coprocessor_error
467 jmp error_code
468
469ENTRY(device_not_available)
470 pushl $-1 # mark this as an int
471 SAVE_ALL
472 movl %cr0, %eax
473 testl $0x4, %eax # EM (math emulation bit)
474 jne device_not_available_emulate
475 preempt_stop
476 call math_state_restore
477 jmp ret_from_exception
478device_not_available_emulate:
479 pushl $0 # temporary storage for ORIG_EIP
480 call math_emulate
481 addl $4, %esp
482 jmp ret_from_exception
483
484/*
485 * Debug traps and NMI can happen at the one SYSENTER instruction
486 * that sets up the real kernel stack. Check here, since we can't
487 * allow the wrong stack to be used.
488 *
489 * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
490 * already pushed 3 words if it hits on the sysenter instruction:
491 * eflags, cs and eip.
492 *
493 * We just load the right stack, and push the three (known) values
494 * by hand onto the new stack - while updating the return eip past
495 * the instruction that would have done it for sysenter.
496 */
497#define FIX_STACK(offset, ok, label) \
498 cmpw $__KERNEL_CS,4(%esp); \
499 jne ok; \
500label: \
501 movl TSS_sysenter_esp0+offset(%esp),%esp; \
502 pushfl; \
503 pushl $__KERNEL_CS; \
504 pushl $sysenter_past_esp
505
506ENTRY(debug)
507 cmpl $sysenter_entry,(%esp)
508 jne debug_stack_correct
509 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
510debug_stack_correct:
511 pushl $-1 # mark this as an int
512 SAVE_ALL
513 xorl %edx,%edx # error code 0
514 movl %esp,%eax # pt_regs pointer
515 call do_debug
516 testl %eax,%eax
517 jnz restore_all
518 jmp ret_from_exception
519
520/*
521 * NMI is doubly nasty. It can happen _while_ we're handling
522 * a debug fault, and the debug fault hasn't yet been able to
523 * clear up the stack. So we first check whether we got an
524 * NMI on the sysenter entry path, but after that we need to
525 * check whether we got an NMI on the debug path where the debug
526 * fault happened on the sysenter path.
527 */
528ENTRY(nmi)
529 pushl %eax
530 movl %ss, %eax
531 cmpw $__ESPFIX_SS, %ax
532 popl %eax
533 je nmi_16bit_stack
534 cmpl $sysenter_entry,(%esp)
535 je nmi_stack_fixup
536 pushl %eax
537 movl %esp,%eax
538 /* Do not access memory above the end of our stack page,
539 * it might not exist.
540 */
541 andl $(THREAD_SIZE-1),%eax
542 cmpl $(THREAD_SIZE-20),%eax
543 popl %eax
544 jae nmi_stack_correct
545 cmpl $sysenter_entry,12(%esp)
546 je nmi_debug_stack_check
547nmi_stack_correct:
548 pushl %eax
549 SAVE_ALL
550 xorl %edx,%edx # zero error code
551 movl %esp,%eax # pt_regs pointer
552 call do_nmi
553 jmp restore_all
554
555nmi_stack_fixup:
556 FIX_STACK(12,nmi_stack_correct, 1)
557 jmp nmi_stack_correct
558nmi_debug_stack_check:
559 cmpw $__KERNEL_CS,16(%esp)
560 jne nmi_stack_correct
561 cmpl $debug - 1,(%esp)
562 jle nmi_stack_correct
563 cmpl $debug_esp_fix_insn,(%esp)
564 jle nmi_debug_stack_fixup
565nmi_debug_stack_fixup:
566 FIX_STACK(24,nmi_stack_correct, 1)
567 jmp nmi_stack_correct
568
569nmi_16bit_stack:
570 /* create the pointer to lss back */
571 pushl %ss
572 pushl %esp
573 movzwl %sp, %esp
574 addw $4, (%esp)
575 /* copy the iret frame of 12 bytes */
576 .rept 3
577 pushl 16(%esp)
578 .endr
579 pushl %eax
580 SAVE_ALL
581 FIXUP_ESPFIX_STACK # %eax == %esp
582 xorl %edx,%edx # zero error code
583 call do_nmi
584 RESTORE_REGS
585 lss 12+4(%esp), %esp # back to 16bit stack
5861: iret
587.section __ex_table,"a"
588 .align 4
589 .long 1b,iret_exc
590.previous
591
592ENTRY(int3)
593 pushl $-1 # mark this as an int
594 SAVE_ALL
595 xorl %edx,%edx # zero error code
596 movl %esp,%eax # pt_regs pointer
597 call do_int3
598 testl %eax,%eax
599 jnz restore_all
600 jmp ret_from_exception
601
602ENTRY(overflow)
603 pushl $0
604 pushl $do_overflow
605 jmp error_code
606
607ENTRY(bounds)
608 pushl $0
609 pushl $do_bounds
610 jmp error_code
611
612ENTRY(invalid_op)
613 pushl $0
614 pushl $do_invalid_op
615 jmp error_code
616
617ENTRY(coprocessor_segment_overrun)
618 pushl $0
619 pushl $do_coprocessor_segment_overrun
620 jmp error_code
621
622ENTRY(invalid_TSS)
623 pushl $do_invalid_TSS
624 jmp error_code
625
626ENTRY(segment_not_present)
627 pushl $do_segment_not_present
628 jmp error_code
629
630ENTRY(stack_segment)
631 pushl $do_stack_segment
632 jmp error_code
633
634ENTRY(general_protection)
635 pushl $do_general_protection
636 jmp error_code
637
638ENTRY(alignment_check)
639 pushl $do_alignment_check
640 jmp error_code
641
642ENTRY(page_fault)
643 pushl $do_page_fault
644 jmp error_code
645
646#ifdef CONFIG_X86_MCE
647ENTRY(machine_check)
648 pushl $0
649 pushl machine_check_vector
650 jmp error_code
651#endif
652
653ENTRY(spurious_interrupt_bug)
654 pushl $0
655 pushl $do_spurious_interrupt_bug
656 jmp error_code
657
658.data
659ENTRY(sys_call_table)
660 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
661 .long sys_exit
662 .long sys_fork
663 .long sys_read
664 .long sys_write
665 .long sys_open /* 5 */
666 .long sys_close
667 .long sys_waitpid
668 .long sys_creat
669 .long sys_link
670 .long sys_unlink /* 10 */
671 .long sys_execve
672 .long sys_chdir
673 .long sys_time
674 .long sys_mknod
675 .long sys_chmod /* 15 */
676 .long sys_lchown16
677 .long sys_ni_syscall /* old break syscall holder */
678 .long sys_stat
679 .long sys_lseek
680 .long sys_getpid /* 20 */
681 .long sys_mount
682 .long sys_oldumount
683 .long sys_setuid16
684 .long sys_getuid16
685 .long sys_stime /* 25 */
686 .long sys_ptrace
687 .long sys_alarm
688 .long sys_fstat
689 .long sys_pause
690 .long sys_utime /* 30 */
691 .long sys_ni_syscall /* old stty syscall holder */
692 .long sys_ni_syscall /* old gtty syscall holder */
693 .long sys_access
694 .long sys_nice
695 .long sys_ni_syscall /* 35 - old ftime syscall holder */
696 .long sys_sync
697 .long sys_kill
698 .long sys_rename
699 .long sys_mkdir
700 .long sys_rmdir /* 40 */
701 .long sys_dup
702 .long sys_pipe
703 .long sys_times
704 .long sys_ni_syscall /* old prof syscall holder */
705 .long sys_brk /* 45 */
706 .long sys_setgid16
707 .long sys_getgid16
708 .long sys_signal
709 .long sys_geteuid16
710 .long sys_getegid16 /* 50 */
711 .long sys_acct
712 .long sys_umount /* recycled never used phys() */
713 .long sys_ni_syscall /* old lock syscall holder */
714 .long sys_ioctl
715 .long sys_fcntl /* 55 */
716 .long sys_ni_syscall /* old mpx syscall holder */
717 .long sys_setpgid
718 .long sys_ni_syscall /* old ulimit syscall holder */
719 .long sys_olduname
720 .long sys_umask /* 60 */
721 .long sys_chroot
722 .long sys_ustat
723 .long sys_dup2
724 .long sys_getppid
725 .long sys_getpgrp /* 65 */
726 .long sys_setsid
727 .long sys_sigaction
728 .long sys_sgetmask
729 .long sys_ssetmask
730 .long sys_setreuid16 /* 70 */
731 .long sys_setregid16
732 .long sys_sigsuspend
733 .long sys_sigpending
734 .long sys_sethostname
735 .long sys_setrlimit /* 75 */
736 .long sys_old_getrlimit
737 .long sys_getrusage
738 .long sys_gettimeofday
739 .long sys_settimeofday
740 .long sys_getgroups16 /* 80 */
741 .long sys_setgroups16
742 .long old_select
743 .long sys_symlink
744 .long sys_lstat
745 .long sys_readlink /* 85 */
746 .long sys_uselib
747 .long sys_swapon
748 .long sys_reboot
749 .long old_readdir
750 .long old_mmap /* 90 */
751 .long sys_munmap
752 .long sys_truncate
753 .long sys_ftruncate
754 .long sys_fchmod
755 .long sys_fchown16 /* 95 */
756 .long sys_getpriority
757 .long sys_setpriority
758 .long sys_ni_syscall /* old profil syscall holder */
759 .long sys_statfs
760 .long sys_fstatfs /* 100 */
761 .long sys_ioperm
762 .long sys_socketcall
763 .long sys_syslog
764 .long sys_setitimer
765 .long sys_getitimer /* 105 */
766 .long sys_newstat
767 .long sys_newlstat
768 .long sys_newfstat
769 .long sys_uname
770 .long sys_iopl /* 110 */
771 .long sys_vhangup
772 .long sys_ni_syscall /* old "idle" system call */
773 .long sys_vm86old
774 .long sys_wait4
775 .long sys_swapoff /* 115 */
776 .long sys_sysinfo
777 .long sys_ipc
778 .long sys_fsync
779 .long sys_sigreturn
780 .long sys_clone /* 120 */
781 .long sys_setdomainname
782 .long sys_newuname
783 .long sys_modify_ldt
784 .long sys_adjtimex
785 .long sys_mprotect /* 125 */
786 .long sys_sigprocmask
787 .long sys_ni_syscall /* old "create_module" */
788 .long sys_init_module
789 .long sys_delete_module
790 .long sys_ni_syscall /* 130: old "get_kernel_syms" */
791 .long sys_quotactl
792 .long sys_getpgid
793 .long sys_fchdir
794 .long sys_bdflush
795 .long sys_sysfs /* 135 */
796 .long sys_personality
797 .long sys_ni_syscall /* reserved for afs_syscall */
798 .long sys_setfsuid16
799 .long sys_setfsgid16
800 .long sys_llseek /* 140 */
801 .long sys_getdents
802 .long sys_select
803 .long sys_flock
804 .long sys_msync
805 .long sys_readv /* 145 */
806 .long sys_writev
807 .long sys_getsid
808 .long sys_fdatasync
809 .long sys_sysctl
810 .long sys_mlock /* 150 */
811 .long sys_munlock
812 .long sys_mlockall
813 .long sys_munlockall
814 .long sys_sched_setparam
815 .long sys_sched_getparam /* 155 */
816 .long sys_sched_setscheduler
817 .long sys_sched_getscheduler
818 .long sys_sched_yield
819 .long sys_sched_get_priority_max
820 .long sys_sched_get_priority_min /* 160 */
821 .long sys_sched_rr_get_interval
822 .long sys_nanosleep
823 .long sys_mremap
824 .long sys_setresuid16
825 .long sys_getresuid16 /* 165 */
826 .long sys_vm86
827 .long sys_ni_syscall /* Old sys_query_module */
828 .long sys_poll
829 .long sys_nfsservctl
830 .long sys_setresgid16 /* 170 */
831 .long sys_getresgid16
832 .long sys_prctl
833 .long sys_rt_sigreturn
834 .long sys_rt_sigaction
835 .long sys_rt_sigprocmask /* 175 */
836 .long sys_rt_sigpending
837 .long sys_rt_sigtimedwait
838 .long sys_rt_sigqueueinfo
839 .long sys_rt_sigsuspend
840 .long sys_pread64 /* 180 */
841 .long sys_pwrite64
842 .long sys_chown16
843 .long sys_getcwd
844 .long sys_capget
845 .long sys_capset /* 185 */
846 .long sys_sigaltstack
847 .long sys_sendfile
848 .long sys_ni_syscall /* reserved for streams1 */
849 .long sys_ni_syscall /* reserved for streams2 */
850 .long sys_vfork /* 190 */
851 .long sys_getrlimit
852 .long sys_mmap2
853 .long sys_truncate64
854 .long sys_ftruncate64
855 .long sys_stat64 /* 195 */
856 .long sys_lstat64
857 .long sys_fstat64
858 .long sys_lchown
859 .long sys_getuid
860 .long sys_getgid /* 200 */
861 .long sys_geteuid
862 .long sys_getegid
863 .long sys_setreuid
864 .long sys_setregid
865 .long sys_getgroups /* 205 */
866 .long sys_setgroups
867 .long sys_fchown
868 .long sys_setresuid
869 .long sys_getresuid
870 .long sys_setresgid /* 210 */
871 .long sys_getresgid
872 .long sys_chown
873 .long sys_setuid
874 .long sys_setgid
875 .long sys_setfsuid /* 215 */
876 .long sys_setfsgid
877 .long sys_pivot_root
878 .long sys_mincore
879 .long sys_madvise
880 .long sys_getdents64 /* 220 */
881 .long sys_fcntl64
882 .long sys_ni_syscall /* reserved for TUX */
883 .long sys_ni_syscall
884 .long sys_gettid
885 .long sys_readahead /* 225 */
886 .long sys_setxattr
887 .long sys_lsetxattr
888 .long sys_fsetxattr
889 .long sys_getxattr
890 .long sys_lgetxattr /* 230 */
891 .long sys_fgetxattr
892 .long sys_listxattr
893 .long sys_llistxattr
894 .long sys_flistxattr
895 .long sys_removexattr /* 235 */
896 .long sys_lremovexattr
897 .long sys_fremovexattr
898 .long sys_tkill
899 .long sys_sendfile64
900 .long sys_futex /* 240 */
901 .long sys_sched_setaffinity
902 .long sys_sched_getaffinity
903 .long sys_set_thread_area
904 .long sys_get_thread_area
905 .long sys_io_setup /* 245 */
906 .long sys_io_destroy
907 .long sys_io_getevents
908 .long sys_io_submit
909 .long sys_io_cancel
910 .long sys_fadvise64 /* 250 */
911 .long sys_ni_syscall
912 .long sys_exit_group
913 .long sys_lookup_dcookie
914 .long sys_epoll_create
915 .long sys_epoll_ctl /* 255 */
916 .long sys_epoll_wait
917 .long sys_remap_file_pages
918 .long sys_set_tid_address
919 .long sys_timer_create
920 .long sys_timer_settime /* 260 */
921 .long sys_timer_gettime
922 .long sys_timer_getoverrun
923 .long sys_timer_delete
924 .long sys_clock_settime
925 .long sys_clock_gettime /* 265 */
926 .long sys_clock_getres
927 .long sys_clock_nanosleep
928 .long sys_statfs64
929 .long sys_fstatfs64
930 .long sys_tgkill /* 270 */
931 .long sys_utimes
932 .long sys_fadvise64_64
933 .long sys_ni_syscall /* sys_vserver */
934 .long sys_mbind
935 .long sys_get_mempolicy
936 .long sys_set_mempolicy
937 .long sys_mq_open
938 .long sys_mq_unlink
939 .long sys_mq_timedsend
940 .long sys_mq_timedreceive /* 280 */
941 .long sys_mq_notify
942 .long sys_mq_getsetattr
943 .long sys_ni_syscall /* reserved for kexec */
944 .long sys_waitid
945 .long sys_ni_syscall /* 285 */ /* available */
946 .long sys_add_key
947 .long sys_request_key
948 .long sys_keyctl
949
950syscall_table_size=(.-sys_call_table)
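A C-level sketch (not part of the patch) of the dispatch performed at syscall_call/syscall_badsys above; the real path is pure assembly, dispatch_syscall is an invented name, and nr_syscalls is really an assembler constant, not a C variable:

#include <asm/errno.h>
#include <asm/ptrace.h>

typedef long (*syscall_fn)(long, long, long, long, long, long);
extern syscall_fn sys_call_table[];
extern unsigned long nr_syscalls;

static void dispatch_syscall(struct pt_regs *regs)
{
	if ((unsigned long)regs->orig_eax >= nr_syscalls) {
		regs->eax = -ENOSYS;			/* syscall_badsys */
		return;
	}
	/* arguments arrive in ebx, ecx, edx, esi, edi, ebp */
	regs->eax = sys_call_table[regs->orig_eax](regs->ebx, regs->ecx,
			regs->edx, regs->esi, regs->edi, regs->ebp);
}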
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
new file mode 100644
index 000000000000..d273fd746192
--- /dev/null
+++ b/arch/i386/kernel/head.S
@@ -0,0 +1,521 @@
1/*
2 * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Enhanced CPU detection and feature setting code by Mike Jagdis
7 * and Martin Mares, November 1997.
8 */
9
10.text
11#include <linux/config.h>
12#include <linux/threads.h>
13#include <linux/linkage.h>
14#include <asm/segment.h>
15#include <asm/page.h>
16#include <asm/pgtable.h>
17#include <asm/desc.h>
18#include <asm/cache.h>
19#include <asm/thread_info.h>
20#include <asm/asm_offsets.h>
21#include <asm/setup.h>
22
23/*
24 * References to members of the new_cpu_data structure.
25 */
26
27#define X86 new_cpu_data+CPUINFO_x86
28#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
29#define X86_MODEL new_cpu_data+CPUINFO_x86_model
30#define X86_MASK new_cpu_data+CPUINFO_x86_mask
31#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
32#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
33#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
34#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
35
36/*
37 * This is how much memory *in addition to the memory covered up to
38 * and including _end* we need mapped initially. We need one bit for
39 * each possible page, but only in low memory, which means
40 * 2^32/4096/8 = 128K worst case (4G/4G split.)
41 *
42 * Modulo rounding, each megabyte assigned here requires a kilobyte of
43 * memory, which is currently unreclaimed.
44 *
45 * This should be a multiple of a page.
46 */
47#define INIT_MAP_BEYOND_END (128*1024)
48
49
50/*
51 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
52 * %esi points to the real-mode code as a 32-bit pointer.
53 * CS and DS must be 4 GB flat segments, but we don't depend on
54 * any particular GDT layout, because we load our own as soon as we
55 * can.
56 */
57ENTRY(startup_32)
58
59/*
60 * Set segments to known values.
61 */
62 cld
63 lgdt boot_gdt_descr - __PAGE_OFFSET
64 movl $(__BOOT_DS),%eax
65 movl %eax,%ds
66 movl %eax,%es
67 movl %eax,%fs
68 movl %eax,%gs
69
70/*
71 * Clear BSS first so that there are no surprises...
72 * No need to cld as DF is already clear from cld above...
73 */
74 xorl %eax,%eax
75 movl $__bss_start - __PAGE_OFFSET,%edi
76 movl $__bss_stop - __PAGE_OFFSET,%ecx
77 subl %edi,%ecx
78 shrl $2,%ecx
79 rep ; stosl
80
81/*
82 * Initialize page tables. This creates a PDE and a set of page
83 * tables, which are located immediately beyond _end. The variable
84 * init_pg_tables_end is set up to point to the first "safe" location.
85 * Mappings are created both at virtual address 0 (identity mapping)
86 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
87 *
88 * Warning: don't use %esi or the stack in this code. However, %esp
89 * can be used as a GPR if you really need it...
90 */
91page_pde_offset = (__PAGE_OFFSET >> 20);
92
93 movl $(pg0 - __PAGE_OFFSET), %edi
94 movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
95 movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
9610:
97 leal 0x007(%edi),%ecx /* Create PDE entry */
98 movl %ecx,(%edx) /* Store identity PDE entry */
99 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
100 addl $4,%edx
101 movl $1024, %ecx
10211:
103 stosl
104 addl $0x1000,%eax
105 loop 11b
106 /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
107 /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
108 leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
109 cmpl %ebp,%eax
110 jb 10b
111 movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
112
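A rough C rendering (not part of the patch) of the loop above, under the simplifying assumption that all symbols can be treated as plain unsigned long pointers: it builds one PDE per page table, stores it in both the identity slot and the PAGE_OFFSET slot, and stops once INIT_MAP_BEYOND_END bytes past the tables themselves are mapped.

	unsigned long *pde = (unsigned long *)((unsigned long)swapper_pg_dir - __PAGE_OFFSET);
	unsigned long *pte = (unsigned long *)((unsigned long)pg0 - __PAGE_OFFSET);
	unsigned long entry = 0x007;		/* phys 0 | PRESENT+RW+USER */
	int i;

	do {
		/* identity slot and kernel (PAGE_OFFSET) slot share one page table */
		pde[0] = pde[__PAGE_OFFSET >> 22] = (unsigned long)pte | 0x007;
		pde++;
		for (i = 0; i < 1024; i++) {	/* fill one page table */
			*pte++ = entry;
			entry += 0x1000;
		}
	} while (entry < (unsigned long)pte + INIT_MAP_BEYOND_END + 0x007);
	init_pg_tables_end = (unsigned long)pte;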
113#ifdef CONFIG_SMP
114 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
115 jmp 3f
116
117/*
118 * Non-boot CPU entry point; entered from trampoline.S
119 * We can't lgdt here, because lgdt itself uses a data segment, but
120 * we know the trampoline has already loaded the boot_gdt_table GDT
121 * for us.
122 */
123ENTRY(startup_32_smp)
124 cld
125 movl $(__BOOT_DS),%eax
126 movl %eax,%ds
127 movl %eax,%es
128 movl %eax,%fs
129 movl %eax,%gs
130
131/*
132 * New page tables may be in 4Mbyte page mode and may
133 * be using the global pages.
134 *
135 * NOTE! If we are on a 486 we may have no cr4 at all!
136 * So we do not try to touch it unless we really have
137 * some bits in it to set. This won't work if the BSP
138 * implements cr4 but this AP does not -- very unlikely
139 * but be warned! The same applies to the pse feature
140 * if not equally supported. --macro
141 *
142 * NOTE! We have to correct for the fact that we're
143 * not yet offset PAGE_OFFSET..
144 */
145#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
146 movl cr4_bits,%edx
147 andl %edx,%edx
148 jz 6f
149 movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
150 orl %edx,%eax
151 movl %eax,%cr4
152
153 btl $5, %eax # check if PAE is enabled
154 jnc 6f
155
156 /* Check if extended functions are implemented */
157 movl $0x80000000, %eax
158 cpuid
159 cmpl $0x80000000, %eax
160 jbe 6f
161 mov $0x80000001, %eax
162 cpuid
163 /* Execute Disable bit supported? */
164 btl $20, %edx
165 jnc 6f
166
167 /* Setup EFER (Extended Feature Enable Register) */
168 movl $0xc0000080, %ecx
169 rdmsr
170
171 btsl $11, %eax
172 /* Make changes effective */
173 wrmsr
174
1756:
176 /* This is a secondary processor (AP) */
177 xorl %ebx,%ebx
178 incl %ebx
179
1803:
181#endif /* CONFIG_SMP */
182
183/*
184 * Enable paging
185 */
186 movl $swapper_pg_dir-__PAGE_OFFSET,%eax
187 movl %eax,%cr3 /* set the page table pointer.. */
188 movl %cr0,%eax
189 orl $0x80000000,%eax
190 movl %eax,%cr0 /* ..and set paging (PG) bit */
191 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1921:
193 /* Set up the stack pointer */
194 lss stack_start,%esp
195
196/*
197 * Initialize eflags. Some BIOS's leave bits like NT set. This would
198 * confuse the debugger if this code is traced.
199 * XXX - best to initialize before switching to protected mode.
200 */
201 pushl $0
202 popfl
203
204#ifdef CONFIG_SMP
205 andl %ebx,%ebx
206 jz 1f /* Initial CPU cleans BSS */
207 jmp checkCPUtype
2081:
209#endif /* CONFIG_SMP */
210
211/*
212 * start system 32-bit setup. We need to re-do some of the things done
213 * in 16-bit mode for the "real" operations.
214 */
215 call setup_idt
216
217/*
218 * Copy bootup parameters out of the way.
219 * Note: %esi still has the pointer to the real-mode data.
220 */
221 movl $boot_params,%edi
222 movl $(PARAM_SIZE/4),%ecx
223 cld
224 rep
225 movsl
226 movl boot_params+NEW_CL_POINTER,%esi
227 andl %esi,%esi
228 jnz 2f # New command line protocol
229 cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
230 jne 1f
231 movzwl OLD_CL_OFFSET,%esi
232 addl $(OLD_CL_BASE_ADDR),%esi
2332:
234 movl $saved_command_line,%edi
235 movl $(COMMAND_LINE_SIZE/4),%ecx
236 rep
237 movsl
2381:
239checkCPUtype:
240
241 movl $-1,X86_CPUID # -1 for no CPUID initially
242
243/* check if it is 486 or 386. */
244/*
245 * XXX - this does a lot of unnecessary setup. Alignment checks don't
246 * apply at our cpl of 0 and the stack ought to be aligned already, and
247 * we don't need to preserve eflags.
248 */
249
250 movb $3,X86 # at least 386
251 pushfl # push EFLAGS
252 popl %eax # get EFLAGS
253 movl %eax,%ecx # save original EFLAGS
254 xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
255 pushl %eax # copy to EFLAGS
256 popfl # set EFLAGS
257 pushfl # get new EFLAGS
258 popl %eax # put it in eax
259 xorl %ecx,%eax # change in flags
260 pushl %ecx # restore original EFLAGS
261 popfl
262 testl $0x40000,%eax # check if AC bit changed
263 je is386
264
265 movb $4,X86 # at least 486
266 testl $0x200000,%eax # check if ID bit changed
267 je is486
268
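The AC/ID flag flipping above distinguishes a 386 (AC cannot be toggled), a 486 without CPUID (ID cannot be toggled) and CPUID-capable parts. A C sketch of the ID-bit probe, equivalent in spirit to the assembly and assuming gcc inline asm (has_cpuid is an invented name):

static int has_cpuid(void)
{
	unsigned long f1, f2;

	asm volatile("pushfl\n\t"		/* save EFLAGS */
		     "pushfl\n\t"
		     "popl %0\n\t"		/* f1 = EFLAGS */
		     "movl %0, %1\n\t"
		     "xorl $0x200000, %0\n\t"	/* toggle ID (bit 21) */
		     "pushl %0\n\t"
		     "popfl\n\t"		/* try to write it back */
		     "pushfl\n\t"
		     "popl %0\n\t"		/* f1 = resulting EFLAGS */
		     "popfl"			/* restore original EFLAGS */
		     : "=&r" (f1), "=&r" (f2));
	return ((f1 ^ f2) & 0x200000) != 0;
}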
269 /* get vendor info */
270 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
271 cpuid
272 movl %eax,X86_CPUID # save CPUID level
273 movl %ebx,X86_VENDOR_ID # lo 4 chars
274 movl %edx,X86_VENDOR_ID+4 # next 4 chars
275 movl %ecx,X86_VENDOR_ID+8 # last 4 chars
276
277 orl %eax,%eax # do we have processor info as well?
278 je is486
279
280 movl $1,%eax # Use the CPUID instruction to get CPU type
281 cpuid
282 movb %al,%cl # save reg for future use
283 andb $0x0f,%ah # mask processor family
284 movb %ah,X86
285 andb $0xf0,%al # mask model
286 shrb $4,%al
287 movb %al,X86_MODEL
288 andb $0x0f,%cl # mask mask revision
289 movb %cl,X86_MASK
290 movl %edx,X86_CAPABILITY
291
292is486: movl $0x50022,%ecx # set AM, WP, NE and MP
293 jmp 2f
294
295is386: movl $2,%ecx # set MP
2962: movl %cr0,%eax
297 andl $0x80000011,%eax # Save PG,PE,ET
298 orl %ecx,%eax
299 movl %eax,%cr0
300
301 call check_x87
302 incb ready
303 lgdt cpu_gdt_descr
304 lidt idt_descr
305 ljmp $(__KERNEL_CS),$1f
3061: movl $(__KERNEL_DS),%eax # reload all the segment registers
307 movl %eax,%ss # after changing gdt.
308
309 movl $(__USER_DS),%eax # DS/ES contains default USER segment
310 movl %eax,%ds
311 movl %eax,%es
312
313 xorl %eax,%eax # Clear FS/GS and LDT
314 movl %eax,%fs
315 movl %eax,%gs
316 lldt %ax
317 cld # gcc2 wants the direction flag cleared at all times
318#ifdef CONFIG_SMP
319 movb ready, %cl
320 cmpb $1,%cl
321 je 1f # the first CPU calls start_kernel
322 # all other CPUs call initialize_secondary
323 call initialize_secondary
324 jmp L6
3251:
326#endif /* CONFIG_SMP */
327 call start_kernel
328L6:
329 jmp L6 # main should never return here, but
330 # just in case, we know what happens.
331
332/*
333 * We depend on ET to be correct. This checks for 287/387.
334 */
335check_x87:
336 movb $0,X86_HARD_MATH
337 clts
338 fninit
339 fstsw %ax
340 cmpb $0,%al
341 je 1f
342 movl %cr0,%eax /* no coprocessor: have to set bits */
343 xorl $4,%eax /* set EM */
344 movl %eax,%cr0
345 ret
346 ALIGN
3471: movb $1,X86_HARD_MATH
348 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
349 ret
350
351/*
352 * setup_idt
353 *
354 * sets up an IDT with 256 interrupt gates, all pointing to
355 * ignore_int. It doesn't actually load the
356 * IDT - that can be done only after paging has been enabled
357 * and the kernel moved to PAGE_OFFSET. Interrupts
358 * are enabled elsewhere, when we can be relatively
359 * sure everything is ok.
360 *
361 * Warning: %esi is live across this function.
362 */
363setup_idt:
364 lea ignore_int,%edx
365 movl $(__KERNEL_CS << 16),%eax
366 movw %dx,%ax /* selector = 0x0010 = cs */
367 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
368
369 lea idt_table,%edi
370 mov $256,%ecx
371rp_sidt:
372 movl %eax,(%edi)
373 movl %edx,4(%edi)
374 addl $8,%edi
375 dec %ecx
376 jne rp_sidt
377 ret
378
379/* This is the default interrupt "handler" :-) */
380 ALIGN
381ignore_int:
382 cld
383 pushl %eax
384 pushl %ecx
385 pushl %edx
386 pushl %es
387 pushl %ds
388 movl $(__KERNEL_DS),%eax
389 movl %eax,%ds
390 movl %eax,%es
391 pushl 16(%esp)
392 pushl 24(%esp)
393 pushl 32(%esp)
394 pushl 40(%esp)
395 pushl $int_msg
396 call printk
397 addl $(5*4),%esp
398 popl %ds
399 popl %es
400 popl %edx
401 popl %ecx
402 popl %eax
403 iret
404
405/*
406 * Real beginning of normal "text" segment
407 */
408ENTRY(stext)
409ENTRY(_stext)
410
411/*
412 * BSS section
413 */
414.section ".bss.page_aligned","w"
415ENTRY(swapper_pg_dir)
416 .fill 1024,4,0
417ENTRY(empty_zero_page)
418 .fill 4096,1,0
419
420/*
421 * This starts the data section.
422 */
423.data
424
425ENTRY(stack_start)
426 .long init_thread_union+THREAD_SIZE
427 .long __BOOT_DS
428
429ready: .byte 0
430
431int_msg:
432 .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
433
434/*
435 * The IDT and GDT 'descriptors' are a strange 48-bit object
436 * only used by the lidt and lgdt instructions. They are not
437 * like usual segment descriptors - they consist of a 16-bit
438 * segment size, and 32-bit linear address value:
439 */
440
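In C terms, the 48-bit lgdt/lidt operand laid out below is simply a packed pair of limit and base; the struct name here is hypothetical (the kernel keeps its own definition in the descriptor headers):

struct desc_ptr_sketch {
	unsigned short size;		/* table size in bytes - 1 */
	unsigned long address;		/* 32-bit linear base address */
} __attribute__((packed));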
441.globl boot_gdt_descr
442.globl idt_descr
443.globl cpu_gdt_descr
444
445 ALIGN
446# early boot GDT descriptor (must use 1:1 address mapping)
447 .word 0 # 32 bit align gdt_desc.address
448boot_gdt_descr:
449 .word __BOOT_DS+7
450 .long boot_gdt_table - __PAGE_OFFSET
451
452 .word 0 # 32-bit align idt_desc.address
453idt_descr:
454 .word IDT_ENTRIES*8-1 # idt contains 256 entries
455 .long idt_table
456
457# boot GDT descriptor (later on used by CPU#0):
458 .word 0 # 32 bit align gdt_desc.address
459cpu_gdt_descr:
460 .word GDT_ENTRIES*8-1
461 .long cpu_gdt_table
462
463 .fill NR_CPUS-1,8,0 # space for the other GDT descriptors
464
465/*
466 * The boot_gdt_table must mirror the equivalent in setup.S and is
467 * used only for booting.
468 */
469 .align L1_CACHE_BYTES
470ENTRY(boot_gdt_table)
471 .fill GDT_ENTRY_BOOT_CS,8,0
472 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
473 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
474
475/*
476 * The Global Descriptor Table contains 32 quadwords, per-CPU.
477 */
478 .align PAGE_SIZE_asm
479ENTRY(cpu_gdt_table)
480 .quad 0x0000000000000000 /* NULL descriptor */
481 .quad 0x0000000000000000 /* 0x0b reserved */
482 .quad 0x0000000000000000 /* 0x13 reserved */
483 .quad 0x0000000000000000 /* 0x1b reserved */
484 .quad 0x0000000000000000 /* 0x20 unused */
485 .quad 0x0000000000000000 /* 0x28 unused */
486 .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
487 .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
488 .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
489 .quad 0x0000000000000000 /* 0x4b reserved */
490 .quad 0x0000000000000000 /* 0x53 reserved */
491 .quad 0x0000000000000000 /* 0x5b reserved */
492
493 .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
494 .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
495 .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
496 .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
497
498 .quad 0x0000000000000000 /* 0x80 TSS descriptor */
499 .quad 0x0000000000000000 /* 0x88 LDT descriptor */
500
501 /* Segments used for calling PnP BIOS */
502 .quad 0x00c09a0000000000 /* 0x90 32-bit code */
503 .quad 0x00809a0000000000 /* 0x98 16-bit code */
504 .quad 0x0080920000000000 /* 0xa0 16-bit data */
505 .quad 0x0080920000000000 /* 0xa8 16-bit data */
506 .quad 0x0080920000000000 /* 0xb0 16-bit data */
507 /*
508 * The APM segments have byte granularity and their bases
509 * and limits are set at run time.
510 */
511 .quad 0x00409a0000000000 /* 0xb8 APM CS code */
512 .quad 0x00009a0000000000 /* 0xc0 APM CS 16 code (16 bit) */
513 .quad 0x0040920000000000 /* 0xc8 APM DS data */
514
515 .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */
516 .quad 0x0000000000000000 /* 0xd8 - unused */
517 .quad 0x0000000000000000 /* 0xe0 - unused */
518 .quad 0x0000000000000000 /* 0xe8 - unused */
519 .quad 0x0000000000000000 /* 0xf0 - unused */
520 .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
521
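A worked example (not part of the patch) of how one of the quadwords above unpacks; decode_gdt_entry is an illustrative helper and the values in the comment are for the kernel 4GB code segment 0x00cf9a000000ffff:

static void decode_gdt_entry(unsigned long long d)
{
	unsigned long base  = ((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24);
	unsigned long limit = (d & 0xffff) | (((d >> 48) & 0xf) << 16);
	unsigned int access = (d >> 40) & 0xff;
	unsigned int flags  = (d >> 52) & 0xf;

	/* For 0x00cf9a000000ffff: base 0x00000000, limit 0xfffff,
	 * access 0x9a (present, DPL 0, code, readable),
	 * flags 0xc (4 KB granularity, 32-bit) => flat 4 GB code segment. */
	(void)base; (void)limit; (void)access; (void)flags;
}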
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
new file mode 100644
index 000000000000..14ec354bec92
--- /dev/null
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -0,0 +1,195 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <linux/smp.h>
4#include <linux/user.h>
5#include <linux/elfcore.h>
6#include <linux/mca.h>
7#include <linux/sched.h>
8#include <linux/in6.h>
9#include <linux/interrupt.h>
10#include <linux/smp_lock.h>
11#include <linux/pm.h>
12#include <linux/pci.h>
13#include <linux/apm_bios.h>
14#include <linux/kernel.h>
15#include <linux/string.h>
16#include <linux/tty.h>
17#include <linux/highmem.h>
18#include <linux/time.h>
19
20#include <asm/semaphore.h>
21#include <asm/processor.h>
22#include <asm/i387.h>
23#include <asm/uaccess.h>
24#include <asm/checksum.h>
25#include <asm/io.h>
26#include <asm/delay.h>
27#include <asm/irq.h>
28#include <asm/mmx.h>
29#include <asm/desc.h>
30#include <asm/pgtable.h>
31#include <asm/tlbflush.h>
32#include <asm/nmi.h>
33#include <asm/ist.h>
34#include <asm/kdebug.h>
35
36extern void dump_thread(struct pt_regs *, struct user *);
37extern spinlock_t rtc_lock;
38
39/* This is definitely a GPL-only symbol */
40EXPORT_SYMBOL_GPL(cpu_gdt_table);
41
42#if defined(CONFIG_APM_MODULE)
43extern void machine_real_restart(unsigned char *, int);
44EXPORT_SYMBOL(machine_real_restart);
45extern void default_idle(void);
46EXPORT_SYMBOL(default_idle);
47#endif
48
49#ifdef CONFIG_SMP
50extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
51extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
52#endif
53
54#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
55extern struct drive_info_struct drive_info;
56EXPORT_SYMBOL(drive_info);
57#endif
58
59extern unsigned long cpu_khz;
60extern unsigned long get_cmos_time(void);
61
62/* platform dependent support */
63EXPORT_SYMBOL(boot_cpu_data);
64#ifdef CONFIG_DISCONTIGMEM
65EXPORT_SYMBOL(node_data);
66EXPORT_SYMBOL(physnode_map);
67#endif
68#ifdef CONFIG_X86_NUMAQ
69EXPORT_SYMBOL(xquad_portio);
70#endif
71EXPORT_SYMBOL(dump_thread);
72EXPORT_SYMBOL(dump_fpu);
73EXPORT_SYMBOL_GPL(kernel_fpu_begin);
74EXPORT_SYMBOL(__ioremap);
75EXPORT_SYMBOL(ioremap_nocache);
76EXPORT_SYMBOL(iounmap);
77EXPORT_SYMBOL(kernel_thread);
78EXPORT_SYMBOL(pm_idle);
79EXPORT_SYMBOL(pm_power_off);
80EXPORT_SYMBOL(get_cmos_time);
81EXPORT_SYMBOL(cpu_khz);
82EXPORT_SYMBOL(apm_info);
83
84EXPORT_SYMBOL(__down_failed);
85EXPORT_SYMBOL(__down_failed_interruptible);
86EXPORT_SYMBOL(__down_failed_trylock);
87EXPORT_SYMBOL(__up_wakeup);
88/* Networking helper routines. */
89EXPORT_SYMBOL(csum_partial_copy_generic);
90/* Delay loops */
91EXPORT_SYMBOL(__ndelay);
92EXPORT_SYMBOL(__udelay);
93EXPORT_SYMBOL(__delay);
94EXPORT_SYMBOL(__const_udelay);
95
96EXPORT_SYMBOL(__get_user_1);
97EXPORT_SYMBOL(__get_user_2);
98EXPORT_SYMBOL(__get_user_4);
99
100EXPORT_SYMBOL(__put_user_1);
101EXPORT_SYMBOL(__put_user_2);
102EXPORT_SYMBOL(__put_user_4);
103EXPORT_SYMBOL(__put_user_8);
104
105EXPORT_SYMBOL(strpbrk);
106EXPORT_SYMBOL(strstr);
107
108EXPORT_SYMBOL(strncpy_from_user);
109EXPORT_SYMBOL(__strncpy_from_user);
110EXPORT_SYMBOL(clear_user);
111EXPORT_SYMBOL(__clear_user);
112EXPORT_SYMBOL(__copy_from_user_ll);
113EXPORT_SYMBOL(__copy_to_user_ll);
114EXPORT_SYMBOL(strnlen_user);
115
116EXPORT_SYMBOL(dma_alloc_coherent);
117EXPORT_SYMBOL(dma_free_coherent);
118
119#ifdef CONFIG_PCI
120EXPORT_SYMBOL(pci_mem_start);
121#endif
122
123#ifdef CONFIG_PCI_BIOS
124EXPORT_SYMBOL(pcibios_set_irq_routing);
125EXPORT_SYMBOL(pcibios_get_irq_routing_table);
126#endif
127
128#ifdef CONFIG_X86_USE_3DNOW
129EXPORT_SYMBOL(_mmx_memcpy);
130EXPORT_SYMBOL(mmx_clear_page);
131EXPORT_SYMBOL(mmx_copy_page);
132#endif
133
134#ifdef CONFIG_X86_HT
135EXPORT_SYMBOL(smp_num_siblings);
136EXPORT_SYMBOL(cpu_sibling_map);
137#endif
138
139#ifdef CONFIG_SMP
140EXPORT_SYMBOL(cpu_data);
141EXPORT_SYMBOL(cpu_online_map);
142EXPORT_SYMBOL(cpu_callout_map);
143EXPORT_SYMBOL(__write_lock_failed);
144EXPORT_SYMBOL(__read_lock_failed);
145
146/* Global SMP stuff */
147EXPORT_SYMBOL(smp_call_function);
148
149/* TLB flushing */
150EXPORT_SYMBOL(flush_tlb_page);
151#endif
152
153#ifdef CONFIG_X86_IO_APIC
154EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
155#endif
156
157#ifdef CONFIG_MCA
158EXPORT_SYMBOL(machine_id);
159#endif
160
161#ifdef CONFIG_VT
162EXPORT_SYMBOL(screen_info);
163#endif
164
165EXPORT_SYMBOL(get_wchan);
166
167EXPORT_SYMBOL(rtc_lock);
168
169EXPORT_SYMBOL_GPL(set_nmi_callback);
170EXPORT_SYMBOL_GPL(unset_nmi_callback);
171
172#undef memcmp
173extern int memcmp(const void *,const void *,__kernel_size_t);
174EXPORT_SYMBOL(memcmp);
175
176EXPORT_SYMBOL(register_die_notifier);
177#ifdef CONFIG_HAVE_DEC_LOCK
178EXPORT_SYMBOL(_atomic_dec_and_lock);
179#endif
180
181EXPORT_SYMBOL(__PAGE_KERNEL);
182
183#ifdef CONFIG_HIGHMEM
184EXPORT_SYMBOL(kmap);
185EXPORT_SYMBOL(kunmap);
186EXPORT_SYMBOL(kmap_atomic);
187EXPORT_SYMBOL(kunmap_atomic);
188EXPORT_SYMBOL(kmap_atomic_to_page);
189#endif
190
191#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
192EXPORT_SYMBOL(ist_info);
193#endif
194
195EXPORT_SYMBOL(csum_partial);
diff --git a/arch/i386/kernel/i387.c b/arch/i386/kernel/i387.c
new file mode 100644
index 000000000000..c55e037f08f7
--- /dev/null
+++ b/arch/i386/kernel/i387.c
@@ -0,0 +1,555 @@
1/*
2 * linux/arch/i386/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * General FPU state handling cleanups
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11#include <linux/config.h>
12#include <linux/sched.h>
13#include <asm/processor.h>
14#include <asm/i387.h>
15#include <asm/math_emu.h>
16#include <asm/sigcontext.h>
17#include <asm/user.h>
18#include <asm/ptrace.h>
19#include <asm/uaccess.h>
20
21#ifdef CONFIG_MATH_EMULATION
22#define HAVE_HWFP (boot_cpu_data.hard_math)
23#else
24#define HAVE_HWFP 1
25#endif
26
27static unsigned long mxcsr_feature_mask = 0xffffffff;
28
29void mxcsr_feature_mask_init(void)
30{
31 unsigned long mask = 0;
32 clts();
33 if (cpu_has_fxsr) {
34 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
35 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
36 mask = current->thread.i387.fxsave.mxcsr_mask;
37 if (mask == 0) mask = 0x0000ffbf;
38 }
39 mxcsr_feature_mask &= mask;
40 stts();
41}
42
43/*
44 * The _current_ task is using the FPU for the first time,
45 * so initialize it, set the mxcsr to its default
46 * reset value if we support XMM instructions, and then
47 * remember that the current task has used the FPU.
48 */
49void init_fpu(struct task_struct *tsk)
50{
51 if (cpu_has_fxsr) {
52 memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
53 tsk->thread.i387.fxsave.cwd = 0x37f;
54 if (cpu_has_xmm)
55 tsk->thread.i387.fxsave.mxcsr = 0x1f80;
56 } else {
57 memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
58 tsk->thread.i387.fsave.cwd = 0xffff037fu;
59 tsk->thread.i387.fsave.swd = 0xffff0000u;
60 tsk->thread.i387.fsave.twd = 0xffffffffu;
61 tsk->thread.i387.fsave.fos = 0xffff0000u;
62 }
63 /* only the device not available exception or ptrace can call init_fpu */
64 set_stopped_child_used_math(tsk);
65}
66
67/*
68 * FPU lazy state save handling.
69 */
70
71void kernel_fpu_begin(void)
72{
73 struct thread_info *thread = current_thread_info();
74
75 preempt_disable();
76 if (thread->status & TS_USEDFPU) {
77 __save_init_fpu(thread->task);
78 return;
79 }
80 clts();
81}
82
83void restore_fpu( struct task_struct *tsk )
84{
85 if ( cpu_has_fxsr ) {
86 asm volatile( "fxrstor %0"
87 : : "m" (tsk->thread.i387.fxsave) );
88 } else {
89 asm volatile( "frstor %0"
90 : : "m" (tsk->thread.i387.fsave) );
91 }
92}
93
94/*
95 * FPU tag word conversions.
96 */
97
98static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
99{
100 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
101
102 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
103 tmp = ~twd;
104 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
105 /* and move the valid bits to the lower byte. */
106 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
107 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
108 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
109 return tmp;
110}
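A worked example (editorial, not part of the patch) of the compression above: with only st(0) in use (tag 00) and the other seven registers empty (tag 11), twd = 0xfffc.

	~twd                  = 0x0003
	(t | t >> 1) & 0x5555 = 0x0001	/* one "in use" bit per register pair */
	... & 0x3333          = 0x0001
	... & 0x0f0f          = 0x0001
	... & 0x00ff          = 0x0001	/* FXSR tag byte: only bit 0 (st0) set */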
111
112static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
113{
114 struct _fpxreg *st = NULL;
115 unsigned long tos = (fxsave->swd >> 11) & 7;
116 unsigned long twd = (unsigned long) fxsave->twd;
117 unsigned long tag;
118 unsigned long ret = 0xffff0000u;
119 int i;
120
121#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
122
123 for ( i = 0 ; i < 8 ; i++ ) {
124 if ( twd & 0x1 ) {
125 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
126
127 switch ( st->exponent & 0x7fff ) {
128 case 0x7fff:
129 tag = 2; /* Special */
130 break;
131 case 0x0000:
132 if ( !st->significand[0] &&
133 !st->significand[1] &&
134 !st->significand[2] &&
135 !st->significand[3] ) {
136 tag = 1; /* Zero */
137 } else {
138 tag = 2; /* Special */
139 }
140 break;
141 default:
142 if ( st->significand[3] & 0x8000 ) {
143 tag = 0; /* Valid */
144 } else {
145 tag = 2; /* Special */
146 }
147 break;
148 }
149 } else {
150 tag = 3; /* Empty */
151 }
152 ret |= (tag << (2 * i));
153 twd = twd >> 1;
154 }
155 return ret;
156}
157
158/*
159 * FPU state interaction.
160 */
161
162unsigned short get_fpu_cwd( struct task_struct *tsk )
163{
164 if ( cpu_has_fxsr ) {
165 return tsk->thread.i387.fxsave.cwd;
166 } else {
167 return (unsigned short)tsk->thread.i387.fsave.cwd;
168 }
169}
170
171unsigned short get_fpu_swd( struct task_struct *tsk )
172{
173 if ( cpu_has_fxsr ) {
174 return tsk->thread.i387.fxsave.swd;
175 } else {
176 return (unsigned short)tsk->thread.i387.fsave.swd;
177 }
178}
179
180#if 0
181unsigned short get_fpu_twd( struct task_struct *tsk )
182{
183 if ( cpu_has_fxsr ) {
184 return tsk->thread.i387.fxsave.twd;
185 } else {
186 return (unsigned short)tsk->thread.i387.fsave.twd;
187 }
188}
189#endif /* 0 */
190
191unsigned short get_fpu_mxcsr( struct task_struct *tsk )
192{
193 if ( cpu_has_xmm ) {
194 return tsk->thread.i387.fxsave.mxcsr;
195 } else {
196 return 0x1f80;
197 }
198}
199
200#if 0
201
202void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
203{
204 if ( cpu_has_fxsr ) {
205 tsk->thread.i387.fxsave.cwd = cwd;
206 } else {
207 tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
208 }
209}
210
211void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
212{
213 if ( cpu_has_fxsr ) {
214 tsk->thread.i387.fxsave.swd = swd;
215 } else {
216 tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
217 }
218}
219
220void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
221{
222 if ( cpu_has_fxsr ) {
223 tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
224 } else {
225 tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
226 }
227}
228
229#endif /* 0 */
230
231/*
232 * FXSR floating point environment conversions.
233 */
234
235static int convert_fxsr_to_user( struct _fpstate __user *buf,
236 struct i387_fxsave_struct *fxsave )
237{
238 unsigned long env[7];
239 struct _fpreg __user *to;
240 struct _fpxreg *from;
241 int i;
242
243 env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
244 env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
245 env[2] = twd_fxsr_to_i387(fxsave);
246 env[3] = fxsave->fip;
247 env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
248 env[5] = fxsave->foo;
249 env[6] = fxsave->fos;
250
251 if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
252 return 1;
253
254 to = &buf->_st[0];
255 from = (struct _fpxreg *) &fxsave->st_space[0];
256 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
257 unsigned long __user *t = (unsigned long __user *)to;
258 unsigned long *f = (unsigned long *)from;
259
260 if (__put_user(*f, t) ||
261 __put_user(*(f + 1), t + 1) ||
262 __put_user(from->exponent, &to->exponent))
263 return 1;
264 }
265 return 0;
266}
267
268static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
269 struct _fpstate __user *buf )
270{
271 unsigned long env[7];
272 struct _fpxreg *to;
273 struct _fpreg __user *from;
274 int i;
275
276 if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
277 return 1;
278
279 fxsave->cwd = (unsigned short)(env[0] & 0xffff);
280 fxsave->swd = (unsigned short)(env[1] & 0xffff);
281 fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
282 fxsave->fip = env[3];
283 fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
284 fxsave->fcs = (env[4] & 0xffff);
285 fxsave->foo = env[5];
286 fxsave->fos = env[6];
287
288 to = (struct _fpxreg *) &fxsave->st_space[0];
289 from = &buf->_st[0];
290 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
291 unsigned long *t = (unsigned long *)to;
292 unsigned long __user *f = (unsigned long __user *)from;
293
294 if (__get_user(*t, f) ||
295 __get_user(*(t + 1), f + 1) ||
296 __get_user(to->exponent, &from->exponent))
297 return 1;
298 }
299 return 0;
300}
301
302/*
303 * Signal frame handlers.
304 */
305
306static inline int save_i387_fsave( struct _fpstate __user *buf )
307{
308 struct task_struct *tsk = current;
309
310 unlazy_fpu( tsk );
311 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
312 if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
313 sizeof(struct i387_fsave_struct) ) )
314 return -1;
315 return 1;
316}
317
318static int save_i387_fxsave( struct _fpstate __user *buf )
319{
320 struct task_struct *tsk = current;
321 int err = 0;
322
323 unlazy_fpu( tsk );
324
325 if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
326 return -1;
327
328 err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
329 err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
330 if ( err )
331 return -1;
332
333 if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
334 sizeof(struct i387_fxsave_struct) ) )
335 return -1;
336 return 1;
337}
338
339int save_i387( struct _fpstate __user *buf )
340{
341 if ( !used_math() )
342 return 0;
343
344 /* This will cause a "finit" to be triggered by the next
345 * attempted FPU operation by the 'current' process.
346 */
347 clear_used_math();
348
349 if ( HAVE_HWFP ) {
350 if ( cpu_has_fxsr ) {
351 return save_i387_fxsave( buf );
352 } else {
353 return save_i387_fsave( buf );
354 }
355 } else {
356 return save_i387_soft( &current->thread.i387.soft, buf );
357 }
358}
359
360static inline int restore_i387_fsave( struct _fpstate __user *buf )
361{
362 struct task_struct *tsk = current;
363 clear_fpu( tsk );
364 return __copy_from_user( &tsk->thread.i387.fsave, buf,
365 sizeof(struct i387_fsave_struct) );
366}
367
368static int restore_i387_fxsave( struct _fpstate __user *buf )
369{
370 int err;
371 struct task_struct *tsk = current;
372 clear_fpu( tsk );
373 err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
374 sizeof(struct i387_fxsave_struct) );
375 /* mxcsr reserved bits must be masked to zero for security reasons */
376 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
377 return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
378}
379
380int restore_i387( struct _fpstate __user *buf )
381{
382 int err;
383
384 if ( HAVE_HWFP ) {
385 if ( cpu_has_fxsr ) {
386 err = restore_i387_fxsave( buf );
387 } else {
388 err = restore_i387_fsave( buf );
389 }
390 } else {
391 err = restore_i387_soft( &current->thread.i387.soft, buf );
392 }
393 set_used_math();
394 return err;
395}
396
397/*
398 * ptrace request handlers.
399 */
400
401static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
402 struct task_struct *tsk )
403{
404 return __copy_to_user( buf, &tsk->thread.i387.fsave,
405 sizeof(struct user_i387_struct) );
406}
407
408static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
409 struct task_struct *tsk )
410{
411 return convert_fxsr_to_user( (struct _fpstate __user *)buf,
412 &tsk->thread.i387.fxsave );
413}
414
415int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
416{
417 if ( HAVE_HWFP ) {
418 if ( cpu_has_fxsr ) {
419 return get_fpregs_fxsave( buf, tsk );
420 } else {
421 return get_fpregs_fsave( buf, tsk );
422 }
423 } else {
424 return save_i387_soft( &tsk->thread.i387.soft,
425 (struct _fpstate __user *)buf );
426 }
427}
428
429static inline int set_fpregs_fsave( struct task_struct *tsk,
430 struct user_i387_struct __user *buf )
431{
432 return __copy_from_user( &tsk->thread.i387.fsave, buf,
433 sizeof(struct user_i387_struct) );
434}
435
436static inline int set_fpregs_fxsave( struct task_struct *tsk,
437 struct user_i387_struct __user *buf )
438{
439 return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
440 (struct _fpstate __user *)buf );
441}
442
443int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
444{
445 if ( HAVE_HWFP ) {
446 if ( cpu_has_fxsr ) {
447 return set_fpregs_fxsave( tsk, buf );
448 } else {
449 return set_fpregs_fsave( tsk, buf );
450 }
451 } else {
452 return restore_i387_soft( &tsk->thread.i387.soft,
453 (struct _fpstate __user *)buf );
454 }
455}
456
457int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
458{
459 if ( cpu_has_fxsr ) {
460 if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
461 sizeof(struct user_fxsr_struct) ))
462 return -EFAULT;
463 return 0;
464 } else {
465 return -EIO;
466 }
467}
468
469int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
470{
471 int ret = 0;
472
473 if ( cpu_has_fxsr ) {
474 if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
475 sizeof(struct user_fxsr_struct) ))
476 ret = -EFAULT;
477 /* mxcsr reserved bits must be masked to zero for security reasons */
478 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
479 } else {
480 ret = -EIO;
481 }
482 return ret;
483}
484
485/*
486 * FPU state for core dumps.
487 */
488
489static inline void copy_fpu_fsave( struct task_struct *tsk,
490 struct user_i387_struct *fpu )
491{
492 memcpy( fpu, &tsk->thread.i387.fsave,
493 sizeof(struct user_i387_struct) );
494}
495
496static inline void copy_fpu_fxsave( struct task_struct *tsk,
497 struct user_i387_struct *fpu )
498{
499 unsigned short *to;
500 unsigned short *from;
501 int i;
502
503 memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
504
505 to = (unsigned short *)&fpu->st_space[0];
506 from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
507 for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
508 memcpy( to, from, 5 * sizeof(unsigned short) );
509 }
510}
511
512int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
513{
514 int fpvalid;
515 struct task_struct *tsk = current;
516
517 fpvalid = !!used_math();
518 if ( fpvalid ) {
519 unlazy_fpu( tsk );
520 if ( cpu_has_fxsr ) {
521 copy_fpu_fxsave( tsk, fpu );
522 } else {
523 copy_fpu_fsave( tsk, fpu );
524 }
525 }
526
527 return fpvalid;
528}
529
530int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
531{
532 int fpvalid = !!tsk_used_math(tsk);
533
534 if (fpvalid) {
535 if (tsk == current)
536 unlazy_fpu(tsk);
537 if (cpu_has_fxsr)
538 copy_fpu_fxsave(tsk, fpu);
539 else
540 copy_fpu_fsave(tsk, fpu);
541 }
542 return fpvalid;
543}
544
545int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
546{
547 int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
548
549 if (fpvalid) {
550 if (tsk == current)
551 unlazy_fpu(tsk);
552 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
553 }
554 return fpvalid;
555}
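
The fxsave-to-fsave helpers above repack each 80-bit ST register between the 16-byte slot used by the fxsave image and the packed 10-byte slot of the legacy fsave/user_i387 layout: copy five 16-bit words, skip the remaining padding. A minimal user-space sketch of that repacking, with made-up register contents and no kernel headers:

#include <stdio.h>
#include <string.h>

/* Standalone illustration of the loop in copy_fpu_fxsave(): 8 registers,
 * 8 shorts (16 bytes) per fxsave slot, 5 shorts (10 bytes) per fsave slot. */
static void repack_st_regs(unsigned short *to, const unsigned short *from)
{
	int i;

	for (i = 0; i < 8; i++, to += 5, from += 8)
		memcpy(to, from, 5 * sizeof(unsigned short));
}

int main(void)
{
	unsigned short fxsave_st[8 * 8] = { 0 };
	unsigned short fsave_st[8 * 5];
	int i;

	for (i = 0; i < 8; i++)
		fxsave_st[i * 8] = i + 1;	/* tag each register's low word */

	repack_st_regs(fsave_st, fxsave_st);

	for (i = 0; i < 8; i++)
		printf("st(%d) low word: %u\n", i, fsave_st[i * 5]);
	return 0;
}
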
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
new file mode 100644
index 000000000000..560bef1afb3b
--- /dev/null
+++ b/arch/i386/kernel/i8259.c
@@ -0,0 +1,429 @@
1#include <linux/config.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/slab.h>
8#include <linux/random.h>
9#include <linux/smp_lock.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/8253pit.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/irq.h>
20#include <asm/timer.h>
21#include <asm/pgtable.h>
22#include <asm/delay.h>
23#include <asm/desc.h>
24#include <asm/apic.h>
25#include <asm/arch_hooks.h>
26#include <asm/i8259.h>
27
28#include <linux/irq.h>
29
30#include <io_ports.h>
31
32/*
33 * This is the 'legacy' 8259A Programmable Interrupt Controller,
34 * present in the majority of PC/AT boxes.
35 * It also contains some generic x86-specific things, to the extent that
36 * generic specifics make any sense at all.
37 * This file should become arch/i386/kernel/irq.c once the old irq.c
38 * moves to arch-independent land.
39 */
40
41DEFINE_SPINLOCK(i8259A_lock);
42
43static void end_8259A_irq (unsigned int irq)
44{
45 if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
46 irq_desc[irq].action)
47 enable_8259A_irq(irq);
48}
49
50#define shutdown_8259A_irq disable_8259A_irq
51
52static void mask_and_ack_8259A(unsigned int);
53
54unsigned int startup_8259A_irq(unsigned int irq)
55{
56 enable_8259A_irq(irq);
57 return 0; /* never anything pending */
58}
59
60static struct hw_interrupt_type i8259A_irq_type = {
61 .typename = "XT-PIC",
62 .startup = startup_8259A_irq,
63 .shutdown = shutdown_8259A_irq,
64 .enable = enable_8259A_irq,
65 .disable = disable_8259A_irq,
66 .ack = mask_and_ack_8259A,
67 .end = end_8259A_irq,
68};
69
70/*
71 * 8259A PIC functions to handle ISA devices:
72 */
73
74/*
75 * This contains the irq mask for both 8259A irq controllers,
76 */
77unsigned int cached_irq_mask = 0xffff;
78
79/*
80 * Not all IRQs can be routed through the IO-APIC, e.g. on certain (older)
81 * boards the timer interrupt is not really connected to any IO-APIC pin,
82 * it's fed to the master 8259A's IR0 line only.
83 *
84 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
85 * This 'mixed mode' IRQ handling costs nothing because it's only used
86 * at IRQ setup time.
87 */
88unsigned long io_apic_irqs;
89
90void disable_8259A_irq(unsigned int irq)
91{
92 unsigned int mask = 1 << irq;
93 unsigned long flags;
94
95 spin_lock_irqsave(&i8259A_lock, flags);
96 cached_irq_mask |= mask;
97 if (irq & 8)
98 outb(cached_slave_mask, PIC_SLAVE_IMR);
99 else
100 outb(cached_master_mask, PIC_MASTER_IMR);
101 spin_unlock_irqrestore(&i8259A_lock, flags);
102}
103
104void enable_8259A_irq(unsigned int irq)
105{
106 unsigned int mask = ~(1 << irq);
107 unsigned long flags;
108
109 spin_lock_irqsave(&i8259A_lock, flags);
110 cached_irq_mask &= mask;
111 if (irq & 8)
112 outb(cached_slave_mask, PIC_SLAVE_IMR);
113 else
114 outb(cached_master_mask, PIC_MASTER_IMR);
115 spin_unlock_irqrestore(&i8259A_lock, flags);
116}
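
The 16-bit cached_irq_mask above shadows both PIC interrupt mask registers at once: the irq & 8 test selects whether the slave or the master IMR byte gets rewritten. A small stand-alone sketch of that split, assuming cached_master_mask/cached_slave_mask are simply the low and high bytes of cached_irq_mask, and doing printf() instead of port I/O:

#include <stdio.h>

static unsigned int cached_mask = 0xffff;	/* all 16 IRQs masked */

/* Print which IMR byte a mask/unmask of 'irq' would touch, mirroring
 * disable_8259A_irq()/enable_8259A_irq() above. Port numbers 0x21/0xA1
 * are the conventional master/slave IMR ports. */
static void show_imr_write(unsigned int irq)
{
	if (irq & 8)
		printf("IRQ%2u -> slave  IMR (0xA1) = 0x%02x\n",
		       irq, (cached_mask >> 8) & 0xff);
	else
		printf("IRQ%2u -> master IMR (0x21) = 0x%02x\n",
		       irq, cached_mask & 0xff);
}

int main(void)
{
	cached_mask &= ~(1u << 2);	/* cascade (IRQ2) unmasked */
	cached_mask &= ~(1u << 10);	/* unmask IRQ10: clears bit 2 of the slave byte */
	show_imr_write(10);

	cached_mask |= 1u << 10;	/* mask it again */
	show_imr_write(10);
	return 0;
}
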
117
118int i8259A_irq_pending(unsigned int irq)
119{
120 unsigned int mask = 1<<irq;
121 unsigned long flags;
122 int ret;
123
124 spin_lock_irqsave(&i8259A_lock, flags);
125 if (irq < 8)
126 ret = inb(PIC_MASTER_CMD) & mask;
127 else
128 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
129 spin_unlock_irqrestore(&i8259A_lock, flags);
130
131 return ret;
132}
133
134void make_8259A_irq(unsigned int irq)
135{
136 disable_irq_nosync(irq);
137 io_apic_irqs &= ~(1<<irq);
138 irq_desc[irq].handler = &i8259A_irq_type;
139 enable_irq(irq);
140}
141
142/*
143 * This function is expected to be called rarely: switching between
144 * 8259A registers is slow.
145 * The caller must hold the irq controller spinlock
146 * before calling it.
147 */
148static inline int i8259A_irq_real(unsigned int irq)
149{
150 int value;
151 int irqmask = 1<<irq;
152
153 if (irq < 8) {
154 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
155 value = inb(PIC_MASTER_CMD) & irqmask;
156 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
157 return value;
158 }
159 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
160 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
161 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
162 return value;
163}
164
165/*
166 * Careful! The 8259A is a fragile beast; it pretty
167 * much _has_ to be done exactly like this (mask it
168 * first, _then_ send the EOI), and the order of EOI
169 * to the two 8259s is important!
170 */
171static void mask_and_ack_8259A(unsigned int irq)
172{
173 unsigned int irqmask = 1 << irq;
174 unsigned long flags;
175
176 spin_lock_irqsave(&i8259A_lock, flags);
177 /*
178 * Lightweight spurious IRQ detection. We do not want
179 * to overdo spurious IRQ handling - it's usually a sign
180 * of hardware problems, so we only do the checks we can
181 * do without slowing down good hardware unnecessarily.
182 *
183 * Note that IRQ7 and IRQ15 (the two spurious IRQs
184 * usually resulting from the 8259A-1|2 PICs) occur
185 * even if the IRQ is masked in the 8259A. Thus we
186 * can check spurious 8259A IRQs without doing the
187 * quite slow i8259A_irq_real() call for every IRQ.
188 * This does not cover 100% of spurious interrupts,
189 * but should be enough to warn the user that there
190 * is something bad going on ...
191 */
192 if (cached_irq_mask & irqmask)
193 goto spurious_8259A_irq;
194 cached_irq_mask |= irqmask;
195
196handle_real_irq:
197 if (irq & 8) {
198 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
199 outb(cached_slave_mask, PIC_SLAVE_IMR);
200 outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
201 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
202 } else {
203 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
204 outb(cached_master_mask, PIC_MASTER_IMR);
205		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI' to master */
206 }
207 spin_unlock_irqrestore(&i8259A_lock, flags);
208 return;
209
210spurious_8259A_irq:
211 /*
212 * this is the slow path - should happen rarely.
213 */
214 if (i8259A_irq_real(irq))
215 /*
216 * oops, the IRQ _is_ in service according to the
217 * 8259A - not spurious, go handle it.
218 */
219 goto handle_real_irq;
220
221 {
222 static int spurious_irq_mask;
223 /*
224 * At this point we can be sure the IRQ is spurious,
225	 * let's ACK and report it. [once per IRQ]
226 */
227 if (!(spurious_irq_mask & irqmask)) {
228 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
229 spurious_irq_mask |= irqmask;
230 }
231 atomic_inc(&irq_err_count);
232 /*
233 * Theoretically we do not have to handle this IRQ,
234 * but in Linux this does not cause problems and is
235 * simpler for us.
236 */
237 goto handle_real_irq;
238 }
239}
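
/*
 * Worked example of the specific-EOI bytes used above (0x60 is the OCW2
 * "specific EOI" command, and the cascade sits on master IR2 as the comment
 * above notes): for IRQ11 (slave line 3) the handler writes 0x63 to the
 * slave command port and then 0x62 to the master command port; for a
 * master-only IRQ such as IRQ4, a single 0x64 to the master port suffices.
 */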
240
241static char irq_trigger[2];
242/*
243 * ELCR registers (0x4d0, 0x4d1) control the edge/level trigger mode of each ISA IRQ
244 */
245static void restore_ELCR(char *trigger)
246{
247 outb(trigger[0], 0x4d0);
248 outb(trigger[1], 0x4d1);
249}
250
251static void save_ELCR(char *trigger)
252{
253	/* IRQ 0,1,2,8,13 are marked as reserved: 0xF8 drops bits 0-2, 0xDE drops bits 0 and 5 */
254 trigger[0] = inb(0x4d0) & 0xF8;
255 trigger[1] = inb(0x4d1) & 0xDE;
256}
257
258static int i8259A_resume(struct sys_device *dev)
259{
260 init_8259A(0);
261 restore_ELCR(irq_trigger);
262 return 0;
263}
264
265static int i8259A_suspend(struct sys_device *dev, u32 state)
266{
267 save_ELCR(irq_trigger);
268 return 0;
269}
270
271static struct sysdev_class i8259_sysdev_class = {
272 set_kset_name("i8259"),
273 .suspend = i8259A_suspend,
274 .resume = i8259A_resume,
275};
276
277static struct sys_device device_i8259A = {
278 .id = 0,
279 .cls = &i8259_sysdev_class,
280};
281
282static int __init i8259A_init_sysfs(void)
283{
284 int error = sysdev_class_register(&i8259_sysdev_class);
285 if (!error)
286 error = sysdev_register(&device_i8259A);
287 return error;
288}
289
290device_initcall(i8259A_init_sysfs);
291
292void init_8259A(int auto_eoi)
293{
294 unsigned long flags;
295
296 spin_lock_irqsave(&i8259A_lock, flags);
297
298 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
299 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
300
301 /*
302 * outb_p - this has to work on a wide range of PC hardware.
303 */
304 outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
305 outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
306 outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
307 if (auto_eoi) /* master does Auto EOI */
308 outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
309 else /* master expects normal EOI */
310 outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
311
312 outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
313 outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
314 outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
315 outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
316 if (auto_eoi)
317 /*
318 * in AEOI mode we just have to mask the interrupt
319 * when acking.
320 */
321 i8259A_irq_type.ack = disable_8259A_irq;
322 else
323 i8259A_irq_type.ack = mask_and_ack_8259A;
324
325 udelay(100); /* wait for 8259A to initialize */
326
327 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
328 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
329
330 spin_unlock_irqrestore(&i8259A_lock, flags);
331}
332
333/*
334 * Note that on a 486, we don't want to do a SIGFPE on an irq13
335 * as the irq is unreliable, and exception 16 works correctly
336 * (i.e. as explained in the Intel literature). On a 386, you
337 * can't use exception 16 due to bad IBM design, so we have to
338 * rely on the less exact irq13.
339 *
340 * Careful.. Not only is IRQ13 unreliable, it also
341 * leads to races. IBM designers who came up with it should
342 * be shot.
343 */
344
345
346static irqreturn_t math_error_irq(int cpl, void *dev_id, struct pt_regs *regs)
347{
348 extern void math_error(void __user *);
349 outb(0,0xF0);
350 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
351 return IRQ_NONE;
352 math_error((void __user *)regs->eip);
353 return IRQ_HANDLED;
354}
355
356/*
357 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
358 * so allow interrupt sharing.
359 */
360static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
361
362void __init init_ISA_irqs (void)
363{
364 int i;
365
366#ifdef CONFIG_X86_LOCAL_APIC
367 init_bsp_APIC();
368#endif
369 init_8259A(0);
370
371 for (i = 0; i < NR_IRQS; i++) {
372 irq_desc[i].status = IRQ_DISABLED;
373 irq_desc[i].action = NULL;
374 irq_desc[i].depth = 1;
375
376 if (i < 16) {
377 /*
378 * 16 old-style INTA-cycle interrupts:
379 */
380 irq_desc[i].handler = &i8259A_irq_type;
381 } else {
382 /*
383 * 'high' PCI IRQs filled in on demand
384 */
385 irq_desc[i].handler = &no_irq_type;
386 }
387 }
388}
389
390void __init init_IRQ(void)
391{
392 int i;
393
394	/* all the setup before the call gates are initialised */
395 pre_intr_init_hook();
396
397 /*
398 * Cover the whole vector space, no vector can escape
399 * us. (some of these will be overridden and become
400 * 'special' SMP interrupts)
401 */
402 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
403 int vector = FIRST_EXTERNAL_VECTOR + i;
404 if (i >= NR_IRQS)
405 break;
406 if (vector != SYSCALL_VECTOR)
407 set_intr_gate(vector, interrupt[i]);
408 }
409
410 /* setup after call gates are initialised (usually add in
411 * the architecture specific gates)
412 */
413 intr_init_hook();
414
415 /*
416 * Set the clock to HZ Hz, we already have a valid
417 * vector now:
418 */
419 setup_pit_timer();
420
421 /*
422 * External FPU? Set up irq13 if so, for
423 * original braindamaged IBM FERR coupling.
424 */
425 if (boot_cpu_data.hard_math && !cpu_has_fpu)
426 setup_irq(FPU_IRQ, &fpu_irq);
427
428 irq_ctx_init(smp_processor_id());
429}
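
/*
 * Concretely, with the usual i386 layout (treat these numbers as
 * assumptions: FIRST_EXTERNAL_VECTOR 0x20, SYSCALL_VECTOR 0x80,
 * NR_VECTORS 256), the loop in init_IRQ() installs interrupt gates for
 * vectors 0x20..0xff, skips 0x80 so the int 0x80 system-call gate is left
 * alone, and stops early once i reaches NR_IRQS.
 */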
diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c
new file mode 100644
index 000000000000..9caa8e8db80c
--- /dev/null
+++ b/arch/i386/kernel/init_task.c
@@ -0,0 +1,46 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial thread structure.
23 *
24 * We need to make sure that this is THREAD_SIZE aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40
41/*
42 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
43 * no more per-task TSS's.
44 */
45DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;
46
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
new file mode 100644
index 000000000000..9c1350e811d0
--- /dev/null
+++ b/arch/i386/kernel/io_apic.c
@@ -0,0 +1,2545 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/irq.h>
25#include <linux/interrupt.h>
26#include <linux/init.h>
27#include <linux/delay.h>
28#include <linux/sched.h>
29#include <linux/config.h>
30#include <linux/smp_lock.h>
31#include <linux/mc146818rtc.h>
32#include <linux/compiler.h>
33#include <linux/acpi.h>
34
35#include <linux/sysdev.h>
36#include <asm/io.h>
37#include <asm/smp.h>
38#include <asm/desc.h>
39#include <asm/timer.h>
40
41#include <mach_apic.h>
42
43#include "io_ports.h"
44
45int (*ioapic_renumber_irq)(int ioapic, int irq);
46atomic_t irq_mis_count;
47
48static DEFINE_SPINLOCK(ioapic_lock);
49
50/*
51 * Is the SiS APIC rmw bug present?
52 * -1 = don't know, 0 = no, 1 = yes
53 */
54int sis_apic_bug = -1;
55
56/*
57 * # of IRQ routing registers
58 */
59int nr_ioapic_registers[MAX_IO_APICS];
60
61/*
62 * Rough estimate of how many shared IRQs there are; this can
63 * be changed at any time.
64 */
65#define MAX_PLUS_SHARED_IRQS NR_IRQS
66#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
67
68/*
69 * This is performance-critical; we want to do it O(1).
70 *
71 * The indexing order of this array favors 1:1 mappings
72 * between pins and IRQs.
73 */
74
75static struct irq_pin_list {
76 int apic, pin, next;
77} irq_2_pin[PIN_MAP_SIZE];
78
79int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
80#ifdef CONFIG_PCI_MSI
81#define vector_to_irq(vector) \
82 (platform_legacy_irq(vector) ? vector : vector_irq[vector])
83#else
84#define vector_to_irq(vector) (vector)
85#endif
86
87/*
88 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
89 * shared ISA-space IRQs, so we have to support them. We are super
90 * fast in the common case, and fast for shared ISA-space IRQs.
91 */
92static void add_pin_to_irq(unsigned int irq, int apic, int pin)
93{
94 static int first_free_entry = NR_IRQS;
95 struct irq_pin_list *entry = irq_2_pin + irq;
96
97 while (entry->next)
98 entry = irq_2_pin + entry->next;
99
100 if (entry->pin != -1) {
101 entry->next = first_free_entry;
102 entry = irq_2_pin + entry->next;
103 if (++first_free_entry >= PIN_MAP_SIZE)
104 panic("io_apic.c: whoops");
105 }
106 entry->apic = apic;
107 entry->pin = pin;
108}
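
The irq_2_pin[] array above doubles as a set of singly linked lists: the first NR_IRQS slots are per-IRQ list heads, and entries for shared IRQs are carved out of the remaining slots and chained through the next index. A small user-space sketch of the same scheme, with deliberately tiny made-up sizes and without the overflow panic:

#include <stdio.h>

#define NR_IRQS		4
#define PIN_MAP_SIZE	(2 * NR_IRQS)

static struct pin_entry {
	int apic, pin, next;
} pin_map[PIN_MAP_SIZE];

static int first_free = NR_IRQS;

/* Mirror add_pin_to_irq(): append (apic, pin) to the list headed at irq. */
static void add_pin(int irq, int apic, int pin)
{
	struct pin_entry *e = &pin_map[irq];

	while (e->next)
		e = &pin_map[e->next];
	if (e->pin != -1) {		/* head in use: grab a spare slot */
		e->next = first_free++;
		e = &pin_map[e->next];
	}
	e->apic = apic;
	e->pin = pin;
}

int main(void)
{
	int i;

	for (i = 0; i < PIN_MAP_SIZE; i++)
		pin_map[i].pin = -1;

	add_pin(3, 0, 3);		/* IRQ3 -> IO-APIC 0, pin 3 */
	add_pin(3, 1, 7);		/* shared: IRQ3 also on IO-APIC 1, pin 7 */

	for (i = 3; ; i = pin_map[i].next) {
		printf("IRQ3 -> apic %d pin %d\n", pin_map[i].apic, pin_map[i].pin);
		if (!pin_map[i].next)
			break;
	}
	return 0;
}
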
109
110/*
111 * Reroute an IRQ to a different pin.
112 */
113static void __init replace_pin_at_irq(unsigned int irq,
114 int oldapic, int oldpin,
115 int newapic, int newpin)
116{
117 struct irq_pin_list *entry = irq_2_pin + irq;
118
119 while (1) {
120 if (entry->apic == oldapic && entry->pin == oldpin) {
121 entry->apic = newapic;
122 entry->pin = newpin;
123 }
124 if (!entry->next)
125 break;
126 entry = irq_2_pin + entry->next;
127 }
128}
129
130static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
131{
132 struct irq_pin_list *entry = irq_2_pin + irq;
133 unsigned int pin, reg;
134
135 for (;;) {
136 pin = entry->pin;
137 if (pin == -1)
138 break;
139 reg = io_apic_read(entry->apic, 0x10 + pin*2);
140 reg &= ~disable;
141 reg |= enable;
142 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
143 if (!entry->next)
144 break;
145 entry = irq_2_pin + entry->next;
146 }
147}
148
149/* mask = 1 */
150static void __mask_IO_APIC_irq (unsigned int irq)
151{
152 __modify_IO_APIC_irq(irq, 0x00010000, 0);
153}
154
155/* mask = 0 */
156static void __unmask_IO_APIC_irq (unsigned int irq)
157{
158 __modify_IO_APIC_irq(irq, 0, 0x00010000);
159}
160
161/* mask = 1, trigger = 0 */
162static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
163{
164 __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
165}
166
167/* mask = 0, trigger = 1 */
168static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
169{
170 __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
171}
172
173static void mask_IO_APIC_irq (unsigned int irq)
174{
175 unsigned long flags;
176
177 spin_lock_irqsave(&ioapic_lock, flags);
178 __mask_IO_APIC_irq(irq);
179 spin_unlock_irqrestore(&ioapic_lock, flags);
180}
181
182static void unmask_IO_APIC_irq (unsigned int irq)
183{
184 unsigned long flags;
185
186 spin_lock_irqsave(&ioapic_lock, flags);
187 __unmask_IO_APIC_irq(irq);
188 spin_unlock_irqrestore(&ioapic_lock, flags);
189}
190
191static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
192{
193 struct IO_APIC_route_entry entry;
194 unsigned long flags;
195
196 /* Check delivery_mode to be sure we're not clearing an SMI pin */
197 spin_lock_irqsave(&ioapic_lock, flags);
198 *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
199 *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
200 spin_unlock_irqrestore(&ioapic_lock, flags);
201 if (entry.delivery_mode == dest_SMI)
202 return;
203
204 /*
205 * Disable it in the IO-APIC irq-routing table:
206 */
207 memset(&entry, 0, sizeof(entry));
208 entry.mask = 1;
209 spin_lock_irqsave(&ioapic_lock, flags);
210 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
211 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
212 spin_unlock_irqrestore(&ioapic_lock, flags);
213}
214
215static void clear_IO_APIC (void)
216{
217 int apic, pin;
218
219 for (apic = 0; apic < nr_ioapics; apic++)
220 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
221 clear_IO_APIC_pin(apic, pin);
222}
223
224static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
225{
226 unsigned long flags;
227 int pin;
228 struct irq_pin_list *entry = irq_2_pin + irq;
229 unsigned int apicid_value;
230
231 apicid_value = cpu_mask_to_apicid(cpumask);
232 /* Prepare to do the io_apic_write */
233 apicid_value = apicid_value << 24;
234 spin_lock_irqsave(&ioapic_lock, flags);
235 for (;;) {
236 pin = entry->pin;
237 if (pin == -1)
238 break;
239 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
240 if (!entry->next)
241 break;
242 entry = irq_2_pin + entry->next;
243 }
244 spin_unlock_irqrestore(&ioapic_lock, flags);
245}
246
247#if defined(CONFIG_IRQBALANCE)
248# include <asm/processor.h> /* kernel_thread() */
249# include <linux/kernel_stat.h> /* kstat */
250# include <linux/slab.h> /* kmalloc() */
251# include <linux/timer.h> /* time_after() */
252
253# ifdef CONFIG_BALANCED_IRQ_DEBUG
254# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
255# define Dprintk(x...) do { TDprintk(x); } while (0)
256# else
257# define TDprintk(x...)
258# define Dprintk(x...)
259# endif
260
261cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
262
263#define IRQBALANCE_CHECK_ARCH -999
264static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
265static int physical_balance = 0;
266
267static struct irq_cpu_info {
268 unsigned long * last_irq;
269 unsigned long * irq_delta;
270 unsigned long irq;
271} irq_cpu_data[NR_CPUS];
272
273#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
274#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
275#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
276
277#define IDLE_ENOUGH(cpu,now) \
278 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
279
280#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
281
282#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
283
284#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
285#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
286#define BALANCED_IRQ_MORE_DELTA (HZ/10)
287#define BALANCED_IRQ_LESS_DELTA (HZ)
288
289static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
290
291static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
292 unsigned long now, int direction)
293{
294 int search_idle = 1;
295 int cpu = curr_cpu;
296
297 goto inside;
298
299 do {
300 if (unlikely(cpu == curr_cpu))
301 search_idle = 0;
302inside:
303 if (direction == 1) {
304 cpu++;
305 if (cpu >= NR_CPUS)
306 cpu = 0;
307 } else {
308 cpu--;
309 if (cpu == -1)
310 cpu = NR_CPUS-1;
311 }
312 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
313 (search_idle && !IDLE_ENOUGH(cpu,now)));
314
315 return cpu;
316}
317
318static inline void balance_irq(int cpu, int irq)
319{
320 unsigned long now = jiffies;
321 cpumask_t allowed_mask;
322 unsigned int new_cpu;
323
324 if (irqbalance_disabled)
325 return;
326
327 cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
328 new_cpu = move(cpu, allowed_mask, now, 1);
329 if (cpu != new_cpu) {
330 irq_desc_t *desc = irq_desc + irq;
331 unsigned long flags;
332
333 spin_lock_irqsave(&desc->lock, flags);
334 pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu);
335 spin_unlock_irqrestore(&desc->lock, flags);
336 }
337}
338
339static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
340{
341 int i, j;
342 Dprintk("Rotating IRQs among CPUs.\n");
343 for (i = 0; i < NR_CPUS; i++) {
344 for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
345 if (!irq_desc[j].action)
346 continue;
347 /* Is it a significant load ? */
348 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
349 useful_load_threshold)
350 continue;
351 balance_irq(i, j);
352 }
353 }
354 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
355 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
356 return;
357}
358
359static void do_irq_balance(void)
360{
361 int i, j;
362 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
363 unsigned long move_this_load = 0;
364 int max_loaded = 0, min_loaded = 0;
365 int load;
366 unsigned long useful_load_threshold = balanced_irq_interval + 10;
367 int selected_irq;
368 int tmp_loaded, first_attempt = 1;
369 unsigned long tmp_cpu_irq;
370 unsigned long imbalance = 0;
371 cpumask_t allowed_mask, target_cpu_mask, tmp;
372
373 for (i = 0; i < NR_CPUS; i++) {
374 int package_index;
375 CPU_IRQ(i) = 0;
376 if (!cpu_online(i))
377 continue;
378 package_index = CPU_TO_PACKAGEINDEX(i);
379 for (j = 0; j < NR_IRQS; j++) {
380 unsigned long value_now, delta;
381 /* Is this an active IRQ? */
382 if (!irq_desc[j].action)
383 continue;
384 if ( package_index == i )
385 IRQ_DELTA(package_index,j) = 0;
386 /* Determine the total count per processor per IRQ */
387 value_now = (unsigned long) kstat_cpu(i).irqs[j];
388
389 /* Determine the activity per processor per IRQ */
390 delta = value_now - LAST_CPU_IRQ(i,j);
391
392 /* Update last_cpu_irq[][] for the next time */
393 LAST_CPU_IRQ(i,j) = value_now;
394
395 /* Ignore IRQs whose rate is less than the clock */
396 if (delta < useful_load_threshold)
397 continue;
398 /* update the load for the processor or package total */
399 IRQ_DELTA(package_index,j) += delta;
400
401 /* Keep track of the higher numbered sibling as well */
402 if (i != package_index)
403 CPU_IRQ(i) += delta;
404 /*
405 * We have sibling A and sibling B in the package
406 *
407 * cpu_irq[A] = load for cpu A + load for cpu B
408 * cpu_irq[B] = load for cpu B
409 */
410 CPU_IRQ(package_index) += delta;
411 }
412 }
413 /* Find the least loaded processor package */
414 for (i = 0; i < NR_CPUS; i++) {
415 if (!cpu_online(i))
416 continue;
417 if (i != CPU_TO_PACKAGEINDEX(i))
418 continue;
419 if (min_cpu_irq > CPU_IRQ(i)) {
420 min_cpu_irq = CPU_IRQ(i);
421 min_loaded = i;
422 }
423 }
424 max_cpu_irq = ULONG_MAX;
425
426tryanothercpu:
427 /* Look for heaviest loaded processor.
428 * We may come back to get the next heaviest loaded processor.
429 * Skip processors with trivial loads.
430 */
431 tmp_cpu_irq = 0;
432 tmp_loaded = -1;
433 for (i = 0; i < NR_CPUS; i++) {
434 if (!cpu_online(i))
435 continue;
436 if (i != CPU_TO_PACKAGEINDEX(i))
437 continue;
438 if (max_cpu_irq <= CPU_IRQ(i))
439 continue;
440 if (tmp_cpu_irq < CPU_IRQ(i)) {
441 tmp_cpu_irq = CPU_IRQ(i);
442 tmp_loaded = i;
443 }
444 }
445
446 if (tmp_loaded == -1) {
447		/* With a small number of heavy interrupt sources, we can
448		 * end up loading some of the cpus too much. We use Ingo's
449		 * original approach and rotate them around.
450		 */
451 if (!first_attempt && imbalance >= useful_load_threshold) {
452 rotate_irqs_among_cpus(useful_load_threshold);
453 return;
454 }
455 goto not_worth_the_effort;
456 }
457
458 first_attempt = 0; /* heaviest search */
459 max_cpu_irq = tmp_cpu_irq; /* load */
460 max_loaded = tmp_loaded; /* processor */
461 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
462
463 Dprintk("max_loaded cpu = %d\n", max_loaded);
464 Dprintk("min_loaded cpu = %d\n", min_loaded);
465 Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
466 Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
467 Dprintk("load imbalance = %lu\n", imbalance);
468
469	/* If the imbalance is less than roughly an eighth of the max load,
470	 * we are into diminishing returns - quit.
471	 */
472 if (imbalance < (max_cpu_irq >> 3)) {
473 Dprintk("Imbalance too trivial\n");
474 goto not_worth_the_effort;
475 }
476
477tryanotherirq:
478 /* if we select an IRQ to move that can't go where we want, then
479 * see if there is another one to try.
480 */
481 move_this_load = 0;
482 selected_irq = -1;
483 for (j = 0; j < NR_IRQS; j++) {
484 /* Is this an active IRQ? */
485 if (!irq_desc[j].action)
486 continue;
487 if (imbalance <= IRQ_DELTA(max_loaded,j))
488 continue;
489 /* Try to find the IRQ that is closest to the imbalance
490 * without going over.
491 */
492 if (move_this_load < IRQ_DELTA(max_loaded,j)) {
493 move_this_load = IRQ_DELTA(max_loaded,j);
494 selected_irq = j;
495 }
496 }
497 if (selected_irq == -1) {
498 goto tryanothercpu;
499 }
500
501 imbalance = move_this_load;
502
503	/* For the physical_balance case, we accumulated both load
504	 * values in one of the siblings' cpu_irq[] slots,
505	 * so that the same code can be used for physical and logical
506	 * processors as much as possible.
507 *
508 * NOTE: the cpu_irq[] array holds the sum of the load for
509 * sibling A and sibling B in the slot for the lowest numbered
510 * sibling (A), _AND_ the load for sibling B in the slot for
511 * the higher numbered sibling.
512 *
513 * We seek the least loaded sibling by making the comparison
514 * (A+B)/2 vs B
515 */
516 load = CPU_IRQ(min_loaded) >> 1;
517 for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
518 if (load > CPU_IRQ(j)) {
519 /* This won't change cpu_sibling_map[min_loaded] */
520 load = CPU_IRQ(j);
521 min_loaded = j;
522 }
523 }
524
525 cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
526 target_cpu_mask = cpumask_of_cpu(min_loaded);
527 cpus_and(tmp, target_cpu_mask, allowed_mask);
528
529 if (!cpus_empty(tmp)) {
530 irq_desc_t *desc = irq_desc + selected_irq;
531 unsigned long flags;
532
533 Dprintk("irq = %d moved to cpu = %d\n",
534 selected_irq, min_loaded);
535 /* mark for change destination */
536 spin_lock_irqsave(&desc->lock, flags);
537 pending_irq_balance_cpumask[selected_irq] =
538 cpumask_of_cpu(min_loaded);
539 spin_unlock_irqrestore(&desc->lock, flags);
540 /* Since we made a change, come back sooner to
541 * check for more variation.
542 */
543 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
544 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
545 return;
546 }
547 goto tryanotherirq;
548
549not_worth_the_effort:
550 /*
551 * if we did not find an IRQ to move, then adjust the time interval
552 * upward
553 */
554 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
555 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
556 Dprintk("IRQ worth rotating not found\n");
557 return;
558}
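
/*
 * Numeric example of the sibling accounting described above: if sibling A's
 * cpu_irq[] slot holds A+B = 1000 and sibling B's slot holds B = 300, then
 * load = (A+B)/2 = 500 > 300, so B is picked as the less loaded sibling;
 * had B held 600, the comparison would fail and min_loaded would stay on
 * the package slot (sibling A).
 */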
559
560static int balanced_irq(void *unused)
561{
562 int i;
563 unsigned long prev_balance_time = jiffies;
564 long time_remaining = balanced_irq_interval;
565
566 daemonize("kirqd");
567
568 /* push everything to CPU 0 to give us a starting point. */
569 for (i = 0 ; i < NR_IRQS ; i++) {
570 pending_irq_balance_cpumask[i] = cpumask_of_cpu(0);
571 }
572
573 for ( ; ; ) {
574 set_current_state(TASK_INTERRUPTIBLE);
575 time_remaining = schedule_timeout(time_remaining);
576 try_to_freeze(PF_FREEZE);
577 if (time_after(jiffies,
578 prev_balance_time+balanced_irq_interval)) {
579 do_irq_balance();
580 prev_balance_time = jiffies;
581 time_remaining = balanced_irq_interval;
582 }
583 }
584 return 0;
585}
586
587static int __init balanced_irq_init(void)
588{
589 int i;
590 struct cpuinfo_x86 *c;
591 cpumask_t tmp;
592
593 cpus_shift_right(tmp, cpu_online_map, 2);
594 c = &boot_cpu_data;
595	/* When not overridden on the command line, ask the subarchitecture. */
596 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
597 irqbalance_disabled = NO_BALANCE_IRQ;
598 if (irqbalance_disabled)
599 return 0;
600
601 /* disable irqbalance completely if there is only one processor online */
602 if (num_online_cpus() < 2) {
603 irqbalance_disabled = 1;
604 return 0;
605 }
606 /*
607 * Enable physical balance only if more than 1 physical processor
608 * is present
609 */
610 if (smp_num_siblings > 1 && !cpus_empty(tmp))
611 physical_balance = 1;
612
613 for (i = 0; i < NR_CPUS; i++) {
614 if (!cpu_online(i))
615 continue;
616 irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
617 irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
618 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
619			printk(KERN_ERR "balanced_irq_init: out of memory\n");
620 goto failed;
621 }
622 memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
623 memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
624 }
625
626 printk(KERN_INFO "Starting balanced_irq\n");
627 if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
628 return 0;
629 else
630		printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq\n");
631failed:
632 for (i = 0; i < NR_CPUS; i++) {
633 if(irq_cpu_data[i].irq_delta)
634 kfree(irq_cpu_data[i].irq_delta);
635 if(irq_cpu_data[i].last_irq)
636 kfree(irq_cpu_data[i].last_irq);
637 }
638 return 0;
639}
640
641int __init irqbalance_disable(char *str)
642{
643 irqbalance_disabled = 1;
644 return 0;
645}
646
647__setup("noirqbalance", irqbalance_disable);
648
649static inline void move_irq(int irq)
650{
651 /* note - we hold the desc->lock */
652 if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) {
653 set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]);
654 cpus_clear(pending_irq_balance_cpumask[irq]);
655 }
656}
657
658late_initcall(balanced_irq_init);
659
660#else /* !CONFIG_IRQBALANCE */
661static inline void move_irq(int irq) { }
662#endif /* CONFIG_IRQBALANCE */
663
664#ifndef CONFIG_SMP
665void fastcall send_IPI_self(int vector)
666{
667 unsigned int cfg;
668
669 /*
670 * Wait for idle.
671 */
672 apic_wait_icr_idle();
673 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
674 /*
675 * Send the IPI. The write to APIC_ICR fires this off.
676 */
677 apic_write_around(APIC_ICR, cfg);
678}
679#endif /* !CONFIG_SMP */
680
681
682/*
683 * Support for broken MP BIOSes: enables hand-redirection of PIRQ0-7 to
684 * specific CPU-side IRQs.
685 */
686
687#define MAX_PIRQS 8
688static int pirq_entries [MAX_PIRQS];
689static int pirqs_enabled;
690int skip_ioapic_setup;
691
692static int __init ioapic_setup(char *str)
693{
694 skip_ioapic_setup = 1;
695 return 1;
696}
697
698__setup("noapic", ioapic_setup);
699
700static int __init ioapic_pirq_setup(char *str)
701{
702 int i, max;
703 int ints[MAX_PIRQS+1];
704
705 get_options(str, ARRAY_SIZE(ints), ints);
706
707 for (i = 0; i < MAX_PIRQS; i++)
708 pirq_entries[i] = -1;
709
710 pirqs_enabled = 1;
711 apic_printk(APIC_VERBOSE, KERN_INFO
712 "PIRQ redirection, working around broken MP-BIOS.\n");
713 max = MAX_PIRQS;
714 if (ints[0] < MAX_PIRQS)
715 max = ints[0];
716
717 for (i = 0; i < max; i++) {
718 apic_printk(APIC_VERBOSE, KERN_DEBUG
719 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
720 /*
721 * PIRQs are mapped upside down, usually.
722 */
723 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
724 }
725 return 1;
726}
727
728__setup("pirq=", ioapic_pirq_setup);
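
/*
 * Worked example of the upside-down mapping above: booting with "pirq=5,11"
 * gives ints[0] = 2, ints[1] = 5, ints[2] = 11, so pirq_entries[7] = 5
 * (PIRQ7 -> IRQ5) and pirq_entries[6] = 11 (PIRQ6 -> IRQ11); a value of 0
 * makes pin_2_irq() report that PIRQ as disabled rather than redirect it.
 */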
729
730/*
731 * Find the IRQ entry number of a certain pin.
732 */
733static int find_irq_entry(int apic, int pin, int type)
734{
735 int i;
736
737 for (i = 0; i < mp_irq_entries; i++)
738 if (mp_irqs[i].mpc_irqtype == type &&
739 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
740 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
741 mp_irqs[i].mpc_dstirq == pin)
742 return i;
743
744 return -1;
745}
746
747/*
748 * Find the pin to which IRQ[irq] (ISA) is connected
749 */
750static int find_isa_irq_pin(int irq, int type)
751{
752 int i;
753
754 for (i = 0; i < mp_irq_entries; i++) {
755 int lbus = mp_irqs[i].mpc_srcbus;
756
757 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
758 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
759 mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
760 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
761 ) &&
762 (mp_irqs[i].mpc_irqtype == type) &&
763 (mp_irqs[i].mpc_srcbusirq == irq))
764
765 return mp_irqs[i].mpc_dstirq;
766 }
767 return -1;
768}
769
770/*
771 * Find a specific PCI IRQ entry.
772 * Not an __init, possibly needed by modules
773 */
774static int pin_2_irq(int idx, int apic, int pin);
775
776int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
777{
778 int apic, i, best_guess = -1;
779
780 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
781 "slot:%d, pin:%d.\n", bus, slot, pin);
782 if (mp_bus_id_to_pci_bus[bus] == -1) {
783 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
784 return -1;
785 }
786 for (i = 0; i < mp_irq_entries; i++) {
787 int lbus = mp_irqs[i].mpc_srcbus;
788
789 for (apic = 0; apic < nr_ioapics; apic++)
790 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
791 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
792 break;
793
794 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
795 !mp_irqs[i].mpc_irqtype &&
796 (bus == lbus) &&
797 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
798 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
799
800 if (!(apic || IO_APIC_IRQ(irq)))
801 continue;
802
803 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
804 return irq;
805 /*
806 * Use the first all-but-pin matching entry as a
807 * best-guess fuzzy result for broken mptables.
808 */
809 if (best_guess < 0)
810 best_guess = irq;
811 }
812 }
813 return best_guess;
814}
815
816/*
817 * This function is currently only a helper for the i386 smp boot process, where
818 * we need to reprogram the ioredtbls to cater for the cpus which have come online,
819 * so the mask in all cases should simply be TARGET_CPUS.
820 */
821void __init setup_ioapic_dest(void)
822{
823 int pin, ioapic, irq, irq_entry;
824
825 if (skip_ioapic_setup == 1)
826 return;
827
828 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
829 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
830 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
831 if (irq_entry == -1)
832 continue;
833 irq = pin_2_irq(irq_entry, ioapic, pin);
834 set_ioapic_affinity_irq(irq, TARGET_CPUS);
835 }
836
837 }
838}
839
840/*
841 * EISA Edge/Level control register, ELCR
842 */
843static int EISA_ELCR(unsigned int irq)
844{
845 if (irq < 16) {
846 unsigned int port = 0x4d0 + (irq >> 3);
847 return (inb(port) >> (irq & 7)) & 1;
848 }
849 apic_printk(APIC_VERBOSE, KERN_INFO
850 "Broken MPtable reports ISA irq %d\n", irq);
851 return 0;
852}
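
/*
 * Example: for ISA IRQ10 this reads port 0x4d0 + (10 >> 3) = 0x4d1 and tests
 * bit 10 & 7 = 2; a set bit is reported as level trigger, a clear bit as
 * edge (see default_EISA_trigger() below).
 */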
853
854/* EISA interrupts are always polarity zero and can be edge or level
855 * triggered, depending on the ELCR value. If an interrupt is listed as
856 * EISA conforming in the MP table, its trigger type must
857 * be read in from the ELCR. */
858
859#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
860#define default_EISA_polarity(idx) (0)
861
862/* ISA interrupts are always polarity zero edge triggered,
863 * when listed as conforming in the MP table. */
864
865#define default_ISA_trigger(idx) (0)
866#define default_ISA_polarity(idx) (0)
867
868/* PCI interrupts are always polarity one level triggered,
869 * when listed as conforming in the MP table. */
870
871#define default_PCI_trigger(idx) (1)
872#define default_PCI_polarity(idx) (1)
873
874/* MCA interrupts are always polarity zero level triggered,
875 * when listed as conforming in the MP table. */
876
877#define default_MCA_trigger(idx) (1)
878#define default_MCA_polarity(idx) (0)
879
880/* NEC98 interrupts are always polarity zero edge triggered,
881 * when listed as conforming in the MP table. */
882
883#define default_NEC98_trigger(idx) (0)
884#define default_NEC98_polarity(idx) (0)
885
886static int __init MPBIOS_polarity(int idx)
887{
888 int bus = mp_irqs[idx].mpc_srcbus;
889 int polarity;
890
891 /*
892 * Determine IRQ line polarity (high active or low active):
893 */
894 switch (mp_irqs[idx].mpc_irqflag & 3)
895 {
896 case 0: /* conforms, ie. bus-type dependent polarity */
897 {
898 switch (mp_bus_id_to_type[bus])
899 {
900 case MP_BUS_ISA: /* ISA pin */
901 {
902 polarity = default_ISA_polarity(idx);
903 break;
904 }
905 case MP_BUS_EISA: /* EISA pin */
906 {
907 polarity = default_EISA_polarity(idx);
908 break;
909 }
910 case MP_BUS_PCI: /* PCI pin */
911 {
912 polarity = default_PCI_polarity(idx);
913 break;
914 }
915 case MP_BUS_MCA: /* MCA pin */
916 {
917 polarity = default_MCA_polarity(idx);
918 break;
919 }
920 case MP_BUS_NEC98: /* NEC 98 pin */
921 {
922 polarity = default_NEC98_polarity(idx);
923 break;
924 }
925 default:
926 {
927 printk(KERN_WARNING "broken BIOS!!\n");
928 polarity = 1;
929 break;
930 }
931 }
932 break;
933 }
934 case 1: /* high active */
935 {
936 polarity = 0;
937 break;
938 }
939 case 2: /* reserved */
940 {
941 printk(KERN_WARNING "broken BIOS!!\n");
942 polarity = 1;
943 break;
944 }
945 case 3: /* low active */
946 {
947 polarity = 1;
948 break;
949 }
950 default: /* invalid */
951 {
952 printk(KERN_WARNING "broken BIOS!!\n");
953 polarity = 1;
954 break;
955 }
956 }
957 return polarity;
958}
959
960static int MPBIOS_trigger(int idx)
961{
962 int bus = mp_irqs[idx].mpc_srcbus;
963 int trigger;
964
965 /*
966 * Determine IRQ trigger mode (edge or level sensitive):
967 */
968 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
969 {
970 case 0: /* conforms, ie. bus-type dependent */
971 {
972 switch (mp_bus_id_to_type[bus])
973 {
974 case MP_BUS_ISA: /* ISA pin */
975 {
976 trigger = default_ISA_trigger(idx);
977 break;
978 }
979 case MP_BUS_EISA: /* EISA pin */
980 {
981 trigger = default_EISA_trigger(idx);
982 break;
983 }
984 case MP_BUS_PCI: /* PCI pin */
985 {
986 trigger = default_PCI_trigger(idx);
987 break;
988 }
989 case MP_BUS_MCA: /* MCA pin */
990 {
991 trigger = default_MCA_trigger(idx);
992 break;
993 }
994 case MP_BUS_NEC98: /* NEC 98 pin */
995 {
996 trigger = default_NEC98_trigger(idx);
997 break;
998 }
999 default:
1000 {
1001 printk(KERN_WARNING "broken BIOS!!\n");
1002 trigger = 1;
1003 break;
1004 }
1005 }
1006 break;
1007 }
1008 case 1: /* edge */
1009 {
1010 trigger = 0;
1011 break;
1012 }
1013 case 2: /* reserved */
1014 {
1015 printk(KERN_WARNING "broken BIOS!!\n");
1016 trigger = 1;
1017 break;
1018 }
1019 case 3: /* level */
1020 {
1021 trigger = 1;
1022 break;
1023 }
1024 default: /* invalid */
1025 {
1026 printk(KERN_WARNING "broken BIOS!!\n");
1027 trigger = 0;
1028 break;
1029 }
1030 }
1031 return trigger;
1032}
1033
1034static inline int irq_polarity(int idx)
1035{
1036 return MPBIOS_polarity(idx);
1037}
1038
1039static inline int irq_trigger(int idx)
1040{
1041 return MPBIOS_trigger(idx);
1042}
1043
1044static int pin_2_irq(int idx, int apic, int pin)
1045{
1046 int irq, i;
1047 int bus = mp_irqs[idx].mpc_srcbus;
1048
1049 /*
1050 * Debugging check, we are in big trouble if this message pops up!
1051 */
1052 if (mp_irqs[idx].mpc_dstirq != pin)
1053 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1054
1055 switch (mp_bus_id_to_type[bus])
1056 {
1057 case MP_BUS_ISA: /* ISA pin */
1058 case MP_BUS_EISA:
1059 case MP_BUS_MCA:
1060 case MP_BUS_NEC98:
1061 {
1062 irq = mp_irqs[idx].mpc_srcbusirq;
1063 break;
1064 }
1065 case MP_BUS_PCI: /* PCI pin */
1066 {
1067 /*
1068 * PCI IRQs are mapped in order
1069 */
1070 i = irq = 0;
1071 while (i < apic)
1072 irq += nr_ioapic_registers[i++];
1073 irq += pin;
1074
1075 /*
1076 * For MPS mode, so far only needed by ES7000 platform
1077 */
1078 if (ioapic_renumber_irq)
1079 irq = ioapic_renumber_irq(apic, irq);
1080
1081 break;
1082 }
1083 default:
1084 {
1085 printk(KERN_ERR "unknown bus type %d.\n",bus);
1086 irq = 0;
1087 break;
1088 }
1089 }
1090
1091 /*
1092 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1093 */
1094 if ((pin >= 16) && (pin <= 23)) {
1095 if (pirq_entries[pin-16] != -1) {
1096 if (!pirq_entries[pin-16]) {
1097 apic_printk(APIC_VERBOSE, KERN_DEBUG
1098 "disabling PIRQ%d\n", pin-16);
1099 } else {
1100 irq = pirq_entries[pin-16];
1101 apic_printk(APIC_VERBOSE, KERN_DEBUG
1102 "using PIRQ%d -> IRQ %d\n",
1103 pin-16, irq);
1104 }
1105 }
1106 }
1107 return irq;
1108}
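
/*
 * Example of the PCI numbering above: with two IO-APICs of 24 pins each
 * (nr_ioapic_registers[] = {24, 24}), a PCI interrupt on apic 1, pin 3 maps
 * to irq = 24 + 3 = 27, while ISA/EISA/MCA pins keep their source-bus IRQ
 * number unchanged.
 */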
1109
1110static inline int IO_APIC_irq_trigger(int irq)
1111{
1112 int apic, idx, pin;
1113
1114 for (apic = 0; apic < nr_ioapics; apic++) {
1115 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1116 idx = find_irq_entry(apic,pin,mp_INT);
1117 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
1118 return irq_trigger(idx);
1119 }
1120 }
1121 /*
1122	 * Nonexistent IRQs default to edge trigger.
1123 */
1124 return 0;
1125}
1126
1127/* irq_vector[] is indexed by the sum of all RTEs in all I/O APICs. */
1128u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
1129
1130int assign_irq_vector(int irq)
1131{
1132 static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
1133
1134 BUG_ON(irq >= NR_IRQ_VECTORS);
1135 if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
1136 return IO_APIC_VECTOR(irq);
1137next:
1138 current_vector += 8;
1139 if (current_vector == SYSCALL_VECTOR)
1140 goto next;
1141
1142 if (current_vector >= FIRST_SYSTEM_VECTOR) {
1143 offset++;
1144 if (!(offset%8))
1145 return -ENOSPC;
1146 current_vector = FIRST_DEVICE_VECTOR + offset;
1147 }
1148
1149 vector_irq[current_vector] = irq;
1150 if (irq != AUTO_ASSIGN)
1151 IO_APIC_VECTOR(irq) = current_vector;
1152
1153 return current_vector;
1154}
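
assign_irq_vector() above hands out vectors with a stride of 8 so consecutive devices land in different priority classes, skips the system-call vector, and restarts with a new offset when it runs past the device range. A stand-alone model of that walk (the three constants are assumptions for illustration, roughly the i386 values of the time):

#include <stdio.h>

#define FIRST_DEVICE_VECTOR	0x31
#define SYSCALL_VECTOR		0x80
#define FIRST_SYSTEM_VECTOR	0xef

/* Same allocation walk as assign_irq_vector(), minus the per-IRQ caching. */
static int next_vector(void)
{
	static int current_vector = FIRST_DEVICE_VECTOR, offset;

next:
	current_vector += 8;
	if (current_vector == SYSCALL_VECTOR)
		goto next;
	if (current_vector >= FIRST_SYSTEM_VECTOR) {
		offset++;
		if (!(offset % 8))
			return -1;	/* vector space exhausted */
		current_vector = FIRST_DEVICE_VECTOR + offset;
	}
	return current_vector;
}

int main(void)
{
	int i, v;

	for (i = 0; i < 6; i++) {
		v = next_vector();
		printf("allocation %d -> vector 0x%02x\n", i, v);
	}
	return 0;
}
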
1155
1156static struct hw_interrupt_type ioapic_level_type;
1157static struct hw_interrupt_type ioapic_edge_type;
1158
1159#define IOAPIC_AUTO -1
1160#define IOAPIC_EDGE 0
1161#define IOAPIC_LEVEL 1
1162
1163static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1164{
1165 if (use_pci_vector() && !platform_legacy_irq(irq)) {
1166 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1167 trigger == IOAPIC_LEVEL)
1168 irq_desc[vector].handler = &ioapic_level_type;
1169 else
1170 irq_desc[vector].handler = &ioapic_edge_type;
1171 set_intr_gate(vector, interrupt[vector]);
1172 } else {
1173 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1174 trigger == IOAPIC_LEVEL)
1175 irq_desc[irq].handler = &ioapic_level_type;
1176 else
1177 irq_desc[irq].handler = &ioapic_edge_type;
1178 set_intr_gate(vector, interrupt[irq]);
1179 }
1180}
1181
1182static void __init setup_IO_APIC_irqs(void)
1183{
1184 struct IO_APIC_route_entry entry;
1185 int apic, pin, idx, irq, first_notcon = 1, vector;
1186 unsigned long flags;
1187
1188 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1189
1190 for (apic = 0; apic < nr_ioapics; apic++) {
1191 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1192
1193 /*
1194 * add it to the IO-APIC irq-routing table:
1195 */
1196 memset(&entry,0,sizeof(entry));
1197
1198 entry.delivery_mode = INT_DELIVERY_MODE;
1199 entry.dest_mode = INT_DEST_MODE;
1200 entry.mask = 0; /* enable IRQ */
1201 entry.dest.logical.logical_dest =
1202 cpu_mask_to_apicid(TARGET_CPUS);
1203
1204 idx = find_irq_entry(apic,pin,mp_INT);
1205 if (idx == -1) {
1206 if (first_notcon) {
1207 apic_printk(APIC_VERBOSE, KERN_DEBUG
1208 " IO-APIC (apicid-pin) %d-%d",
1209 mp_ioapics[apic].mpc_apicid,
1210 pin);
1211 first_notcon = 0;
1212 } else
1213 apic_printk(APIC_VERBOSE, ", %d-%d",
1214 mp_ioapics[apic].mpc_apicid, pin);
1215 continue;
1216 }
1217
1218 entry.trigger = irq_trigger(idx);
1219 entry.polarity = irq_polarity(idx);
1220
1221 if (irq_trigger(idx)) {
1222 entry.trigger = 1;
1223 entry.mask = 1;
1224 }
1225
1226 irq = pin_2_irq(idx, apic, pin);
1227 /*
1228 * skip adding the timer int on secondary nodes, which causes
1229 * a small but painful rift in the time-space continuum
1230 */
1231 if (multi_timer_check(apic, irq))
1232 continue;
1233 else
1234 add_pin_to_irq(irq, apic, pin);
1235
1236 if (!apic && !IO_APIC_IRQ(irq))
1237 continue;
1238
1239 if (IO_APIC_IRQ(irq)) {
1240 vector = assign_irq_vector(irq);
1241 entry.vector = vector;
1242 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1243
1244 if (!apic && (irq < 16))
1245 disable_8259A_irq(irq);
1246 }
1247 spin_lock_irqsave(&ioapic_lock, flags);
1248 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
1249 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
1250 spin_unlock_irqrestore(&ioapic_lock, flags);
1251 }
1252 }
1253
1254 if (!first_notcon)
1255 apic_printk(APIC_VERBOSE, " not connected.\n");
1256}
1257
1258/*
1259 * Set up the 8259A-master output pin:
1260 */
1261static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
1262{
1263 struct IO_APIC_route_entry entry;
1264 unsigned long flags;
1265
1266 memset(&entry,0,sizeof(entry));
1267
1268 disable_8259A_irq(0);
1269
1270 /* mask LVT0 */
1271 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1272
1273 /*
1274 * We use logical delivery to get the timer IRQ
1275 * to the first CPU.
1276 */
1277 entry.dest_mode = INT_DEST_MODE;
1278 entry.mask = 0; /* unmask IRQ now */
1279 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1280 entry.delivery_mode = INT_DELIVERY_MODE;
1281 entry.polarity = 0;
1282 entry.trigger = 0;
1283 entry.vector = vector;
1284
1285 /*
1286 * The timer IRQ doesn't have to know that behind the
1287 * scenes we have an 8259A-master in AEOI mode ...
1288 */
1289 irq_desc[0].handler = &ioapic_edge_type;
1290
1291 /*
1292 * Add it to the IO-APIC irq-routing table:
1293 */
1294 spin_lock_irqsave(&ioapic_lock, flags);
1295 io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
1296 io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
1297 spin_unlock_irqrestore(&ioapic_lock, flags);
1298
1299 enable_8259A_irq(0);
1300}
1301
1302static inline void UNEXPECTED_IO_APIC(void)
1303{
1304}
1305
1306void __init print_IO_APIC(void)
1307{
1308 int apic, i;
1309 union IO_APIC_reg_00 reg_00;
1310 union IO_APIC_reg_01 reg_01;
1311 union IO_APIC_reg_02 reg_02;
1312 union IO_APIC_reg_03 reg_03;
1313 unsigned long flags;
1314
1315 if (apic_verbosity == APIC_QUIET)
1316 return;
1317
1318 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1319 for (i = 0; i < nr_ioapics; i++)
1320 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1321 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
1322
1323 /*
1324 * We are a bit conservative about what we expect. We have to
1325 * know about every hardware change ASAP.
1326 */
1327 printk(KERN_INFO "testing the IO APIC.......................\n");
1328
1329 for (apic = 0; apic < nr_ioapics; apic++) {
1330
1331 spin_lock_irqsave(&ioapic_lock, flags);
1332 reg_00.raw = io_apic_read(apic, 0);
1333 reg_01.raw = io_apic_read(apic, 1);
1334 if (reg_01.bits.version >= 0x10)
1335 reg_02.raw = io_apic_read(apic, 2);
1336 if (reg_01.bits.version >= 0x20)
1337 reg_03.raw = io_apic_read(apic, 3);
1338 spin_unlock_irqrestore(&ioapic_lock, flags);
1339
1340 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1341 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1342 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1343 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1344 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1345 if (reg_00.bits.ID >= get_physical_broadcast())
1346 UNEXPECTED_IO_APIC();
1347 if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1348 UNEXPECTED_IO_APIC();
1349
1350 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1351 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1352 if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1353 (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1354 (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1355 (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1356 (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1357 (reg_01.bits.entries != 0x2E) &&
1358 (reg_01.bits.entries != 0x3F)
1359 )
1360 UNEXPECTED_IO_APIC();
1361
1362 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1363 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1364 if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1365 (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1366 (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1367 (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1368 (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1369 )
1370 UNEXPECTED_IO_APIC();
1371 if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1372 UNEXPECTED_IO_APIC();
1373
1374 /*
1375 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1376 * but the value of reg_02 is read as the previous read register
1377 * value, so ignore it if reg_02 == reg_01.
1378 */
1379 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1380 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1381 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1382 if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1383 UNEXPECTED_IO_APIC();
1384 }
1385
1386 /*
1387 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1388 * or reg_03, but the value of reg_0[23] is read as the previous read
1389 * register value, so ignore it if reg_03 == reg_0[12].
1390 */
1391 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1392 reg_03.raw != reg_01.raw) {
1393 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1394 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1395 if (reg_03.bits.__reserved_1)
1396 UNEXPECTED_IO_APIC();
1397 }
1398
1399 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1400
1401 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1402 " Stat Dest Deli Vect: \n");
1403
1404 for (i = 0; i <= reg_01.bits.entries; i++) {
1405 struct IO_APIC_route_entry entry;
1406
1407 spin_lock_irqsave(&ioapic_lock, flags);
1408 *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
1409 *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
1410 spin_unlock_irqrestore(&ioapic_lock, flags);
1411
1412 printk(KERN_DEBUG " %02x %03X %02X ",
1413 i,
1414 entry.dest.logical.logical_dest,
1415 entry.dest.physical.physical_dest
1416 );
1417
1418 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1419 entry.mask,
1420 entry.trigger,
1421 entry.irr,
1422 entry.polarity,
1423 entry.delivery_status,
1424 entry.dest_mode,
1425 entry.delivery_mode,
1426 entry.vector
1427 );
1428 }
1429 }
1430 if (use_pci_vector())
1431 printk(KERN_INFO "Using vector-based indexing\n");
1432 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1433 for (i = 0; i < NR_IRQS; i++) {
1434 struct irq_pin_list *entry = irq_2_pin + i;
1435 if (entry->pin < 0)
1436 continue;
1437 if (use_pci_vector() && !platform_legacy_irq(i))
1438 printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
1439 else
1440 printk(KERN_DEBUG "IRQ%d ", i);
1441 for (;;) {
1442 printk("-> %d:%d", entry->apic, entry->pin);
1443 if (!entry->next)
1444 break;
1445 entry = irq_2_pin + entry->next;
1446 }
1447 printk("\n");
1448 }
1449
1450 printk(KERN_INFO ".................................... done.\n");
1451
1452 return;
1453}
1454
1455#if 0
1456
1457static void print_APIC_bitfield (int base)
1458{
1459 unsigned int v;
1460 int i, j;
1461
1462 if (apic_verbosity == APIC_QUIET)
1463 return;
1464
1465 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1466 for (i = 0; i < 8; i++) {
1467 v = apic_read(base + i*0x10);
1468 for (j = 0; j < 32; j++) {
1469 if (v & (1<<j))
1470 printk("1");
1471 else
1472 printk("0");
1473 }
1474 printk("\n");
1475 }
1476}
1477
1478void /*__init*/ print_local_APIC(void * dummy)
1479{
1480 unsigned int v, ver, maxlvt;
1481
1482 if (apic_verbosity == APIC_QUIET)
1483 return;
1484
1485 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1486 smp_processor_id(), hard_smp_processor_id());
1487 v = apic_read(APIC_ID);
1488 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
1489 v = apic_read(APIC_LVR);
1490 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1491 ver = GET_APIC_VERSION(v);
1492 maxlvt = get_maxlvt();
1493
1494 v = apic_read(APIC_TASKPRI);
1495 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1496
1497 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1498 v = apic_read(APIC_ARBPRI);
1499 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1500 v & APIC_ARBPRI_MASK);
1501 v = apic_read(APIC_PROCPRI);
1502 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1503 }
1504
1505 v = apic_read(APIC_EOI);
1506 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1507 v = apic_read(APIC_RRR);
1508 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1509 v = apic_read(APIC_LDR);
1510 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1511 v = apic_read(APIC_DFR);
1512 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1513 v = apic_read(APIC_SPIV);
1514 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1515
1516 printk(KERN_DEBUG "... APIC ISR field:\n");
1517 print_APIC_bitfield(APIC_ISR);
1518 printk(KERN_DEBUG "... APIC TMR field:\n");
1519 print_APIC_bitfield(APIC_TMR);
1520 printk(KERN_DEBUG "... APIC IRR field:\n");
1521 print_APIC_bitfield(APIC_IRR);
1522
1523 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1524 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1525 apic_write(APIC_ESR, 0);
1526 v = apic_read(APIC_ESR);
1527 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1528 }
1529
1530 v = apic_read(APIC_ICR);
1531 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1532 v = apic_read(APIC_ICR2);
1533 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1534
1535 v = apic_read(APIC_LVTT);
1536 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1537
1538 if (maxlvt > 3) { /* PC is LVT#4. */
1539 v = apic_read(APIC_LVTPC);
1540 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1541 }
1542 v = apic_read(APIC_LVT0);
1543 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1544 v = apic_read(APIC_LVT1);
1545 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1546
1547 if (maxlvt > 2) { /* ERR is LVT#3. */
1548 v = apic_read(APIC_LVTERR);
1549 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1550 }
1551
1552 v = apic_read(APIC_TMICT);
1553 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1554 v = apic_read(APIC_TMCCT);
1555 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1556 v = apic_read(APIC_TDCR);
1557 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1558 printk("\n");
1559}
1560
1561void print_all_local_APICs (void)
1562{
1563 on_each_cpu(print_local_APIC, NULL, 1, 1);
1564}
1565
1566void /*__init*/ print_PIC(void)
1567{
1568 extern spinlock_t i8259A_lock;
1569 unsigned int v;
1570 unsigned long flags;
1571
1572 if (apic_verbosity == APIC_QUIET)
1573 return;
1574
1575 printk(KERN_DEBUG "\nprinting PIC contents\n");
1576
1577 spin_lock_irqsave(&i8259A_lock, flags);
1578
1579 v = inb(0xa1) << 8 | inb(0x21);
1580 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1581
1582 v = inb(0xa0) << 8 | inb(0x20);
1583 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1584
1585 outb(0x0b,0xa0);
1586 outb(0x0b,0x20);
1587 v = inb(0xa0) << 8 | inb(0x20);
1588 outb(0x0a,0xa0);
1589 outb(0x0a,0x20);
1590
1591 spin_unlock_irqrestore(&i8259A_lock, flags);
1592
1593 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1594
1595 v = inb(0x4d1) << 8 | inb(0x4d0);
1596 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1597}
1598
1599#endif /* 0 */
1600
1601static void __init enable_IO_APIC(void)
1602{
1603 union IO_APIC_reg_01 reg_01;
1604 int i;
1605 unsigned long flags;
1606
1607 for (i = 0; i < PIN_MAP_SIZE; i++) {
1608 irq_2_pin[i].pin = -1;
1609 irq_2_pin[i].next = 0;
1610 }
1611 if (!pirqs_enabled)
1612 for (i = 0; i < MAX_PIRQS; i++)
1613 pirq_entries[i] = -1;
1614
1615 /*
1616 * The number of IO-APIC IRQ registers (== #pins):
1617 */
1618 for (i = 0; i < nr_ioapics; i++) {
1619 spin_lock_irqsave(&ioapic_lock, flags);
1620 reg_01.raw = io_apic_read(i, 1);
1621 spin_unlock_irqrestore(&ioapic_lock, flags);
1622 nr_ioapic_registers[i] = reg_01.bits.entries+1;
1623 }
1624
1625 /*
1626 * Do not trust the IO-APIC being empty at bootup
1627 */
1628 clear_IO_APIC();
1629}
1630
1631/*
1632 * Not an __init, needed by the reboot code
1633 */
1634void disable_IO_APIC(void)
1635{
1636 /*
1637 * Clear the IO-APIC before rebooting:
1638 */
1639 clear_IO_APIC();
1640
1641 disconnect_bsp_APIC();
1642}
1643
1644/*
1645 * function to set the IO-APIC physical IDs based on the
1646 * values stored in the MPC table.
1647 *
1648 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1649 */
1650
1651#ifndef CONFIG_X86_NUMAQ
1652static void __init setup_ioapic_ids_from_mpc(void)
1653{
1654 union IO_APIC_reg_00 reg_00;
1655 physid_mask_t phys_id_present_map;
1656 int apic;
1657 int i;
1658 unsigned char old_id;
1659 unsigned long flags;
1660
1661 /*
1662 * This is broken; anything with a real cpu count has to
1663 * circumvent this idiocy regardless.
1664 */
1665 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1666
1667 /*
1668 * Set the IOAPIC ID to the value stored in the MPC table.
1669 */
1670 for (apic = 0; apic < nr_ioapics; apic++) {
1671
1672 /* Read the register 0 value */
1673 spin_lock_irqsave(&ioapic_lock, flags);
1674 reg_00.raw = io_apic_read(apic, 0);
1675 spin_unlock_irqrestore(&ioapic_lock, flags);
1676
1677 old_id = mp_ioapics[apic].mpc_apicid;
1678
1679 if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
1680 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1681 apic, mp_ioapics[apic].mpc_apicid);
1682 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1683 reg_00.bits.ID);
1684 mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
1685 }
1686
1687 /* Don't check I/O APIC IDs for some xAPIC systems. They have
1688 * no meaning without the serial APIC bus. */
1689 if (NO_IOAPIC_CHECK)
1690 continue;
1691 /*
1692 * Sanity check, is the ID really free? Every APIC in a
1693 * system must have a unique ID or we get lots of nice
1694 * 'stuck on smp_invalidate_needed IPI wait' messages.
1695 */
1696 if (check_apicid_used(phys_id_present_map,
1697 mp_ioapics[apic].mpc_apicid)) {
1698 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1699 apic, mp_ioapics[apic].mpc_apicid);
1700 for (i = 0; i < get_physical_broadcast(); i++)
1701 if (!physid_isset(i, phys_id_present_map))
1702 break;
1703 if (i >= get_physical_broadcast())
1704 panic("Max APIC ID exceeded!\n");
1705 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1706 i);
1707 physid_set(i, phys_id_present_map);
1708 mp_ioapics[apic].mpc_apicid = i;
1709 } else {
1710 physid_mask_t tmp;
1711 tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
1712 apic_printk(APIC_VERBOSE, "Setting %d in the "
1713 "phys_id_present_map\n",
1714 mp_ioapics[apic].mpc_apicid);
1715 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1716 }
1717
1718
1719 /*
1720 * We need to adjust the IRQ routing table
1721 * if the ID changed.
1722 */
1723 if (old_id != mp_ioapics[apic].mpc_apicid)
1724 for (i = 0; i < mp_irq_entries; i++)
1725 if (mp_irqs[i].mpc_dstapic == old_id)
1726 mp_irqs[i].mpc_dstapic
1727 = mp_ioapics[apic].mpc_apicid;
1728
1729 /*
1730 * Read the right value from the MPC table and
1731 * write it into the ID register.
1732 */
1733 apic_printk(APIC_VERBOSE, KERN_INFO
1734 "...changing IO-APIC physical APIC ID to %d ...",
1735 mp_ioapics[apic].mpc_apicid);
1736
1737 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1738 spin_lock_irqsave(&ioapic_lock, flags);
1739 io_apic_write(apic, 0, reg_00.raw);
1740 spin_unlock_irqrestore(&ioapic_lock, flags);
1741
1742 /*
1743 * Sanity check
1744 */
1745 spin_lock_irqsave(&ioapic_lock, flags);
1746 reg_00.raw = io_apic_read(apic, 0);
1747 spin_unlock_irqrestore(&ioapic_lock, flags);
1748 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1749 printk("could not set ID!\n");
1750 else
1751 apic_printk(APIC_VERBOSE, " ok.\n");
1752 }
1753}
1754#else
1755static void __init setup_ioapic_ids_from_mpc(void) { }
1756#endif
1757
1758/*
1759 * There is a nasty bug in some older SMP boards: their mptable lies
1760 * about the timer IRQ. We do the following to work around the situation:
1761 *
1762 * - timer IRQ defaults to IO-APIC IRQ
1763 * - if this function detects that timer IRQs are defunct, then we fall
1764 * back to ISA timer IRQs
1765 */
1766static int __init timer_irq_works(void)
1767{
1768 unsigned long t1 = jiffies;
1769
1770 local_irq_enable();
1771 /* Let ten ticks pass... */
1772 mdelay((10 * 1000) / HZ);
1773
1774 /*
1775 * Expect a few ticks at least, to be sure some possible
1776 * glue logic does not lock up after one or two first
1777 * ticks in a non-ExtINT mode. Also the local APIC
1778 * might have cached one ExtINT interrupt. Finally, at
1779 * least one tick may be lost due to delays.
1780 */
1781 if (jiffies - t1 > 4)
1782 return 1;
1783
1784 return 0;
1785}
1786
1787/*
1788 * In the SMP+IOAPIC case it might happen that there are an unspecified
1789 * number of pending IRQ events unhandled. These cases are very rare,
1790 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1791 * better to do it this way because then we do not have to be aware of
1792 * 'pending' interrupts in the IRQ path, except at this point.
1793 */
1794/*
1795 * Edge triggered needs to resend any interrupt
1796 * that was delayed but this is now handled in the device
1797 * independent code.
1798 */
1799
1800/*
1801 * Starting up an edge-triggered IO-APIC interrupt is
1802 * nasty - we need to make sure that we get the edge.
1803 * If it is already asserted for some reason, we need
1804 * to return 1 to indicate that it was pending.
1805 *
1806 * This is not complete - we should be able to fake
1807 * an edge even if it isn't on the 8259A...
1808 */
1809static unsigned int startup_edge_ioapic_irq(unsigned int irq)
1810{
1811 int was_pending = 0;
1812 unsigned long flags;
1813
1814 spin_lock_irqsave(&ioapic_lock, flags);
1815 if (irq < 16) {
1816 disable_8259A_irq(irq);
1817 if (i8259A_irq_pending(irq))
1818 was_pending = 1;
1819 }
1820 __unmask_IO_APIC_irq(irq);
1821 spin_unlock_irqrestore(&ioapic_lock, flags);
1822
1823 return was_pending;
1824}
1825
1826/*
1827 * Once we have recorded IRQ_PENDING already, we can mask the
1828 * interrupt for real. This prevents IRQ storms from unhandled
1829 * devices.
1830 */
1831static void ack_edge_ioapic_irq(unsigned int irq)
1832{
1833 move_irq(irq);
1834 if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
1835 == (IRQ_PENDING | IRQ_DISABLED))
1836 mask_IO_APIC_irq(irq);
1837 ack_APIC_irq();
1838}
1839
1840/*
1841 * Level triggered interrupts can just be masked,
1842 * and shutting down and starting up the interrupt
1843 * is the same as enabling and disabling them -- except
1844 * with a startup need to return a "was pending" value.
1845 *
1846 * Level triggered interrupts are special because we
1847 * do not touch any IO-APIC register while handling
1848 * them. We ack the APIC in the end-IRQ handler, not
1849 * in the start-IRQ-handler. Protection against reentrance
1850 * from the same interrupt is still provided, both by the
1851 * generic IRQ layer and by the fact that an unacked local
1852 * APIC does not accept IRQs.
1853 */
1854static unsigned int startup_level_ioapic_irq (unsigned int irq)
1855{
1856 unmask_IO_APIC_irq(irq);
1857
1858 return 0; /* don't check for pending */
1859}
1860
1861static void end_level_ioapic_irq (unsigned int irq)
1862{
1863 unsigned long v;
1864 int i;
1865
1866 move_irq(irq);
1867/*
1868 * It appears there is an erratum which affects at least version 0x11
1869 * of I/O APIC (that's the 82093AA and cores integrated into various
1870 * chipsets). Under certain conditions a level-triggered interrupt is
1871 * erroneously delivered as edge-triggered one but the respective IRR
1872 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1873 * message but it will never arrive and further interrupts are blocked
1874 * from the source. The exact reason is so far unknown, but the
1875 * phenomenon was observed when two consecutive interrupt requests
1876 * from a given source get delivered to the same CPU and the source is
1877 * temporarily disabled in between.
1878 *
1879 * A workaround is to simulate an EOI message manually. We achieve it
1880 * by setting the trigger mode to edge and then to level when the edge
1881 * trigger mode gets detected in the TMR of a local APIC for a
1882 * level-triggered interrupt. We mask the source for the time of the
1883 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1884 * The idea is from Manfred Spraul. --macro
1885 */
1886 i = IO_APIC_VECTOR(irq);
1887
1888 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1889
1890 ack_APIC_irq();
1891
1892 if (!(v & (1 << (i & 0x1f)))) {
1893 atomic_inc(&irq_mis_count);
1894 spin_lock(&ioapic_lock);
1895 __mask_and_edge_IO_APIC_irq(irq);
1896 __unmask_and_level_IO_APIC_irq(irq);
1897 spin_unlock(&ioapic_lock);
1898 }
1899}
1900
1901#ifdef CONFIG_PCI_MSI
1902static unsigned int startup_edge_ioapic_vector(unsigned int vector)
1903{
1904 int irq = vector_to_irq(vector);
1905
1906 return startup_edge_ioapic_irq(irq);
1907}
1908
1909static void ack_edge_ioapic_vector(unsigned int vector)
1910{
1911 int irq = vector_to_irq(vector);
1912
1913 ack_edge_ioapic_irq(irq);
1914}
1915
1916static unsigned int startup_level_ioapic_vector (unsigned int vector)
1917{
1918 int irq = vector_to_irq(vector);
1919
1920 return startup_level_ioapic_irq (irq);
1921}
1922
1923static void end_level_ioapic_vector (unsigned int vector)
1924{
1925 int irq = vector_to_irq(vector);
1926
1927 end_level_ioapic_irq(irq);
1928}
1929
1930static void mask_IO_APIC_vector (unsigned int vector)
1931{
1932 int irq = vector_to_irq(vector);
1933
1934 mask_IO_APIC_irq(irq);
1935}
1936
1937static void unmask_IO_APIC_vector (unsigned int vector)
1938{
1939 int irq = vector_to_irq(vector);
1940
1941 unmask_IO_APIC_irq(irq);
1942}
1943
1944static void set_ioapic_affinity_vector (unsigned int vector,
1945 cpumask_t cpu_mask)
1946{
1947 int irq = vector_to_irq(vector);
1948
1949 set_ioapic_affinity_irq(irq, cpu_mask);
1950}
1951#endif
1952
1953/*
1954 * Level and edge triggered IO-APIC interrupts need different handling,
1955 * so we use two separate IRQ descriptors. Edge triggered IRQs can be
1956 * handled with the level-triggered descriptor, but that one has slightly
1957 * more overhead. Level-triggered interrupts cannot be handled with the
1958 * edge-triggered handler, without risking IRQ storms and other ugly
1959 * races.
1960 */
1961static struct hw_interrupt_type ioapic_edge_type = {
1962 .typename = "IO-APIC-edge",
1963 .startup = startup_edge_ioapic,
1964 .shutdown = shutdown_edge_ioapic,
1965 .enable = enable_edge_ioapic,
1966 .disable = disable_edge_ioapic,
1967 .ack = ack_edge_ioapic,
1968 .end = end_edge_ioapic,
1969 .set_affinity = set_ioapic_affinity,
1970};
1971
1972static struct hw_interrupt_type ioapic_level_type = {
1973 .typename = "IO-APIC-level",
1974 .startup = startup_level_ioapic,
1975 .shutdown = shutdown_level_ioapic,
1976 .enable = enable_level_ioapic,
1977 .disable = disable_level_ioapic,
1978 .ack = mask_and_ack_level_ioapic,
1979 .end = end_level_ioapic,
1980 .set_affinity = set_ioapic_affinity,
1981};
1982
1983static inline void init_IO_APIC_traps(void)
1984{
1985 int irq;
1986
1987 /*
1988 * NOTE! The local APIC isn't very good at handling
1989 * multiple interrupts at the same interrupt level.
1990 * As the interrupt level is determined by taking the
1991 * vector number and shifting that right by 4, we
1992 * want to spread these out a bit so that they don't
1993 * all fall in the same interrupt level.
1994 *
1995 * Also, we've got to be careful not to trash gate
1996 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1997 */
1998 for (irq = 0; irq < NR_IRQS ; irq++) {
1999 int tmp = irq;
2000 if (use_pci_vector()) {
2001 if (!platform_legacy_irq(tmp))
2002 if ((tmp = vector_to_irq(tmp)) == -1)
2003 continue;
2004 }
2005 if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
2006 /*
2007 * Hmm.. We don't have an entry for this,
2008 * so default to an old-fashioned 8259
2009 * interrupt if we can..
2010 */
2011 if (irq < 16)
2012 make_8259A_irq(irq);
2013 else
2014 /* Strange. Oh, well.. */
2015 irq_desc[irq].handler = &no_irq_type;
2016 }
2017 }
2018}
2019
2020static void enable_lapic_irq (unsigned int irq)
2021{
2022 unsigned long v;
2023
2024 v = apic_read(APIC_LVT0);
2025 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2026}
2027
2028static void disable_lapic_irq (unsigned int irq)
2029{
2030 unsigned long v;
2031
2032 v = apic_read(APIC_LVT0);
2033 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2034}
2035
2036static void ack_lapic_irq (unsigned int irq)
2037{
2038 ack_APIC_irq();
2039}
2040
2041static void end_lapic_irq (unsigned int i) { /* nothing */ }
2042
2043static struct hw_interrupt_type lapic_irq_type = {
2044 .typename = "local-APIC-edge",
2045 .startup = NULL, /* startup_irq() not used for IRQ0 */
2046 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
2047 .enable = enable_lapic_irq,
2048 .disable = disable_lapic_irq,
2049 .ack = ack_lapic_irq,
2050 .end = end_lapic_irq
2051};
2052
2053static void setup_nmi (void)
2054{
2055 /*
2056 * Dirty trick to enable the NMI watchdog ...
2057 * We put the 8259A master into AEOI mode and
2058 * unmask on all local APICs LVT0 as NMI.
2059 *
2060 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2061 * is from Maciej W. Rozycki - so we do not have to EOI from
2062 * the NMI handler or the timer interrupt.
2063 */
2064 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2065
2066 on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
2067
2068 apic_printk(APIC_VERBOSE, " done.\n");
2069}
2070
2071/*
2072 * This looks a bit hackish but it's about the only way of sending
2073 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2074 * not support the ExtINT mode, unfortunately. We need to send these
2075 * cycles as some i82489DX-based boards have glue logic that keeps the
2076 * 8259A interrupt line asserted until INTA. --macro
2077 */
2078static inline void unlock_ExtINT_logic(void)
2079{
2080 int pin, i;
2081 struct IO_APIC_route_entry entry0, entry1;
2082 unsigned char save_control, save_freq_select;
2083 unsigned long flags;
2084
2085 pin = find_isa_irq_pin(8, mp_INT);
2086 if (pin == -1)
2087 return;
2088
2089 spin_lock_irqsave(&ioapic_lock, flags);
2090 *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
2091 *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
2092 spin_unlock_irqrestore(&ioapic_lock, flags);
2093 clear_IO_APIC_pin(0, pin);
2094
2095 memset(&entry1, 0, sizeof(entry1));
2096
2097 entry1.dest_mode = 0; /* physical delivery */
2098 entry1.mask = 0; /* unmask IRQ now */
2099 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2100 entry1.delivery_mode = dest_ExtINT;
2101 entry1.polarity = entry0.polarity;
2102 entry1.trigger = 0;
2103 entry1.vector = 0;
2104
2105 spin_lock_irqsave(&ioapic_lock, flags);
2106 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
2107 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
2108 spin_unlock_irqrestore(&ioapic_lock, flags);
2109
2110 save_control = CMOS_READ(RTC_CONTROL);
2111 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2112 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2113 RTC_FREQ_SELECT);
2114 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2115
2116 i = 100;
2117 while (i-- > 0) {
2118 mdelay(10);
2119 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2120 i -= 10;
2121 }
2122
2123 CMOS_WRITE(save_control, RTC_CONTROL);
2124 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2125 clear_IO_APIC_pin(0, pin);
2126
2127 spin_lock_irqsave(&ioapic_lock, flags);
2128 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
2129 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
2130 spin_unlock_irqrestore(&ioapic_lock, flags);
2131}
2132
2133/*
2134 * This code may look a bit paranoid, but it's supposed to cooperate with
2135 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2136 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2137 * fanatically on his truly buggy board.
2138 */
2139static inline void check_timer(void)
2140{
2141 int pin1, pin2;
2142 int vector;
2143
2144 /*
2145 * get/set the timer IRQ vector:
2146 */
2147 disable_8259A_irq(0);
2148 vector = assign_irq_vector(0);
2149 set_intr_gate(vector, interrupt[0]);
2150
2151 /*
2152 * Subtle, code in do_timer_interrupt() expects an AEOI
2153 * mode for the 8259A whenever interrupts are routed
2154 * through I/O APICs. Also IRQ0 has to be enabled in
2155 * the 8259A which implies the virtual wire has to be
2156 * disabled in the local APIC.
2157 */
2158 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2159 init_8259A(1);
2160 timer_ack = 1;
2161 enable_8259A_irq(0);
2162
2163 pin1 = find_isa_irq_pin(0, mp_INT);
2164 pin2 = find_isa_irq_pin(0, mp_ExtINT);
2165
2166 printk(KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
2167
2168 if (pin1 != -1) {
2169 /*
2170 * Ok, does IRQ0 through the IOAPIC work?
2171 */
2172 unmask_IO_APIC_irq(0);
2173 if (timer_irq_works()) {
2174 if (nmi_watchdog == NMI_IO_APIC) {
2175 disable_8259A_irq(0);
2176 setup_nmi();
2177 enable_8259A_irq(0);
2178 check_nmi_watchdog();
2179 }
2180 return;
2181 }
2182 clear_IO_APIC_pin(0, pin1);
2183 printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
2184 }
2185
2186 printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
2187 if (pin2 != -1) {
2188 printk("\n..... (found pin %d) ...", pin2);
2189 /*
2190 * legacy devices should be connected to IO APIC #0
2191 */
2192 setup_ExtINT_IRQ0_pin(pin2, vector);
2193 if (timer_irq_works()) {
2194 printk("works.\n");
2195 if (pin1 != -1)
2196 replace_pin_at_irq(0, 0, pin1, 0, pin2);
2197 else
2198 add_pin_to_irq(0, 0, pin2);
2199 if (nmi_watchdog == NMI_IO_APIC) {
2200 setup_nmi();
2201 check_nmi_watchdog();
2202 }
2203 return;
2204 }
2205 /*
2206 * Cleanup, just in case ...
2207 */
2208 clear_IO_APIC_pin(0, pin2);
2209 }
2210 printk(" failed.\n");
2211
2212 if (nmi_watchdog == NMI_IO_APIC) {
2213 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
2214 nmi_watchdog = 0;
2215 }
2216
2217 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2218
2219 disable_8259A_irq(0);
2220 irq_desc[0].handler = &lapic_irq_type;
2221 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2222 enable_8259A_irq(0);
2223
2224 if (timer_irq_works()) {
2225 printk(" works.\n");
2226 return;
2227 }
2228 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2229 printk(" failed.\n");
2230
2231 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
2232
2233 timer_ack = 0;
2234 init_8259A(0);
2235 make_8259A_irq(0);
2236 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
2237
2238 unlock_ExtINT_logic();
2239
2240 if (timer_irq_works()) {
2241 printk(" works.\n");
2242 return;
2243 }
2244 printk(" failed :(.\n");
2245 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2246 "report. Then try booting with the 'noapic' option");
2247}
2248
2249/*
2250 *
2251 * IRQs that are handled by the PIC in the MPS IOAPIC case.
2252 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
2253 * Linux doesn't really care, as it's not actually used
2254 * for any interrupt handling anyway.
2255 */
2256#define PIC_IRQS (1 << PIC_CASCADE_IR)
2257
2258void __init setup_IO_APIC(void)
2259{
2260 enable_IO_APIC();
2261
2262 if (acpi_ioapic)
2263 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
2264 else
2265 io_apic_irqs = ~PIC_IRQS;
2266
2267 printk("ENABLING IO-APIC IRQs\n");
2268
2269 /*
2270 * Set up IO-APIC IRQ routing.
2271 */
2272 if (!acpi_ioapic)
2273 setup_ioapic_ids_from_mpc();
2274 sync_Arb_IDs();
2275 setup_IO_APIC_irqs();
2276 init_IO_APIC_traps();
2277 check_timer();
2278 if (!acpi_ioapic)
2279 print_IO_APIC();
2280}
2281
2282/*
2283 * Called after all the initialization is done. If we didn't find any
2284 * APIC bugs then we can allow the modify fast path
2285 */
2286
2287static int __init io_apic_bug_finalize(void)
2288{
2289 if(sis_apic_bug == -1)
2290 sis_apic_bug = 0;
2291 return 0;
2292}
2293
2294late_initcall(io_apic_bug_finalize);
2295
2296struct sysfs_ioapic_data {
2297 struct sys_device dev;
2298 struct IO_APIC_route_entry entry[0];
2299};
2300static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
2301
2302static int ioapic_suspend(struct sys_device *dev, u32 state)
2303{
2304 struct IO_APIC_route_entry *entry;
2305 struct sysfs_ioapic_data *data;
2306 unsigned long flags;
2307 int i;
2308
2309 data = container_of(dev, struct sysfs_ioapic_data, dev);
2310 entry = data->entry;
2311 spin_lock_irqsave(&ioapic_lock, flags);
2312 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2313 *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
2314 *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
2315 }
2316 spin_unlock_irqrestore(&ioapic_lock, flags);
2317
2318 return 0;
2319}
2320
2321static int ioapic_resume(struct sys_device *dev)
2322{
2323 struct IO_APIC_route_entry *entry;
2324 struct sysfs_ioapic_data *data;
2325 unsigned long flags;
2326 union IO_APIC_reg_00 reg_00;
2327 int i;
2328
2329 data = container_of(dev, struct sysfs_ioapic_data, dev);
2330 entry = data->entry;
2331
2332 spin_lock_irqsave(&ioapic_lock, flags);
2333 reg_00.raw = io_apic_read(dev->id, 0);
2334 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
2335 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2336 io_apic_write(dev->id, 0, reg_00.raw);
2337 }
2338 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2339 io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
2340 io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
2341 }
2342 spin_unlock_irqrestore(&ioapic_lock, flags);
2343
2344 return 0;
2345}
2346
2347static struct sysdev_class ioapic_sysdev_class = {
2348 set_kset_name("ioapic"),
2349 .suspend = ioapic_suspend,
2350 .resume = ioapic_resume,
2351};
2352
2353static int __init ioapic_init_sysfs(void)
2354{
2355 struct sys_device * dev;
2356 int i, size, error = 0;
2357
2358 error = sysdev_class_register(&ioapic_sysdev_class);
2359 if (error)
2360 return error;
2361
2362 for (i = 0; i < nr_ioapics; i++ ) {
2363 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2364 * sizeof(struct IO_APIC_route_entry);
2365 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
2366 if (!mp_ioapic_data[i]) {
2367 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2368 continue;
2369 }
2370 memset(mp_ioapic_data[i], 0, size);
2371 dev = &mp_ioapic_data[i]->dev;
2372 dev->id = i;
2373 dev->cls = &ioapic_sysdev_class;
2374 error = sysdev_register(dev);
2375 if (error) {
2376 kfree(mp_ioapic_data[i]);
2377 mp_ioapic_data[i] = NULL;
2378 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2379 continue;
2380 }
2381 }
2382
2383 return 0;
2384}
2385
2386device_initcall(ioapic_init_sysfs);
2387
2388/* --------------------------------------------------------------------------
2389 ACPI-based IOAPIC Configuration
2390 -------------------------------------------------------------------------- */
2391
2392#ifdef CONFIG_ACPI_BOOT
2393
2394int __init io_apic_get_unique_id (int ioapic, int apic_id)
2395{
2396 union IO_APIC_reg_00 reg_00;
2397 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2398 physid_mask_t tmp;
2399 unsigned long flags;
2400 int i = 0;
2401
2402 /*
2403 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2404 * buses (one for LAPICs, one for IOAPICs), where its predecessors only
2405 * supported up to 16 on one shared APIC bus.
2406 *
2407 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2408 * advantage of new APIC bus architecture.
2409 */
2410
2411 if (physids_empty(apic_id_map))
2412 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2413
2414 spin_lock_irqsave(&ioapic_lock, flags);
2415 reg_00.raw = io_apic_read(ioapic, 0);
2416 spin_unlock_irqrestore(&ioapic_lock, flags);
2417
2418 if (apic_id >= get_physical_broadcast()) {
2419 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2420 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2421 apic_id = reg_00.bits.ID;
2422 }
2423
2424 /*
2425 * Every APIC in a system must have a unique ID or we get lots of nice
2426 * 'stuck on smp_invalidate_needed IPI wait' messages.
2427 */
2428 if (check_apicid_used(apic_id_map, apic_id)) {
2429
2430 for (i = 0; i < get_physical_broadcast(); i++) {
2431 if (!check_apicid_used(apic_id_map, i))
2432 break;
2433 }
2434
2435 if (i == get_physical_broadcast())
2436 panic("Max apic_id exceeded!\n");
2437
2438 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2439 "trying %d\n", ioapic, apic_id, i);
2440
2441 apic_id = i;
2442 }
2443
2444 tmp = apicid_to_cpu_present(apic_id);
2445 physids_or(apic_id_map, apic_id_map, tmp);
2446
2447 if (reg_00.bits.ID != apic_id) {
2448 reg_00.bits.ID = apic_id;
2449
2450 spin_lock_irqsave(&ioapic_lock, flags);
2451 io_apic_write(ioapic, 0, reg_00.raw);
2452 reg_00.raw = io_apic_read(ioapic, 0);
2453 spin_unlock_irqrestore(&ioapic_lock, flags);
2454
2455 /* Sanity check */
2456 if (reg_00.bits.ID != apic_id)
2457 			panic("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2458 }
2459
2460 apic_printk(APIC_VERBOSE, KERN_INFO
2461 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2462
2463 return apic_id;
2464}
2465
2466
2467int __init io_apic_get_version (int ioapic)
2468{
2469 union IO_APIC_reg_01 reg_01;
2470 unsigned long flags;
2471
2472 spin_lock_irqsave(&ioapic_lock, flags);
2473 reg_01.raw = io_apic_read(ioapic, 1);
2474 spin_unlock_irqrestore(&ioapic_lock, flags);
2475
2476 return reg_01.bits.version;
2477}
2478
2479
2480int __init io_apic_get_redir_entries (int ioapic)
2481{
2482 union IO_APIC_reg_01 reg_01;
2483 unsigned long flags;
2484
2485 spin_lock_irqsave(&ioapic_lock, flags);
2486 reg_01.raw = io_apic_read(ioapic, 1);
2487 spin_unlock_irqrestore(&ioapic_lock, flags);
2488
2489 return reg_01.bits.entries;
2490}
2491
2492
2493int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
2494{
2495 struct IO_APIC_route_entry entry;
2496 unsigned long flags;
2497
2498 if (!IO_APIC_IRQ(irq)) {
2499 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2500 ioapic);
2501 return -EINVAL;
2502 }
2503
2504 /*
2505 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2506 * Note that we mask (disable) IRQs now -- these get enabled when the
2507 * corresponding device driver registers for this IRQ.
2508 */
2509
2510 memset(&entry,0,sizeof(entry));
2511
2512 entry.delivery_mode = INT_DELIVERY_MODE;
2513 entry.dest_mode = INT_DEST_MODE;
2514 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2515 entry.trigger = edge_level;
2516 entry.polarity = active_high_low;
2517 entry.mask = 1;
2518
2519 /*
2520 * IRQs < 16 are already in the irq_2_pin[] map
2521 */
2522 if (irq >= 16)
2523 add_pin_to_irq(irq, ioapic, pin);
2524
2525 entry.vector = assign_irq_vector(irq);
2526
2527 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2528 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2529 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
2530 edge_level, active_high_low);
2531
2532 ioapic_register_intr(irq, entry.vector, edge_level);
2533
2534 if (!ioapic && (irq < 16))
2535 disable_8259A_irq(irq);
2536
2537 spin_lock_irqsave(&ioapic_lock, flags);
2538 io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
2539 io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
2540 spin_unlock_irqrestore(&ioapic_lock, flags);
2541
2542 return 0;
2543}
2544
2545#endif /*CONFIG_ACPI_BOOT*/
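
The routing-entry accesses in this file all follow the same idiom: take ioapic_lock, read the entry as two 32-bit I/O APIC registers (0x10 + 2*pin is the low dword, 0x11 + 2*pin the high dword), and drop the lock. The sketch below is not a function present in this file; it merely restates, as a hypothetical helper, the pattern used in print_IO_APIC(), unlock_ExtINT_logic() and ioapic_suspend() above.

/* Illustrative only: the lock-protected two-word read idiom for one RTE. */
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
	struct IO_APIC_route_entry entry;
	unsigned long flags;

	spin_lock_irqsave(&ioapic_lock, flags);
	*(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
	*(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
	spin_unlock_irqrestore(&ioapic_lock, flags);

	return entry;
}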
diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
new file mode 100644
index 000000000000..8b25160393c1
--- /dev/null
+++ b/arch/i386/kernel/ioport.c
@@ -0,0 +1,147 @@
1/*
2 * linux/arch/i386/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/types.h>
12#include <linux/ioport.h>
13#include <linux/smp.h>
14#include <linux/smp_lock.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
21{
22 unsigned long mask;
23 unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
24 unsigned int low_index = base & (BITS_PER_LONG-1);
25 int length = low_index + extent;
26
27 if (low_index != 0) {
28 mask = (~0UL << low_index);
29 if (length < BITS_PER_LONG)
30 mask &= ~(~0UL << length);
31 if (new_value)
32 *bitmap_base++ |= mask;
33 else
34 *bitmap_base++ &= ~mask;
35 length -= BITS_PER_LONG;
36 }
37
38 mask = (new_value ? ~0UL : 0UL);
39 while (length >= BITS_PER_LONG) {
40 *bitmap_base++ = mask;
41 length -= BITS_PER_LONG;
42 }
43
44 if (length > 0) {
45 mask = ~(~0UL << length);
46 if (new_value)
47 *bitmap_base++ |= mask;
48 else
49 *bitmap_base++ &= ~mask;
50 }
51}
52
53
54/*
55 * this changes the io permissions bitmap in the current task.
56 */
57asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
58{
59 unsigned long i, max_long, bytes, bytes_updated;
60 struct thread_struct * t = &current->thread;
61 struct tss_struct * tss;
62 unsigned long *bitmap;
63
64 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
65 return -EINVAL;
66 if (turn_on && !capable(CAP_SYS_RAWIO))
67 return -EPERM;
68
69 /*
70 * If it's the first ioperm() call in this thread's lifetime, set the
71 * IO bitmap up. ioperm() is much less timing critical than clone(),
72 * which is why we delay this operation until now:
73 */
74 if (!t->io_bitmap_ptr) {
75 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
76 if (!bitmap)
77 return -ENOMEM;
78
79 memset(bitmap, 0xff, IO_BITMAP_BYTES);
80 t->io_bitmap_ptr = bitmap;
81 }
82
83 /*
84 * do it in the per-thread copy and in the TSS ...
85 *
86 * Disable preemption via get_cpu() - we must not switch away
87 * because the ->io_bitmap_max value must match the bitmap
88 * contents:
89 */
90 tss = &per_cpu(init_tss, get_cpu());
91
92 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
93
94 /*
95 * Search for a (possibly new) maximum. This is simple and stupid,
96 * to keep it obviously correct:
97 */
98 max_long = 0;
99 for (i = 0; i < IO_BITMAP_LONGS; i++)
100 if (t->io_bitmap_ptr[i] != ~0UL)
101 max_long = i;
102
103 bytes = (max_long + 1) * sizeof(long);
104 bytes_updated = max(bytes, t->io_bitmap_max);
105
106 t->io_bitmap_max = bytes;
107
108 /*
109 * Sets the lazy trigger so that the next I/O operation will
110 * reload the correct bitmap.
111 */
112 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
113
114 put_cpu();
115
116 return 0;
117}
118
119/*
120 * sys_iopl has to be used when you want to access the IO ports
121 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
122 * you'd need 8kB of bitmaps/process, which is a bit excessive.
123 *
124 * Here we just change the eflags value on the stack: we allow
125 * only the super-user to do it. This depends on the stack-layout
126 * on system-call entry - see also fork() and the signal handling
127 * code.
128 */
129
130asmlinkage long sys_iopl(unsigned long unused)
131{
132 volatile struct pt_regs * regs = (struct pt_regs *) &unused;
133 unsigned int level = regs->ebx;
134 unsigned int old = (regs->eflags >> 12) & 3;
135
136 if (level > 3)
137 return -EINVAL;
138 /* Trying to gain more privileges? */
139 if (level > old) {
140 if (!capable(CAP_SYS_RAWIO))
141 return -EPERM;
142 }
143 regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
144 /* Make sure we return the long way (not sysenter) */
145 set_thread_flag(TIF_IRET);
146 return 0;
147}
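
For reference, a minimal user-space sketch of how the two syscalls above are typically used. Assumptions: root privileges, an x86 machine, glibc's <sys/io.h> wrappers, and port 0x80 chosen purely as a harmless example; build with optimization (e.g. -O2) so the inline outb() is usable.

#include <stdio.h>
#include <sys/io.h>		/* ioperm(), iopl(), outb() glibc wrappers */

int main(void)
{
	/* ports below 0x400 can be enabled individually via sys_ioperm() */
	if (ioperm(0x80, 1, 1)) {
		perror("ioperm");
		return 1;
	}
	outb(0x55, 0x80);	/* write one byte to the diagnostic port */
	ioperm(0x80, 1, 0);	/* drop the permission again */

	/* ports above 0x3ff need the all-or-nothing sys_iopl() instead */
	if (iopl(3) == 0)
		iopl(0);	/* immediately drop back to level 0 */
	return 0;
}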
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
new file mode 100644
index 000000000000..73945a3c53c4
--- /dev/null
+++ b/arch/i386/kernel/irq.c
@@ -0,0 +1,261 @@
1/*
2 * linux/arch/i386/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86-specific interrupt
7 * entry, irq-stacks and irq statistics code. All the remaining
8 * irq logic is done by the generic kernel/irq/ code and
9 * by the x86-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <asm/uaccess.h>
14#include <linux/module.h>
15#include <linux/seq_file.h>
16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h>
18
19DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
20EXPORT_PER_CPU_SYMBOL(irq_stat);
21
22#ifndef CONFIG_X86_LOCAL_APIC
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
25 * Each architecture has to answer this itself.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk("unexpected IRQ trap at vector %02x\n", irq);
30}
31#endif
32
33#ifdef CONFIG_4KSTACKS
34/*
35 * per-CPU IRQ handling contexts (thread information and stack)
36 */
37union irq_ctx {
38 struct thread_info tinfo;
39 u32 stack[THREAD_SIZE/sizeof(u32)];
40};
41
42static union irq_ctx *hardirq_ctx[NR_CPUS];
43static union irq_ctx *softirq_ctx[NR_CPUS];
44#endif
45
46/*
47 * do_IRQ handles all normal device IRQ's (the special
48 * SMP cross-CPU interrupts have their own specific
49 * handlers).
50 */
51fastcall unsigned int do_IRQ(struct pt_regs *regs)
52{
53 /* high bits used in ret_from_ code */
54 int irq = regs->orig_eax & 0xff;
55#ifdef CONFIG_4KSTACKS
56 union irq_ctx *curctx, *irqctx;
57 u32 *isp;
58#endif
59
60 irq_enter();
61#ifdef CONFIG_DEBUG_STACKOVERFLOW
62 /* Debugging check for stack overflow: is there less than 1KB free? */
63 {
64 long esp;
65
66 __asm__ __volatile__("andl %%esp,%0" :
67 "=r" (esp) : "0" (THREAD_SIZE - 1));
68 if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
69 printk("do_IRQ: stack overflow: %ld\n",
70 esp - sizeof(struct thread_info));
71 dump_stack();
72 }
73 }
74#endif
75
76#ifdef CONFIG_4KSTACKS
77
78 curctx = (union irq_ctx *) current_thread_info();
79 irqctx = hardirq_ctx[smp_processor_id()];
80
81 /*
82 * this is where we switch to the IRQ stack. However, if we are
83 * already using the IRQ stack (because we interrupted a hardirq
84 * handler) we can't do that and just have to keep using the
85 * current stack (which is the irq stack already after all)
86 */
87 if (curctx != irqctx) {
88 int arg1, arg2, ebx;
89
90 /* build the stack frame on the IRQ stack */
91 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
92 irqctx->tinfo.task = curctx->tinfo.task;
93 irqctx->tinfo.previous_esp = current_stack_pointer;
94
95 asm volatile(
96 " xchgl %%ebx,%%esp \n"
97 " call __do_IRQ \n"
98 " movl %%ebx,%%esp \n"
99 : "=a" (arg1), "=d" (arg2), "=b" (ebx)
100 : "0" (irq), "1" (regs), "2" (isp)
101 : "memory", "cc", "ecx"
102 );
103 } else
104#endif
105 __do_IRQ(irq, regs);
106
107 irq_exit();
108
109 return 1;
110}
111
112#ifdef CONFIG_4KSTACKS
113
114/*
115 * These should really be __section__(".bss.page_aligned") as well, but
116 * gcc 3.0 and earlier don't handle that correctly.
117 */
118static char softirq_stack[NR_CPUS * THREAD_SIZE]
119 __attribute__((__aligned__(THREAD_SIZE)));
120
121static char hardirq_stack[NR_CPUS * THREAD_SIZE]
122 __attribute__((__aligned__(THREAD_SIZE)));
123
124/*
125 * allocate per-cpu stacks for hardirq and for softirq processing
126 */
127void irq_ctx_init(int cpu)
128{
129 union irq_ctx *irqctx;
130
131 if (hardirq_ctx[cpu])
132 return;
133
134 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
135 irqctx->tinfo.task = NULL;
136 irqctx->tinfo.exec_domain = NULL;
137 irqctx->tinfo.cpu = cpu;
138 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
139 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
140
141 hardirq_ctx[cpu] = irqctx;
142
143 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
144 irqctx->tinfo.task = NULL;
145 irqctx->tinfo.exec_domain = NULL;
146 irqctx->tinfo.cpu = cpu;
147 irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET;
148 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
149
150 softirq_ctx[cpu] = irqctx;
151
152 printk("CPU %u irqstacks, hard=%p soft=%p\n",
153 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
154}
155
156extern asmlinkage void __do_softirq(void);
157
158asmlinkage void do_softirq(void)
159{
160 unsigned long flags;
161 struct thread_info *curctx;
162 union irq_ctx *irqctx;
163 u32 *isp;
164
165 if (in_interrupt())
166 return;
167
168 local_irq_save(flags);
169
170 if (local_softirq_pending()) {
171 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()];
173 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer;
175
176 /* build the stack frame on the softirq stack */
177 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
178
179 asm volatile(
180 " xchgl %%ebx,%%esp \n"
181 " call __do_softirq \n"
182 " movl %%ebx,%%esp \n"
183 : "=b"(isp)
184 : "0"(isp)
185 : "memory", "cc", "edx", "ecx", "eax"
186 );
187 }
188
189 local_irq_restore(flags);
190}
191
192EXPORT_SYMBOL(do_softirq);
193#endif
194
195/*
196 * Interrupt statistics:
197 */
198
199atomic_t irq_err_count;
200
201/*
202 * /proc/interrupts printing:
203 */
204
205int show_interrupts(struct seq_file *p, void *v)
206{
207 int i = *(loff_t *) v, j;
208 struct irqaction * action;
209 unsigned long flags;
210
211 if (i == 0) {
212 seq_printf(p, " ");
213 for (j=0; j<NR_CPUS; j++)
214 if (cpu_online(j))
215 seq_printf(p, "CPU%d ",j);
216 seq_putc(p, '\n');
217 }
218
219 if (i < NR_IRQS) {
220 spin_lock_irqsave(&irq_desc[i].lock, flags);
221 action = irq_desc[i].action;
222 if (!action)
223 goto skip;
224 seq_printf(p, "%3d: ",i);
225#ifndef CONFIG_SMP
226 seq_printf(p, "%10u ", kstat_irqs(i));
227#else
228 for (j = 0; j < NR_CPUS; j++)
229 if (cpu_online(j))
230 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
231#endif
232 seq_printf(p, " %14s", irq_desc[i].handler->typename);
233 seq_printf(p, " %s", action->name);
234
235 for (action=action->next; action; action = action->next)
236 seq_printf(p, ", %s", action->name);
237
238 seq_putc(p, '\n');
239skip:
240 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
241 } else if (i == NR_IRQS) {
242 seq_printf(p, "NMI: ");
243 for (j = 0; j < NR_CPUS; j++)
244 if (cpu_online(j))
245 seq_printf(p, "%10u ", nmi_count(j));
246 seq_putc(p, '\n');
247#ifdef CONFIG_X86_LOCAL_APIC
248 seq_printf(p, "LOC: ");
249 for (j = 0; j < NR_CPUS; j++)
250 if (cpu_online(j))
251 seq_printf(p, "%10u ",
252 per_cpu(irq_stat,j).apic_timer_irqs);
253 seq_putc(p, '\n');
254#endif
255 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
256#if defined(CONFIG_X86_IO_APIC)
257 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
258#endif
259 }
260 return 0;
261}
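
show_interrupts() above backs /proc/interrupts (per-IRQ and per-CPU counts, the handler typename, the action names, plus NMI/LOC/ERR/MIS summary rows). A trivial user-space sketch that dumps that table, assuming only that procfs is mounted at /proc:

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/interrupts", "r");

	if (!f) {
		perror("/proc/interrupts");
		return 1;
	}
	while (fgets(line, sizeof(line), f))	/* one row per IRQ or summary line */
		fputs(line, stdout);
	fclose(f);
	return 0;
}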
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
new file mode 100644
index 000000000000..671681659243
--- /dev/null
+++ b/arch/i386/kernel/kprobes.c
@@ -0,0 +1,385 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/i386/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 */
27
28#include <linux/config.h>
29#include <linux/kprobes.h>
30#include <linux/ptrace.h>
31#include <linux/spinlock.h>
32#include <linux/preempt.h>
33#include <asm/kdebug.h>
34#include <asm/desc.h>
35
36/* kprobe_status settings */
37#define KPROBE_HIT_ACTIVE 0x00000001
38#define KPROBE_HIT_SS 0x00000002
39
40static struct kprobe *current_kprobe;
41static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags;
42static struct pt_regs jprobe_saved_regs;
43static long *jprobe_saved_esp;
44/* copy of the kernel stack at the probe fire time */
45static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
46void jprobe_return_end(void);
47
48/*
49 * returns non-zero if opcode modifies the interrupt flag.
50 */
51static inline int is_IF_modifier(kprobe_opcode_t opcode)
52{
53 switch (opcode) {
54 case 0xfa: /* cli */
55 case 0xfb: /* sti */
56 case 0xcf: /* iret/iretd */
57 case 0x9d: /* popf/popfd */
58 return 1;
59 }
60 return 0;
61}
62
63int arch_prepare_kprobe(struct kprobe *p)
64{
65 return 0;
66}
67
68void arch_copy_kprobe(struct kprobe *p)
69{
70 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
71}
72
73void arch_remove_kprobe(struct kprobe *p)
74{
75}
76
77static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
78{
79 *p->addr = p->opcode;
80 regs->eip = (unsigned long)p->addr;
81}
82
83static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
84{
85 regs->eflags |= TF_MASK;
86 regs->eflags &= ~IF_MASK;
87 	/* single-step inline if the instruction is an int3 */
88 if (p->opcode == BREAKPOINT_INSTRUCTION)
89 regs->eip = (unsigned long)p->addr;
90 else
91 regs->eip = (unsigned long)&p->ainsn.insn;
92}
93
94/*
95 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
96 * remain disabled throughout this function.
97 */
98static int kprobe_handler(struct pt_regs *regs)
99{
100 struct kprobe *p;
101 int ret = 0;
102 kprobe_opcode_t *addr = NULL;
103 unsigned long *lp;
104
105 /* We're in an interrupt, but this is clear and BUG()-safe. */
106 preempt_disable();
107 /* Check if the application is using LDT entry for its code segment and
108 * calculate the address by reading the base address from the LDT entry.
109 */
110 if ((regs->xcs & 4) && (current->mm)) {
111 lp = (unsigned long *) ((unsigned long)((regs->xcs >> 3) * 8)
112 + (char *) current->mm->context.ldt);
113 addr = (kprobe_opcode_t *) (get_desc_base(lp) + regs->eip -
114 sizeof(kprobe_opcode_t));
115 } else {
116 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
117 }
118 /* Check we're not actually recursing */
119 if (kprobe_running()) {
120 		/* We *are* holding the lock here, so this is safe.
121 Disarm the probe we just hit, and ignore it. */
122 p = get_kprobe(addr);
123 if (p) {
124 if (kprobe_status == KPROBE_HIT_SS) {
125 regs->eflags &= ~TF_MASK;
126 regs->eflags |= kprobe_saved_eflags;
127 unlock_kprobes();
128 goto no_kprobe;
129 }
130 disarm_kprobe(p, regs);
131 ret = 1;
132 } else {
133 p = current_kprobe;
134 if (p->break_handler && p->break_handler(p, regs)) {
135 goto ss_probe;
136 }
137 }
138 		/* If it's not ours, it can't be a delete race (we hold the lock). */
139 goto no_kprobe;
140 }
141
142 lock_kprobes();
143 p = get_kprobe(addr);
144 if (!p) {
145 unlock_kprobes();
146 if (regs->eflags & VM_MASK) {
147 /* We are in virtual-8086 mode. Return 0 */
148 goto no_kprobe;
149 }
150
151 if (*addr != BREAKPOINT_INSTRUCTION) {
152 /*
153 * The breakpoint instruction was removed right
154 * after we hit it. Another cpu has removed
155 * either a probepoint or a debugger breakpoint
156 * at this address. In either case, no further
157 * handling of this interrupt is appropriate.
158 */
159 ret = 1;
160 }
161 /* Not one of ours: let kernel handle it */
162 goto no_kprobe;
163 }
164
165 kprobe_status = KPROBE_HIT_ACTIVE;
166 current_kprobe = p;
167 kprobe_saved_eflags = kprobe_old_eflags
168 = (regs->eflags & (TF_MASK | IF_MASK));
169 if (is_IF_modifier(p->opcode))
170 kprobe_saved_eflags &= ~IF_MASK;
171
172 if (p->pre_handler && p->pre_handler(p, regs))
173 /* handler has already set things up, so skip ss setup */
174 return 1;
175
176ss_probe:
177 prepare_singlestep(p, regs);
178 kprobe_status = KPROBE_HIT_SS;
179 return 1;
180
181no_kprobe:
182 preempt_enable_no_resched();
183 return ret;
184}
185
186/*
187 * Called after single-stepping. p->addr is the address of the
188 * instruction whose first byte has been replaced by the "int 3"
189 * instruction. To avoid the SMP problems that can occur when we
190 * temporarily put back the original opcode to single-step, we
191 * single-stepped a copy of the instruction. The address of this
192 * copy is p->ainsn.insn.
193 *
194 * This function prepares to return from the post-single-step
195 * interrupt. We have to fix up the stack as follows:
196 *
197 * 0) Except in the case of absolute or indirect jump or call instructions,
198 * the new eip is relative to the copied instruction. We need to make
199 * it relative to the original instruction.
200 *
201 * 1) If the single-stepped instruction was pushfl, then the TF and IF
202 * flags are set in the just-pushed eflags, and may need to be cleared.
203 *
204 * 2) If the single-stepped instruction was a call, the return address
205 * that is atop the stack is the address following the copied instruction.
206 * We need to make it the address following the original instruction.
207 */
208static void resume_execution(struct kprobe *p, struct pt_regs *regs)
209{
210 unsigned long *tos = (unsigned long *)&regs->esp;
211 unsigned long next_eip = 0;
212 unsigned long copy_eip = (unsigned long)&p->ainsn.insn;
213 unsigned long orig_eip = (unsigned long)p->addr;
214
215 switch (p->ainsn.insn[0]) {
216 case 0x9c: /* pushfl */
217 *tos &= ~(TF_MASK | IF_MASK);
218 *tos |= kprobe_old_eflags;
219 break;
220 case 0xe8: /* call relative - Fix return addr */
221 *tos = orig_eip + (*tos - copy_eip);
222 break;
223 case 0xff:
224 if ((p->ainsn.insn[1] & 0x30) == 0x10) {
225 /* call absolute, indirect */
226 /* Fix return addr; eip is correct. */
227 next_eip = regs->eip;
228 *tos = orig_eip + (*tos - copy_eip);
229 } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
230 ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
231 /* eip is correct. */
232 next_eip = regs->eip;
233 }
234 break;
235 case 0xea: /* jmp absolute -- eip is correct */
236 next_eip = regs->eip;
237 break;
238 default:
239 break;
240 }
241
242 regs->eflags &= ~TF_MASK;
243 if (next_eip) {
244 regs->eip = next_eip;
245 } else {
246 regs->eip = orig_eip + (regs->eip - copy_eip);
247 }
248}
249
250/*
251 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
252 * remain disabled throughout this function. And we hold the kprobe lock.
253 */
254static inline int post_kprobe_handler(struct pt_regs *regs)
255{
256 if (!kprobe_running())
257 return 0;
258
259 if (current_kprobe->post_handler)
260 current_kprobe->post_handler(current_kprobe, regs, 0);
261
262 resume_execution(current_kprobe, regs);
263 regs->eflags |= kprobe_saved_eflags;
264
265 unlock_kprobes();
266 preempt_enable_no_resched();
267
268 /*
269 * if somebody else is singlestepping across a probe point, eflags
270 * will have TF set, in which case, continue the remaining processing
271 * of do_debug, as if this is not a probe hit.
272 */
273 if (regs->eflags & TF_MASK)
274 return 0;
275
276 return 1;
277}
278
279/* Interrupts disabled, kprobe_lock held. */
280static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
281{
282 if (current_kprobe->fault_handler
283 && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
284 return 1;
285
286 if (kprobe_status & KPROBE_HIT_SS) {
287 resume_execution(current_kprobe, regs);
288 regs->eflags |= kprobe_old_eflags;
289
290 unlock_kprobes();
291 preempt_enable_no_resched();
292 }
293 return 0;
294}
295
296/*
297 * Wrapper routine for handling exceptions.
298 */
299int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
300 void *data)
301{
302 struct die_args *args = (struct die_args *)data;
303 switch (val) {
304 case DIE_INT3:
305 if (kprobe_handler(args->regs))
306 return NOTIFY_STOP;
307 break;
308 case DIE_DEBUG:
309 if (post_kprobe_handler(args->regs))
310 return NOTIFY_STOP;
311 break;
312 case DIE_GPF:
313 if (kprobe_running() &&
314 kprobe_fault_handler(args->regs, args->trapnr))
315 return NOTIFY_STOP;
316 break;
317 case DIE_PAGE_FAULT:
318 if (kprobe_running() &&
319 kprobe_fault_handler(args->regs, args->trapnr))
320 return NOTIFY_STOP;
321 break;
322 default:
323 break;
324 }
325 return NOTIFY_DONE;
326}
327
328int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
329{
330 struct jprobe *jp = container_of(p, struct jprobe, kp);
331 unsigned long addr;
332
333 jprobe_saved_regs = *regs;
334 jprobe_saved_esp = &regs->esp;
335 addr = (unsigned long)jprobe_saved_esp;
336
337 /*
338 * TBD: As Linus pointed out, gcc assumes that the callee
339 * owns the argument space and could overwrite it, e.g.
340 * tailcall optimization. So, to be absolutely safe
341 * we also save and restore enough stack bytes to cover
342 * the argument area.
343 */
344 memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
345 regs->eflags &= ~IF_MASK;
346 regs->eip = (unsigned long)(jp->entry);
347 return 1;
348}
349
350void jprobe_return(void)
351{
352 preempt_enable_no_resched();
353 asm volatile (" xchgl %%ebx,%%esp \n"
354 " int3 \n"
355 " .globl jprobe_return_end \n"
356 " jprobe_return_end: \n"
357 " nop \n"::"b"
358 (jprobe_saved_esp):"memory");
359}
360
361int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
362{
363 u8 *addr = (u8 *) (regs->eip - 1);
364 unsigned long stack_addr = (unsigned long)jprobe_saved_esp;
365 struct jprobe *jp = container_of(p, struct jprobe, kp);
366
367 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
368 if (&regs->esp != jprobe_saved_esp) {
369 struct pt_regs *saved_regs =
370 container_of(jprobe_saved_esp, struct pt_regs, esp);
371 printk("current esp %p does not match saved esp %p\n",
372 &regs->esp, jprobe_saved_esp);
373 printk("Saved registers for jprobe %p\n", jp);
374 show_registers(saved_regs);
375 printk("Current registers\n");
376 show_registers(regs);
377 BUG();
378 }
379 *regs = jprobe_saved_regs;
380 memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
381 MIN_STACK_SIZE(stack_addr));
382 return 1;
383 }
384 return 0;
385}
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
new file mode 100644
index 000000000000..bb50afbee921
--- /dev/null
+++ b/arch/i386/kernel/ldt.c
@@ -0,0 +1,255 @@
1/*
2 * linux/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 */
7
8#include <linux/errno.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/smp_lock.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <asm/ldt.h>
20#include <asm/desc.h>
21
22#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
23static void flush_ldt(void *null)
24{
25 if (current->active_mm)
26 load_LDT(&current->active_mm->context);
27}
28#endif
29
30static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
31{
32 void *oldldt;
33 void *newldt;
34 int oldsize;
35
36 if (mincount <= pc->size)
37 return 0;
38 oldsize = pc->size;
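	/* Round up to a multiple of 512 entries; at 8 bytes per descriptor
	 * that keeps the LDT growing in whole 4 KB pages. */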
39 mincount = (mincount+511)&(~511);
40 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
41 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
42 else
43 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
44
45 if (!newldt)
46 return -ENOMEM;
47
48 if (oldsize)
49 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
50 oldldt = pc->ldt;
51 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
52 pc->ldt = newldt;
53 wmb();
54 pc->size = mincount;
55 wmb();
56
57 if (reload) {
58#ifdef CONFIG_SMP
59 cpumask_t mask;
60 preempt_disable();
61 load_LDT(pc);
62 mask = cpumask_of_cpu(smp_processor_id());
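		/* If any other CPU is currently running with this mm, ask it to
		 * reload the (possibly reallocated) LDT as well. */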
63 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
64 smp_call_function(flush_ldt, NULL, 1, 1);
65 preempt_enable();
66#else
67 load_LDT(pc);
68#endif
69 }
70 if (oldsize) {
71 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
72 vfree(oldldt);
73 else
74 kfree(oldldt);
75 }
76 return 0;
77}
78
79static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
80{
81 int err = alloc_ldt(new, old->size, 0);
82 if (err < 0)
83 return err;
84 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
85 return 0;
86}
87
88/*
89 * we do not have to muck with descriptors here, that is
90 * done in switch_mm() as needed.
91 */
92int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
93{
94 struct mm_struct * old_mm;
95 int retval = 0;
96
97 init_MUTEX(&mm->context.sem);
98 mm->context.size = 0;
99 old_mm = current->mm;
100 if (old_mm && old_mm->context.size > 0) {
101 down(&old_mm->context.sem);
102 retval = copy_ldt(&mm->context, &old_mm->context);
103 up(&old_mm->context.sem);
104 }
105 return retval;
106}
107
108/*
109 * No need to lock the MM as we are the last user
110 */
111void destroy_context(struct mm_struct *mm)
112{
113 if (mm->context.size) {
114 if (mm == current->active_mm)
115 clear_LDT();
116 if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
117 vfree(mm->context.ldt);
118 else
119 kfree(mm->context.ldt);
120 mm->context.size = 0;
121 }
122}
123
124static int read_ldt(void __user * ptr, unsigned long bytecount)
125{
126 int err;
127 unsigned long size;
128 struct mm_struct * mm = current->mm;
129
130 if (!mm->context.size)
131 return 0;
132 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
133 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
134
135 down(&mm->context.sem);
136 size = mm->context.size*LDT_ENTRY_SIZE;
137 if (size > bytecount)
138 size = bytecount;
139
140 err = 0;
141 if (copy_to_user(ptr, mm->context.ldt, size))
142 err = -EFAULT;
143 up(&mm->context.sem);
144 if (err < 0)
145 goto error_return;
146 if (size != bytecount) {
147 /* zero-fill the rest */
148 if (clear_user(ptr+size, bytecount-size) != 0) {
149 err = -EFAULT;
150 goto error_return;
151 }
152 }
153 return bytecount;
154error_return:
155 return err;
156}
157
158static int read_default_ldt(void __user * ptr, unsigned long bytecount)
159{
160 int err;
161 unsigned long size;
162 void *address;
163
164 err = 0;
165 address = &default_ldt[0];
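	/* The default LDT is assumed to hold five descriptors, hence the
	 * 5 * sizeof(struct desc_struct) below. */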
166 size = 5*sizeof(struct desc_struct);
167 if (size > bytecount)
168 size = bytecount;
169
170 err = size;
171 if (copy_to_user(ptr, address, size))
172 err = -EFAULT;
173
174 return err;
175}
176
177static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
178{
179 struct mm_struct * mm = current->mm;
180 __u32 entry_1, entry_2, *lp;
181 int error;
182 struct user_desc ldt_info;
183
184 error = -EINVAL;
185 if (bytecount != sizeof(ldt_info))
186 goto out;
187 error = -EFAULT;
188 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
189 goto out;
190
191 error = -EINVAL;
192 if (ldt_info.entry_number >= LDT_ENTRIES)
193 goto out;
194 if (ldt_info.contents == 3) {
195 if (oldmode)
196 goto out;
197 if (ldt_info.seg_not_present == 0)
198 goto out;
199 }
200
201 down(&mm->context.sem);
202 if (ldt_info.entry_number >= mm->context.size) {
203 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
204 if (error < 0)
205 goto out_unlock;
206 }
207
208 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
209
210 /* Allow LDTs to be cleared by the user. */
211 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
212 if (oldmode || LDT_empty(&ldt_info)) {
213 entry_1 = 0;
214 entry_2 = 0;
215 goto install;
216 }
217 }
218
219 entry_1 = LDT_entry_a(&ldt_info);
220 entry_2 = LDT_entry_b(&ldt_info);
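	/* The old interface has no 'useable' (AVL, bit 20 of the second
	 * descriptor word) flag, so force it clear. */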
221 if (oldmode)
222 entry_2 &= ~(1 << 20);
223
224 /* Install the new entry ... */
225install:
226 *lp = entry_1;
227 *(lp+1) = entry_2;
228 error = 0;
229
230out_unlock:
231 up(&mm->context.sem);
232out:
233 return error;
234}
235
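/*
 * modify_ldt() function codes, as dispatched below: 0 reads the LDT,
 * 1 writes an entry in the old format, 2 reads the default LDT and
 * 0x11 writes an entry in the current format.
 */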
236asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
237{
238 int ret = -ENOSYS;
239
240 switch (func) {
241 case 0:
242 ret = read_ldt(ptr, bytecount);
243 break;
244 case 1:
245 ret = write_ldt(ptr, bytecount, 1);
246 break;
247 case 2:
248 ret = read_default_ldt(ptr, bytecount);
249 break;
250 case 0x11:
251 ret = write_ldt(ptr, bytecount, 0);
252 break;
253 }
254 return ret;
255}
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
new file mode 100644
index 000000000000..8600faeea29d
--- /dev/null
+++ b/arch/i386/kernel/mca.c
@@ -0,0 +1,474 @@
1/*
2 * linux/arch/i386/kernel/mca.c
3 * Written by Martin Kolinek, February 1996
4 *
5 * Changes:
6 *
7 * Chris Beauregard July 28th, 1996
8 * - Fixed up integrated SCSI detection
9 *
10 * Chris Beauregard August 3rd, 1996
11 * - Made mca_info local
12 * - Made integrated registers accessible through standard function calls
13 * - Added name field
14 * - More sanity checking
15 *
16 * Chris Beauregard August 9th, 1996
17 * - Rewrote /proc/mca
18 *
19 * Chris Beauregard January 7th, 1997
20 * - Added basic NMI-processing
21 * - Added more information to mca_info structure
22 *
23 * David Weinehall October 12th, 1998
24 * - Made a lot of cleaning up in the source
25 * - Added use of save_flags / restore_flags
26 * - Added the 'driver_loaded' flag in MCA_adapter
27 * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter
28 *
29 * David Weinehall March 24th, 1999
30 * - Fixed the output of 'Driver Installed' in /proc/mca/pos
31 * - Made the Integrated Video & SCSI show up even if they have id 0000
32 *
33 * Alexander Viro November 9th, 1999
34 * - Switched to regular procfs methods
35 *
36 * Alfred Arnold & David Weinehall August 23rd, 2000
37 * - Added support for Planar POS-registers
38 */
39
40#include <linux/module.h>
41#include <linux/types.h>
42#include <linux/errno.h>
43#include <linux/kernel.h>
44#include <linux/mca.h>
45#include <asm/system.h>
46#include <asm/io.h>
47#include <linux/proc_fs.h>
48#include <linux/mman.h>
49#include <linux/config.h>
50#include <linux/mm.h>
51#include <linux/pagemap.h>
52#include <linux/ioport.h>
53#include <asm/uaccess.h>
54#include <linux/init.h>
55#include <asm/arch_hooks.h>
56
57static unsigned char which_scsi = 0;
58
59int MCA_bus = 0;
60EXPORT_SYMBOL(MCA_bus);
61
62/*
63 * Motherboard register spinlock. Untested on SMP at the moment, but
64 * are there any MCA SMP boxes?
65 *
66 * Yes - Alan
67 */
68static DEFINE_SPINLOCK(mca_lock);
69
70/* Build the status info for the adapter */
71
72static void mca_configure_adapter_status(struct mca_device *mca_dev) {
73 mca_dev->status = MCA_ADAPTER_NONE;
74
75 mca_dev->pos_id = mca_dev->pos[0]
76 + (mca_dev->pos[1] << 8);
77
78 if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
79
80 /* id = 0x0000 usually indicates hardware failure,
81 * however, ZP Gu <zpg@castle.net> reports that his 9556
82 * has 0x0000 as id and everything still works. There
83 * also seems to be an adapter with id = 0x0000; the
84 * NCR Parallel Bus Memory Card. Until this is confirmed,
85 * however, this code will stay.
86 */
87
88 mca_dev->status = MCA_ADAPTER_ERROR;
89
90 return;
91 } else if(mca_dev->pos_id != 0xffff) {
92
93 /* 0xffff usually indicates that there's no adapter,
94 * however, some integrated adapters may have 0xffff as
95 * their id and still be valid. Examples are on-board
96 * VGA of the 55sx, the integrated SCSI of the 56 & 57,
97 * and possibly also the 95 ULTIMEDIA.
98 */
99
100 mca_dev->status = MCA_ADAPTER_NORMAL;
101 }
102
103 if((mca_dev->pos_id == 0xffff ||
104 mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
105 int j;
106
107 for(j = 2; j < 8; j++) {
108 if(mca_dev->pos[j] != 0xff) {
109 mca_dev->status = MCA_ADAPTER_NORMAL;
110 break;
111 }
112 }
113 }
114
115 if(!(mca_dev->pos[2] & MCA_ENABLED)) {
116
117 /* enabled bit is in POS 2 */
118
119 mca_dev->status = MCA_ADAPTER_DISABLED;
120 }
121} /* mca_configure_adapter_status */
122
123/*--------------------------------------------------------------------*/
124
125static struct resource mca_standard_resources[] = {
126 { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
127 { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
128 { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
129 { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
130 { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
131 { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
132 { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
133};
134
135#define MCA_STANDARD_RESOURCES (sizeof(mca_standard_resources)/sizeof(struct resource))
136
137/**
138 * mca_read_and_store_pos - read the POS registers into a memory buffer
139 * @pos: a char pointer to 8 bytes, contains the POS register value on
140 * successful return
141 *
142 * Returns 1 if a card actually exists (i.e. the pos isn't
143 * all 0xff) or 0 otherwise
144 */
145static int mca_read_and_store_pos(unsigned char *pos) {
146 int j;
147 int found = 0;
148
149 for(j=0; j<8; j++) {
150 if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
151 /* 0xff all across means no device. 0x00 means
152 * something's broken, but a device is
153 * probably there. However, if you get 0x00
154 * from a motherboard register it won't matter
155 * what we find. For the record, on the
156 * 57SLC, the integrated SCSI adapter has
157 * 0xffff for the adapter ID, but nonzero for
158 * other registers. */
159
160 found = 1;
161 }
162 }
163 return found;
164}
165
166static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
167{
168 unsigned char byte;
169 unsigned long flags;
170
171 if(reg < 0 || reg >= 8)
172 return 0;
173
174 spin_lock_irqsave(&mca_lock, flags);
175 if(mca_dev->pos_register) {
176 /* Disable adapter setup, enable motherboard setup */
177
178 outb_p(0, MCA_ADAPTER_SETUP_REG);
179 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
180
181 byte = inb_p(MCA_POS_REG(reg));
182 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
183 } else {
184
185 /* Make sure motherboard setup is off */
186
187 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
188
189 /* Read the appropriate register */
190
191 outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
192 byte = inb_p(MCA_POS_REG(reg));
193 outb_p(0, MCA_ADAPTER_SETUP_REG);
194 }
195 spin_unlock_irqrestore(&mca_lock, flags);
196
197 mca_dev->pos[reg] = byte;
198
199 return byte;
200}
201
202static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
203 unsigned char byte)
204{
205 unsigned long flags;
206
207 if(reg < 0 || reg >= 8)
208 return;
209
210 spin_lock_irqsave(&mca_lock, flags);
211
212 /* Make sure motherboard setup is off */
213
214 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
215
216 /* Read in the appropriate register */
217
218 outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
219 outb_p(byte, MCA_POS_REG(reg));
220 outb_p(0, MCA_ADAPTER_SETUP_REG);
221
222 spin_unlock_irqrestore(&mca_lock, flags);
223
224 /* Update the global register list, while we have the byte */
225
226 mca_dev->pos[reg] = byte;
227
228}
229
230/* for the primary MCA bus, we have identity transforms */
231static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
232{
233 return irq;
234}
235
236static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
237{
238 return port;
239}
240
241static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
242{
243 return mem;
244}
245
246
247static int __init mca_init(void)
248{
249 unsigned int i, j;
250 struct mca_device *mca_dev;
251 unsigned char pos[8];
252 short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
253 struct mca_bus *bus;
254
255 /* WARNING: Be careful when making changes here. Putting an adapter
256 * and the motherboard simultaneously into setup mode may result in
257 * damage to chips (according to The Indispensable PC Hardware Book
258 * by Hans-Peter Messmer). Also, we disable system interrupts (so
259 * that we are not disturbed in the middle of this).
260 */
261
262 /* Make sure the MCA bus is present */
263
264 if (mca_system_init()) {
265 printk(KERN_ERR "MCA bus system initialisation failed\n");
266 return -ENODEV;
267 }
268
269 if (!MCA_bus)
270 return -ENODEV;
271
272 printk(KERN_INFO "Micro Channel bus detected.\n");
273
274 /* All MCA systems have at least a primary bus */
275 bus = mca_attach_bus(MCA_PRIMARY_BUS);
276 if (!bus)
277 goto out_nomem;
278 bus->default_dma_mask = 0xffffffffLL;
279 bus->f.mca_write_pos = mca_pc_write_pos;
280 bus->f.mca_read_pos = mca_pc_read_pos;
281 bus->f.mca_transform_irq = mca_dummy_transform_irq;
282 bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284
285 /* get the motherboard device */
286 mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev))
288 goto out_nomem;
289 memset(mca_dev, 0, sizeof(struct mca_device));
290
291 /*
292 * We do not expect many MCA interrupts during initialization,
293 * but let us be safe:
294 */
295 spin_lock_irq(&mca_lock);
296
297 /* Make sure adapter setup is off */
298
299 outb_p(0, MCA_ADAPTER_SETUP_REG);
300
301 /* Read motherboard POS registers */
302
303 mca_dev->pos_register = 0x7f;
304 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
305 mca_dev->name[0] = 0;
306 mca_read_and_store_pos(mca_dev->pos);
307 mca_configure_adapter_status(mca_dev);
308 /* fake POS and slot for a motherboard */
309 mca_dev->pos_id = MCA_MOTHERBOARD_POS;
310 mca_dev->slot = MCA_MOTHERBOARD;
311 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
312
313 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
314 if(unlikely(!mca_dev))
315 goto out_unlock_nomem;
316 memset(mca_dev, 0, sizeof(struct mca_device));
317
318
319 /* Put motherboard into video setup mode, read integrated video
320 * POS registers, and turn motherboard setup off.
321 */
322
323 mca_dev->pos_register = 0xdf;
324 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
325 mca_dev->name[0] = 0;
326 mca_read_and_store_pos(mca_dev->pos);
327 mca_configure_adapter_status(mca_dev);
328 /* fake POS and slot for the integrated video */
329 mca_dev->pos_id = MCA_INTEGVIDEO_POS;
330 mca_dev->slot = MCA_INTEGVIDEO;
331 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
332
333 /* Put motherboard into scsi setup mode, read integrated scsi
334 * POS registers, and turn motherboard setup off.
335 *
336 * It seems there are two possible SCSI registers. Martin says that
337 * for the 56 and 57, 0xf7 is the one, but it fails on the 76.
338 * Alfredo (apena@vnet.ibm.com) says
339 * 0xfd works on his machine. We'll try both of them. I figure it's
340 * a good bet that only one could be valid at a time. This could
341 * screw up though if one is used for something else on the other
342 * machine.
343 */
344
345 for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
346 outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
347 if(mca_read_and_store_pos(pos))
348 break;
349 }
350 if(which_scsi) {
351 /* found a scsi card */
352 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
353 if(unlikely(!mca_dev))
354 goto out_unlock_nomem;
355 memset(mca_dev, 0, sizeof(struct mca_device));
356
357 for(j = 0; j < 8; j++)
358 mca_dev->pos[j] = pos[j];
359
360 mca_configure_adapter_status(mca_dev);
361 /* fake POS and slot for integrated SCSI controller */
362 mca_dev->pos_id = MCA_INTEGSCSI_POS;
363 mca_dev->slot = MCA_INTEGSCSI;
364 mca_dev->pos_register = which_scsi;
365 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
366 }
367
368 /* Turn off motherboard setup */
369
370 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
371
372 /* Now loop over MCA slots: put each adapter into setup mode, and
373 * read its POS registers. Then put adapter setup off.
374 */
375
376 for(i=0; i<MCA_MAX_SLOT_NR; i++) {
377 outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
378 if(!mca_read_and_store_pos(pos))
379 continue;
380
381 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
382 if(unlikely(!mca_dev))
383 goto out_unlock_nomem;
384 memset(mca_dev, 0, sizeof(struct mca_device));
385
386 for(j=0; j<8; j++)
387 mca_dev->pos[j]=pos[j];
388
389 mca_dev->driver_loaded = 0;
390 mca_dev->slot = i;
391 mca_dev->pos_register = 0;
392 mca_configure_adapter_status(mca_dev);
393 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
394 }
395 outb_p(0, MCA_ADAPTER_SETUP_REG);
396
397 /* Enable interrupts and return memory start */
398 spin_unlock_irq(&mca_lock);
399
400 for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
401 request_resource(&ioport_resource, mca_standard_resources + i);
402
403 mca_do_proc_init();
404
405 return 0;
406
407 out_unlock_nomem:
408 spin_unlock_irq(&mca_lock);
409 out_nomem:
410 printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
411 return -ENOMEM;
412}
413
414subsys_initcall(mca_init);
415
416/*--------------------------------------------------------------------*/
417
418static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
419{
420 int slot = mca_dev->slot;
421
422 if(slot == MCA_INTEGSCSI) {
423 printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
424 mca_dev->name);
425 } else if(slot == MCA_INTEGVIDEO) {
426 printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
427 mca_dev->name);
428 } else if(slot == MCA_MOTHERBOARD) {
429 printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
430 mca_dev->name);
431 }
432
433 /* More info available in POS 6 and 7? */
434
435 if(check_flag) {
436 unsigned char pos6, pos7;
437
438 pos6 = mca_device_read_pos(mca_dev, 6);
439 pos7 = mca_device_read_pos(mca_dev, 7);
440
441 printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
442 }
443
444} /* mca_handle_nmi_slot */
445
446/*--------------------------------------------------------------------*/
447
448static int mca_handle_nmi_callback(struct device *dev, void *data)
449{
450 struct mca_device *mca_dev = to_mca_device(dev);
451 unsigned char pos5;
452
453 pos5 = mca_device_read_pos(mca_dev, 5);
454
455 if(!(pos5 & 0x80)) {
456 /* Bit 7 of POS 5 is reset when this adapter has a hardware
457 * error. Bit 6 is reset if there's error information
458 * available in POS 6 and 7.
459 */
460 mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
461 return 1;
462 }
463 return 0;
464}
465
466void mca_handle_nmi(void)
467{
468 /* First try - scan the various adapters and see if a specific
469 * adapter was responsible for the error.
470 */
471 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
472
473 mca_nmi_hook();
474} /* mca_handle_nmi */
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
new file mode 100644
index 000000000000..a77c612aad00
--- /dev/null
+++ b/arch/i386/kernel/microcode.c
@@ -0,0 +1,512 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2004 Tigran Aivazian
5 *
6 * This driver allows upgrading the microcode on Intel processors
7 * belonging to the IA-32 family - Pentium Pro, Pentium II,
8 * Pentium III, Xeon, Pentium 4, etc.
9 *
10 * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
11 * Order Number 245472 or free download from:
12 *
13 * http://developer.intel.com/design/pentium4/manuals/245472.htm
14 *
15 * For more information, go to http://www.urbanmyth.org/microcode
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public License
19 * as published by the Free Software Foundation; either version
20 * 2 of the License, or (at your option) any later version.
21 *
22 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
23 * Initial release.
24 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Added read() support + cleanups.
26 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added 'device trimming' support. open(O_WRONLY) zeroes
28 * and frees the saved copy of applied microcode.
29 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
30 * Made to use devfs (/dev/cpu/microcode) + cleanups.
31 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
32 * Added misc device support (now uses both devfs and misc).
33 * Added MICROCODE_IOCFREE ioctl to clear memory.
34 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
35 * Messages for error cases (non Intel & no suitable microcode).
36 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
37 * Removed ->release(). Removed exclusive open and status bitmap.
38 * Added microcode_rwsem to serialize read()/write()/ioctl().
39 * Removed global kernel lock usage.
40 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
41 * Write 0 to 0x8B msr and then cpuid before reading revision,
42 * so that it works even if there were no update done by the
43 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
44 * to be 0 on my machine which is why it worked even when I
45 * disabled update by the BIOS)
46 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
47 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
48 * Tigran Aivazian <tigran@veritas.com>
49 * Intel Pentium 4 processor support and bugfixes.
50 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
51 * Bugfix for HT (Hyper-Threading) enabled processors
52 * whereby processor resources are shared by all logical processors
53 * in a single CPU package.
54 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
55 * Tigran Aivazian <tigran@veritas.com>,
56 * Serialize updates as required on HT processors due to speculative
57 * nature of implementation.
58 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
59 * Fix the panic when writing zero-length microcode chunk.
60 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
61 * Jun Nakajima <jun.nakajima@intel.com>
62 * Support for the microcode updates in the new format.
63 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
64 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
65 * because we no longer hold a copy of applied microcode
66 * in kernel memory.
67 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
68 * Fix sigmatch() macro to handle old CPUs with pf == 0.
69 * Thanks to Stuart Swales for pointing out this bug.
70 */
71
72//#define DEBUG /* pr_debug */
73#include <linux/kernel.h>
74#include <linux/init.h>
75#include <linux/sched.h>
76#include <linux/module.h>
77#include <linux/slab.h>
78#include <linux/vmalloc.h>
79#include <linux/miscdevice.h>
80#include <linux/spinlock.h>
81#include <linux/mm.h>
82
83#include <asm/msr.h>
84#include <asm/uaccess.h>
85#include <asm/processor.h>
86
87MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
88MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
89MODULE_LICENSE("GPL");
90
91#define MICROCODE_VERSION "1.14"
92
93#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
94#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
95#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
96#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
97#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
98#define DWSIZE (sizeof (u32))
99#define get_totalsize(mc) \
100 (((microcode_t *)mc)->hdr.totalsize ? \
101 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
102#define get_datasize(mc) \
103 (((microcode_t *)mc)->hdr.datasize ? \
104 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
105
106#define sigmatch(s1, s2, p1, p2) \
107 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
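/*
 * sigmatch() is true when the signatures are equal and either the two
 * platform-flag masks share a bit or both are zero (old CPUs report pf == 0).
 */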
108
109#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
110
111/* serialize access to the physical write to MSR 0x79 */
112static DEFINE_SPINLOCK(microcode_update_lock);
113
114/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
115static DECLARE_MUTEX(microcode_sem);
116
117static void __user *user_buffer; /* user area microcode data buffer */
118static unsigned int user_buffer_size; /* its size */
119
120typedef enum mc_error_code {
121 MC_SUCCESS = 0,
122 MC_NOTFOUND = 1,
123 MC_MARKED = 2,
124 MC_ALLOCATED = 3,
125} mc_error_code_t;
126
127static struct ucode_cpu_info {
128 unsigned int sig;
129 unsigned int pf;
130 unsigned int rev;
131 unsigned int cksum;
132 mc_error_code_t err;
133 microcode_t *mc;
134} ucode_cpu_info[NR_CPUS];
135
136static int microcode_open (struct inode *unused1, struct file *unused2)
137{
138 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
139}
140
141static void collect_cpu_info (void *unused)
142{
143 int cpu_num = smp_processor_id();
144 struct cpuinfo_x86 *c = cpu_data + cpu_num;
145 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
146 unsigned int val[2];
147
148 uci->sig = uci->pf = uci->rev = uci->cksum = 0;
149 uci->err = MC_NOTFOUND;
150 uci->mc = NULL;
151
152 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
153 cpu_has(c, X86_FEATURE_IA64)) {
154 printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num);
155 return;
156 } else {
157 uci->sig = cpuid_eax(0x00000001);
158
159 if ((c->x86_model >= 5) || (c->x86 > 6)) {
160 /* get processor flags from MSR 0x17 */
161 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
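			/* Bits 52:50 of IA32_PLATFORM_ID (bits 20:18 of the high word)
			 * identify the platform; turn them into a one-hot mask for
			 * sigmatch(). */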
162 uci->pf = 1 << ((val[1] >> 18) & 7);
163 }
164 }
165
166 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
167 __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
168 /* get the current revision from MSR 0x8B */
169 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
170 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
171 uci->sig, uci->pf, uci->rev);
172}
173
174static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum)
175{
176 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
177
178 pr_debug("Microcode Found.\n");
179 pr_debug(" Header Revision 0x%x\n", mc_header->hdrver);
180 pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver);
181 pr_debug(" Revision 0x%x \n", mc_header->rev);
182 pr_debug(" Date %x/%x/%x\n",
183 ((mc_header->date >> 24 ) & 0xff),
184 ((mc_header->date >> 16 ) & 0xff),
185 (mc_header->date & 0xFFFF));
186 pr_debug(" Signature 0x%x\n", sig);
187 pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n",
188 ((sig >> 12) & 0x3),
189 ((sig >> 8) & 0xf),
190 ((sig >> 4) & 0xf),
191 ((sig & 0xf)));
192 pr_debug(" Processor Flags 0x%x\n", pf);
193 pr_debug(" Checksum 0x%x\n", cksum);
194
195 if (mc_header->rev < uci->rev) {
196 printk(KERN_ERR "microcode: CPU%d not 'upgrading' to earlier revision"
197 " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev);
198 goto out;
199 } else if (mc_header->rev == uci->rev) {
200 /* notify the caller of success on this cpu */
201 uci->err = MC_SUCCESS;
202 printk(KERN_ERR "microcode: CPU%d already at revision"
203 " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev);
204 goto out;
205 }
206
207 pr_debug("microcode: CPU%d found a matching microcode update with "
208 " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev);
209 uci->cksum = cksum;
210 uci->pf = pf; /* keep the original mc pf for cksum calculation */
211 uci->err = MC_MARKED; /* found the match */
212out:
213 return;
214}
215
216static int find_matching_ucodes (void)
217{
218 int cursor = 0;
219 int error = 0;
220
221 while (cursor + MC_HEADER_SIZE < user_buffer_size) {
222 microcode_header_t mc_header;
223 void *newmc = NULL;
224 int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size;
225
226 if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) {
227 printk(KERN_ERR "microcode: error! Can not read user data\n");
228 error = -EFAULT;
229 goto out;
230 }
231
232 total_size = get_totalsize(&mc_header);
233 if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) {
234 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
235 error = -EINVAL;
236 goto out;
237 }
238
239 data_size = get_datasize(&mc_header);
240 if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) {
241 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
242 error = -EINVAL;
243 goto out;
244 }
245
246 if (mc_header.ldrver != 1 || mc_header.hdrver != 1) {
247 printk(KERN_ERR "microcode: error! Unknown microcode update format\n");
248 error = -EINVAL;
249 goto out;
250 }
251
252 for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) {
253 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
254			if (uci->err != MC_NOTFOUND) /* already found a match or not an online cpu */
255 continue;
256
257 if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->pf))
258 mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum);
259 }
260
261 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
262 if (ext_table_size) {
263 struct extended_sigtable ext_header;
264 struct extended_signature ext_sig;
265 int ext_sigcount;
266
267 if ((ext_table_size < EXT_HEADER_SIZE)
268 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
269 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
270 error = -EINVAL;
271 goto out;
272 }
273 if (copy_from_user(&ext_header, user_buffer + cursor
274 + MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) {
275 printk(KERN_ERR "microcode: error! Can not read user data\n");
276 error = -EFAULT;
277 goto out;
278 }
279 if (ext_table_size != exttable_size(&ext_header)) {
280 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
281 error = -EFAULT;
282 goto out;
283 }
284
285 ext_sigcount = ext_header.count;
286
287 for (i = 0; i < ext_sigcount; i++) {
288 if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE
289 + EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) {
290 printk(KERN_ERR "microcode: error! Can not read user data\n");
291 error = -EFAULT;
292 goto out;
293 }
294 for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) {
295 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
296				if (uci->err != MC_NOTFOUND) /* already found a match or not an online cpu */
297 continue;
298 if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->pf)) {
299 mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum);
300 }
301 }
302 }
303 }
304 /* now check if any cpu has matched */
305 for (cpu_num = 0, allocated_flag = 0, sum = 0; cpu_num < num_online_cpus(); cpu_num++) {
306 if (ucode_cpu_info[cpu_num].err == MC_MARKED) {
307 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
308 if (!allocated_flag) {
309 allocated_flag = 1;
310 newmc = vmalloc(total_size);
311 if (!newmc) {
312 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
313 error = -ENOMEM;
314 goto out;
315 }
316 if (copy_from_user(newmc + MC_HEADER_SIZE,
317 user_buffer + cursor + MC_HEADER_SIZE,
318 total_size - MC_HEADER_SIZE)) {
319 printk(KERN_ERR "microcode: error! Can not read user data\n");
320 vfree(newmc);
321 error = -EFAULT;
322 goto out;
323 }
324 memcpy(newmc, &mc_header, MC_HEADER_SIZE);
325 /* check extended table checksum */
326 if (ext_table_size) {
327 int ext_table_sum = 0;
328 int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size);
329 i = ext_table_size / DWSIZE;
330 while (i--) ext_table_sum += ext_tablep[i];
331 if (ext_table_sum) {
332 printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n");
333 vfree(newmc);
334 error = -EINVAL;
335 goto out;
336 }
337 }
338
339 /* calculate the checksum */
340 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
341 while (i--) sum += ((int *)newmc)[i];
342 sum -= (mc_header.sig + mc_header.pf + mc_header.cksum);
343 }
344 ucode_cpu_info[cpu_num].mc = newmc;
345 ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */
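				/* The file checksum above excludes the header's own sig/pf/cksum
				 * words, so adding this CPU's matching triplet (possibly taken
				 * from an extended signature) must bring the total back to zero. */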
346 if (sum + uci->sig + uci->pf + uci->cksum != 0) {
347 printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num);
348 error = -EINVAL;
349 goto out;
350 }
351 }
352 }
353 cursor += total_size; /* goto the next update patch */
354 } /* end of while */
355out:
356 return error;
357}
358
359static void do_update_one (void * unused)
360{
361 unsigned long flags;
362 unsigned int val[2];
363 int cpu_num = smp_processor_id();
364 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
365
366 if (uci->mc == NULL) {
367 printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num);
368 return;
369 }
370
371 /* serialize access to the physical write to MSR 0x79 */
372 spin_lock_irqsave(&microcode_update_lock, flags);
373
374 /* write microcode via MSR 0x79 */
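	/* Split the buffer address into the low/high MSR halves; the double
	 * ">> 16 >> 16" yields 0 on 32-bit without an undefined shift by 32. */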
375 wrmsr(MSR_IA32_UCODE_WRITE,
376 (unsigned long) uci->mc->bits,
377 (unsigned long) uci->mc->bits >> 16 >> 16);
378 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
379
380 __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
381 /* get the current revision from MSR 0x8B */
382 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
383
384 /* notify the caller of success on this cpu */
385 uci->err = MC_SUCCESS;
386 spin_unlock_irqrestore(&microcode_update_lock, flags);
387 printk(KERN_INFO "microcode: CPU%d updated from revision "
388 "0x%x to 0x%x, date = %08x \n",
389 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
390 return;
391}
392
393static int do_microcode_update (void)
394{
395 int i, error;
396
397 if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) {
398 printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
399 error = -EIO;
400 goto out;
401 }
402
403 if ((error = find_matching_ucodes())) {
404 printk(KERN_ERR "microcode: Error in the microcode data\n");
405 goto out_free;
406 }
407
408 if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) {
409 printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
410 error = -EIO;
411 }
412
413out_free:
414 for (i = 0; i < num_online_cpus(); i++) {
415 if (ucode_cpu_info[i].mc) {
416 int j;
417 void *tmp = ucode_cpu_info[i].mc;
418 vfree(tmp);
419 for (j = i; j < num_online_cpus(); j++) {
420 if (ucode_cpu_info[j].mc == tmp)
421 ucode_cpu_info[j].mc = NULL;
422 }
423 }
424 }
425out:
426 return error;
427}
428
429static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
430{
431 ssize_t ret;
432
433 if (len < DEFAULT_UCODE_TOTALSIZE) {
434 printk(KERN_ERR "microcode: not enough data\n");
435 return -EINVAL;
436 }
437
438 if ((len >> PAGE_SHIFT) > num_physpages) {
439 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
440 return -EINVAL;
441 }
442
443 down(&microcode_sem);
444
445 user_buffer = (void __user *) buf;
446 user_buffer_size = (int) len;
447
448 ret = do_microcode_update();
449 if (!ret)
450 ret = (ssize_t)len;
451
452 up(&microcode_sem);
453
454 return ret;
455}
456
457static int microcode_ioctl (struct inode *inode, struct file *file,
458 unsigned int cmd, unsigned long arg)
459{
460 switch (cmd) {
461 /*
462 * XXX: will be removed after microcode_ctl
463 * is updated to ignore failure of this ioctl()
464 */
465 case MICROCODE_IOCFREE:
466 return 0;
467 default:
468 return -EINVAL;
469 }
470 return -EINVAL;
471}
472
473static struct file_operations microcode_fops = {
474 .owner = THIS_MODULE,
475 .write = microcode_write,
476 .ioctl = microcode_ioctl,
477 .open = microcode_open,
478};
479
480static struct miscdevice microcode_dev = {
481 .minor = MICROCODE_MINOR,
482 .name = "microcode",
483 .devfs_name = "cpu/microcode",
484 .fops = &microcode_fops,
485};
486
487static int __init microcode_init (void)
488{
489 int error;
490
491 error = misc_register(&microcode_dev);
492 if (error) {
493 printk(KERN_ERR
494 "microcode: can't misc_register on minor=%d\n",
495 MICROCODE_MINOR);
496 return error;
497 }
498
499 printk(KERN_INFO
500 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
501 return 0;
502}
503
504static void __exit microcode_exit (void)
505{
506 misc_deregister(&microcode_dev);
507 printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n");
508}
509
510module_init(microcode_init)
511module_exit(microcode_exit)
512MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
new file mode 100644
index 000000000000..5149c8a621f0
--- /dev/null
+++ b/arch/i386/kernel/module.c
@@ -0,0 +1,129 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24
25#if 0
26#define DEBUGP printk
27#else
28#define DEBUGP(fmt...)
29#endif
30
31void *module_alloc(unsigned long size)
32{
33 if (size == 0)
34 return NULL;
35 return vmalloc_exec(size);
36}
37
38
39/* Free memory returned from module_alloc */
40void module_free(struct module *mod, void *module_region)
41{
42 vfree(module_region);
43 /* FIXME: If module_region == mod->init_region, trim exception
44 table entries. */
45}
46
47/* We don't need anything special. */
48int module_frob_arch_sections(Elf_Ehdr *hdr,
49 Elf_Shdr *sechdrs,
50 char *secstrings,
51 struct module *mod)
52{
53 return 0;
54}
55
56int apply_relocate(Elf32_Shdr *sechdrs,
57 const char *strtab,
58 unsigned int symindex,
59 unsigned int relsec,
60 struct module *me)
61{
62 unsigned int i;
63 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
64 Elf32_Sym *sym;
65 uint32_t *location;
66
67 DEBUGP("Applying relocate section %u to %u\n", relsec,
68 sechdrs[relsec].sh_info);
69 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
70 /* This is where to make the change */
71 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
72 + rel[i].r_offset;
73 /* This is the symbol it is referring to. Note that all
74 undefined symbols have been resolved. */
75 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
76 + ELF32_R_SYM(rel[i].r_info);
77
78 switch (ELF32_R_TYPE(rel[i].r_info)) {
79 case R_386_32:
80 /* We add the value into the location given */
81 *location += sym->st_value;
82 break;
83 case R_386_PC32:
84			/* Add the value, subtract its position */
85 *location += sym->st_value - (uint32_t)location;
86 break;
87 default:
88 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
89 me->name, ELF32_R_TYPE(rel[i].r_info));
90 return -ENOEXEC;
91 }
92 }
93 return 0;
94}
95
96int apply_relocate_add(Elf32_Shdr *sechdrs,
97 const char *strtab,
98 unsigned int symindex,
99 unsigned int relsec,
100 struct module *me)
101{
102 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
103 me->name);
104 return -ENOEXEC;
105}
106
107extern void apply_alternatives(void *start, void *end);
108
109int module_finalize(const Elf_Ehdr *hdr,
110 const Elf_Shdr *sechdrs,
111 struct module *me)
112{
113 const Elf_Shdr *s;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 /* look for .altinstructions to patch */
117 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
118 void *seg;
119 if (strcmp(".altinstructions", secstrings + s->sh_name))
120 continue;
121 seg = (void *)s->sh_addr;
122 apply_alternatives(seg, seg + s->sh_size);
123 }
124 return 0;
125}
126
127void module_arch_cleanup(struct module *mod)
128{
129}
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
new file mode 100644
index 000000000000..1347ab4939e7
--- /dev/null
+++ b/arch/i386/kernel/mpparse.c
@@ -0,0 +1,1109 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/irq.h>
18#include <linux/init.h>
19#include <linux/acpi.h>
20#include <linux/delay.h>
21#include <linux/config.h>
22#include <linux/bootmem.h>
23#include <linux/smp_lock.h>
24#include <linux/kernel_stat.h>
25#include <linux/mc146818rtc.h>
26#include <linux/bitops.h>
27
28#include <asm/smp.h>
29#include <asm/acpi.h>
30#include <asm/mtrr.h>
31#include <asm/mpspec.h>
32#include <asm/io_apic.h>
33
34#include <mach_apic.h>
35#include <mach_mpparse.h>
36#include <bios_ebda.h>
37
38/* Have we found an MP table */
39int smp_found_config;
40unsigned int __initdata maxcpus = NR_CPUS;
41
42/*
43 * Various Linux-internal data structures created from the
44 * MP-table.
45 */
46int apic_version [MAX_APICS];
47int mp_bus_id_to_type [MAX_MP_BUSSES];
48int mp_bus_id_to_node [MAX_MP_BUSSES];
49int mp_bus_id_to_local [MAX_MP_BUSSES];
50int quad_local_to_mp_bus_id [NR_CPUS/4][4];
51int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
52static int mp_current_pci_id;
53
54/* I/O APIC entries */
55struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
56
57/* # of MP IRQ source entries */
58struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
59
60/* MP IRQ source entries */
61int mp_irq_entries;
62
63int nr_ioapics;
64
65int pic_mode;
66unsigned long mp_lapic_addr;
67
68/* Processor that is doing the boot up */
69unsigned int boot_cpu_physical_apicid = -1U;
70unsigned int boot_cpu_logical_apicid = -1U;
71/* Internal processor count */
72static unsigned int __initdata num_processors;
73
74/* Bitmask of physically existing CPUs */
75physid_mask_t phys_cpu_present_map;
76
77u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
78
79/*
80 * Intel MP BIOS table parsing routines:
81 */
82
83
84/*
85 * Checksum an MP configuration block.
86 */
87
88static int __init mpf_checksum(unsigned char *mp, int len)
89{
90 int sum = 0;
91
92 while (len--)
93 sum += *mp++;
94
95 return sum & 0xFF;
96}
97
98/*
99 * Have to match translation table entries to main table entries by counter
100 * hence the mpc_record variable .... can't see a less disgusting way of
101 * doing this ....
102 */
103
104static int mpc_record;
105static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
106
107#ifdef CONFIG_X86_NUMAQ
108static int MP_valid_apicid(int apicid, int version)
109{
110 return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
111}
112#else
113static int MP_valid_apicid(int apicid, int version)
114{
115 if (version >= 0x14)
116 return apicid < 0xff;
117 else
118 return apicid < 0xf;
119}
120#endif
121
122static void __init MP_processor_info (struct mpc_config_processor *m)
123{
124 int ver, apicid;
125 physid_mask_t tmp;
126
127 if (!(m->mpc_cpuflag & CPU_ENABLED))
128 return;
129
130 apicid = mpc_apic_id(m, translation_table[mpc_record]);
131
132 if (m->mpc_featureflag&(1<<0))
133 Dprintk(" Floating point unit present.\n");
134 if (m->mpc_featureflag&(1<<7))
135 Dprintk(" Machine Exception supported.\n");
136 if (m->mpc_featureflag&(1<<8))
137 Dprintk(" 64 bit compare & exchange supported.\n");
138 if (m->mpc_featureflag&(1<<9))
139 Dprintk(" Internal APIC present.\n");
140 if (m->mpc_featureflag&(1<<11))
141 Dprintk(" SEP present.\n");
142 if (m->mpc_featureflag&(1<<12))
143 Dprintk(" MTRR present.\n");
144 if (m->mpc_featureflag&(1<<13))
145 Dprintk(" PGE present.\n");
146 if (m->mpc_featureflag&(1<<14))
147 Dprintk(" MCA present.\n");
148 if (m->mpc_featureflag&(1<<15))
149 Dprintk(" CMOV present.\n");
150 if (m->mpc_featureflag&(1<<16))
151 Dprintk(" PAT present.\n");
152 if (m->mpc_featureflag&(1<<17))
153 Dprintk(" PSE present.\n");
154 if (m->mpc_featureflag&(1<<18))
155 Dprintk(" PSN present.\n");
156 if (m->mpc_featureflag&(1<<19))
157 Dprintk(" Cache Line Flush Instruction present.\n");
158 /* 20 Reserved */
159 if (m->mpc_featureflag&(1<<21))
160 Dprintk(" Debug Trace and EMON Store present.\n");
161 if (m->mpc_featureflag&(1<<22))
162 Dprintk(" ACPI Thermal Throttle Registers present.\n");
163 if (m->mpc_featureflag&(1<<23))
164 Dprintk(" MMX present.\n");
165 if (m->mpc_featureflag&(1<<24))
166 Dprintk(" FXSR present.\n");
167 if (m->mpc_featureflag&(1<<25))
168 Dprintk(" XMM present.\n");
169 if (m->mpc_featureflag&(1<<26))
170 Dprintk(" Willamette New Instructions present.\n");
171 if (m->mpc_featureflag&(1<<27))
172 Dprintk(" Self Snoop present.\n");
173 if (m->mpc_featureflag&(1<<28))
174 Dprintk(" HT present.\n");
175 if (m->mpc_featureflag&(1<<29))
176 Dprintk(" Thermal Monitor present.\n");
177 /* 30, 31 Reserved */
178
179
180 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
181 Dprintk(" Bootup CPU\n");
182 boot_cpu_physical_apicid = m->mpc_apicid;
183 boot_cpu_logical_apicid = apicid;
184 }
185
186 if (num_processors >= NR_CPUS) {
187 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
188 " Processor ignored.\n", NR_CPUS);
189 return;
190 }
191
192 if (num_processors >= maxcpus) {
193 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
194 " Processor ignored.\n", maxcpus);
195 return;
196 }
197 num_processors++;
198 ver = m->mpc_apicver;
199
200 if (!MP_valid_apicid(apicid, ver)) {
201 printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n",
202 m->mpc_apicid, MAX_APICS);
203 --num_processors;
204 return;
205 }
206
207 tmp = apicid_to_cpu_present(apicid);
208 physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp);
209
210 /*
211 * Validate version
212 */
213 if (ver == 0x0) {
214 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
215 ver = 0x10;
216 }
217 apic_version[m->mpc_apicid] = ver;
218 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
219}
220
221static void __init MP_bus_info (struct mpc_config_bus *m)
222{
223 char str[7];
224
225 memcpy(str, m->mpc_bustype, 6);
226 str[6] = 0;
227
228 mpc_oem_bus_info(m, str, translation_table[mpc_record]);
229
230 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
231 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
232 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
233 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
234 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
235 mpc_oem_pci_bus(m, translation_table[mpc_record]);
236 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
237 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
238 mp_current_pci_id++;
239 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
240 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
241 } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
242 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
243 } else {
244 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
245 }
246}
247
248static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
249{
250 if (!(m->mpc_flags & MPC_APIC_USABLE))
251 return;
252
253 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
254 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
255 if (nr_ioapics >= MAX_IO_APICS) {
256 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
257 MAX_IO_APICS, nr_ioapics);
258		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
259 }
260 if (!m->mpc_apicaddr) {
261 printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
262 " found in MP table, skipping!\n");
263 return;
264 }
265 mp_ioapics[nr_ioapics] = *m;
266 nr_ioapics++;
267}
268
269static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
270{
271 mp_irqs [mp_irq_entries] = *m;
272 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
273 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
274 m->mpc_irqtype, m->mpc_irqflag & 3,
275 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
276 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
277 if (++mp_irq_entries == MAX_IRQ_SOURCES)
278 panic("Max # of irq sources exceeded!!\n");
279}
280
281static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
282{
283 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
284 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
285 m->mpc_irqtype, m->mpc_irqflag & 3,
286 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
287 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
288 /*
289 * Well it seems all SMP boards in existence
290 * use ExtINT/LVT1 == LINT0 and
291 * NMI/LVT2 == LINT1 - the following check
292 * will show us if this assumption is false.
293 * Until then we do not have to add baggage.
294 */
295 if ((m->mpc_irqtype == mp_ExtINT) &&
296 (m->mpc_destapiclint != 0))
297 BUG();
298 if ((m->mpc_irqtype == mp_NMI) &&
299 (m->mpc_destapiclint != 1))
300 BUG();
301}
302
303#ifdef CONFIG_X86_NUMAQ
304static void __init MP_translation_info (struct mpc_config_translation *m)
305{
306 printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
307
308 if (mpc_record >= MAX_MPC_ENTRY)
309 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
310 else
311 translation_table[mpc_record] = m; /* stash this for later */
312 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
313 node_set_online(m->trans_quad);
314}
315
316/*
317 * Read/parse the MPC oem tables
318 */
319
320static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
321 unsigned short oemsize)
322{
323 int count = sizeof (*oemtable); /* the header size */
324 unsigned char *oemptr = ((unsigned char *)oemtable)+count;
325
326 mpc_record = 0;
327 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
328 if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
329 {
330 printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
331 oemtable->oem_signature[0],
332 oemtable->oem_signature[1],
333 oemtable->oem_signature[2],
334 oemtable->oem_signature[3]);
335 return;
336 }
337 if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
338 {
339 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
340 return;
341 }
342 while (count < oemtable->oem_length) {
343 switch (*oemptr) {
344 case MP_TRANSLATION:
345 {
346 struct mpc_config_translation *m=
347 (struct mpc_config_translation *)oemptr;
348 MP_translation_info(m);
349 oemptr += sizeof(*m);
350 count += sizeof(*m);
351 ++mpc_record;
352 break;
353 }
354 default:
355 {
356 printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
357 return;
358 }
359 }
360 }
361}
362
363static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
364 char *productid)
365{
366 if (strncmp(oem, "IBM NUMA", 8))
367 printk("Warning! May not be a NUMA-Q system!\n");
368 if (mpc->mpc_oemptr)
369 smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
370 mpc->mpc_oemsize);
371}
372#endif /* CONFIG_X86_NUMAQ */
373
374/*
375 * Read/parse the MPC
376 */
377
378static int __init smp_read_mpc(struct mp_config_table *mpc)
379{
380 char str[16];
381 char oem[10];
382 int count=sizeof(*mpc);
383 unsigned char *mpt=((unsigned char *)mpc)+count;
384
385 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
386 printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
387 *(u32 *)mpc->mpc_signature);
388 return 0;
389 }
390 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
391 printk(KERN_ERR "SMP mptable: checksum error!\n");
392 return 0;
393 }
394 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
395 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
396 mpc->mpc_spec);
397 return 0;
398 }
399 if (!mpc->mpc_lapic) {
400 printk(KERN_ERR "SMP mptable: null local APIC address!\n");
401 return 0;
402 }
403 memcpy(oem,mpc->mpc_oem,8);
404 oem[8]=0;
405 printk(KERN_INFO "OEM ID: %s ",oem);
406
407 memcpy(str,mpc->mpc_productid,12);
408 str[12]=0;
409 printk("Product ID: %s ",str);
410
411 mps_oem_check(mpc, oem, str);
412
413 printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
414
415 /*
416 * Save the local APIC address (it might be non-default) -- but only
417 * if we're not using ACPI.
418 */
419 if (!acpi_lapic)
420 mp_lapic_addr = mpc->mpc_lapic;
421
422 /*
423 * Now process the configuration blocks.
424 */
425 mpc_record = 0;
426 while (count < mpc->mpc_length) {
427 switch(*mpt) {
428 case MP_PROCESSOR:
429 {
430 struct mpc_config_processor *m=
431 (struct mpc_config_processor *)mpt;
432 /* ACPI may have already provided this data */
433 if (!acpi_lapic)
434 MP_processor_info(m);
435 mpt += sizeof(*m);
436 count += sizeof(*m);
437 break;
438 }
439 case MP_BUS:
440 {
441 struct mpc_config_bus *m=
442 (struct mpc_config_bus *)mpt;
443 MP_bus_info(m);
444 mpt += sizeof(*m);
445 count += sizeof(*m);
446 break;
447 }
448 case MP_IOAPIC:
449 {
450 struct mpc_config_ioapic *m=
451 (struct mpc_config_ioapic *)mpt;
452 MP_ioapic_info(m);
453 mpt+=sizeof(*m);
454 count+=sizeof(*m);
455 break;
456 }
457 case MP_INTSRC:
458 {
459 struct mpc_config_intsrc *m=
460 (struct mpc_config_intsrc *)mpt;
461
462 MP_intsrc_info(m);
463 mpt+=sizeof(*m);
464 count+=sizeof(*m);
465 break;
466 }
467 case MP_LINTSRC:
468 {
469 struct mpc_config_lintsrc *m=
470 (struct mpc_config_lintsrc *)mpt;
471 MP_lintsrc_info(m);
472 mpt+=sizeof(*m);
473 count+=sizeof(*m);
474 break;
475 }
476 default:
477 {
478 count = mpc->mpc_length;
479 break;
480 }
481 }
482 ++mpc_record;
483 }
484 clustered_apic_check();
485 if (!num_processors)
486 printk(KERN_ERR "SMP mptable: no processors registered!\n");
487 return num_processors;
488}
489
490static int __init ELCR_trigger(unsigned int irq)
491{
492 unsigned int port;
493
494 port = 0x4d0 + (irq >> 3);
495 return (inb(port) >> (irq & 7)) & 1;
496}
497
498static void __init construct_default_ioirq_mptable(int mpc_default_type)
499{
500 struct mpc_config_intsrc intsrc;
501 int i;
502 int ELCR_fallback = 0;
503
504 intsrc.mpc_type = MP_INTSRC;
505 intsrc.mpc_irqflag = 0; /* conforming */
506 intsrc.mpc_srcbus = 0;
507 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
508
509 intsrc.mpc_irqtype = mp_INT;
510
511 /*
512 * If true, we have an ISA/PCI system with no IRQ entries
513 * in the MP table. To prevent the PCI interrupts from being set up
514 * incorrectly, we try to use the ELCR. The sanity check to see if
515 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
516 * never be level sensitive, so we simply see if the ELCR agrees.
517 * If it does, we assume it's valid.
518 */
519 if (mpc_default_type == 5) {
520 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
521
522 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
523 printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
524 else {
525 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
526 ELCR_fallback = 1;
527 }
528 }
529
530 for (i = 0; i < 16; i++) {
531 switch (mpc_default_type) {
532 case 2:
533 if (i == 0 || i == 13)
534 continue; /* IRQ0 & IRQ13 not connected */
535 /* fall through */
536 default:
537 if (i == 2)
538 continue; /* IRQ2 is never connected */
539 }
540
541 if (ELCR_fallback) {
542 /*
543 * If the ELCR indicates a level-sensitive interrupt, we
544 * copy that information over to the MP table in the
545 * irqflag field (level sensitive, active high polarity).
546 */
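/*
 * Per the MP spec, mpc_irqflag packs polarity in bits 0-1 and
 * trigger mode in bits 2-3, so 13 (binary 1101) means
 * trigger = 3 (level) and polarity = 1 (active high).
 */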
547 if (ELCR_trigger(i))
548 intsrc.mpc_irqflag = 13;
549 else
550 intsrc.mpc_irqflag = 0;
551 }
552
553 intsrc.mpc_srcbusirq = i;
554 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
555 MP_intsrc_info(&intsrc);
556 }
557
558 intsrc.mpc_irqtype = mp_ExtINT;
559 intsrc.mpc_srcbusirq = 0;
560 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
561 MP_intsrc_info(&intsrc);
562}
563
564static inline void __init construct_default_ISA_mptable(int mpc_default_type)
565{
566 struct mpc_config_processor processor;
567 struct mpc_config_bus bus;
568 struct mpc_config_ioapic ioapic;
569 struct mpc_config_lintsrc lintsrc;
570 int linttypes[2] = { mp_ExtINT, mp_NMI };
571 int i;
572
573 /*
574 * local APIC has default address
575 */
576 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
577
578 /*
579 * 2 CPUs, numbered 0 & 1.
580 */
581 processor.mpc_type = MP_PROCESSOR;
582 /* Either an integrated APIC or a discrete 82489DX. */
583 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
584 processor.mpc_cpuflag = CPU_ENABLED;
585 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
586 (boot_cpu_data.x86_model << 4) |
587 boot_cpu_data.x86_mask;
588 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
589 processor.mpc_reserved[0] = 0;
590 processor.mpc_reserved[1] = 0;
591 for (i = 0; i < 2; i++) {
592 processor.mpc_apicid = i;
593 MP_processor_info(&processor);
594 }
595
596 bus.mpc_type = MP_BUS;
597 bus.mpc_busid = 0;
598 switch (mpc_default_type) {
599 default:
600 printk("???\n");
601 printk(KERN_ERR "Unknown standard configuration %d\n",
602 mpc_default_type);
603 /* fall through */
604 case 1:
605 case 5:
606 memcpy(bus.mpc_bustype, "ISA ", 6);
607 break;
608 case 2:
609 case 6:
610 case 3:
611 memcpy(bus.mpc_bustype, "EISA ", 6);
612 break;
613 case 4:
614 case 7:
615 memcpy(bus.mpc_bustype, "MCA ", 6);
616 }
617 MP_bus_info(&bus);
618 if (mpc_default_type > 4) {
619 bus.mpc_busid = 1;
620 memcpy(bus.mpc_bustype, "PCI ", 6);
621 MP_bus_info(&bus);
622 }
623
624 ioapic.mpc_type = MP_IOAPIC;
625 ioapic.mpc_apicid = 2;
626 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
627 ioapic.mpc_flags = MPC_APIC_USABLE;
628 ioapic.mpc_apicaddr = 0xFEC00000;
629 MP_ioapic_info(&ioapic);
630
631 /*
632 * We set up most of the low 16 IO-APIC pins according to MPS rules.
633 */
634 construct_default_ioirq_mptable(mpc_default_type);
635
636 lintsrc.mpc_type = MP_LINTSRC;
637 lintsrc.mpc_irqflag = 0; /* conforming */
638 lintsrc.mpc_srcbusid = 0;
639 lintsrc.mpc_srcbusirq = 0;
640 lintsrc.mpc_destapic = MP_APIC_ALL;
641 for (i = 0; i < 2; i++) {
642 lintsrc.mpc_irqtype = linttypes[i];
643 lintsrc.mpc_destapiclint = i;
644 MP_lintsrc_info(&lintsrc);
645 }
646}
647
648static struct intel_mp_floating *mpf_found;
649
650/*
651 * Scan the memory blocks for an SMP configuration block.
652 */
653void __init get_smp_config (void)
654{
655 struct intel_mp_floating *mpf = mpf_found;
656
657 /*
658 * ACPI may be used to obtain the entire SMP configuration or just to
659 * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that
660 * ACPI supports both logical (e.g. Hyper-Threading) and physical
661 * processors, whereas MPS supports only physical ones.
662 */
663 if (acpi_lapic && acpi_ioapic) {
664 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
665 return;
666 }
667 else if (acpi_lapic)
668 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
669
670 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
671 if (mpf->mpf_feature2 & (1<<7)) {
672 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
673 pic_mode = 1;
674 } else {
675 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
676 pic_mode = 0;
677 }
678
679 /*
680 * Now see if we need to read further.
681 */
682 if (mpf->mpf_feature1 != 0) {
683
684 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
685 construct_default_ISA_mptable(mpf->mpf_feature1);
686
687 } else if (mpf->mpf_physptr) {
688
689 /*
690 * Read the physical hardware table. Anything here will
691 * override the defaults.
692 */
693 if (!smp_read_mpc((void *)mpf->mpf_physptr)) {
694 smp_found_config = 0;
695 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
696 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
697 return;
698 }
699 /*
700 * If there are no explicit MP IRQ entries, then we are
701 * broken. We set up most of the low 16 IO-APIC pins to
702 * ISA defaults and hope it will work.
703 */
704 if (!mp_irq_entries) {
705 struct mpc_config_bus bus;
706
707 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
708
709 bus.mpc_type = MP_BUS;
710 bus.mpc_busid = 0;
711 memcpy(bus.mpc_bustype, "ISA ", 6);
712 MP_bus_info(&bus);
713
714 construct_default_ioirq_mptable(0);
715 }
716
717 } else
718 BUG();
719
720 printk(KERN_INFO "Processors: %d\n", num_processors);
721 /*
722 * Only use the first configuration found.
723 */
724}
725
726static int __init smp_scan_config (unsigned long base, unsigned long length)
727{
728 unsigned long *bp = phys_to_virt(base);
729 struct intel_mp_floating *mpf;
730
731 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
732 if (sizeof(*mpf) != 16)
733 printk("Error: MPF size\n");
734
735 while (length > 0) {
736 mpf = (struct intel_mp_floating *)bp;
737 if ((*bp == SMP_MAGIC_IDENT) &&
738 (mpf->mpf_length == 1) &&
739 !mpf_checksum((unsigned char *)bp, 16) &&
740 ((mpf->mpf_specification == 1)
741 || (mpf->mpf_specification == 4)) ) {
742
743 smp_found_config = 1;
744 printk(KERN_INFO "found SMP MP-table at %08lx\n",
745 virt_to_phys(mpf));
746 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
747 if (mpf->mpf_physptr) {
748 /*
749 * We cannot access the MPC table to compute its
750 * size yet, since only the first few megabytes of
751 * memory are mapped at this point.
752 * The PC-9800's MPC table is placed at the very end
753 * of physical memory, so blindly reserving
754 * PAGE_SIZE from mpf->mpf_physptr would trigger BUG()
755 * in reserve_bootmem.
756 */
757 unsigned long size = PAGE_SIZE;
758 unsigned long end = max_low_pfn * PAGE_SIZE;
759 if (mpf->mpf_physptr + size > end)
760 size = end - mpf->mpf_physptr;
761 reserve_bootmem(mpf->mpf_physptr, size);
762 }
763
764 mpf_found = mpf;
765 return 1;
766 }
767 bp += 4;
768 length -= 16;
769 }
770 return 0;
771}
772
773void __init find_smp_config (void)
774{
775 unsigned int address;
776
777 /*
778 * FIXME: Linux assumes you have 640K of base ram..
779 * this continues the error...
780 *
781 * 1) Scan the bottom 1K for a signature
782 * 2) Scan the top 1K of base RAM
783 * 3) Scan the 64K of bios
784 */
785 if (smp_scan_config(0x0,0x400) ||
786 smp_scan_config(639*0x400,0x400) ||
787 smp_scan_config(0xF0000,0x10000))
788 return;
789 /*
790 * If it is an SMP machine we should know now, unless the
791 * configuration is in an EISA/MCA bus machine with an
792 * extended BIOS data area.
793 *
794 * There is a real-mode segmented pointer to the
795 * 4K EBDA area at 0x40E; calculate and scan it here.
796 *
797 * NOTE! There are Linux loaders that will corrupt the EBDA
798 * area, and as such this kind of SMP config may be less
799 * trustworthy, simply because the SMP table may have been
800 * stomped on during early boot. These loaders are buggy and
801 * should be fixed.
802 *
803 * The MP 1.4 spec says to scan only the first 1K of the 4K EBDA.
804 */
805
806 address = get_bios_ebda();
807 if (address)
808 smp_scan_config(address, 0x400);
809}
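get_bios_ebda() is defined elsewhere; the sketch below only illustrates the usual conversion described in the comment above, assuming the BIOS data area can be reached via phys_to_virt(). The helper name is made up for illustration.

static unsigned int ebda_address_sketch(void)
{
	/* 0x40E in the BIOS data area holds the EBDA's real-mode segment */
	unsigned int segment = *(unsigned short *)phys_to_virt(0x40E);

	return segment << 4;		/* segment:0 -> linear address */
}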
810
811/* --------------------------------------------------------------------------
812 ACPI-based MP Configuration
813 -------------------------------------------------------------------------- */
814
815#ifdef CONFIG_ACPI_BOOT
816
817void __init mp_register_lapic_address (
818 u64 address)
819{
820 mp_lapic_addr = (unsigned long) address;
821
822 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
823
824 if (boot_cpu_physical_apicid == -1U)
825 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
826
827 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
828}
829
830
831void __init mp_register_lapic (
832 u8 id,
833 u8 enabled)
834{
835 struct mpc_config_processor processor;
836 int boot_cpu = 0;
837
838 if (MAX_APICS - id <= 0) {
839 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
840 id, MAX_APICS);
841 return;
842 }
843
844 if (id == boot_cpu_physical_apicid)
845 boot_cpu = 1;
846
847 processor.mpc_type = MP_PROCESSOR;
848 processor.mpc_apicid = id;
849 processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
850 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
851 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
852 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
853 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
854 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
855 processor.mpc_reserved[0] = 0;
856 processor.mpc_reserved[1] = 0;
857
858 MP_processor_info(&processor);
859}
860
861#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_ACPI_BOOT))
862
863#define MP_ISA_BUS 0
864#define MP_MAX_IOAPIC_PIN 127
865
866static struct mp_ioapic_routing {
867 int apic_id;
868 int gsi_base;
869 int gsi_end;
870 u32 pin_programmed[4];
871} mp_ioapic_routing[MAX_IO_APICS];
872
873
874static int mp_find_ioapic (
875 int gsi)
876{
877 int i = 0;
878
879 /* Find the IOAPIC that manages this GSI. */
880 for (i = 0; i < nr_ioapics; i++) {
881 if ((gsi >= mp_ioapic_routing[i].gsi_base)
882 && (gsi <= mp_ioapic_routing[i].gsi_end))
883 return i;
884 }
885
886 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
887
888 return -1;
889}
890
891
892void __init mp_register_ioapic (
893 u8 id,
894 u32 address,
895 u32 gsi_base)
896{
897 int idx = 0;
898
899 if (nr_ioapics >= MAX_IO_APICS) {
900 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
901 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
902 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
903 }
904 if (!address) {
905 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
906 " found in MADT table, skipping!\n");
907 return;
908 }
909
910 idx = nr_ioapics++;
911
912 mp_ioapics[idx].mpc_type = MP_IOAPIC;
913 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
914 mp_ioapics[idx].mpc_apicaddr = address;
915
916 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
917 mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
918 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
919
920 /*
921 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
922 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
923 */
924 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
925 mp_ioapic_routing[idx].gsi_base = gsi_base;
926 mp_ioapic_routing[idx].gsi_end = gsi_base +
927 io_apic_get_redir_entries(idx);
928
929 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
930 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
931 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
932 mp_ioapic_routing[idx].gsi_base,
933 mp_ioapic_routing[idx].gsi_end);
934
935 return;
936}
937
938
939void __init mp_override_legacy_irq (
940 u8 bus_irq,
941 u8 polarity,
942 u8 trigger,
943 u32 gsi)
944{
945 struct mpc_config_intsrc intsrc;
946 int ioapic = -1;
947 int pin = -1;
948
949 /*
950 * Convert 'gsi' to 'ioapic.pin'.
951 */
952 ioapic = mp_find_ioapic(gsi);
953 if (ioapic < 0)
954 return;
955 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
956
957 /*
958 * TBD: This check is for faulty timer entries, where the override
959 * erroneously sets the trigger to level, resulting in a HUGE
960 * increase of timer interrupts!
961 */
962 if ((bus_irq == 0) && (trigger == 3))
963 trigger = 1;
964
965 intsrc.mpc_type = MP_INTSRC;
966 intsrc.mpc_irqtype = mp_INT;
967 intsrc.mpc_irqflag = (trigger << 2) | polarity;
968 intsrc.mpc_srcbus = MP_ISA_BUS;
969 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
970 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
971 intsrc.mpc_dstirq = pin; /* INTIN# */
972
973 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
974 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
975 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
976 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
977
978 mp_irqs[mp_irq_entries] = intsrc;
979 if (++mp_irq_entries == MAX_IRQ_SOURCES)
980 panic("Max # of irq sources exceeded!\n");
981
982 return;
983}
984
985int es7000_plat;
986
987void __init mp_config_acpi_legacy_irqs (void)
988{
989 struct mpc_config_intsrc intsrc;
990 int i = 0;
991 int ioapic = -1;
992
993 /*
994 * Fabricate the legacy ISA bus (bus MP_ISA_BUS).
995 */
996 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
997 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
998
999 /*
1000 * Older generations of ES7000 have no legacy identity mappings
1001 */
1002 if (es7000_plat == 1)
1003 return;
1004
1005 /*
1006 * Locate the IOAPIC that manages the ISA IRQs (0-15).
1007 */
1008 ioapic = mp_find_ioapic(0);
1009 if (ioapic < 0)
1010 return;
1011
1012 intsrc.mpc_type = MP_INTSRC;
1013 intsrc.mpc_irqflag = 0; /* Conforming */
1014 intsrc.mpc_srcbus = MP_ISA_BUS;
1015 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
1016
1017 /*
1018 * Use the default configuration for IRQs 0-15, unless
1019 * overridden by (MADT) interrupt source override entries.
1020 */
1021 for (i = 0; i < 16; i++) {
1022 int idx;
1023
1024 for (idx = 0; idx < mp_irq_entries; idx++) {
1025 struct mpc_config_intsrc *irq = mp_irqs + idx;
1026
1027 /* Do we already have a mapping for this ISA IRQ? */
1028 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
1029 break;
1030
1031 /* Do we already have a mapping for this IOAPIC pin */
1032 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
1033 (irq->mpc_dstirq == i))
1034 break;
1035 }
1036
1037 if (idx != mp_irq_entries) {
1038 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
1039 continue; /* IRQ already used */
1040 }
1041
1042 intsrc.mpc_irqtype = mp_INT;
1043 intsrc.mpc_srcbusirq = i; /* Identity mapped */
1044 intsrc.mpc_dstirq = i;
1045
1046 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
1047 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
1048 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
1049 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
1050 intsrc.mpc_dstirq);
1051
1052 mp_irqs[mp_irq_entries] = intsrc;
1053 if (++mp_irq_entries == MAX_IRQ_SOURCES)
1054 panic("Max # of irq sources exceeded!\n");
1055 }
1056}
1057
1058int mp_register_gsi (u32 gsi, int edge_level, int active_high_low)
1059{
1060 int ioapic = -1;
1061 int ioapic_pin = 0;
1062 int idx, bit = 0;
1063
1064#ifdef CONFIG_ACPI_BUS
1065 /* Don't set up the ACPI SCI because it's already set up */
1066 if (acpi_fadt.sci_int == gsi)
1067 return gsi;
1068#endif
1069
1070 ioapic = mp_find_ioapic(gsi);
1071 if (ioapic < 0) {
1072 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1073 return gsi;
1074 }
1075
1076 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
1077
1078 if (ioapic_renumber_irq)
1079 gsi = ioapic_renumber_irq(ioapic, gsi);
1080
1081 /*
1082 * Avoid pin reprogramming. PRTs typically include entries
1083 * with redundant pin->gsi mappings (but unique PCI devices);
1084 * we only program the IOAPIC on the first.
1085 */
1086 bit = ioapic_pin % 32;
1087 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
1088 if (idx > 3) {
1089 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1090 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1091 ioapic_pin);
1092 return gsi;
1093 }
1094 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
1095 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
1096 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1097 return gsi;
1098 }
1099
1100 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
1101
1102 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1103 edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
1104 active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
1105 return gsi;
1106}
1107
1108#endif /*CONFIG_X86_IO_APIC && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT)*/
1109#endif /*CONFIG_ACPI_BOOT*/
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
new file mode 100644
index 000000000000..05d9f8f363a6
--- /dev/null
+++ b/arch/i386/kernel/msr.c
@@ -0,0 +1,346 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * msr.c
15 *
16 * x86 MSR access device
17 *
18 * This device is accessed by lseek() to the appropriate register number
19 * and then read/write in chunks of 8 bytes. A larger size means multiple
20 * reads or writes of the same register.
21 *
22 * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
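As a rough userspace illustration of the interface described above (not part of this driver): the file offset selects the MSR number and each transfer is 8 bytes, so reading MSR 0x10 (the time-stamp counter) on CPU 0 might look like this, assuming /dev/cpu/0/msr exists and the caller has sufficient privileges.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t value;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 1;
	if (lseek(fd, 0x10, SEEK_SET) < 0)	/* the offset is the MSR number */
		return 1;
	if (read(fd, &value, 8) != 8)		/* one 8-byte chunk per MSR */
		return 1;
	printf("MSR 0x10 = 0x%016llx\n", (unsigned long long)value);
	close(fd);
	return 0;
}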
25
26#include <linux/module.h>
27#include <linux/config.h>
28
29#include <linux/types.h>
30#include <linux/errno.h>
31#include <linux/fcntl.h>
32#include <linux/init.h>
33#include <linux/poll.h>
34#include <linux/smp.h>
35#include <linux/smp_lock.h>
36#include <linux/major.h>
37#include <linux/fs.h>
38#include <linux/device.h>
39#include <linux/cpu.h>
40#include <linux/notifier.h>
41
42#include <asm/processor.h>
43#include <asm/msr.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46
47static struct class_simple *msr_class;
48
49/* Note: "err" is handled in a funny way below. Otherwise one version
50 of gcc or another breaks. */
51
52static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx)
53{
54 int err;
55
56 asm volatile ("1: wrmsr\n"
57 "2:\n"
58 ".section .fixup,\"ax\"\n"
59 "3: movl %4,%0\n"
60 " jmp 2b\n"
61 ".previous\n"
62 ".section __ex_table,\"a\"\n"
63 " .align 4\n" " .long 1b,3b\n" ".previous":"=&bDS" (err)
64 :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0));
65
66 return err;
67}
68
69static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
70{
71 int err;
72
73 asm volatile ("1: rdmsr\n"
74 "2:\n"
75 ".section .fixup,\"ax\"\n"
76 "3: movl %4,%0\n"
77 " jmp 2b\n"
78 ".previous\n"
79 ".section __ex_table,\"a\"\n"
80 " .align 4\n"
81 " .long 1b,3b\n"
82 ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx)
83 :"c"(reg), "i"(-EIO), "0"(0));
84
85 return err;
86}
87
88#ifdef CONFIG_SMP
89
90struct msr_command {
91 int cpu;
92 int err;
93 u32 reg;
94 u32 data[2];
95};
96
97static void msr_smp_wrmsr(void *cmd_block)
98{
99 struct msr_command *cmd = (struct msr_command *)cmd_block;
100
101 if (cmd->cpu == smp_processor_id())
102 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
103}
104
105static void msr_smp_rdmsr(void *cmd_block)
106{
107 struct msr_command *cmd = (struct msr_command *)cmd_block;
108
109 if (cmd->cpu == smp_processor_id())
110 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
111}
112
113static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
114{
115 struct msr_command cmd;
116 int ret;
117
118 preempt_disable();
119 if (cpu == smp_processor_id()) {
120 ret = wrmsr_eio(reg, eax, edx);
121 } else {
122 cmd.cpu = cpu;
123 cmd.reg = reg;
124 cmd.data[0] = eax;
125 cmd.data[1] = edx;
126
127 smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
128 ret = cmd.err;
129 }
130 preempt_enable();
131 return ret;
132}
133
134static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
135{
136 struct msr_command cmd;
137 int ret;
138
139 preempt_disable();
140 if (cpu == smp_processor_id()) {
141 ret = rdmsr_eio(reg, eax, edx);
142 } else {
143 cmd.cpu = cpu;
144 cmd.reg = reg;
145
146 smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
147
148 *eax = cmd.data[0];
149 *edx = cmd.data[1];
150
151 ret = cmd.err;
152 }
153 preempt_enable();
154 return ret;
155}
156
157#else /* ! CONFIG_SMP */
158
159static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
160{
161 return wrmsr_eio(reg, eax, edx);
162}
163
164static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx)
165{
166 return rdmsr_eio(reg, eax, edx);
167}
168
169#endif /* ! CONFIG_SMP */
170
171static loff_t msr_seek(struct file *file, loff_t offset, int orig)
172{
173 loff_t ret = -EINVAL;
174
175 lock_kernel();
176 switch (orig) {
177 case 0:
178 file->f_pos = offset;
179 ret = file->f_pos;
180 break;
181 case 1:
182 file->f_pos += offset;
183 ret = file->f_pos;
184 }
185 unlock_kernel();
186 return ret;
187}
188
189static ssize_t msr_read(struct file *file, char __user * buf,
190 size_t count, loff_t * ppos)
191{
192 u32 __user *tmp = (u32 __user *) buf;
193 u32 data[2];
194 size_t rv;
195 u32 reg = *ppos;
196 int cpu = iminor(file->f_dentry->d_inode);
197 int err;
198
199 if (count % 8)
200 return -EINVAL; /* Invalid chunk size */
201
202 for (rv = 0; count; count -= 8) {
203 err = do_rdmsr(cpu, reg, &data[0], &data[1]);
204 if (err)
205 return err;
206 if (copy_to_user(tmp, &data, 8))
207 return -EFAULT;
208 tmp += 2;
209 }
210
211 return ((char __user *)tmp) - buf;
212}
213
214static ssize_t msr_write(struct file *file, const char __user *buf,
215 size_t count, loff_t *ppos)
216{
217 const u32 __user *tmp = (const u32 __user *)buf;
218 u32 data[2];
219 size_t rv;
220 u32 reg = *ppos;
221 int cpu = iminor(file->f_dentry->d_inode);
222 int err;
223
224 if (count % 8)
225 return -EINVAL; /* Invalid chunk size */
226
227 for (rv = 0; count; count -= 8) {
228 if (copy_from_user(&data, tmp, 8))
229 return -EFAULT;
230 err = do_wrmsr(cpu, reg, data[0], data[1]);
231 if (err)
232 return err;
233 tmp += 2;
234 }
235
236 return ((char __user *)tmp) - buf;
237}
238
239static int msr_open(struct inode *inode, struct file *file)
240{
241 unsigned int cpu = iminor(file->f_dentry->d_inode);
242 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
243
244 if (cpu >= NR_CPUS || !cpu_online(cpu))
245 return -ENXIO; /* No such CPU */
246 if (!cpu_has(c, X86_FEATURE_MSR))
247 return -EIO; /* MSR not supported */
248
249 return 0;
250}
251
252/*
253 * File operations we support
254 */
255static struct file_operations msr_fops = {
256 .owner = THIS_MODULE,
257 .llseek = msr_seek,
258 .read = msr_read,
259 .write = msr_write,
260 .open = msr_open,
261};
262
263static int msr_class_simple_device_add(int i)
264{
265 int err = 0;
266 struct class_device *class_err;
267
268 class_err = class_simple_device_add(msr_class, MKDEV(MSR_MAJOR, i), NULL, "msr%d",i);
269 if (IS_ERR(class_err))
270 err = PTR_ERR(class_err);
271 return err;
272}
273
274static int __devinit msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
275{
276 unsigned int cpu = (unsigned long)hcpu;
277
278 switch (action) {
279 case CPU_ONLINE:
280 msr_class_simple_device_add(cpu);
281 break;
282 case CPU_DEAD:
283 class_simple_device_remove(MKDEV(MSR_MAJOR, cpu));
284 break;
285 }
286 return NOTIFY_OK;
287}
288
289static struct notifier_block msr_class_cpu_notifier =
290{
291 .notifier_call = msr_class_cpu_callback,
292};
293
294static int __init msr_init(void)
295{
296 int i, err = 0;
297 i = 0;
298
299 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
300 printk(KERN_ERR "msr: unable to get major %d for msr\n",
301 MSR_MAJOR);
302 err = -EBUSY;
303 goto out;
304 }
305 msr_class = class_simple_create(THIS_MODULE, "msr");
306 if (IS_ERR(msr_class)) {
307 err = PTR_ERR(msr_class);
308 goto out_chrdev;
309 }
310 for_each_online_cpu(i) {
311 err = msr_class_simple_device_add(i);
312 if (err != 0)
313 goto out_class;
314 }
315 register_cpu_notifier(&msr_class_cpu_notifier);
316
317 err = 0;
318 goto out;
319
320out_class:
321 i = 0;
322 for_each_online_cpu(i)
323 class_simple_device_remove(MKDEV(MSR_MAJOR, i));
324 class_simple_destroy(msr_class);
325out_chrdev:
326 unregister_chrdev(MSR_MAJOR, "cpu/msr");
327out:
328 return err;
329}
330
331static void __exit msr_exit(void)
332{
333 int cpu = 0;
334 for_each_online_cpu(cpu)
335 class_simple_device_remove(MKDEV(MSR_MAJOR, cpu));
336 class_simple_destroy(msr_class);
337 unregister_chrdev(MSR_MAJOR, "cpu/msr");
338 unregister_cpu_notifier(&msr_class_cpu_notifier);
339}
340
341module_init(msr_init);
342module_exit(msr_exit)
343
344MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
345MODULE_DESCRIPTION("x86 generic MSR driver");
346MODULE_LICENSE("GPL");
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
new file mode 100644
index 000000000000..f5b0c5081bd6
--- /dev/null
+++ b/arch/i386/kernel/nmi.c
@@ -0,0 +1,570 @@
1/*
2 * linux/arch/i386/kernel/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
12 * Pavel Machek and
13 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
14 */
15
16#include <linux/config.h>
17#include <linux/mm.h>
18#include <linux/irq.h>
19#include <linux/delay.h>
20#include <linux/bootmem.h>
21#include <linux/smp_lock.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/module.h>
26#include <linux/nmi.h>
27#include <linux/sysdev.h>
28#include <linux/sysctl.h>
29
30#include <asm/smp.h>
31#include <asm/mtrr.h>
32#include <asm/mpspec.h>
33#include <asm/nmi.h>
34
35#include "mach_traps.h"
36
37unsigned int nmi_watchdog = NMI_NONE;
38extern int unknown_nmi_panic;
39static unsigned int nmi_hz = HZ;
40static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
41static unsigned int nmi_p4_cccr_val;
42extern void show_registers(struct pt_regs *regs);
43
44/*
45 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
46 * - it may be reserved by some other driver, or not
47 * - when not reserved by some other driver, it may be used for
48 * the NMI watchdog, or not
49 *
50 * This is maintained separately from nmi_active because the NMI
51 * watchdog may also be driven from the I/O APIC timer.
52 */
53static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
54static unsigned int lapic_nmi_owner;
55#define LAPIC_NMI_WATCHDOG (1<<0)
56#define LAPIC_NMI_RESERVED (1<<1)
57
58/* nmi_active:
59 * +1: the lapic NMI watchdog is active, but can be disabled
60 * 0: the lapic NMI watchdog has not been set up, and cannot
61 * be enabled
62 * -1: the lapic NMI watchdog is disabled, but can be enabled
63 */
64int nmi_active;
65
66#define K7_EVNTSEL_ENABLE (1 << 22)
67#define K7_EVNTSEL_INT (1 << 20)
68#define K7_EVNTSEL_OS (1 << 17)
69#define K7_EVNTSEL_USR (1 << 16)
70#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
71#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
72
73#define P6_EVNTSEL0_ENABLE (1 << 22)
74#define P6_EVNTSEL_INT (1 << 20)
75#define P6_EVNTSEL_OS (1 << 17)
76#define P6_EVNTSEL_USR (1 << 16)
77#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
78#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
79
80#define MSR_P4_MISC_ENABLE 0x1A0
81#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
82#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
83#define MSR_P4_PERFCTR0 0x300
84#define MSR_P4_CCCR0 0x360
85#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
86#define P4_ESCR_OS (1<<3)
87#define P4_ESCR_USR (1<<2)
88#define P4_CCCR_OVF_PMI0 (1<<26)
89#define P4_CCCR_OVF_PMI1 (1<<27)
90#define P4_CCCR_THRESHOLD(N) ((N)<<20)
91#define P4_CCCR_COMPLEMENT (1<<19)
92#define P4_CCCR_COMPARE (1<<18)
93#define P4_CCCR_REQUIRED (3<<16)
94#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
95#define P4_CCCR_ENABLE (1<<12)
96/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
97 CRU_ESCR0 (with any non-null event selector) through a complemented
98 max threshold. [IA32-Vol3, Section 14.9.9] */
99#define MSR_P4_IQ_COUNTER0 0x30C
100#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
101#define P4_NMI_IQ_CCCR0 \
102 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
103 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
104
105int __init check_nmi_watchdog (void)
106{
107 unsigned int prev_nmi_count[NR_CPUS];
108 int cpu;
109
110 printk(KERN_INFO "testing NMI watchdog ... ");
111
112 for (cpu = 0; cpu < NR_CPUS; cpu++)
113 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
114 local_irq_enable();
115 mdelay((10*1000)/nmi_hz); // wait 10 ticks
116
117 /* FIXME: Only boot CPU is online at this stage. Check CPUs
118 as they come up. */
119 for (cpu = 0; cpu < NR_CPUS; cpu++) {
120#ifdef CONFIG_SMP
121 /* Check cpu_callin_map here because that is set
122 after the timer is started. */
123 if (!cpu_isset(cpu, cpu_callin_map))
124 continue;
125#endif
126 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
127 printk("CPU#%d: NMI appears to be stuck!\n", cpu);
128 nmi_active = 0;
129 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
130 return -1;
131 }
132 }
133 printk("OK.\n");
134
135 /* now that we know it works we can reduce NMI frequency to
136 something more reasonable; makes a difference in some configs */
137 if (nmi_watchdog == NMI_LOCAL_APIC)
138 nmi_hz = 1;
139
140 return 0;
141}
142
143static int __init setup_nmi_watchdog(char *str)
144{
145 int nmi;
146
147 get_option(&str, &nmi);
148
149 if (nmi >= NMI_INVALID)
150 return 0;
151 if (nmi == NMI_NONE)
152 nmi_watchdog = nmi;
153 /*
154 * If any other x86 CPU has a local APIC, then
155 * please test the NMI stuff there and send me the
156 * missing bits. Right now only Intel P6/P4 and AMD K7 are supported.
157 */
158 if ((nmi == NMI_LOCAL_APIC) &&
159 (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
160 (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
161 nmi_watchdog = nmi;
162 if ((nmi == NMI_LOCAL_APIC) &&
163 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
164 (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
165 nmi_watchdog = nmi;
166 /*
167 * We can enable the IO-APIC watchdog
168 * unconditionally.
169 */
170 if (nmi == NMI_IO_APIC) {
171 nmi_active = 1;
172 nmi_watchdog = nmi;
173 }
174 return 1;
175}
176
177__setup("nmi_watchdog=", setup_nmi_watchdog);
178
179static void disable_lapic_nmi_watchdog(void)
180{
181 if (nmi_active <= 0)
182 return;
183 switch (boot_cpu_data.x86_vendor) {
184 case X86_VENDOR_AMD:
185 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
186 break;
187 case X86_VENDOR_INTEL:
188 switch (boot_cpu_data.x86) {
189 case 6:
190 if (boot_cpu_data.x86_model > 0xd)
191 break;
192
193 wrmsr(MSR_P6_EVNTSEL0, 0, 0);
194 break;
195 case 15:
196 if (boot_cpu_data.x86_model > 0x3)
197 break;
198
199 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
200 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
201 break;
202 }
203 break;
204 }
205 nmi_active = -1;
206 /* tell do_nmi() and others that we're not active any more */
207 nmi_watchdog = 0;
208}
209
210static void enable_lapic_nmi_watchdog(void)
211{
212 if (nmi_active < 0) {
213 nmi_watchdog = NMI_LOCAL_APIC;
214 setup_apic_nmi_watchdog();
215 }
216}
217
218int reserve_lapic_nmi(void)
219{
220 unsigned int old_owner;
221
222 spin_lock(&lapic_nmi_owner_lock);
223 old_owner = lapic_nmi_owner;
224 lapic_nmi_owner |= LAPIC_NMI_RESERVED;
225 spin_unlock(&lapic_nmi_owner_lock);
226 if (old_owner & LAPIC_NMI_RESERVED)
227 return -EBUSY;
228 if (old_owner & LAPIC_NMI_WATCHDOG)
229 disable_lapic_nmi_watchdog();
230 return 0;
231}
232
233void release_lapic_nmi(void)
234{
235 unsigned int new_owner;
236
237 spin_lock(&lapic_nmi_owner_lock);
238 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
239 lapic_nmi_owner = new_owner;
240 spin_unlock(&lapic_nmi_owner_lock);
241 if (new_owner & LAPIC_NMI_WATCHDOG)
242 enable_lapic_nmi_watchdog();
243}
244
245void disable_timer_nmi_watchdog(void)
246{
247 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
248 return;
249
250 unset_nmi_callback();
251 nmi_active = -1;
252 nmi_watchdog = NMI_NONE;
253}
254
255void enable_timer_nmi_watchdog(void)
256{
257 if (nmi_active < 0) {
258 nmi_watchdog = NMI_IO_APIC;
259 touch_nmi_watchdog();
260 nmi_active = 1;
261 }
262}
263
264#ifdef CONFIG_PM
265
266static int nmi_pm_active; /* nmi_active before suspend */
267
268static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
269{
270 nmi_pm_active = nmi_active;
271 disable_lapic_nmi_watchdog();
272 return 0;
273}
274
275static int lapic_nmi_resume(struct sys_device *dev)
276{
277 if (nmi_pm_active > 0)
278 enable_lapic_nmi_watchdog();
279 return 0;
280}
281
282
283static struct sysdev_class nmi_sysclass = {
284 set_kset_name("lapic_nmi"),
285 .resume = lapic_nmi_resume,
286 .suspend = lapic_nmi_suspend,
287};
288
289static struct sys_device device_lapic_nmi = {
290 .id = 0,
291 .cls = &nmi_sysclass,
292};
293
294static int __init init_lapic_nmi_sysfs(void)
295{
296 int error;
297
298 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
299 return 0;
300
301 error = sysdev_class_register(&nmi_sysclass);
302 if (!error)
303 error = sysdev_register(&device_lapic_nmi);
304 return error;
305}
306/* must come after the local APIC's device_initcall() */
307late_initcall(init_lapic_nmi_sysfs);
308
309#endif /* CONFIG_PM */
310
311/*
312 * Activate the NMI watchdog via the local APIC.
313 * Original code written by Keith Owens.
314 */
315
316static void clear_msr_range(unsigned int base, unsigned int n)
317{
318 unsigned int i;
319
320 for(i = 0; i < n; ++i)
321 wrmsr(base+i, 0, 0);
322}
323
324static void setup_k7_watchdog(void)
325{
326 unsigned int evntsel;
327
328 nmi_perfctr_msr = MSR_K7_PERFCTR0;
329
330 clear_msr_range(MSR_K7_EVNTSEL0, 4);
331 clear_msr_range(MSR_K7_PERFCTR0, 4);
332
333 evntsel = K7_EVNTSEL_INT
334 | K7_EVNTSEL_OS
335 | K7_EVNTSEL_USR
336 | K7_NMI_EVENT;
337
338 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
339 Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
340 wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
341 apic_write(APIC_LVTPC, APIC_DM_NMI);
342 evntsel |= K7_EVNTSEL_ENABLE;
343 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
344}
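The value written to the perfctr is the negated number of CPU cycles in one watchdog period, so the counter overflows and raises its NMI roughly nmi_hz times per second. A worked example with assumed numbers:

/*
 * Assumed example: cpu_khz = 2000000 (a 2 GHz CPU), nmi_hz = HZ = 1000:
 *
 *	cpu_khz / nmi_hz * 1000 = 2000000 / 1000 * 1000 = 2000000 cycles
 *
 * Counting up from -2000000, the "cycles the processor is running"
 * event overflows after about 1 ms, i.e. roughly nmi_hz NMIs per second.
 */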
345
346static void setup_p6_watchdog(void)
347{
348 unsigned int evntsel;
349
350 nmi_perfctr_msr = MSR_P6_PERFCTR0;
351
352 clear_msr_range(MSR_P6_EVNTSEL0, 2);
353 clear_msr_range(MSR_P6_PERFCTR0, 2);
354
355 evntsel = P6_EVNTSEL_INT
356 | P6_EVNTSEL_OS
357 | P6_EVNTSEL_USR
358 | P6_NMI_EVENT;
359
360 wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
361 Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
362 wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
363 apic_write(APIC_LVTPC, APIC_DM_NMI);
364 evntsel |= P6_EVNTSEL0_ENABLE;
365 wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
366}
367
368static int setup_p4_watchdog(void)
369{
370 unsigned int misc_enable, dummy;
371
372 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
373 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
374 return 0;
375
376 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
377 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
378#ifdef CONFIG_SMP
379 if (smp_num_siblings == 2)
380 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
381#endif
382
383 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
384 clear_msr_range(0x3F1, 2);
385 /* MSR 0x3F0 seems to have a default value of 0xFC00, but the current
386 docs don't fully define it, so leave it alone for now. */
387 if (boot_cpu_data.x86_model >= 0x3) {
388 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
389 clear_msr_range(0x3A0, 26);
390 clear_msr_range(0x3BC, 3);
391 } else {
392 clear_msr_range(0x3A0, 31);
393 }
394 clear_msr_range(0x3C0, 6);
395 clear_msr_range(0x3C8, 6);
396 clear_msr_range(0x3E0, 2);
397 clear_msr_range(MSR_P4_CCCR0, 18);
398 clear_msr_range(MSR_P4_PERFCTR0, 18);
399
400 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
401 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
402 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
403 wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
404 apic_write(APIC_LVTPC, APIC_DM_NMI);
405 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
406 return 1;
407}
408
409void setup_apic_nmi_watchdog (void)
410{
411 switch (boot_cpu_data.x86_vendor) {
412 case X86_VENDOR_AMD:
413 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
414 return;
415 setup_k7_watchdog();
416 break;
417 case X86_VENDOR_INTEL:
418 switch (boot_cpu_data.x86) {
419 case 6:
420 if (boot_cpu_data.x86_model > 0xd)
421 return;
422
423 setup_p6_watchdog();
424 break;
425 case 15:
426 if (boot_cpu_data.x86_model > 0x3)
427 return;
428
429 if (!setup_p4_watchdog())
430 return;
431 break;
432 default:
433 return;
434 }
435 break;
436 default:
437 return;
438 }
439 lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
440 nmi_active = 1;
441}
442
443/*
444 * The best way to detect whether a CPU has a 'hard lockup' problem
445 * is to check its local APIC timer IRQ counts. If they are not
446 * changing, then that CPU has some problem.
447 *
448 * As these watchdog NMI IRQs are generated on every CPU, we only
449 * have to check the current processor.
450 *
451 * Since NMIs don't listen to _any_ locks, we have to be extremely
452 * careful not to rely on unsafe variables. The printk might lock
453 * up though, so we have to break up any console locks first ...
454 * [when more tty-related locks are added, break them up
455 * here too!]
456 */
457
458static unsigned int
459 last_irq_sums [NR_CPUS],
460 alert_counter [NR_CPUS];
461
462void touch_nmi_watchdog (void)
463{
464 int i;
465
466 /*
467 * Just reset the alert counters (other CPUs might be
468 * spinning on locks we hold):
469 */
470 for (i = 0; i < NR_CPUS; i++)
471 alert_counter[i] = 0;
472}
473
474extern void die_nmi(struct pt_regs *, const char *msg);
475
476void nmi_watchdog_tick (struct pt_regs * regs)
477{
478
479 /*
480 * Since current_thread_info()-> is always on the stack, and we
481 * always switch the stack NMI-atomically, it's safe to use
482 * smp_processor_id().
483 */
484 int sum, cpu = smp_processor_id();
485
486 sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
487
488 if (last_irq_sums[cpu] == sum) {
489 /*
490 * Ayiee, looks like this CPU is stuck ...
491 * wait a few IRQs (5 seconds) before doing the oops ...
492 */
493 alert_counter[cpu]++;
494 if (alert_counter[cpu] == 5*nmi_hz)
495 die_nmi(regs, "NMI Watchdog detected LOCKUP");
496 } else {
497 last_irq_sums[cpu] = sum;
498 alert_counter[cpu] = 0;
499 }
500 if (nmi_perfctr_msr) {
501 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
502 /*
503 * P4 quirks:
504 * - An overflown perfctr will assert its interrupt
505 * until the OVF flag in its CCCR is cleared.
506 * - LVTPC is masked on interrupt and must be
507 * unmasked by the LVTPC handler.
508 */
509 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
510 apic_write(APIC_LVTPC, APIC_DM_NMI);
511 }
512 else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) {
513 /* Only the P6-based Pentium M needs to re-unmask
514 * the APIC vector, but it doesn't hurt
515 * other P6 variants. */
516 apic_write(APIC_LVTPC, APIC_DM_NMI);
517 }
518 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
519 }
520}
521
522#ifdef CONFIG_SYSCTL
523
524static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
525{
526 unsigned char reason = get_nmi_reason();
527 char buf[64];
528
529 if (!(reason & 0xc0)) {
530 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
531 die_nmi(regs, buf);
532 }
533 return 0;
534}
535
536/*
537 * proc handler for /proc/sys/kernel/unknown_nmi_panic
538 */
539int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
540 void __user *buffer, size_t *length, loff_t *ppos)
541{
542 int old_state;
543
544 old_state = unknown_nmi_panic;
545 proc_dointvec(table, write, file, buffer, length, ppos);
546 if (!!old_state == !!unknown_nmi_panic)
547 return 0;
548
549 if (unknown_nmi_panic) {
550 if (reserve_lapic_nmi() < 0) {
551 unknown_nmi_panic = 0;
552 return -EBUSY;
553 } else {
554 set_nmi_callback(unknown_nmi_panic_callback);
555 }
556 } else {
557 release_lapic_nmi();
558 unset_nmi_callback();
559 }
560 return 0;
561}
562
563#endif
564
565EXPORT_SYMBOL(nmi_active);
566EXPORT_SYMBOL(nmi_watchdog);
567EXPORT_SYMBOL(reserve_lapic_nmi);
568EXPORT_SYMBOL(release_lapic_nmi);
569EXPORT_SYMBOL(disable_timer_nmi_watchdog);
570EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c
new file mode 100644
index 000000000000..e51edf0a6564
--- /dev/null
+++ b/arch/i386/kernel/numaq.c
@@ -0,0 +1,79 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#include <linux/config.h>
27#include <linux/mm.h>
28#include <linux/bootmem.h>
29#include <linux/mmzone.h>
30#include <linux/module.h>
31#include <linux/nodemask.h>
32#include <asm/numaq.h>
33#include <asm/topology.h>
34
35#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
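/* e.g. with 4K pages (PAGE_SHIFT == 12): MB_TO_PAGES(64) == 64 << 8 == 16384 pages */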
36
37/*
38 * Function: smp_dump_qct()
39 *
40 * Description: gets memory layout from the quad config table. This
41 * function also updates node_online_map with the nodes (quads) present.
42 */
43static void __init smp_dump_qct(void)
44{
45 int node;
46 struct eachquadmem *eq;
47 struct sys_cfg_data *scd =
48 (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
49
50 nodes_clear(node_online_map);
51 for_each_node(node) {
52 if (scd->quads_present31_0 & (1 << node)) {
53 node_set_online(node);
54 eq = &scd->eq[node];
55 /* Convert to pages */
56 node_start_pfn[node] = MB_TO_PAGES(
57 eq->hi_shrd_mem_start - eq->priv_mem_size);
58 node_end_pfn[node] = MB_TO_PAGES(
59 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
60
61 memory_present(node,
62 node_start_pfn[node], node_end_pfn[node]);
63 node_remap_size[node] = node_memmap_size_bytes(node,
64 node_start_pfn[node],
65 node_end_pfn[node]);
66 }
67 }
68}
69
70/*
71 * Unlike Summit, we don't really care to let the NUMA-Q
72 * fall back to flat mode. Don't compile for NUMA-Q
73 * unless you really need it!
74 */
75int __init get_memcfg_numaq(void)
76{
77 smp_dump_qct();
78 return 1;
79}
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
new file mode 100644
index 000000000000..4de2e03c7b45
--- /dev/null
+++ b/arch/i386/kernel/pci-dma.c
@@ -0,0 +1,147 @@
1/*
2 * Dynamic DMA mapping support.
3 *
4 * On i386 there is no hardware dynamic DMA address translation,
5 * so consistent alloc/free are merely page allocation/freeing.
6 * The rest of the dynamic DMA mapping interface is implemented
7 * in asm/pci.h.
8 */
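A rough driver-side sketch of how the coherent allocation API implemented below is typically used; the device pointer and buffer size are placeholders and error handling is trimmed.

/* Illustrative only: 'dev' would be some driver-owned struct device. */
static void *example_buf;
static dma_addr_t example_handle;

static int example_setup(struct device *dev)
{
	example_buf = dma_alloc_coherent(dev, 4096, &example_handle, GFP_KERNEL);
	if (!example_buf)
		return -ENOMEM;
	/* give example_handle to the device; the CPU uses example_buf */
	return 0;
}

static void example_teardown(struct device *dev)
{
	dma_free_coherent(dev, 4096, example_buf, example_handle);
}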
9
10#include <linux/types.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/pci.h>
14#include <asm/io.h>
15
16struct dma_coherent_mem {
17 void *virt_base;
18 u32 device_base;
19 int size;
20 int flags;
21 unsigned long *bitmap;
22};
23
24void *dma_alloc_coherent(struct device *dev, size_t size,
25 dma_addr_t *dma_handle, unsigned int __nocast gfp)
26{
27 void *ret;
28 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
29 int order = get_order(size);
30 /* ignore region specifiers */
31 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
32
33 if (mem) {
34 int page = bitmap_find_free_region(mem->bitmap, mem->size,
35 order);
36 if (page >= 0) {
37 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
38 ret = mem->virt_base + (page << PAGE_SHIFT);
39 memset(ret, 0, size);
40 return ret;
41 }
42 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
43 return NULL;
44 }
45
46 if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
47 gfp |= GFP_DMA;
48
49 ret = (void *)__get_free_pages(gfp, order);
50
51 if (ret != NULL) {
52 memset(ret, 0, size);
53 *dma_handle = virt_to_phys(ret);
54 }
55 return ret;
56}
57
58void dma_free_coherent(struct device *dev, size_t size,
59 void *vaddr, dma_addr_t dma_handle)
60{
61 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
62 int order = get_order(size);
63
64 if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
65 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
66
67 bitmap_release_region(mem->bitmap, page, order);
68 } else
69 free_pages((unsigned long)vaddr, order);
70}
71
72int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
73 dma_addr_t device_addr, size_t size, int flags)
74{
75 void __iomem *mem_base;
76 int pages = size >> PAGE_SHIFT;
77 int bitmap_size = (pages + 31)/32;
78
79 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
80 goto out;
81 if (!size)
82 goto out;
83 if (dev->dma_mem)
84 goto out;
85
86 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
87
88 mem_base = ioremap(bus_addr, size);
89 if (!mem_base)
90 goto out;
91
92 dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
93 if (!dev->dma_mem)
94 goto out;
95 memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
96 dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
97 if (!dev->dma_mem->bitmap)
98 goto free1_out;
99 memset(dev->dma_mem->bitmap, 0, bitmap_size);
100
101 dev->dma_mem->virt_base = mem_base;
102 dev->dma_mem->device_base = device_addr;
103 dev->dma_mem->size = pages;
104 dev->dma_mem->flags = flags;
105
106 if (flags & DMA_MEMORY_MAP)
107 return DMA_MEMORY_MAP;
108
109 return DMA_MEMORY_IO;
110
111 free1_out:
112 kfree(dev->dma_mem->bitmap);
113 out:
114 return 0;
115}
116EXPORT_SYMBOL(dma_declare_coherent_memory);
117
118void dma_release_declared_memory(struct device *dev)
119{
120 struct dma_coherent_mem *mem = dev->dma_mem;
121
122 if(!mem)
123 return;
124 dev->dma_mem = NULL;
125 iounmap(mem->virt_base);
126 kfree(mem->bitmap);
127 kfree(mem);
128}
129EXPORT_SYMBOL(dma_release_declared_memory);
130
131void *dma_mark_declared_memory_occupied(struct device *dev,
132 dma_addr_t device_addr, size_t size)
133{
134 struct dma_coherent_mem *mem = dev->dma_mem;
135 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
136 int pos, err;
137
138 if (!mem)
139 return ERR_PTR(-EINVAL);
140
141 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
142 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
143 if (err != 0)
144 return ERR_PTR(err);
145 return mem->virt_base + (pos << PAGE_SHIFT);
146}
147EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
new file mode 100644
index 000000000000..c36fedf40e95
--- /dev/null
+++ b/arch/i386/kernel/process.c
@@ -0,0 +1,848 @@
1/*
2 * linux/arch/i386/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * This file handles the architecture-dependent parts of process handling..
12 */
13
14#include <stdarg.h>
15
16#include <linux/errno.h>
17#include <linux/sched.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mm.h>
21#include <linux/elfcore.h>
22#include <linux/smp.h>
23#include <linux/smp_lock.h>
24#include <linux/stddef.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
27#include <linux/user.h>
28#include <linux/a.out.h>
29#include <linux/interrupt.h>
30#include <linux/config.h>
31#include <linux/utsname.h>
32#include <linux/delay.h>
33#include <linux/reboot.h>
34#include <linux/init.h>
35#include <linux/mc146818rtc.h>
36#include <linux/module.h>
37#include <linux/kallsyms.h>
38#include <linux/ptrace.h>
39#include <linux/random.h>
40
41#include <asm/uaccess.h>
42#include <asm/pgtable.h>
43#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/ldt.h>
46#include <asm/processor.h>
47#include <asm/i387.h>
48#include <asm/irq.h>
49#include <asm/desc.h>
50#ifdef CONFIG_MATH_EMULATION
51#include <asm/math_emu.h>
52#endif
53
54#include <linux/irq.h>
55#include <linux/err.h>
56
57asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
58
59static int hlt_counter;
60
61unsigned long boot_option_idle_override = 0;
62EXPORT_SYMBOL(boot_option_idle_override);
63
64/*
65 * Return saved PC of a blocked thread.
66 */
67unsigned long thread_saved_pc(struct task_struct *tsk)
68{
69 return ((unsigned long *)tsk->thread.esp)[3];
70}
71
72/*
73 * Power-management idle function, if any.
74 */
75void (*pm_idle)(void);
76static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
77
78void disable_hlt(void)
79{
80 hlt_counter++;
81}
82
83EXPORT_SYMBOL(disable_hlt);
84
85void enable_hlt(void)
86{
87 hlt_counter--;
88}
89
90EXPORT_SYMBOL(enable_hlt);
91
92/*
93 * We use this if we don't have any better
94 * idle routine..
95 */
96void default_idle(void)
97{
98 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
99 local_irq_disable();
100 if (!need_resched())
101 safe_halt();
102 else
103 local_irq_enable();
104 } else {
105 cpu_relax();
106 }
107}
108
109/*
110 * On SMP it's slightly faster (but much more power-consuming!)
111 * to poll the need_resched flag instead of waiting for the
112 * cross-CPU IPI to arrive. Use this option with caution.
113 */
114static void poll_idle (void)
115{
116 int oldval;
117
118 local_irq_enable();
119
120 /*
121 * Deal with another CPU just having chosen a thread to
122 * run here:
123 */
124 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
125
126 if (!oldval) {
127 set_thread_flag(TIF_POLLING_NRFLAG);
128 asm volatile(
129 "2:"
130 "testl %0, %1;"
131 "rep; nop;"
132 "je 2b;"
133 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
134
135 clear_thread_flag(TIF_POLLING_NRFLAG);
136 } else {
137 set_need_resched();
138 }
139}
140
141/*
142 * The idle thread. There's no useful work to be
143 * done, so just try to conserve power and have a
144 * low exit latency (ie sit in a loop waiting for
145 * somebody to say that they'd like to reschedule)
146 */
147void cpu_idle (void)
148{
149 /* endless idle loop with no priority at all */
150 while (1) {
151 while (!need_resched()) {
152 void (*idle)(void);
153
154 if (__get_cpu_var(cpu_idle_state))
155 __get_cpu_var(cpu_idle_state) = 0;
156
157 rmb();
158 idle = pm_idle;
159
160 if (!idle)
161 idle = default_idle;
162
163 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
164 idle();
165 }
166 schedule();
167 }
168}
169
170void cpu_idle_wait(void)
171{
172 unsigned int cpu, this_cpu = get_cpu();
173 cpumask_t map;
174
175 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
176 put_cpu();
177
178 cpus_clear(map);
179 for_each_online_cpu(cpu) {
180 per_cpu(cpu_idle_state, cpu) = 1;
181 cpu_set(cpu, map);
182 }
183
184 __get_cpu_var(cpu_idle_state) = 0;
185
186 wmb();
187 do {
188 ssleep(1);
189 for_each_online_cpu(cpu) {
190 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
191 cpu_clear(cpu, map);
192 }
193 cpus_and(map, map, cpu_online_map);
194 } while (!cpus_empty(map));
195}
196EXPORT_SYMBOL_GPL(cpu_idle_wait);
197
198/*
199 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
200 * which can obviate the IPI otherwise needed to trigger a need_resched check.
201 * We execute MONITOR against need_resched and enter an optimized wait state
202 * through MWAIT. Whenever someone changes need_resched, we are woken
203 * up from MWAIT (without an IPI).
204 */
205static void mwait_idle(void)
206{
207 local_irq_enable();
208
209 if (!need_resched()) {
210 set_thread_flag(TIF_POLLING_NRFLAG);
211 do {
212 __monitor((void *)&current_thread_info()->flags, 0, 0);
213 if (need_resched())
214 break;
215 __mwait(0, 0);
216 } while (!need_resched());
217 clear_thread_flag(TIF_POLLING_NRFLAG);
218 }
219}
220
221void __init select_idle_routine(const struct cpuinfo_x86 *c)
222{
223 if (cpu_has(c, X86_FEATURE_MWAIT)) {
224 printk("monitor/mwait feature present.\n");
225 /*
226 * Skip if setup has overridden idle.
227 * One CPU supports mwait => all CPUs support mwait.
228 */
229 if (!pm_idle) {
230 printk("using mwait in idle threads.\n");
231 pm_idle = mwait_idle;
232 }
233 }
234}
235
236static int __init idle_setup (char *str)
237{
238 if (!strncmp(str, "poll", 4)) {
239 printk("using polling idle threads.\n");
240 pm_idle = poll_idle;
241#ifdef CONFIG_X86_SMP
242 if (smp_num_siblings > 1)
243 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
244#endif
245 } else if (!strncmp(str, "halt", 4)) {
246 printk("using halt in idle threads.\n");
247 pm_idle = default_idle;
248 }
249
250 boot_option_idle_override = 1;
251 return 1;
252}
253
254__setup("idle=", idle_setup);
255
256void show_regs(struct pt_regs * regs)
257{
258 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
259
260 printk("\n");
261 printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
262 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
263 print_symbol("EIP is at %s\n", regs->eip);
264
265 if (regs->xcs & 3)
266 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
267 printk(" EFLAGS: %08lx %s (%s)\n",
268 regs->eflags, print_tainted(), system_utsname.release);
269 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
270 regs->eax,regs->ebx,regs->ecx,regs->edx);
271 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
272 regs->esi, regs->edi, regs->ebp);
273 printk(" DS: %04x ES: %04x\n",
274 0xffff & regs->xds,0xffff & regs->xes);
275
276 __asm__("movl %%cr0, %0": "=r" (cr0));
277 __asm__("movl %%cr2, %0": "=r" (cr2));
278 __asm__("movl %%cr3, %0": "=r" (cr3));
279 /* This could fault if %cr4 does not exist */
280 __asm__("1: movl %%cr4, %0 \n"
281 "2: \n"
282 ".section __ex_table,\"a\" \n"
283 ".long 1b,2b \n"
284 ".previous \n"
285 : "=r" (cr4): "0" (0));
286 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
287 show_trace(NULL, &regs->esp);
288}
289
290/*
291 * This gets run with %ebx containing the
292 * function to call, and %edx containing
293 * the "args".
294 */
295extern void kernel_thread_helper(void);
296__asm__(".section .text\n"
297 ".align 4\n"
298 "kernel_thread_helper:\n\t"
299 "movl %edx,%eax\n\t"
300 "pushl %edx\n\t"
301 "call *%ebx\n\t"
302 "pushl %eax\n\t"
303 "call do_exit\n"
304 ".previous");
305
306/*
307 * Create a kernel thread
308 */
309int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
310{
311 struct pt_regs regs;
312
313 memset(&regs, 0, sizeof(regs));
314
315 regs.ebx = (unsigned long) fn;
316 regs.edx = (unsigned long) arg;
317
318 regs.xds = __USER_DS;
319 regs.xes = __USER_DS;
320 regs.orig_eax = -1;
321 regs.eip = (unsigned long) kernel_thread_helper;
322 regs.xcs = __KERNEL_CS;
323 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
324
325 /* Ok, create the new process.. */
326 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
327}
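A minimal usage sketch of the helper above; the thread function and flag choice are illustrative, not taken from this file.

static int example_thread_fn(void *arg)
{
	/* runs in its own kernel thread; the return value is its exit code */
	return 0;
}

static void example_spawn(void)
{
	/* typical flag combination for an in-kernel helper thread */
	kernel_thread(example_thread_fn, NULL, CLONE_FS | CLONE_FILES | SIGCHLD);
}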
328
329/*
330 * Free current thread data structures etc..
331 */
332void exit_thread(void)
333{
334 struct task_struct *tsk = current;
335 struct thread_struct *t = &tsk->thread;
336
337 /* The process may have allocated an io port bitmap... nuke it. */
338 if (unlikely(NULL != t->io_bitmap_ptr)) {
339 int cpu = get_cpu();
340 struct tss_struct *tss = &per_cpu(init_tss, cpu);
341
342 kfree(t->io_bitmap_ptr);
343 t->io_bitmap_ptr = NULL;
344 /*
345 * Careful, clear this in the TSS too:
346 */
347 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
348 t->io_bitmap_max = 0;
349 tss->io_bitmap_owner = NULL;
350 tss->io_bitmap_max = 0;
351 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
352 put_cpu();
353 }
354}
355
356void flush_thread(void)
357{
358 struct task_struct *tsk = current;
359
360 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
361 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
362 /*
363 * Forget coprocessor state..
364 */
365 clear_fpu(tsk);
366 clear_used_math();
367}
368
369void release_thread(struct task_struct *dead_task)
370{
371 if (dead_task->mm) {
372 // temporary debugging check
373 if (dead_task->mm->context.size) {
374 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
375 dead_task->comm,
376 dead_task->mm->context.ldt,
377 dead_task->mm->context.size);
378 BUG();
379 }
380 }
381
382 release_vm86_irqs(dead_task);
383}
384
385/*
386 * This gets called before we allocate a new thread and copy
387 * the current task into it.
388 */
389void prepare_to_copy(struct task_struct *tsk)
390{
391 unlazy_fpu(tsk);
392}
393
394int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
395 unsigned long unused,
396 struct task_struct * p, struct pt_regs * regs)
397{
398 struct pt_regs * childregs;
399 struct task_struct *tsk;
400 int err;
401
402 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
403 *childregs = *regs;
404 childregs->eax = 0;
405 childregs->esp = esp;
406
407 p->thread.esp = (unsigned long) childregs;
408 p->thread.esp0 = (unsigned long) (childregs+1);
409
410 p->thread.eip = (unsigned long) ret_from_fork;
411
412 savesegment(fs,p->thread.fs);
413 savesegment(gs,p->thread.gs);
414
415 tsk = current;
416 if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
417 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
418 if (!p->thread.io_bitmap_ptr) {
419 p->thread.io_bitmap_max = 0;
420 return -ENOMEM;
421 }
422 memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
423 IO_BITMAP_BYTES);
424 }
425
426 /*
427 * Set a new TLS for the child thread?
428 */
429 if (clone_flags & CLONE_SETTLS) {
430 struct desc_struct *desc;
431 struct user_desc info;
432 int idx;
433
434 err = -EFAULT;
435 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
436 goto out;
437 err = -EINVAL;
438 if (LDT_empty(&info))
439 goto out;
440
441 idx = info.entry_number;
442 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
443 goto out;
444
445 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
446 desc->a = LDT_entry_a(&info);
447 desc->b = LDT_entry_b(&info);
448 }
449
450 err = 0;
451 out:
452 if (err && p->thread.io_bitmap_ptr) {
453 kfree(p->thread.io_bitmap_ptr);
454 p->thread.io_bitmap_max = 0;
455 }
456 return err;
457}
458
459/*
460 * fill in the user structure for a core dump..
461 */
462void dump_thread(struct pt_regs * regs, struct user * dump)
463{
464 int i;
465
466/* changed the size calculations - should hopefully work better. lbt */
467 dump->magic = CMAGIC;
468 dump->start_code = 0;
469 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
470 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
471 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
472 dump->u_dsize -= dump->u_tsize;
473 dump->u_ssize = 0;
474 for (i = 0; i < 8; i++)
475 dump->u_debugreg[i] = current->thread.debugreg[i];
476
477 if (dump->start_stack < TASK_SIZE)
478 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
479
480 dump->regs.ebx = regs->ebx;
481 dump->regs.ecx = regs->ecx;
482 dump->regs.edx = regs->edx;
483 dump->regs.esi = regs->esi;
484 dump->regs.edi = regs->edi;
485 dump->regs.ebp = regs->ebp;
486 dump->regs.eax = regs->eax;
487 dump->regs.ds = regs->xds;
488 dump->regs.es = regs->xes;
489 savesegment(fs,dump->regs.fs);
490 savesegment(gs,dump->regs.gs);
491 dump->regs.orig_eax = regs->orig_eax;
492 dump->regs.eip = regs->eip;
493 dump->regs.cs = regs->xcs;
494 dump->regs.eflags = regs->eflags;
495 dump->regs.esp = regs->esp;
496 dump->regs.ss = regs->xss;
497
498 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
499}
500
501/*
502 * Capture the user space registers if the task is not running (in user space)
503 */
504int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
505{
506 struct pt_regs ptregs;
507
508 ptregs = *(struct pt_regs *)
509 ((unsigned long)tsk->thread_info+THREAD_SIZE - sizeof(ptregs));
510 ptregs.xcs &= 0xffff;
511 ptregs.xds &= 0xffff;
512 ptregs.xes &= 0xffff;
513 ptregs.xss &= 0xffff;
514
515 elf_core_copy_regs(regs, &ptregs);
516
517 return 1;
518}
519
520static inline void
521handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
522{
523 if (!next->io_bitmap_ptr) {
524 /*
525 * Disable the bitmap via an invalid offset. We still cache
526 * the previous bitmap owner and the IO bitmap contents:
527 */
528 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
529 return;
530 }
531 if (likely(next == tss->io_bitmap_owner)) {
532 /*
533 * Previous owner of the bitmap (hence the bitmap content)
 534		 * matches the next task, we don't have to do anything but
535 * to set a valid offset in the TSS:
536 */
537 tss->io_bitmap_base = IO_BITMAP_OFFSET;
538 return;
539 }
540 /*
541 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
 542	 * and we let the task get a GPF in case an I/O instruction
 543	 * is performed.  The GPF handler will verify that the
 544	 * faulting task has a valid I/O bitmap and, if true, does the
 545	 * real copy and restarts the instruction.  This saves us
546 * redundant copies when the currently switched task does not
547 * perform any I/O during its timeslice.
548 */
549 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
550}
551/*
552 * This special macro can be used to load a debugging register
553 */
554#define loaddebug(thread,register) \
555 __asm__("movl %0,%%db" #register \
556 : /* no output */ \
557 :"r" (thread->debugreg[register]))
558
559/*
 560 * switch_to(x,y) should switch tasks from x to y.
561 *
562 * We fsave/fwait so that an exception goes off at the right time
563 * (as a call from the fsave or fwait in effect) rather than to
564 * the wrong process. Lazy FP saving no longer makes any sense
 565 * with modern CPUs, and this simplifies a lot of things (SMP
566 * and UP become the same).
567 *
568 * NOTE! We used to use the x86 hardware context switching. The
569 * reason for not using it any more becomes apparent when you
570 * try to recover gracefully from saved state that is no longer
571 * valid (stale segment register values in particular). With the
572 * hardware task-switch, there is no way to fix up bad state in
573 * a reasonable manner.
574 *
575 * The fact that Intel documents the hardware task-switching to
576 * be slow is a fairly red herring - this code is not noticeably
577 * faster. However, there _is_ some room for improvement here,
578 * so the performance issues may eventually be a valid point.
579 * More important, however, is the fact that this allows us much
580 * more flexibility.
581 *
582 * The return value (in %eax) will be the "prev" task after
583 * the task-switch, and shows up in ret_from_fork in entry.S,
584 * for example.
585 */
586struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
587{
588 struct thread_struct *prev = &prev_p->thread,
589 *next = &next_p->thread;
590 int cpu = smp_processor_id();
591 struct tss_struct *tss = &per_cpu(init_tss, cpu);
592
593 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
594
595 __unlazy_fpu(prev_p);
596
597 /*
598 * Reload esp0, LDT and the page table pointer:
599 */
600 load_esp0(tss, next);
601
602 /*
603 * Load the per-thread Thread-Local Storage descriptor.
604 */
605 load_TLS(next, cpu);
606
607 /*
608 * Save away %fs and %gs. No need to save %es and %ds, as
609 * those are always kernel segments while inside the kernel.
610 */
611 asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
612 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
613
614 /*
615 * Restore %fs and %gs if needed.
616 */
617 if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
618 loadsegment(fs, next->fs);
619 loadsegment(gs, next->gs);
620 }
621
622 /*
623 * Now maybe reload the debug registers
624 */
625 if (unlikely(next->debugreg[7])) {
626 loaddebug(next, 0);
627 loaddebug(next, 1);
628 loaddebug(next, 2);
629 loaddebug(next, 3);
630 /* no 4 and 5 */
631 loaddebug(next, 6);
632 loaddebug(next, 7);
633 }
634
635 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
636 handle_io_bitmap(next, tss);
637
638 return prev_p;
639}
640
641asmlinkage int sys_fork(struct pt_regs regs)
642{
643 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
644}
645
646asmlinkage int sys_clone(struct pt_regs regs)
647{
648 unsigned long clone_flags;
649 unsigned long newsp;
650 int __user *parent_tidptr, *child_tidptr;
651
652 clone_flags = regs.ebx;
653 newsp = regs.ecx;
654 parent_tidptr = (int __user *)regs.edx;
655 child_tidptr = (int __user *)regs.edi;
656 if (!newsp)
657 newsp = regs.esp;
658 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
659}
660
661/*
662 * This is trivial, and on the face of it looks like it
663 * could equally well be done in user mode.
664 *
665 * Not so, for quite unobvious reasons - register pressure.
666 * In user mode vfork() cannot have a stack frame, and if
667 * done by calling the "clone()" system call directly, you
668 * do not have enough call-clobbered registers to hold all
669 * the information you need.
670 */
671asmlinkage int sys_vfork(struct pt_regs regs)
672{
673 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
674}
675
676/*
677 * sys_execve() executes a new program.
678 */
679asmlinkage int sys_execve(struct pt_regs regs)
680{
681 int error;
682 char * filename;
683
684 filename = getname((char __user *) regs.ebx);
685 error = PTR_ERR(filename);
686 if (IS_ERR(filename))
687 goto out;
688 error = do_execve(filename,
689 (char __user * __user *) regs.ecx,
690 (char __user * __user *) regs.edx,
691 &regs);
692 if (error == 0) {
693 task_lock(current);
694 current->ptrace &= ~PT_DTRACE;
695 task_unlock(current);
696 /* Make sure we don't return using sysenter.. */
697 set_thread_flag(TIF_IRET);
698 }
699 putname(filename);
700out:
701 return error;
702}
703
704#define top_esp (THREAD_SIZE - sizeof(unsigned long))
705#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
706
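/*
 * get_wchan: walk the sleeping task's saved frame-pointer chain (at most
 * 16 frames) and return the first return address that is not inside the
 * scheduler, i.e. the place where the task is blocked.
 */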
707unsigned long get_wchan(struct task_struct *p)
708{
709 unsigned long ebp, esp, eip;
710 unsigned long stack_page;
711 int count = 0;
712 if (!p || p == current || p->state == TASK_RUNNING)
713 return 0;
714 stack_page = (unsigned long)p->thread_info;
715 esp = p->thread.esp;
716 if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
717 return 0;
718 /* include/asm-i386/system.h:switch_to() pushes ebp last. */
719 ebp = *(unsigned long *) esp;
720 do {
721 if (ebp < stack_page || ebp > top_ebp+stack_page)
722 return 0;
723 eip = *(unsigned long *) (ebp+4);
724 if (!in_sched_functions(eip))
725 return eip;
726 ebp = *(unsigned long *) ebp;
727 } while (count++ < 16);
728 return 0;
729}
730
731/*
732 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
733 */
734static int get_free_idx(void)
735{
736 struct thread_struct *t = &current->thread;
737 int idx;
738
739 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
740 if (desc_empty(t->tls_array + idx))
741 return idx + GDT_ENTRY_TLS_MIN;
742 return -ESRCH;
743}
744
745/*
746 * Set a given TLS descriptor:
747 */
748asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
749{
750 struct thread_struct *t = &current->thread;
751 struct user_desc info;
752 struct desc_struct *desc;
753 int cpu, idx;
754
755 if (copy_from_user(&info, u_info, sizeof(info)))
756 return -EFAULT;
757 idx = info.entry_number;
758
759 /*
760 * index -1 means the kernel should try to find and
761 * allocate an empty descriptor:
762 */
763 if (idx == -1) {
764 idx = get_free_idx();
765 if (idx < 0)
766 return idx;
767 if (put_user(idx, &u_info->entry_number))
768 return -EFAULT;
769 }
770
771 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
772 return -EINVAL;
773
774 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
775
776 /*
777 * We must not get preempted while modifying the TLS.
778 */
779 cpu = get_cpu();
780
781 if (LDT_empty(&info)) {
782 desc->a = 0;
783 desc->b = 0;
784 } else {
785 desc->a = LDT_entry_a(&info);
786 desc->b = LDT_entry_b(&info);
787 }
788 load_TLS(t, cpu);
789
790 put_cpu();
791
792 return 0;
793}
794
795/*
796 * Get the current Thread-Local Storage area:
797 */
798
799#define GET_BASE(desc) ( \
800 (((desc)->a >> 16) & 0x0000ffff) | \
801 (((desc)->b << 16) & 0x00ff0000) | \
802 ( (desc)->b & 0xff000000) )
803
804#define GET_LIMIT(desc) ( \
805 ((desc)->a & 0x0ffff) | \
806 ((desc)->b & 0xf0000) )
807
808#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
809#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
810#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
811#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
812#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
813#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
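/*
 * Illustrative note: a base of, say, 0xdeadbeef is scattered across the
 * descriptor as a[31:16] = 0xbeef, b[7:0] = 0xad and b[31:24] = 0xde;
 * GET_BASE() shifts the three pieces back into place and ORs them together.
 */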
814
815asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
816{
817 struct user_desc info;
818 struct desc_struct *desc;
819 int idx;
820
821 if (get_user(idx, &u_info->entry_number))
822 return -EFAULT;
823 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
824 return -EINVAL;
825
826 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
827
828 info.entry_number = idx;
829 info.base_addr = GET_BASE(desc);
830 info.limit = GET_LIMIT(desc);
831 info.seg_32bit = GET_32BIT(desc);
832 info.contents = GET_CONTENTS(desc);
833 info.read_exec_only = !GET_WRITABLE(desc);
834 info.limit_in_pages = GET_LIMIT_PAGES(desc);
835 info.seg_not_present = !GET_PRESENT(desc);
836 info.useable = GET_USEABLE(desc);
837
838 if (copy_to_user(u_info, &info, sizeof(info)))
839 return -EFAULT;
840 return 0;
841}
842
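/*
 * Randomize the initial user stack by up to 8 kB when address-space
 * randomization is enabled, then round the result down to a 16-byte
 * boundary.
 */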
843unsigned long arch_align_stack(unsigned long sp)
844{
845 if (randomize_va_space)
846 sp -= get_random_int() % 8192;
847 return sp & ~0xf;
848}
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
new file mode 100644
index 000000000000..b2f17640ceff
--- /dev/null
+++ b/arch/i386/kernel/ptrace.c
@@ -0,0 +1,717 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 */
7
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/mm.h>
11#include <linux/smp.h>
12#include <linux/smp_lock.h>
13#include <linux/errno.h>
14#include <linux/ptrace.h>
15#include <linux/user.h>
16#include <linux/security.h>
17#include <linux/audit.h>
18#include <linux/seccomp.h>
19
20#include <asm/uaccess.h>
21#include <asm/pgtable.h>
22#include <asm/system.h>
23#include <asm/processor.h>
24#include <asm/i387.h>
25#include <asm/debugreg.h>
26#include <asm/ldt.h>
27#include <asm/desc.h>
28
29/*
 30 * does not yet catch signals sent when the child dies,
31 * in exit.c or in signal.c.
32 */
33
34/* determines which flags the user has access to. */
35/* 1 = access 0 = no access */
36#define FLAG_MASK 0x00044dd5
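/* i.e. CF, PF, AF, ZF, SF, TF, DF, OF, NT and AC may be changed;
   IF, IOPL, RF, VM and the remaining eflags bits are preserved by putreg(). */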
37
 38/* sets the trap flag. */
39#define TRAP_FLAG 0x100
40
41/*
42 * Offset of eflags on child stack..
43 */
44#define EFL_OFFSET ((EFL-2)*4-sizeof(struct pt_regs))
45
46static inline struct pt_regs *get_child_regs(struct task_struct *task)
47{
48 void *stack_top = (void *)task->thread.esp0;
49 return stack_top - sizeof(struct pt_regs);
50}
51
52/*
 53 * this routine will get a word off of the process's privileged stack.
54 * the offset is how far from the base addr as stored in the TSS.
55 * this routine assumes that all the privileged stacks are in our
56 * data space.
57 */
58static inline int get_stack_long(struct task_struct *task, int offset)
59{
60 unsigned char *stack;
61
62 stack = (unsigned char *)task->thread.esp0;
63 stack += offset;
64 return (*((int *)stack));
65}
66
67/*
 68 * this routine will put a word on the process's privileged stack.
69 * the offset is how far from the base addr as stored in the TSS.
70 * this routine assumes that all the privileged stacks are in our
71 * data space.
72 */
73static inline int put_stack_long(struct task_struct *task, int offset,
74 unsigned long data)
75{
76 unsigned char * stack;
77
78 stack = (unsigned char *) task->thread.esp0;
79 stack += offset;
80 *(unsigned long *) stack = data;
81 return 0;
82}
83
84static int putreg(struct task_struct *child,
85 unsigned long regno, unsigned long value)
86{
87 switch (regno >> 2) {
88 case FS:
89 if (value && (value & 3) != 3)
90 return -EIO;
91 child->thread.fs = value;
92 return 0;
93 case GS:
94 if (value && (value & 3) != 3)
95 return -EIO;
96 child->thread.gs = value;
97 return 0;
98 case DS:
99 case ES:
100 if (value && (value & 3) != 3)
101 return -EIO;
102 value &= 0xffff;
103 break;
104 case SS:
105 case CS:
106 if ((value & 3) != 3)
107 return -EIO;
108 value &= 0xffff;
109 break;
110 case EFL:
111 value &= FLAG_MASK;
112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
113 break;
114 }
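	/*
	 * pt_regs has no fs/gs slots (those live in thread_struct and were
	 * handled above), so user-area register offsets beyond GS must be
	 * pulled back by two words before indexing the kernel stack frame.
	 */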
115 if (regno > GS*4)
116 regno -= 2*4;
117 put_stack_long(child, regno - sizeof(struct pt_regs), value);
118 return 0;
119}
120
121static unsigned long getreg(struct task_struct *child,
122 unsigned long regno)
123{
124 unsigned long retval = ~0UL;
125
126 switch (regno >> 2) {
127 case FS:
128 retval = child->thread.fs;
129 break;
130 case GS:
131 retval = child->thread.gs;
132 break;
133 case DS:
134 case ES:
135 case SS:
136 case CS:
137 retval = 0xffff;
138 /* fall through */
139 default:
140 if (regno > GS*4)
141 regno -= 2*4;
142 regno = regno - sizeof(struct pt_regs);
143 retval &= get_stack_long(child, regno);
144 }
145 return retval;
146}
147
148#define LDT_SEGMENT 4
149
150static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
151{
152 unsigned long addr, seg;
153
154 addr = regs->eip;
155 seg = regs->xcs & 0xffff;
156 if (regs->eflags & VM_MASK) {
157 addr = (addr & 0xffff) + (seg << 4);
158 return addr;
159 }
160
161 /*
162 * We'll assume that the code segments in the GDT
163 * are all zero-based. That is largely true: the
164 * TLS segments are used for data, and the PNPBIOS
165 * and APM bios ones we just ignore here.
166 */
167 if (seg & LDT_SEGMENT) {
168 u32 *desc;
169 unsigned long base;
170
171 down(&child->mm->context.sem);
172 desc = child->mm->context.ldt + (seg & ~7);
173 base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000);
174
175 /* 16-bit code segment? */
176 if (!((desc[1] >> 22) & 1))
177 addr &= 0xffff;
178 addr += base;
179 up(&child->mm->context.sem);
180 }
181 return addr;
182}
183
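/*
 * is_at_popf: check whether the instruction about to be single-stepped is a
 * popf (possibly behind prefixes), i.e. an instruction that itself rewrites
 * the eflags TF bit, so that set_singlestep() does not claim ownership of TF.
 */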
184static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs)
185{
186 int i, copied;
187 unsigned char opcode[16];
188 unsigned long addr = convert_eip_to_linear(child, regs);
189
190 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
191 for (i = 0; i < copied; i++) {
192 switch (opcode[i]) {
193 /* popf */
194 case 0x9d:
195 return 1;
 196			/* operand and address size prefixes */
197 case 0x66: case 0x67:
198 continue;
199 /* irrelevant prefixes (segment overrides and repeats) */
200 case 0x26: case 0x2e:
201 case 0x36: case 0x3e:
202 case 0x64: case 0x65:
203 case 0xf0: case 0xf2: case 0xf3:
204 continue;
205
206 /*
207 * pushf: NOTE! We should probably not let
208 * the user see the TF bit being set. But
209 * it's more pain than it's worth to avoid
210 * it, and a debugger could emulate this
211 * all in user space if it _really_ cares.
212 */
213 case 0x9c:
214 default:
215 return 0;
216 }
217 }
218 return 0;
219}
220
221static void set_singlestep(struct task_struct *child)
222{
223 struct pt_regs *regs = get_child_regs(child);
224
225 /*
226 * Always set TIF_SINGLESTEP - this guarantees that
227 * we single-step system calls etc.. This will also
228 * cause us to set TF when returning to user mode.
229 */
230 set_tsk_thread_flag(child, TIF_SINGLESTEP);
231
232 /*
233 * If TF was already set, don't do anything else
234 */
235 if (regs->eflags & TRAP_FLAG)
236 return;
237
238 /* Set TF on the kernel stack.. */
239 regs->eflags |= TRAP_FLAG;
240
241 /*
242 * ..but if TF is changed by the instruction we will trace,
243 * don't mark it as being "us" that set it, so that we
244 * won't clear it by hand later.
245 */
246 if (is_at_popf(child, regs))
247 return;
248
249 child->ptrace |= PT_DTRACE;
250}
251
252static void clear_singlestep(struct task_struct *child)
253{
254 /* Always clear TIF_SINGLESTEP... */
255 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
256
257 /* But touch TF only if it was set by us.. */
258 if (child->ptrace & PT_DTRACE) {
259 struct pt_regs *regs = get_child_regs(child);
260 regs->eflags &= ~TRAP_FLAG;
261 child->ptrace &= ~PT_DTRACE;
262 }
263}
264
265/*
266 * Called by kernel/ptrace.c when detaching..
267 *
268 * Make sure the single step bit is not set.
269 */
270void ptrace_disable(struct task_struct *child)
271{
272 clear_singlestep(child);
273}
274
275/*
276 * Perform get_thread_area on behalf of the traced child.
277 */
278static int
279ptrace_get_thread_area(struct task_struct *child,
280 int idx, struct user_desc __user *user_desc)
281{
282 struct user_desc info;
283 struct desc_struct *desc;
284
285/*
286 * Get the current Thread-Local Storage area:
287 */
288
289#define GET_BASE(desc) ( \
290 (((desc)->a >> 16) & 0x0000ffff) | \
291 (((desc)->b << 16) & 0x00ff0000) | \
292 ( (desc)->b & 0xff000000) )
293
294#define GET_LIMIT(desc) ( \
295 ((desc)->a & 0x0ffff) | \
296 ((desc)->b & 0xf0000) )
297
298#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
299#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
300#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
301#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
302#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
303#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
304
305 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
306 return -EINVAL;
307
308 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
309
310 info.entry_number = idx;
311 info.base_addr = GET_BASE(desc);
312 info.limit = GET_LIMIT(desc);
313 info.seg_32bit = GET_32BIT(desc);
314 info.contents = GET_CONTENTS(desc);
315 info.read_exec_only = !GET_WRITABLE(desc);
316 info.limit_in_pages = GET_LIMIT_PAGES(desc);
317 info.seg_not_present = !GET_PRESENT(desc);
318 info.useable = GET_USEABLE(desc);
319
320 if (copy_to_user(user_desc, &info, sizeof(info)))
321 return -EFAULT;
322
323 return 0;
324}
325
326/*
327 * Perform set_thread_area on behalf of the traced child.
328 */
329static int
330ptrace_set_thread_area(struct task_struct *child,
331 int idx, struct user_desc __user *user_desc)
332{
333 struct user_desc info;
334 struct desc_struct *desc;
335
336 if (copy_from_user(&info, user_desc, sizeof(info)))
337 return -EFAULT;
338
339 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
340 return -EINVAL;
341
342 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
343 if (LDT_empty(&info)) {
344 desc->a = 0;
345 desc->b = 0;
346 } else {
347 desc->a = LDT_entry_a(&info);
348 desc->b = LDT_entry_b(&info);
349 }
350
351 return 0;
352}
353
354asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
355{
356 struct task_struct *child;
357 struct user * dummy = NULL;
358 int i, ret;
359 unsigned long __user *datap = (unsigned long __user *)data;
360
361 lock_kernel();
362 ret = -EPERM;
363 if (request == PTRACE_TRACEME) {
364 /* are we already being traced? */
365 if (current->ptrace & PT_PTRACED)
366 goto out;
367 ret = security_ptrace(current->parent, current);
368 if (ret)
369 goto out;
370 /* set the ptrace bit in the process flags. */
371 current->ptrace |= PT_PTRACED;
372 ret = 0;
373 goto out;
374 }
375 ret = -ESRCH;
376 read_lock(&tasklist_lock);
377 child = find_task_by_pid(pid);
378 if (child)
379 get_task_struct(child);
380 read_unlock(&tasklist_lock);
381 if (!child)
382 goto out;
383
384 ret = -EPERM;
385 if (pid == 1) /* you may not mess with init */
386 goto out_tsk;
387
388 if (request == PTRACE_ATTACH) {
389 ret = ptrace_attach(child);
390 goto out_tsk;
391 }
392
393 ret = ptrace_check_attach(child, request == PTRACE_KILL);
394 if (ret < 0)
395 goto out_tsk;
396
397 switch (request) {
398 /* when I and D space are separate, these will need to be fixed. */
399 case PTRACE_PEEKTEXT: /* read word at location addr. */
400 case PTRACE_PEEKDATA: {
401 unsigned long tmp;
402 int copied;
403
404 copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
405 ret = -EIO;
406 if (copied != sizeof(tmp))
407 break;
408 ret = put_user(tmp, datap);
409 break;
410 }
411
412 /* read the word at location addr in the USER area. */
413 case PTRACE_PEEKUSR: {
414 unsigned long tmp;
415
416 ret = -EIO;
417 if ((addr & 3) || addr < 0 ||
418 addr > sizeof(struct user) - 3)
419 break;
420
421 tmp = 0; /* Default return condition */
422 if(addr < FRAME_SIZE*sizeof(long))
423 tmp = getreg(child, addr);
424 if(addr >= (long) &dummy->u_debugreg[0] &&
425 addr <= (long) &dummy->u_debugreg[7]){
426 addr -= (long) &dummy->u_debugreg[0];
427 addr = addr >> 2;
428 tmp = child->thread.debugreg[addr];
429 }
430 ret = put_user(tmp, datap);
431 break;
432 }
433
434 /* when I and D space are separate, this will have to be fixed. */
435 case PTRACE_POKETEXT: /* write the word at location addr. */
436 case PTRACE_POKEDATA:
437 ret = 0;
438 if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
439 break;
440 ret = -EIO;
441 break;
442
443 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
444 ret = -EIO;
445 if ((addr & 3) || addr < 0 ||
446 addr > sizeof(struct user) - 3)
447 break;
448
449 if (addr < FRAME_SIZE*sizeof(long)) {
450 ret = putreg(child, addr, data);
451 break;
452 }
453 /* We need to be very careful here. We implicitly
454 want to modify a portion of the task_struct, and we
455 have to be selective about what portions we allow someone
456 to modify. */
457
458 ret = -EIO;
459 if(addr >= (long) &dummy->u_debugreg[0] &&
460 addr <= (long) &dummy->u_debugreg[7]){
461
462 if(addr == (long) &dummy->u_debugreg[4]) break;
463 if(addr == (long) &dummy->u_debugreg[5]) break;
464 if(addr < (long) &dummy->u_debugreg[4] &&
465 ((unsigned long) data) >= TASK_SIZE-3) break;
466
 467			  /* Sanity-check data. Take one half-byte at a time with
468 * check = (val >> (16 + 4*i)) & 0xf. It contains the
469 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
470 * 2 and 3 are LENi. Given a list of invalid values,
471 * we do mask |= 1 << invalid_value, so that
472 * (mask >> check) & 1 is a correct test for invalid
473 * values.
474 *
475 * R/Wi contains the type of the breakpoint /
476 * watchpoint, LENi contains the length of the watched
477 * data in the watchpoint case.
478 *
479 * The invalid values are:
480 * - LENi == 0x10 (undefined), so mask |= 0x0f00.
481 * - R/Wi == 0x10 (break on I/O reads or writes), so
482 * mask |= 0x4444.
483 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
484 * 0x1110.
485 *
486 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
487 *
488 * See the Intel Manual "System Programming Guide",
489 * 15.2.4
490 *
491 * Note that LENi == 0x10 is defined on x86_64 in long
492 * mode (i.e. even for 32-bit userspace software, but
493 * 64-bit kernel), so the x86_64 mask value is 0x5454.
494 * See the AMD manual no. 24593 (AMD64 System
495 * Programming)*/
496
497 if(addr == (long) &dummy->u_debugreg[7]) {
498 data &= ~DR_CONTROL_RESERVED;
499 for(i=0; i<4; i++)
500 if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
501 goto out_tsk;
502 }
503
504 addr -= (long) &dummy->u_debugreg;
505 addr = addr >> 2;
506 child->thread.debugreg[addr] = data;
507 ret = 0;
508 }
509 break;
510
511 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
512 case PTRACE_CONT: /* restart after signal. */
513 ret = -EIO;
514 if ((unsigned long) data > _NSIG)
515 break;
516 if (request == PTRACE_SYSCALL) {
517 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
518 }
519 else {
520 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
521 }
522 child->exit_code = data;
523 /* make sure the single step bit is not set. */
524 clear_singlestep(child);
525 wake_up_process(child);
526 ret = 0;
527 break;
528
529/*
530 * make the child exit. Best I can do is send it a sigkill.
531 * perhaps it should be put in the status that it wants to
532 * exit.
533 */
534 case PTRACE_KILL:
535 ret = 0;
536 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
537 break;
538 child->exit_code = SIGKILL;
539 /* make sure the single step bit is not set. */
540 clear_singlestep(child);
541 wake_up_process(child);
542 break;
543
544 case PTRACE_SINGLESTEP: /* set the trap flag. */
545 ret = -EIO;
546 if ((unsigned long) data > _NSIG)
547 break;
548 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
549 set_singlestep(child);
550 child->exit_code = data;
551 /* give it a chance to run. */
552 wake_up_process(child);
553 ret = 0;
554 break;
555
556 case PTRACE_DETACH:
557 /* detach a process that was attached. */
558 ret = ptrace_detach(child, data);
559 break;
560
561 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
562 if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
563 ret = -EIO;
564 break;
565 }
566 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
567 __put_user(getreg(child, i), datap);
568 datap++;
569 }
570 ret = 0;
571 break;
572 }
573
574 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
575 unsigned long tmp;
576 if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
577 ret = -EIO;
578 break;
579 }
580 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
581 __get_user(tmp, datap);
582 putreg(child, i, tmp);
583 datap++;
584 }
585 ret = 0;
586 break;
587 }
588
589 case PTRACE_GETFPREGS: { /* Get the child FPU state. */
590 if (!access_ok(VERIFY_WRITE, datap,
591 sizeof(struct user_i387_struct))) {
592 ret = -EIO;
593 break;
594 }
595 ret = 0;
596 if (!tsk_used_math(child))
597 init_fpu(child);
598 get_fpregs((struct user_i387_struct __user *)data, child);
599 break;
600 }
601
602 case PTRACE_SETFPREGS: { /* Set the child FPU state. */
603 if (!access_ok(VERIFY_READ, datap,
604 sizeof(struct user_i387_struct))) {
605 ret = -EIO;
606 break;
607 }
608 set_stopped_child_used_math(child);
609 set_fpregs(child, (struct user_i387_struct __user *)data);
610 ret = 0;
611 break;
612 }
613
614 case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
615 if (!access_ok(VERIFY_WRITE, datap,
616 sizeof(struct user_fxsr_struct))) {
617 ret = -EIO;
618 break;
619 }
620 if (!tsk_used_math(child))
621 init_fpu(child);
622 ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
623 break;
624 }
625
626 case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
627 if (!access_ok(VERIFY_READ, datap,
628 sizeof(struct user_fxsr_struct))) {
629 ret = -EIO;
630 break;
631 }
632 set_stopped_child_used_math(child);
633 ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
634 break;
635 }
636
637 case PTRACE_GET_THREAD_AREA:
638 ret = ptrace_get_thread_area(child, addr,
639 (struct user_desc __user *) data);
640 break;
641
642 case PTRACE_SET_THREAD_AREA:
643 ret = ptrace_set_thread_area(child, addr,
644 (struct user_desc __user *) data);
645 break;
646
647 default:
648 ret = ptrace_request(child, request, addr, data);
649 break;
650 }
651out_tsk:
652 put_task_struct(child);
653out:
654 unlock_kernel();
655 return ret;
656}
657
658void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
659{
660 struct siginfo info;
661
662 tsk->thread.trap_no = 1;
663 tsk->thread.error_code = error_code;
664
665 memset(&info, 0, sizeof(info));
666 info.si_signo = SIGTRAP;
667 info.si_code = TRAP_BRKPT;
668
669 /* User-mode eip? */
670 info.si_addr = user_mode(regs) ? (void __user *) regs->eip : NULL;
671
672 /* Send us the fakey SIGTRAP */
673 force_sig_info(SIGTRAP, &info, tsk);
674}
675
676/* notification of system call entry/exit
677 * - triggered by current->work.syscall_trace
678 */
679__attribute__((regparm(3)))
680void do_syscall_trace(struct pt_regs *regs, int entryexit)
681{
682 /* do the secure computing check first */
683 secure_computing(regs->orig_eax);
684
685 if (unlikely(current->audit_context)) {
686 if (!entryexit)
687 audit_syscall_entry(current, regs->orig_eax,
688 regs->ebx, regs->ecx,
689 regs->edx, regs->esi);
690 else
691 audit_syscall_exit(current, regs->eax);
692 }
693
694 if (!(current->ptrace & PT_PTRACED))
695 return;
696
697 /* Fake a debug trap */
698 if (test_thread_flag(TIF_SINGLESTEP))
699 send_sigtrap(current, regs, 0);
700
701 if (!test_thread_flag(TIF_SYSCALL_TRACE))
702 return;
703
704 /* the 0x80 provides a way for the tracing parent to distinguish
705 between a syscall stop and SIGTRAP delivery */
706 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
707
708 /*
709 * this isn't the same as continuing with a signal, but it will do
710 * for normal use. strace only continues with a signal if the
711 * stopping signal is not SIGTRAP. -brl
712 */
713 if (current->exit_code) {
714 send_sig(current->exit_code, current, 1);
715 current->exit_code = 0;
716 }
717}
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
new file mode 100644
index 000000000000..aaf89cb2bc51
--- /dev/null
+++ b/arch/i386/kernel/quirks.c
@@ -0,0 +1,52 @@
1/*
2 * This file contains work-arounds for x86 and x86_64 platform bugs.
3 */
4#include <linux/config.h>
5#include <linux/pci.h>
6#include <linux/irq.h>
7
8#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
9
10static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
11{
12 u8 config, rev;
13 u32 word;
14
15 /* BIOS may enable hardware IRQ balancing for
16 * E7520/E7320/E7525(revision ID 0x9 and below)
17 * based platforms.
18 * Disable SW irqbalance/affinity on those platforms.
19 */
20 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
21 if (rev > 0x9)
22 return;
23
24 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
25
26 /* enable access to config space*/
27 pci_read_config_byte(dev, 0xf4, &config);
28 config |= 0x2;
29 pci_write_config_byte(dev, 0xf4, config);
30
31 /* read xTPR register */
32 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
33
34 if (!(word & (1 << 13))) {
35 printk(KERN_INFO "Disabling irq balancing and affinity\n");
36#ifdef CONFIG_IRQBALANCE
37 irqbalance_disable("");
38#endif
39 noirqdebug_setup("");
40#ifdef CONFIG_PROC_FS
41 no_irq_affinity = 1;
42#endif
43 }
44
45 config &= ~0x2;
46 /* disable access to config space*/
47 pci_write_config_byte(dev, 0xf4, config);
48}
49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
50DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
51DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
52#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
new file mode 100644
index 000000000000..3d7e994563df
--- /dev/null
+++ b/arch/i386/kernel/reboot.c
@@ -0,0 +1,382 @@
1/*
2 * linux/arch/i386/kernel/reboot.c
3 */
4
5#include <linux/mm.h>
6#include <linux/module.h>
7#include <linux/delay.h>
8#include <linux/init.h>
9#include <linux/interrupt.h>
10#include <linux/mc146818rtc.h>
11#include <linux/efi.h>
12#include <linux/dmi.h>
13#include <asm/uaccess.h>
14#include <asm/apic.h>
15#include "mach_reboot.h"
16
17/*
18 * Power off function, if any
19 */
20void (*pm_power_off)(void);
21
22static int reboot_mode;
23static int reboot_thru_bios;
24
25#ifdef CONFIG_SMP
26int reboot_smp = 0;
27static int reboot_cpu = -1;
28/* shamelessly grabbed from lib/vsprintf.c for readability */
29#define is_digit(c) ((c) >= '0' && (c) <= '9')
30#endif
31static int __init reboot_setup(char *str)
32{
33 while(1) {
34 switch (*str) {
35 case 'w': /* "warm" reboot (no memory testing etc) */
36 reboot_mode = 0x1234;
37 break;
38 case 'c': /* "cold" reboot (with memory testing etc) */
39 reboot_mode = 0x0;
40 break;
41 case 'b': /* "bios" reboot by jumping through the BIOS */
42 reboot_thru_bios = 1;
43 break;
44 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
45 reboot_thru_bios = 0;
46 break;
47#ifdef CONFIG_SMP
48 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
49 reboot_smp = 1;
50 if (is_digit(*(str+1))) {
51 reboot_cpu = (int) (*(str+1) - '0');
52 if (is_digit(*(str+2)))
53 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
54 }
55 /* we will leave sorting out the final value
56 when we are ready to reboot, since we might not
57 have set up boot_cpu_id or smp_num_cpu */
58 break;
59#endif
60 }
61 if((str = strchr(str,',')) != NULL)
62 str++;
63 else
64 break;
65 }
66 return 1;
67}
68
69__setup("reboot=", reboot_setup);
70
71/*
72 * Reboot options and system auto-detection code provided by
73 * Dell Inc. so their systems "just work". :-)
74 */
75
76/*
 77 * Some machines require the "reboot=b" commandline option; this quirk makes that automatic.
78 */
79static int __init set_bios_reboot(struct dmi_system_id *d)
80{
81 if (!reboot_thru_bios) {
82 reboot_thru_bios = 1;
83 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
84 }
85 return 0;
86}
87
88/*
 89 * Some machines require the "reboot=s" commandline option; this quirk makes that automatic.
90 */
91static int __init set_smp_reboot(struct dmi_system_id *d)
92{
93#ifdef CONFIG_SMP
94 if (!reboot_smp) {
95 reboot_smp = 1;
96 printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
97 }
98#endif
99 return 0;
100}
101
102/*
 103 * Some machines require the "reboot=b,s" commandline option; this quirk makes that automatic.
104 */
105static int __init set_smp_bios_reboot(struct dmi_system_id *d)
106{
107 set_smp_reboot(d);
108 set_bios_reboot(d);
109 return 0;
110}
111
112static struct dmi_system_id __initdata reboot_dmi_table[] = {
113 { /* Handle problems with rebooting on Dell 1300's */
114 .callback = set_smp_bios_reboot,
115 .ident = "Dell PowerEdge 1300",
116 .matches = {
117 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
118 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
119 },
120 },
121 { /* Handle problems with rebooting on Dell 300's */
122 .callback = set_bios_reboot,
123 .ident = "Dell PowerEdge 300",
124 .matches = {
125 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
126 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
127 },
128 },
129 { /* Handle problems with rebooting on Dell 2400's */
130 .callback = set_bios_reboot,
131 .ident = "Dell PowerEdge 2400",
132 .matches = {
133 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
134 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
135 },
136 },
137 { }
138};
139
140static int __init reboot_init(void)
141{
142 dmi_check_system(reboot_dmi_table);
143 return 0;
144}
145
146core_initcall(reboot_init);
147
148/* The following code and data reboots the machine by switching to real
149 mode and jumping to the BIOS reset entry point, as if the CPU has
150 really been reset. The previous version asked the keyboard
151 controller to pulse the CPU reset line, which is more thorough, but
152 doesn't work with at least one type of 486 motherboard. It is easy
153 to stop this code working; hence the copious comments. */
154
155static unsigned long long
156real_mode_gdt_entries [3] =
157{
158 0x0000000000000000ULL, /* Null descriptor */
159 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
160 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
161};
162
163static struct
164{
165 unsigned short size __attribute__ ((packed));
166 unsigned long long * base __attribute__ ((packed));
167}
168real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
169real_mode_idt = { 0x3ff, NULL },
170no_idt = { 0, NULL };
171
172
173/* This is 16-bit protected mode code to disable paging and the cache,
174 switch to real mode and jump to the BIOS reset code.
175
176 The instruction that switches to real mode by writing to CR0 must be
 177   followed immediately by a far jump instruction, which sets CS to a
178 valid value for real mode, and flushes the prefetch queue to avoid
179 running instructions that have already been decoded in protected
180 mode.
181
182 Clears all the flags except ET, especially PG (paging), PE
183 (protected-mode enable) and TS (task switch for coprocessor state
184 save). Flushes the TLB after paging has been disabled. Sets CD and
185 NW, to disable the cache on a 486, and invalidates the cache. This
186 is more like the state of a 486 after reset. I don't know if
187 something else should be done for other chips.
188
189 More could be done here to set up the registers as if a CPU reset had
190 occurred; hopefully real BIOSs don't assume much. */
191
192static unsigned char real_mode_switch [] =
193{
194 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
195 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
196 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
197 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
198 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
199 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
200 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
201 0x74, 0x02, /* jz f */
202 0x0f, 0x09, /* wbinvd */
203 0x24, 0x10, /* f: andb $0x10,al */
204 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
205};
206static unsigned char jump_to_bios [] =
207{
208 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
209};
210
211/*
212 * Switch to real mode and then execute the code
213 * specified by the code and length parameters.
 214 * We assume that length will always be less than 100!
215 */
216void machine_real_restart(unsigned char *code, int length)
217{
218 unsigned long flags;
219
220 local_irq_disable();
221
222 /* Write zero to CMOS register number 0x0f, which the BIOS POST
223 routine will recognize as telling it to do a proper reboot. (Well
224 that's what this book in front of me says -- it may only apply to
225 the Phoenix BIOS though, it's not clear). At the same time,
226 disable NMIs by setting the top bit in the CMOS address register,
227 as we're about to do peculiar things to the CPU. I'm not sure if
228 `outb_p' is needed instead of just `outb'. Use it to be on the
229 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
230 */
231
232 spin_lock_irqsave(&rtc_lock, flags);
233 CMOS_WRITE(0x00, 0x8f);
234 spin_unlock_irqrestore(&rtc_lock, flags);
235
236 /* Remap the kernel at virtual address zero, as well as offset zero
237 from the kernel segment. This assumes the kernel segment starts at
238 virtual address PAGE_OFFSET. */
239
240 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
241 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
242
243 /*
244 * Use `swapper_pg_dir' as our page directory.
245 */
246 load_cr3(swapper_pg_dir);
247
248 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
249 this on booting to tell it to "Bypass memory test (also warm
250 boot)". This seems like a fairly standard thing that gets set by
251 REBOOT.COM programs, and the previous reset routine did this
252 too. */
253
254 *((unsigned short *)0x472) = reboot_mode;
255
256 /* For the switch to real mode, copy some code to low memory. It has
257 to be in the first 64k because it is running in 16-bit mode, and it
258 has to have the same physical and virtual address, because it turns
259 off paging. Copy it near the end of the first page, out of the way
260 of BIOS variables. */
261
262 memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
263 real_mode_switch, sizeof (real_mode_switch));
264 memcpy ((void *) (0x1000 - 100), code, length);
265
266 /* Set up the IDT for real mode. */
267
268 __asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));
269
270 /* Set up a GDT from which we can load segment descriptors for real
271 mode. The GDT is not used in real mode; it is just needed here to
272 prepare the descriptors. */
273
274 __asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));
275
276 /* Load the data segment registers, and thus the descriptors ready for
277 real mode. The base address of each segment is 0x100, 16 times the
278 selector value being loaded here. This is so that the segment
279 registers don't have to be reloaded after switching to real mode:
280 the values are consistent for real mode operation already. */
281
282 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
283 "\tmovl %%eax,%%ds\n"
284 "\tmovl %%eax,%%es\n"
285 "\tmovl %%eax,%%fs\n"
286 "\tmovl %%eax,%%gs\n"
287 "\tmovl %%eax,%%ss" : : : "eax");
288
289 /* Jump to the 16-bit code that we copied earlier. It disables paging
290 and the cache, switches to real mode, and jumps to the BIOS reset
291 entry point. */
292
293 __asm__ __volatile__ ("ljmp $0x0008,%0"
294 :
295 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
296}
297
298void machine_restart(char * __unused)
299{
300#ifdef CONFIG_SMP
301 int cpuid;
302
303 cpuid = GET_APIC_ID(apic_read(APIC_ID));
304
305 if (reboot_smp) {
306
307 /* check to see if reboot_cpu is valid
 308		   if it's not, default to the BSP */
309 if ((reboot_cpu == -1) ||
310 (reboot_cpu > (NR_CPUS -1)) ||
311 !physid_isset(cpuid, phys_cpu_present_map))
312 reboot_cpu = boot_cpu_physical_apicid;
313
314 reboot_smp = 0; /* use this as a flag to only go through this once*/
315 /* re-run this function on the other CPUs
 316		   it will fall through this section since we have
317 cleared reboot_smp, and do the reboot if it is the
318 correct CPU, otherwise it halts. */
319 if (reboot_cpu != cpuid)
320 smp_call_function((void *)machine_restart , NULL, 1, 0);
321 }
322
 323	/* if reboot_cpu is still -1, then we want a traditional reboot,
 324	   and if we are not running on the reboot_cpu, halt */
325 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
326 for (;;)
327 __asm__ __volatile__ ("hlt");
328 }
329 /*
330 * Stop all CPUs and turn off local APICs and the IO-APIC, so
331 * other OSs see a clean IRQ state.
332 */
333 smp_send_stop();
334#endif /* CONFIG_SMP */
335
336 lapic_shutdown();
337
338#ifdef CONFIG_X86_IO_APIC
339 disable_IO_APIC();
340#endif
341
342 if (!reboot_thru_bios) {
343 if (efi_enabled) {
344 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
345 __asm__ __volatile__("lidt %0": :"m" (no_idt));
346 __asm__ __volatile__("int3");
347 }
348 /* rebooting needs to touch the page at absolute addr 0 */
349 *((unsigned short *)__va(0x472)) = reboot_mode;
350 for (;;) {
351 mach_reboot();
352 /* That didn't work - force a triple fault.. */
353 __asm__ __volatile__("lidt %0": :"m" (no_idt));
354 __asm__ __volatile__("int3");
355 }
356 }
357 if (efi_enabled)
358 efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
359
360 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
361}
362
363EXPORT_SYMBOL(machine_restart);
364
365void machine_halt(void)
366{
367}
368
369EXPORT_SYMBOL(machine_halt);
370
371void machine_power_off(void)
372{
373 lapic_shutdown();
374
375 if (efi_enabled)
376 efi.reset_system(EFI_RESET_SHUTDOWN, EFI_SUCCESS, 0, NULL);
377 if (pm_power_off)
378 pm_power_off();
379}
380
381EXPORT_SYMBOL(machine_power_off);
382
diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c
new file mode 100644
index 000000000000..69e203a0d330
--- /dev/null
+++ b/arch/i386/kernel/scx200.c
@@ -0,0 +1,167 @@
1/* linux/arch/i386/kernel/scx200.c
2
3 Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
4
5 National Semiconductor SCx200 support. */
6
7#include <linux/config.h>
8#include <linux/module.h>
9#include <linux/errno.h>
10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/pci.h>
13
14#include <linux/scx200.h>
15
16/* Verify that the configuration block really is there */
17#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
18
19#define NAME "scx200"
20
21MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
22MODULE_DESCRIPTION("NatSemi SCx200 Driver");
23MODULE_LICENSE("GPL");
24
25unsigned scx200_gpio_base = 0;
26long scx200_gpio_shadow[2];
27
28unsigned scx200_cb_base = 0;
29
30static struct pci_device_id scx200_tbl[] = {
31 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
32 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
33 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
34 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
35 { },
36};
37MODULE_DEVICE_TABLE(pci,scx200_tbl);
38
39static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
40
41static struct pci_driver scx200_pci_driver = {
42 .name = "scx200",
43 .id_table = scx200_tbl,
44 .probe = scx200_probe,
45};
46
47static DEFINE_SPINLOCK(scx200_gpio_config_lock);
48
49static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
50{
51 int bank;
52 unsigned base;
53
54 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
55 pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
56 base = pci_resource_start(pdev, 0);
57 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
58
59 if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) {
60 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
61 return -EBUSY;
62 }
63
64 scx200_gpio_base = base;
65
66 /* read the current values driven on the GPIO signals */
67 for (bank = 0; bank < 2; ++bank)
68 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
69
70 } else {
71 /* find the base of the Configuration Block */
72 if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
73 scx200_cb_base = SCx200_CB_BASE_FIXED;
74 } else {
75 pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
76 if (scx200_cb_probe(base)) {
77 scx200_cb_base = base;
78 } else {
79 printk(KERN_WARNING NAME ": Configuration Block not found\n");
80 return -ENODEV;
81 }
82 }
83 printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
84 }
85
86 return 0;
87}
88
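/*
 * Read-modify-write the configuration of GPIO pin 'index': select the pin
 * through the index register at GPIO base + 0x20, read its config word at
 * base + 0x24, keep the bits covered by 'mask', OR in 'bits', write it back
 * and return the previous config word.
 */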
89u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
90{
91 u32 config, new_config;
92 unsigned long flags;
93
94 spin_lock_irqsave(&scx200_gpio_config_lock, flags);
95
96 outl(index, scx200_gpio_base + 0x20);
97 config = inl(scx200_gpio_base + 0x24);
98
99 new_config = (config & mask) | bits;
100 outl(new_config, scx200_gpio_base + 0x24);
101
102 spin_unlock_irqrestore(&scx200_gpio_config_lock, flags);
103
104 return config;
105}
106
107#if 0
108void scx200_gpio_dump(unsigned index)
109{
110 u32 config = scx200_gpio_configure(index, ~0, 0);
111 printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config);
112
113 if (config & 1)
114 printk(" OE"); /* output enabled */
115 else
116 printk(" TS"); /* tristate */
117 if (config & 2)
118 printk(" PP"); /* push pull */
119 else
120 printk(" OD"); /* open drain */
121 if (config & 4)
122 printk(" PUE"); /* pull up enabled */
123 else
124 printk(" PUD"); /* pull up disabled */
125 if (config & 8)
126 printk(" LOCKED"); /* locked */
127 if (config & 16)
128 printk(" LEVEL"); /* level input */
129 else
130 printk(" EDGE"); /* edge input */
131 if (config & 32)
132 printk(" HI"); /* trigger on rising edge */
133 else
134 printk(" LO"); /* trigger on falling edge */
135 if (config & 64)
136 printk(" DEBOUNCE"); /* debounce */
137 printk("\n");
138}
139#endif /* 0 */
140
141static int __init scx200_init(void)
142{
143 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
144
145 return pci_module_init(&scx200_pci_driver);
146}
147
148static void __exit scx200_cleanup(void)
149{
150 pci_unregister_driver(&scx200_pci_driver);
151 release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
152}
153
154module_init(scx200_init);
155module_exit(scx200_cleanup);
156
157EXPORT_SYMBOL(scx200_gpio_base);
158EXPORT_SYMBOL(scx200_gpio_shadow);
159EXPORT_SYMBOL(scx200_gpio_configure);
160EXPORT_SYMBOL(scx200_cb_base);
161
162/*
163 Local variables:
164 compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules"
165 c-basic-offset: 8
166 End:
167*/
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
new file mode 100644
index 000000000000..469f496e55c0
--- /dev/null
+++ b/arch/i386/kernel/semaphore.c
@@ -0,0 +1,297 @@
1/*
2 * i386 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15#include <linux/config.h>
16#include <linux/sched.h>
17#include <linux/err.h>
18#include <linux/init.h>
19#include <asm/semaphore.h>
20
21/*
22 * Semaphores are implemented using a two-way counter:
23 * The "count" variable is decremented for each process
24 * that tries to acquire the semaphore, while the "sleeping"
25 * variable is a count of such acquires.
26 *
27 * Notably, the inline "up()" and "down()" functions can
28 * efficiently test if they need to do any extra work (up
29 * needs to do something only if count was negative before
 30 * the increment operation).
31 *
32 * "sleeping" and the contention routine ordering is protected
33 * by the spinlock in the semaphore's waitqueue head.
34 *
35 * Note that these functions are only called when there is
36 * contention on the lock, and as such all this is the
37 * "non-critical" part of the whole semaphore business. The
38 * critical part is the inline stuff in <asm/semaphore.h>
39 * where we want to avoid any extra jumps and calls.
40 */
41
42/*
43 * Logic:
44 * - only on a boundary condition do we need to care. When we go
45 * from a negative count to a non-negative, we wake people up.
46 * - when we go from a non-negative count to a negative do we
47 * (a) synchronize with the "sleeper" count and (b) make sure
48 * that we're on the wakeup list before we synchronize so that
49 * we cannot lose wakeup events.
50 */
51
52static fastcall void __attribute_used__ __up(struct semaphore *sem)
53{
54 wake_up(&sem->wait);
55}
56
57static fastcall void __attribute_used__ __sched __down(struct semaphore * sem)
58{
59 struct task_struct *tsk = current;
60 DECLARE_WAITQUEUE(wait, tsk);
61 unsigned long flags;
62
63 tsk->state = TASK_UNINTERRUPTIBLE;
64 spin_lock_irqsave(&sem->wait.lock, flags);
65 add_wait_queue_exclusive_locked(&sem->wait, &wait);
66
67 sem->sleepers++;
68 for (;;) {
69 int sleepers = sem->sleepers;
70
71 /*
72 * Add "everybody else" into it. They aren't
73 * playing, because we own the spinlock in
74 * the wait_queue_head.
75 */
76 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
77 sem->sleepers = 0;
78 break;
79 }
80 sem->sleepers = 1; /* us - see -1 above */
81 spin_unlock_irqrestore(&sem->wait.lock, flags);
82
83 schedule();
84
85 spin_lock_irqsave(&sem->wait.lock, flags);
86 tsk->state = TASK_UNINTERRUPTIBLE;
87 }
88 remove_wait_queue_locked(&sem->wait, &wait);
89 wake_up_locked(&sem->wait);
90 spin_unlock_irqrestore(&sem->wait.lock, flags);
91 tsk->state = TASK_RUNNING;
92}
93
94static fastcall int __attribute_used__ __sched __down_interruptible(struct semaphore * sem)
95{
96 int retval = 0;
97 struct task_struct *tsk = current;
98 DECLARE_WAITQUEUE(wait, tsk);
99 unsigned long flags;
100
101 tsk->state = TASK_INTERRUPTIBLE;
102 spin_lock_irqsave(&sem->wait.lock, flags);
103 add_wait_queue_exclusive_locked(&sem->wait, &wait);
104
105 sem->sleepers++;
106 for (;;) {
107 int sleepers = sem->sleepers;
108
109 /*
110 * With signals pending, this turns into
111 * the trylock failure case - we won't be
 112		 * sleeping, and we can't get the lock as
113 * it has contention. Just correct the count
114 * and exit.
115 */
116 if (signal_pending(current)) {
117 retval = -EINTR;
118 sem->sleepers = 0;
119 atomic_add(sleepers, &sem->count);
120 break;
121 }
122
123 /*
124 * Add "everybody else" into it. They aren't
125 * playing, because we own the spinlock in
126 * wait_queue_head. The "-1" is because we're
127 * still hoping to get the semaphore.
128 */
129 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
130 sem->sleepers = 0;
131 break;
132 }
133 sem->sleepers = 1; /* us - see -1 above */
134 spin_unlock_irqrestore(&sem->wait.lock, flags);
135
136 schedule();
137
138 spin_lock_irqsave(&sem->wait.lock, flags);
139 tsk->state = TASK_INTERRUPTIBLE;
140 }
141 remove_wait_queue_locked(&sem->wait, &wait);
142 wake_up_locked(&sem->wait);
143 spin_unlock_irqrestore(&sem->wait.lock, flags);
144
145 tsk->state = TASK_RUNNING;
146 return retval;
147}
148
149/*
150 * Trylock failed - make sure we correct for
151 * having decremented the count.
152 *
153 * We could have done the trylock with a
154 * single "cmpxchg" without failure cases,
155 * but then it wouldn't work on a 386.
156 */
157static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem)
158{
159 int sleepers;
160 unsigned long flags;
161
162 spin_lock_irqsave(&sem->wait.lock, flags);
163 sleepers = sem->sleepers + 1;
164 sem->sleepers = 0;
165
166 /*
167 * Add "everybody else" and us into it. They aren't
168 * playing, because we own the spinlock in the
169 * wait_queue_head.
170 */
171 if (!atomic_add_negative(sleepers, &sem->count)) {
172 wake_up_locked(&sem->wait);
173 }
174
175 spin_unlock_irqrestore(&sem->wait.lock, flags);
176 return 1;
177}
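/*
 * Illustrative failure path: with the semaphore already held (count 0,
 * sleepers 0), a trylock decrements count to -1 and lands here.  The
 * sleepers + 1 == 1 added back returns count to 0; since the add came
 * out non-negative, wake_up_locked() runs (a no-op on an empty queue),
 * and 1 is returned to report that the trylock failed.
 */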
178
179
180/*
181 * The semaphore operations have a special calling sequence that
182 * allows us to do a simpler in-line version of them. These routines
183 * need to convert that sequence back into the C sequence when
184 * there is contention on the semaphore.
185 *
186 * %eax contains the semaphore pointer on entry. Save the C-clobbered
187 * registers (%eax, %edx and %ecx) except %eax which is either a return
188 * value or just clobbered.
189 */
190asm(
191".section .sched.text\n"
192".align 4\n"
193".globl __down_failed\n"
194"__down_failed:\n\t"
195#if defined(CONFIG_FRAME_POINTER)
196 "pushl %ebp\n\t"
197 "movl %esp,%ebp\n\t"
198#endif
199 "pushl %edx\n\t"
200 "pushl %ecx\n\t"
201 "call __down\n\t"
202 "popl %ecx\n\t"
203 "popl %edx\n\t"
204#if defined(CONFIG_FRAME_POINTER)
205 "movl %ebp,%esp\n\t"
206 "popl %ebp\n\t"
207#endif
208 "ret"
209);
210
211asm(
212".section .sched.text\n"
213".align 4\n"
214".globl __down_failed_interruptible\n"
215"__down_failed_interruptible:\n\t"
216#if defined(CONFIG_FRAME_POINTER)
217 "pushl %ebp\n\t"
218 "movl %esp,%ebp\n\t"
219#endif
220 "pushl %edx\n\t"
221 "pushl %ecx\n\t"
222 "call __down_interruptible\n\t"
223 "popl %ecx\n\t"
224 "popl %edx\n\t"
225#if defined(CONFIG_FRAME_POINTER)
226 "movl %ebp,%esp\n\t"
227 "popl %ebp\n\t"
228#endif
229 "ret"
230);
231
232asm(
233".section .sched.text\n"
234".align 4\n"
235".globl __down_failed_trylock\n"
236"__down_failed_trylock:\n\t"
237#if defined(CONFIG_FRAME_POINTER)
238 "pushl %ebp\n\t"
239 "movl %esp,%ebp\n\t"
240#endif
241 "pushl %edx\n\t"
242 "pushl %ecx\n\t"
243 "call __down_trylock\n\t"
244 "popl %ecx\n\t"
245 "popl %edx\n\t"
246#if defined(CONFIG_FRAME_POINTER)
247 "movl %ebp,%esp\n\t"
248 "popl %ebp\n\t"
249#endif
250 "ret"
251);
252
253asm(
254".section .sched.text\n"
255".align 4\n"
256".globl __up_wakeup\n"
257"__up_wakeup:\n\t"
258 "pushl %edx\n\t"
259 "pushl %ecx\n\t"
260 "call __up\n\t"
261 "popl %ecx\n\t"
262 "popl %edx\n\t"
263 "ret"
264);
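/*
 * Roughly how the inline fast path reaches these stubs (a sketch of the
 * <asm/semaphore.h> side, not a verbatim copy): down() does
 *
 *         lock; decl (sem->count)
 *         js   1f                  # count went negative: contention
 *         ...                      # uncontended, carry on
 *   1:    lea  sem, %eax           # semaphore pointer in %eax
 *         call __down_failed
 *
 * The caller only treats %eax as clobbered, which is why the stubs above
 * save and restore %ecx and %edx by hand around the C call.
 */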
265
266/*
267 * rw spinlock fallbacks
268 */
269#if defined(CONFIG_SMP)
270asm(
271".section .sched.text\n"
272".align 4\n"
273".globl __write_lock_failed\n"
274"__write_lock_failed:\n\t"
275 LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n"
276"1: rep; nop\n\t"
277 "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
278 "jne 1b\n\t"
279 LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
280 "jnz __write_lock_failed\n\t"
281 "ret"
282);
283
284asm(
285".section .sched.text\n"
286".align 4\n"
287".globl __read_lock_failed\n"
288"__read_lock_failed:\n\t"
289 LOCK "incl (%eax)\n"
290"1: rep; nop\n\t"
291 "cmpl $1,(%eax)\n\t"
292 "js 1b\n\t"
293 LOCK "decl (%eax)\n\t"
294 "js __read_lock_failed\n\t"
295 "ret"
296);
297#endif
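/*
 * The bias trick, with illustrative numbers: the lock word starts at
 * RW_LOCK_BIAS (0x01000000 in this era's headers).  read_lock does
 * "lock; subl $1", write_lock does "lock; subl $RW_LOCK_BIAS".  A writer
 * that lost the race undoes its subtraction above, spins until the word
 * reads exactly RW_LOCK_BIAS again (no readers, no writer), then retries;
 * a reader that lost (the word went non-positive because a writer holds
 * or is taking the lock) undoes its decrement and spins until the word
 * is positive before retrying.
 */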
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
new file mode 100644
index 000000000000..945ec73163c8
--- /dev/null
+++ b/arch/i386/kernel/setup.c
@@ -0,0 +1,1535 @@
1/*
2 * linux/arch/i386/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 *
8 * Memory region support
9 * David Parsons <orc@pell.chi.il.us>, July-August 1999
10 *
11 * Added E820 sanitization routine (removes overlapping memory regions);
12 * Brian Moyle <bmoyle@mvista.com>, February 2001
13 *
14 * Moved CPU detection code to cpu/${cpu}.c
15 * Patrick Mochel <mochel@osdl.org>, March 2002
16 *
17 * Provisions for empty E820 memory regions (reported by certain BIOSes).
18 * Alex Achenbach <xela@slit.de>, December 2002.
19 *
20 */
21
22/*
23 * This file handles the architecture-dependent parts of initialization
24 */
25
26#include <linux/sched.h>
27#include <linux/mm.h>
28#include <linux/tty.h>
29#include <linux/ioport.h>
30#include <linux/acpi.h>
31#include <linux/apm_bios.h>
32#include <linux/initrd.h>
33#include <linux/bootmem.h>
34#include <linux/seq_file.h>
35#include <linux/console.h>
36#include <linux/mca.h>
37#include <linux/root_dev.h>
38#include <linux/highmem.h>
39#include <linux/module.h>
40#include <linux/efi.h>
41#include <linux/init.h>
42#include <linux/edd.h>
43#include <linux/nodemask.h>
44#include <video/edid.h>
45#include <asm/e820.h>
46#include <asm/mpspec.h>
47#include <asm/setup.h>
48#include <asm/arch_hooks.h>
49#include <asm/sections.h>
50#include <asm/io_apic.h>
51#include <asm/ist.h>
52#include <asm/io.h>
53#include "setup_arch_pre.h"
54#include <bios_ebda.h>
55
56/* This value is set up by the early boot code to point to the value
57 immediately after the boot time page tables. It contains a *physical*
58 address, and must not be in the .bss segment! */
59unsigned long init_pg_tables_end __initdata = ~0UL;
60
61int disable_pse __initdata = 0;
62
63/*
64 * Machine setup..
65 */
66
67#ifdef CONFIG_EFI
68int efi_enabled = 0;
69EXPORT_SYMBOL(efi_enabled);
70#endif
71
72/* cpu data as detected by the assembly code in head.S */
73struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
74/* common cpu data for all cpus */
75struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
76
77unsigned long mmu_cr4_features;
78
79#ifdef CONFIG_ACPI_INTERPRETER
80 int acpi_disabled = 0;
81#else
82 int acpi_disabled = 1;
83#endif
84EXPORT_SYMBOL(acpi_disabled);
85
86#ifdef CONFIG_ACPI_BOOT
87int __initdata acpi_force = 0;
88extern acpi_interrupt_flags acpi_sci_flags;
89#endif
90
91/* for MCA, but anyone else can use it if they want */
92unsigned int machine_id;
93unsigned int machine_submodel_id;
94unsigned int BIOS_revision;
95unsigned int mca_pentium_flag;
96
97/* For PCI or other memory-mapped resources */
98unsigned long pci_mem_start = 0x10000000;
99
100/* Boot loader ID as an integer, for the benefit of proc_dointvec */
101int bootloader_type;
102
103/* user-defined highmem size */
104static unsigned int highmem_pages = -1;
105
106/*
107 * Setup options
108 */
109struct drive_info_struct { char dummy[32]; } drive_info;
110struct screen_info screen_info;
111struct apm_info apm_info;
112struct sys_desc_table_struct {
113 unsigned short length;
114 unsigned char table[0];
115};
116struct edid_info edid_info;
117struct ist_info ist_info;
118struct e820map e820;
119
120extern void early_cpu_init(void);
121extern void dmi_scan_machine(void);
122extern void generic_apic_probe(char *);
123extern int root_mountflags;
124
125unsigned long saved_videomode;
126
127#define RAMDISK_IMAGE_START_MASK 0x07FF
128#define RAMDISK_PROMPT_FLAG 0x8000
129#define RAMDISK_LOAD_FLAG 0x4000
130
131static char command_line[COMMAND_LINE_SIZE];
132
133unsigned char __initdata boot_params[PARAM_SIZE];
134
135static struct resource data_resource = {
136 .name = "Kernel data",
137 .start = 0,
138 .end = 0,
139 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
140};
141
142static struct resource code_resource = {
143 .name = "Kernel code",
144 .start = 0,
145 .end = 0,
146 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
147};
148
149static struct resource system_rom_resource = {
150 .name = "System ROM",
151 .start = 0xf0000,
152 .end = 0xfffff,
153 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
154};
155
156static struct resource extension_rom_resource = {
157 .name = "Extension ROM",
158 .start = 0xe0000,
159 .end = 0xeffff,
160 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
161};
162
163static struct resource adapter_rom_resources[] = { {
164 .name = "Adapter ROM",
165 .start = 0xc8000,
166 .end = 0,
167 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
168}, {
169 .name = "Adapter ROM",
170 .start = 0,
171 .end = 0,
172 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
173}, {
174 .name = "Adapter ROM",
175 .start = 0,
176 .end = 0,
177 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
178}, {
179 .name = "Adapter ROM",
180 .start = 0,
181 .end = 0,
182 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
183}, {
184 .name = "Adapter ROM",
185 .start = 0,
186 .end = 0,
187 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
188}, {
189 .name = "Adapter ROM",
190 .start = 0,
191 .end = 0,
192 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
193} };
194
195#define ADAPTER_ROM_RESOURCES \
196 (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
197
198static struct resource video_rom_resource = {
199 .name = "Video ROM",
200 .start = 0xc0000,
201 .end = 0xc7fff,
202 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
203};
204
205static struct resource video_ram_resource = {
206 .name = "Video RAM area",
207 .start = 0xa0000,
208 .end = 0xbffff,
209 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
210};
211
212static struct resource standard_io_resources[] = { {
213 .name = "dma1",
214 .start = 0x0000,
215 .end = 0x001f,
216 .flags = IORESOURCE_BUSY | IORESOURCE_IO
217}, {
218 .name = "pic1",
219 .start = 0x0020,
220 .end = 0x0021,
221 .flags = IORESOURCE_BUSY | IORESOURCE_IO
222}, {
223 .name = "timer0",
224 .start = 0x0040,
225 .end = 0x0043,
226 .flags = IORESOURCE_BUSY | IORESOURCE_IO
227}, {
228 .name = "timer1",
229 .start = 0x0050,
230 .end = 0x0053,
231 .flags = IORESOURCE_BUSY | IORESOURCE_IO
232}, {
233 .name = "keyboard",
234 .start = 0x0060,
235 .end = 0x006f,
236 .flags = IORESOURCE_BUSY | IORESOURCE_IO
237}, {
238 .name = "dma page reg",
239 .start = 0x0080,
240 .end = 0x008f,
241 .flags = IORESOURCE_BUSY | IORESOURCE_IO
242}, {
243 .name = "pic2",
244 .start = 0x00a0,
245 .end = 0x00a1,
246 .flags = IORESOURCE_BUSY | IORESOURCE_IO
247}, {
248 .name = "dma2",
249 .start = 0x00c0,
250 .end = 0x00df,
251 .flags = IORESOURCE_BUSY | IORESOURCE_IO
252}, {
253 .name = "fpu",
254 .start = 0x00f0,
255 .end = 0x00ff,
256 .flags = IORESOURCE_BUSY | IORESOURCE_IO
257} };
258
259#define STANDARD_IO_RESOURCES \
260 (sizeof standard_io_resources / sizeof standard_io_resources[0])
261
262#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
263
264static int __init romchecksum(unsigned char *rom, unsigned long length)
265{
266 unsigned char *p, sum = 0;
267
268 for (p = rom; p < rom + length; p++)
269 sum += *p;
270 return sum == 0;
271}
272
273static void __init probe_roms(void)
274{
275 unsigned long start, length, upper;
276 unsigned char *rom;
277 int i;
278
279 /* video rom */
280 upper = adapter_rom_resources[0].start;
281 for (start = video_rom_resource.start; start < upper; start += 2048) {
282 rom = isa_bus_to_virt(start);
283 if (!romsignature(rom))
284 continue;
285
286 video_rom_resource.start = start;
287
288 /* 0 < length <= 0x7f * 512, historically */
289 length = rom[2] * 512;
290
291 /* if checksum okay, trust length byte */
292 if (length && romchecksum(rom, length))
293 video_rom_resource.end = start + length - 1;
294
295 request_resource(&iomem_resource, &video_rom_resource);
296 break;
297 }
298
299 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
300 if (start < upper)
301 start = upper;
302
303 /* system rom */
304 request_resource(&iomem_resource, &system_rom_resource);
305 upper = system_rom_resource.start;
306
307 /* check for extension rom (ignore length byte!) */
308 rom = isa_bus_to_virt(extension_rom_resource.start);
309 if (romsignature(rom)) {
310 length = extension_rom_resource.end - extension_rom_resource.start + 1;
311 if (romchecksum(rom, length)) {
312 request_resource(&iomem_resource, &extension_rom_resource);
313 upper = extension_rom_resource.start;
314 }
315 }
316
317 /* check for adapter roms on 2k boundaries */
318 for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
319 rom = isa_bus_to_virt(start);
320 if (!romsignature(rom))
321 continue;
322
323 /* 0 < length <= 0x7f * 512, historically */
324 length = rom[2] * 512;
325
326 /* but accept any length that fits if checksum okay */
327 if (!length || start + length > upper || !romchecksum(rom, length))
328 continue;
329
330 adapter_rom_resources[i].start = start;
331 adapter_rom_resources[i].end = start + length - 1;
332 request_resource(&iomem_resource, &adapter_rom_resources[i]);
333
334 start = adapter_rom_resources[i++].end & ~2047UL;
335 }
336}
337
338static void __init limit_regions(unsigned long long size)
339{
340 unsigned long long current_addr = 0;
341 int i;
342
343 if (efi_enabled) {
344 for (i = 0; i < memmap.nr_map; i++) {
345 current_addr = memmap.map[i].phys_addr +
346 (memmap.map[i].num_pages << 12);
347 if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) {
348 if (current_addr >= size) {
349 memmap.map[i].num_pages -=
350 (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
351 memmap.nr_map = i + 1;
352 return;
353 }
354 }
355 }
356 }
357 for (i = 0; i < e820.nr_map; i++) {
358 if (e820.map[i].type == E820_RAM) {
359 current_addr = e820.map[i].addr + e820.map[i].size;
360 if (current_addr >= size) {
361 e820.map[i].size -= current_addr-size;
362 e820.nr_map = i + 1;
363 return;
364 }
365 }
366 }
367}
368
369static void __init add_memory_region(unsigned long long start,
370 unsigned long long size, int type)
371{
372 int x;
373
374 if (!efi_enabled) {
375 x = e820.nr_map;
376
377 if (x == E820MAX) {
378 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
379 return;
380 }
381
382 e820.map[x].addr = start;
383 e820.map[x].size = size;
384 e820.map[x].type = type;
385 e820.nr_map++;
386 }
387} /* add_memory_region */
388
389#define E820_DEBUG 1
390
391static void __init print_memory_map(char *who)
392{
393 int i;
394
395 for (i = 0; i < e820.nr_map; i++) {
396 printk(" %s: %016Lx - %016Lx ", who,
397 e820.map[i].addr,
398 e820.map[i].addr + e820.map[i].size);
399 switch (e820.map[i].type) {
400 case E820_RAM: printk("(usable)\n");
401 break;
402 case E820_RESERVED:
403 printk("(reserved)\n");
404 break;
405 case E820_ACPI:
406 printk("(ACPI data)\n");
407 break;
408 case E820_NVS:
409 printk("(ACPI NVS)\n");
410 break;
411 default: printk("type %lu\n", e820.map[i].type);
412 break;
413 }
414 }
415}
416
417/*
418 * Sanitize the BIOS e820 map.
419 *
420 * Some e820 responses include overlapping entries. The following
421 * replaces the original e820 map with a new one, removing overlaps.
422 *
423 */
424struct change_member {
425 struct e820entry *pbios; /* pointer to original bios entry */
426 unsigned long long addr; /* address for this change point */
427};
428static struct change_member change_point_list[2*E820MAX] __initdata;
429static struct change_member *change_point[2*E820MAX] __initdata;
430static struct e820entry *overlap_list[E820MAX] __initdata;
431static struct e820entry new_bios[E820MAX] __initdata;
432
433static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
434{
435 struct change_member *change_tmp;
436 unsigned long current_type, last_type;
437 unsigned long long last_addr;
438 int chgidx, still_changing;
439 int overlap_entries;
440 int new_bios_entry;
441 int old_nr, new_nr, chg_nr;
442 int i;
443
444 /*
445 Visually we're performing the following (1,2,3,4 = memory types)...
446
447 Sample memory map (w/overlaps):
448 ____22__________________
449 ______________________4_
450 ____1111________________
451 _44_____________________
452 11111111________________
453 ____________________33__
454 ___________44___________
455 __________33333_________
456 ______________22________
457 ___________________2222_
458 _________111111111______
459 _____________________11_
460 _________________4______
461
462 Sanitized equivalent (no overlap):
463 1_______________________
464 _44_____________________
465 ___1____________________
466 ____22__________________
467 ______11________________
468 _________1______________
469 __________3_____________
470 ___________44___________
471 _____________33_________
472 _______________2________
473 ________________1_______
474 _________________4______
475 ___________________2____
476 ____________________33__
477 ______________________4_
478 */
479
480 /* if there's only one memory region, don't bother */
481 if (*pnr_map < 2)
482 return -1;
483
484 old_nr = *pnr_map;
485
486 /* bail out if we find any unreasonable addresses in bios map */
487 for (i=0; i<old_nr; i++)
488 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
489 return -1;
490
491 /* create pointers for initial change-point information (for sorting) */
492 for (i=0; i < 2*old_nr; i++)
493 change_point[i] = &change_point_list[i];
494
495 /* record all known change-points (starting and ending addresses),
496 omitting those that are for empty memory regions */
497 chgidx = 0;
498 for (i=0; i < old_nr; i++) {
499 if (biosmap[i].size != 0) {
500 change_point[chgidx]->addr = biosmap[i].addr;
501 change_point[chgidx++]->pbios = &biosmap[i];
502 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
503 change_point[chgidx++]->pbios = &biosmap[i];
504 }
505 }
506 chg_nr = chgidx; /* true number of change-points */
507
508 /* sort change-point list by memory addresses (low -> high) */
509 still_changing = 1;
510 while (still_changing) {
511 still_changing = 0;
512 for (i=1; i < chg_nr; i++) {
513 /* if <current_addr> > <last_addr>, swap */
514 /* or, if current=<start_addr> & last=<end_addr>, swap */
515 if ((change_point[i]->addr < change_point[i-1]->addr) ||
516 ((change_point[i]->addr == change_point[i-1]->addr) &&
517 (change_point[i]->addr == change_point[i]->pbios->addr) &&
518 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
519 )
520 {
521 change_tmp = change_point[i];
522 change_point[i] = change_point[i-1];
523 change_point[i-1] = change_tmp;
524 still_changing=1;
525 }
526 }
527 }
528
529 /* create a new bios memory map, removing overlaps */
530 overlap_entries=0; /* number of entries in the overlap table */
531 new_bios_entry=0; /* index for creating new bios map entries */
532 last_type = 0; /* start with undefined memory type */
533 last_addr = 0; /* start with 0 as last starting address */
534 /* loop through change-points, determining the effect on the new bios map */
535 for (chgidx=0; chgidx < chg_nr; chgidx++)
536 {
537 /* keep track of all overlapping bios entries */
538 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
539 {
540 /* add map entry to overlap list (> 1 entry implies an overlap) */
541 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
542 }
543 else
544 {
545 /* remove entry from list (order independent, so swap with last) */
546 for (i=0; i<overlap_entries; i++)
547 {
548 if (overlap_list[i] == change_point[chgidx]->pbios)
549 overlap_list[i] = overlap_list[overlap_entries-1];
550 }
551 overlap_entries--;
552 }
553 /* if there are overlapping entries, decide which "type" to use */
554 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
555 current_type = 0;
556 for (i=0; i<overlap_entries; i++)
557 if (overlap_list[i]->type > current_type)
558 current_type = overlap_list[i]->type;
559 /* continue building up new bios map based on this information */
560 if (current_type != last_type) {
561 if (last_type != 0) {
562 new_bios[new_bios_entry].size =
563 change_point[chgidx]->addr - last_addr;
564 /* move forward only if the new size was non-zero */
565 if (new_bios[new_bios_entry].size != 0)
566 if (++new_bios_entry >= E820MAX)
567 break; /* no more space left for new bios entries */
568 }
569 if (current_type != 0) {
570 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
571 new_bios[new_bios_entry].type = current_type;
572 last_addr=change_point[chgidx]->addr;
573 }
574 last_type = current_type;
575 }
576 }
577 new_nr = new_bios_entry; /* retain count for new bios entries */
578
579 /* copy new bios mapping into original location */
580 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
581 *pnr_map = new_nr;
582
583 return 0;
584}
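/*
 * A minimal worked example (hypothetical BIOS input):
 *   entry 0: addr 0x00000000, size 0x000a0000, type 1 (usable)
 *   entry 1: addr 0x00090000, size 0x00010000, type 2 (reserved)
 * Change points are 0x0, 0x90000 and 0xa0000 (where both entries end).
 * Walking them with "largest type wins" over the overlap list yields
 *   new entry 0: 0x00000000 - 0x0008ffff, type 1
 *   new entry 1: 0x00090000 - 0x0009ffff, type 2
 * so the reserved range is no longer shadowed by the usable one.
 */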
585
586/*
587 * Copy the BIOS e820 map into a safe place.
588 *
589 * Sanity-check it while we're at it..
590 *
591 * If we're lucky and live on a modern system, the setup code
592 * will have given us a memory map that we can use to properly
593 * set up memory. If we aren't, we'll fake a memory map.
594 *
595 * We check to see that the memory map contains at least 2 elements
596 * before we'll use it, because the detection code in setup.S may
597 * not be perfect and most every PC known to man has two memory
598 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
599 * thinkpad 560x, for example, does not cooperate with the memory
600 * detection code.)
601 */
602static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
603{
604 /* Only one memory region (or negative)? Ignore it */
605 if (nr_map < 2)
606 return -1;
607
608 do {
609 unsigned long long start = biosmap->addr;
610 unsigned long long size = biosmap->size;
611 unsigned long long end = start + size;
612 unsigned long type = biosmap->type;
613
614 /* Overflow in 64 bits? Ignore the memory map. */
615 if (start > end)
616 return -1;
617
618 /*
619 * Some BIOSes claim RAM in the 640k - 1M region.
620 * Not right. Fix it up.
621 */
622 if (type == E820_RAM) {
623 if (start < 0x100000ULL && end > 0xA0000ULL) {
624 if (start < 0xA0000ULL)
625 add_memory_region(start, 0xA0000ULL-start, type);
626 if (end <= 0x100000ULL)
627 continue;
628 start = 0x100000ULL;
629 size = end - start;
630 }
631 }
632 add_memory_region(start, size, type);
633 } while (biosmap++,--nr_map);
634 return 0;
635}
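/*
 * Clipping example (hypothetical entry): a single usable entry covering
 * 0x0 - 0x2000000 is split by the 640k - 1M fixup above into
 *   add_memory_region(0x0,      0xa0000,   E820_RAM)
 *   add_memory_region(0x100000, 0x1f00000, E820_RAM)
 * leaving the legacy 0xa0000 - 0xfffff hole out of the RAM map entirely.
 */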
636
637#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
638struct edd edd;
639#ifdef CONFIG_EDD_MODULE
640EXPORT_SYMBOL(edd);
641#endif
642/**
643 * copy_edd() - Copy the BIOS EDD information
644 * from boot_params into a safe place.
645 *
646 */
647static inline void copy_edd(void)
648{
649 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
650 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
651 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
652 edd.edd_info_nr = EDD_NR;
653}
654#else
655static inline void copy_edd(void)
656{
657}
658#endif
659
660/*
661 * Do NOT EVER look at the BIOS memory size location.
662 * It does not work on many machines.
663 */
664#define LOWMEMSIZE() (0x9f000)
665
666static void __init parse_cmdline_early (char ** cmdline_p)
667{
668 char c = ' ', *to = command_line, *from = saved_command_line;
669 int len = 0;
670 int userdef = 0;
671
672 /* Save unparsed command line copy for /proc/cmdline */
673 saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
674
675 for (;;) {
676 if (c != ' ')
677 goto next_char;
678 /*
679 * "mem=nopentium" disables the 4MB page tables.
680 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
681 * to <mem>, overriding the bios size.
682 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
683 * <start> to <start>+<mem>, overriding the bios size.
684 *
685 * HPA tells me bootloaders need to parse mem=, so no new
686 * option should be mem= [also see Documentation/i386/boot.txt]
687 */
688 if (!memcmp(from, "mem=", 4)) {
689 if (to != command_line)
690 to--;
691 if (!memcmp(from+4, "nopentium", 9)) {
692 from += 9+4;
693 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
694 disable_pse = 1;
695 } else {
696 /* If the user specifies memory size, we
697 * limit the BIOS-provided memory map to
698 * that size. exactmap can be used to specify
699 * the exact map. mem=number can be used to
700 * trim the existing memory map.
701 */
702 unsigned long long mem_size;
703
704 mem_size = memparse(from+4, &from);
705 limit_regions(mem_size);
706 userdef=1;
707 }
708 }
709
710 else if (!memcmp(from, "memmap=", 7)) {
711 if (to != command_line)
712 to--;
713 if (!memcmp(from+7, "exactmap", 8)) {
714 from += 8+7;
715 e820.nr_map = 0;
716 userdef = 1;
717 } else {
718 /* If the user specifies memory size, we
719 * limit the BIOS-provided memory map to
720 * that size. exactmap can be used to specify
721 * the exact map. mem=number can be used to
722 * trim the existing memory map.
723 */
724 unsigned long long start_at, mem_size;
725
726 mem_size = memparse(from+7, &from);
727 if (*from == '@') {
728 start_at = memparse(from+1, &from);
729 add_memory_region(start_at, mem_size, E820_RAM);
730 } else if (*from == '#') {
731 start_at = memparse(from+1, &from);
732 add_memory_region(start_at, mem_size, E820_ACPI);
733 } else if (*from == '$') {
734 start_at = memparse(from+1, &from);
735 add_memory_region(start_at, mem_size, E820_RESERVED);
736 } else {
737 limit_regions(mem_size);
738 userdef=1;
739 }
740 }
741 }
742
743 else if (!memcmp(from, "noexec=", 7))
744 noexec_setup(from + 7);
745
746
747#ifdef CONFIG_X86_SMP
748 /*
749 * If the BIOS enumerates physical processors before logical,
750 * maxcpus=N at enumeration-time can be used to disable HT.
751 */
752 else if (!memcmp(from, "maxcpus=", 8)) {
753 extern unsigned int maxcpus;
754
755 maxcpus = simple_strtoul(from + 8, NULL, 0);
756 }
757#endif
758
759#ifdef CONFIG_ACPI_BOOT
760 /* "acpi=off" disables both ACPI table parsing and interpreter */
761 else if (!memcmp(from, "acpi=off", 8)) {
762 disable_acpi();
763 }
764
765 /* acpi=force to over-ride black-list */
766 else if (!memcmp(from, "acpi=force", 10)) {
767 acpi_force = 1;
768 acpi_ht = 1;
769 acpi_disabled = 0;
770 }
771
772 /* acpi=strict disables out-of-spec workarounds */
773 else if (!memcmp(from, "acpi=strict", 11)) {
774 acpi_strict = 1;
775 }
776
777 /* Limit ACPI just to boot-time to enable HT */
778 else if (!memcmp(from, "acpi=ht", 7)) {
779 if (!acpi_force)
780 disable_acpi();
781 acpi_ht = 1;
782 }
783
784 /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
785 else if (!memcmp(from, "pci=noacpi", 10)) {
786 acpi_disable_pci();
787 }
788 /* "acpi=noirq" disables ACPI interrupt routing */
789 else if (!memcmp(from, "acpi=noirq", 10)) {
790 acpi_noirq_set();
791 }
792
793 else if (!memcmp(from, "acpi_sci=edge", 13))
794 acpi_sci_flags.trigger = 1;
795
796 else if (!memcmp(from, "acpi_sci=level", 14))
797 acpi_sci_flags.trigger = 3;
798
799 else if (!memcmp(from, "acpi_sci=high", 13))
800 acpi_sci_flags.polarity = 1;
801
802 else if (!memcmp(from, "acpi_sci=low", 12))
803 acpi_sci_flags.polarity = 3;
804
805#ifdef CONFIG_X86_IO_APIC
806 else if (!memcmp(from, "acpi_skip_timer_override", 24))
807 acpi_skip_timer_override = 1;
808#endif
809
810#ifdef CONFIG_X86_LOCAL_APIC
811 /* disable IO-APIC */
812 else if (!memcmp(from, "noapic", 6))
813 disable_ioapic_setup();
814#endif /* CONFIG_X86_LOCAL_APIC */
815#endif /* CONFIG_ACPI_BOOT */
816
817 /*
818 * highmem=size forces highmem to be exactly 'size' bytes.
819 * This works even on boxes that have no highmem otherwise.
820 * This also works to reduce highmem size on bigger boxes.
821 */
822 else if (!memcmp(from, "highmem=", 8))
823 highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
824
825 /*
826 * vmalloc=size forces the vmalloc area to be exactly 'size'
827 * bytes. This can be used to increase (or decrease) the
828 * vmalloc area - the default is 128m.
829 */
830 else if (!memcmp(from, "vmalloc=", 8))
831 __VMALLOC_RESERVE = memparse(from+8, &from);
832
833 next_char:
834 c = *(from++);
835 if (!c)
836 break;
837 if (COMMAND_LINE_SIZE <= ++len)
838 break;
839 *(to++) = c;
840 }
841 *to = '\0';
842 *cmdline_p = command_line;
843 if (userdef) {
844 printk(KERN_INFO "user-defined physical RAM map:\n");
845 print_memory_map("user");
846 }
847}
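/*
 * Example command lines handled above (values purely illustrative):
 *   mem=512M                        limit the e820 map to the first 512MB
 *   memmap=exactmap memmap=640K@0 memmap=511M@1M
 *                                   discard the BIOS map, build one by hand
 *   memmap=64M#0x7ff00000           mark a range as ACPI data ('$' = reserved)
 *   highmem=512M vmalloc=192M       force the highmem and vmalloc sizes
 */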
848
849/*
850 * Callback for efi_memory_walk.
851 */
852static int __init
853efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
854{
855 unsigned long *max_pfn = arg, pfn;
856
857 if (start < end) {
858 pfn = PFN_UP(end -1);
859 if (pfn > *max_pfn)
860 *max_pfn = pfn;
861 }
862 return 0;
863}
864
865
866/*
867 * Find the highest page frame number we have available
868 */
869void __init find_max_pfn(void)
870{
871 int i;
872
873 max_pfn = 0;
874 if (efi_enabled) {
875 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
876 return;
877 }
878
879 for (i = 0; i < e820.nr_map; i++) {
880 unsigned long start, end;
881 /* RAM? */
882 if (e820.map[i].type != E820_RAM)
883 continue;
884 start = PFN_UP(e820.map[i].addr);
885 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
886 if (start >= end)
887 continue;
888 if (end > max_pfn)
889 max_pfn = end;
890 }
891}
892
893/*
894 * Determine low and high memory ranges:
895 */
896unsigned long __init find_max_low_pfn(void)
897{
898 unsigned long max_low_pfn;
899
900 max_low_pfn = max_pfn;
901 if (max_low_pfn > MAXMEM_PFN) {
902 if (highmem_pages == -1)
903 highmem_pages = max_pfn - MAXMEM_PFN;
904 if (highmem_pages + MAXMEM_PFN < max_pfn)
905 max_pfn = MAXMEM_PFN + highmem_pages;
906 if (highmem_pages + MAXMEM_PFN > max_pfn) {
907 printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
908 highmem_pages = 0;
909 }
910 max_low_pfn = MAXMEM_PFN;
911#ifndef CONFIG_HIGHMEM
912 /* Maximum memory usable is what is directly addressable */
913 printk(KERN_WARNING "Warning: only %ldMB will be used.\n",
914 MAXMEM>>20);
915 if (max_pfn > MAX_NONPAE_PFN)
916 printk(KERN_WARNING "Use a PAE enabled kernel.\n");
917 else
918 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
919 max_pfn = MAXMEM_PFN;
920#else /* !CONFIG_HIGHMEM */
921#ifndef CONFIG_X86_PAE
922 if (max_pfn > MAX_NONPAE_PFN) {
923 max_pfn = MAX_NONPAE_PFN;
924 printk(KERN_WARNING "Warning: only 4GB will be used.\n");
925 printk(KERN_WARNING "Use a PAE enabled kernel.\n");
926 }
927#endif /* !CONFIG_X86_PAE */
928#endif /* !CONFIG_HIGHMEM */
929 } else {
930 if (highmem_pages == -1)
931 highmem_pages = 0;
932#ifdef CONFIG_HIGHMEM
933 if (highmem_pages >= max_pfn) {
934 printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
935 highmem_pages = 0;
936 }
937 if (highmem_pages) {
938 if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
939 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
940 highmem_pages = 0;
941 }
942 max_low_pfn -= highmem_pages;
943 }
944#else
945 if (highmem_pages)
946 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
947#endif
948 }
949 return max_low_pfn;
950}
951
952/*
953 * Free all available memory for boot time allocation. Used
954 * as a callback function by efi_memory_walk()
955 */
956
957static int __init
958free_available_memory(unsigned long start, unsigned long end, void *arg)
959{
960 /* check max_low_pfn */
961 if (start >= ((max_low_pfn + 1) << PAGE_SHIFT))
962 return 0;
963 if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
964 end = (max_low_pfn + 1) << PAGE_SHIFT;
965 if (start < end)
966 free_bootmem(start, end - start);
967
968 return 0;
969}
970/*
971 * Register fully available low RAM pages with the bootmem allocator.
972 */
973static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
974{
975 int i;
976
977 if (efi_enabled) {
978 efi_memmap_walk(free_available_memory, NULL);
979 return;
980 }
981 for (i = 0; i < e820.nr_map; i++) {
982 unsigned long curr_pfn, last_pfn, size;
983 /*
984 * Reserve usable low memory
985 */
986 if (e820.map[i].type != E820_RAM)
987 continue;
988 /*
989 * We are rounding up the start address of usable memory:
990 */
991 curr_pfn = PFN_UP(e820.map[i].addr);
992 if (curr_pfn >= max_low_pfn)
993 continue;
994 /*
995 * ... and at the end of the usable range downwards:
996 */
997 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
998
999 if (last_pfn > max_low_pfn)
1000 last_pfn = max_low_pfn;
1001
1002 /*
1003 * .. finally, did all the rounding and playing
1004 * around just make the area go away?
1005 */
1006 if (last_pfn <= curr_pfn)
1007 continue;
1008
1009 size = last_pfn - curr_pfn;
1010 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1011 }
1012}
1013
1014/*
1015 * workaround for Dell systems that neglect to reserve EBDA
1016 */
1017static void __init reserve_ebda_region(void)
1018{
1019 unsigned int addr;
1020 addr = get_bios_ebda();
1021 if (addr)
1022 reserve_bootmem(addr, PAGE_SIZE);
1023}
1024
1025#ifndef CONFIG_DISCONTIGMEM
1026void __init setup_bootmem_allocator(void);
1027static unsigned long __init setup_memory(void)
1028{
1029 /*
1030 * partially used pages are not usable - thus
1031 * we are rounding upwards:
1032 */
1033 min_low_pfn = PFN_UP(init_pg_tables_end);
1034
1035 find_max_pfn();
1036
1037 max_low_pfn = find_max_low_pfn();
1038
1039#ifdef CONFIG_HIGHMEM
1040 highstart_pfn = highend_pfn = max_pfn;
1041 if (max_pfn > max_low_pfn) {
1042 highstart_pfn = max_low_pfn;
1043 }
1044 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
1045 pages_to_mb(highend_pfn - highstart_pfn));
1046#endif
1047 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
1048 pages_to_mb(max_low_pfn));
1049
1050 setup_bootmem_allocator();
1051
1052 return max_low_pfn;
1053}
1054
1055void __init zone_sizes_init(void)
1056{
1057 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
1058 unsigned int max_dma, low;
1059
1060 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
1061 low = max_low_pfn;
1062
1063 if (low < max_dma)
1064 zones_size[ZONE_DMA] = low;
1065 else {
1066 zones_size[ZONE_DMA] = max_dma;
1067 zones_size[ZONE_NORMAL] = low - max_dma;
1068#ifdef CONFIG_HIGHMEM
1069 zones_size[ZONE_HIGHMEM] = highend_pfn - low;
1070#endif
1071 }
1072 free_area_init(zones_size);
1073}
1074#else
1075extern unsigned long setup_memory(void);
1076extern void zone_sizes_init(void);
1077#endif /* !CONFIG_DISCONTIGMEM */
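/*
 * Illustrative split for zone_sizes_init() above: suppose max_low_pfn is
 * 229376 (896MB) and highend_pfn is 262144 (1GB).  max_dma covers the
 * first 16MB (4096 pages), so ZONE_DMA = 4096, ZONE_NORMAL = 225280 and,
 * with CONFIG_HIGHMEM, ZONE_HIGHMEM = 32768 pages.
 */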
1078
1079void __init setup_bootmem_allocator(void)
1080{
1081 unsigned long bootmap_size;
1082 /*
1083 * Initialize the boot-time allocator (with low memory only):
1084 */
1085 bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
1086
1087 register_bootmem_low_pages(max_low_pfn);
1088
1089 /*
1090 * Reserve the bootmem bitmap itself as well. We do this in two
1091 * steps (first step was init_bootmem()) because this catches
1092 * the (very unlikely) case of us accidentally initializing the
1093 * bootmem allocator with an invalid RAM area.
1094 */
1095 reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
1096 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
1097
1098 /*
1099 * reserve physical page 0 - it's a special BIOS page on many boxes,
1100 * enabling clean reboots, SMP operation, laptop functions.
1101 */
1102 reserve_bootmem(0, PAGE_SIZE);
1103
1104 /* reserve EBDA region, it's a 4K region */
1105 reserve_ebda_region();
1106
1107 /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
1108 PCI prefetch into it (errata #56). Usually the page is reserved anyway,
1109 unless you have no PS/2 mouse plugged in. */
1110 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
1111 boot_cpu_data.x86 == 6)
1112 reserve_bootmem(0xa0000 - 4096, 4096);
1113
1114#ifdef CONFIG_SMP
1115 /*
1116 * But first pinch a few for the stack/trampoline stuff
1117 * FIXME: Don't need the extra page at 4K, but need to fix
1118 * trampoline before removing it. (see the GDT stuff)
1119 */
1120 reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
1121#endif
1122#ifdef CONFIG_ACPI_SLEEP
1123 /*
1124 * Reserve low memory region for sleep support.
1125 */
1126 acpi_reserve_bootmem();
1127#endif
1128#ifdef CONFIG_X86_FIND_SMP_CONFIG
1129 /*
1130 * Find and reserve possible boot-time SMP configuration:
1131 */
1132 find_smp_config();
1133#endif
1134
1135#ifdef CONFIG_BLK_DEV_INITRD
1136 if (LOADER_TYPE && INITRD_START) {
1137 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
1138 reserve_bootmem(INITRD_START, INITRD_SIZE);
1139 initrd_start =
1140 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
1141 initrd_end = initrd_start+INITRD_SIZE;
1142 }
1143 else {
1144 printk(KERN_ERR "initrd extends beyond end of memory "
1145 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
1146 INITRD_START + INITRD_SIZE,
1147 max_low_pfn << PAGE_SHIFT);
1148 initrd_start = 0;
1149 }
1150 }
1151#endif
1152}
1153
1154/*
1155 * The node 0 pgdat is initialized before all of these because
1156 * it's needed for bootmem. node>0 pgdats have their virtual
1157 * space allocated before the pagetables are in place to access
1158 * them, so they can't be cleared then.
1159 *
1160 * This should all compile down to nothing when NUMA is off.
1161 */
1162void __init remapped_pgdat_init(void)
1163{
1164 int nid;
1165
1166 for_each_online_node(nid) {
1167 if (nid != 0)
1168 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
1169 }
1170}
1171
1172/*
1173 * Request address space for all standard RAM and ROM resources
1174 * and also for regions reported as reserved by the e820.
1175 */
1176static void __init
1177legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
1178{
1179 int i;
1180
1181 probe_roms();
1182 for (i = 0; i < e820.nr_map; i++) {
1183 struct resource *res;
1184 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
1185 continue;
1186 res = alloc_bootmem_low(sizeof(struct resource));
1187 switch (e820.map[i].type) {
1188 case E820_RAM: res->name = "System RAM"; break;
1189 case E820_ACPI: res->name = "ACPI Tables"; break;
1190 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
1191 default: res->name = "reserved";
1192 }
1193 res->start = e820.map[i].addr;
1194 res->end = res->start + e820.map[i].size - 1;
1195 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1196 request_resource(&iomem_resource, res);
1197 if (e820.map[i].type == E820_RAM) {
1198 /*
1199 * We don't know which RAM region contains kernel data,
1200 * so we try it repeatedly and let the resource manager
1201 * test it.
1202 */
1203 request_resource(res, code_resource);
1204 request_resource(res, data_resource);
1205 }
1206 }
1207}
1208
1209/*
1210 * Request address space for all standard resources
1211 */
1212static void __init register_memory(void)
1213{
1214 unsigned long gapstart, gapsize;
1215 unsigned long long last;
1216 int i;
1217
1218 if (efi_enabled)
1219 efi_initialize_iomem_resources(&code_resource, &data_resource);
1220 else
1221 legacy_init_iomem_resources(&code_resource, &data_resource);
1222
1223 /* EFI systems may still have VGA */
1224 request_resource(&iomem_resource, &video_ram_resource);
1225
1226 /* request I/O space for devices used on all i[345]86 PCs */
1227 for (i = 0; i < STANDARD_IO_RESOURCES; i++)
1228 request_resource(&ioport_resource, &standard_io_resources[i]);
1229
1230 /*
1231 * Search for the biggest gap in the low 32 bits of the e820
1232 * memory space.
1233 */
1234 last = 0x100000000ull;
1235 gapstart = 0x10000000;
1236 gapsize = 0x400000;
1237 i = e820.nr_map;
1238 while (--i >= 0) {
1239 unsigned long long start = e820.map[i].addr;
1240 unsigned long long end = start + e820.map[i].size;
1241
1242 /*
1243 * Since "last" is at most 4GB, we know we'll
1244 * fit in 32 bits if this condition is true
1245 */
1246 if (last > end) {
1247 unsigned long gap = last - end;
1248
1249 if (gap > gapsize) {
1250 gapsize = gap;
1251 gapstart = end;
1252 }
1253 }
1254 if (start < last)
1255 last = start;
1256 }
1257
1258 /*
1259 * Start allocating dynamic PCI memory a bit into the gap,
1260 * aligned up to the nearest megabyte.
1261 *
1262 * Question: should we try to pad it up a bit (do something
1263 * like " + (gapsize >> 3)" in there too?). We now have the
1264 * technology.
1265 */
1266 pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
1267
1268 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1269 pci_mem_start, gapstart, gapsize);
1270}
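/*
 * Gap search example (hypothetical map): with RAM ending at 0xcff00000
 * and the next reserved range starting at 0xfec00000, the largest gap
 * below 4GB starts at 0xcff00000, so pci_mem_start is rounded up to
 * 0xd0000000 and dynamic PCI allocations begin there.
 */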
1271
1272/* Use inline assembly to define this because the nops are defined
1273 as inline assembly strings in the include files and we cannot
1274 get them easily into strings. */
1275asm("\t.data\nintelnops: "
1276 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
1277 GENERIC_NOP7 GENERIC_NOP8);
1278asm("\t.data\nk8nops: "
1279 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
1280 K8_NOP7 K8_NOP8);
1281asm("\t.data\nk7nops: "
1282 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
1283 K7_NOP7 K7_NOP8);
1284
1285extern unsigned char intelnops[], k8nops[], k7nops[];
1286static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
1287 NULL,
1288 intelnops,
1289 intelnops + 1,
1290 intelnops + 1 + 2,
1291 intelnops + 1 + 2 + 3,
1292 intelnops + 1 + 2 + 3 + 4,
1293 intelnops + 1 + 2 + 3 + 4 + 5,
1294 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
1295 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
1296};
1297static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
1298 NULL,
1299 k8nops,
1300 k8nops + 1,
1301 k8nops + 1 + 2,
1302 k8nops + 1 + 2 + 3,
1303 k8nops + 1 + 2 + 3 + 4,
1304 k8nops + 1 + 2 + 3 + 4 + 5,
1305 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
1306 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
1307};
1308static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
1309 NULL,
1310 k7nops,
1311 k7nops + 1,
1312 k7nops + 1 + 2,
1313 k7nops + 1 + 2 + 3,
1314 k7nops + 1 + 2 + 3 + 4,
1315 k7nops + 1 + 2 + 3 + 4 + 5,
1316 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
1317 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
1318};
1319static struct nop {
1320 int cpuid;
1321 unsigned char **noptable;
1322} noptypes[] = {
1323 { X86_FEATURE_K8, k8_nops },
1324 { X86_FEATURE_K7, k7_nops },
1325 { -1, NULL }
1326};
1327
1328/* Replace instructions with better alternatives for this CPU type.
1329
1330 This runs before SMP is initialized to avoid SMP problems with
1331 self-modifying code. This implies that asymmetric systems where
1332 APs have fewer capabilities than the boot processor are not handled.
1333 In this case boot with "noreplacement". */
1334void apply_alternatives(void *start, void *end)
1335{
1336 struct alt_instr *a;
1337 int diff, i, k;
1338 unsigned char **noptable = intel_nops;
1339 for (i = 0; noptypes[i].cpuid >= 0; i++) {
1340 if (boot_cpu_has(noptypes[i].cpuid)) {
1341 noptable = noptypes[i].noptable;
1342 break;
1343 }
1344 }
1345 for (a = start; (void *)a < end; a++) {
1346 if (!boot_cpu_has(a->cpuid))
1347 continue;
1348 BUG_ON(a->replacementlen > a->instrlen);
1349 memcpy(a->instr, a->replacement, a->replacementlen);
1350 diff = a->instrlen - a->replacementlen;
1351 /* Pad the rest with nops */
1352 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
1353 k = diff;
1354 if (k > ASM_NOP_MAX)
1355 k = ASM_NOP_MAX;
1356 memcpy(a->instr + i, noptable[k], k);
1357 }
1358 }
1359}
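/*
 * Padding example (illustrative sizes): replacing a 10-byte sequence with
 * a 3-byte alternative leaves diff = 7, filled by the single 7-byte nop in
 * noptable[7].  A 12-byte hole would take an 8-byte nop followed by a
 * 4-byte one, since at most ASM_NOP_MAX (8) bytes are copied per pass.
 */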
1360
1361static int no_replacement __initdata = 0;
1362
1363void __init alternative_instructions(void)
1364{
1365 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
1366 if (no_replacement)
1367 return;
1368 apply_alternatives(__alt_instructions, __alt_instructions_end);
1369}
1370
1371static int __init noreplacement_setup(char *s)
1372{
1373 no_replacement = 1;
1374 return 0;
1375}
1376
1377__setup("noreplacement", noreplacement_setup);
1378
1379static char * __init machine_specific_memory_setup(void);
1380
1381#ifdef CONFIG_MCA
1382static void set_mca_bus(int x)
1383{
1384 MCA_bus = x;
1385}
1386#else
1387static void set_mca_bus(int x) { }
1388#endif
1389
1390/*
1391 * Determine if we were loaded by an EFI loader. If so, then we have also been
1392 * passed the efi memmap, systab, etc., so we should use these data structures
1393 * for initialization. Note, the efi init code path is determined by the
1394 * global efi_enabled. This allows the same kernel image to be used on existing
1395 * systems (with a traditional BIOS) as well as on EFI systems.
1396 */
1397void __init setup_arch(char **cmdline_p)
1398{
1399 unsigned long max_low_pfn;
1400
1401 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
1402 pre_setup_arch_hook();
1403 early_cpu_init();
1404
1405 /*
1406 * FIXME: This isn't an official loader_type right
1407 * now but does currently work with elilo.
1408 * If we were configured as an EFI kernel, check to make
1409 * sure that we were loaded correctly from elilo and that
1410 * the system table is valid. If not, then initialize normally.
1411 */
1412#ifdef CONFIG_EFI
1413 if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
1414 efi_enabled = 1;
1415#endif
1416
1417 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
1418 drive_info = DRIVE_INFO;
1419 screen_info = SCREEN_INFO;
1420 edid_info = EDID_INFO;
1421 apm_info.bios = APM_BIOS_INFO;
1422 ist_info = IST_INFO;
1423 saved_videomode = VIDEO_MODE;
1424 if( SYS_DESC_TABLE.length != 0 ) {
1425 set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
1426 machine_id = SYS_DESC_TABLE.table[0];
1427 machine_submodel_id = SYS_DESC_TABLE.table[1];
1428 BIOS_revision = SYS_DESC_TABLE.table[2];
1429 }
1430 bootloader_type = LOADER_TYPE;
1431
1432#ifdef CONFIG_BLK_DEV_RAM
1433 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
1434 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
1435 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
1436#endif
1437 ARCH_SETUP
1438 if (efi_enabled)
1439 efi_init();
1440 else {
1441 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1442 print_memory_map(machine_specific_memory_setup());
1443 }
1444
1445 copy_edd();
1446
1447 if (!MOUNT_ROOT_RDONLY)
1448 root_mountflags &= ~MS_RDONLY;
1449 init_mm.start_code = (unsigned long) _text;
1450 init_mm.end_code = (unsigned long) _etext;
1451 init_mm.end_data = (unsigned long) _edata;
1452 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
1453
1454 code_resource.start = virt_to_phys(_text);
1455 code_resource.end = virt_to_phys(_etext)-1;
1456 data_resource.start = virt_to_phys(_etext);
1457 data_resource.end = virt_to_phys(_edata)-1;
1458
1459 parse_cmdline_early(cmdline_p);
1460
1461 max_low_pfn = setup_memory();
1462
1463 /*
1464 * NOTE: before this point _nobody_ is allowed to allocate
1465 * any memory using the bootmem allocator. Although the
1466 * allocator is now initialised, only the first 8MB of the kernel
1467 * virtual address space has been mapped. All allocations before
1468 * paging_init() has completed must use the alloc_bootmem_low_pages()
1469 * variant (which allocates DMA'able memory) and care must be taken
1470 * not to exceed the 8MB limit.
1471 */
1472
1473#ifdef CONFIG_SMP
1474 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
1475#endif
1476 paging_init();
1477 remapped_pgdat_init();
1478 zone_sizes_init();
1479
1480 /*
1481 * NOTE: at this point the bootmem allocator is fully available.
1482 */
1483
1484#ifdef CONFIG_EARLY_PRINTK
1485 {
1486 char *s = strstr(*cmdline_p, "earlyprintk=");
1487 if (s) {
1488 extern void setup_early_printk(char *);
1489
1490 setup_early_printk(s);
1491 printk("early console enabled\n");
1492 }
1493 }
1494#endif
1495
1496
1497 dmi_scan_machine();
1498
1499#ifdef CONFIG_X86_GENERICARCH
1500 generic_apic_probe(*cmdline_p);
1501#endif
1502 if (efi_enabled)
1503 efi_map_memmap();
1504
1505 /*
1506 * Parse the ACPI tables for possible boot-time SMP configuration.
1507 */
1508 acpi_boot_table_init();
1509 acpi_boot_init();
1510
1511#ifdef CONFIG_X86_LOCAL_APIC
1512 if (smp_found_config)
1513 get_smp_config();
1514#endif
1515
1516 register_memory();
1517
1518#ifdef CONFIG_VT
1519#if defined(CONFIG_VGA_CONSOLE)
1520 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
1521 conswitchp = &vga_con;
1522#elif defined(CONFIG_DUMMY_CONSOLE)
1523 conswitchp = &dummy_con;
1524#endif
1525#endif
1526}
1527
1528#include "setup_arch_post.h"
1529/*
1530 * Local Variables:
1531 * mode:c
1532 * c-file-style:"k&r"
1533 * c-basic-offset:8
1534 * End:
1535 */
diff --git a/arch/i386/kernel/sigframe.h b/arch/i386/kernel/sigframe.h
new file mode 100644
index 000000000000..d21b14f5c25c
--- /dev/null
+++ b/arch/i386/kernel/sigframe.h
@@ -0,0 +1,21 @@
1struct sigframe
2{
3 char *pretcode;
4 int sig;
5 struct sigcontext sc;
6 struct _fpstate fpstate;
7 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8];
9};
10
11struct rt_sigframe
12{
13 char *pretcode;
14 int sig;
15 struct siginfo *pinfo;
16 void *puc;
17 struct siginfo info;
18 struct ucontext uc;
19 struct _fpstate fpstate;
20 char retcode[8];
21};
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
new file mode 100644
index 000000000000..ef3602e1c052
--- /dev/null
+++ b/arch/i386/kernel/signal.c
@@ -0,0 +1,665 @@
1/*
2 * linux/arch/i386/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
7 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/smp_lock.h>
14#include <linux/kernel.h>
15#include <linux/signal.h>
16#include <linux/errno.h>
17#include <linux/wait.h>
18#include <linux/unistd.h>
19#include <linux/stddef.h>
20#include <linux/personality.h>
21#include <linux/suspend.h>
22#include <linux/ptrace.h>
23#include <linux/elf.h>
24#include <asm/processor.h>
25#include <asm/ucontext.h>
26#include <asm/uaccess.h>
27#include <asm/i387.h>
28#include "sigframe.h"
29
30#define DEBUG_SIG 0
31
32#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
33
34/*
35 * Atomically swap in the new signal mask, and wait for a signal.
36 */
37asmlinkage int
38sys_sigsuspend(int history0, int history1, old_sigset_t mask)
39{
40 struct pt_regs * regs = (struct pt_regs *) &history0;
41 sigset_t saveset;
42
43 mask &= _BLOCKABLE;
44 spin_lock_irq(&current->sighand->siglock);
45 saveset = current->blocked;
46 siginitset(&current->blocked, mask);
47 recalc_sigpending();
48 spin_unlock_irq(&current->sighand->siglock);
49
50 regs->eax = -EINTR;
51 while (1) {
52 current->state = TASK_INTERRUPTIBLE;
53 schedule();
54 if (do_signal(regs, &saveset))
55 return -EINTR;
56 }
57}
58
59asmlinkage int
60sys_rt_sigsuspend(struct pt_regs regs)
61{
62 sigset_t saveset, newset;
63
64 /* XXX: Don't preclude handling different sized sigset_t's. */
65 if (regs.ecx != sizeof(sigset_t))
66 return -EINVAL;
67
68 if (copy_from_user(&newset, (sigset_t __user *)regs.ebx, sizeof(newset)))
69 return -EFAULT;
70 sigdelsetmask(&newset, ~_BLOCKABLE);
71
72 spin_lock_irq(&current->sighand->siglock);
73 saveset = current->blocked;
74 current->blocked = newset;
75 recalc_sigpending();
76 spin_unlock_irq(&current->sighand->siglock);
77
78 regs.eax = -EINTR;
79 while (1) {
80 current->state = TASK_INTERRUPTIBLE;
81 schedule();
82 if (do_signal(&regs, &saveset))
83 return -EINTR;
84 }
85}
86
87asmlinkage int
88sys_sigaction(int sig, const struct old_sigaction __user *act,
89 struct old_sigaction __user *oact)
90{
91 struct k_sigaction new_ka, old_ka;
92 int ret;
93
94 if (act) {
95 old_sigset_t mask;
96 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
97 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
98 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
99 return -EFAULT;
100 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
101 __get_user(mask, &act->sa_mask);
102 siginitset(&new_ka.sa.sa_mask, mask);
103 }
104
105 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
106
107 if (!ret && oact) {
108 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
109 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
110 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
111 return -EFAULT;
112 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
113 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
114 }
115
116 return ret;
117}
118
119asmlinkage int
120sys_sigaltstack(unsigned long ebx)
121{
122 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
123 struct pt_regs *regs = (struct pt_regs *)&ebx;
124 const stack_t __user *uss = (const stack_t __user *)ebx;
125 stack_t __user *uoss = (stack_t __user *)regs->ecx;
126
127 return do_sigaltstack(uss, uoss, regs->esp);
128}
129
130
131/*
132 * Do a signal return; undo the signal stack.
133 */
134
135static int
136restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
137{
138 unsigned int err = 0;
139
140 /* Always make any pending restarted system calls return -EINTR */
141 current_thread_info()->restart_block.fn = do_no_restart_syscall;
142
143#define COPY(x) err |= __get_user(regs->x, &sc->x)
144
145#define COPY_SEG(seg) \
146 { unsigned short tmp; \
147 err |= __get_user(tmp, &sc->seg); \
148 regs->x##seg = tmp; }
149
150#define COPY_SEG_STRICT(seg) \
151 { unsigned short tmp; \
152 err |= __get_user(tmp, &sc->seg); \
153 regs->x##seg = tmp|3; }
154
155#define GET_SEG(seg) \
156 { unsigned short tmp; \
157 err |= __get_user(tmp, &sc->seg); \
158 loadsegment(seg,tmp); }
159
160#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \
161 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
162 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
163
164 GET_SEG(gs);
165 GET_SEG(fs);
166 COPY_SEG(es);
167 COPY_SEG(ds);
168 COPY(edi);
169 COPY(esi);
170 COPY(ebp);
171 COPY(esp);
172 COPY(ebx);
173 COPY(edx);
174 COPY(ecx);
175 COPY(eip);
176 COPY_SEG_STRICT(cs);
177 COPY_SEG_STRICT(ss);
178
179 {
180 unsigned int tmpflags;
181 err |= __get_user(tmpflags, &sc->eflags);
182 regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
183 regs->orig_eax = -1; /* disable syscall checks */
184 }
185
186 {
187 struct _fpstate __user * buf;
188 err |= __get_user(buf, &sc->fpstate);
189 if (buf) {
190 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
191 goto badframe;
192 err |= restore_i387(buf);
193 } else {
194 struct task_struct *me = current;
195 if (used_math()) {
196 clear_fpu(me);
197 clear_used_math();
198 }
199 }
200 }
201
202 err |= __get_user(*peax, &sc->eax);
203 return err;
204
205badframe:
206 return 1;
207}
208
209asmlinkage int sys_sigreturn(unsigned long __unused)
210{
211 struct pt_regs *regs = (struct pt_regs *) &__unused;
212 struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
213 sigset_t set;
214 int eax;
215
216 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
217 goto badframe;
218 if (__get_user(set.sig[0], &frame->sc.oldmask)
219 || (_NSIG_WORDS > 1
220 && __copy_from_user(&set.sig[1], &frame->extramask,
221 sizeof(frame->extramask))))
222 goto badframe;
223
224 sigdelsetmask(&set, ~_BLOCKABLE);
225 spin_lock_irq(&current->sighand->siglock);
226 current->blocked = set;
227 recalc_sigpending();
228 spin_unlock_irq(&current->sighand->siglock);
229
230 if (restore_sigcontext(regs, &frame->sc, &eax))
231 goto badframe;
232 return eax;
233
234badframe:
235 force_sig(SIGSEGV, current);
236 return 0;
237}
238
239asmlinkage int sys_rt_sigreturn(unsigned long __unused)
240{
241 struct pt_regs *regs = (struct pt_regs *) &__unused;
242 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
243 sigset_t set;
244 int eax;
245
246 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
247 goto badframe;
248 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
249 goto badframe;
250
251 sigdelsetmask(&set, ~_BLOCKABLE);
252 spin_lock_irq(&current->sighand->siglock);
253 current->blocked = set;
254 recalc_sigpending();
255 spin_unlock_irq(&current->sighand->siglock);
256
257 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
258 goto badframe;
259
260 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
261 goto badframe;
262
263 return eax;
264
265badframe:
266 force_sig(SIGSEGV, current);
267 return 0;
268}
269
270/*
271 * Set up a signal frame.
272 */
273
274static int
275setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
276 struct pt_regs *regs, unsigned long mask)
277{
278 int tmp, err = 0;
279
280 tmp = 0;
281 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
282 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
283 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
284 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
285
286 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
287 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
288 err |= __put_user(regs->edi, &sc->edi);
289 err |= __put_user(regs->esi, &sc->esi);
290 err |= __put_user(regs->ebp, &sc->ebp);
291 err |= __put_user(regs->esp, &sc->esp);
292 err |= __put_user(regs->ebx, &sc->ebx);
293 err |= __put_user(regs->edx, &sc->edx);
294 err |= __put_user(regs->ecx, &sc->ecx);
295 err |= __put_user(regs->eax, &sc->eax);
296 err |= __put_user(current->thread.trap_no, &sc->trapno);
297 err |= __put_user(current->thread.error_code, &sc->err);
298 err |= __put_user(regs->eip, &sc->eip);
299 err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
300 err |= __put_user(regs->eflags, &sc->eflags);
301 err |= __put_user(regs->esp, &sc->esp_at_signal);
302 err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
303
304 tmp = save_i387(fpstate);
305 if (tmp < 0)
306 err = 1;
307 else
308 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
309
310 /* non-iBCS2 extensions.. */
311 err |= __put_user(mask, &sc->oldmask);
312 err |= __put_user(current->thread.cr2, &sc->cr2);
313
314 return err;
315}
316
317/*
318 * Determine which stack to use..
319 */
320static inline void __user *
321get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
322{
323 unsigned long esp;
324
325 /* Default to using normal stack */
326 esp = regs->esp;
327
328 /* This is the X/Open sanctioned signal stack switching. */
329 if (ka->sa.sa_flags & SA_ONSTACK) {
330 if (sas_ss_flags(esp) == 0)
331 esp = current->sas_ss_sp + current->sas_ss_size;
332 }
333
334 /* This is the legacy signal stack switching. */
335 else if ((regs->xss & 0xffff) != __USER_DS &&
336 !(ka->sa.sa_flags & SA_RESTORER) &&
337 ka->sa.sa_restorer) {
338 esp = (unsigned long) ka->sa.sa_restorer;
339 }
340
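	/* Round the frame down to an 8-byte boundary. */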
341 return (void __user *)((esp - frame_size) & -8ul);
342}
343
344/* These symbols are defined with the addresses in the vsyscall page.
345 See vsyscall-sigreturn.S. */
346extern void __user __kernel_sigreturn;
347extern void __user __kernel_rt_sigreturn;
348
349static void setup_frame(int sig, struct k_sigaction *ka,
350 sigset_t *set, struct pt_regs * regs)
351{
352 void __user *restorer;
353 struct sigframe __user *frame;
354 int err = 0;
355 int usig;
356
357 frame = get_sigframe(ka, regs, sizeof(*frame));
358
359 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
360 goto give_sigsegv;
361
362 usig = current_thread_info()->exec_domain
363 && current_thread_info()->exec_domain->signal_invmap
364 && sig < 32
365 ? current_thread_info()->exec_domain->signal_invmap[sig]
366 : sig;
367
368 err = __put_user(usig, &frame->sig);
369 if (err)
370 goto give_sigsegv;
371
372 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
373 if (err)
374 goto give_sigsegv;
375
376 if (_NSIG_WORDS > 1) {
377 err = __copy_to_user(&frame->extramask, &set->sig[1],
378 sizeof(frame->extramask));
379 if (err)
380 goto give_sigsegv;
381 }
382
383 restorer = &__kernel_sigreturn;
384 if (ka->sa.sa_flags & SA_RESTORER)
385 restorer = ka->sa.sa_restorer;
386
387 /* Set up to return from userspace. */
388 err |= __put_user(restorer, &frame->pretcode);
389
390 /*
391 * This is popl %eax ; movl $,%eax ; int $0x80
392 *
393 * WE DO NOT USE IT ANY MORE! It's only left here for historical
394 * reasons and because gdb uses it as a signature to notice
395 * signal handler stack frames.
396 */
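	/*
	 * Byte layout written below (little-endian): 58 = popl %eax,
	 * b8 imm32 = movl $__NR_sigreturn,%eax, cd 80 = int $0x80.
	 */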
397 err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
398 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
399 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
400
401 if (err)
402 goto give_sigsegv;
403
404 /* Set up registers for signal handler */
405 regs->esp = (unsigned long) frame;
406 regs->eip = (unsigned long) ka->sa.sa_handler;
407 regs->eax = (unsigned long) sig;
408 regs->edx = (unsigned long) 0;
409 regs->ecx = (unsigned long) 0;
410
411 set_fs(USER_DS);
412 regs->xds = __USER_DS;
413 regs->xes = __USER_DS;
414 regs->xss = __USER_DS;
415 regs->xcs = __USER_CS;
416
417 /*
418 * Clear TF when entering the signal handler, but
419 * notify any tracer that was single-stepping it.
420 * The tracer may want to single-step inside the
421 * handler too.
422 */
423 regs->eflags &= ~TF_MASK;
424 if (test_thread_flag(TIF_SINGLESTEP))
425 ptrace_notify(SIGTRAP);
426
427#if DEBUG_SIG
428 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
429 current->comm, current->pid, frame, regs->eip, frame->pretcode);
430#endif
431
432 return;
433
434give_sigsegv:
435 force_sigsegv(sig, current);
436}
437
438static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
439 sigset_t *set, struct pt_regs * regs)
440{
441 void __user *restorer;
442 struct rt_sigframe __user *frame;
443 int err = 0;
444 int usig;
445
446 frame = get_sigframe(ka, regs, sizeof(*frame));
447
448 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
449 goto give_sigsegv;
450
451 usig = current_thread_info()->exec_domain
452 && current_thread_info()->exec_domain->signal_invmap
453 && sig < 32
454 ? current_thread_info()->exec_domain->signal_invmap[sig]
455 : sig;
456
457 err |= __put_user(usig, &frame->sig);
458 err |= __put_user(&frame->info, &frame->pinfo);
459 err |= __put_user(&frame->uc, &frame->puc);
460 err |= copy_siginfo_to_user(&frame->info, info);
461 if (err)
462 goto give_sigsegv;
463
464 /* Create the ucontext. */
465 err |= __put_user(0, &frame->uc.uc_flags);
466 err |= __put_user(0, &frame->uc.uc_link);
467 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
468 err |= __put_user(sas_ss_flags(regs->esp),
469 &frame->uc.uc_stack.ss_flags);
470 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
471 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
472 regs, set->sig[0]);
473 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
474 if (err)
475 goto give_sigsegv;
476
477 /* Set up to return from userspace. */
478 restorer = &__kernel_rt_sigreturn;
479 if (ka->sa.sa_flags & SA_RESTORER)
480 restorer = ka->sa.sa_restorer;
481 err |= __put_user(restorer, &frame->pretcode);
482
483 /*
484 * This is movl $,%eax ; int $0x80
485 *
486 * WE DO NOT USE IT ANY MORE! It's only left here for historical
487 * reasons and because gdb uses it as a signature to notice
488 * signal handler stack frames.
489 */
490 err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
491 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
492 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
493
494 if (err)
495 goto give_sigsegv;
496
497 /* Set up registers for signal handler */
498 regs->esp = (unsigned long) frame;
499 regs->eip = (unsigned long) ka->sa.sa_handler;
500 regs->eax = (unsigned long) usig;
501 regs->edx = (unsigned long) &frame->info;
502 regs->ecx = (unsigned long) &frame->uc;
503
504 set_fs(USER_DS);
505 regs->xds = __USER_DS;
506 regs->xes = __USER_DS;
507 regs->xss = __USER_DS;
508 regs->xcs = __USER_CS;
509
510 /*
511 * Clear TF when entering the signal handler, but
512 * notify any tracer that was single-stepping it.
513 * The tracer may want to single-step inside the
514 * handler too.
515 */
516 regs->eflags &= ~TF_MASK;
517 if (test_thread_flag(TIF_SINGLESTEP))
518 ptrace_notify(SIGTRAP);
519
520#if DEBUG_SIG
521 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
522 current->comm, current->pid, frame, regs->eip, frame->pretcode);
523#endif
524
525 return;
526
527give_sigsegv:
528 force_sigsegv(sig, current);
529}
530
531/*
532 * OK, we're invoking a handler
533 */
534
535static void
536handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
537 sigset_t *oldset, struct pt_regs * regs)
538{
539 /* Are we from a system call? */
540 if (regs->orig_eax >= 0) {
541 /* If so, check system call restarting.. */
542 switch (regs->eax) {
543 case -ERESTART_RESTARTBLOCK:
544 case -ERESTARTNOHAND:
545 regs->eax = -EINTR;
546 break;
547
548 case -ERESTARTSYS:
549 if (!(ka->sa.sa_flags & SA_RESTART)) {
550 regs->eax = -EINTR;
551 break;
552 }
553 /* fallthrough */
554 case -ERESTARTNOINTR:
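			/*
			 * Re-issue the call: restore the syscall number in
			 * eax and back eip up over the two-byte int $0x80
			 * (or sysenter) instruction.
			 */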
555 regs->eax = regs->orig_eax;
556 regs->eip -= 2;
557 }
558 }
559
560 /*
561 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
562 * that register information in the sigcontext is correct.
563 */
564 if (unlikely(regs->eflags & TF_MASK)
565 && likely(current->ptrace & PT_DTRACE)) {
566 current->ptrace &= ~PT_DTRACE;
567 regs->eflags &= ~TF_MASK;
568 }
569
570 /* Set up the stack frame */
571 if (ka->sa.sa_flags & SA_SIGINFO)
572 setup_rt_frame(sig, ka, info, oldset, regs);
573 else
574 setup_frame(sig, ka, oldset, regs);
575
576 if (!(ka->sa.sa_flags & SA_NODEFER)) {
577 spin_lock_irq(&current->sighand->siglock);
578 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
579 sigaddset(&current->blocked,sig);
580 recalc_sigpending();
581 spin_unlock_irq(&current->sighand->siglock);
582 }
583}
584
585/*
586 * Note that 'init' is a special process: it doesn't get signals it doesn't
587 * want to handle. Thus you cannot kill init with a SIGKILL, not even
588 * by mistake.
589 */
590int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset)
591{
592 siginfo_t info;
593 int signr;
594 struct k_sigaction ka;
595
596 /*
597 * We want the common case to go fast, which
598 * is why we may in certain cases get here from
599 * kernel mode. Just return without doing anything
600 * if so.
601 */
602 if ((regs->xcs & 3) != 3)
603 return 1;
604
605 if (current->flags & PF_FREEZE) {
606 refrigerator(0);
607 goto no_signal;
608 }
609
610 if (!oldset)
611 oldset = &current->blocked;
612
613 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
614 if (signr > 0) {
615 /* Reenable any watchpoints before delivering the
616 * signal to user space. The processor register will
617 * have been cleared if the watchpoint triggered
618 * inside the kernel.
619 */
620 if (unlikely(current->thread.debugreg[7])) {
621 __asm__("movl %0,%%db7" : : "r" (current->thread.debugreg[7]));
622 }
623
624 /* Whee! Actually deliver the signal. */
625 handle_signal(signr, &info, &ka, oldset, regs);
626 return 1;
627 }
628
629 no_signal:
630 /* Did we come from a system call? */
631 if (regs->orig_eax >= 0) {
632 /* Restart the system call - no handlers present */
633 if (regs->eax == -ERESTARTNOHAND ||
634 regs->eax == -ERESTARTSYS ||
635 regs->eax == -ERESTARTNOINTR) {
636 regs->eax = regs->orig_eax;
637 regs->eip -= 2;
638 }
639 if (regs->eax == -ERESTART_RESTARTBLOCK){
640 regs->eax = __NR_restart_syscall;
641 regs->eip -= 2;
642 }
643 }
644 return 0;
645}
646
647/*
648 * notification of userspace execution resumption
649 * - triggered by current->work.notify_resume
650 */
651__attribute__((regparm(3)))
652void do_notify_resume(struct pt_regs *regs, sigset_t *oldset,
653 __u32 thread_info_flags)
654{
655 /* Pending single-step? */
656 if (thread_info_flags & _TIF_SINGLESTEP) {
657 regs->eflags |= TF_MASK;
658 clear_thread_flag(TIF_SINGLESTEP);
659 }
660 /* deal with pending signal delivery */
661 if (thread_info_flags & _TIF_SIGPENDING)
662 do_signal(regs,oldset);
663
664 clear_thread_flag(TIF_IRET);
665}
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
new file mode 100644
index 000000000000..6223c33ac91c
--- /dev/null
+++ b/arch/i386/kernel/smp.c
@@ -0,0 +1,612 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * This code is released under the GNU General Public License version 2 or
8 * later.
9 */
10
11#include <linux/init.h>
12
13#include <linux/mm.h>
14#include <linux/irq.h>
15#include <linux/delay.h>
16#include <linux/spinlock.h>
17#include <linux/smp_lock.h>
18#include <linux/kernel_stat.h>
19#include <linux/mc146818rtc.h>
20#include <linux/cache.h>
21#include <linux/interrupt.h>
22
23#include <asm/mtrr.h>
24#include <asm/tlbflush.h>
25#include <mach_apic.h>
26
27/*
28 * Some notes on x86 processor bugs affecting SMP operation:
29 *
30 * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
31 * The Linux implications for SMP are handled as follows:
32 *
33 * Pentium III / [Xeon]
34 * None of the E1AP-E3AP errata are visible to the user.
35 *
36 * E1AP. see PII A1AP
37 * E2AP. see PII A2AP
38 * E3AP. see PII A3AP
39 *
40 * Pentium II / [Xeon]
41 * None of the A1AP-A3AP errata are visible to the user.
42 *
43 * A1AP. see PPro 1AP
44 * A2AP. see PPro 2AP
45 * A3AP. see PPro 7AP
46 *
47 * Pentium Pro
48 * None of 1AP-9AP errata are visible to the normal user,
49 * except occasional delivery of 'spurious interrupt' as trap #15.
50 * This is very rare and a non-problem.
51 *
52 * 1AP. Linux maps APIC as non-cacheable
53 * 2AP. worked around in hardware
54 * 3AP. fixed in C0 and above steppings microcode update.
55 * Linux does not use excessive STARTUP_IPIs.
56 * 4AP. worked around in hardware
57 * 5AP. symmetric IO mode (normal Linux operation) not affected.
58 * 'noapic' mode has vector 0xf filled out properly.
59 * 6AP. 'noapic' mode might be affected - fixed in later steppings
60 *	7AP.	We do not assume writes to the LVT deassert IRQs
61 * 8AP. We do not enable low power mode (deep sleep) during MP bootup
62 * 9AP. We do not use mixed mode
63 *
64 * Pentium
65 * There is a marginal case where REP MOVS on 100MHz SMP
66 * machines with B stepping processors can fail. XXX should provide
67 * an L1cache=Writethrough or L1cache=off option.
68 *
69 * B stepping CPUs may hang. There are hardware work arounds
70 * for this. We warn about it in case your board doesn't have the work
71 *		arounds. Basically that's so I can tell anyone with a B stepping
72 * CPU and SMP problems "tough".
73 *
74 * Specific items [From Pentium Processor Specification Update]
75 *
76 * 1AP. Linux doesn't use remote read
77 * 2AP. Linux doesn't trust APIC errors
78 * 3AP. We work around this
79 * 4AP. Linux never generated 3 interrupts of the same priority
80 * to cause a lost local interrupt.
81 * 5AP. Remote read is never used
82 * 6AP. not affected - worked around in hardware
83 * 7AP. not affected - worked around in hardware
84 * 8AP. worked around in hardware - we get explicit CS errors if not
85 * 9AP. only 'noapic' mode affected. Might generate spurious
86 * interrupts, we log only the first one and count the
87 * rest silently.
88 * 10AP. not affected - worked around in hardware
89 * 11AP. Linux reads the APIC between writes to avoid this, as per
90 * the documentation. Make sure you preserve this as it affects
91 * the C stepping chips too.
92 * 12AP. not affected - worked around in hardware
93 * 13AP. not affected - worked around in hardware
94 * 14AP. we always deassert INIT during bootup
95 * 15AP. not affected - worked around in hardware
96 * 16AP. not affected - worked around in hardware
97 * 17AP. not affected - worked around in hardware
98 * 18AP. not affected - worked around in hardware
99 * 19AP. not affected - worked around in BIOS
100 *
101 *	If this sounds worrying, believe me: these bugs are either ___RARE___
102 *	or are signal timing bugs worked around in hardware, and there's
103 *	almost nothing of note from the C stepping upwards.
104 */
105
106DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
107
108/*
109 * the following functions deal with sending IPIs between CPUs.
110 *
111 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
112 */
113
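/*
 * The ICR is programmed in two halves: APIC_ICR2 carries the destination
 * field, while APIC_ICR carries the delivery mode and vector, and writing
 * it is what actually sends the IPI.
 */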
114static inline int __prepare_ICR (unsigned int shortcut, int vector)
115{
116 return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
117}
118
119static inline int __prepare_ICR2 (unsigned int mask)
120{
121 return SET_APIC_DEST_FIELD(mask);
122}
123
124void __send_IPI_shortcut(unsigned int shortcut, int vector)
125{
126 /*
127 * Subtle. In the case of the 'never do double writes' workaround
128 * we have to lock out interrupts to be safe. As we don't care
129	 * about the value read, we use an atomic rmw access to avoid costly
130 * cli/sti. Otherwise we use an even cheaper single atomic write
131 * to the APIC.
132 */
133 unsigned int cfg;
134
135 /*
136 * Wait for idle.
137 */
138 apic_wait_icr_idle();
139
140 /*
141 * No need to touch the target chip field
142 */
143 cfg = __prepare_ICR(shortcut, vector);
144
145 /*
146 * Send the IPI. The write to APIC_ICR fires this off.
147 */
148 apic_write_around(APIC_ICR, cfg);
149}
150
151void fastcall send_IPI_self(int vector)
152{
153 __send_IPI_shortcut(APIC_DEST_SELF, vector);
154}
155
156/*
157 * This is only used on smaller machines.
158 */
159void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
160{
161 unsigned long mask = cpus_addr(cpumask)[0];
162 unsigned long cfg;
163 unsigned long flags;
164
165 local_irq_save(flags);
166
167 /*
168 * Wait for idle.
169 */
170 apic_wait_icr_idle();
171
172 /*
173 * prepare target chip field
174 */
175 cfg = __prepare_ICR2(mask);
176 apic_write_around(APIC_ICR2, cfg);
177
178 /*
179 * program the ICR
180 */
181 cfg = __prepare_ICR(0, vector);
182
183 /*
184 * Send the IPI. The write to APIC_ICR fires this off.
185 */
186 apic_write_around(APIC_ICR, cfg);
187
188 local_irq_restore(flags);
189}
190
191void send_IPI_mask_sequence(cpumask_t mask, int vector)
192{
193 unsigned long cfg, flags;
194 unsigned int query_cpu;
195
196 /*
197 * Hack. The clustered APIC addressing mode doesn't allow us to send
198	 * to an arbitrary mask, so I do a unicast to each CPU instead. This
199 * should be modified to do 1 message per cluster ID - mbligh
200 */
201
202 local_irq_save(flags);
203
204 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
205 if (cpu_isset(query_cpu, mask)) {
206
207 /*
208 * Wait for idle.
209 */
210 apic_wait_icr_idle();
211
212 /*
213 * prepare target chip field
214 */
215 cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
216 apic_write_around(APIC_ICR2, cfg);
217
218 /*
219 * program the ICR
220 */
221 cfg = __prepare_ICR(0, vector);
222
223 /*
224 * Send the IPI. The write to APIC_ICR fires this off.
225 */
226 apic_write_around(APIC_ICR, cfg);
227 }
228 }
229 local_irq_restore(flags);
230}
231
232#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
233
234/*
235 * Smarter SMP flushing macros.
236 * c/o Linus Torvalds.
237 *
238 * These mean you can really definitely utterly forget about
239 *	writing to user space from interrupts. (It's not allowed anyway).
240 *
241 * Optimizations Manfred Spraul <manfred@colorfullife.com>
242 */
243
244static cpumask_t flush_cpumask;
245static struct mm_struct * flush_mm;
246static unsigned long flush_va;
247static DEFINE_SPINLOCK(tlbstate_lock);
248#define FLUSH_ALL 0xffffffff
249
250/*
251 * We cannot call mmdrop() because we are in interrupt context,
252 * instead update mm->cpu_vm_mask.
253 *
254 * We need to reload %cr3 since the page tables may be going
255 * away from under us..
256 */
257static inline void leave_mm (unsigned long cpu)
258{
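	/* leave_mm() may only be called while in lazy TLB mode */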
259 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
260 BUG();
261 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
262 load_cr3(swapper_pg_dir);
263}
264
265/*
266 *
267 * The flush IPI assumes that a thread switch happens in this order:
268 * [cpu0: the cpu that switches]
269 * 1) switch_mm() either 1a) or 1b)
270 * 1a) thread switch to a different mm
271 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
272 * Stop ipi delivery for the old mm. This is not synchronized with
273 * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
274 * 	for the wrong mm, and in the worst case we perform a superfluous
275 * 	tlb flush.
275 * tlb flush.
276 * 1a2) set cpu_tlbstate to TLBSTATE_OK
277 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
278 * was in lazy tlb mode.
279 * 1a3) update cpu_tlbstate[].active_mm
280 * Now cpu0 accepts tlb flushes for the new mm.
281 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
282 * Now the other cpus will send tlb flush ipis.
283 * 1a5) change cr3.
284 * 1b) thread switch without mm change
285 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
286 * flush ipis.
287 * 1b1) set cpu_tlbstate to TLBSTATE_OK
288 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
289 * Atomically set the bit [other cpus will start sending flush ipis],
290 * and test the bit.
291 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
292 * 2) switch %%esp, ie current
293 *
294 * The interrupt must handle 2 special cases:
295 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
296 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
297 * runs in kernel space, the cpu could load tlb entries for user space
298 * pages.
299 *
300 * The good news is that cpu_tlbstate is local to each cpu, no
301 * write/read ordering problems.
302 */
303
304/*
305 * TLB flush IPI:
306 *
307 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
308 * 2) Leave the mm if we are in the lazy tlb mode.
309 */
310
311fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
312{
313 unsigned long cpu;
314
315 cpu = get_cpu();
316
317 if (!cpu_isset(cpu, flush_cpumask))
318 goto out;
319 /*
320 * This was a BUG() but until someone can quote me the
321	 * line from the Intel manual that guarantees an IPI to
322	 * multiple CPUs is retried _only_ on the erroring CPUs,
323	 * it's staying as a return
324 *
325 * BUG();
326 */
327
328 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
329 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
330 if (flush_va == FLUSH_ALL)
331 local_flush_tlb();
332 else
333 __flush_tlb_one(flush_va);
334 } else
335 leave_mm(cpu);
336 }
337 ack_APIC_irq();
338 smp_mb__before_clear_bit();
339 cpu_clear(cpu, flush_cpumask);
340 smp_mb__after_clear_bit();
341out:
342 put_cpu_no_resched();
343}
344
345static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
346 unsigned long va)
347{
348 cpumask_t tmp;
349 /*
350 * A couple of (to be removed) sanity checks:
351 *
352 * - we do not send IPIs to not-yet booted CPUs.
353 * - current CPU must not be in mask
354 * - mask must exist :)
355 */
356 BUG_ON(cpus_empty(cpumask));
357
358 cpus_and(tmp, cpumask, cpu_online_map);
359 BUG_ON(!cpus_equal(cpumask, tmp));
360 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
361 BUG_ON(!mm);
362
363 /*
364	 * I'm not happy about this global shared spinlock in the
365 * MM hot path, but we'll see how contended it is.
366 * Temporarily this turns IRQs off, so that lockups are
367 * detected by the NMI watchdog.
368 */
369 spin_lock(&tlbstate_lock);
370
371 flush_mm = mm;
372 flush_va = va;
373#if NR_CPUS <= BITS_PER_LONG
374 atomic_set_mask(cpumask, &flush_cpumask);
375#else
376 {
377 int k;
378 unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
379 unsigned long *cpu_mask = (unsigned long *)&cpumask;
380 for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
381 atomic_set_mask(cpu_mask[k], &flush_mask[k]);
382 }
383#endif
384 /*
385 * We have to send the IPI only to
386 * CPUs affected.
387 */
388 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
389
390 while (!cpus_empty(flush_cpumask))
391 /* nothing. lockup detection does not belong here */
392 mb();
393
394 flush_mm = NULL;
395 flush_va = 0;
396 spin_unlock(&tlbstate_lock);
397}
398
399void flush_tlb_current_task(void)
400{
401 struct mm_struct *mm = current->mm;
402 cpumask_t cpu_mask;
403
404 preempt_disable();
405 cpu_mask = mm->cpu_vm_mask;
406 cpu_clear(smp_processor_id(), cpu_mask);
407
408 local_flush_tlb();
409 if (!cpus_empty(cpu_mask))
410 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
411 preempt_enable();
412}
413
414void flush_tlb_mm (struct mm_struct * mm)
415{
416 cpumask_t cpu_mask;
417
418 preempt_disable();
419 cpu_mask = mm->cpu_vm_mask;
420 cpu_clear(smp_processor_id(), cpu_mask);
421
422 if (current->active_mm == mm) {
423 if (current->mm)
424 local_flush_tlb();
425 else
426 leave_mm(smp_processor_id());
427 }
428 if (!cpus_empty(cpu_mask))
429 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
430
431 preempt_enable();
432}
433
434void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
435{
436 struct mm_struct *mm = vma->vm_mm;
437 cpumask_t cpu_mask;
438
439 preempt_disable();
440 cpu_mask = mm->cpu_vm_mask;
441 cpu_clear(smp_processor_id(), cpu_mask);
442
443 if (current->active_mm == mm) {
444 if(current->mm)
445 __flush_tlb_one(va);
446 else
447 leave_mm(smp_processor_id());
448 }
449
450 if (!cpus_empty(cpu_mask))
451 flush_tlb_others(cpu_mask, mm, va);
452
453 preempt_enable();
454}
455
456static void do_flush_tlb_all(void* info)
457{
458 unsigned long cpu = smp_processor_id();
459
460 __flush_tlb_all();
461 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
462 leave_mm(cpu);
463}
464
465void flush_tlb_all(void)
466{
467 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
468}
469
470/*
471 * this function sends a 'reschedule' IPI to another CPU.
472 * it goes straight through and wastes no time serializing
473 * anything. Worst case is that we lose a reschedule ...
474 */
475void smp_send_reschedule(int cpu)
476{
477 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
478}
479
480/*
481 * Structure and data for smp_call_function(). This is designed to minimise
482 * static memory requirements. It also looks cleaner.
483 */
484static DEFINE_SPINLOCK(call_lock);
485
486struct call_data_struct {
487 void (*func) (void *info);
488 void *info;
489 atomic_t started;
490 atomic_t finished;
491 int wait;
492};
493
494static struct call_data_struct * call_data;
495
496/*
497 * this function sends a 'generic call function' IPI to all other CPUs
498 * in the system.
499 */
500
501int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
502 int wait)
503/*
504 * [SUMMARY] Run a function on all other CPUs.
505 * <func> The function to run. This must be fast and non-blocking.
506 * <info> An arbitrary pointer to pass to the function.
507 * <nonatomic> currently unused.
508 * <wait> If true, wait (atomically) until function has completed on other CPUs.
509 * [RETURNS] 0 on success, else a negative status code. Does not return until
510 * remote CPUs are nearly ready to execute <<func>> or have executed it.
511 *
512 * You must not call this function with disabled interrupts or from a
513 * hardware interrupt handler or from a bottom half handler.
514 */
515{
516 struct call_data_struct data;
517 int cpus = num_online_cpus()-1;
518
519 if (!cpus)
520 return 0;
521
522 /* Can deadlock when called with interrupts disabled */
523 WARN_ON(irqs_disabled());
524
525 data.func = func;
526 data.info = info;
527 atomic_set(&data.started, 0);
528 data.wait = wait;
529 if (wait)
530 atomic_set(&data.finished, 0);
531
532 spin_lock(&call_lock);
533 call_data = &data;
534 mb();
535
536 /* Send a message to all other CPUs and wait for them to respond */
537 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
538
539 /* Wait for response */
540 while (atomic_read(&data.started) != cpus)
541 cpu_relax();
542
543 if (wait)
544 while (atomic_read(&data.finished) != cpus)
545 cpu_relax();
546 spin_unlock(&call_lock);
547
548 return 0;
549}
550
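A minimal usage sketch (hypothetical caller, not part of this file): smp_call_function() only reaches the *other* online CPUs, so a caller that needs the work done everywhere also runs the helper locally; on_each_cpu(), used by flush_tlb_all() above, wraps exactly this pattern. The helper names below are invented for illustration.

	static void drain_local_counters(void *unused)
	{
		/* runs from the IPI handler on remote CPUs: fast, non-blocking */
	}

	static void drain_all_counters(void)
	{
		/* process context, interrupts enabled */
		smp_call_function(drain_local_counters, NULL, 0, 1);
		drain_local_counters(NULL);	/* ... and the local CPU */
	}
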
551static void stop_this_cpu (void * dummy)
552{
553 /*
554 * Remove this CPU:
555 */
556 cpu_clear(smp_processor_id(), cpu_online_map);
557 local_irq_disable();
558 disable_local_APIC();
559 if (cpu_data[smp_processor_id()].hlt_works_ok)
560 for(;;) __asm__("hlt");
561 for (;;);
562}
563
564/*
565 * this function calls the 'stop' function on all other CPUs in the system.
566 */
567
568void smp_send_stop(void)
569{
570 smp_call_function(stop_this_cpu, NULL, 1, 0);
571
572 local_irq_disable();
573 disable_local_APIC();
574 local_irq_enable();
575}
576
577/*
578 * Reschedule call back. Nothing to do,
579 * all the work is done automatically when
580 * we return from the interrupt.
581 */
582fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
583{
584 ack_APIC_irq();
585}
586
587fastcall void smp_call_function_interrupt(struct pt_regs *regs)
588{
589 void (*func) (void *info) = call_data->func;
590 void *info = call_data->info;
591 int wait = call_data->wait;
592
593 ack_APIC_irq();
594 /*
595 * Notify initiating CPU that I've grabbed the data and am
596 * about to execute the function
597 */
598 mb();
599 atomic_inc(&call_data->started);
600 /*
601 * At this point the info structure may be out of scope unless wait==1
602 */
603 irq_enter();
604 (*func)(info);
605 irq_exit();
606
607 if (wait) {
608 mb();
609 atomic_inc(&call_data->finished);
610 }
611}
612
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
new file mode 100644
index 000000000000..332ee7a1d1a1
--- /dev/null
+++ b/arch/i386/kernel/smpboot.c
@@ -0,0 +1,1145 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * Much of the core SMP work is based on previous work by Thomas Radke, to
8 * whom a great many thanks are extended.
9 *
10 * Thanks to Intel for making available several different Pentium,
11 * Pentium Pro and Pentium-II/Xeon MP machines.
12 * Original development of Linux SMP code supported by Caldera.
13 *
14 * This code is released under the GNU General Public License version 2 or
15 * later.
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIPS report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Martin J. Bligh : Added support for multi-quad systems
33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process. */
35
36#include <linux/module.h>
37#include <linux/config.h>
38#include <linux/init.h>
39#include <linux/kernel.h>
40
41#include <linux/mm.h>
42#include <linux/sched.h>
43#include <linux/kernel_stat.h>
44#include <linux/smp_lock.h>
45#include <linux/irq.h>
46#include <linux/bootmem.h>
47
48#include <linux/delay.h>
49#include <linux/mc146818rtc.h>
50#include <asm/tlbflush.h>
51#include <asm/desc.h>
52#include <asm/arch_hooks.h>
53
54#include <mach_apic.h>
55#include <mach_wakecpu.h>
56#include <smpboot_hooks.h>
57
58/* Set if we find a B stepping CPU */
59static int __initdata smp_b_stepping;
60
61/* Number of siblings per CPU package */
62int smp_num_siblings = 1;
63int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
64EXPORT_SYMBOL(phys_proc_id);
65
66/* bitmap of online cpus */
67cpumask_t cpu_online_map;
68
69cpumask_t cpu_callin_map;
70cpumask_t cpu_callout_map;
71static cpumask_t smp_commenced_mask;
72
73/* Per CPU bogomips and other parameters */
74struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
75
76u8 x86_cpu_to_apicid[NR_CPUS] =
77 { [0 ... NR_CPUS-1] = 0xff };
78EXPORT_SYMBOL(x86_cpu_to_apicid);
79
80/*
81 * Trampoline 80x86 program as an array.
82 */
83
84extern unsigned char trampoline_data [];
85extern unsigned char trampoline_end [];
86static unsigned char *trampoline_base;
87static int trampoline_exec;
88
89static void map_cpu_to_logical_apicid(void);
90
91/*
92 * Currently trivial. Write the real->protected mode
93 * bootstrap into the page concerned. The caller
94 * has made sure it's suitably aligned.
95 */
96
97static unsigned long __init setup_trampoline(void)
98{
99 memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
100 return virt_to_phys(trampoline_base);
101}
102
103/*
104 * We are called very early to get the low memory for the
105 * SMP bootup trampoline page.
106 */
107void __init smp_alloc_memory(void)
108{
109 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
110 /*
111 * Has to be in very low memory so we can execute
112 * real-mode AP code.
113 */
114 if (__pa(trampoline_base) >= 0x9F000)
115 BUG();
116 /*
117 * Make the SMP trampoline executable:
118 */
119 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
120}
121
122/*
123 * The bootstrap kernel entry code has set these up. Save them for
124 * a given CPU
125 */
126
127static void __init smp_store_cpu_info(int id)
128{
129 struct cpuinfo_x86 *c = cpu_data + id;
130
131 *c = boot_cpu_data;
132 if (id!=0)
133 identify_cpu(c);
134 /*
135 * Mask B, Pentium, but not Pentium MMX
136 */
137 if (c->x86_vendor == X86_VENDOR_INTEL &&
138 c->x86 == 5 &&
139 c->x86_mask >= 1 && c->x86_mask <= 4 &&
140 c->x86_model <= 3)
141 /*
142 * Remember we have B step Pentia with bugs
143 */
144 smp_b_stepping = 1;
145
146 /*
147 * Certain Athlons might work (for various values of 'work') in SMP
148 * but they are not certified as MP capable.
149 */
150 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
151
152 /* Athlon 660/661 is valid. */
153 if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
154 goto valid_k7;
155
156 /* Duron 670 is valid */
157 if ((c->x86_model==7) && (c->x86_mask==0))
158 goto valid_k7;
159
160 /*
161 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
162		 * It's worth noting that the A5 stepping (662) of some Athlon XPs
163		 * has the MP bit set.
164 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
165 */
166 if (((c->x86_model==6) && (c->x86_mask>=2)) ||
167 ((c->x86_model==7) && (c->x86_mask>=1)) ||
168 (c->x86_model> 7))
169 if (cpu_has_mp)
170 goto valid_k7;
171
172 /* If we get here, it's not a certified SMP capable AMD system. */
173 tainted |= TAINT_UNSAFE_SMP;
174 }
175
176valid_k7:
177 ;
178}
179
180/*
181 * TSC synchronization.
182 *
183 * We first check whether all CPUs have their TSC's synchronized,
184 * then we print a warning if not, and always resync.
185 */
186
187static atomic_t tsc_start_flag = ATOMIC_INIT(0);
188static atomic_t tsc_count_start = ATOMIC_INIT(0);
189static atomic_t tsc_count_stop = ATOMIC_INIT(0);
190static unsigned long long tsc_values[NR_CPUS];
191
192#define NR_LOOPS 5
193
194static void __init synchronize_tsc_bp (void)
195{
196 int i;
197 unsigned long long t0;
198 unsigned long long sum, avg;
199 long long delta;
200 unsigned long one_usec;
201 int buggy = 0;
202
203 printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
204
205 /* convert from kcyc/sec to cyc/usec */
206 one_usec = cpu_khz / 1000;
207
208 atomic_set(&tsc_start_flag, 1);
209 wmb();
210
211 /*
212 * We loop a few times to get a primed instruction cache,
213 * then the last pass is more or less synchronized and
214 * the BP and APs set their cycle counters to zero all at
215 * once. This reduces the chance of having random offsets
216 * between the processors, and guarantees that the maximum
217 * delay between the cycle counters is never bigger than
218 * the latency of information-passing (cachelines) between
219 * two CPUs.
220 */
221 for (i = 0; i < NR_LOOPS; i++) {
222 /*
223 * all APs synchronize but they loop on '== num_cpus'
224 */
225 while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
226 mb();
227 atomic_set(&tsc_count_stop, 0);
228 wmb();
229 /*
230 * this lets the APs save their current TSC:
231 */
232 atomic_inc(&tsc_count_start);
233
234 rdtscll(tsc_values[smp_processor_id()]);
235 /*
236 * We clear the TSC in the last loop:
237 */
238 if (i == NR_LOOPS-1)
239 write_tsc(0, 0);
240
241 /*
242 * Wait for all APs to leave the synchronization point:
243 */
244 while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
245 mb();
246 atomic_set(&tsc_count_start, 0);
247 wmb();
248 atomic_inc(&tsc_count_stop);
249 }
250
251 sum = 0;
252 for (i = 0; i < NR_CPUS; i++) {
253 if (cpu_isset(i, cpu_callout_map)) {
254 t0 = tsc_values[i];
255 sum += t0;
256 }
257 }
258 avg = sum;
259 do_div(avg, num_booting_cpus());
260
261 sum = 0;
262 for (i = 0; i < NR_CPUS; i++) {
263 if (!cpu_isset(i, cpu_callout_map))
264 continue;
265 delta = tsc_values[i] - avg;
266 if (delta < 0)
267 delta = -delta;
268 /*
269 * We report bigger than 2 microseconds clock differences.
270 */
271 if (delta > 2*one_usec) {
272 long realdelta;
273 if (!buggy) {
274 buggy = 1;
275 printk("\n");
276 }
277 realdelta = delta;
278 do_div(realdelta, one_usec);
279 if (tsc_values[i] < avg)
280 realdelta = -realdelta;
281
282 printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta);
283 }
284
285 sum += delta;
286 }
287 if (!buggy)
288 printk("passed.\n");
289}
290
291static void __init synchronize_tsc_ap (void)
292{
293 int i;
294
295 /*
296 * Not every cpu is online at the time
297 * this gets called, so we first wait for the BP to
298 * finish SMP initialization:
299 */
300 while (!atomic_read(&tsc_start_flag)) mb();
301
302 for (i = 0; i < NR_LOOPS; i++) {
303 atomic_inc(&tsc_count_start);
304 while (atomic_read(&tsc_count_start) != num_booting_cpus())
305 mb();
306
307 rdtscll(tsc_values[smp_processor_id()]);
308 if (i == NR_LOOPS-1)
309 write_tsc(0, 0);
310
311 atomic_inc(&tsc_count_stop);
312 while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
313 }
314}
315#undef NR_LOOPS
316
317extern void calibrate_delay(void);
318
319static atomic_t init_deasserted;
320
321static void __init smp_callin(void)
322{
323 int cpuid, phys_id;
324 unsigned long timeout;
325
326 /*
327	 * If woken up by an INIT in an 82489DX configuration
328 * we may get here before an INIT-deassert IPI reaches
329 * our local APIC. We have to wait for the IPI or we'll
330 * lock up on an APIC access.
331 */
332 wait_for_init_deassert(&init_deasserted);
333
334 /*
335 * (This works even if the APIC is not enabled.)
336 */
337 phys_id = GET_APIC_ID(apic_read(APIC_ID));
338 cpuid = smp_processor_id();
339 if (cpu_isset(cpuid, cpu_callin_map)) {
340 printk("huh, phys CPU#%d, CPU#%d already present??\n",
341 phys_id, cpuid);
342 BUG();
343 }
344 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
345
346 /*
347 * STARTUP IPIs are fragile beasts as they might sometimes
348 * trigger some glue motherboard logic. Complete APIC bus
349	 * silence for 1 second; this overestimates the time the
350 * boot CPU is spending to send the up to 2 STARTUP IPIs
351 * by a factor of two. This should be enough.
352 */
353
354 /*
355 * Waiting 2s total for startup (udelay is not yet working)
356 */
357 timeout = jiffies + 2*HZ;
358 while (time_before(jiffies, timeout)) {
359 /*
360		 * Has the boot CPU finished its STARTUP sequence?
361 */
362 if (cpu_isset(cpuid, cpu_callout_map))
363 break;
364 rep_nop();
365 }
366
367 if (!time_before(jiffies, timeout)) {
368 printk("BUG: CPU%d started up but did not get a callout!\n",
369 cpuid);
370 BUG();
371 }
372
373 /*
374 * the boot CPU has finished the init stage and is spinning
375 * on callin_map until we finish. We are free to set up this
376 * CPU, first the APIC. (this is probably redundant on most
377 * boards)
378 */
379
380 Dprintk("CALLIN, before setup_local_APIC().\n");
381 smp_callin_clear_local_apic();
382 setup_local_APIC();
383 map_cpu_to_logical_apicid();
384
385 /*
386 * Get our bogomips.
387 */
388 calibrate_delay();
389 Dprintk("Stack at about %p\n",&cpuid);
390
391 /*
392 * Save our processor parameters
393 */
394 smp_store_cpu_info(cpuid);
395
396 disable_APIC_timer();
397
398 /*
399 * Allow the master to continue.
400 */
401 cpu_set(cpuid, cpu_callin_map);
402
403 /*
404 * Synchronize the TSC with the BP
405 */
406 if (cpu_has_tsc && cpu_khz)
407 synchronize_tsc_ap();
408}
409
410static int cpucount;
411
412/*
413 * Activate a secondary processor.
414 */
415static void __init start_secondary(void *unused)
416{
417 /*
418	 * Don't put anything before smp_callin(); SMP
419	 * booting is so fragile that we want to limit the
420	 * things done here to the bare minimum.
421 */
422 cpu_init();
423 smp_callin();
424 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
425 rep_nop();
426 setup_secondary_APIC_clock();
427 if (nmi_watchdog == NMI_IO_APIC) {
428 disable_8259A_irq(0);
429 enable_NMI_through_LVT0(NULL);
430 enable_8259A_irq(0);
431 }
432 enable_APIC_timer();
433 /*
434 * low-memory mappings have been cleared, flush them from
435 * the local TLBs too.
436 */
437 local_flush_tlb();
438 cpu_set(smp_processor_id(), cpu_online_map);
439
440 /* We can take interrupts now: we're officially "up". */
441 local_irq_enable();
442
443 wmb();
444 cpu_idle();
445}
446
447/*
448 * Everything has been set up for the secondary
449 * CPUs - they just need to reload everything
450 * from the task structure
451 * This function must not return.
452 */
453void __init initialize_secondary(void)
454{
455 /*
456 * We don't actually need to load the full TSS,
457 * basically just the stack pointer and the eip.
458 */
459
460 asm volatile(
461 "movl %0,%%esp\n\t"
462 "jmp *%1"
463 :
464 :"r" (current->thread.esp),"r" (current->thread.eip));
465}
466
467extern struct {
468 void * esp;
469 unsigned short ss;
470} stack_start;
471
472#ifdef CONFIG_NUMA
473
474/* which logical CPUs are on which nodes */
475cpumask_t node_2_cpu_mask[MAX_NUMNODES] =
476 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
477/* which node each logical CPU is on */
478int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
479EXPORT_SYMBOL(cpu_2_node);
480
481/* set up a mapping between cpu and node. */
482static inline void map_cpu_to_node(int cpu, int node)
483{
484 printk("Mapping cpu %d to node %d\n", cpu, node);
485 cpu_set(cpu, node_2_cpu_mask[node]);
486 cpu_2_node[cpu] = node;
487}
488
489/* undo a mapping between cpu and node. */
490static inline void unmap_cpu_to_node(int cpu)
491{
492 int node;
493
494 printk("Unmapping cpu %d from all nodes\n", cpu);
495 for (node = 0; node < MAX_NUMNODES; node ++)
496 cpu_clear(cpu, node_2_cpu_mask[node]);
497 cpu_2_node[cpu] = 0;
498}
499#else /* !CONFIG_NUMA */
500
501#define map_cpu_to_node(cpu, node) ({})
502#define unmap_cpu_to_node(cpu) ({})
503
504#endif /* CONFIG_NUMA */
505
506u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
507
508static void map_cpu_to_logical_apicid(void)
509{
510 int cpu = smp_processor_id();
511 int apicid = logical_smp_processor_id();
512
513 cpu_2_logical_apicid[cpu] = apicid;
514 map_cpu_to_node(cpu, apicid_to_node(apicid));
515}
516
517static void unmap_cpu_to_logical_apicid(int cpu)
518{
519 cpu_2_logical_apicid[cpu] = BAD_APICID;
520 unmap_cpu_to_node(cpu);
521}
522
523#if APIC_DEBUG
524static inline void __inquire_remote_apic(int apicid)
525{
526 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
527 char *names[] = { "ID", "VERSION", "SPIV" };
528 int timeout, status;
529
530 printk("Inquiring remote APIC #%d...\n", apicid);
531
532 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
533 printk("... APIC #%d %s: ", apicid, names[i]);
534
535 /*
536 * Wait for idle.
537 */
538 apic_wait_icr_idle();
539
540 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
541 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
542
543 timeout = 0;
544 do {
545 udelay(100);
546 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
547 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
548
549 switch (status) {
550 case APIC_ICR_RR_VALID:
551 status = apic_read(APIC_RRR);
552 printk("%08x\n", status);
553 break;
554 default:
555 printk("failed\n");
556 }
557 }
558}
559#endif
560
561#ifdef WAKE_SECONDARY_VIA_NMI
562/*
563 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
564 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this NMI
565 * won't, so remember to clear down the APIC, etc. later.
566 */
567static int __init
568wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
569{
570 unsigned long send_status = 0, accept_status = 0;
571 int timeout, maxlvt;
572
573 /* Target chip */
574 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
575
576 /* Boot on the stack */
577 /* Kick the second */
578 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
579
580 Dprintk("Waiting for send to finish...\n");
581 timeout = 0;
582 do {
583 Dprintk("+");
584 udelay(100);
585 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
586 } while (send_status && (timeout++ < 1000));
587
588 /*
589 * Give the other CPU some time to accept the IPI.
590 */
591 udelay(200);
592 /*
593 * Due to the Pentium erratum 3AP.
594 */
595 maxlvt = get_maxlvt();
596 if (maxlvt > 3) {
597 apic_read_around(APIC_SPIV);
598 apic_write(APIC_ESR, 0);
599 }
600 accept_status = (apic_read(APIC_ESR) & 0xEF);
601 Dprintk("NMI sent.\n");
602
603 if (send_status)
604 printk("APIC never delivered???\n");
605 if (accept_status)
606 printk("APIC delivery error (%lx).\n", accept_status);
607
608 return (send_status | accept_status);
609}
610#endif /* WAKE_SECONDARY_VIA_NMI */
611
612#ifdef WAKE_SECONDARY_VIA_INIT
613static int __init
614wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
615{
616 unsigned long send_status = 0, accept_status = 0;
617 int maxlvt, timeout, num_starts, j;
618
619 /*
620 * Be paranoid about clearing APIC errors.
621 */
622 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
623 apic_read_around(APIC_SPIV);
624 apic_write(APIC_ESR, 0);
625 apic_read(APIC_ESR);
626 }
627
628 Dprintk("Asserting INIT.\n");
629
630 /*
631 * Turn INIT on target chip
632 */
633 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
634
635 /*
636 * Send IPI
637 */
638 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
639 | APIC_DM_INIT);
640
641 Dprintk("Waiting for send to finish...\n");
642 timeout = 0;
643 do {
644 Dprintk("+");
645 udelay(100);
646 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
647 } while (send_status && (timeout++ < 1000));
648
649 mdelay(10);
650
651 Dprintk("Deasserting INIT.\n");
652
653 /* Target chip */
654 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
655
656 /* Send IPI */
657 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
658
659 Dprintk("Waiting for send to finish...\n");
660 timeout = 0;
661 do {
662 Dprintk("+");
663 udelay(100);
664 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
665 } while (send_status && (timeout++ < 1000));
666
667 atomic_set(&init_deasserted, 1);
668
669 /*
670 * Should we send STARTUP IPIs ?
671 *
672 * Determine this based on the APIC version.
673 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
674 */
675 if (APIC_INTEGRATED(apic_version[phys_apicid]))
676 num_starts = 2;
677 else
678 num_starts = 0;
679
680 /*
681 * Run STARTUP IPI loop.
682 */
683 Dprintk("#startup loops: %d.\n", num_starts);
684
685 maxlvt = get_maxlvt();
686
687 for (j = 1; j <= num_starts; j++) {
688 Dprintk("Sending STARTUP #%d.\n",j);
689 apic_read_around(APIC_SPIV);
690 apic_write(APIC_ESR, 0);
691 apic_read(APIC_ESR);
692 Dprintk("After apic_write.\n");
693
694 /*
695 * STARTUP IPI
696 */
697
698 /* Target chip */
699 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
700
701 /* Boot on the stack */
702 /* Kick the second */
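		/* the STARTUP IPI vector field is the 4K page number of start_eip */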
703 apic_write_around(APIC_ICR, APIC_DM_STARTUP
704 | (start_eip >> 12));
705
706 /*
707 * Give the other CPU some time to accept the IPI.
708 */
709 udelay(300);
710
711 Dprintk("Startup point 1.\n");
712
713 Dprintk("Waiting for send to finish...\n");
714 timeout = 0;
715 do {
716 Dprintk("+");
717 udelay(100);
718 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
719 } while (send_status && (timeout++ < 1000));
720
721 /*
722 * Give the other CPU some time to accept the IPI.
723 */
724 udelay(200);
725 /*
726 * Due to the Pentium erratum 3AP.
727 */
728 if (maxlvt > 3) {
729 apic_read_around(APIC_SPIV);
730 apic_write(APIC_ESR, 0);
731 }
732 accept_status = (apic_read(APIC_ESR) & 0xEF);
733 if (send_status || accept_status)
734 break;
735 }
736 Dprintk("After Startup.\n");
737
738 if (send_status)
739 printk("APIC never delivered???\n");
740 if (accept_status)
741 printk("APIC delivery error (%lx).\n", accept_status);
742
743 return (send_status | accept_status);
744}
745#endif /* WAKE_SECONDARY_VIA_INIT */
746
747extern cpumask_t cpu_initialized;
748
749static int __init do_boot_cpu(int apicid)
750/*
751 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
752 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
753 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
754 */
755{
756 struct task_struct *idle;
757 unsigned long boot_error;
758 int timeout, cpu;
759 unsigned long start_eip;
760 unsigned short nmi_high = 0, nmi_low = 0;
761
762 cpu = ++cpucount;
763 /*
764	 * We can't use kernel_thread since we must avoid
765	 * rescheduling the child.
766 */
767 idle = fork_idle(cpu);
768 if (IS_ERR(idle))
769 panic("failed fork for CPU %d", cpu);
770 idle->thread.eip = (unsigned long) start_secondary;
771 /* start_eip had better be page-aligned! */
772 start_eip = setup_trampoline();
773
774 /* So we see what's up */
775 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
776 /* Stack for startup_32 can be just as for start_secondary onwards */
777 stack_start.esp = (void *) idle->thread.esp;
778
779 irq_ctx_init(cpu);
780
781 /*
782 * This grunge runs the startup process for
783 * the targeted processor.
784 */
785
786 atomic_set(&init_deasserted, 0);
787
788 Dprintk("Setting warm reset code and vector.\n");
789
790 store_NMI_vector(&nmi_high, &nmi_low);
791
792 smpboot_setup_warm_reset_vector(start_eip);
793
794 /*
795 * Starting actual IPI sequence...
796 */
797 boot_error = wakeup_secondary_cpu(apicid, start_eip);
798
799 if (!boot_error) {
800 /*
801 * allow APs to start initializing.
802 */
803 Dprintk("Before Callout %d.\n", cpu);
804 cpu_set(cpu, cpu_callout_map);
805 Dprintk("After Callout %d.\n", cpu);
806
807 /*
808 * Wait 5s total for a response
809 */
810 for (timeout = 0; timeout < 50000; timeout++) {
811 if (cpu_isset(cpu, cpu_callin_map))
812 break; /* It has booted */
813 udelay(100);
814 }
815
816 if (cpu_isset(cpu, cpu_callin_map)) {
817 /* number CPUs logically, starting from 1 (BSP is 0) */
818 Dprintk("OK.\n");
819 printk("CPU%d: ", cpu);
820 print_cpu_info(&cpu_data[cpu]);
821 Dprintk("CPU has booted.\n");
822 } else {
823 boot_error= 1;
824 if (*((volatile unsigned char *)trampoline_base)
825 == 0xA5)
826 /* trampoline started but...? */
827 printk("Stuck ??\n");
828 else
829 /* trampoline code not run */
830 printk("Not responding.\n");
831 inquire_remote_apic(apicid);
832 }
833 }
834 x86_cpu_to_apicid[cpu] = apicid;
835 if (boot_error) {
836 /* Try to put things back the way they were before ... */
837 unmap_cpu_to_logical_apicid(cpu);
838 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
839 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
840 cpucount--;
841 }
842
843 /* mark "stuck" area as not stuck */
844 *((volatile unsigned long *)trampoline_base) = 0;
845
846 return boot_error;
847}
848
849static void smp_tune_scheduling (void)
850{
851 unsigned long cachesize; /* kB */
852 unsigned long bandwidth = 350; /* MB/s */
853 /*
854 * Rough estimation for SMP scheduling, this is the number of
855 * cycles it takes for a fully memory-limited process to flush
856 * the SMP-local cache.
857 *
858 * (For a P5 this pretty much means we will choose another idle
859 * CPU almost always at wakeup time (this is due to the small
860 * L1 cache), on PIIs it's around 50-100 usecs, depending on
861 * the cache size)
862 */
863
864 if (!cpu_khz) {
865 /*
866 * this basically disables processor-affinity
867 * scheduling on SMP without a TSC.
868 */
869 return;
870 } else {
871 cachesize = boot_cpu_data.x86_cache_size;
872 if (cachesize == -1) {
873 cachesize = 16; /* Pentiums, 2x8kB cache */
874 bandwidth = 100;
875 }
876 }
877}
878
879/*
880 * Cycle through the processors sending APIC IPIs to boot each.
881 */
882
883static int boot_cpu_logical_apicid;
884/* Where the IO area was mapped on multiquad, always 0 otherwise */
885void *xquad_portio;
886
887cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
888
889static void __init smp_boot_cpus(unsigned int max_cpus)
890{
891 int apicid, cpu, bit, kicked;
892 unsigned long bogosum = 0;
893
894 /*
895 * Setup boot CPU information
896 */
897 smp_store_cpu_info(0); /* Final full version of the data */
898 printk("CPU%d: ", 0);
899 print_cpu_info(&cpu_data[0]);
900
901 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
902 boot_cpu_logical_apicid = logical_smp_processor_id();
903 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
904
905 current_thread_info()->cpu = 0;
906 smp_tune_scheduling();
907 cpus_clear(cpu_sibling_map[0]);
908 cpu_set(0, cpu_sibling_map[0]);
909
910 /*
911 * If we couldn't find an SMP configuration at boot time,
912 * get out of here now!
913 */
914 if (!smp_found_config && !acpi_lapic) {
915 printk(KERN_NOTICE "SMP motherboard not detected.\n");
916 smpboot_clear_io_apic_irqs();
917 phys_cpu_present_map = physid_mask_of_physid(0);
918 if (APIC_init_uniprocessor())
919 printk(KERN_NOTICE "Local APIC not detected."
920 " Using dummy APIC emulation.\n");
921 map_cpu_to_logical_apicid();
922 return;
923 }
924
925 /*
926 * Should not be necessary because the MP table should list the boot
927 * CPU too, but we do it for the sake of robustness anyway.
928 * Makes no sense to do this check in clustered apic mode, so skip it
929 */
930 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
931 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
932 boot_cpu_physical_apicid);
933 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
934 }
935
936 /*
937 * If we couldn't find a local APIC, then get out of here now!
938 */
939 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
940 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
941 boot_cpu_physical_apicid);
942 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
943 smpboot_clear_io_apic_irqs();
944 phys_cpu_present_map = physid_mask_of_physid(0);
945 return;
946 }
947
948 verify_local_APIC();
949
950 /*
951 * If SMP should be disabled, then really disable it!
952 */
953 if (!max_cpus) {
954 smp_found_config = 0;
955 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
956 smpboot_clear_io_apic_irqs();
957 phys_cpu_present_map = physid_mask_of_physid(0);
958 return;
959 }
960
961 connect_bsp_APIC();
962 setup_local_APIC();
963 map_cpu_to_logical_apicid();
964
965
966 setup_portio_remap();
967
968 /*
969 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
970 *
971	 * In clustered apic mode, phys_cpu_present_map is constructed thus:
972 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
973 * clustered apic ID.
974 */
975 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
976
977 kicked = 1;
978 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
979 apicid = cpu_present_to_apicid(bit);
980 /*
981 * Don't even attempt to start the boot CPU!
982 */
983 if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
984 continue;
985
986 if (!check_apicid_present(bit))
987 continue;
988 if (max_cpus <= cpucount+1)
989 continue;
990
991 if (do_boot_cpu(apicid))
992 printk("CPU #%d not responding - cannot use it.\n",
993 apicid);
994 else
995 ++kicked;
996 }
997
998 /*
999 * Cleanup possible dangling ends...
1000 */
1001 smpboot_restore_warm_reset_vector();
1002
1003 /*
1004 * Allow the user to impress friends.
1005 */
1006 Dprintk("Before bogomips.\n");
1007 for (cpu = 0; cpu < NR_CPUS; cpu++)
1008 if (cpu_isset(cpu, cpu_callout_map))
1009 bogosum += cpu_data[cpu].loops_per_jiffy;
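	/* total BogoMIPS = bogosum * HZ / 500000, printed with two decimals */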
1010 printk(KERN_INFO
1011 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
1012 cpucount+1,
1013 bogosum/(500000/HZ),
1014 (bogosum/(5000/HZ))%100);
1015
1016 Dprintk("Before bogocount - setting activated=1.\n");
1017
1018 if (smp_b_stepping)
1019 printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
1020
1021 /*
1022 * Don't taint if we are running SMP kernel on a single non-MP
1023 * approved Athlon
1024 */
1025 if (tainted & TAINT_UNSAFE_SMP) {
1026 if (cpucount)
1027 printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
1028 else
1029 tainted &= ~TAINT_UNSAFE_SMP;
1030 }
1031
1032 Dprintk("Boot done.\n");
1033
1034 /*
1035 * construct cpu_sibling_map[], so that we can tell sibling CPUs
1036 * efficiently.
1037 */
1038 for (cpu = 0; cpu < NR_CPUS; cpu++)
1039 cpus_clear(cpu_sibling_map[cpu]);
1040
1041 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1042 int siblings = 0;
1043 int i;
1044 if (!cpu_isset(cpu, cpu_callout_map))
1045 continue;
1046
1047 if (smp_num_siblings > 1) {
1048 for (i = 0; i < NR_CPUS; i++) {
1049 if (!cpu_isset(i, cpu_callout_map))
1050 continue;
1051 if (phys_proc_id[cpu] == phys_proc_id[i]) {
1052 siblings++;
1053 cpu_set(i, cpu_sibling_map[cpu]);
1054 }
1055 }
1056 } else {
1057 siblings++;
1058 cpu_set(cpu, cpu_sibling_map[cpu]);
1059 }
1060
1061 if (siblings != smp_num_siblings)
1062 printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
1063 }
1064
1065 if (nmi_watchdog == NMI_LOCAL_APIC)
1066 check_nmi_watchdog();
1067
1068 smpboot_setup_io_apic();
1069
1070 setup_boot_APIC_clock();
1071
1072 /*
1073 * Synchronize the TSC with the AP
1074 */
1075 if (cpu_has_tsc && cpucount && cpu_khz)
1076 synchronize_tsc_bp();
1077}
1078
1079/* These are wrappers to interface to the new boot process. Someone
1080 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
1081void __init smp_prepare_cpus(unsigned int max_cpus)
1082{
1083 smp_boot_cpus(max_cpus);
1084}
1085
1086void __devinit smp_prepare_boot_cpu(void)
1087{
1088 cpu_set(smp_processor_id(), cpu_online_map);
1089 cpu_set(smp_processor_id(), cpu_callout_map);
1090}
1091
1092int __devinit __cpu_up(unsigned int cpu)
1093{
1094 /* This only works at boot for x86. See "rewrite" above. */
1095 if (cpu_isset(cpu, smp_commenced_mask)) {
1096 local_irq_enable();
1097 return -ENOSYS;
1098 }
1099
1100 /* In case one didn't come up */
1101 if (!cpu_isset(cpu, cpu_callin_map)) {
1102 local_irq_enable();
1103 return -EIO;
1104 }
1105
1106 local_irq_enable();
1107 /* Unleash the CPU! */
1108 cpu_set(cpu, smp_commenced_mask);
1109 while (!cpu_isset(cpu, cpu_online_map))
1110 mb();
1111 return 0;
1112}
1113
1114void __init smp_cpus_done(unsigned int max_cpus)
1115{
1116#ifdef CONFIG_X86_IO_APIC
1117 setup_ioapic_dest();
1118#endif
1119 zap_low_mappings();
1120 /*
1121 * Disable executability of the SMP trampoline:
1122 */
1123 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
1124}
1125
1126void __init smp_intr_init(void)
1127{
1128 /*
1129 * IRQ0 must be given a fixed assignment and initialized,
1130 * because it's used before the IO-APIC is set up.
1131 */
1132 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1133
1134 /*
1135 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1136 * IPI, driven by wakeup.
1137 */
1138 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1139
1140 /* IPI for invalidation */
1141 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1142
1143 /* IPI for generic function call */
1144 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1145}
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
new file mode 100644
index 000000000000..7b3b27d64409
--- /dev/null
+++ b/arch/i386/kernel/srat.c
@@ -0,0 +1,456 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/config.h>
27#include <linux/mm.h>
28#include <linux/bootmem.h>
29#include <linux/mmzone.h>
30#include <linux/acpi.h>
31#include <linux/nodemask.h>
32#include <asm/srat.h>
33#include <asm/topology.h>
34
35/*
36 * proximity macros and definitions
37 */
38#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
39#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
40#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
41#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
42#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
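/* Example (hypothetical _PXM): proximity domain 10 is tracked in
 * pxm_bitmap[1] (10 / 8) at bit 2 (10 % 8) by BMAP_SET/BMAP_TEST above. */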
46
47#define MAX_CHUNKS_PER_NODE 4
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
57
58static int num_memory_chunks; /* total number of memory chunks */
59static int zholes_size_init;
60static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
61
62extern void * boot_ioremap(unsigned long, unsigned long);
63
64/* Identify CPU proximity domains */
65static void __init parse_cpu_affinity_structure(char *p)
66{
67 struct acpi_table_processor_affinity *cpu_affinity =
68 (struct acpi_table_processor_affinity *) p;
69
70 if (!cpu_affinity->flags.enabled)
71 return; /* empty entry */
72
73 /* mark this node as "seen" in node bitmap */
74 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain);
75
76 printk("CPU 0x%02X in proximity domain 0x%02X\n",
77 cpu_affinity->apic_id, cpu_affinity->proximity_domain);
78}
79
80/*
81 * Identify memory proximity domains and hot-remove capabilities.
82 * Fill node memory chunk list structure.
83 */
84static void __init parse_memory_affinity_structure (char *sratp)
85{
86 unsigned long long paddr, size;
87 unsigned long start_pfn, end_pfn;
88 u8 pxm;
89 struct node_memory_chunk_s *p, *q, *pend;
90 struct acpi_table_memory_affinity *memory_affinity =
91 (struct acpi_table_memory_affinity *) sratp;
92
93 if (!memory_affinity->flags.enabled)
94 return; /* empty entry */
95
96 /* mark this node as "seen" in node bitmap */
97 BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain);
98
99 /* calculate info for memory chunk structure */
100 paddr = memory_affinity->base_addr_hi;
101 paddr = (paddr << 32) | memory_affinity->base_addr_lo;
102 size = memory_affinity->length_hi;
103 size = (size << 32) | memory_affinity->length_lo;
104
105 start_pfn = paddr >> PAGE_SHIFT;
106 end_pfn = (paddr + size) >> PAGE_SHIFT;
107
108 pxm = memory_affinity->proximity_domain;
109
110 if (num_memory_chunks >= MAXCHUNKS) {
111 printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
112 size/(1024*1024), paddr);
113 return;
114 }
115
116 /* Insertion sort based on base address */
117 pend = &node_memory_chunk[num_memory_chunks];
118 for (p = &node_memory_chunk[0]; p < pend; p++) {
119 if (start_pfn < p->start_pfn)
120 break;
121 }
122 if (p < pend) {
123 for (q = pend; q >= p; q--)
124 *(q + 1) = *q;
125 }
126 p->start_pfn = start_pfn;
127 p->end_pfn = end_pfn;
128 p->pxm = pxm;
129
130 num_memory_chunks++;
131
132 printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
133 start_pfn, end_pfn,
134 memory_affinity->memory_type,
135 memory_affinity->proximity_domain,
136 (memory_affinity->flags.hot_pluggable ?
137 "enabled and removable" : "enabled" ) );
138}
139
140#if MAX_NR_ZONES != 3
141#error "MAX_NR_ZONES != 3, chunk_to_zone requires review"
142#endif
143/* Take a chunk of pages from page frame cstart to cend and count the number
144 * of pages in each zone, returned via zones[].
145 */
146static __init void chunk_to_zones(unsigned long cstart, unsigned long cend,
147 unsigned long *zones)
148{
149 unsigned long max_dma;
150 extern unsigned long max_low_pfn;
151
152 int z;
153 unsigned long rend;
154
155 /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
 156	 * similarly scoped information and should be handled in a consistent
157 * manner.
158 */
159 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
160
161 /* Split the hole into the zones in which it falls. Repeatedly
162 * take the segment in which the remaining hole starts, round it
163 * to the end of that zone.
164 */
165 memset(zones, 0, MAX_NR_ZONES * sizeof(long));
166 while (cstart < cend) {
167 if (cstart < max_dma) {
168 z = ZONE_DMA;
169 rend = (cend < max_dma)? cend : max_dma;
170
171 } else if (cstart < max_low_pfn) {
172 z = ZONE_NORMAL;
173 rend = (cend < max_low_pfn)? cend : max_low_pfn;
174
175 } else {
176 z = ZONE_HIGHMEM;
177 rend = cend;
178 }
179 zones[z] += rend - cstart;
180 cstart = rend;
181 }
182}
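As a rough standalone sketch of the splitting loop above (the zone boundaries and the hole range below are hypothetical values for illustration, not taken from this file):

#include <stdio.h>

int main(void)
{
	unsigned long max_dma = 4096, max_low_pfn = 229376; /* assumed zone edges */
	unsigned long cstart = 1024, cend = 300000;         /* hypothetical hole  */
	unsigned long zones[3] = { 0 };                     /* DMA, NORMAL, HIGH  */

	while (cstart < cend) {
		int z;
		unsigned long rend;

		if (cstart < max_dma) {
			z = 0; rend = (cend < max_dma) ? cend : max_dma;
		} else if (cstart < max_low_pfn) {
			z = 1; rend = (cend < max_low_pfn) ? cend : max_low_pfn;
		} else {
			z = 2; rend = cend;
		}
		zones[z] += rend - cstart;
		cstart = rend;
	}
	/* prints DMA=3072 NORMAL=225280 HIGHMEM=70624 */
	printf("DMA=%lu NORMAL=%lu HIGHMEM=%lu\n", zones[0], zones[1], zones[2]);
	return 0;
}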
183
184/*
 185 * The SRAT table always lists ascending addresses, so we can always
186 * assume that the first "start" address that you see is the real
187 * start of the node, and that the current "end" address is after
188 * the previous one.
189 */
190static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
191{
192 /*
193 * Only add present memory as told by the e820.
194 * There is no guarantee from the SRAT that the memory it
195 * enumerates is present at boot time because it represents
196 * *possible* memory hotplug areas the same as normal RAM.
197 */
198 if (memory_chunk->start_pfn >= max_pfn) {
199 printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
200 memory_chunk->start_pfn, memory_chunk->end_pfn);
201 return;
202 }
203 if (memory_chunk->nid != nid)
204 return;
205
206 if (!node_has_online_mem(nid))
207 node_start_pfn[nid] = memory_chunk->start_pfn;
208
209 if (node_start_pfn[nid] > memory_chunk->start_pfn)
210 node_start_pfn[nid] = memory_chunk->start_pfn;
211
212 if (node_end_pfn[nid] < memory_chunk->end_pfn)
213 node_end_pfn[nid] = memory_chunk->end_pfn;
214}
215
216/* Parse the ACPI Static Resource Affinity Table */
217static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
218{
219 u8 *start, *end, *p;
220 int i, j, nid;
221 u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */
222 u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */
223
224 start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
225 p = start;
226 end = (u8 *)sratp + sratp->header.length;
227
228 memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
229 memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
230 memset(zholes_size, 0, sizeof(zholes_size));
231
232 /* -1 in these maps means not available */
233 memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
234 memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
235
236 num_memory_chunks = 0;
237 while (p < end) {
238 switch (*p) {
239 case ACPI_SRAT_PROCESSOR_AFFINITY:
240 parse_cpu_affinity_structure(p);
241 break;
242 case ACPI_SRAT_MEMORY_AFFINITY:
243 parse_memory_affinity_structure(p);
244 break;
245 default:
246 printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
247 break;
248 }
 249		if (p[1] == 0) {
 250			printk("acpi20_parse_srat: Entry length value is zero;"
 251			       " can't parse any further!\n");
 252			break;
 253		}
 254		p += p[1];
255 }
256
257 if (num_memory_chunks == 0) {
 258		printk("could not find any ACPI SRAT memory areas.\n");
259 goto out_fail;
260 }
261
262 /* Calculate total number of nodes in system from PXM bitmap and create
263 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
264 * to specify the range of _PXM values.)
265 */
266 /*
267 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
268 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
269 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
270 * approaches MAX_PXM_DOMAINS for i386.
271 */
272 nodes_clear(node_online_map);
273 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
274 if (BMAP_TEST(pxm_bitmap, i)) {
275 nid = num_online_nodes();
276 pxm_to_nid_map[i] = nid;
277 nid_to_pxm_map[nid] = i;
278 node_set_online(nid);
279 }
280 }
281 BUG_ON(num_online_nodes() == 0);
282
283 /* set cnode id in memory chunk structure */
284 for (i = 0; i < num_memory_chunks; i++)
285 node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm];
286
287 printk("pxm bitmap: ");
288 for (i = 0; i < sizeof(pxm_bitmap); i++) {
289 printk("%02X ", pxm_bitmap[i]);
290 }
291 printk("\n");
292 printk("Number of logical nodes in system = %d\n", num_online_nodes());
293 printk("Number of memory chunks in system = %d\n", num_memory_chunks);
294
295 for (j = 0; j < num_memory_chunks; j++){
296 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
297 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
298 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
299 node_read_chunk(chunk->nid, chunk);
300 }
301
302 for_each_online_node(nid) {
303 unsigned long start = node_start_pfn[nid];
304 unsigned long end = node_end_pfn[nid];
305
306 memory_present(nid, start, end);
307 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
308 }
309 return 1;
310out_fail:
311 return 0;
312}
313
314int __init get_memcfg_from_srat(void)
315{
316 struct acpi_table_header *header = NULL;
317 struct acpi_table_rsdp *rsdp = NULL;
318 struct acpi_table_rsdt *rsdt = NULL;
319 struct acpi_pointer *rsdp_address = NULL;
320 struct acpi_table_rsdt saved_rsdt;
321 int tables = 0;
322 int i = 0;
323
324 acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING, rsdp_address);
325
326 if (rsdp_address->pointer_type == ACPI_PHYSICAL_POINTER) {
327 printk("%s: assigning address to rsdp\n", __FUNCTION__);
328 rsdp = (struct acpi_table_rsdp *)
329 (u32)rsdp_address->pointer.physical;
330 } else {
331 printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__);
332 goto out_err;
333 }
334 if (!rsdp) {
335 printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
336 goto out_err;
337 }
338
339 printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
340 rsdp->oem_id);
341
342 if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) {
343 printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
344 goto out_err;
345 }
346
347 rsdt = (struct acpi_table_rsdt *)
348 boot_ioremap(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt));
349
350 if (!rsdt) {
351 printk(KERN_WARNING
352 "%s: ACPI: Invalid root system description tables (RSDT)\n",
353 __FUNCTION__);
354 goto out_err;
355 }
356
357 header = & rsdt->header;
358
359 if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) {
360 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
361 goto out_err;
362 }
363
364 /*
365 * The number of tables is computed by taking the
 366	 * size of all entries (total size of the RSDT minus the
 367	 * header size) divided by the size of each entry
368 * (4-byte table pointers).
369 */
370 tables = (header->length - sizeof(struct acpi_table_header)) / 4;
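	/*
	 * Example (hypothetical lengths): an RSDT reporting length 76 with a
	 * 36-byte header yields (76 - 36) / 4 = 10 table pointers.
	 */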
371
372 if (!tables)
373 goto out_err;
374
375 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
376
377 if (saved_rsdt.header.length > sizeof(saved_rsdt)) {
378 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
379 saved_rsdt.header.length);
380 goto out_err;
381 }
382
383 printk("Begin SRAT table scan....\n");
384
385 for (i = 0; i < tables; i++) {
386 /* Map in header, then map in full table length. */
387 header = (struct acpi_table_header *)
388 boot_ioremap(saved_rsdt.entry[i], sizeof(struct acpi_table_header));
389 if (!header)
390 break;
391 header = (struct acpi_table_header *)
392 boot_ioremap(saved_rsdt.entry[i], header->length);
393 if (!header)
394 break;
395
396 if (strncmp((char *) &header->signature, "SRAT", 4))
397 continue;
398
399 /* we've found the srat table. don't need to look at any more tables */
400 return acpi20_parse_srat((struct acpi_table_srat *)header);
401 }
402out_err:
403 printk("failed to get NUMA memory information from SRAT table\n");
404 return 0;
405}
406
407/* For each node run the memory list to determine whether there are
408 * any memory holes. For each hole determine which ZONE they fall
409 * into.
410 *
 411 * NOTE#1: this requires knowledge of the zone boundaries and so
412 * _cannot_ be performed before those are calculated in setup_memory.
413 *
414 * NOTE#2: we rely on the fact that the memory chunks are ordered by
415 * start pfn number during setup.
416 */
417static void __init get_zholes_init(void)
418{
419 int nid;
420 int c;
421 int first;
422 unsigned long end = 0;
423
424 for_each_online_node(nid) {
425 first = 1;
426 for (c = 0; c < num_memory_chunks; c++){
427 if (node_memory_chunk[c].nid == nid) {
428 if (first) {
429 end = node_memory_chunk[c].end_pfn;
430 first = 0;
431
432 } else {
433 /* Record any gap between this chunk
434 * and the previous chunk on this node
435 * against the zones it spans.
436 */
437 chunk_to_zones(end,
438 node_memory_chunk[c].start_pfn,
439 &zholes_size[nid * MAX_NR_ZONES]);
440 }
441 }
442 }
443 }
444}
445
446unsigned long * __init get_zholes_size(int nid)
447{
448 if (!zholes_size_init) {
449 zholes_size_init++;
450 get_zholes_init();
451 }
452 if (nid >= MAX_NUMNODES || !node_online(nid))
 453		printk("%s: nid = %d is invalid/offline. num_online_nodes = %d\n",
454 __FUNCTION__, nid, num_online_nodes());
455 return &zholes_size[nid * MAX_NR_ZONES];
456}
diff --git a/arch/i386/kernel/summit.c b/arch/i386/kernel/summit.c
new file mode 100644
index 000000000000..d0e01a3acf35
--- /dev/null
+++ b/arch/i386/kernel/summit.c
@@ -0,0 +1,180 @@
1/*
2 * arch/i386/kernel/summit.c - IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/mach-summit/mach_mpparse.h>
33
34static struct rio_table_hdr *rio_table_hdr __initdata;
35static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
36static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
37
38static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
39{
40 int twister = 0, node = 0;
41 int i, bus, num_buses;
42
43 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
44 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
45 twister = rio_devs[i]->owner_id;
46 break;
47 }
48 }
49 if (i == rio_table_hdr->num_rio_dev){
50 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
51 return last_bus;
52 }
53
54 for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
55 if (scal_devs[i]->node_id == twister){
56 node = scal_devs[i]->node_id;
57 break;
58 }
59 }
60 if (i == rio_table_hdr->num_scal_dev){
61 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
62 return last_bus;
63 }
64
65 switch (rio_devs[wpeg_num]->type){
66 case CompatWPEG:
 67		/* The Compatibility Winnipeg controls the 2 legacy buses,
68 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
69 * a PCI-PCI bridge card is used in either slot: total 5 buses.
70 */
71 num_buses = 5;
72 break;
73 case AltWPEG:
74 /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
75 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
76 * the "extra" buses for each of those slots: total 7 buses.
77 */
78 num_buses = 7;
79 break;
80 case LookOutAWPEG:
81 case LookOutBWPEG:
82 /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
83 * & the "extra" buses for each of those slots: total 9 buses.
84 */
85 num_buses = 9;
86 break;
87 default:
88 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
89 return last_bus;
90 }
91
92 for(bus = last_bus; bus < last_bus + num_buses; bus++)
93 mp_bus_id_to_node[bus] = node;
94 return bus;
95}
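/* Example (hypothetical): a CompatWPEG owned by node 2, called with
 * last_bus = 0, maps buses 0-4 to node 2 and returns 5. */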
96
97static int __init build_detail_arrays(void)
98{
99 unsigned long ptr;
100 int i, scal_detail_size, rio_detail_size;
101
102 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
103 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
104 return 0;
105 }
106
107 switch (rio_table_hdr->version){
108 default:
109 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
110 return 0;
111 case 2:
112 scal_detail_size = 11;
113 rio_detail_size = 13;
114 break;
115 case 3:
116 scal_detail_size = 12;
117 rio_detail_size = 15;
118 break;
119 }
120
121 ptr = (unsigned long)rio_table_hdr + 3;
122 for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
123 scal_devs[i] = (struct scal_detail *)ptr;
124
125 for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
126 rio_devs[i] = (struct rio_detail *)ptr;
127
128 return 1;
129}
130
131void __init setup_summit(void)
132{
133 unsigned long ptr;
134 unsigned short offset;
135 int i, next_wpeg, next_bus = 0;
136
137 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
138 ptr = *(unsigned short *)phys_to_virt(0x40Eul);
139 ptr = (unsigned long)phys_to_virt(ptr << 4);
140
141 rio_table_hdr = NULL;
142 offset = 0x180;
143 while (offset){
144 /* The block id is stored in the 2nd word */
145 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
146 /* set the pointer past the offset & block id */
147 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
148 break;
149 }
150 /* The next offset is stored in the 1st word. 0 means no more */
151 offset = *((unsigned short *)(ptr + offset));
152 }
153 if (!rio_table_hdr){
154 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
155 return;
156 }
157
158 if (!build_detail_arrays())
159 return;
160
161 /* The first Winnipeg we're looking for has an index of 0 */
162 next_wpeg = 0;
163 do {
164 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
165 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
166 /* It's the Winnipeg we're looking for! */
167 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
168 next_wpeg++;
169 break;
170 }
171 }
172 /*
173 * If we go through all Rio devices and don't find one with
174 * the next index, it means we've found all the Winnipegs,
175 * and thus all the PCI buses.
176 */
177 if (i == rio_table_hdr->num_rio_dev)
178 next_wpeg = 0;
179 } while (next_wpeg != 0);
180}
diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
new file mode 100644
index 000000000000..a4a61976ecb9
--- /dev/null
+++ b/arch/i386/kernel/sys_i386.c
@@ -0,0 +1,252 @@
1/*
2 * linux/arch/i386/kernel/sys_i386.c
3 *
4 * This file contains various random system calls that
5 * have a non-standard calling sequence on the Linux/i386
6 * platform.
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/smp_lock.h>
14#include <linux/sem.h>
15#include <linux/msg.h>
16#include <linux/shm.h>
17#include <linux/stat.h>
18#include <linux/syscalls.h>
19#include <linux/mman.h>
20#include <linux/file.h>
21#include <linux/utsname.h>
22
23#include <asm/uaccess.h>
24#include <asm/ipc.h>
25
26/*
27 * sys_pipe() is the normal C calling standard for creating
28 * a pipe. It's not the way Unix traditionally does this, though.
29 */
30asmlinkage int sys_pipe(unsigned long __user * fildes)
31{
32 int fd[2];
33 int error;
34
35 error = do_pipe(fd);
36 if (!error) {
37 if (copy_to_user(fildes, fd, 2*sizeof(int)))
38 error = -EFAULT;
39 }
40 return error;
41}
42
43/* common code for old and new mmaps */
44static inline long do_mmap2(
45 unsigned long addr, unsigned long len,
46 unsigned long prot, unsigned long flags,
47 unsigned long fd, unsigned long pgoff)
48{
49 int error = -EBADF;
50 struct file * file = NULL;
51
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58
59 down_write(&current->mm->mmap_sem);
60 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
61 up_write(&current->mm->mmap_sem);
62
63 if (file)
64 fput(file);
65out:
66 return error;
67}
68
69asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
70 unsigned long prot, unsigned long flags,
71 unsigned long fd, unsigned long pgoff)
72{
73 return do_mmap2(addr, len, prot, flags, fd, pgoff);
74}
75
76/*
77 * Perform the select(nd, in, out, ex, tv) and mmap() system
78 * calls. Linux/i386 didn't use to be able to handle more than
79 * 4 system call parameters, so these system calls used a memory
80 * block for parameter passing..
81 */
82
83struct mmap_arg_struct {
84 unsigned long addr;
85 unsigned long len;
86 unsigned long prot;
87 unsigned long flags;
88 unsigned long fd;
89 unsigned long offset;
90};
91
92asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
93{
94 struct mmap_arg_struct a;
95 int err = -EFAULT;
96
97 if (copy_from_user(&a, arg, sizeof(a)))
98 goto out;
99
100 err = -EINVAL;
101 if (a.offset & ~PAGE_MASK)
102 goto out;
103
104 err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT);
105out:
106 return err;
107}
108
109
110struct sel_arg_struct {
111 unsigned long n;
112 fd_set __user *inp, *outp, *exp;
113 struct timeval __user *tvp;
114};
115
116asmlinkage int old_select(struct sel_arg_struct __user *arg)
117{
118 struct sel_arg_struct a;
119
120 if (copy_from_user(&a, arg, sizeof(a)))
121 return -EFAULT;
122 /* sys_select() does the appropriate kernel locking */
123 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
124}
125
126/*
127 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
128 *
129 * This is really horribly ugly.
130 */
131asmlinkage int sys_ipc (uint call, int first, int second,
132 int third, void __user *ptr, long fifth)
133{
134 int version, ret;
135
136 version = call >> 16; /* hack for backward compatibility */
137 call &= 0xffff;
138
139 switch (call) {
140 case SEMOP:
141 return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
142 case SEMTIMEDOP:
143 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
144 (const struct timespec __user *)fifth);
145
146 case SEMGET:
147 return sys_semget (first, second, third);
148 case SEMCTL: {
149 union semun fourth;
150 if (!ptr)
151 return -EINVAL;
152 if (get_user(fourth.__pad, (void __user * __user *) ptr))
153 return -EFAULT;
154 return sys_semctl (first, second, third, fourth);
155 }
156
157 case MSGSND:
158 return sys_msgsnd (first, (struct msgbuf __user *) ptr,
159 second, third);
160 case MSGRCV:
161 switch (version) {
162 case 0: {
163 struct ipc_kludge tmp;
164 if (!ptr)
165 return -EINVAL;
166
167 if (copy_from_user(&tmp,
168 (struct ipc_kludge __user *) ptr,
169 sizeof (tmp)))
170 return -EFAULT;
171 return sys_msgrcv (first, tmp.msgp, second,
172 tmp.msgtyp, third);
173 }
174 default:
175 return sys_msgrcv (first,
176 (struct msgbuf __user *) ptr,
177 second, fifth, third);
178 }
179 case MSGGET:
180 return sys_msgget ((key_t) first, second);
181 case MSGCTL:
182 return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
183
184 case SHMAT:
185 switch (version) {
186 default: {
187 ulong raddr;
188 ret = do_shmat (first, (char __user *) ptr, second, &raddr);
189 if (ret)
190 return ret;
191 return put_user (raddr, (ulong __user *) third);
192 }
193 case 1: /* iBCS2 emulator entry point */
194 if (!segment_eq(get_fs(), get_ds()))
195 return -EINVAL;
196 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
197 return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
198 }
199 case SHMDT:
200 return sys_shmdt ((char __user *)ptr);
201 case SHMGET:
202 return sys_shmget (first, second, third);
203 case SHMCTL:
204 return sys_shmctl (first, second,
205 (struct shmid_ds __user *) ptr);
206 default:
207 return -ENOSYS;
208 }
209}
210
211/*
212 * Old cruft
213 */
214asmlinkage int sys_uname(struct old_utsname __user * name)
215{
216 int err;
217 if (!name)
218 return -EFAULT;
219 down_read(&uts_sem);
220 err=copy_to_user(name, &system_utsname, sizeof (*name));
221 up_read(&uts_sem);
222 return err?-EFAULT:0;
223}
224
225asmlinkage int sys_olduname(struct oldold_utsname __user * name)
226{
227 int error;
228
229 if (!name)
230 return -EFAULT;
231 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
232 return -EFAULT;
233
234 down_read(&uts_sem);
235
236 error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
237 error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
238 error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
239 error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
240 error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
241 error |= __put_user(0,name->release+__OLD_UTS_LEN);
242 error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
243 error |= __put_user(0,name->version+__OLD_UTS_LEN);
244 error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
245 error |= __put_user(0,name->machine+__OLD_UTS_LEN);
246
247 up_read(&uts_sem);
248
249 error = error ? -EFAULT : 0;
250
251 return error;
252}
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
new file mode 100644
index 000000000000..960d8bd137d0
--- /dev/null
+++ b/arch/i386/kernel/sysenter.c
@@ -0,0 +1,65 @@
1/*
2 * linux/arch/i386/kernel/sysenter.c
3 *
4 * (C) Copyright 2002 Linus Torvalds
5 *
6 * This file contains the needed initializations to support sysenter.
7 */
8
9#include <linux/init.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12#include <linux/sched.h>
13#include <linux/gfp.h>
14#include <linux/string.h>
15#include <linux/elf.h>
16
17#include <asm/cpufeature.h>
18#include <asm/msr.h>
19#include <asm/pgtable.h>
20#include <asm/unistd.h>
21
22extern asmlinkage void sysenter_entry(void);
23
24void enable_sep_cpu(void *info)
25{
26 int cpu = get_cpu();
27 struct tss_struct *tss = &per_cpu(init_tss, cpu);
28
29 tss->ss1 = __KERNEL_CS;
30 tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
31 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
32 wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
33 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
34 put_cpu();
35}
36
37/*
38 * These symbols are defined by vsyscall.o to mark the bounds
39 * of the ELF DSO images included therein.
40 */
41extern const char vsyscall_int80_start, vsyscall_int80_end;
42extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
43
44static int __init sysenter_setup(void)
45{
46 void *page = (void *)get_zeroed_page(GFP_ATOMIC);
47
48 __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
49
50 if (!boot_cpu_has(X86_FEATURE_SEP)) {
51 memcpy(page,
52 &vsyscall_int80_start,
53 &vsyscall_int80_end - &vsyscall_int80_start);
54 return 0;
55 }
56
57 memcpy(page,
58 &vsyscall_sysenter_start,
59 &vsyscall_sysenter_end - &vsyscall_sysenter_start);
60
61 on_each_cpu(enable_sep_cpu, NULL, 1, 1);
62 return 0;
63}
64
65__initcall(sysenter_setup);
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
new file mode 100644
index 000000000000..9b55e30e4490
--- /dev/null
+++ b/arch/i386/kernel/time.c
@@ -0,0 +1,476 @@
1/*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27 * because was not accounting lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
32
33#include <linux/errno.h>
34#include <linux/sched.h>
35#include <linux/kernel.h>
36#include <linux/param.h>
37#include <linux/string.h>
38#include <linux/mm.h>
39#include <linux/interrupt.h>
40#include <linux/time.h>
41#include <linux/delay.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44#include <linux/module.h>
45#include <linux/sysdev.h>
46#include <linux/bcd.h>
47#include <linux/efi.h>
48#include <linux/mca.h>
49
50#include <asm/io.h>
51#include <asm/smp.h>
52#include <asm/irq.h>
53#include <asm/msr.h>
54#include <asm/delay.h>
55#include <asm/mpspec.h>
56#include <asm/uaccess.h>
57#include <asm/processor.h>
58#include <asm/timer.h>
59
60#include "mach_time.h"
61
62#include <linux/timex.h>
63#include <linux/config.h>
64
65#include <asm/hpet.h>
66
67#include <asm/arch_hooks.h>
68
69#include "io_ports.h"
70
71extern spinlock_t i8259A_lock;
72int pit_latch_buggy; /* extern */
73
74#include "do_timer.h"
75
76u64 jiffies_64 = INITIAL_JIFFIES;
77
78EXPORT_SYMBOL(jiffies_64);
79
80unsigned long cpu_khz; /* Detected as we calibrate the TSC */
81
82extern unsigned long wall_jiffies;
83
84DEFINE_SPINLOCK(rtc_lock);
85
86DEFINE_SPINLOCK(i8253_lock);
87EXPORT_SYMBOL(i8253_lock);
88
89struct timer_opts *cur_timer = &timer_none;
90
91/*
92 * This is a special lock that is owned by the CPU and holds the index
93 * register we are working with. It is required for NMI access to the
94 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
95 */
96volatile unsigned long cmos_lock = 0;
97EXPORT_SYMBOL(cmos_lock);
98
99/* Routines for accessing the CMOS RAM/RTC. */
100unsigned char rtc_cmos_read(unsigned char addr)
101{
102 unsigned char val;
103 lock_cmos_prefix(addr);
104 outb_p(addr, RTC_PORT(0));
105 val = inb_p(RTC_PORT(1));
106 lock_cmos_suffix(addr);
107 return val;
108}
109EXPORT_SYMBOL(rtc_cmos_read);
110
111void rtc_cmos_write(unsigned char val, unsigned char addr)
112{
113 lock_cmos_prefix(addr);
114 outb_p(addr, RTC_PORT(0));
115 outb_p(val, RTC_PORT(1));
116 lock_cmos_suffix(addr);
117}
118EXPORT_SYMBOL(rtc_cmos_write);
119
120/*
121 * This version of gettimeofday has microsecond resolution
122 * and better than microsecond precision on fast x86 machines with TSC.
123 */
124void do_gettimeofday(struct timeval *tv)
125{
126 unsigned long seq;
127 unsigned long usec, sec;
128 unsigned long max_ntp_tick;
129
130 do {
131 unsigned long lost;
132
133 seq = read_seqbegin(&xtime_lock);
134
135 usec = cur_timer->get_offset();
136 lost = jiffies - wall_jiffies;
137
138 /*
139 * If time_adjust is negative then NTP is slowing the clock
140 * so make sure not to go into next possible interval.
141 * Better to lose some accuracy than have time go backwards..
142 */
143 if (unlikely(time_adjust < 0)) {
144 max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
145 usec = min(usec, max_ntp_tick);
146
147 if (lost)
148 usec += lost * max_ntp_tick;
149 }
150 else if (unlikely(lost))
151 usec += lost * (USEC_PER_SEC / HZ);
152
153 sec = xtime.tv_sec;
154 usec += (xtime.tv_nsec / 1000);
155 } while (read_seqretry(&xtime_lock, seq));
156
157 while (usec >= 1000000) {
158 usec -= 1000000;
159 sec++;
160 }
161
162 tv->tv_sec = sec;
163 tv->tv_usec = usec;
164}
165
166EXPORT_SYMBOL(do_gettimeofday);
167
168int do_settimeofday(struct timespec *tv)
169{
170 time_t wtm_sec, sec = tv->tv_sec;
171 long wtm_nsec, nsec = tv->tv_nsec;
172
173 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
174 return -EINVAL;
175
176 write_seqlock_irq(&xtime_lock);
177 /*
178 * This is revolting. We need to set "xtime" correctly. However, the
179 * value in this location is the value at the most recent update of
180 * wall time. Discover what correction gettimeofday() would have
181 * made, and then undo it!
182 */
183 nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
184 nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
185
186 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
187 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
188
189 set_normalized_timespec(&xtime, sec, nsec);
190 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
191
192 time_adjust = 0; /* stop active adjtime() */
193 time_status |= STA_UNSYNC;
194 time_maxerror = NTP_PHASE_LIMIT;
195 time_esterror = NTP_PHASE_LIMIT;
196 write_sequnlock_irq(&xtime_lock);
197 clock_was_set();
198 return 0;
199}
200
201EXPORT_SYMBOL(do_settimeofday);
202
203static int set_rtc_mmss(unsigned long nowtime)
204{
205 int retval;
206
207 WARN_ON(irqs_disabled());
208
209 /* gets recalled with irq locally disabled */
210 spin_lock_irq(&rtc_lock);
211 if (efi_enabled)
212 retval = efi_set_rtc_mmss(nowtime);
213 else
214 retval = mach_set_rtc_mmss(nowtime);
215 spin_unlock_irq(&rtc_lock);
216
217 return retval;
218}
219
220
221int timer_ack;
222
223/* monotonic_clock(): returns # of nanoseconds passed since time_init()
224 * Note: This function is required to return accurate
225 * time even in the absence of multiple timer ticks.
226 */
227unsigned long long monotonic_clock(void)
228{
229 return cur_timer->monotonic_clock();
230}
231EXPORT_SYMBOL(monotonic_clock);
232
233#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
234unsigned long profile_pc(struct pt_regs *regs)
235{
236 unsigned long pc = instruction_pointer(regs);
237
238 if (in_lock_functions(pc))
239 return *(unsigned long *)(regs->ebp + 4);
240
241 return pc;
242}
243EXPORT_SYMBOL(profile_pc);
244#endif
245
246/*
247 * timer_interrupt() needs to keep up the real-time clock,
248 * as well as call the "do_timer()" routine every clocktick
249 */
250static inline void do_timer_interrupt(int irq, void *dev_id,
251 struct pt_regs *regs)
252{
253#ifdef CONFIG_X86_IO_APIC
254 if (timer_ack) {
255 /*
256 * Subtle, when I/O APICs are used we have to ack timer IRQ
257 * manually to reset the IRR bit for do_slow_gettimeoffset().
258 * This will also deassert NMI lines for the watchdog if run
259 * on an 82489DX-based system.
260 */
261 spin_lock(&i8259A_lock);
262 outb(0x0c, PIC_MASTER_OCW3);
263 /* Ack the IRQ; AEOI will end it automatically. */
264 inb(PIC_MASTER_POLL);
265 spin_unlock(&i8259A_lock);
266 }
267#endif
268
269 do_timer_interrupt_hook(regs);
270
271
272 if (MCA_bus) {
273 /* The PS/2 uses level-triggered interrupts. You can't
274 turn them off, nor would you want to (any attempt to
275 enable edge-triggered interrupts usually gets intercepted by a
276 special hardware circuit). Hence we have to acknowledge
277 the timer interrupt. Through some incredibly stupid
278 design idea, the reset for IRQ 0 is done by setting the
279 high bit of the PPI port B (0x61). Note that some PS/2s,
280 notably the 55SX, work fine if this is removed. */
281
282 irq = inb_p( 0x61 ); /* read the current state */
283 outb_p( irq|0x80, 0x61 ); /* reset the IRQ */
284 }
285}
286
287/*
288 * This is the same as the above, except we _also_ save the current
289 * Time Stamp Counter value at the time of the timer interrupt, so that
290 * we later on can estimate the time of day more exactly.
291 */
292irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
293{
294 /*
295 * Here we are in the timer irq handler. We just have irqs locally
296 * disabled but we don't know if the timer_bh is running on the other
 297	 * CPU. We need to avoid an SMP race with it. NOTE: we don't need
298 * the irq version of write_lock because as just said we have irq
299 * locally disabled. -arca
300 */
301 write_seqlock(&xtime_lock);
302
303 cur_timer->mark_offset();
304
305 do_timer_interrupt(irq, NULL, regs);
306
307 write_sequnlock(&xtime_lock);
308 return IRQ_HANDLED;
309}
310
311/* not static: needed by APM */
312unsigned long get_cmos_time(void)
313{
314 unsigned long retval;
315
316 spin_lock(&rtc_lock);
317
318 if (efi_enabled)
319 retval = efi_get_time();
320 else
321 retval = mach_get_cmos_time();
322
323 spin_unlock(&rtc_lock);
324
325 return retval;
326}
327static void sync_cmos_clock(unsigned long dummy);
328
329static struct timer_list sync_cmos_timer =
330 TIMER_INITIALIZER(sync_cmos_clock, 0, 0);
331
332static void sync_cmos_clock(unsigned long dummy)
333{
334 struct timeval now, next;
335 int fail = 1;
336
337 /*
338 * If we have an externally synchronized Linux clock, then update
339 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
340 * called as close as possible to 500 ms before the new second starts.
341 * This code is run on a timer. If the clock is set, that timer
342 * may not expire at the correct time. Thus, we adjust...
343 */
344 if ((time_status & STA_UNSYNC) != 0)
345 /*
346 * Not synced, exit, do not restart a timer (if one is
347 * running, let it run out).
348 */
349 return;
350
351 do_gettimeofday(&now);
352 if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
353 now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
354 fail = set_rtc_mmss(now.tv_sec);
355
356 next.tv_usec = USEC_AFTER - now.tv_usec;
357 if (next.tv_usec <= 0)
358 next.tv_usec += USEC_PER_SEC;
359
360 if (!fail)
361 next.tv_sec = 659;
362 else
363 next.tv_sec = 0;
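	/* 659 s is just under the ~11 minutes mentioned above, so a successful
	 * CMOS update schedules the next sync roughly 11 minutes later. */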
364
365 if (next.tv_usec >= USEC_PER_SEC) {
366 next.tv_sec++;
367 next.tv_usec -= USEC_PER_SEC;
368 }
369 mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
370}
371
372void notify_arch_cmos_timer(void)
373{
374 mod_timer(&sync_cmos_timer, jiffies + 1);
375}
376
377static long clock_cmos_diff, sleep_start;
378
379static int timer_suspend(struct sys_device *dev, u32 state)
380{
381 /*
382 * Estimate time zone so that set_time can update the clock
383 */
384 clock_cmos_diff = -get_cmos_time();
385 clock_cmos_diff += get_seconds();
386 sleep_start = get_cmos_time();
387 return 0;
388}
389
390static int timer_resume(struct sys_device *dev)
391{
392 unsigned long flags;
393 unsigned long sec;
394 unsigned long sleep_length;
395
396#ifdef CONFIG_HPET_TIMER
397 if (is_hpet_enabled())
398 hpet_reenable();
399#endif
400 sec = get_cmos_time() + clock_cmos_diff;
401 sleep_length = (get_cmos_time() - sleep_start) * HZ;
402 write_seqlock_irqsave(&xtime_lock, flags);
403 xtime.tv_sec = sec;
404 xtime.tv_nsec = 0;
405 write_sequnlock_irqrestore(&xtime_lock, flags);
406 jiffies += sleep_length;
407 wall_jiffies += sleep_length;
408 return 0;
409}
410
411static struct sysdev_class timer_sysclass = {
412 .resume = timer_resume,
413 .suspend = timer_suspend,
414 set_kset_name("timer"),
415};
416
417
418/* XXX this driverfs stuff should probably go elsewhere later -john */
419static struct sys_device device_timer = {
420 .id = 0,
421 .cls = &timer_sysclass,
422};
423
424static int time_init_device(void)
425{
426 int error = sysdev_class_register(&timer_sysclass);
427 if (!error)
428 error = sysdev_register(&device_timer);
429 return error;
430}
431
432device_initcall(time_init_device);
433
434#ifdef CONFIG_HPET_TIMER
435extern void (*late_time_init)(void);
436/* Duplicate of time_init() below, with hpet_enable part added */
437static void __init hpet_time_init(void)
438{
439 xtime.tv_sec = get_cmos_time();
440 xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
441 set_normalized_timespec(&wall_to_monotonic,
442 -xtime.tv_sec, -xtime.tv_nsec);
443
444 if (hpet_enable() >= 0) {
445 printk("Using HPET for base-timer\n");
446 }
447
448 cur_timer = select_timer();
449 printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
450
451 time_init_hook();
452}
453#endif
454
455void __init time_init(void)
456{
457#ifdef CONFIG_HPET_TIMER
458 if (is_hpet_capable()) {
459 /*
460 * HPET initialization needs to do memory-mapped io. So, let
461 * us do a late initialization after mem_init().
462 */
463 late_time_init = hpet_time_init;
464 return;
465 }
466#endif
467 xtime.tv_sec = get_cmos_time();
468 xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
469 set_normalized_timespec(&wall_to_monotonic,
470 -xtime.tv_sec, -xtime.tv_nsec);
471
472 cur_timer = select_timer();
473 printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
474
475 time_init_hook();
476}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
new file mode 100644
index 000000000000..244a31b04be7
--- /dev/null
+++ b/arch/i386/kernel/time_hpet.c
@@ -0,0 +1,458 @@
1/*
2 * linux/arch/i386/kernel/time_hpet.c
3 * This code largely copied from arch/x86_64/kernel/time.c
4 * See that file for credits.
5 *
6 * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support
7 */
8
9#include <linux/errno.h>
10#include <linux/kernel.h>
11#include <linux/param.h>
12#include <linux/string.h>
13#include <linux/init.h>
14#include <linux/smp.h>
15
16#include <asm/timer.h>
17#include <asm/fixmap.h>
18#include <asm/apic.h>
19
20#include <linux/timex.h>
21#include <linux/config.h>
22
23#include <asm/hpet.h>
24#include <linux/hpet.h>
25
26static unsigned long hpet_period; /* fsecs / HPET clock */
27unsigned long hpet_tick; /* hpet clks count per tick */
28unsigned long hpet_address; /* hpet memory map physical address */
29
30static int use_hpet; /* can be used for runtime check of hpet */
31static int boot_hpet_disable; /* boottime override for HPET timer */
32static void __iomem * hpet_virt_address; /* hpet kernel virtual address */
33
34#define FSEC_TO_USEC (1000000000UL)
35
36int hpet_readl(unsigned long a)
37{
38 return readl(hpet_virt_address + a);
39}
40
41static void hpet_writel(unsigned long d, unsigned long a)
42{
43 writel(d, hpet_virt_address + a);
44}
45
46#ifdef CONFIG_X86_LOCAL_APIC
47/*
 48 * HPET counters don't wrap around on every tick. They just change the
49 * comparator value and continue. Next tick can be caught by checking
50 * for a change in the comparator value. Used in apic.c.
51 */
52static void __init wait_hpet_tick(void)
53{
54 unsigned int start_cmp_val, end_cmp_val;
55
56 start_cmp_val = hpet_readl(HPET_T0_CMP);
57 do {
58 end_cmp_val = hpet_readl(HPET_T0_CMP);
59 } while (start_cmp_val == end_cmp_val);
60}
61#endif
62
63static int hpet_timer_stop_set_go(unsigned long tick)
64{
65 unsigned int cfg;
66
67 /*
68 * Stop the timers and reset the main counter.
69 */
70 cfg = hpet_readl(HPET_CFG);
71 cfg &= ~HPET_CFG_ENABLE;
72 hpet_writel(cfg, HPET_CFG);
73 hpet_writel(0, HPET_COUNTER);
74 hpet_writel(0, HPET_COUNTER + 4);
75
76 /*
77 * Set up timer 0, as periodic with first interrupt to happen at
78 * hpet_tick, and period also hpet_tick.
79 */
80 cfg = hpet_readl(HPET_T0_CFG);
81 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
82 HPET_TN_SETVAL | HPET_TN_32BIT;
83 hpet_writel(cfg, HPET_T0_CFG);
84
85 /*
86 * The first write after writing TN_SETVAL to the config register sets
87 * the counter value, the second write sets the threshold.
88 */
89 hpet_writel(tick, HPET_T0_CMP);
90 hpet_writel(tick, HPET_T0_CMP);
91
92 /*
93 * Go!
94 */
95 cfg = hpet_readl(HPET_CFG);
96 cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
97 hpet_writel(cfg, HPET_CFG);
98
99 return 0;
100}
101
102/*
 103 * Check whether HPET was found by the ACPI boot parse. If so, set up
 104 * HPET timer 0 as the kernel base timer.
105 */
106int __init hpet_enable(void)
107{
108 unsigned int id;
109 unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */
110 unsigned long hpet_tick_rem;
111
112 if (boot_hpet_disable)
113 return -1;
114
115 if (!hpet_address) {
116 return -1;
117 }
118 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
119 /*
120 * Read the period, compute tick and quotient.
121 */
122 id = hpet_readl(HPET_ID);
123
124 /*
 125	 * We check for a value of '1' or more in the number field if
 126	 * CONFIG_HPET_EMULATE_RTC is set, because we will need an
 127	 * additional timer for RTC emulation.
 128	 * Otherwise we can make do with one timer, using the single
 129	 * HPET timer for system time.
130 */
131 if (
132#ifdef CONFIG_HPET_EMULATE_RTC
133 !(id & HPET_ID_NUMBER) ||
134#endif
135 !(id & HPET_ID_LEGSUP))
136 return -1;
137
138 hpet_period = hpet_readl(HPET_PERIOD);
139 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD))
140 return -1;
141
142 /*
143 * 64 bit math
144 * First changing tick into fsec
145 * Then 64 bit div to find number of hpet clk per tick
146 */
147 ASM_MUL64_REG(tick_fsec_low, tick_fsec_high,
148 KERNEL_TICK_USEC, FSEC_TO_USEC);
149 ASM_DIV64_REG(hpet_tick, hpet_tick_rem,
150 hpet_period, tick_fsec_low, tick_fsec_high);
151
152 if (hpet_tick_rem > (hpet_period >> 1))
153 hpet_tick++; /* rounding the result */
154
155 if (hpet_timer_stop_set_go(hpet_tick))
156 return -1;
157
158 use_hpet = 1;
159
160#ifdef CONFIG_HPET
161 {
162 struct hpet_data hd;
163 unsigned int ntimer;
164
165 memset(&hd, 0, sizeof (hd));
166
167 ntimer = hpet_readl(HPET_ID);
168 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
169 ntimer++;
170
171 /*
 172		 * Timer0 and Timer1 are used by the platform.
173 * Timer0 and Timer1 is used by platform.
174 */
175 hd.hd_phys_address = hpet_address;
176 hd.hd_address = hpet_virt_address;
177 hd.hd_nirqs = ntimer;
178 hd.hd_flags = HPET_DATA_PLATFORM;
179 hpet_reserve_timer(&hd, 0);
180#ifdef CONFIG_HPET_EMULATE_RTC
181 hpet_reserve_timer(&hd, 1);
182#endif
183 hd.hd_irq[0] = HPET_LEGACY_8254;
184 hd.hd_irq[1] = HPET_LEGACY_RTC;
185 if (ntimer > 2) {
186 struct hpet __iomem *hpet;
187 struct hpet_timer __iomem *timer;
188 int i;
189
190 hpet = hpet_virt_address;
191
192 for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
193 timer++, i++)
194 hd.hd_irq[i] = (timer->hpet_config &
195 Tn_INT_ROUTE_CNF_MASK) >>
196 Tn_INT_ROUTE_CNF_SHIFT;
197
198 }
199
200 hpet_alloc(&hd);
201 }
202#endif
203
204#ifdef CONFIG_X86_LOCAL_APIC
205 wait_timer_tick = wait_hpet_tick;
206#endif
207 return 0;
208}
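For a concrete feel of the tick math in hpet_enable() above, a minimal userspace sketch follows; the 14.31818 MHz period (69841279 fs) and HZ=100 (KERNEL_TICK_USEC = 10000) are assumptions for illustration, not values read from this file:

#include <stdio.h>

int main(void)
{
	unsigned long long tick_usec   = 10000;          /* assumed: HZ = 100          */
	unsigned long long fsec_per_us = 1000000000ULL;  /* FSEC_TO_USEC               */
	unsigned long long period_fs   = 69841279;       /* assumed: 14.31818 MHz HPET */
	unsigned long long tick_fsec   = tick_usec * fsec_per_us;
	unsigned long long hpet_tick   = tick_fsec / period_fs;

	if (tick_fsec % period_fs > period_fs / 2)
		hpet_tick++;                              /* same rounding as above */

	printf("hpet clocks per kernel tick: %llu\n", hpet_tick);  /* ~143182 */
	return 0;
}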
209
210int hpet_reenable(void)
211{
212 return hpet_timer_stop_set_go(hpet_tick);
213}
214
215int is_hpet_enabled(void)
216{
217 return use_hpet;
218}
219
220int is_hpet_capable(void)
221{
222 if (!boot_hpet_disable && hpet_address)
223 return 1;
224 return 0;
225}
226
227static int __init hpet_setup(char* str)
228{
229 if (str) {
230 if (!strncmp("disable", str, 7))
231 boot_hpet_disable = 1;
232 }
233 return 1;
234}
235
236__setup("hpet=", hpet_setup);
237
238#ifdef CONFIG_HPET_EMULATE_RTC
239/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
240 * is enabled, we support RTC interrupt functionality in software.
241 * RTC has 3 kinds of interrupts:
242 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
243 * is updated
244 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
245 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
246 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
247 * (1) and (2) above are implemented using polling at a frequency of
248 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
249 * overhead. (DEFAULT_RTC_INT_FREQ)
250 * For (3), we use interrupts at 64Hz or user specified periodic
251 * frequency, whichever is higher.
252 */
253#include <linux/mc146818rtc.h>
254#include <linux/rtc.h>
255
256extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
257
258#define DEFAULT_RTC_INT_FREQ 64
259#define RTC_NUM_INTS 1
260
261static unsigned long UIE_on;
262static unsigned long prev_update_sec;
263
264static unsigned long AIE_on;
265static struct rtc_time alarm_time;
266
267static unsigned long PIE_on;
268static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
269static unsigned long PIE_count;
270
271static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
272
273/*
274 * Timer 1 for RTC, we do not use periodic interrupt feature,
275 * even if HPET supports periodic interrupts on Timer 1.
276 * The reason being, to set up a periodic interrupt in HPET, we need to
 277 * stop the main counter. And if we do that every time someone disables/enables
 278 * RTC, we will have an adverse effect on the main kernel timer running on Timer 0.
279 * So, for the time being, simulate the periodic interrupt in software.
280 *
 281 * hpet_rtc_timer_init() is called for the first time, and during subsequent
 282 * interrupts reinit happens through hpet_rtc_timer_reinit().
283 */
284int hpet_rtc_timer_init(void)
285{
286 unsigned int cfg, cnt;
287 unsigned long flags;
288
289 if (!is_hpet_enabled())
290 return 0;
291 /*
292 * Set the counter 1 and enable the interrupts.
293 */
294 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
295 hpet_rtc_int_freq = PIE_freq;
296 else
297 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
298
299 local_irq_save(flags);
300 cnt = hpet_readl(HPET_COUNTER);
301 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
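	/*
	 * Example (hypothetical values): with HZ=100, hpet_tick ~143182
	 * (14.31818 MHz HPET) and the default 64 Hz RTC emulation frequency,
	 * this advances the comparator by ~223721 HPET clocks, i.e. ~1/64 s.
	 */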
302 hpet_writel(cnt, HPET_T1_CMP);
303 local_irq_restore(flags);
304
305 cfg = hpet_readl(HPET_T1_CFG);
306 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
307 hpet_writel(cfg, HPET_T1_CFG);
308
309 return 1;
310}
311
312static void hpet_rtc_timer_reinit(void)
313{
314 unsigned int cfg, cnt;
315
316 if (!(PIE_on | AIE_on | UIE_on))
317 return;
318
319 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
320 hpet_rtc_int_freq = PIE_freq;
321 else
322 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
323
 324	/* It is more accurate to use the comparator value than the current count. */
325 cnt = hpet_readl(HPET_T1_CMP);
326 cnt += hpet_tick*HZ/hpet_rtc_int_freq;
327 hpet_writel(cnt, HPET_T1_CMP);
328
329 cfg = hpet_readl(HPET_T1_CFG);
330 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
331 hpet_writel(cfg, HPET_T1_CFG);
332
333 return;
334}
335
336/*
337 * The functions below are called from rtc driver.
338 * Return 0 if HPET is not being used.
339 * Otherwise do the necessary changes and return 1.
340 */
341int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
342{
343 if (!is_hpet_enabled())
344 return 0;
345
346 if (bit_mask & RTC_UIE)
347 UIE_on = 0;
348 if (bit_mask & RTC_PIE)
349 PIE_on = 0;
350 if (bit_mask & RTC_AIE)
351 AIE_on = 0;
352
353 return 1;
354}
355
356int hpet_set_rtc_irq_bit(unsigned long bit_mask)
357{
358 int timer_init_reqd = 0;
359
360 if (!is_hpet_enabled())
361 return 0;
362
363 if (!(PIE_on | AIE_on | UIE_on))
364 timer_init_reqd = 1;
365
366 if (bit_mask & RTC_UIE) {
367 UIE_on = 1;
368 }
369 if (bit_mask & RTC_PIE) {
370 PIE_on = 1;
371 PIE_count = 0;
372 }
373 if (bit_mask & RTC_AIE) {
374 AIE_on = 1;
375 }
376
377 if (timer_init_reqd)
378 hpet_rtc_timer_init();
379
380 return 1;
381}
382
383int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
384{
385 if (!is_hpet_enabled())
386 return 0;
387
388 alarm_time.tm_hour = hrs;
389 alarm_time.tm_min = min;
390 alarm_time.tm_sec = sec;
391
392 return 1;
393}
394
395int hpet_set_periodic_freq(unsigned long freq)
396{
397 if (!is_hpet_enabled())
398 return 0;
399
400 PIE_freq = freq;
401 PIE_count = 0;
402
403 return 1;
404}
405
406int hpet_rtc_dropped_irq(void)
407{
408 if (!is_hpet_enabled())
409 return 0;
410
411 return 1;
412}
413
414irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
415{
416 struct rtc_time curr_time;
417 unsigned long rtc_int_flag = 0;
418 int call_rtc_interrupt = 0;
419
420 hpet_rtc_timer_reinit();
421
422 if (UIE_on | AIE_on) {
423 rtc_get_rtc_time(&curr_time);
424 }
425 if (UIE_on) {
426 if (curr_time.tm_sec != prev_update_sec) {
427 /* Set update int info, call real rtc int routine */
428 call_rtc_interrupt = 1;
429 rtc_int_flag = RTC_UF;
430 prev_update_sec = curr_time.tm_sec;
431 }
432 }
433 if (PIE_on) {
434 PIE_count++;
435 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
436 /* Set periodic int info, call real rtc int routine */
437 call_rtc_interrupt = 1;
438 rtc_int_flag |= RTC_PF;
439 PIE_count = 0;
440 }
441 }
442 if (AIE_on) {
443 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
444 (curr_time.tm_min == alarm_time.tm_min) &&
445 (curr_time.tm_hour == alarm_time.tm_hour)) {
446 /* Set alarm int info, call real rtc int routine */
447 call_rtc_interrupt = 1;
448 rtc_int_flag |= RTC_AF;
449 }
450 }
451 if (call_rtc_interrupt) {
452 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
453 rtc_interrupt(rtc_int_flag, dev_id, regs);
454 }
455 return IRQ_HANDLED;
456}
457#endif
458
diff --git a/arch/i386/kernel/timers/Makefile b/arch/i386/kernel/timers/Makefile
new file mode 100644
index 000000000000..8fa12be658dd
--- /dev/null
+++ b/arch/i386/kernel/timers/Makefile
@@ -0,0 +1,9 @@
1#
2# Makefile for x86 timers
3#
4
5obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o
6
7obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o
8obj-$(CONFIG_HPET_TIMER) += timer_hpet.o
9obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o
diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c
new file mode 100644
index 000000000000..f7f90005e22e
--- /dev/null
+++ b/arch/i386/kernel/timers/common.c
@@ -0,0 +1,160 @@
1/*
2 * Common functions used across the timers go here
3 */
4
5#include <linux/init.h>
6#include <linux/timex.h>
7#include <linux/errno.h>
8#include <linux/jiffies.h>
9
10#include <asm/io.h>
11#include <asm/timer.h>
12#include <asm/hpet.h>
13
14#include "mach_timer.h"
15
16/* ------ Calibrate the TSC -------
17 * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
18 * Too much 64-bit arithmetic here to do this cleanly in C, and for
19 * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
20 * output busy loop as low as possible. We avoid reading the CTC registers
21 * directly because of the awkward 8-bit access mechanism of the 82C54
22 * device.
23 */
24
25#define CALIBRATE_TIME (5 * 1000020/HZ)
26
27unsigned long __init calibrate_tsc(void)
28{
29 mach_prepare_counter();
30
31 {
32 unsigned long startlow, starthigh;
33 unsigned long endlow, endhigh;
34 unsigned long count;
35
36 rdtsc(startlow,starthigh);
37 mach_countup(&count);
38 rdtsc(endlow,endhigh);
39
40
41 /* Error: ECTCNEVERSET */
42 if (count <= 1)
43 goto bad_ctc;
44
45 /* 64-bit subtract - gcc just messes up with long longs */
46 __asm__("subl %2,%0\n\t"
47 "sbbl %3,%1"
48 :"=a" (endlow), "=d" (endhigh)
49 :"g" (startlow), "g" (starthigh),
50 "0" (endlow), "1" (endhigh));
51
52 /* Error: ECPUTOOFAST */
53 if (endhigh)
54 goto bad_ctc;
55
56 /* Error: ECPUTOOSLOW */
57 if (endlow <= CALIBRATE_TIME)
58 goto bad_ctc;
59
60 __asm__("divl %2"
61 :"=a" (endlow), "=d" (endhigh)
62 :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME));
63
64 return endlow;
65 }
66
67 /*
68 * The CTC wasn't reliable: we got a hit on the very first read,
69 * or the CPU was so fast/slow that the quotient wouldn't fit in
70 * 32 bits..
71 */
72bad_ctc:
73 return 0;
74}
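
The divl above computes (CALIBRATE_TIME * 2^32) / (TSC delta), i.e. 2^32 divided by the number of TSC clocks per microsecond. A minimal sketch of the same quotient in plain 64-bit C, with a hypothetical TSC delta (the real code keeps the inline asm to control the 64-bit math exactly):

#include <stdint.h>
#include <stdio.h>

/* quotient = (calibration period in usecs * 2^32) / TSC clocks elapsed;
 * the caller has already rejected deltas that overflow 32 bits */
static uint32_t tsc_quotient(uint32_t tsc_delta, uint32_t calibrate_usecs)
{
	return (uint32_t)(((uint64_t)calibrate_usecs << 32) / tsc_delta);
}

int main(void)
{
	/* hypothetical: ~2.4 GHz CPU measured over a ~50 ms calibration run */
	printf("%u\n", tsc_quotient(120000000U, 50001U));
	return 0;
}
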
75
76#ifdef CONFIG_HPET_TIMER
77/* ------ Calibrate the TSC using HPET -------
78 * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq.
79 * The second output, returned through parameter 1 when it is non-NULL,
80 * is set to 2^32 * (1 / (tsc per HPET clk)) for delay_hpet().
81 * calibrate_tsc_hpet() calibrates the processor TSC by comparing
82 * it to the HPET timer of known frequency.
83 * Too much 64-bit arithmetic here to do this cleanly in C
84 */
85#define CALIBRATE_CNT_HPET (5 * hpet_tick)
86#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC)
87
88unsigned long __init calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr)
89{
90 unsigned long tsc_startlow, tsc_starthigh;
91 unsigned long tsc_endlow, tsc_endhigh;
92 unsigned long hpet_start, hpet_end;
93 unsigned long result, remain;
94
95 hpet_start = hpet_readl(HPET_COUNTER);
96 rdtsc(tsc_startlow, tsc_starthigh);
97 do {
98 hpet_end = hpet_readl(HPET_COUNTER);
99 } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET);
100 rdtsc(tsc_endlow, tsc_endhigh);
101
102 /* 64-bit subtract - gcc just messes up with long longs */
103 __asm__("subl %2,%0\n\t"
104 "sbbl %3,%1"
105 :"=a" (tsc_endlow), "=d" (tsc_endhigh)
106 :"g" (tsc_startlow), "g" (tsc_starthigh),
107 "0" (tsc_endlow), "1" (tsc_endhigh));
108
109 /* Error: ECPUTOOFAST */
110 if (tsc_endhigh)
111 goto bad_calibration;
112
113 /* Error: ECPUTOOSLOW */
114 if (tsc_endlow <= CALIBRATE_TIME_HPET)
115 goto bad_calibration;
116
117 ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET);
118 if (remain > (tsc_endlow >> 1))
119 result++; /* rounding the result */
120
121 if (tsc_hpet_quotient_ptr) {
122 unsigned long tsc_hpet_quotient;
123
124 ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0,
125 CALIBRATE_CNT_HPET);
126 if (remain > (tsc_endlow >> 1))
127 tsc_hpet_quotient++; /* rounding the result */
128 *tsc_hpet_quotient_ptr = tsc_hpet_quotient;
129 }
130
131 return result;
132bad_calibration:
133 /*
134 * the CPU was so fast/slow that the quotient wouldn't fit in
135 * 32 bits..
136 */
137 return 0;
138}
139#endif
140
141/* calculate cpu_khz */
142void __init init_cpu_khz(void)
143{
144 if (cpu_has_tsc) {
145 unsigned long tsc_quotient = calibrate_tsc();
146 if (tsc_quotient) {
147	/* report CPU clock rate in kHz.
148	 * The formula is (10^3 * 2^32) / (2^32 * 1 / (clocks/us)) =
149	 * clocks/msec. Our precision is about 100 ppm.
150 */
151 { unsigned long eax=0, edx=1000;
152 __asm__("divl %2"
153 :"=a" (cpu_khz), "=d" (edx)
154 :"r" (tsc_quotient),
155 "0" (eax), "1" (edx));
156 printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000);
157 }
158 }
159 }
160}
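
The divl in init_cpu_khz() then turns the calibration quotient back into a clock rate: (10^3 * 2^32) / tsc_quotient is the number of TSC clocks per millisecond, i.e. cpu_khz. A small sketch with a hypothetical quotient value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t tsc_quotient = 1789569;   /* hypothetical, roughly a 2.4 GHz CPU */
	uint32_t cpu_khz = (uint32_t)((1000ULL << 32) / tsc_quotient);

	printf("Detected %u.%03u MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000);
	return 0;
}
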
diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c
new file mode 100644
index 000000000000..a3d6a288088b
--- /dev/null
+++ b/arch/i386/kernel/timers/timer.c
@@ -0,0 +1,66 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/string.h>
4#include <asm/timer.h>
5
6#ifdef CONFIG_HPET_TIMER
7/*
8 * HPET memory read is slower than tsc reads, but is more dependable as it
9 * always runs at constant frequency and reduces complexity due to
10 * cpufreq. So we prefer the HPET timer to the TSC-based one. Also, we cannot
11 * use timer_pit when HPET is active, so in that case timer_tsc is the fallback.
12 */
13#endif
14/* list of timers, ordered by preference, NULL terminated */
15static struct init_timer_opts* __initdata timers[] = {
16#ifdef CONFIG_X86_CYCLONE_TIMER
17 &timer_cyclone_init,
18#endif
19#ifdef CONFIG_HPET_TIMER
20 &timer_hpet_init,
21#endif
22#ifdef CONFIG_X86_PM_TIMER
23 &timer_pmtmr_init,
24#endif
25 &timer_tsc_init,
26 &timer_pit_init,
27 NULL,
28};
29
30static char clock_override[10] __initdata;
31
32static int __init clock_setup(char* str)
33{
34 if (str)
35 strlcpy(clock_override, str, sizeof(clock_override));
36 return 1;
37}
38__setup("clock=", clock_setup);
39
40
41/* The chosen timesource has been found to be bad.
42 * Fall back to a known good timesource (the PIT)
43 */
44void clock_fallback(void)
45{
46 cur_timer = &timer_pit;
47}
48
49/* iterates through the list of timers, returning the first
50 * one that initializes successfully.
51 */
52struct timer_opts* __init select_timer(void)
53{
54 int i = 0;
55
56 /* find most preferred working timer */
57 while (timers[i]) {
58 if (timers[i]->init)
59 if (timers[i]->init(clock_override) == 0)
60 return timers[i]->opts;
61 ++i;
62 }
63
64 panic("select_timer: Cannot find a suitable timer\n");
65 return NULL;
66}
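
select_timer() is a plain priority list: walk the timers[] array in preference order and return the opts of the first entry whose init() accepts the clock= override. A minimal standalone sketch of the same "first init that succeeds wins" pattern, using hypothetical timer names:

#include <stdio.h>

struct opts { const char *name; };
struct init_opts {
	int (*init)(const char *override);
	struct opts *opts;
};

static struct opts alpha_opts = { "alpha" }, beta_opts = { "beta" };
static int alpha_init(const char *o) { (void)o; return -1; }  /* pretend it fails */
static int beta_init(const char *o)  { (void)o; return 0; }   /* this one works */

static struct init_opts alpha = { alpha_init, &alpha_opts };
static struct init_opts beta  = { beta_init, &beta_opts };
static struct init_opts *candidates[] = { &alpha, &beta, NULL };

/* first candidate whose init() succeeds wins */
static struct opts *pick(const char *override)
{
	int i;

	for (i = 0; candidates[i]; i++)
		if (candidates[i]->init && candidates[i]->init(override) == 0)
			return candidates[i]->opts;
	return NULL;
}

int main(void)
{
	struct opts *o = pick("");
	printf("%s\n", o ? o->name : "none");
	return 0;
}
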
diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c
new file mode 100644
index 000000000000..f6f1206a11bb
--- /dev/null
+++ b/arch/i386/kernel/timers/timer_cyclone.c
@@ -0,0 +1,259 @@
1/* Cyclone-timer:
2 * This code implements timer_ops for the cyclone counter found
3 * on IBM x440, x360, and other Summit based systems.
4 *
5 * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com)
6 */
7
8
9#include <linux/spinlock.h>
10#include <linux/init.h>
11#include <linux/timex.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/jiffies.h>
15
16#include <asm/timer.h>
17#include <asm/io.h>
18#include <asm/pgtable.h>
19#include <asm/fixmap.h>
20#include "io_ports.h"
21
22extern spinlock_t i8253_lock;
23
24/* Number of usecs that the last interrupt was delayed */
25static int delay_at_last_interrupt;
26
27#define CYCLONE_CBAR_ADDR 0xFEB00CD0
28#define CYCLONE_PMCC_OFFSET 0x51A0
29#define CYCLONE_MPMC_OFFSET 0x51D0
30#define CYCLONE_MPCS_OFFSET 0x51A8
31#define CYCLONE_TIMER_FREQ 100000000
32#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */
33int use_cyclone = 0;
34
35static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
36static u32 last_cyclone_low;
37static u32 last_cyclone_high;
38static unsigned long long monotonic_base;
39static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
40
41/* helper macro to atomically read both cyclone counter registers */
42#define read_cyclone_counter(low,high) \
43 do{ \
44 high = cyclone_timer[1]; low = cyclone_timer[0]; \
45 } while (high != cyclone_timer[1]);
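
The macro reads the high word, then the low word, and retries if the high word changed in between, so a carry from the low into the high word cannot produce a torn 64-bit value. The same pattern as a small standalone function; the fake MMIO words below exist only to make the sketch runnable:

#include <stdint.h>
#include <stdio.h>

static volatile uint32_t fake_mmio[2] = { 0x12345678u, 0x00000009u };  /* lo, hi */

static uint64_t read_split_counter(volatile uint32_t *lo, volatile uint32_t *hi)
{
	uint32_t h, l;

	do {
		h = *hi;          /* sample the high word first ...            */
		l = *lo;          /* ... then the low word ...                 */
	} while (h != *hi);       /* ... retry if the high word moved meanwhile */

	return ((uint64_t)h << 32) | l;
}

int main(void)
{
	printf("0x%llx\n",
	       (unsigned long long)read_split_counter(&fake_mmio[0], &fake_mmio[1]));
	return 0;
}
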
46
47
48static void mark_offset_cyclone(void)
49{
50 unsigned long lost, delay;
51 unsigned long delta = last_cyclone_low;
52 int count;
53 unsigned long long this_offset, last_offset;
54
55 write_seqlock(&monotonic_lock);
56 last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
57
58 spin_lock(&i8253_lock);
59 read_cyclone_counter(last_cyclone_low,last_cyclone_high);
60
61 /* read values for delay_at_last_interrupt */
62 outb_p(0x00, 0x43); /* latch the count ASAP */
63
64 count = inb_p(0x40); /* read the latched count */
65 count |= inb(0x40) << 8;
66
67 /*
68 * VIA686a test code... reset the latch if count > max + 1
69 * from timer_pit.c - cjb
70 */
71 if (count > LATCH) {
72 outb_p(0x34, PIT_MODE);
73 outb_p(LATCH & 0xff, PIT_CH0);
74 outb(LATCH >> 8, PIT_CH0);
75 count = LATCH - 1;
76 }
77 spin_unlock(&i8253_lock);
78
79 /* lost tick compensation */
80 delta = last_cyclone_low - delta;
81 delta /= (CYCLONE_TIMER_FREQ/1000000);
82 delta += delay_at_last_interrupt;
83 lost = delta/(1000000/HZ);
84 delay = delta%(1000000/HZ);
85 if (lost >= 2)
86 jiffies_64 += lost-1;
87
88 /* update the monotonic base value */
89 this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
90 monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK;
91 write_sequnlock(&monotonic_lock);
92
93 /* calculate delay_at_last_interrupt */
94 count = ((LATCH-1) - count) * TICK_SIZE;
95 delay_at_last_interrupt = (count + LATCH/2) / LATCH;
96
97
98	/* catch corner case where tick rollover occurred
99 * between cyclone and pit reads (as noted when
100 * usec delta is > 90% # of usecs/tick)
101 */
102 if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
103 jiffies_64++;
104}
105
106static unsigned long get_offset_cyclone(void)
107{
108 u32 offset;
109
110 if(!cyclone_timer)
111 return delay_at_last_interrupt;
112
113 /* Read the cyclone timer */
114 offset = cyclone_timer[0];
115
116 /* .. relative to previous jiffy */
117 offset = offset - last_cyclone_low;
118
119 /* convert cyclone ticks to microseconds */
120 /* XXX slow, can we speed this up? */
121 offset = offset/(CYCLONE_TIMER_FREQ/1000000);
122
123 /* our adjusted time offset in microseconds */
124 return delay_at_last_interrupt + offset;
125}
126
127static unsigned long long monotonic_clock_cyclone(void)
128{
129 u32 now_low, now_high;
130 unsigned long long last_offset, this_offset, base;
131 unsigned long long ret;
132 unsigned seq;
133
134 /* atomically read monotonic base & last_offset */
135 do {
136 seq = read_seqbegin(&monotonic_lock);
137 last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
138 base = monotonic_base;
139 } while (read_seqretry(&monotonic_lock, seq));
140
141
142 /* Read the cyclone counter */
143 read_cyclone_counter(now_low,now_high);
144 this_offset = ((unsigned long long)now_high<<32)|now_low;
145
146 /* convert to nanoseconds */
147 ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK);
148 return ret * (1000000000 / CYCLONE_TIMER_FREQ);
149}
150
151static int __init init_cyclone(char* override)
152{
153 u32* reg;
154 u32 base; /* saved cyclone base address */
155 u32 pageaddr; /* page that contains cyclone_timer register */
156 u32 offset; /* offset from pageaddr to cyclone_timer register */
157 int i;
158
159 /* check clock override */
160 if (override[0] && strncmp(override,"cyclone",7))
161 return -ENODEV;
162
163 /*make sure we're on a summit box*/
164 if(!use_cyclone) return -ENODEV;
165
166 printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
167
168 /* find base address */
169 pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK;
170 offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK);
171 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
172 reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
173 if(!reg){
174 printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
175 return -ENODEV;
176 }
177 base = *reg;
178 if(!base){
179 printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
180 return -ENODEV;
181 }
182
183 /* setup PMCC */
184 pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK;
185 offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK);
186 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
187 reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
188 if(!reg){
189 printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
190 return -ENODEV;
191 }
192 reg[0] = 0x00000001;
193
194 /* setup MPCS */
195 pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK;
196 offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK);
197 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
198 reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
199 if(!reg){
200 printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
201 return -ENODEV;
202 }
203 reg[0] = 0x00000001;
204
205 /* map in cyclone_timer */
206 pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK;
207 offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK);
208 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
209 cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
210 if(!cyclone_timer){
211 printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
212 return -ENODEV;
213 }
214
215	/* quick test to make sure it's ticking */
216 for(i=0; i<3; i++){
217 u32 old = cyclone_timer[0];
218 int stall = 100;
219 while(stall--) barrier();
220 if(cyclone_timer[0] == old){
221 printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
222 cyclone_timer = 0;
223 return -ENODEV;
224 }
225 }
226
227 init_cpu_khz();
228
229 /* Everything looks good! */
230 return 0;
231}
232
233
234static void delay_cyclone(unsigned long loops)
235{
236 unsigned long bclock, now;
237 if(!cyclone_timer)
238 return;
239 bclock = cyclone_timer[0];
240 do {
241 rep_nop();
242 now = cyclone_timer[0];
243 } while ((now-bclock) < loops);
244}
245/************************************************************/
246
247/* cyclone timer_opts struct */
248static struct timer_opts timer_cyclone = {
249 .name = "cyclone",
250 .mark_offset = mark_offset_cyclone,
251 .get_offset = get_offset_cyclone,
252 .monotonic_clock = monotonic_clock_cyclone,
253 .delay = delay_cyclone,
254};
255
256struct init_timer_opts __initdata timer_cyclone_init = {
257 .init = init_cyclone,
258 .opts = &timer_cyclone,
259};
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
new file mode 100644
index 000000000000..713134e71844
--- /dev/null
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -0,0 +1,191 @@
1/*
2 * This code largely moved from arch/i386/kernel/time.c.
3 * See comments there for proper credits.
4 */
5
6#include <linux/spinlock.h>
7#include <linux/init.h>
8#include <linux/timex.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/jiffies.h>
12
13#include <asm/timer.h>
14#include <asm/io.h>
15#include <asm/processor.h>
16
17#include "io_ports.h"
18#include "mach_timer.h"
19#include <asm/hpet.h>
20
21static unsigned long hpet_usec_quotient; /* convert hpet clks to usec */
22static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */
23static unsigned long hpet_last; /* hpet counter value at last tick*/
24static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
25static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
26static unsigned long long monotonic_base;
27static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
28
29/* convert from cycles(64bits) => nanoseconds (64bits)
30 * basic equation:
31 * ns = cycles / (freq / ns_per_sec)
32 * ns = cycles * (ns_per_sec / freq)
33 * ns = cycles * (10^9 / (cpu_mhz * 10^6))
34 * ns = cycles * (10^3 / cpu_mhz)
35 *
36 * Then we use scaling math (suggested by george@mvista.com) to get:
37 * ns = cycles * (10^3 * SC / cpu_mhz) / SC
38 * ns = cycles * cyc2ns_scale / SC
39 *
40 * And since SC is a constant power of two, we can convert the div
41 * into a shift.
42 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
43 */
44static unsigned long cyc2ns_scale;
45#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
46
47static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
48{
49 cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
50}
51
52static inline unsigned long long cycles_2_ns(unsigned long long cyc)
53{
54 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
55}
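
A quick numeric check of the scaled math above, for a hypothetical 2000 MHz CPU: cyc2ns_scale becomes (1000 << 10) / 2000 = 512, and 4,000,000 cycles (2 ms at 2 GHz) comes out as 2,000,000 ns:

#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10   /* SC = 2^10, as above */

int main(void)
{
	unsigned long cpu_mhz = 2000;   /* hypothetical */
	unsigned long cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR) / cpu_mhz;
	uint64_t cycles = 4000000;      /* 2 ms worth of cycles at 2 GHz */

	/* ns = cycles * (10^3 * SC / cpu_mhz) / SC */
	printf("%llu ns\n",
	       (unsigned long long)((cycles * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR));
	return 0;
}
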
56
57static unsigned long long monotonic_clock_hpet(void)
58{
59 unsigned long long last_offset, this_offset, base;
60 unsigned seq;
61
62 /* atomically read monotonic base & last_offset */
63 do {
64 seq = read_seqbegin(&monotonic_lock);
65 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
66 base = monotonic_base;
67 } while (read_seqretry(&monotonic_lock, seq));
68
69 /* Read the Time Stamp Counter */
70 rdtscll(this_offset);
71
72 /* return the value in ns */
73 return base + cycles_2_ns(this_offset - last_offset);
74}
75
76static unsigned long get_offset_hpet(void)
77{
78 register unsigned long eax, edx;
79
80 eax = hpet_readl(HPET_COUNTER);
81 eax -= hpet_last; /* hpet delta */
82
83 /*
84 * Time offset = (hpet delta) * ( usecs per HPET clock )
85 * = (hpet delta) * ( usecs per tick / HPET clocks per tick)
86 * = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
87 *
88 * Where,
89 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
90 *
91 * Using a mull instead of a divl saves some cycles in critical path.
92 */
93 ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax);
94
95 /* our adjusted time offset in microseconds */
96 return edx;
97}
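
The mull keeps only the high 32 bits of hpet_delta * hpet_usec_quotient, which is exactly the divide-by-2^32 in the formula above. A standalone sketch assuming a hypothetical 14.318 MHz HPET and HZ = 1000, so hpet_tick is about 14318 clocks and one tick is 1000 usec:

#include <stdint.h>
#include <stdio.h>

/* usec offset = (hpet delta * hpet_usec_quotient) >> 32 */
static uint32_t hpet_delta_to_usecs(uint32_t hpet_delta, uint32_t quotient)
{
	return (uint32_t)(((uint64_t)hpet_delta * quotient) >> 32);
}

int main(void)
{
	uint32_t hpet_tick = 14318;                            /* hypothetical */
	uint32_t quotient = (uint32_t)((1000ULL << 32) / hpet_tick);

	/* half a tick's worth of HPET clocks should be about 500 usec */
	printf("%u usec\n", hpet_delta_to_usecs(hpet_tick / 2, quotient));
	return 0;
}
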
98
99static void mark_offset_hpet(void)
100{
101 unsigned long long this_offset, last_offset;
102 unsigned long offset;
103
104 write_seqlock(&monotonic_lock);
105 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
106 rdtsc(last_tsc_low, last_tsc_high);
107
108 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
109 if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) {
110 int lost_ticks = (offset - hpet_last) / hpet_tick;
111 jiffies_64 += lost_ticks;
112 }
113 hpet_last = offset;
114
115 /* update the monotonic base value */
116 this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
117 monotonic_base += cycles_2_ns(this_offset - last_offset);
118 write_sequnlock(&monotonic_lock);
119}
120
121static void delay_hpet(unsigned long loops)
122{
123 unsigned long hpet_start, hpet_end;
124 unsigned long eax;
125
126 /* loops is the number of cpu cycles. Convert it to hpet clocks */
127 ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops);
128
129 hpet_start = hpet_readl(HPET_COUNTER);
130 do {
131 rep_nop();
132 hpet_end = hpet_readl(HPET_COUNTER);
133 } while ((hpet_end - hpet_start) < (loops));
134}
135
136static int __init init_hpet(char* override)
137{
138 unsigned long result, remain;
139
140 /* check clock override */
141 if (override[0] && strncmp(override,"hpet",4))
142 return -ENODEV;
143
144 if (!is_hpet_enabled())
145 return -ENODEV;
146
147 printk("Using HPET for gettimeofday\n");
148 if (cpu_has_tsc) {
149 unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient);
150 if (tsc_quotient) {
151	/* report CPU clock rate in kHz.
152	 * The formula is (10^3 * 2^32) / (2^32 * 1 / (clocks/us)) =
153	 * clocks/msec. Our precision is about 100 ppm.
154 */
155 { unsigned long eax=0, edx=1000;
156 ASM_DIV64_REG(cpu_khz, edx, tsc_quotient,
157 eax, edx);
158 printk("Detected %lu.%03lu MHz processor.\n",
159 cpu_khz / 1000, cpu_khz % 1000);
160 }
161 set_cyc2ns_scale(cpu_khz/1000);
162 }
163 }
164
165 /*
166 * Math to calculate hpet to usec multiplier
167 * Look for the comments at get_offset_hpet()
168 */
169 ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC);
170 if (remain > (hpet_tick >> 1))
171 result++; /* rounding the result */
172 hpet_usec_quotient = result;
173
174 return 0;
175}
176
177/************************************************************/
178
179/* hpet timer_opts struct */
180static struct timer_opts timer_hpet = {
181 .name = "hpet",
182 .mark_offset = mark_offset_hpet,
183 .get_offset = get_offset_hpet,
184 .monotonic_clock = monotonic_clock_hpet,
185 .delay = delay_hpet,
186};
187
188struct init_timer_opts __initdata timer_hpet_init = {
189 .init = init_hpet,
190 .opts = &timer_hpet,
191};
diff --git a/arch/i386/kernel/timers/timer_none.c b/arch/i386/kernel/timers/timer_none.c
new file mode 100644
index 000000000000..4ea2f414dbbd
--- /dev/null
+++ b/arch/i386/kernel/timers/timer_none.c
@@ -0,0 +1,39 @@
1#include <linux/init.h>
2#include <asm/timer.h>
3
4static void mark_offset_none(void)
5{
6 /* nothing needed */
7}
8
9static unsigned long get_offset_none(void)
10{
11 return 0;
12}
13
14static unsigned long long monotonic_clock_none(void)
15{
16 return 0;
17}
18
19static void delay_none(unsigned long loops)
20{
21 int d0;
22 __asm__ __volatile__(
23 "\tjmp 1f\n"
24 ".align 16\n"
25 "1:\tjmp 2f\n"
26 ".align 16\n"
27 "2:\tdecl %0\n\tjns 2b"
28 :"=&a" (d0)
29 :"0" (loops));
30}
31
32/* none timer_opts struct */
33struct timer_opts timer_none = {
34 .name = "none",
35 .mark_offset = mark_offset_none,
36 .get_offset = get_offset_none,
37 .monotonic_clock = monotonic_clock_none,
38 .delay = delay_none,
39};
diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c
new file mode 100644
index 000000000000..967d5453cd0e
--- /dev/null
+++ b/arch/i386/kernel/timers/timer_pit.c
@@ -0,0 +1,206 @@
1/*
2 * This code largely moved from arch/i386/kernel/time.c.
3 * See comments there for proper credits.
4 */
5
6#include <linux/spinlock.h>
7#include <linux/module.h>
8#include <linux/device.h>
9#include <linux/irq.h>
10#include <linux/sysdev.h>
11#include <linux/timex.h>
12#include <asm/delay.h>
13#include <asm/mpspec.h>
14#include <asm/timer.h>
15#include <asm/smp.h>
16#include <asm/io.h>
17#include <asm/arch_hooks.h>
18
19extern spinlock_t i8259A_lock;
20extern spinlock_t i8253_lock;
21#include "do_timer.h"
22#include "io_ports.h"
23
24static int count_p; /* counter in get_offset_pit() */
25
26static int __init init_pit(char* override)
27{
28 /* check clock override */
29 if (override[0] && strncmp(override,"pit",3))
30 printk(KERN_ERR "Warning: clock= override failed. Defaulting to PIT\n");
31
32 count_p = LATCH;
33 return 0;
34}
35
36static void mark_offset_pit(void)
37{
38 /* nothing needed */
39}
40
41static unsigned long long monotonic_clock_pit(void)
42{
43 return 0;
44}
45
46static void delay_pit(unsigned long loops)
47{
48 int d0;
49 __asm__ __volatile__(
50 "\tjmp 1f\n"
51 ".align 16\n"
52 "1:\tjmp 2f\n"
53 ".align 16\n"
54 "2:\tdecl %0\n\tjns 2b"
55 :"=&a" (d0)
56 :"0" (loops));
57}
58
59
60/* This function must be called with xtime_lock held.
61 * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs
62 *
63 * However, the pc-audio speaker driver changes the divisor so that
64 * it gets interrupted rather more often - it loads 64 into the
65 * counter rather than 11932! This has an adverse impact on
66 * do_gettimeoffset() -- it stops working! What is also not
67 * good is that the interval that our timer function gets called
68 * is no longer 10.0002 ms, but 9.9767 ms. To get around this
69 * would require using a different timing source. Maybe someone
70 * could use the RTC - I know that this can interrupt at frequencies
71 * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix
72 * it so that at startup, the timer code in sched.c would select
73 * using either the RTC or the 8253 timer. The decision would be
74 * based on whether there was any other device around that needed
75 * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz,
76 * and then do some jiggery to have a version of do_timer that
77 * advanced the clock by 1/1024 s. Every time that reached over 1/100
78 * of a second, then do all the old code. If the time was kept correct
79 * then do_gettimeoffset could just return 0 - there is no low order
80 * divider that can be accessed.
81 *
82 * Ideally, you would be able to use the RTC for the speaker driver,
83 * but it appears that the speaker driver really needs interrupt more
84 * often than every 120 us or so.
85 *
86 * Anyway, this needs more thought.... pjsg (1993-08-28)
87 *
88 * If you are really that interested, you should be reading
89 * comp.protocols.time.ntp!
90 */
91
92static unsigned long get_offset_pit(void)
93{
94 int count;
95 unsigned long flags;
96 static unsigned long jiffies_p = 0;
97
98 /*
99 * cache volatile jiffies temporarily; we have xtime_lock.
100 */
101 unsigned long jiffies_t;
102
103 spin_lock_irqsave(&i8253_lock, flags);
104 /* timer count may underflow right here */
105 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
106
107 count = inb_p(PIT_CH0); /* read the latched count */
108
109 /*
110 * We do this guaranteed double memory access instead of a _p
111 * postfix in the previous port access. Wheee, hackady hack
112 */
113 jiffies_t = jiffies;
114
115 count |= inb_p(PIT_CH0) << 8;
116
117 /* VIA686a test code... reset the latch if count > max + 1 */
118 if (count > LATCH) {
119 outb_p(0x34, PIT_MODE);
120 outb_p(LATCH & 0xff, PIT_CH0);
121 outb(LATCH >> 8, PIT_CH0);
122 count = LATCH - 1;
123 }
124
125 /*
126 * avoiding timer inconsistencies (they are rare, but they happen)...
127 * there are two kinds of problems that must be avoided here:
128 * 1. the timer counter underflows
129 * 2. hardware problem with the timer, not giving us continuous time,
130 * the counter does small "jumps" upwards on some Pentium systems,
131 * (see c't 95/10 page 335 for Neptun bug.)
132 */
133
134 if( jiffies_t == jiffies_p ) {
135 if( count > count_p ) {
136 /* the nutcase */
137 count = do_timer_overflow(count);
138 }
139 } else
140 jiffies_p = jiffies_t;
141
142 count_p = count;
143
144 spin_unlock_irqrestore(&i8253_lock, flags);
145
146 count = ((LATCH-1) - count) * TICK_SIZE;
147 count = (count + LATCH/2) / LATCH;
148
149 return count;
150}
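
The last two lines convert the latched PIT count into microseconds since the tick: (LATCH-1) - count is roughly how many PIT clocks have elapsed since the reload, multiplying by TICK_SIZE and dividing by LATCH rescales that to microseconds, and the + LATCH/2 rounds to nearest. A small sketch with hypothetical HZ = 100 values:

#include <stdio.h>

#define LATCH     11932    /* hypothetical: ~1.19318 MHz PIT clock, HZ = 100 */
#define TICK_SIZE 10000    /* usecs per tick at HZ = 100 */

static unsigned long pit_count_to_usecs(int count)
{
	unsigned long usecs = ((LATCH - 1) - count) * (unsigned long)TICK_SIZE;

	return (usecs + LATCH / 2) / LATCH;   /* round to nearest usec */
}

int main(void)
{
	/* a count half way down the reload value is about half a tick */
	printf("%lu usec\n", pit_count_to_usecs(LATCH / 2));
	return 0;
}
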
151
152
153/* pit timer_opts struct */
154struct timer_opts timer_pit = {
155 .name = "pit",
156 .mark_offset = mark_offset_pit,
157 .get_offset = get_offset_pit,
158 .monotonic_clock = monotonic_clock_pit,
159 .delay = delay_pit,
160};
161
162struct init_timer_opts __initdata timer_pit_init = {
163 .init = init_pit,
164 .opts = &timer_pit,
165};
166
167void setup_pit_timer(void)
168{
169 extern spinlock_t i8253_lock;
170 unsigned long flags;
171
172 spin_lock_irqsave(&i8253_lock, flags);
173 outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
174 udelay(10);
175 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
176 udelay(10);
177 outb(LATCH >> 8 , PIT_CH0); /* MSB */
178 spin_unlock_irqrestore(&i8253_lock, flags);
179}
180
181static int timer_resume(struct sys_device *dev)
182{
183 setup_pit_timer();
184 return 0;
185}
186
187static struct sysdev_class timer_sysclass = {
188 set_kset_name("timer_pit"),
189 .resume = timer_resume,
190};
191
192static struct sys_device device_timer = {
193 .id = 0,
194 .cls = &timer_sysclass,
195};
196
197static int __init init_timer_sysfs(void)
198{
199 int error = sysdev_class_register(&timer_sysclass);
200 if (!error)
201 error = sysdev_register(&device_timer);
202 return error;
203}
204
205device_initcall(init_timer_sysfs);
206
diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c
new file mode 100644
index 000000000000..d77f22030fe6
--- /dev/null
+++ b/arch/i386/kernel/timers/timer_pm.c
@@ -0,0 +1,258 @@
1/*
2 * (C) Dominik Brodowski <linux@brodo.de> 2003
3 *
4 * Driver to use the Power Management Timer (PMTMR) available in some
5 * southbridges as primary timing source for the Linux kernel.
6 *
7 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
8 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
9 *
10 * This file is licensed under the GPL v2.
11 */
12
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/device.h>
17#include <linux/init.h>
18#include <asm/types.h>
19#include <asm/timer.h>
20#include <asm/smp.h>
21#include <asm/io.h>
22#include <asm/arch_hooks.h>
23
24#include <linux/timex.h>
25#include "mach_timer.h"
26
27/* Number of PMTMR ticks expected during calibration run */
28#define PMTMR_TICKS_PER_SEC 3579545
29#define PMTMR_EXPECTED_RATE \
30 ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
31
32
33/* The I/O port the PMTMR resides at.
34 * The location is detected during setup_arch(),
35 * in arch/i386/acpi/boot.c */
36u32 pmtmr_ioport = 0;
37
38
39/* value of the Power timer at last timer interrupt */
40static u32 offset_tick;
41static u32 offset_delay;
42
43static unsigned long long monotonic_base;
44static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
45
46#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
47
48/* helper function to safely read the ACPI PM timesource */
49static inline u32 read_pmtmr(void)
50{
51 u32 v1=0,v2=0,v3=0;
52	/* It has been reported that on various broken chipsets
53	 * (ICH4, PIIX4 and PIIX4E) the ACPI PM time source is not
54	 * latched, so you must read it multiple times to ensure
55	 * a safe value is read.
56 */
57 do {
58 v1 = inl(pmtmr_ioport);
59 v2 = inl(pmtmr_ioport);
60 v3 = inl(pmtmr_ioport);
61 } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
62 || (v3 > v1 && v3 < v2));
63
64 /* mask the output to 24 bits */
65 return v2 & ACPI_PM_MASK;
66}
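
On those chipsets a single inl() can return a value caught mid-update, so the loop only trusts the middle of three reads once it lies between its neighbours in counting order. The same scheme expressed against an arbitrary counter-read callback; the fake counter exists only to make the sketch runnable:

#include <stdint.h>
#include <stdio.h>

#define ACPI_PM_MASK 0xFFFFFF   /* the PM timer is 24 bits wide */

static uint32_t read_unlatched(uint32_t (*read_once)(void))
{
	uint32_t v1, v2, v3;

	do {
		v1 = read_once();
		v2 = read_once();
		v3 = read_once();
	} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
		 || (v3 > v1 && v3 < v2));

	return v2 & ACPI_PM_MASK;
}

static uint32_t fake_counter;
static uint32_t fake_read(void) { return fake_counter++; }

int main(void)
{
	printf("0x%06x\n", read_unlatched(fake_read));
	return 0;
}
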
67
68
69/*
70 * Some boards have the PMTMR running way too fast. We check
71 * the PMTMR rate against PIT channel 2 to catch these cases.
72 */
73static int verify_pmtmr_rate(void)
74{
75 u32 value1, value2;
76 unsigned long count, delta;
77
78 mach_prepare_counter();
79 value1 = read_pmtmr();
80 mach_countup(&count);
81 value2 = read_pmtmr();
82 delta = (value2 - value1) & ACPI_PM_MASK;
83
84 /* Check that the PMTMR delta is within 5% of what we expect */
85 if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
86 delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
87 printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE);
88 return -1;
89 }
90
91 return 0;
92}
93
94
95static int init_pmtmr(char* override)
96{
97 u32 value1, value2;
98 unsigned int i;
99
100 if (override[0] && strncmp(override,"pmtmr",5))
101 return -ENODEV;
102
103 if (!pmtmr_ioport)
104 return -ENODEV;
105
106 /* we use the TSC for delay_pmtmr, so make sure it exists */
107 if (!cpu_has_tsc)
108 return -ENODEV;
109
110 /* "verify" this timing source */
111 value1 = read_pmtmr();
112 for (i = 0; i < 10000; i++) {
113 value2 = read_pmtmr();
114 if (value2 == value1)
115 continue;
116 if (value2 > value1)
117 goto pm_good;
118 if ((value2 < value1) && ((value2) < 0xFFF))
119 goto pm_good;
120		printk(KERN_INFO "PM-Timer had inconsistent results: 0x%x, 0x%x - aborting.\n", value1, value2);
121 return -EINVAL;
122 }
123	printk(KERN_INFO "PM-Timer had no reasonable result: 0x%x - aborting.\n", value1);
124 return -ENODEV;
125
126pm_good:
127 if (verify_pmtmr_rate() != 0)
128 return -ENODEV;
129
130 init_cpu_khz();
131 return 0;
132}
133
134static inline u32 cyc2us(u32 cycles)
135{
136 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
137 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
138 *
139 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
140 * easily be multiplied with 286 (=0x11E) without having to fear
141 * u32 overflows.
142 */
143 cycles *= 286;
144 return (cycles >> 10);
145}
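
A quick check of the 286/1024 approximation against the exact 1/3.579545 conversion, at the largest delta the comment mentions:

#include <stdio.h>

int main(void)
{
	unsigned int ticks = 35796;                  /* max delta at HZ = 100 */
	unsigned int approx = (ticks * 286) >> 10;   /* what cyc2us() computes */
	double exact = ticks / 3.579545;

	printf("approx = %u usec, exact = %.1f usec\n", approx, exact);
	return 0;
}
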
146
147/*
148 * this gets called during each timer interrupt
149 * - Called while holding the writer xtime_lock
150 */
151static void mark_offset_pmtmr(void)
152{
153 u32 lost, delta, last_offset;
154 static int first_run = 1;
155 last_offset = offset_tick;
156
157 write_seqlock(&monotonic_lock);
158
159 offset_tick = read_pmtmr();
160
161 /* calculate tick interval */
162 delta = (offset_tick - last_offset) & ACPI_PM_MASK;
163
164 /* convert to usecs */
165 delta = cyc2us(delta);
166
167 /* update the monotonic base value */
168 monotonic_base += delta * NSEC_PER_USEC;
169 write_sequnlock(&monotonic_lock);
170
171 /* convert to ticks */
172 delta += offset_delay;
173 lost = delta / (USEC_PER_SEC / HZ);
174 offset_delay = delta % (USEC_PER_SEC / HZ);
175
176
177 /* compensate for lost ticks */
178 if (lost >= 2)
179 jiffies_64 += lost - 1;
180
181 /* don't calculate delay for first run,
182	   or if we've got less than a tick */
183 if (first_run || (lost < 1)) {
184 first_run = 0;
185 offset_delay = 0;
186 }
187}
188
189
190static unsigned long long monotonic_clock_pmtmr(void)
191{
192 u32 last_offset, this_offset;
193 unsigned long long base, ret;
194 unsigned seq;
195
196
197 /* atomically read monotonic base & last_offset */
198 do {
199 seq = read_seqbegin(&monotonic_lock);
200 last_offset = offset_tick;
201 base = monotonic_base;
202 } while (read_seqretry(&monotonic_lock, seq));
203
204 /* Read the pmtmr */
205 this_offset = read_pmtmr();
206
207 /* convert to nanoseconds */
208 ret = (this_offset - last_offset) & ACPI_PM_MASK;
209 ret = base + (cyc2us(ret) * NSEC_PER_USEC);
210 return ret;
211}
212
213static void delay_pmtmr(unsigned long loops)
214{
215 unsigned long bclock, now;
216
217 rdtscl(bclock);
218 do
219 {
220 rep_nop();
221 rdtscl(now);
222 } while ((now-bclock) < loops);
223}
224
225
226/*
227 * get the offset (in microseconds) from the last call to mark_offset()
228 * - Called holding a reader xtime_lock
229 */
230static unsigned long get_offset_pmtmr(void)
231{
232 u32 now, offset, delta = 0;
233
234 offset = offset_tick;
235 now = read_pmtmr();
236 delta = (now - offset)&ACPI_PM_MASK;
237
238 return (unsigned long) offset_delay + cyc2us(delta);
239}
240
241
242/* acpi timer_opts struct */
243static struct timer_opts timer_pmtmr = {
244 .name = "pmtmr",
245 .mark_offset = mark_offset_pmtmr,
246 .get_offset = get_offset_pmtmr,
247 .monotonic_clock = monotonic_clock_pmtmr,
248 .delay = delay_pmtmr,
249};
250
251struct init_timer_opts __initdata timer_pmtmr_init = {
252 .init = init_pmtmr,
253 .opts = &timer_pmtmr,
254};
255
256MODULE_LICENSE("GPL");
257MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
258MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86");
diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
new file mode 100644
index 000000000000..a685994e5c8e
--- /dev/null
+++ b/arch/i386/kernel/timers/timer_tsc.c
@@ -0,0 +1,560 @@
1/*
2 * This code largely moved from arch/i386/kernel/time.c.
3 * See comments there for proper credits.
4 *
5 * 2004-06-25 Jesper Juhl
6 * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
7 * failing to inline.
8 */
9
10#include <linux/spinlock.h>
11#include <linux/init.h>
12#include <linux/timex.h>
13#include <linux/errno.h>
14#include <linux/cpufreq.h>
15#include <linux/string.h>
16#include <linux/jiffies.h>
17
18#include <asm/timer.h>
19#include <asm/io.h>
20/* processor.h for the tsc_disable flag */
21#include <asm/processor.h>
22
23#include "io_ports.h"
24#include "mach_timer.h"
25
26#include <asm/hpet.h>
27
28#ifdef CONFIG_HPET_TIMER
29static unsigned long hpet_usec_quotient;
30static unsigned long hpet_last;
31static struct timer_opts timer_tsc;
32#endif
33
34static inline void cpufreq_delayed_get(void);
35
36int tsc_disable __initdata = 0;
37
38extern spinlock_t i8253_lock;
39
40static int use_tsc;
41/* Number of usecs that the last interrupt was delayed */
42static int delay_at_last_interrupt;
43
44static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
45static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
46static unsigned long long monotonic_base;
47static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
48
49/* convert from cycles(64bits) => nanoseconds (64bits)
50 * basic equation:
51 * ns = cycles / (freq / ns_per_sec)
52 * ns = cycles * (ns_per_sec / freq)
53 * ns = cycles * (10^9 / (cpu_mhz * 10^6))
54 * ns = cycles * (10^3 / cpu_mhz)
55 *
56 * Then we use scaling math (suggested by george@mvista.com) to get:
57 * ns = cycles * (10^3 * SC / cpu_mhz) / SC
58 * ns = cycles * cyc2ns_scale / SC
59 *
60 * And since SC is a constant power of two, we can convert the div
61 * into a shift.
62 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
63 */
64static unsigned long cyc2ns_scale;
65#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
66
67static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
68{
69 cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
70}
71
72static inline unsigned long long cycles_2_ns(unsigned long long cyc)
73{
74 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
75}
76
77static int count2; /* counter for mark_offset_tsc() */
78
79/* Cached *multiplier* to convert TSC counts to microseconds.
80 * (see the equation below).
81 * Equal to 2^32 * (1 / (clocks per usec) ).
82 * Initialized in time_init.
83 */
84static unsigned long fast_gettimeoffset_quotient;
85
86static unsigned long get_offset_tsc(void)
87{
88 register unsigned long eax, edx;
89
90 /* Read the Time Stamp Counter */
91
92 rdtsc(eax,edx);
93
94 /* .. relative to previous jiffy (32 bits is enough) */
95 eax -= last_tsc_low; /* tsc_low delta */
96
97 /*
98 * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
99 * = (tsc_low delta) * (usecs_per_clock)
100 * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
101 *
102 * Using a mull instead of a divl saves up to 31 clock cycles
103 * in the critical path.
104 */
105
106 __asm__("mull %2"
107 :"=a" (eax), "=d" (edx)
108 :"rm" (fast_gettimeoffset_quotient),
109 "0" (eax));
110
111 /* our adjusted time offset in microseconds */
112 return delay_at_last_interrupt + edx;
113}
114
115static unsigned long long monotonic_clock_tsc(void)
116{
117 unsigned long long last_offset, this_offset, base;
118 unsigned seq;
119
120 /* atomically read monotonic base & last_offset */
121 do {
122 seq = read_seqbegin(&monotonic_lock);
123 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
124 base = monotonic_base;
125 } while (read_seqretry(&monotonic_lock, seq));
126
127 /* Read the Time Stamp Counter */
128 rdtscll(this_offset);
129
130 /* return the value in ns */
131 return base + cycles_2_ns(this_offset - last_offset);
132}
133
134/*
135 * Scheduler clock - returns current time in nanosec units.
136 */
137unsigned long long sched_clock(void)
138{
139 unsigned long long this_offset;
140
141 /*
142	 * In the NUMA case we don't use the TSC as they are not
143 * synchronized across all CPUs.
144 */
145#ifndef CONFIG_NUMA
146 if (!use_tsc)
147#endif
148 /* no locking but a rare wrong value is not a big deal */
149 return jiffies_64 * (1000000000 / HZ);
150
151 /* Read the Time Stamp Counter */
152 rdtscll(this_offset);
153
154 /* return the value in ns */
155 return cycles_2_ns(this_offset);
156}
157
158static void delay_tsc(unsigned long loops)
159{
160 unsigned long bclock, now;
161
162 rdtscl(bclock);
163 do
164 {
165 rep_nop();
166 rdtscl(now);
167 } while ((now-bclock) < loops);
168}
169
170#ifdef CONFIG_HPET_TIMER
171static void mark_offset_tsc_hpet(void)
172{
173 unsigned long long this_offset, last_offset;
174 unsigned long offset, temp, hpet_current;
175
176 write_seqlock(&monotonic_lock);
177 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
178 /*
179 * It is important that these two operations happen almost at
180 * the same time. We do the RDTSC stuff first, since it's
181 * faster. To avoid any inconsistencies, we need interrupts
182 * disabled locally.
183 */
184 /*
185 * Interrupts are just disabled locally since the timer irq
186 * has the SA_INTERRUPT flag set. -arca
187 */
188 /* read Pentium cycle counter */
189
190 hpet_current = hpet_readl(HPET_COUNTER);
191 rdtsc(last_tsc_low, last_tsc_high);
192
193 /* lost tick compensation */
194 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
195 if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) {
196 int lost_ticks = (offset - hpet_last) / hpet_tick;
197 jiffies_64 += lost_ticks;
198 }
199 hpet_last = hpet_current;
200
201 /* update the monotonic base value */
202 this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
203 monotonic_base += cycles_2_ns(this_offset - last_offset);
204 write_sequnlock(&monotonic_lock);
205
206 /* calculate delay_at_last_interrupt */
207 /*
208 * Time offset = (hpet delta) * ( usecs per HPET clock )
209 * = (hpet delta) * ( usecs per tick / HPET clocks per tick)
210 * = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
211 * Where,
212 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
213 */
214 delay_at_last_interrupt = hpet_current - offset;
215 ASM_MUL64_REG(temp, delay_at_last_interrupt,
216 hpet_usec_quotient, delay_at_last_interrupt);
217}
218#endif
219
220
221#ifdef CONFIG_CPU_FREQ
222#include <linux/workqueue.h>
223
224static unsigned int cpufreq_delayed_issched = 0;
225static unsigned int cpufreq_init = 0;
226static struct work_struct cpufreq_delayed_get_work;
227
228static void handle_cpufreq_delayed_get(void *v)
229{
230 unsigned int cpu;
231 for_each_online_cpu(cpu) {
232 cpufreq_get(cpu);
233 }
234 cpufreq_delayed_issched = 0;
235}
236
237/* If we notice lost ticks, schedule a call to cpufreq_get(); it verifies
238 * that the CPU frequency the timing core thinks the CPU is running at
239 * is still correct.
240 */
241static inline void cpufreq_delayed_get(void)
242{
243 if (cpufreq_init && !cpufreq_delayed_issched) {
244 cpufreq_delayed_issched = 1;
245 printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
246 schedule_work(&cpufreq_delayed_get_work);
247 }
248}
249
250/* If the CPU frequency is scaled, TSC-based delays will need a different
251 * loops_per_jiffy value to function properly.
252 */
253
254static unsigned int ref_freq = 0;
255static unsigned long loops_per_jiffy_ref = 0;
256
257#ifndef CONFIG_SMP
258static unsigned long fast_gettimeoffset_ref = 0;
259static unsigned long cpu_khz_ref = 0;
260#endif
261
262static int
263time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
264 void *data)
265{
266 struct cpufreq_freqs *freq = data;
267
268 if (val != CPUFREQ_RESUMECHANGE)
269 write_seqlock_irq(&xtime_lock);
270 if (!ref_freq) {
271 ref_freq = freq->old;
272 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
273#ifndef CONFIG_SMP
274 fast_gettimeoffset_ref = fast_gettimeoffset_quotient;
275 cpu_khz_ref = cpu_khz;
276#endif
277 }
278
279 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
280 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
281 (val == CPUFREQ_RESUMECHANGE)) {
282 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
283 cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
284#ifndef CONFIG_SMP
285 if (cpu_khz)
286 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
287 if (use_tsc) {
288 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
289 fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq);
290 set_cyc2ns_scale(cpu_khz/1000);
291 }
292 }
293#endif
294 }
295
296 if (val != CPUFREQ_RESUMECHANGE)
297 write_sequnlock_irq(&xtime_lock);
298
299 return 0;
300}
301
302static struct notifier_block time_cpufreq_notifier_block = {
303 .notifier_call = time_cpufreq_notifier
304};
305
306
307static int __init cpufreq_tsc(void)
308{
309 int ret;
310 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
311 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
312 CPUFREQ_TRANSITION_NOTIFIER);
313 if (!ret)
314 cpufreq_init = 1;
315 return ret;
316}
317core_initcall(cpufreq_tsc);
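
When the notifier fires, constants proportional to the CPU clock (loops_per_jiffy) are scaled up with the frequency, while fast_gettimeoffset_quotient, which is 2^32 / (clocks per usec), is scaled the other way. A sketch of the two scalings, using a hypothetical old * mult / div helper standing in for the kernel's scaling routine and made-up frequencies:

#include <stdio.h>

/* hypothetical stand-in for the kernel's scaling helper: old * mult / div */
static unsigned long scale(unsigned long old, unsigned long div, unsigned long mult)
{
	return (unsigned long)((unsigned long long)old * mult / div);
}

int main(void)
{
	unsigned long ref_khz = 2000000, new_khz = 1000000;   /* hypothetical */
	unsigned long lpj_ref = 4000000, quot_ref = 2147;

	/* loops_per_jiffy follows the CPU clock ...                        */
	printf("loops_per_jiffy = %lu\n", scale(lpj_ref, ref_khz, new_khz));
	/* ... the gettimeoffset quotient is inversely proportional to it. */
	printf("quotient        = %lu\n", scale(quot_ref, new_khz, ref_khz));
	return 0;
}
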
318
319#else /* CONFIG_CPU_FREQ */
320static inline void cpufreq_delayed_get(void) { return; }
321#endif
322
323static void mark_offset_tsc(void)
324{
325 unsigned long lost,delay;
326 unsigned long delta = last_tsc_low;
327 int count;
328 int countmp;
329 static int count1 = 0;
330 unsigned long long this_offset, last_offset;
331 static int lost_count = 0;
332
333 write_seqlock(&monotonic_lock);
334 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
335 /*
336 * It is important that these two operations happen almost at
337 * the same time. We do the RDTSC stuff first, since it's
338 * faster. To avoid any inconsistencies, we need interrupts
339 * disabled locally.
340 */
341
342 /*
343 * Interrupts are just disabled locally since the timer irq
344 * has the SA_INTERRUPT flag set. -arca
345 */
346
347 /* read Pentium cycle counter */
348
349 rdtsc(last_tsc_low, last_tsc_high);
350
351 spin_lock(&i8253_lock);
352 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
353
354 count = inb_p(PIT_CH0); /* read the latched count */
355 count |= inb(PIT_CH0) << 8;
356
357 /*
358 * VIA686a test code... reset the latch if count > max + 1
359 * from timer_pit.c - cjb
360 */
361 if (count > LATCH) {
362 outb_p(0x34, PIT_MODE);
363 outb_p(LATCH & 0xff, PIT_CH0);
364 outb(LATCH >> 8, PIT_CH0);
365 count = LATCH - 1;
366 }
367
368 spin_unlock(&i8253_lock);
369
370 if (pit_latch_buggy) {
371		/* get the median of the last 3 timer latch values */
372 if ((count2 >= count && count >= count1)
373 || (count1 >= count && count >= count2)) {
374 count2 = count1; count1 = count;
375 } else if ((count1 >= count2 && count2 >= count)
376 || (count >= count2 && count2 >= count1)) {
377 countmp = count;count = count2;
378 count2 = count1;count1 = countmp;
379 } else {
380 count2 = count1; count1 = count; count = count1;
381 }
382 }
383
384 /* lost tick compensation */
385 delta = last_tsc_low - delta;
386 {
387 register unsigned long eax, edx;
388 eax = delta;
389 __asm__("mull %2"
390 :"=a" (eax), "=d" (edx)
391 :"rm" (fast_gettimeoffset_quotient),
392 "0" (eax));
393 delta = edx;
394 }
395 delta += delay_at_last_interrupt;
396 lost = delta/(1000000/HZ);
397 delay = delta%(1000000/HZ);
398 if (lost >= 2) {
399 jiffies_64 += lost-1;
400
401 /* sanity check to ensure we're not always losing ticks */
402 if (lost_count++ > 100) {
403 printk(KERN_WARNING "Losing too many ticks!\n");
404 printk(KERN_WARNING "TSC cannot be used as a timesource. \n");
405 printk(KERN_WARNING "Possible reasons for this are:\n");
406 printk(KERN_WARNING " You're running with Speedstep,\n");
407 printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n");
408 printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n");
409 printk(KERN_WARNING "Falling back to a sane timesource now.\n");
410
411 clock_fallback();
412 }
413 /* ... but give the TSC a fair chance */
414 if (lost_count > 25)
415 cpufreq_delayed_get();
416 } else
417 lost_count = 0;
418 /* update the monotonic base value */
419 this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
420 monotonic_base += cycles_2_ns(this_offset - last_offset);
421 write_sequnlock(&monotonic_lock);
422
423 /* calculate delay_at_last_interrupt */
424 count = ((LATCH-1) - count) * TICK_SIZE;
425 delay_at_last_interrupt = (count + LATCH/2) / LATCH;
426
427	/* catch corner case where tick rollover occurred
428 * between tsc and pit reads (as noted when
429 * usec delta is > 90% # of usecs/tick)
430 */
431 if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
432 jiffies_64++;
433}
434
435static int __init init_tsc(char* override)
436{
437
438 /* check clock override */
439 if (override[0] && strncmp(override,"tsc",3)) {
440#ifdef CONFIG_HPET_TIMER
441 if (is_hpet_enabled()) {
442 printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n");
443 } else
444#endif
445 {
446 return -ENODEV;
447 }
448 }
449
450 /*
451 * If we have APM enabled or the CPU clock speed is variable
452 * (CPU stops clock on HLT or slows clock to save power)
453 * then the TSC timestamps may diverge by up to 1 jiffy from
454 * 'real time' but nothing will break.
455 * The most frequent case is that the CPU is "woken" from a halt
456 * state by the timer interrupt itself, so we get 0 error. In the
457 * rare cases where a driver would "wake" the CPU and request a
458 * timestamp, the maximum error is < 1 jiffy. But timestamps are
459 * still perfectly ordered.
460 * Note that the TSC counter will be reset if APM suspends
461 * to disk; this won't break the kernel, though, 'cuz we're
462 * smart. See arch/i386/kernel/apm.c.
463 */
464 /*
465 * Firstly we have to do a CPU check for chips with
466 * a potentially buggy TSC. At this point we haven't run
467 * the ident/bugs checks so we must run this hook as it
468 * may turn off the TSC flag.
469 *
470 * NOTE: this doesn't yet handle SMP 486 machines where only
471	 * some CPUs have a TSC. That's never worked and nobody has
472 * moaned if you have the only one in the world - you fix it!
473 */
474
475 count2 = LATCH; /* initialize counter for mark_offset_tsc() */
476
477 if (cpu_has_tsc) {
478 unsigned long tsc_quotient;
479#ifdef CONFIG_HPET_TIMER
480 if (is_hpet_enabled()){
481 unsigned long result, remain;
482 printk("Using TSC for gettimeofday\n");
483 tsc_quotient = calibrate_tsc_hpet(NULL);
484 timer_tsc.mark_offset = &mark_offset_tsc_hpet;
485 /*
486 * Math to calculate hpet to usec multiplier
487			 * Look for the comments at mark_offset_tsc_hpet()
488 */
489 ASM_DIV64_REG(result, remain, hpet_tick,
490 0, KERNEL_TICK_USEC);
491 if (remain > (hpet_tick >> 1))
492 result++; /* rounding the result */
493
494 hpet_usec_quotient = result;
495 } else
496#endif
497 {
498 tsc_quotient = calibrate_tsc();
499 }
500
501 if (tsc_quotient) {
502 fast_gettimeoffset_quotient = tsc_quotient;
503 use_tsc = 1;
504 /*
505 * We could be more selective here I suspect
506 * and just enable this for the next intel chips ?
507 */
508			/* report CPU clock rate in kHz.
509			 * The formula is (10^3 * 2^32) / (2^32 * 1 / (clocks/us)) =
510			 * clocks/msec. Our precision is about 100 ppm.
511 */
512 { unsigned long eax=0, edx=1000;
513 __asm__("divl %2"
514 :"=a" (cpu_khz), "=d" (edx)
515 :"r" (tsc_quotient),
516 "0" (eax), "1" (edx));
517 printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000);
518 }
519 set_cyc2ns_scale(cpu_khz/1000);
520 return 0;
521 }
522 }
523 return -ENODEV;
524}
525
526#ifndef CONFIG_X86_TSC
527/* disable flag for tsc. Takes effect by clearing the TSC cpu flag
528 * in cpu/common.c */
529static int __init tsc_setup(char *str)
530{
531 tsc_disable = 1;
532 return 1;
533}
534#else
535static int __init tsc_setup(char *str)
536{
537 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
538 "cannot disable TSC.\n");
539 return 1;
540}
541#endif
542__setup("notsc", tsc_setup);
543
544
545
546/************************************************************/
547
548/* tsc timer_opts struct */
549static struct timer_opts timer_tsc = {
550 .name = "tsc",
551 .mark_offset = mark_offset_tsc,
552 .get_offset = get_offset_tsc,
553 .monotonic_clock = monotonic_clock_tsc,
554 .delay = delay_tsc,
555};
556
557struct init_timer_opts __initdata timer_tsc_init = {
558 .init = init_tsc,
559 .opts = &timer_tsc,
560};
diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S
new file mode 100644
index 000000000000..fcce0e61b0e7
--- /dev/null
+++ b/arch/i386/kernel/trampoline.S
@@ -0,0 +1,80 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * This is only used for booting secondary CPUs in an SMP machine
8 *
9 * Entry: CS:IP point to the start of our code; we are
10 * in real mode with no stack. The rest of the trampoline
11 * page could be used to make a stack, but everything else
12 * about our environment is a mystery.
13 *
14 * In fact we don't actually need a stack so we don't
15 * set one up.
16 *
17 * We jump into the boot/compressed/head.S code. So you'd
18 * better be running a compressed kernel image or you
19 * won't get very far.
20 *
21 * On entry to trampoline_data, the processor is in real mode
22 * with 16-bit addressing and 16-bit data. CS has some value
23 * and IP is zero. Thus, data addresses need to be absolute
24 * (no relocation) and are taken with regard to r_base.
25 *
26 * If you work on this file, check the object module with
27 * objdump --reloc to make sure there are no relocation
28 * entries except for:
29 *
30 * TYPE VALUE
31 * R_386_32 startup_32_smp
32 * R_386_32 boot_gdt_table
33 */
34
35#include <linux/linkage.h>
36#include <asm/segment.h>
37#include <asm/page.h>
38
39.data
40
41.code16
42
43ENTRY(trampoline_data)
44r_base = .
45	wbinvd			# Needed for NUMA-Q; should be harmless for others
46 mov %cs, %ax # Code and data in the same place
47 mov %ax, %ds
48
49 cli # We should be safe anyway
50
51 movl $0xA5A5A5A5, trampoline_data - r_base
52				# write marker so the master knows we're running
53
54	/* With the GDT tables in a non-default location the kernel can be
55	 * beyond 16MB, and lgdt cannot load the address since the default
56	 * operand size in real mode is 16 bits. Use lgdtl instead to force
57	 * the operand size to 32 bits.
58 */
59
60 lidtl boot_idt - r_base # load idt with 0, 0
61 lgdtl boot_gdt - r_base # load gdt with whatever is appropriate
62
63 xor %ax, %ax
64 inc %ax # protected mode (PE) bit
65 lmsw %ax # into protected mode
66 # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
67 ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
68
69 # These need to be in the same 64K segment as the above;
70 # hence we don't use the boot_gdt_descr defined in head.S
71boot_gdt:
72 .word __BOOT_DS + 7 # gdt limit
73 .long boot_gdt_table-__PAGE_OFFSET # gdt base
74
75boot_idt:
76 .word 0 # idt limit = 0
77 .long 0 # idt base = 0L
78
79.globl trampoline_end
80trampoline_end:
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
new file mode 100644
index 000000000000..6c0e383915b6
--- /dev/null
+++ b/arch/i386/kernel/traps.c
@@ -0,0 +1,1084 @@
1/*
2 * linux/arch/i386/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * 'Traps.c' handles hardware traps and faults after we have saved some
12 * state in 'asm.s'.
13 */
14#include <linux/config.h>
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/timer.h>
20#include <linux/mm.h>
21#include <linux/init.h>
22#include <linux/delay.h>
23#include <linux/spinlock.h>
24#include <linux/interrupt.h>
25#include <linux/highmem.h>
26#include <linux/kallsyms.h>
27#include <linux/ptrace.h>
28#include <linux/utsname.h>
29#include <linux/kprobes.h>
30
31#ifdef CONFIG_EISA
32#include <linux/ioport.h>
33#include <linux/eisa.h>
34#endif
35
36#ifdef CONFIG_MCA
37#include <linux/mca.h>
38#endif
39
40#include <asm/processor.h>
41#include <asm/system.h>
42#include <asm/uaccess.h>
43#include <asm/io.h>
44#include <asm/atomic.h>
45#include <asm/debugreg.h>
46#include <asm/desc.h>
47#include <asm/i387.h>
48#include <asm/nmi.h>
49
50#include <asm/smp.h>
51#include <asm/arch_hooks.h>
52#include <asm/kdebug.h>
53
54#include <linux/irq.h>
55#include <linux/module.h>
56
57#include "mach_traps.h"
58
59asmlinkage int system_call(void);
60
61struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
62 { 0, 0 }, { 0, 0 } };
63
64/* Do we ignore FPU interrupts ? */
65char ignore_fpu_irq = 0;
66
67/*
68 * The IDT has to be page-aligned to simplify the Pentium
69 * F0 0F bug workaround.. We have a special link segment
70 * for this.
71 */
72struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
73
74asmlinkage void divide_error(void);
75asmlinkage void debug(void);
76asmlinkage void nmi(void);
77asmlinkage void int3(void);
78asmlinkage void overflow(void);
79asmlinkage void bounds(void);
80asmlinkage void invalid_op(void);
81asmlinkage void device_not_available(void);
82asmlinkage void coprocessor_segment_overrun(void);
83asmlinkage void invalid_TSS(void);
84asmlinkage void segment_not_present(void);
85asmlinkage void stack_segment(void);
86asmlinkage void general_protection(void);
87asmlinkage void page_fault(void);
88asmlinkage void coprocessor_error(void);
89asmlinkage void simd_coprocessor_error(void);
90asmlinkage void alignment_check(void);
91asmlinkage void spurious_interrupt_bug(void);
92asmlinkage void machine_check(void);
93
94static int kstack_depth_to_print = 24;
95struct notifier_block *i386die_chain;
96static DEFINE_SPINLOCK(die_notifier_lock);
97
98int register_die_notifier(struct notifier_block *nb)
99{
100 int err = 0;
101 unsigned long flags;
102 spin_lock_irqsave(&die_notifier_lock, flags);
103 err = notifier_chain_register(&i386die_chain, nb);
104 spin_unlock_irqrestore(&die_notifier_lock, flags);
105 return err;
106}
107
108static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
109{
110 return p > (void *)tinfo &&
111 p < (void *)tinfo + THREAD_SIZE - 3;
112}
113
114static inline unsigned long print_context_stack(struct thread_info *tinfo,
115 unsigned long *stack, unsigned long ebp)
116{
117 unsigned long addr;
118
119#ifdef CONFIG_FRAME_POINTER
120 while (valid_stack_ptr(tinfo, (void *)ebp)) {
121 addr = *(unsigned long *)(ebp + 4);
122 printk(" [<%08lx>] ", addr);
123 print_symbol("%s", addr);
124 printk("\n");
125 ebp = *(unsigned long *)ebp;
126 }
127#else
128 while (valid_stack_ptr(tinfo, stack)) {
129 addr = *stack++;
130 if (__kernel_text_address(addr)) {
131 printk(" [<%08lx>]", addr);
132 print_symbol(" %s", addr);
133 printk("\n");
134 }
135 }
136#endif
137 return ebp;
138}
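With CONFIG_FRAME_POINTER, the walk above treats %ebp as the head of a singly linked list of call frames: the saved caller %ebp sits at *(ebp) and the return address at *(ebp + 4). A minimal sketch of the layout it assumes (the struct and field names here are illustrative, not part of the patch):

	struct stack_frame {
		struct stack_frame *next_ebp;	/* saved %ebp of the caller: *(unsigned long *)ebp        */
		unsigned long return_address;	/* address printed by the walk: *(unsigned long *)(ebp+4) */
	};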
139
140void show_trace(struct task_struct *task, unsigned long * stack)
141{
142 unsigned long ebp;
143
144 if (!task)
145 task = current;
146
147 if (task == current) {
148 /* Grab ebp right from our regs */
149 asm ("movl %%ebp, %0" : "=r" (ebp) : );
150 } else {
151 /* ebp is the last reg pushed by switch_to */
152 ebp = *(unsigned long *) task->thread.esp;
153 }
154
155 while (1) {
156 struct thread_info *context;
157 context = (struct thread_info *)
158 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
159 ebp = print_context_stack(context, stack, ebp);
160 stack = (unsigned long*)context->previous_esp;
161 if (!stack)
162 break;
163 printk(" =======================\n");
164 }
165}
166
167void show_stack(struct task_struct *task, unsigned long *esp)
168{
169 unsigned long *stack;
170 int i;
171
172 if (esp == NULL) {
173 if (task)
174 esp = (unsigned long*)task->thread.esp;
175 else
176 esp = (unsigned long *)&esp;
177 }
178
179 stack = esp;
180 for(i = 0; i < kstack_depth_to_print; i++) {
181 if (kstack_end(stack))
182 break;
183 if (i && ((i % 8) == 0))
184 printk("\n ");
185 printk("%08lx ", *stack++);
186 }
187 printk("\nCall Trace:\n");
188 show_trace(task, esp);
189}
190
191/*
192 * The architecture-independent dump_stack generator
193 */
194void dump_stack(void)
195{
196 unsigned long stack;
197
198 show_trace(current, &stack);
199}
200
201EXPORT_SYMBOL(dump_stack);
202
203void show_registers(struct pt_regs *regs)
204{
205 int i;
206 int in_kernel = 1;
207 unsigned long esp;
208 unsigned short ss;
209
210 esp = (unsigned long) (&regs->esp);
211 ss = __KERNEL_DS;
212 if (regs->xcs & 3) {
213 in_kernel = 0;
214 esp = regs->esp;
215 ss = regs->xss & 0xffff;
216 }
217 print_modules();
218 printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx"
219 " (%s) \n",
220 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
221 print_tainted(), regs->eflags, system_utsname.release);
222 print_symbol("EIP is at %s\n", regs->eip);
223 printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
224 regs->eax, regs->ebx, regs->ecx, regs->edx);
225 printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
226 regs->esi, regs->edi, regs->ebp, esp);
227 printk("ds: %04x es: %04x ss: %04x\n",
228 regs->xds & 0xffff, regs->xes & 0xffff, ss);
229 printk("Process %s (pid: %d, threadinfo=%p task=%p)",
230 current->comm, current->pid, current_thread_info(), current);
231 /*
232 * When in-kernel, we also print out the stack and code at the
233 * time of the fault..
234 */
235 if (in_kernel) {
236 u8 *eip;
237
238 printk("\nStack: ");
239 show_stack(NULL, (unsigned long*)esp);
240
241 printk("Code: ");
242
243 eip = (u8 *)regs->eip - 43;
244 for (i = 0; i < 64; i++, eip++) {
245 unsigned char c;
246
247 if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) {
248 printk(" Bad EIP value.");
249 break;
250 }
251 if (eip == (u8 *)regs->eip)
252 printk("<%02x> ", c);
253 else
254 printk("%02x ", c);
255 }
256 }
257 printk("\n");
258}
259
260static void handle_BUG(struct pt_regs *regs)
261{
262 unsigned short ud2;
263 unsigned short line;
264 char *file;
265 char c;
266 unsigned long eip;
267
268 if (regs->xcs & 3)
269 goto no_bug; /* Not in kernel */
270
271 eip = regs->eip;
272
273 if (eip < PAGE_OFFSET)
274 goto no_bug;
275 if (__get_user(ud2, (unsigned short *)eip))
276 goto no_bug;
277 if (ud2 != 0x0b0f)
278 goto no_bug;
279 if (__get_user(line, (unsigned short *)(eip + 2)))
280 goto bug;
281 if (__get_user(file, (char **)(eip + 4)) ||
282 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
283 file = "<bad filename>";
284
285 printk("------------[ cut here ]------------\n");
286 printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line);
287
288no_bug:
289 return;
290
291 /* Here we know it was a BUG but file-n-line is unavailable */
292bug:
293 printk("Kernel BUG\n");
294}
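handle_BUG() above re-reads the bytes at the trapping EIP: a ud2 opcode (0x0f 0x0b, fetched as the little-endian word 0x0b0f), a 16-bit line number at eip + 2 and a pointer to the file name at eip + 4. A stand-alone sketch of that record, with illustrative names only:

	#include <stdint.h>

	struct bug_record {			/* bytes starting at regs->eip       */
		uint16_t ud2;			/* 0x0b0f when read as a 16-bit word */
		uint16_t line;			/* BUG() source line (eip + 2)       */
		const char *file;		/* file name pointer (eip + 4)       */
	};

	static int looks_like_bug(const struct bug_record *r)
	{
		return r->ud2 == 0x0b0f;	/* same check handle_BUG() performs  */
	}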
295
296void die(const char * str, struct pt_regs * regs, long err)
297{
298 static struct {
299 spinlock_t lock;
300 u32 lock_owner;
301 int lock_owner_depth;
302 } die = {
303 .lock = SPIN_LOCK_UNLOCKED,
304 .lock_owner = -1,
305 .lock_owner_depth = 0
306 };
307 static int die_counter;
308
309 if (die.lock_owner != _smp_processor_id()) {
310 console_verbose();
311 spin_lock_irq(&die.lock);
312 die.lock_owner = smp_processor_id();
313 die.lock_owner_depth = 0;
314 bust_spinlocks(1);
315 }
316
317 if (++die.lock_owner_depth < 3) {
318 int nl = 0;
319 handle_BUG(regs);
320 printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
321#ifdef CONFIG_PREEMPT
322 printk("PREEMPT ");
323 nl = 1;
324#endif
325#ifdef CONFIG_SMP
326 printk("SMP ");
327 nl = 1;
328#endif
329#ifdef CONFIG_DEBUG_PAGEALLOC
330 printk("DEBUG_PAGEALLOC");
331 nl = 1;
332#endif
333 if (nl)
334 printk("\n");
335 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
336 show_registers(regs);
337 } else
338 printk(KERN_ERR "Recursive die() failure, output suppressed\n");
339
340 bust_spinlocks(0);
341 die.lock_owner = -1;
342 spin_unlock_irq(&die.lock);
343 if (in_interrupt())
344 panic("Fatal exception in interrupt");
345
346 if (panic_on_oops) {
347 printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
348 ssleep(5);
349 panic("Fatal exception");
350 }
351 do_exit(SIGSEGV);
352}
353
354static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
355{
356 if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs))
357 die(str, regs, err);
358}
359
360static void do_trap(int trapnr, int signr, char *str, int vm86,
361 struct pt_regs * regs, long error_code, siginfo_t *info)
362{
363 if (regs->eflags & VM_MASK) {
364 if (vm86)
365 goto vm86_trap;
366 goto trap_signal;
367 }
368
369 if (!(regs->xcs & 3))
370 goto kernel_trap;
371
372 trap_signal: {
373 struct task_struct *tsk = current;
374 tsk->thread.error_code = error_code;
375 tsk->thread.trap_no = trapnr;
376 if (info)
377 force_sig_info(signr, info, tsk);
378 else
379 force_sig(signr, tsk);
380 return;
381 }
382
383 kernel_trap: {
384 if (!fixup_exception(regs))
385 die(str, regs, error_code);
386 return;
387 }
388
389 vm86_trap: {
390 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
391 if (ret) goto trap_signal;
392 return;
393 }
394}
395
396#define DO_ERROR(trapnr, signr, str, name) \
397fastcall void do_##name(struct pt_regs * regs, long error_code) \
398{ \
399 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
400 == NOTIFY_STOP) \
401 return; \
402 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
403}
404
405#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
406fastcall void do_##name(struct pt_regs * regs, long error_code) \
407{ \
408 siginfo_t info; \
409 info.si_signo = signr; \
410 info.si_errno = 0; \
411 info.si_code = sicode; \
412 info.si_addr = (void __user *)siaddr; \
413 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
414 == NOTIFY_STOP) \
415 return; \
416 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
417}
418
419#define DO_VM86_ERROR(trapnr, signr, str, name) \
420fastcall void do_##name(struct pt_regs * regs, long error_code) \
421{ \
422 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
423 == NOTIFY_STOP) \
424 return; \
425 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
426}
427
428#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
429fastcall void do_##name(struct pt_regs * regs, long error_code) \
430{ \
431 siginfo_t info; \
432 info.si_signo = signr; \
433 info.si_errno = 0; \
434 info.si_code = sicode; \
435 info.si_addr = (void __user *)siaddr; \
436 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
437 == NOTIFY_STOP) \
438 return; \
439 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
440}
441
442DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
443#ifndef CONFIG_KPROBES
444DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
445#endif
446DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
447DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
448DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
449DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
450DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
451DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
452DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
453DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
454
455fastcall void do_general_protection(struct pt_regs * regs, long error_code)
456{
457 int cpu = get_cpu();
458 struct tss_struct *tss = &per_cpu(init_tss, cpu);
459 struct thread_struct *thread = &current->thread;
460
461 /*
462 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
463 * invalid offset set (the LAZY one) and the faulting thread has
464 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
465 * and we set the offset field correctly. Then we let the CPU
466 * restart the faulting instruction.
467 */
468 if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
469 thread->io_bitmap_ptr) {
470 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
471 thread->io_bitmap_max);
472 /*
473 * If the previously set map extended to higher ports
474 * than the current one, pad extra space with 0xff (no access).
475 */
476 if (thread->io_bitmap_max < tss->io_bitmap_max)
477 memset((char *) tss->io_bitmap +
478 thread->io_bitmap_max, 0xff,
479 tss->io_bitmap_max - thread->io_bitmap_max);
480 tss->io_bitmap_max = thread->io_bitmap_max;
481 tss->io_bitmap_base = IO_BITMAP_OFFSET;
482 put_cpu();
483 return;
484 }
485 put_cpu();
486
487 if (regs->eflags & VM_MASK)
488 goto gp_in_vm86;
489
490 if (!(regs->xcs & 3))
491 goto gp_in_kernel;
492
493 current->thread.error_code = error_code;
494 current->thread.trap_no = 13;
495 force_sig(SIGSEGV, current);
496 return;
497
498gp_in_vm86:
499 local_irq_enable();
500 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
501 return;
502
503gp_in_kernel:
504 if (!fixup_exception(regs)) {
505 if (notify_die(DIE_GPF, "general protection fault", regs,
506 error_code, 13, SIGSEGV) == NOTIFY_STOP)
507 return;
508 die("general protection fault", regs, error_code);
509 }
510}
511
512static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
513{
514 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
515 printk("You probably have a hardware problem with your RAM chips\n");
516
517 /* Clear and disable the memory parity error line. */
518 clear_mem_error(reason);
519}
520
521static void io_check_error(unsigned char reason, struct pt_regs * regs)
522{
523 unsigned long i;
524
525 printk("NMI: IOCK error (debug interrupt?)\n");
526 show_registers(regs);
527
528 /* Re-enable the IOCK line, wait for a few seconds */
529 reason = (reason & 0xf) | 8;
530 outb(reason, 0x61);
531 i = 2000;
532 while (--i) udelay(1000);
533 reason &= ~8;
534 outb(reason, 0x61);
535}
536
537static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
538{
539#ifdef CONFIG_MCA
540 /* Might actually be able to figure out what the guilty party
541 * is. */
542 if( MCA_bus ) {
543 mca_handle_nmi();
544 return;
545 }
546#endif
547 printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
548 reason, smp_processor_id());
549 printk("Dazed and confused, but trying to continue\n");
550 printk("Do you have a strange power saving mode enabled?\n");
551}
552
553static DEFINE_SPINLOCK(nmi_print_lock);
554
555void die_nmi (struct pt_regs *regs, const char *msg)
556{
557 spin_lock(&nmi_print_lock);
558 /*
559 * We are in trouble anyway, let's at least try
560 * to get a message out.
561 */
562 bust_spinlocks(1);
563 printk(msg);
564 printk(" on CPU%d, eip %08lx, registers:\n",
565 smp_processor_id(), regs->eip);
566 show_registers(regs);
567 printk("console shuts up ...\n");
568 console_silent();
569 spin_unlock(&nmi_print_lock);
570 bust_spinlocks(0);
571 do_exit(SIGSEGV);
572}
573
574static void default_do_nmi(struct pt_regs * regs)
575{
576 unsigned char reason = 0;
577
578 /* Only the BSP gets external NMIs from the system. */
579 if (!smp_processor_id())
580 reason = get_nmi_reason();
581
582 if (!(reason & 0xc0)) {
583 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
584 == NOTIFY_STOP)
585 return;
586#ifdef CONFIG_X86_LOCAL_APIC
587 /*
588 * Ok, so this is none of the documented NMI sources,
589 * so it must be the NMI watchdog.
590 */
591 if (nmi_watchdog) {
592 nmi_watchdog_tick(regs);
593 return;
594 }
595#endif
596 unknown_nmi_error(reason, regs);
597 return;
598 }
599 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
600 return;
601 if (reason & 0x80)
602 mem_parity_error(reason, regs);
603 if (reason & 0x40)
604 io_check_error(reason, regs);
605 /*
606 * Reassert NMI in case it became active meanwhile
607 * as it's edge-triggered.
608 */
609 reassert_nmi();
610}
611
612static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
613{
614 return 0;
615}
616
617static nmi_callback_t nmi_callback = dummy_nmi_callback;
618
619fastcall void do_nmi(struct pt_regs * regs, long error_code)
620{
621 int cpu;
622
623 nmi_enter();
624
625 cpu = smp_processor_id();
626 ++nmi_count(cpu);
627
628 if (!nmi_callback(regs, cpu))
629 default_do_nmi(regs);
630
631 nmi_exit();
632}
633
634void set_nmi_callback(nmi_callback_t callback)
635{
636 nmi_callback = callback;
637}
638
639void unset_nmi_callback(void)
640{
641 nmi_callback = dummy_nmi_callback;
642}
643
644#ifdef CONFIG_KPROBES
645fastcall int do_int3(struct pt_regs *regs, long error_code)
646{
647 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
648 == NOTIFY_STOP)
649 return 1;
650 /* This is an interrupt gate, because kprobes wants interrupts
651 disabled. Normal trap handlers don't. */
652 restore_interrupts(regs);
653 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
654 return 0;
655}
656#endif
657
658/*
659 * Our handling of the processor debug registers is non-trivial.
660 * We do not clear them on entry and exit from the kernel. Therefore
661 * it is possible to get a watchpoint trap here from inside the kernel.
662 * However, the code in ./ptrace.c has ensured that the user can
663 * only set watchpoints on userspace addresses. Therefore the in-kernel
664 * watchpoint trap can only occur in code which is reading/writing
665 * from user space. Such code must not hold kernel locks (since it
666 * can equally take a page fault), therefore it is safe to call
667 * force_sig_info even though that claims and releases locks.
668 *
669 * Code in ./signal.c ensures that the debug control register
670 * is restored before we deliver any signal, and therefore that
671 * user code runs with the correct debug control register even though
672 * we clear it here.
673 *
674 * Being careful here means that we don't have to be as careful in a
675 * lot of more complicated places (task switching can be a bit lazy
676 * about restoring all the debug state, and ptrace doesn't have to
677 * find every occurrence of the TF bit that could be saved away even
678 * by user code)
679 */
680fastcall void do_debug(struct pt_regs * regs, long error_code)
681{
682 unsigned int condition;
683 struct task_struct *tsk = current;
684
685 __asm__ __volatile__("movl %%db6,%0" : "=r" (condition));
686
687 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
688 SIGTRAP) == NOTIFY_STOP)
689 return;
690 /* It's safe to allow irq's after DR6 has been saved */
691 if (regs->eflags & X86_EFLAGS_IF)
692 local_irq_enable();
693
694 /* Mask out spurious debug traps due to lazy DR7 setting */
695 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
696 if (!tsk->thread.debugreg[7])
697 goto clear_dr7;
698 }
699
700 if (regs->eflags & VM_MASK)
701 goto debug_vm86;
702
703 /* Save debug status register where ptrace can see it */
704 tsk->thread.debugreg[6] = condition;
705
706 /*
707 * Single-stepping through TF: make sure we ignore any events in
708 * kernel space (but re-enable TF when returning to user mode).
709 */
710 if (condition & DR_STEP) {
711 /*
712 * We already checked v86 mode above, so we can
713 * check for kernel mode by just checking the CPL
714 * of CS.
715 */
716 if ((regs->xcs & 3) == 0)
717 goto clear_TF_reenable;
718 }
719
720 /* Ok, finally something we can handle */
721 send_sigtrap(tsk, regs, error_code);
722
723 /* Disable additional traps. They'll be re-enabled when
724 * the signal is delivered.
725 */
726clear_dr7:
727 __asm__("movl %0,%%db7"
728 : /* no output */
729 : "r" (0));
730 return;
731
732debug_vm86:
733 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
734 return;
735
736clear_TF_reenable:
737 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
738 regs->eflags &= ~TF_MASK;
739 return;
740}
741
742/*
743 * Note that we play around with the 'TS' bit in an attempt to get
744 * the correct behaviour even in the presence of the asynchronous
745 * IRQ13 behaviour
746 */
747void math_error(void __user *eip)
748{
749 struct task_struct * task;
750 siginfo_t info;
751 unsigned short cwd, swd;
752
753 /*
754 * Save the info for the exception handler and clear the error.
755 */
756 task = current;
757 save_init_fpu(task);
758 task->thread.trap_no = 16;
759 task->thread.error_code = 0;
760 info.si_signo = SIGFPE;
761 info.si_errno = 0;
762 info.si_code = __SI_FAULT;
763 info.si_addr = eip;
764 /*
765 * (~cwd & swd) will mask out exceptions that are not set to unmasked
766 * status. 0x3f is the exception bits in these regs, 0x200 is the
767 * C1 reg you need in case of a stack fault, 0x040 is the stack
768 * fault bit. We should only be taking one exception at a time,
769 * so if this combination doesn't produce any single exception,
770 * then we have a bad program that isn't synchronizing its FPU usage
771 * and it will suffer the consequences since we won't be able to
772 * fully reproduce the context of the exception
773 */
774 cwd = get_fpu_cwd(task);
775 swd = get_fpu_swd(task);
776 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
777 case 0x000:
778 default:
779 break;
780 case 0x001: /* Invalid Op */
781 case 0x041: /* Stack Fault */
782 case 0x241: /* Stack Fault | Direction */
783 info.si_code = FPE_FLTINV;
784 /* Should we clear the SF or let user space do it ???? */
785 break;
786 case 0x002: /* Denormalize */
787 case 0x010: /* Underflow */
788 info.si_code = FPE_FLTUND;
789 break;
790 case 0x004: /* Zero Divide */
791 info.si_code = FPE_FLTDIV;
792 break;
793 case 0x008: /* Overflow */
794 info.si_code = FPE_FLTOVF;
795 break;
796 case 0x020: /* Precision */
797 info.si_code = FPE_FLTRES;
798 break;
799 }
800 force_sig_info(SIGFPE, &info, task);
801}
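The comment in math_error() above explains the (~cwd & swd) masking in words; a tiny user-space sketch with sample values (not taken from the patch) shows the switch key it yields when only the divide-by-zero exception is unmasked:

	#include <stdio.h>

	int main(void)
	{
		unsigned short cwd = 0x037b;	/* default control word with ZM (bit 2) cleared, i.e. #Z unmasked */
		unsigned short swd = 0x0004;	/* status word after a divide by zero: ZE set                     */
		unsigned short key = ((~cwd) & swd & 0x3f) | (swd & 0x240);

		printf("switch key = %#05x\n", key);	/* 0x004 -> FPE_FLTDIV in math_error() */
		return 0;
	}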
802
803fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
804{
805 ignore_fpu_irq = 1;
806 math_error((void __user *)regs->eip);
807}
808
809static void simd_math_error(void __user *eip)
810{
811 struct task_struct * task;
812 siginfo_t info;
813 unsigned short mxcsr;
814
815 /*
816 * Save the info for the exception handler and clear the error.
817 */
818 task = current;
819 save_init_fpu(task);
820 task->thread.trap_no = 19;
821 task->thread.error_code = 0;
822 info.si_signo = SIGFPE;
823 info.si_errno = 0;
824 info.si_code = __SI_FAULT;
825 info.si_addr = eip;
826 /*
827 * The SIMD FPU exceptions are handled a little differently, as there
828 * is only a single status/control register. Thus, to determine which
829 * unmasked exception was caught we must mask the exception mask bits
830 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
831 */
832 mxcsr = get_fpu_mxcsr(task);
833 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
834 case 0x000:
835 default:
836 break;
837 case 0x001: /* Invalid Op */
838 info.si_code = FPE_FLTINV;
839 break;
840 case 0x002: /* Denormalize */
841 case 0x010: /* Underflow */
842 info.si_code = FPE_FLTUND;
843 break;
844 case 0x004: /* Zero Divide */
845 info.si_code = FPE_FLTDIV;
846 break;
847 case 0x008: /* Overflow */
848 info.si_code = FPE_FLTOVF;
849 break;
850 case 0x020: /* Precision */
851 info.si_code = FPE_FLTRES;
852 break;
853 }
854 force_sig_info(SIGFPE, &info, task);
855}
856
857fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
858 long error_code)
859{
860 if (cpu_has_xmm) {
861 /* Handle SIMD FPU exceptions on PIII+ processors. */
862 ignore_fpu_irq = 1;
863 simd_math_error((void __user *)regs->eip);
864 } else {
865 /*
866 * Handle strange cache flush from user space exception
867 * in all other cases. This is undocumented behaviour.
868 */
869 if (regs->eflags & VM_MASK) {
870 handle_vm86_fault((struct kernel_vm86_regs *)regs,
871 error_code);
872 return;
873 }
874 die_if_kernel("cache flush denied", regs, error_code);
875 current->thread.trap_no = 19;
876 current->thread.error_code = error_code;
877 force_sig(SIGSEGV, current);
878 }
879}
880
881fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
882 long error_code)
883{
884#if 0
885 /* No need to warn about this any longer. */
886 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
887#endif
888}
889
890fastcall void setup_x86_bogus_stack(unsigned char * stk)
891{
892 unsigned long *switch16_ptr, *switch32_ptr;
893 struct pt_regs *regs;
894 unsigned long stack_top, stack_bot;
895 unsigned short iret_frame16_off;
896 int cpu = smp_processor_id();
897 /* reserve the space on 32bit stack for the magic switch16 pointer */
898 memmove(stk, stk + 8, sizeof(struct pt_regs));
899 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
900 regs = (struct pt_regs *)stk;
901 /* now the switch32 on 16bit stack */
902 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
903 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
904 switch32_ptr = (unsigned long *)(stack_top - 8);
905 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
906 /* copy iret frame on 16bit stack */
907 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
908 /* fill in the switch pointers */
909 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
910 switch16_ptr[1] = __ESPFIX_SS;
911 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
912 8 - CPU_16BIT_STACK_SIZE;
913 switch32_ptr[1] = __KERNEL_DS;
914}
915
916fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
917{
918 unsigned long *switch32_ptr;
919 unsigned char *stack16, *stack32;
920 unsigned long stack_top, stack_bot;
921 int len;
922 int cpu = smp_processor_id();
923 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
924 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
925 switch32_ptr = (unsigned long *)(stack_top - 8);
926 /* copy the data from 16bit stack to 32bit stack */
927 len = CPU_16BIT_STACK_SIZE - 8 - sp;
928 stack16 = (unsigned char *)(stack_bot + sp);
929 stack32 = (unsigned char *)
930 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
931 memcpy(stack32, stack16, len);
932 return stack32;
933}
934
935/*
936 * 'math_state_restore()' saves the current math information in the
937 * old math state array, and gets the new ones from the current task
938 *
939 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
940 * Don't touch unless you *really* know how it works.
941 *
942 * Must be called with kernel preemption disabled (in this case,
943 * local interrupts are disabled at the call-site in entry.S).
944 */
945asmlinkage void math_state_restore(struct pt_regs regs)
946{
947 struct thread_info *thread = current_thread_info();
948 struct task_struct *tsk = thread->task;
949
950 clts(); /* Allow maths ops (or we recurse) */
951 if (!tsk_used_math(tsk))
952 init_fpu(tsk);
953 restore_fpu(tsk);
954 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
955}
956
957#ifndef CONFIG_MATH_EMULATION
958
959asmlinkage void math_emulate(long arg)
960{
961 printk("math-emulation not enabled and no coprocessor found.\n");
962 printk("killing %s.\n",current->comm);
963 force_sig(SIGFPE,current);
964 schedule();
965}
966
967#endif /* CONFIG_MATH_EMULATION */
968
969#ifdef CONFIG_X86_F00F_BUG
970void __init trap_init_f00f_bug(void)
971{
972 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
973
974 /*
975 * Update the IDT descriptor and reload the IDT so that
976 * it uses the read-only mapped virtual address.
977 */
978 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
979 __asm__ __volatile__("lidt %0" : : "m" (idt_descr));
980}
981#endif
982
983#define _set_gate(gate_addr,type,dpl,addr,seg) \
984do { \
985 int __d0, __d1; \
986 __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
987 "movw %4,%%dx\n\t" \
988 "movl %%eax,%0\n\t" \
989 "movl %%edx,%1" \
990 :"=m" (*((long *) (gate_addr))), \
991 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
992 :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
993 "3" ((char *) (addr)),"2" ((seg) << 16)); \
994} while (0)
995
996
997/*
998 * This needs to use 'idt_table' rather than 'idt', and
999 * thus use the _nonmapped_ version of the IDT, as the
1000 * Pentium F0 0F bugfix can have resulted in the mapped
1001 * IDT being write-protected.
1002 */
1003void set_intr_gate(unsigned int n, void *addr)
1004{
1005 _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
1006}
1007
1008/*
1009 * This routine sets up an interrupt gate at descriptor privilege level 3.
1010 */
1011static inline void set_system_intr_gate(unsigned int n, void *addr)
1012{
1013 _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
1014}
1015
1016static void __init set_trap_gate(unsigned int n, void *addr)
1017{
1018 _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
1019}
1020
1021static void __init set_system_gate(unsigned int n, void *addr)
1022{
1023 _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
1024}
1025
1026static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1027{
1028 _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
1029}
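The _set_gate() macro above packs an IDT descriptor into two 32-bit words with inline assembly; the same computation written in plain C, as an illustrative sketch (names not part of the patch), makes the encoding easier to see:

	#include <stdint.h>

	static void build_gate(uint32_t gate[2], unsigned int type, unsigned int dpl,
			       uint32_t offset, uint16_t seg)
	{
		gate[0] = ((uint32_t)seg << 16) | (offset & 0xffffu);		/* selector | offset 15..0        */
		gate[1] = (offset & 0xffff0000u) |
			  (0x8000u | (dpl << 13) | (type << 8));		/* P=1, DPL, type | offset 31..16 */
	}

	/* e.g. set_intr_gate()   -> type 14, dpl 0 -> access bits 0x8e00,
	 *      set_system_gate() -> type 15, dpl 3 -> access bits 0xef00. */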
1030
1031
1032void __init trap_init(void)
1033{
1034#ifdef CONFIG_EISA
1035 void __iomem *p = ioremap(0x0FFFD9, 4);
1036 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1037 EISA_bus = 1;
1038 }
1039 iounmap(p);
1040#endif
1041
1042#ifdef CONFIG_X86_LOCAL_APIC
1043 init_apic_mappings();
1044#endif
1045
1046 set_trap_gate(0,&divide_error);
1047 set_intr_gate(1,&debug);
1048 set_intr_gate(2,&nmi);
1049 set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
1050 set_system_gate(4,&overflow);
1051 set_system_gate(5,&bounds);
1052 set_trap_gate(6,&invalid_op);
1053 set_trap_gate(7,&device_not_available);
1054 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
1055 set_trap_gate(9,&coprocessor_segment_overrun);
1056 set_trap_gate(10,&invalid_TSS);
1057 set_trap_gate(11,&segment_not_present);
1058 set_trap_gate(12,&stack_segment);
1059 set_trap_gate(13,&general_protection);
1060 set_intr_gate(14,&page_fault);
1061 set_trap_gate(15,&spurious_interrupt_bug);
1062 set_trap_gate(16,&coprocessor_error);
1063 set_trap_gate(17,&alignment_check);
1064#ifdef CONFIG_X86_MCE
1065 set_trap_gate(18,&machine_check);
1066#endif
1067 set_trap_gate(19,&simd_coprocessor_error);
1068
1069 set_system_gate(SYSCALL_VECTOR,&system_call);
1070
1071 /*
1072 * Should be a barrier for any external CPU state.
1073 */
1074 cpu_init();
1075
1076 trap_init_hook();
1077}
1078
1079static int __init kstack_setup(char *s)
1080{
1081 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1082 return 0;
1083}
1084__setup("kstack=", kstack_setup);
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
new file mode 100644
index 000000000000..2f3d52dacff7
--- /dev/null
+++ b/arch/i386/kernel/vm86.c
@@ -0,0 +1,804 @@
1/*
2 * linux/kernel/vm86.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
7 * stack - Manfred Spraul <manfreds@colorfullife.com>
8 *
9 * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
10 * them correctly. Now the emulation will be in a
11 * consistent state after stackfaults - Kasper Dupont
12 * <kasperd@daimi.au.dk>
13 *
14 * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
15 * <kasperd@daimi.au.dk>
16 *
17 * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
18 * caused by Kasper Dupont's changes - Stas Sergeev
19 *
20 * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
21 * Kasper Dupont <kasperd@daimi.au.dk>
22 *
23 * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
24 * Kasper Dupont <kasperd@daimi.au.dk>
25 *
26 * 9 apr 2002 - Changed stack access macros to jump to a label
27 * instead of returning to userspace. This simplifies
28 * do_int, and is needed by handle_vm86_fault. Kasper
29 * Dupont <kasperd@daimi.au.dk>
30 *
31 */
32
33#include <linux/config.h>
34#include <linux/errno.h>
35#include <linux/interrupt.h>
36#include <linux/sched.h>
37#include <linux/kernel.h>
38#include <linux/signal.h>
39#include <linux/string.h>
40#include <linux/mm.h>
41#include <linux/smp.h>
42#include <linux/smp_lock.h>
43#include <linux/highmem.h>
44#include <linux/ptrace.h>
45
46#include <asm/uaccess.h>
47#include <asm/io.h>
48#include <asm/tlbflush.h>
49#include <asm/irq.h>
50
51/*
52 * Known problems:
53 *
54 * Interrupt handling is not guaranteed:
55 * - a real x86 will disable all interrupts for one instruction
56 * after a "mov ss,xx" to make stack handling atomic even without
57 * the 'lss' instruction. We can't guarantee this in v86 mode,
58 * as the next instruction might result in a page fault or similar.
59 * - a real x86 will have interrupts disabled for one instruction
60 * past the 'sti' that enables them. We don't bother with all the
61 * details yet.
62 *
63 * Let's hope these problems do not actually matter for anything.
64 */
65
66
67#define KVM86 ((struct kernel_vm86_struct *)regs)
68#define VMPI KVM86->vm86plus
69
70
71/*
72 * 8- and 16-bit register defines..
73 */
74#define AL(regs) (((unsigned char *)&((regs)->eax))[0])
75#define AH(regs) (((unsigned char *)&((regs)->eax))[1])
76#define IP(regs) (*(unsigned short *)&((regs)->eip))
77#define SP(regs) (*(unsigned short *)&((regs)->esp))
78
79/*
80 * virtual flags (16 and 32-bit versions)
81 */
82#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
83#define VEFLAGS (current->thread.v86flags)
84
85#define set_flags(X,new,mask) \
86((X) = ((X) & ~(mask)) | ((new) & (mask)))
87
88#define SAFE_MASK (0xDD5)
89#define RETURN_MASK (0xDFF)
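set_flags() above is a plain mask-merge: bits in the mask come from the new value, everything else keeps its old value. A small stand-alone sketch (sample values only) of how set_vflags_long()/set_vflags_short() below use it with SAFE_MASK to drop privileged bits such as IF and IOPL from a user-supplied eflags value:

	#include <stdio.h>

	#define set_flags(X, new, mask) \
		((X) = ((X) & ~(mask)) | ((new) & (mask)))

	int main(void)
	{
		unsigned long user_eflags = 0x00003202;	/* IF set, IOPL = 3, plus the always-set bit 1 */
		unsigned long eflags      = 0x00000002;	/* value being built up for vm86 mode          */

		set_flags(eflags, user_eflags, 0xDD5);	/* SAFE_MASK: arithmetic flags, TF, DF, OF only */
		printf("merged eflags = %#lx\n", eflags);	/* prints 0x2: IF and IOPL were dropped */
		return 0;
	}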
90
91#define VM86_REGS_PART2 orig_eax
92#define VM86_REGS_SIZE1 \
93 ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) )
94#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1)
95
96struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
97struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
98{
99 struct tss_struct *tss;
100 struct pt_regs *ret;
101 unsigned long tmp;
102
103 /*
104 * This gets called from entry.S with interrupts disabled, but
105 * from process context. Enable interrupts here, before trying
106 * to access user space.
107 */
108 local_irq_enable();
109
110 if (!current->thread.vm86_info) {
111 printk("no vm86_info: BAD\n");
112 do_exit(SIGSEGV);
113 }
114 set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
115 tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1);
116 tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
117 &regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
118 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
119 if (tmp) {
120 printk("vm86: could not access userspace vm86_info\n");
121 do_exit(SIGSEGV);
122 }
123
124 tss = &per_cpu(init_tss, get_cpu());
125 current->thread.esp0 = current->thread.saved_esp0;
126 current->thread.sysenter_cs = __KERNEL_CS;
127 load_esp0(tss, &current->thread);
128 current->thread.saved_esp0 = 0;
129 put_cpu();
130
131 loadsegment(fs, current->thread.saved_fs);
132 loadsegment(gs, current->thread.saved_gs);
133 ret = KVM86->regs32;
134 return ret;
135}
136
137static void mark_screen_rdonly(struct task_struct * tsk)
138{
139 pgd_t *pgd;
140 pud_t *pud;
141 pmd_t *pmd;
142 pte_t *pte, *mapped;
143 int i;
144
145 preempt_disable();
146 spin_lock(&tsk->mm->page_table_lock);
147 pgd = pgd_offset(tsk->mm, 0xA0000);
148 if (pgd_none_or_clear_bad(pgd))
149 goto out;
150 pud = pud_offset(pgd, 0xA0000);
151 if (pud_none_or_clear_bad(pud))
152 goto out;
153 pmd = pmd_offset(pud, 0xA0000);
154 if (pmd_none_or_clear_bad(pmd))
155 goto out;
156 pte = mapped = pte_offset_map(pmd, 0xA0000);
157 for (i = 0; i < 32; i++) {
158 if (pte_present(*pte))
159 set_pte(pte, pte_wrprotect(*pte));
160 pte++;
161 }
162 pte_unmap(mapped);
163out:
164 spin_unlock(&tsk->mm->page_table_lock);
165 preempt_enable();
166 flush_tlb();
167}
168
169
170
171static int do_vm86_irq_handling(int subfunction, int irqnumber);
172static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
173
174asmlinkage int sys_vm86old(struct pt_regs regs)
175{
176 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
177 struct kernel_vm86_struct info; /* declare this _on top_,
178 * this avoids wasting stack space.
179 * This remains on the stack until we
180 * return to 32 bit user space.
181 */
182 struct task_struct *tsk;
183 int tmp, ret = -EPERM;
184
185 tsk = current;
186 if (tsk->thread.saved_esp0)
187 goto out;
188 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1);
189 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
190 (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2);
191 ret = -EFAULT;
192 if (tmp)
193 goto out;
194 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
195 info.regs32 = &regs;
196 tsk->thread.vm86_info = v86;
197 do_sys_vm86(&info, tsk);
198 ret = 0; /* we never return here */
199out:
200 return ret;
201}
202
203
204asmlinkage int sys_vm86(struct pt_regs regs)
205{
206 struct kernel_vm86_struct info; /* declare this _on top_,
207 * this avoids wasting stack space.
208 * This remains on the stack until we
209 * return to 32 bit user space.
210 */
211 struct task_struct *tsk;
212 int tmp, ret;
213 struct vm86plus_struct __user *v86;
214
215 tsk = current;
216 switch (regs.ebx) {
217 case VM86_REQUEST_IRQ:
218 case VM86_FREE_IRQ:
219 case VM86_GET_IRQ_BITS:
220 case VM86_GET_AND_RESET_IRQ:
221 ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
222 goto out;
223 case VM86_PLUS_INSTALL_CHECK:
224 /* NOTE: on old vm86 stuff this will return the error
225 from verify_area(), because the subfunction is
226 interpreted as an (invalid) address to vm86_struct.
227 So the installation check works.
228 */
229 ret = 0;
230 goto out;
231 }
232
233 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
234 ret = -EPERM;
235 if (tsk->thread.saved_esp0)
236 goto out;
237 v86 = (struct vm86plus_struct __user *)regs.ecx;
238 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1);
239 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
240 (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2);
241 ret = -EFAULT;
242 if (tmp)
243 goto out;
244 info.regs32 = &regs;
245 info.vm86plus.is_vm86pus = 1;
246 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
247 do_sys_vm86(&info, tsk);
248 ret = 0; /* we never return here */
249out:
250 return ret;
251}
252
253
254static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
255{
256 struct tss_struct *tss;
257/*
258 * make sure the vm86() system call doesn't try to do anything silly
259 */
260 info->regs.__null_ds = 0;
261 info->regs.__null_es = 0;
262
263/* we are clearing fs,gs later just before "jmp resume_userspace",
264 * because starting with Linux 2.1.x they are no longer saved/restored
265 */
266
267/*
268 * The eflags register is also special: we cannot trust that the user
269 * has set it up safely, so this makes sure interrupt etc flags are
270 * inherited from protected mode.
271 */
272 VEFLAGS = info->regs.eflags;
273 info->regs.eflags &= SAFE_MASK;
274 info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK;
275 info->regs.eflags |= VM_MASK;
276
277 switch (info->cpu_type) {
278 case CPU_286:
279 tsk->thread.v86mask = 0;
280 break;
281 case CPU_386:
282 tsk->thread.v86mask = NT_MASK | IOPL_MASK;
283 break;
284 case CPU_486:
285 tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
286 break;
287 default:
288 tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
289 break;
290 }
291
292/*
293 * Save old state, set default return value (%eax) to 0
294 */
295 info->regs32->eax = 0;
296 tsk->thread.saved_esp0 = tsk->thread.esp0;
297 asm volatile("movl %%fs,%0":"=m" (tsk->thread.saved_fs));
298 asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs));
299
300 tss = &per_cpu(init_tss, get_cpu());
301 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
302 if (cpu_has_sep)
303 tsk->thread.sysenter_cs = 0;
304 load_esp0(tss, &tsk->thread);
305 put_cpu();
306
307 tsk->thread.screen_bitmap = info->screen_bitmap;
308 if (info->flags & VM86_SCREEN_BITMAP)
309 mark_screen_rdonly(tsk);
310 __asm__ __volatile__(
311 "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
312 "movl %0,%%esp\n\t"
313 "movl %1,%%ebp\n\t"
314 "jmp resume_userspace"
315 : /* no outputs */
316 :"r" (&info->regs), "r" (tsk->thread_info) : "ax");
317 /* we never return here */
318}
319
320static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
321{
322 struct pt_regs * regs32;
323
324 regs32 = save_v86_state(regs16);
325 regs32->eax = retval;
326 __asm__ __volatile__("movl %0,%%esp\n\t"
327 "movl %1,%%ebp\n\t"
328 "jmp resume_userspace"
329 : : "r" (regs32), "r" (current_thread_info()));
330}
331
332static inline void set_IF(struct kernel_vm86_regs * regs)
333{
334 VEFLAGS |= VIF_MASK;
335 if (VEFLAGS & VIP_MASK)
336 return_to_32bit(regs, VM86_STI);
337}
338
339static inline void clear_IF(struct kernel_vm86_regs * regs)
340{
341 VEFLAGS &= ~VIF_MASK;
342}
343
344static inline void clear_TF(struct kernel_vm86_regs * regs)
345{
346 regs->eflags &= ~TF_MASK;
347}
348
349static inline void clear_AC(struct kernel_vm86_regs * regs)
350{
351 regs->eflags &= ~AC_MASK;
352}
353
354/* It is correct to call set_IF(regs) from the set_vflags_*
355 * functions. However someone forgot to call clear_IF(regs)
356 * in the opposite case.
357 * After the command sequence CLI PUSHF STI POPF you should
358 * end up with interrupts disabled, but you ended up with
359 * interrupts enabled.
360 * ( I was testing my own changes, but the only bug I
361 * could find was in a function I had not changed. )
362 * [KD]
363 */
364
365static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
366{
367 set_flags(VEFLAGS, eflags, current->thread.v86mask);
368 set_flags(regs->eflags, eflags, SAFE_MASK);
369 if (eflags & IF_MASK)
370 set_IF(regs);
371 else
372 clear_IF(regs);
373}
374
375static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
376{
377 set_flags(VFLAGS, flags, current->thread.v86mask);
378 set_flags(regs->eflags, flags, SAFE_MASK);
379 if (flags & IF_MASK)
380 set_IF(regs);
381 else
382 clear_IF(regs);
383}
384
385static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
386{
387 unsigned long flags = regs->eflags & RETURN_MASK;
388
389 if (VEFLAGS & VIF_MASK)
390 flags |= IF_MASK;
391 flags |= IOPL_MASK;
392 return flags | (VEFLAGS & current->thread.v86mask);
393}
394
395static inline int is_revectored(int nr, struct revectored_struct * bitmap)
396{
397 __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
398 :"=r" (nr)
399 :"m" (*bitmap),"r" (nr));
400 return nr;
401}
402
403#define val_byte(val, n) (((__u8 *)&val)[n])
404
405#define pushb(base, ptr, val, err_label) \
406 do { \
407 __u8 __val = val; \
408 ptr--; \
409 if (put_user(__val, base + ptr) < 0) \
410 goto err_label; \
411 } while(0)
412
413#define pushw(base, ptr, val, err_label) \
414 do { \
415 __u16 __val = val; \
416 ptr--; \
417 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
418 goto err_label; \
419 ptr--; \
420 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
421 goto err_label; \
422 } while(0)
423
424#define pushl(base, ptr, val, err_label) \
425 do { \
426 __u32 __val = val; \
427 ptr--; \
428 if (put_user(val_byte(__val, 3), base + ptr) < 0) \
429 goto err_label; \
430 ptr--; \
431 if (put_user(val_byte(__val, 2), base + ptr) < 0) \
432 goto err_label; \
433 ptr--; \
434 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
435 goto err_label; \
436 ptr--; \
437 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
438 goto err_label; \
439 } while(0)
440
441#define popb(base, ptr, err_label) \
442 ({ \
443 __u8 __res; \
444 if (get_user(__res, base + ptr) < 0) \
445 goto err_label; \
446 ptr++; \
447 __res; \
448 })
449
450#define popw(base, ptr, err_label) \
451 ({ \
452 __u16 __res; \
453 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
454 goto err_label; \
455 ptr++; \
456 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
457 goto err_label; \
458 ptr++; \
459 __res; \
460 })
461
462#define popl(base, ptr, err_label) \
463 ({ \
464 __u32 __res; \
465 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
466 goto err_label; \
467 ptr++; \
468 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
469 goto err_label; \
470 ptr++; \
471 if (get_user(val_byte(__res, 2), base + ptr) < 0) \
472 goto err_label; \
473 ptr++; \
474 if (get_user(val_byte(__res, 3), base + ptr) < 0) \
475 goto err_label; \
476 ptr++; \
477 __res; \
478 })
479
480/* There are so many possible reasons for this function to return
481 * VM86_INTx, so adding another doesn't bother me. We can expect
482 * userspace programs to be able to handle it. (Getting a problem
483 * in userspace is always better than an Oops anyway.) [KD]
484 */
485static void do_int(struct kernel_vm86_regs *regs, int i,
486 unsigned char __user * ssp, unsigned short sp)
487{
488 unsigned long __user *intr_ptr;
489 unsigned long segoffs;
490
491 if (regs->cs == BIOSSEG)
492 goto cannot_handle;
493 if (is_revectored(i, &KVM86->int_revectored))
494 goto cannot_handle;
495 if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
496 goto cannot_handle;
497 intr_ptr = (unsigned long __user *) (i << 2);
498 if (get_user(segoffs, intr_ptr))
499 goto cannot_handle;
500 if ((segoffs >> 16) == BIOSSEG)
501 goto cannot_handle;
502 pushw(ssp, sp, get_vflags(regs), cannot_handle);
503 pushw(ssp, sp, regs->cs, cannot_handle);
504 pushw(ssp, sp, IP(regs), cannot_handle);
505 regs->cs = segoffs >> 16;
506 SP(regs) -= 6;
507 IP(regs) = segoffs & 0xffff;
508 clear_TF(regs);
509 clear_IF(regs);
510 clear_AC(regs);
511 return;
512
513cannot_handle:
514 return_to_32bit(regs, VM86_INTx + (i << 8));
515}
516
517int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
518{
519 if (VMPI.is_vm86pus) {
520 if ( (trapno==3) || (trapno==1) )
521 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
522 do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs));
523 return 0;
524 }
525 if (trapno !=1)
526 return 1; /* we let this be handled by the calling routine */
527 if (current->ptrace & PT_PTRACED) {
528 unsigned long flags;
529 spin_lock_irqsave(&current->sighand->siglock, flags);
530 sigdelset(&current->blocked, SIGTRAP);
531 recalc_sigpending();
532 spin_unlock_irqrestore(&current->sighand->siglock, flags);
533 }
534 send_sig(SIGTRAP, current, 1);
535 current->thread.trap_no = trapno;
536 current->thread.error_code = error_code;
537 return 0;
538}
539
540void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
541{
542 unsigned char opcode;
543 unsigned char __user *csp;
544 unsigned char __user *ssp;
545 unsigned short ip, sp;
546 int data32, pref_done;
547
548#define CHECK_IF_IN_TRAP \
549 if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
550 newflags |= TF_MASK
551#define VM86_FAULT_RETURN do { \
552 if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \
553 return_to_32bit(regs, VM86_PICRETURN); \
554 return; } while (0)
555
556 csp = (unsigned char __user *) (regs->cs << 4);
557 ssp = (unsigned char __user *) (regs->ss << 4);
558 sp = SP(regs);
559 ip = IP(regs);
560
561 data32 = 0;
562 pref_done = 0;
563 do {
564 switch (opcode = popb(csp, ip, simulate_sigsegv)) {
565 case 0x66: /* 32-bit data */ data32=1; break;
566 case 0x67: /* 32-bit address */ break;
567 case 0x2e: /* CS */ break;
568 case 0x3e: /* DS */ break;
569 case 0x26: /* ES */ break;
570 case 0x36: /* SS */ break;
571 case 0x65: /* GS */ break;
572 case 0x64: /* FS */ break;
573 case 0xf2: /* repnz */ break;
574 case 0xf3: /* rep */ break;
575 default: pref_done = 1;
576 }
577 } while (!pref_done);
578
579 switch (opcode) {
580
581 /* pushf */
582 case 0x9c:
583 if (data32) {
584 pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
585 SP(regs) -= 4;
586 } else {
587 pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
588 SP(regs) -= 2;
589 }
590 IP(regs) = ip;
591 VM86_FAULT_RETURN;
592
593 /* popf */
594 case 0x9d:
595 {
596 unsigned long newflags;
597 if (data32) {
598 newflags=popl(ssp, sp, simulate_sigsegv);
599 SP(regs) += 4;
600 } else {
601 newflags = popw(ssp, sp, simulate_sigsegv);
602 SP(regs) += 2;
603 }
604 IP(regs) = ip;
605 CHECK_IF_IN_TRAP;
606 if (data32) {
607 set_vflags_long(newflags, regs);
608 } else {
609 set_vflags_short(newflags, regs);
610 }
611 VM86_FAULT_RETURN;
612 }
613
614 /* int xx */
615 case 0xcd: {
616 int intno=popb(csp, ip, simulate_sigsegv);
617 IP(regs) = ip;
618 if (VMPI.vm86dbg_active) {
619 if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
620 return_to_32bit(regs, VM86_INTx + (intno << 8));
621 }
622 do_int(regs, intno, ssp, sp);
623 return;
624 }
625
626 /* iret */
627 case 0xcf:
628 {
629 unsigned long newip;
630 unsigned long newcs;
631 unsigned long newflags;
632 if (data32) {
633 newip=popl(ssp, sp, simulate_sigsegv);
634 newcs=popl(ssp, sp, simulate_sigsegv);
635 newflags=popl(ssp, sp, simulate_sigsegv);
636 SP(regs) += 12;
637 } else {
638 newip = popw(ssp, sp, simulate_sigsegv);
639 newcs = popw(ssp, sp, simulate_sigsegv);
640 newflags = popw(ssp, sp, simulate_sigsegv);
641 SP(regs) += 6;
642 }
643 IP(regs) = newip;
644 regs->cs = newcs;
645 CHECK_IF_IN_TRAP;
646 if (data32) {
647 set_vflags_long(newflags, regs);
648 } else {
649 set_vflags_short(newflags, regs);
650 }
651 VM86_FAULT_RETURN;
652 }
653
654 /* cli */
655 case 0xfa:
656 IP(regs) = ip;
657 clear_IF(regs);
658 VM86_FAULT_RETURN;
659
660 /* sti */
661 /*
662 * Damn. This is incorrect: the 'sti' instruction should actually
663 * enable interrupts after the /next/ instruction. Not good.
664 *
665 * Probably needs some horsing around with the TF flag. Aiee..
666 */
667 case 0xfb:
668 IP(regs) = ip;
669 set_IF(regs);
670 VM86_FAULT_RETURN;
671
672 default:
673 return_to_32bit(regs, VM86_UNKNOWN);
674 }
675
676 return;
677
678simulate_sigsegv:
679 /* FIXME: After a long discussion with Stas we finally
680 * agreed that this is wrong. Here we should
681 * really send a SIGSEGV to the user program.
682 * But how do we create the correct context? We
683 * are inside a general protection fault handler
684 * and have just returned from a page fault handler.
685 * The correct context for the signal handler
686 * should be a mixture of the two, but how do we
687 * get the information? [KD]
688 */
689 return_to_32bit(regs, VM86_UNKNOWN);
690}
691
692/* ---------------- vm86 special IRQ passing stuff ----------------- */
693
694#define VM86_IRQNAME "vm86irq"
695
696static struct vm86_irqs {
697 struct task_struct *tsk;
698 int sig;
699} vm86_irqs[16];
700
701static DEFINE_SPINLOCK(irqbits_lock);
702static int irqbits;
703
704#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
705 | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
706 | (1 << SIGUNUSED) )
707
708static irqreturn_t irq_handler(int intno, void *dev_id, struct pt_regs * regs)
709{
710 int irq_bit;
711 unsigned long flags;
712
713 spin_lock_irqsave(&irqbits_lock, flags);
714 irq_bit = 1 << intno;
715 if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
716 goto out;
717 irqbits |= irq_bit;
718 if (vm86_irqs[intno].sig)
719 send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
720 spin_unlock_irqrestore(&irqbits_lock, flags);
721 /*
722 * IRQ will be re-enabled when user asks for the irq (whether
723 * polling or as a result of the signal)
724 */
725 disable_irq(intno);
726 return IRQ_HANDLED;
727
728out:
729 spin_unlock_irqrestore(&irqbits_lock, flags);
730 return IRQ_NONE;
731}
732
733static inline void free_vm86_irq(int irqnumber)
734{
735 unsigned long flags;
736
737 free_irq(irqnumber, NULL);
738 vm86_irqs[irqnumber].tsk = NULL;
739
740 spin_lock_irqsave(&irqbits_lock, flags);
741 irqbits &= ~(1 << irqnumber);
742 spin_unlock_irqrestore(&irqbits_lock, flags);
743}
744
745void release_vm86_irqs(struct task_struct *task)
746{
747 int i;
748 for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
749 if (vm86_irqs[i].tsk == task)
750 free_vm86_irq(i);
751}
752
753static inline int get_and_reset_irq(int irqnumber)
754{
755 int bit;
756 unsigned long flags;
757
758 if (invalid_vm86_irq(irqnumber)) return 0;
759 if (vm86_irqs[irqnumber].tsk != current) return 0;
760 spin_lock_irqsave(&irqbits_lock, flags);
761 bit = irqbits & (1 << irqnumber);
762 irqbits &= ~bit;
763 spin_unlock_irqrestore(&irqbits_lock, flags);
764 if (!bit)
765 return 0;
766 enable_irq(irqnumber);
767 return 1;
768}
769
770
771static int do_vm86_irq_handling(int subfunction, int irqnumber)
772{
773 int ret;
774 switch (subfunction) {
775 case VM86_GET_AND_RESET_IRQ: {
776 return get_and_reset_irq(irqnumber);
777 }
778 case VM86_GET_IRQ_BITS: {
779 return irqbits;
780 }
781 case VM86_REQUEST_IRQ: {
782 int sig = irqnumber >> 8;
783 int irq = irqnumber & 255;
784 if (!capable(CAP_SYS_ADMIN)) return -EPERM;
785 if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
786 if (invalid_vm86_irq(irq)) return -EPERM;
787 if (vm86_irqs[irq].tsk) return -EPERM;
788 ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
789 if (ret) return ret;
790 vm86_irqs[irq].sig = sig;
791 vm86_irqs[irq].tsk = current;
792 return irq;
793 }
794 case VM86_FREE_IRQ: {
795 if (invalid_vm86_irq(irqnumber)) return -EPERM;
796 if (!vm86_irqs[irqnumber].tsk) return 0;
797 if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
798 free_vm86_irq(irqnumber);
799 return 0;
800 }
801 }
802 return -EINVAL;
803}
804
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..e0512cc8bea7
--- /dev/null
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -0,0 +1,134 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#include <asm-generic/vmlinux.lds.h>
6#include <asm/thread_info.h>
7#include <asm/page.h>
8
9OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
10OUTPUT_ARCH(i386)
11ENTRY(startup_32)
12jiffies = jiffies_64;
13SECTIONS
14{
15 . = __PAGE_OFFSET + 0x100000;
16 /* read-only */
17 _text = .; /* Text and read-only data */
18 .text : {
19 *(.text)
20 SCHED_TEXT
21 LOCK_TEXT
22 *(.fixup)
23 *(.gnu.warning)
24 } = 0x9090
25
26 _etext = .; /* End of text section */
27
28 . = ALIGN(16); /* Exception table */
29 __start___ex_table = .;
30 __ex_table : { *(__ex_table) }
31 __stop___ex_table = .;
32
33 RODATA
34
35 /* writeable */
36 .data : { /* Data */
37 *(.data)
38 CONSTRUCTORS
39 }
40
41 . = ALIGN(4096);
42 __nosave_begin = .;
43 .data_nosave : { *(.data.nosave) }
44 . = ALIGN(4096);
45 __nosave_end = .;
46
47 . = ALIGN(4096);
48 .data.page_aligned : { *(.data.idt) }
49
50 . = ALIGN(32);
51 .data.cacheline_aligned : { *(.data.cacheline_aligned) }
52
53 _edata = .; /* End of data section */
54
55 . = ALIGN(THREAD_SIZE); /* init_task */
56 .data.init_task : { *(.data.init_task) }
57
58 /* will be freed after init */
59 . = ALIGN(4096); /* Init code and data */
60 __init_begin = .;
61 .init.text : {
62 _sinittext = .;
63 *(.init.text)
64 _einittext = .;
65 }
66 .init.data : { *(.init.data) }
67 . = ALIGN(16);
68 __setup_start = .;
69 .init.setup : { *(.init.setup) }
70 __setup_end = .;
71 __initcall_start = .;
72 .initcall.init : {
73 *(.initcall1.init)
74 *(.initcall2.init)
75 *(.initcall3.init)
76 *(.initcall4.init)
77 *(.initcall5.init)
78 *(.initcall6.init)
79 *(.initcall7.init)
80 }
81 __initcall_end = .;
82 __con_initcall_start = .;
83 .con_initcall.init : { *(.con_initcall.init) }
84 __con_initcall_end = .;
85 SECURITY_INIT
86 . = ALIGN(4);
87 __alt_instructions = .;
88 .altinstructions : { *(.altinstructions) }
89 __alt_instructions_end = .;
90 .altinstr_replacement : { *(.altinstr_replacement) }
91 /* .exit.text is discarded at runtime, not link time, to deal with references
92 from .altinstructions and .eh_frame */
93 .exit.text : { *(.exit.text) }
94 .exit.data : { *(.exit.data) }
95 . = ALIGN(4096);
96 __initramfs_start = .;
97 .init.ramfs : { *(.init.ramfs) }
98 __initramfs_end = .;
99 . = ALIGN(32);
100 __per_cpu_start = .;
101 .data.percpu : { *(.data.percpu) }
102 __per_cpu_end = .;
103 . = ALIGN(4096);
104 __init_end = .;
105 /* freed after init ends here */
106
107 __bss_start = .; /* BSS */
108 .bss : {
109 *(.bss.page_aligned)
110 *(.bss)
111 }
112 . = ALIGN(4);
113 __bss_stop = .;
114
115 _end = . ;
116
117 /* This is where the kernel creates the early boot page tables */
118 . = ALIGN(4096);
119 pg0 = .;
120
121 /* Sections to be discarded */
122 /DISCARD/ : {
123 *(.exitcall.exit)
124 }
125
126 /* Stabs debugging sections. */
127 .stab 0 : { *(.stab) }
128 .stabstr 0 : { *(.stabstr) }
129 .stab.excl 0 : { *(.stab.excl) }
130 .stab.exclstr 0 : { *(.stab.exclstr) }
131 .stab.index 0 : { *(.stab.index) }
132 .stab.indexstr 0 : { *(.stab.indexstr) }
133 .comment 0 : { *(.comment) }
134}
diff --git a/arch/i386/kernel/vsyscall-int80.S b/arch/i386/kernel/vsyscall-int80.S
new file mode 100644
index 000000000000..530d0525e5e2
--- /dev/null
+++ b/arch/i386/kernel/vsyscall-int80.S
@@ -0,0 +1,53 @@
1/*
2 * Code for the vsyscall page. This version uses the old int $0x80 method.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10 .text
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 int $0x80
16 ret
17.LEND_vsyscall:
18 .size __kernel_vsyscall,.-.LSTART_vsyscall
19 .previous
20
21 .section .eh_frame,"a",@progbits
22.LSTARTFRAMEDLSI:
23 .long .LENDCIEDLSI-.LSTARTCIEDLSI
24.LSTARTCIEDLSI:
25 .long 0 /* CIE ID */
26 .byte 1 /* Version number */
27 .string "zR" /* NUL-terminated augmentation string */
28 .uleb128 1 /* Code alignment factor */
29 .sleb128 -4 /* Data alignment factor */
30 .byte 8 /* Return address register column */
31 .uleb128 1 /* Augmentation value length */
32 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
33 .byte 0x0c /* DW_CFA_def_cfa */
34 .uleb128 4
35 .uleb128 4
36 .byte 0x88 /* DW_CFA_offset, column 0x8 */
37 .uleb128 1
38 .align 4
39.LENDCIEDLSI:
40 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
41.LSTARTFDEDLSI:
42 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
43 .long .LSTART_vsyscall-. /* PC-relative start address */
44 .long .LEND_vsyscall-.LSTART_vsyscall
45 .uleb128 0
46 .align 4
47.LENDFDEDLSI:
48 .previous
49
50/*
51 * Get the common code for the sigreturn entry points.
52 */
53#include "vsyscall-sigreturn.S"
diff --git a/arch/i386/kernel/vsyscall-sigreturn.S b/arch/i386/kernel/vsyscall-sigreturn.S
new file mode 100644
index 000000000000..c8fcf75b9be3
--- /dev/null
+++ b/arch/i386/kernel/vsyscall-sigreturn.S
@@ -0,0 +1,142 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * So far this code is the same for both int80 and sysenter versions.
4 * This file is #include'd by vsyscall-*.S to define them after the
5 * vsyscall entry point. The kernel assumes that the addresses of these
6 * routines are constant for all vsyscall implementations.
7 */
8
9#include <asm/unistd.h>
10#include <asm/asm_offsets.h>
11
12
13/* XXX
14 Should these be named "_sigtramp" or something?
15*/
16
17 .text
18 .org __kernel_vsyscall+32
19 .globl __kernel_sigreturn
20 .type __kernel_sigreturn,@function
21__kernel_sigreturn:
22.LSTART_sigreturn:
23 popl %eax /* XXX does this mean it needs unwind info? */
24 movl $__NR_sigreturn, %eax
25 int $0x80
26.LEND_sigreturn:
27 .size __kernel_sigreturn,.-.LSTART_sigreturn
28
29 .balign 32
30 .globl __kernel_rt_sigreturn
31 .type __kernel_rt_sigreturn,@function
32__kernel_rt_sigreturn:
33.LSTART_rt_sigreturn:
34 movl $__NR_rt_sigreturn, %eax
35 int $0x80
36.LEND_rt_sigreturn:
37 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
38 .previous
39
40 .section .eh_frame,"a",@progbits
41.LSTARTFRAMEDLSI1:
42 .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
43.LSTARTCIEDLSI1:
44 .long 0 /* CIE ID */
45 .byte 1 /* Version number */
46 .string "zR" /* NUL-terminated augmentation string */
47 .uleb128 1 /* Code alignment factor */
48 .sleb128 -4 /* Data alignment factor */
49 .byte 8 /* Return address register column */
50 .uleb128 1 /* Augmentation value length */
51 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
52 .byte 0 /* DW_CFA_nop */
53 .align 4
54.LENDCIEDLSI1:
55 .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
56.LSTARTFDEDLSI1:
57 .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
58 /* HACK: The dwarf2 unwind routines will subtract 1 from the
59 return address to get an address in the middle of the
60 presumed call instruction. Since we didn't get here via
61 a call, we need to include the nop before the real start
62 to make up for it. */
63 .long .LSTART_sigreturn-1-. /* PC-relative start address */
64 .long .LEND_sigreturn-.LSTART_sigreturn+1
65 .uleb128 0 /* Augmentation */
66 /* What follows are the instructions for the table generation.
67 We record the locations of each register saved. This is
68 complicated by the fact that the "CFA" is always assumed to
69 be the value of the stack pointer in the caller. This means
70 that we must define the CFA of this body of code to be the
71 saved value of the stack pointer in the sigcontext. Which
72 also means that there is no fixed relation to the other
73 saved registers, which means that we must use DW_CFA_expression
74 to compute their addresses. It also means that when we
75 adjust the stack with the popl, we have to do it all over again. */
76
77#define do_cfa_expr(offset) \
78 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
79 .uleb128 1f-0f; /* length */ \
800: .byte 0x74; /* DW_OP_breg4 */ \
81 .sleb128 offset; /* offset */ \
82 .byte 0x06; /* DW_OP_deref */ \
831:
84
85#define do_expr(regno, offset) \
86 .byte 0x10; /* DW_CFA_expression */ \
87 .uleb128 regno; /* regno */ \
88 .uleb128 1f-0f; /* length */ \
890: .byte 0x74; /* DW_OP_breg4 */ \
90 .sleb128 offset; /* offset */ \
911:
92
93 do_cfa_expr(SIGCONTEXT_esp+4)
94 do_expr(0, SIGCONTEXT_eax+4)
95 do_expr(1, SIGCONTEXT_ecx+4)
96 do_expr(2, SIGCONTEXT_edx+4)
97 do_expr(3, SIGCONTEXT_ebx+4)
98 do_expr(5, SIGCONTEXT_ebp+4)
99 do_expr(6, SIGCONTEXT_esi+4)
100 do_expr(7, SIGCONTEXT_edi+4)
101 do_expr(8, SIGCONTEXT_eip+4)
102
103 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
104
105 do_cfa_expr(SIGCONTEXT_esp)
106 do_expr(0, SIGCONTEXT_eax)
107 do_expr(1, SIGCONTEXT_ecx)
108 do_expr(2, SIGCONTEXT_edx)
109 do_expr(3, SIGCONTEXT_ebx)
110 do_expr(5, SIGCONTEXT_ebp)
111 do_expr(6, SIGCONTEXT_esi)
112 do_expr(7, SIGCONTEXT_edi)
113 do_expr(8, SIGCONTEXT_eip)
114
115 .align 4
116.LENDFDEDLSI1:
117
118 .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
119.LSTARTFDEDLSI2:
120 .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
121 /* HACK: See above wrt unwind library assumptions. */
122 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
123 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
124 .uleb128 0 /* Augmentation */
125 /* What follows are the instructions for the table generation.
126 We record the locations of each register saved. This is
127 slightly less complicated than the above, since we don't
128 modify the stack pointer in the process. */
129
130 do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
131 do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
132 do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
133 do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
134 do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
135 do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
136 do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
137 do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
138 do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
139
140 .align 4
141.LENDFDEDLSI2:
142 .previous
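The do_cfa_expr()/do_expr() macros above hand-assemble DWARF call-frame-information records, because the CFA here has to be fetched out of the saved sigcontext rather than computed as a fixed offset from a register. As a worked illustration (byte values chosen for the example; the real offsets come from asm_offsets.h), one do_cfa_expr() expansion with a single-byte offset of 0x20 encodes as:

	static const unsigned char cfa_expr_example[] = {
		0x0f,	/* DW_CFA_def_cfa_expression           */
		0x03,	/* expression length: 3 bytes          */
		0x74,	/* DW_OP_breg4: push %esp + offset     */
		0x20,	/* sleb128 offset (example value)      */
		0x06,	/* DW_OP_deref: CFA = *(%esp + offset) */
	};
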
diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S
new file mode 100644
index 000000000000..4daefb2ec1b2
--- /dev/null
+++ b/arch/i386/kernel/vsyscall-sysenter.S
@@ -0,0 +1,104 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10 .text
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 push %ecx
16.Lpush_ecx:
17 push %edx
18.Lpush_edx:
19 push %ebp
20.Lenter_kernel:
21 movl %esp,%ebp
22 sysenter
23
24 /* 7: align return point with nop's to make disassembly easier */
25 .space 7,0x90
26
27 /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */
28 jmp .Lenter_kernel
29 /* 16: System call normal return point is here! */
30 .globl SYSENTER_RETURN /* Symbol used by entry.S. */
31SYSENTER_RETURN:
32 pop %ebp
33.Lpop_ebp:
34 pop %edx
35.Lpop_edx:
36 pop %ecx
37.Lpop_ecx:
38 ret
39.LEND_vsyscall:
40 .size __kernel_vsyscall,.-.LSTART_vsyscall
41 .previous
42
43 .section .eh_frame,"a",@progbits
44.LSTARTFRAMEDLSI:
45 .long .LENDCIEDLSI-.LSTARTCIEDLSI
46.LSTARTCIEDLSI:
47 .long 0 /* CIE ID */
48 .byte 1 /* Version number */
49 .string "zR" /* NUL-terminated augmentation string */
50 .uleb128 1 /* Code alignment factor */
51 .sleb128 -4 /* Data alignment factor */
52 .byte 8 /* Return address register column */
53 .uleb128 1 /* Augmentation value length */
54 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
55 .byte 0x0c /* DW_CFA_def_cfa */
56 .uleb128 4
57 .uleb128 4
58 .byte 0x88 /* DW_CFA_offset, column 0x8 */
59 .uleb128 1
60 .align 4
61.LENDCIEDLSI:
62 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
63.LSTARTFDEDLSI:
64 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
65 .long .LSTART_vsyscall-. /* PC-relative start address */
66 .long .LEND_vsyscall-.LSTART_vsyscall
67 .uleb128 0
68 /* What follows are the instructions for the table generation.
69 We have to record all changes of the stack pointer. */
70 .byte 0x04 /* DW_CFA_advance_loc4 */
71 .long .Lpush_ecx-.LSTART_vsyscall
72 .byte 0x0e /* DW_CFA_def_cfa_offset */
73 .byte 0x08 /* RA at offset 8 now */
74 .byte 0x04 /* DW_CFA_advance_loc4 */
75 .long .Lpush_edx-.Lpush_ecx
76 .byte 0x0e /* DW_CFA_def_cfa_offset */
77 .byte 0x0c /* RA at offset 12 now */
78 .byte 0x04 /* DW_CFA_advance_loc4 */
79 .long .Lenter_kernel-.Lpush_edx
80 .byte 0x0e /* DW_CFA_def_cfa_offset */
81 .byte 0x10 /* RA at offset 16 now */
82 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
83 /* Finally the epilogue. */
84 .byte 0x04 /* DW_CFA_advance_loc4 */
85 .long .Lpop_ebp-.Lenter_kernel
86 .byte 0x0e /* DW_CFA_def_cfa_offset */
87 .byte 0x0c /* RA at offset 12 now */
88 .byte 0xc5 /* DW_CFA_restore %ebp */
89 .byte 0x04 /* DW_CFA_advance_loc4 */
90 .long .Lpop_edx-.Lpop_ebp
91 .byte 0x0e /* DW_CFA_def_cfa_offset */
92 .byte 0x08 /* RA at offset 8 now */
93 .byte 0x04 /* DW_CFA_advance_loc4 */
94 .long .Lpop_ecx-.Lpop_edx
95 .byte 0x0e /* DW_CFA_def_cfa_offset */
96 .byte 0x04 /* RA at offset 4 now */
97 .align 4
98.LENDFDEDLSI:
99 .previous
100
101/*
102 * Get the common code for the sigreturn entry points.
103 */
104#include "vsyscall-sigreturn.S"
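The "(SYSENTER_RETURN - 2)" restart point above exists because syscall restart on i386 is implemented by rewinding the user EIP by the two-byte length of "int $0x80"; putting a two-byte jmp back to the sysenter sequence at the same distance lets the sysenter path reuse that logic unchanged. A simplified sketch of the restart side (the real do_signal()/handle_signal() also honours SA_RESTART and converts some cases to -EINTR):

	static void restart_syscall_sketch(struct pt_regs *regs)
	{
		if (regs->eax == -ERESTARTNOINTR) {
			regs->eax = regs->orig_eax;	/* redo the syscall  */
			regs->eip -= 2;			/* back onto the jmp */
		}
	}
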
diff --git a/arch/i386/kernel/vsyscall.S b/arch/i386/kernel/vsyscall.S
new file mode 100644
index 000000000000..b403890fe39b
--- /dev/null
+++ b/arch/i386/kernel/vsyscall.S
@@ -0,0 +1,15 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vsyscall_int80_start, vsyscall_int80_end
6vsyscall_int80_start:
7 .incbin "arch/i386/kernel/vsyscall-int80.so"
8vsyscall_int80_end:
9
10 .globl vsyscall_sysenter_start, vsyscall_sysenter_end
11vsyscall_sysenter_start:
12 .incbin "arch/i386/kernel/vsyscall-sysenter.so"
13vsyscall_sysenter_end:
14
15__FINIT
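The two .incbin blobs above are alternative images of the single vsyscall page; exactly one of them is copied into the page mapped at the fixed vsyscall address, depending on whether the CPU supports sysenter. A hedged sketch of that selection, modelled on sysenter_setup() (the fixmap slot name and page protection used here are assumptions of the sketch):

	extern char vsyscall_int80_start, vsyscall_int80_end;
	extern char vsyscall_sysenter_start, vsyscall_sysenter_end;

	static int __init install_vsyscall_page_sketch(void)
	{
		void *page = (void *)get_zeroed_page(GFP_ATOMIC);

		__set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);

		if (boot_cpu_has(X86_FEATURE_SEP))
			memcpy(page, &vsyscall_sysenter_start,
			       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
		else
			memcpy(page, &vsyscall_int80_start,
			       &vsyscall_int80_end - &vsyscall_int80_start);
		return 0;
	}
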
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
new file mode 100644
index 000000000000..3a8329d6536e
--- /dev/null
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -0,0 +1,65 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm_offsets.h>
7
8SECTIONS
9{
10 . = VSYSCALL_BASE + SIZEOF_HEADERS;
11
12 .hash : { *(.hash) } :text
13 .dynsym : { *(.dynsym) }
14 .dynstr : { *(.dynstr) }
15 .gnu.version : { *(.gnu.version) }
16 .gnu.version_d : { *(.gnu.version_d) }
17 .gnu.version_r : { *(.gnu.version_r) }
18
19 /* This linker script is used both with -r and with -shared.
20 For the layouts to match, we need to skip more than enough
21 space for the dynamic symbol table et al. If this amount
22 is insufficient, ld -shared will barf. Just increase it here. */
23 . = VSYSCALL_BASE + 0x400;
24
25 .text : { *(.text) } :text =0x90909090
26
27 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
28 .eh_frame : { KEEP (*(.eh_frame)) } :text
29 .dynamic : { *(.dynamic) } :text :dynamic
30 .useless : {
31 *(.got.plt) *(.got)
32 *(.data .data.* .gnu.linkonce.d.*)
33 *(.dynbss)
34 *(.bss .bss.* .gnu.linkonce.b.*)
35 } :text
36}
37
38/*
39 * We must supply the ELF program headers explicitly to get just one
40 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
41 */
42PHDRS
43{
44 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
45 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
46 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
47}
48
49/*
50 * This controls what symbols we export from the DSO.
51 */
52VERSION
53{
54 LINUX_2.5 {
55 global:
56 __kernel_vsyscall;
57 __kernel_sigreturn;
58 __kernel_rt_sigreturn;
59
60 local: *;
61 };
62}
63
64/* The ELF entry point can be used to set the AT_SYSINFO value. */
65ENTRY(__kernel_vsyscall);
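Exporting the entry point lets the kernel advertise the address of __kernel_vsyscall to every process through the AT_SYSINFO entry of the ELF auxiliary vector. A user-space sketch of using it directly (normally glibc does this transparently; getpid and its syscall number 20 are just the example here):

	#include <elf.h>

	static void *lookup_sysinfo(char **envp)
	{
		Elf32_auxv_t *auxv;

		while (*envp)		/* the auxv follows the environment */
			envp++;
		for (auxv = (Elf32_auxv_t *)(envp + 1);
		     auxv->a_type != AT_NULL; auxv++)
			if (auxv->a_type == AT_SYSINFO)
				return (void *)auxv->a_un.a_val;
		return NULL;		/* no vsyscall page: use int $0x80 */
	}

	static long vsyscall_getpid(void *sysinfo)
	{
		long ret;

		/* syscall number in %eax; __kernel_vsyscall preserves the
		   caller-visible registers */
		asm volatile("call *%1"
			     : "=a" (ret)
			     : "r" (sysinfo), "0" (20)	/* __NR_getpid */
			     : "memory");
		return ret;
	}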