author		Ingo Molnar <mingo@elte.hu>	2008-07-31 12:43:41 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-07-31 12:43:41 -0400
commit		85e9ca333d03fbd56b9e123c8456f0d98e20faad (patch)
tree		7bb15ada5f536950efa23ad60ea9eea60380ca1c /arch/x86/kernel
parent		a300bec952127d9a15e666b391bb35c9aecb3002 (diff)
parent		6e86841d05f371b5b9b86ce76c02aaee83352298 (diff)
Merge branch 'linus' into timers/hpet
Diffstat (limited to 'arch/x86/kernel')
 149 files changed, 15259 insertions(+), 10391 deletions(-)
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 4ea38a39aed4..08f4fd731469 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,2 +1,3 @@
 vsyscall.lds
 vsyscall_32.lds
+vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..3db651fc8ec5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,10 +2,17 @@
 # Makefile for the linux kernel.
 #
 
-extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
+ifdef CONFIG_FTRACE
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_tsc.o = -pg
+CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt.o = -pg
+endif
+
 #
 # vsyscalls (which work on the user stack) should have
 # no stack-protector checks:
@@ -13,20 +20,21 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc_64.o := $(nostackp)
+CFLAGS_tsc.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y += traps_$(BITS).o irq_$(BITS).o
 obj-y += time_$(BITS).o ioport.o ldt.o
-obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-$(CONFIG_X86_VISWS) += visws_quirks.o
+obj-$(CONFIG_X86_32) += probe_roms_32.o
 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
-obj-y += bootflag.o e820_$(BITS).o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-y += bootflag.o e820.o
 obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y += alternative.o i8253.o pci-nommu.o
-obj-$(CONFIG_X86_64) += bugs_64.o
-obj-y += tsc_$(BITS).o io_delay.o rtc.o
+obj-y += tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
 obj-y += process.o
@@ -53,9 +61,10 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
 obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
 obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
@@ -64,7 +73,6 @@ obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
 obj-y += vsmp_64.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_MODULES) += module_$(BITS).o
-obj-$(CONFIG_ACPI_SRAT) += srat_32.o
 obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
 obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
 obj-$(CONFIG_KGDB) += kgdb.o
@@ -82,6 +90,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST) += kvm.o
 obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
 
@@ -93,12 +102,14 @@ obj-$(CONFIG_OLPC) += olpc.o
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
+        obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
+        obj-y += bios_uv.o
         obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
         obj-$(CONFIG_AUDIT) += audit_64.o
 
         obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
         obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+        obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
         obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
 
         obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 33c5216fd3e1..fa88a1d71290 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,6 +37,7 @@
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/io.h>
 #include <asm/mpspec.h>
 #include <asm/smp.h>
@@ -106,21 +107,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
  */
 enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
-#ifdef CONFIG_X86_64
-
-/* rely on all ACPI tables being in the direct mapping */
-char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
-{
-	if (!phys_addr || !size)
-		return NULL;
-
-	if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE)
-		return __va(phys_addr);
-
-	return NULL;
-}
-
-#else
 
 /*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
@@ -139,11 +125,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 	unsigned long base, offset, mapped_size;
 	int idx;
 
-	if (phys + size < 8 * 1024 * 1024)
+	if (!phys || !size)
+		return NULL;
+
+	if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
 		return __va(phys);
 
 	offset = phys & (PAGE_SIZE - 1);
 	mapped_size = PAGE_SIZE - offset;
+	clear_fixmap(FIX_ACPI_END);
 	set_fixmap(FIX_ACPI_END, phys);
 	base = fix_to_virt(FIX_ACPI_END);
 
@@ -155,13 +145,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 		if (--idx < FIX_ACPI_BEGIN)
 			return NULL;	/* cannot handle this */
 		phys += PAGE_SIZE;
+		clear_fixmap(idx);
 		set_fixmap(idx, phys);
 		mapped_size += PAGE_SIZE;
 	}
 
 	return ((unsigned char *)base + offset);
 }
-#endif
 
 #ifdef CONFIG_PCI_MMCONFIG
 /* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
@@ -338,8 +328,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 
 #ifdef CONFIG_X86_IO_APIC
 
-struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
@@ -514,8 +502,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-		extern void eisa_set_level_irq(unsigned int irq);
-
 		if (triggering == ACPI_LEVEL_SENSITIVE)
 			eisa_set_level_irq(gsi);
 	}
@@ -860,6 +846,364 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #endif				/* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS		0
+
+#ifdef CONFIG_X86_ES7000
+extern int es7000_plat;
+#endif
+
+static struct {
+	int apic_id;
+	int gsi_base;
+	int gsi_end;
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+	int i = 0;
+
+	/* Find the IOAPIC that manages this GSI. */
+	for (i = 0; i < nr_ioapics; i++) {
+		if ((gsi >= mp_ioapic_routing[i].gsi_base)
+		    && (gsi <= mp_ioapic_routing[i].gsi_end))
+			return i;
+	}
+
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return io_apic_get_unique_id(nr_ioapics, id);
+	else
+		return id;
+#else
+	int i;
+	DECLARE_BITMAP(used, 256);
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mp_config_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->mp_apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);
+#endif
+}
+
+static int bad_ioapic(unsigned long address)
+{
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+	}
+	if (!address) {
+		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+		       " found in table, skipping!\n");
+		return 1;
+	}
+	return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+	int idx = 0;
+
+	if (bad_ioapic(address))
+		return;
+
+	idx = nr_ioapics;
+
+	mp_ioapics[idx].mp_type = MP_IOAPIC;
+	mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].mp_apicaddr = address;
+
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+	mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+#else
+	mp_ioapics[idx].mp_apicver = 0;
+#endif
+	/*
+	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+	 */
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+	mp_ioapic_routing[idx].gsi_base = gsi_base;
+	mp_ioapic_routing[idx].gsi_end = gsi_base +
+	    io_apic_get_redir_entries(idx);
+
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
+	       mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+	nr_ioapics++;
+}
+
+static void assign_to_mp_irq(struct mp_config_intsrc *m,
+			     struct mp_config_intsrc *mp_irq)
+{
+	memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
+		      struct mp_config_intsrc *m)
+{
+	return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static void save_mp_irq(struct mp_config_intsrc *m)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_cmp(&mp_irqs[i], m))
+			return;
+	}
+
+	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!!\n");
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+	int ioapic;
+	int pin;
+	struct mp_config_intsrc mp_irq;
+
+	/*
+	 * Convert 'gsi' to 'ioapic.pin'.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return;
+	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	/*
+	 * TBD: This check is for faulty timer entries, where the override
+	 *      erroneously sets the trigger to level, resulting in a HUGE
+	 *      increase of timer interrupts!
+	 */
+	if ((bus_irq == 0) && (trigger == 3))
+		trigger = 1;
+
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (trigger << 2) | polarity;
+	mp_irq.mp_srcbus = MP_ISA_BUS;
+	mp_irq.mp_srcbusirq = bus_irq;	/* IRQ */
+	mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
+	mp_irq.mp_dstirq = pin;	/* INTIN# */
+
+	save_mp_irq(&mp_irq);
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+	int i;
+	int ioapic;
+	unsigned int dstapic;
+	struct mp_config_intsrc mp_irq;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+	/*
+	 * Fabricate the legacy ISA bus (bus #31).
+	 */
+	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+	set_bit(MP_ISA_BUS, mp_bus_not_pci);
+	pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#ifdef CONFIG_X86_ES7000
+	/*
+	 * Older generations of ES7000 have no legacy identity mappings
+	 */
+	if (es7000_plat == 1)
+		return;
+#endif
+
+	/*
+	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
+	 */
+	ioapic = mp_find_ioapic(0);
+	if (ioapic < 0)
+		return;
+	dstapic = mp_ioapics[ioapic].mp_apicid;
+
+	/*
+	 * Use the default configuration for the IRQs 0-15.  Unless
+	 * overridden by (MADT) interrupt source override entries.
+	 */
+	for (i = 0; i < 16; i++) {
+		int idx;
+
+		for (idx = 0; idx < mp_irq_entries; idx++) {
+			struct mp_config_intsrc *irq = mp_irqs + idx;
+
+			/* Do we already have a mapping for this ISA IRQ? */
+			if (irq->mp_srcbus == MP_ISA_BUS
+			    && irq->mp_srcbusirq == i)
+				break;
+
+			/* Do we already have a mapping for this IOAPIC pin */
+			if (irq->mp_dstapic == dstapic &&
+			    irq->mp_dstirq == i)
+				break;
+		}
+
+		if (idx != mp_irq_entries) {
+			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+			continue;	/* IRQ already used */
+		}
+
+		mp_irq.mp_type = MP_INTSRC;
+		mp_irq.mp_irqflag = 0;	/* Conforming */
+		mp_irq.mp_srcbus = MP_ISA_BUS;
+		mp_irq.mp_dstapic = dstapic;
+		mp_irq.mp_irqtype = mp_INT;
+		mp_irq.mp_srcbusirq = i; /* Identity mapped */
+		mp_irq.mp_dstirq = i;
+
+		save_mp_irq(&mp_irq);
+	}
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+	int ioapic;
+	int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM	4096
+#define IRQ_COMPRESSION_START	64
+
+	static int pci_irq = IRQ_COMPRESSION_START;
+	/*
+	 * Mapping between Global System Interrupts, which
+	 * represent all possible interrupts, and IRQs
+	 * assigned to actual devices.
+	 */
+	static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return gsi;
+#endif
+
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
+		return gsi;
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0) {
+		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+		return gsi;
+	}
+
+	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifdef CONFIG_X86_32
+	if (ioapic_renumber_irq)
+		gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+		printk(KERN_ERR "Invalid reference to IOAPIC pin "
+		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+		       ioapic_pin);
+		return gsi;
+	}
+	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
+			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+		return gsi;
+#endif
+	}
+
+	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+	/*
+	 * For GSI >= 64, use IRQ compression
+	 */
+	if ((gsi >= IRQ_COMPRESSION_START)
+	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
+		/*
+		 * For PCI devices assign IRQs in order, avoiding gaps
+		 * due to unused I/O APIC pins.
+		 */
+		int irq = gsi;
+		if (gsi < MAX_GSI_NUM) {
+			/*
+			 * Retain the VIA chipset work-around (gsi > 15), but
+			 * avoid a problem where the 8254 timer (IRQ0) is setup
+			 * via an override (so it's not on pin 0 of the ioapic),
+			 * and at the same time, the pin 0 interrupt is a PCI
+			 * type.  The gsi > 15 test could cause these two pins
+			 * to be shared as IRQ0, and they are not shareable.
+			 * So test for this condition, and if necessary, avoid
+			 * the pin collision.
+			 */
+			gsi = pci_irq++;
+			/*
+			 * Don't assign IRQ used by ACPI SCI
+			 */
+			if (gsi == acpi_gbl_FADT.sci_interrupt)
+				gsi = pci_irq++;
+			gsi_to_irq[irq] = gsi;
+		} else {
+			printk(KERN_ERR "GSI %u is too high\n", gsi);
+			return gsi;
+		}
+	}
+#endif
+	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	return gsi;
+}
+
+int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+			u32 gsi, int triggering, int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+	struct mp_config_intsrc mp_irq;
+	int ioapic;
+
+	if (!acpi_ioapic)
+		return 0;
+
+	/* print the entry should happen on mptable identically */
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	mp_irq.mp_srcbus = number;
+	mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	ioapic = mp_find_ioapic(gsi);
+	mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	save_mp_irq(&mp_irq);
+#endif
+	return 0;
+}
+
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
@@ -1009,8 +1353,6 @@ static void __init acpi_process_madt(void)
 	return;
 }
 
-#ifdef __i386__
-
 static int __init disable_acpi_irq(const struct dmi_system_id *d)
 {
 	if (!acpi_force) {
@@ -1061,6 +1403,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
 }
 
 /*
+ * Force ignoring BIOS IRQ0 pin2 override
+ */
+static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
+{
+	pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
+	acpi_skip_timer_override = 1;
+	return 0;
+}
+
+/*
  * If your system is blacklisted here, but you find that acpi=force
  * works for you, please contact acpi-devel@sourceforge.net
  */
@@ -1227,11 +1579,35 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
 		     },
 	 },
+	/*
+	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
+	 * which includes some code which overrides all temperature
+	 * trip points to 16C if the INTIN2 input of the I/O APIC
+	 * is enabled.  This input is incorrectly designated the
+	 * ISA IRQ 0 via an interrupt source override even though
+	 * it is wired to the output of the master 8259A and INTIN0
+	 * is not connected at all.  Force ignoring BIOS IRQ0 pin2
+	 * override in that cases.
+	 */
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP NX6125 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+		     },
+	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP NX6325 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+		     },
+	 },
 	{}
 };
 
-#endif				/* __i386__ */
-
 /*
  * acpi_boot_table_init() and acpi_boot_init()
  * called from setup_arch(), always.
@@ -1259,9 +1635,7 @@ int __init acpi_boot_table_init(void)
 {
 	int error;
 
-#ifdef __i386__
 	dmi_check_system(acpi_dmi_table);
-#endif
 
 	/*
 	 * If acpi_disabled, bail out
@@ -1386,6 +1760,20 @@ static int __init parse_pci(char *arg)
 }
 early_param("pci", parse_pci);
 
+int __init acpi_mps_check(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
+/* mptable code is not built-in*/
+	if (acpi_disabled || acpi_noirq) {
+		printk(KERN_WARNING "MPS support code is not built-in.\n"
+		       "Using acpi=off or acpi=noirq or pci=noacpi "
+		       "may have problem\n");
+		return 1;
+	}
+#endif
+	return 0;
+}
+
 #ifdef CONFIG_X86_IO_APIC
 static int __init parse_acpi_skip_timer_override(char *arg)
 {
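A note on the irqflag arithmetic in the mp_override_legacy_irq() and mp_config_acpi_gsi() hunks above: the MPS table packs polarity into bits 0-1 and trigger mode into bits 2-3 of one flag word, which is why edge/active-high becomes 4 | 1 and level/active-low becomes 0x0c | 3. A minimal standalone sketch of that encoding (the constant names below are illustrative, not kernel APIs):

#include <stdio.h>

/* MPS-style encoding: bits 0-1 polarity, bits 2-3 trigger (illustrative names) */
#define MP_POLARITY_HIGH	1
#define MP_POLARITY_LOW		3
#define MP_TRIGGER_EDGE		1
#define MP_TRIGGER_LEVEL	3

static unsigned int mp_irqflag(unsigned int trigger, unsigned int polarity)
{
	return (trigger << 2) | polarity;	/* same packing as mp_irq.mp_irqflag */
}

int main(void)
{
	/* edge/high: (1 << 2) | 1 == 0x05, i.e. the 4 | 1 case above */
	printf("edge/high = 0x%02x\n",
	       mp_irqflag(MP_TRIGGER_EDGE, MP_POLARITY_HIGH));
	/* level/low: (3 << 2) | 3 == 0x0f, i.e. the 0x0c | 3 case above */
	printf("level/low = 0x%02x\n",
	       mp_irqflag(MP_TRIGGER_LEVEL, MP_POLARITY_LOW));
	return 0;
}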
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index de2d2e4ebad9..7c074eec39fb 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_ACPI))
 		buf[2] |= ACPI_PDC_T_FFH;
 
+	/*
+	 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
+	 */
+	if (!cpu_has(c, X86_FEATURE_MWAIT))
+		buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
+
 	obj->type = ACPI_TYPE_BUFFER;
 	obj->buffer.length = 12;
 	obj->buffer.pointer = (u8 *) buf;
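For context on the buf[] indices in init_intel_pdc() above: the _PDC argument is a 12-byte buffer of three 32-bit words (revision, dword count, capability bits), so buf[2] is the capability word the hunk edits. A hedged sketch of that layout (the flag values here are placeholders, not the real ACPI_PDC_* constants):

#include <stdint.h>
#include <stdio.h>

/* placeholder capability bits; the real values live in the ACPI headers */
#define PDC_C_C2C3_FFH	(1u << 0)
#define PDC_T_FFH	(1u << 1)

int main(void)
{
	uint32_t buf[3];	/* 12 bytes, matching obj->buffer.length = 12 */
	int has_mwait = 0;	/* stand-in for cpu_has(c, X86_FEATURE_MWAIT) */

	buf[0] = 1;		/* _PDC revision */
	buf[1] = 1;		/* number of capability dwords that follow */
	buf[2] = PDC_C_C2C3_FFH | PDC_T_FFH;

	/* mirror the quirk above: drop FFH C-state entry without MONITOR/MWAIT */
	if (!has_mwait)
		buf[2] &= ~PDC_C_C2C3_FFH;

	printf("capability word = 0x%08x\n", (unsigned)buf[2]);
	return 0;
}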
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index f9b77fb37e5b..3355973b12ac 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -5,6 +5,7 @@
 #include <asm/msr-index.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/processor-flags.h>
 
 	.code16
 	.section ".header", "a"
@@ -24,6 +25,11 @@ pmode_gdt:	.quad	0
 realmode_flags:	.long	0
 real_magic:	.long	0
 trampoline_segment:	.word 0
+_pad1:		.byte	0
+wakeup_jmp:	.byte	0xea	/* ljmpw */
+wakeup_jmp_off:	.word	3f
+wakeup_jmp_seg:	.word	0
+wakeup_gdt:	.quad	0, 0, 0
 signature:	.long	0x51ee1111
 
 	.text
@@ -34,11 +40,34 @@ _start:
 	cli
 	cld
 
+	/* Apparently some dimwit BIOS programmers don't know how to
+	   program a PM to RM transition, and we might end up here with
+	   junk in the data segment descriptor registers.  The only way
+	   to repair that is to go into PM and fix it ourselves... */
+	movw	$16, %cx
+	lgdtl	%cs:wakeup_gdt
+	movl	%cr0, %eax
+	orb	$X86_CR0_PE, %al
+	movl	%eax, %cr0
+	jmp	1f
+1:	ljmpw	$8, $2f
+2:
+	movw	%cx, %ds
+	movw	%cx, %es
+	movw	%cx, %ss
+	movw	%cx, %fs
+	movw	%cx, %gs
+
+	andb	$~X86_CR0_PE, %al
+	movl	%eax, %cr0
+	jmp	wakeup_jmp
+3:
 	/* Set up segments */
 	movw	%cs, %ax
 	movw	%ax, %ds
 	movw	%ax, %es
 	movw	%ax, %ss
+	lidtl	wakeup_idt
 
 	movl	$wakeup_stack_end, %esp
 
@@ -98,7 +127,14 @@ bogus_real_magic:
 	jmp	1b
 
 	.data
-	.balign	4
+	.balign	8
+
+	/* This is the standard real-mode IDT */
+wakeup_idt:
+	.word	0xffff		/* limit */
+	.long	0		/* address */
+	.word	0
+
 	.globl	HEAP, heap_end
 HEAP:
 	.long	wakeup_heap
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index ef8166fe8020..69d38d0b2b64 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -24,6 +24,11 @@ struct wakeup_header {
 	u32 realmode_flags;
 	u32 real_magic;
 	u16 trampoline_segment;	/* segment with trampoline code, 64-bit only */
+	u8  _pad1;
+	u8  wakeup_jmp;
+	u16 wakeup_jmp_off;
+	u16 wakeup_jmp_seg;
+	u64 wakeup_gdt[3];
 	u32 signature;		/* To check we have correct structure */
 } __attribute__((__packed__));
 
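These header fields must line up byte-for-byte with the labels added to wakeup.S above (the 0xea opcode byte immediately followed by its 16:16 off:seg operand), which is why the struct stays __packed__ and mirrors the assembly layout one-for-one, explicit _pad1 byte included. A small sketch that checks the correspondence; the struct below is a trimmed illustrative copy, not the kernel header:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* trimmed copy of the tail of struct wakeup_header, for illustration only */
struct wakeup_tail {
	uint16_t trampoline_segment;
	uint8_t  _pad1;
	uint8_t  wakeup_jmp;	/* holds 0xea, the ljmpw opcode */
	uint16_t wakeup_jmp_off;
	uint16_t wakeup_jmp_seg;
	uint64_t wakeup_gdt[3];
	uint32_t signature;
} __attribute__((__packed__));

int main(void)
{
	/* the jump opcode must be directly followed by its offset and segment */
	printf("wakeup_jmp     at offset %zu\n",
	       offsetof(struct wakeup_tail, wakeup_jmp));
	printf("wakeup_jmp_off at offset %zu\n",
	       offsetof(struct wakeup_tail, wakeup_jmp_off));
	printf("wakeup_jmp_seg at offset %zu\n",
	       offsetof(struct wakeup_tail, wakeup_jmp_seg));
	printf("wakeup_gdt     at offset %zu\n",
	       offsetof(struct wakeup_tail, wakeup_gdt));
	return 0;
}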
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index afc25ee9964b..fa2161d5003b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
 #include <linux/bootmem.h>
 #include <linux/dmi.h>
 #include <linux/cpumask.h>
+#include <asm/segment.h>
 
 #include "realmode/wakeup.h"
 #include "sleep.h"
@@ -50,6 +51,29 @@ int acpi_save_state_mem(void)
 
 	header->video_mode = saved_video_mode;
 
+	header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+
+	/*
+	 * Set up the wakeup GDT.  We set these up as Big Real Mode,
+	 * that is, with limits set to 4 GB.  At least the Lenovo
+	 * Thinkpad X61 is known to need this for the video BIOS
+	 * initialization quirk to work; this is likely to also
+	 * be the case for other laptops or integrated video devices.
+	 */
+
+	/* GDT[0]: GDT self-pointer */
+	header->wakeup_gdt[0] =
+		(u64)(sizeof(header->wakeup_gdt) - 1) +
+		((u64)(acpi_wakeup_address +
+			((char *)&header->wakeup_gdt - (char *)acpi_realmode))
+				<< 16);
+	/* GDT[1]: big real mode-like code segment */
+	header->wakeup_gdt[1] =
+		GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
+	/* GDT[2]: big real mode-like data segment */
+	header->wakeup_gdt[2] =
+		GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
+
 #ifndef CONFIG_64BIT
 	store_gdt((struct desc_ptr *)&header->pmode_gdt);
 
@@ -72,7 +96,9 @@ int acpi_save_state_mem(void)
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 	header->trampoline_segment = setup_trampoline() >> 4;
-	init_rsp = (unsigned long)temp_stack + 4096;
+#ifdef CONFIG_SMP
+	stack_start.sp = temp_stack + 4096;
+#endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
 #endif /* CONFIG_64BIT */
@@ -111,7 +137,7 @@ void __init acpi_reserve_bootmem(void)
 		return;
 	}
 
-	acpi_wakeup_address = acpi_realmode;
+	acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
 }
 
 
@@ -124,6 +150,12 @@ static int __init acpi_sleep_setup(char *str)
 			acpi_realmode_flags |= 2;
 		if (strncmp(str, "s3_beep", 7) == 0)
 			acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+		if (strncmp(str, "s4_nohwsig", 10) == 0)
+			acpi_no_s4_hw_signature();
+#endif
+		if (strncmp(str, "old_ordering", 12) == 0)
+			acpi_old_suspend_ordering();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
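The GDT_ENTRY() calls above build 8-byte segment descriptors whose base and limit bits are scattered across the quadword; with flags 0x809b/0x8093 and limit 0xfffff, the granularity bit turns the 20-bit limit into 4 GB, which is what makes these segments "big real mode". A standalone sketch of that packing; the macro is reproduced here the way the kernel's <asm/segment.h> defines it, but treat the whole program as illustrative:

#include <stdint.h>
#include <stdio.h>

/* descriptor packing, as in the kernel's GDT_ENTRY() macro */
#define GDT_ENTRY(flags, base, limit)			\
	((((base)  & 0xff000000ULL) << (56 - 24)) |	\
	 (((flags) & 0x0000f0ffULL) << 40) |		\
	 (((limit) & 0x000f0000ULL) << (48 - 16)) |	\
	 (((base)  & 0x00ffffffULL) << 16) |		\
	  ((limit) & 0x0000ffffULL))

int main(void)
{
	uint64_t wakeup_address = 0x9a000;	/* example stand-in value */

	/* 0x809b: granularity=4K + present 16-bit code; 0x8093: matching data */
	printf("code descriptor: %016llx\n",
	       (unsigned long long)GDT_ENTRY(0x809b, wakeup_address, 0xfffff));
	printf("data descriptor: %016llx\n",
	       (unsigned long long)GDT_ENTRY(0x8093, wakeup_address, 0xfffff));
	return 0;
}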
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65c7857a90dd..2763cb37b553 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,6 +1,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/kprobes.h>
 #include <linux/mm.h>
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
 #ifdef CONFIG_X86_64
 
 extern char __vsyscall_0;
-static inline const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
 	       boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
 	{ -1, NULL }
 };
 
-static const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	const unsigned char *const *noptable = intel_nops;
 	int i;
@@ -279,7 +279,7 @@ struct smp_alt_module {
 	struct list_head next;
 };
 static LIST_HEAD(smp_alt_modules);
-static DEFINE_SPINLOCK(smp_alt);
+static DEFINE_MUTEX(smp_alt);
 static int smp_mode = 1;	/* protected by smp_alt */
 
 void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
 		__func__, smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 	list_add_tail(&smp->next, &smp_alt_modules);
 	if (boot_cpu_has(X86_FEATURE_UP))
 		alternatives_smp_unlock(smp->locks, smp->locks_end,
 					smp->text, smp->text_end);
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
 	if (smp_alt_once || noreplace_smp)
 		return;
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 	list_for_each_entry(item, &smp_alt_modules, next) {
 		if (mod != item->mod)
 			continue;
 		list_del(&item->next);
-		spin_unlock(&smp_alt);
+		mutex_unlock(&smp_alt);
 		DPRINTK("%s: %s\n", __func__, item->name);
 		kfree(item);
 		return;
 	}
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
 		return;
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 
 	/*
 	 * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
 					      mod->text, mod->text_end);
 	}
 	smp_mode = smp;
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 #endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c new file mode 100644 index 000000000000..22d7d050905d --- /dev/null +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -0,0 +1,1164 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. | ||
3 | * Author: Joerg Roedel <joerg.roedel@amd.com> | ||
4 | * Leo Duran <leo.duran@amd.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of the GNU General Public License version 2 as published | ||
8 | * by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/pci.h> | ||
21 | #include <linux/gfp.h> | ||
22 | #include <linux/bitops.h> | ||
23 | #include <linux/scatterlist.h> | ||
24 | #include <linux/iommu-helper.h> | ||
25 | #include <asm/proto.h> | ||
26 | #include <asm/iommu.h> | ||
27 | #include <asm/amd_iommu_types.h> | ||
28 | #include <asm/amd_iommu.h> | ||
29 | |||
30 | #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) | ||
31 | |||
32 | #define EXIT_LOOP_COUNT 10000000 | ||
33 | |||
34 | static DEFINE_RWLOCK(amd_iommu_devtable_lock); | ||
35 | |||
36 | /* | ||
37 | * general struct to manage commands send to an IOMMU | ||
38 | */ | ||
39 | struct iommu_cmd { | ||
40 | u32 data[4]; | ||
41 | }; | ||
42 | |||
43 | static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, | ||
44 | struct unity_map_entry *e); | ||
45 | |||
46 | /* returns !0 if the IOMMU is caching non-present entries in its TLB */ | ||
47 | static int iommu_has_npcache(struct amd_iommu *iommu) | ||
48 | { | ||
49 | return iommu->cap & IOMMU_CAP_NPCACHE; | ||
50 | } | ||
51 | |||
52 | /**************************************************************************** | ||
53 | * | ||
54 | * IOMMU command queuing functions | ||
55 | * | ||
56 | ****************************************************************************/ | ||
57 | |||
58 | /* | ||
59 | * Writes the command to the IOMMUs command buffer and informs the | ||
60 | * hardware about the new command. Must be called with iommu->lock held. | ||
61 | */ | ||
62 | static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) | ||
63 | { | ||
64 | u32 tail, head; | ||
65 | u8 *target; | ||
66 | |||
67 | tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); | ||
68 | target = (iommu->cmd_buf + tail); | ||
69 | memcpy_toio(target, cmd, sizeof(*cmd)); | ||
70 | tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; | ||
71 | head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); | ||
72 | if (tail == head) | ||
73 | return -ENOMEM; | ||
74 | writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); | ||
75 | |||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | /* | ||
80 | * General queuing function for commands. Takes iommu->lock and calls | ||
81 | * __iommu_queue_command(). | ||
82 | */ | ||
83 | static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) | ||
84 | { | ||
85 | unsigned long flags; | ||
86 | int ret; | ||
87 | |||
88 | spin_lock_irqsave(&iommu->lock, flags); | ||
89 | ret = __iommu_queue_command(iommu, cmd); | ||
90 | spin_unlock_irqrestore(&iommu->lock, flags); | ||
91 | |||
92 | return ret; | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * This function is called whenever we need to ensure that the IOMMU has | ||
97 | * completed execution of all commands we sent. It sends a | ||
98 | * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs | ||
99 | * us about that by writing a value to a physical address we pass with | ||
100 | * the command. | ||
101 | */ | ||
102 | static int iommu_completion_wait(struct amd_iommu *iommu) | ||
103 | { | ||
104 | int ret; | ||
105 | struct iommu_cmd cmd; | ||
106 | volatile u64 ready = 0; | ||
107 | unsigned long ready_phys = virt_to_phys(&ready); | ||
108 | unsigned long i = 0; | ||
109 | |||
110 | memset(&cmd, 0, sizeof(cmd)); | ||
111 | cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; | ||
112 | cmd.data[1] = upper_32_bits(ready_phys); | ||
113 | cmd.data[2] = 1; /* value written to 'ready' */ | ||
114 | CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); | ||
115 | |||
116 | iommu->need_sync = 0; | ||
117 | |||
118 | ret = iommu_queue_command(iommu, &cmd); | ||
119 | |||
120 | if (ret) | ||
121 | return ret; | ||
122 | |||
123 | while (!ready && (i < EXIT_LOOP_COUNT)) { | ||
124 | ++i; | ||
125 | cpu_relax(); | ||
126 | } | ||
127 | |||
128 | if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) | ||
129 | printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); | ||
130 | |||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * Command send function for invalidating a device table entry | ||
136 | */ | ||
137 | static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) | ||
138 | { | ||
139 | struct iommu_cmd cmd; | ||
140 | |||
141 | BUG_ON(iommu == NULL); | ||
142 | |||
143 | memset(&cmd, 0, sizeof(cmd)); | ||
144 | CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); | ||
145 | cmd.data[0] = devid; | ||
146 | |||
147 | iommu->need_sync = 1; | ||
148 | |||
149 | return iommu_queue_command(iommu, &cmd); | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * Generic command send function for invalidaing TLB entries | ||
154 | */ | ||
155 | static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, | ||
156 | u64 address, u16 domid, int pde, int s) | ||
157 | { | ||
158 | struct iommu_cmd cmd; | ||
159 | |||
160 | memset(&cmd, 0, sizeof(cmd)); | ||
161 | address &= PAGE_MASK; | ||
162 | CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); | ||
163 | cmd.data[1] |= domid; | ||
164 | cmd.data[2] = LOW_U32(address); | ||
165 | cmd.data[3] = upper_32_bits(address); | ||
166 | if (s) /* size bit - we flush more than one 4kb page */ | ||
167 | cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; | ||
168 | if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ | ||
169 | cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; | ||
170 | |||
171 | iommu->need_sync = 1; | ||
172 | |||
173 | return iommu_queue_command(iommu, &cmd); | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * TLB invalidation function which is called from the mapping functions. | ||
178 | * It invalidates a single PTE if the range to flush is within a single | ||
179 | * page. Otherwise it flushes the whole TLB of the IOMMU. | ||
180 | */ | ||
181 | static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, | ||
182 | u64 address, size_t size) | ||
183 | { | ||
184 | int s = 0; | ||
185 | unsigned pages = iommu_num_pages(address, size); | ||
186 | |||
187 | address &= PAGE_MASK; | ||
188 | |||
189 | if (pages > 1) { | ||
190 | /* | ||
191 | * If we have to flush more than one page, flush all | ||
192 | * TLB entries for this domain | ||
193 | */ | ||
194 | address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; | ||
195 | s = 1; | ||
196 | } | ||
197 | |||
198 | iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); | ||
199 | |||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | /**************************************************************************** | ||
204 | * | ||
205 | * The functions below are used the create the page table mappings for | ||
206 | * unity mapped regions. | ||
207 | * | ||
208 | ****************************************************************************/ | ||
209 | |||
210 | /* | ||
211 | * Generic mapping functions. It maps a physical address into a DMA | ||
212 | * address space. It allocates the page table pages if necessary. | ||
213 | * In the future it can be extended to a generic mapping function | ||
214 | * supporting all features of AMD IOMMU page tables like level skipping | ||
215 | * and full 64 bit address spaces. | ||
216 | */ | ||
217 | static int iommu_map(struct protection_domain *dom, | ||
218 | unsigned long bus_addr, | ||
219 | unsigned long phys_addr, | ||
220 | int prot) | ||
221 | { | ||
222 | u64 __pte, *pte, *page; | ||
223 | |||
224 | bus_addr = PAGE_ALIGN(bus_addr); | ||
225 | phys_addr = PAGE_ALIGN(bus_addr); | ||
226 | |||
227 | /* only support 512GB address spaces for now */ | ||
228 | if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) | ||
229 | return -EINVAL; | ||
230 | |||
231 | pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; | ||
232 | |||
233 | if (!IOMMU_PTE_PRESENT(*pte)) { | ||
234 | page = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
235 | if (!page) | ||
236 | return -ENOMEM; | ||
237 | *pte = IOMMU_L2_PDE(virt_to_phys(page)); | ||
238 | } | ||
239 | |||
240 | pte = IOMMU_PTE_PAGE(*pte); | ||
241 | pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; | ||
242 | |||
243 | if (!IOMMU_PTE_PRESENT(*pte)) { | ||
244 | page = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
245 | if (!page) | ||
246 | return -ENOMEM; | ||
247 | *pte = IOMMU_L1_PDE(virt_to_phys(page)); | ||
248 | } | ||
249 | |||
250 | pte = IOMMU_PTE_PAGE(*pte); | ||
251 | pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; | ||
252 | |||
253 | if (IOMMU_PTE_PRESENT(*pte)) | ||
254 | return -EBUSY; | ||
255 | |||
256 | __pte = phys_addr | IOMMU_PTE_P; | ||
257 | if (prot & IOMMU_PROT_IR) | ||
258 | __pte |= IOMMU_PTE_IR; | ||
259 | if (prot & IOMMU_PROT_IW) | ||
260 | __pte |= IOMMU_PTE_IW; | ||
261 | |||
262 | *pte = __pte; | ||
263 | |||
264 | return 0; | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * This function checks if a specific unity mapping entry is needed for | ||
269 | * this specific IOMMU. | ||
270 | */ | ||
271 | static int iommu_for_unity_map(struct amd_iommu *iommu, | ||
272 | struct unity_map_entry *entry) | ||
273 | { | ||
274 | u16 bdf, i; | ||
275 | |||
276 | for (i = entry->devid_start; i <= entry->devid_end; ++i) { | ||
277 | bdf = amd_iommu_alias_table[i]; | ||
278 | if (amd_iommu_rlookup_table[bdf] == iommu) | ||
279 | return 1; | ||
280 | } | ||
281 | |||
282 | return 0; | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * Init the unity mappings for a specific IOMMU in the system | ||
287 | * | ||
288 | * Basically iterates over all unity mapping entries and applies them to | ||
289 | * the default domain DMA of that IOMMU if necessary. | ||
290 | */ | ||
291 | static int iommu_init_unity_mappings(struct amd_iommu *iommu) | ||
292 | { | ||
293 | struct unity_map_entry *entry; | ||
294 | int ret; | ||
295 | |||
296 | list_for_each_entry(entry, &amd_iommu_unity_map, list) { | ||
297 | if (!iommu_for_unity_map(iommu, entry)) | ||
298 | continue; | ||
299 | ret = dma_ops_unity_map(iommu->default_dom, entry); | ||
300 | if (ret) | ||
301 | return ret; | ||
302 | } | ||
303 | |||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * This function actually applies the mapping to the page table of the | ||
309 | * dma_ops domain. | ||
310 | */ | ||
311 | static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, | ||
312 | struct unity_map_entry *e) | ||
313 | { | ||
314 | u64 addr; | ||
315 | int ret; | ||
316 | |||
317 | for (addr = e->address_start; addr < e->address_end; | ||
318 | addr += PAGE_SIZE) { | ||
319 | ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); | ||
320 | if (ret) | ||
321 | return ret; | ||
322 | /* | ||
323 | * if unity mapping is in aperture range mark the page | ||
324 | * as allocated in the aperture | ||
325 | */ | ||
326 | if (addr < dma_dom->aperture_size) | ||
327 | __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); | ||
328 | } | ||
329 | |||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | /* | ||
334 | * Inits the unity mappings required for a specific device | ||
335 | */ | ||
336 | static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, | ||
337 | u16 devid) | ||
338 | { | ||
339 | struct unity_map_entry *e; | ||
340 | int ret; | ||
341 | |||
342 | list_for_each_entry(e, &amd_iommu_unity_map, list) { | ||
343 | if (!(devid >= e->devid_start && devid <= e->devid_end)) | ||
344 | continue; | ||
345 | ret = dma_ops_unity_map(dma_dom, e); | ||
346 | if (ret) | ||
347 | return ret; | ||
348 | } | ||
349 | |||
350 | return 0; | ||
351 | } | ||
352 | |||
353 | /**************************************************************************** | ||
354 | * | ||
355 | * The next functions belong to the address allocator for the dma_ops | ||
356 | * interface functions. They work like the allocators in the other IOMMU | ||
357 | * drivers. Its basically a bitmap which marks the allocated pages in | ||
358 | * the aperture. Maybe it could be enhanced in the future to a more | ||
359 | * efficient allocator. | ||
360 | * | ||
361 | ****************************************************************************/ | ||
362 | static unsigned long dma_mask_to_pages(unsigned long mask) | ||
363 | { | ||
364 | return (mask >> PAGE_SHIFT) + | ||
365 | (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); | ||
366 | } | ||
367 | |||
368 | /* | ||
369 | * The address allocator core function. | ||
370 | * | ||
371 | * called with domain->lock held | ||
372 | */ | ||
373 | static unsigned long dma_ops_alloc_addresses(struct device *dev, | ||
374 | struct dma_ops_domain *dom, | ||
375 | unsigned int pages) | ||
376 | { | ||
377 | unsigned long limit = dma_mask_to_pages(*dev->dma_mask); | ||
378 | unsigned long address; | ||
379 | unsigned long size = dom->aperture_size >> PAGE_SHIFT; | ||
380 | unsigned long boundary_size; | ||
381 | |||
382 | boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, | ||
383 | PAGE_SIZE) >> PAGE_SHIFT; | ||
384 | limit = limit < size ? limit : size; | ||
385 | |||
386 | if (dom->next_bit >= limit) | ||
387 | dom->next_bit = 0; | ||
388 | |||
389 | address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, | ||
390 | 0 , boundary_size, 0); | ||
391 | if (address == -1) | ||
392 | address = iommu_area_alloc(dom->bitmap, limit, 0, pages, | ||
393 | 0, boundary_size, 0); | ||
394 | |||
395 | if (likely(address != -1)) { | ||
396 | dom->next_bit = address + pages; | ||
397 | address <<= PAGE_SHIFT; | ||
398 | } else | ||
399 | address = bad_dma_address; | ||
400 | |||
401 | WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); | ||
402 | |||
403 | return address; | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * The address free function. | ||
408 | * | ||
409 | * called with domain->lock held | ||
410 | */ | ||
411 | static void dma_ops_free_addresses(struct dma_ops_domain *dom, | ||
412 | unsigned long address, | ||
413 | unsigned int pages) | ||
414 | { | ||
415 | address >>= PAGE_SHIFT; | ||
416 | iommu_area_free(dom->bitmap, address, pages); | ||
417 | } | ||
418 | |||
419 | /**************************************************************************** | ||
420 | * | ||
421 | * The next functions belong to the domain allocation. A domain is | ||
422 | * allocated for every IOMMU as the default domain. If device isolation | ||
423 | * is enabled, every device gets its own domain. The most important thing | ||
424 | * about domains is the page table mapping the DMA address space they | ||
425 | * contain. | ||
426 | * | ||
427 | ****************************************************************************/ | ||
428 | |||
429 | static u16 domain_id_alloc(void) | ||
430 | { | ||
431 | unsigned long flags; | ||
432 | int id; | ||
433 | |||
434 | write_lock_irqsave(&amd_iommu_devtable_lock, flags); | ||
435 | id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); | ||
436 | BUG_ON(id == 0); | ||
437 | if (id > 0 && id < MAX_DOMAIN_ID) | ||
438 | __set_bit(id, amd_iommu_pd_alloc_bitmap); | ||
439 | else | ||
440 | id = 0; | ||
441 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | ||
442 | |||
443 | return id; | ||
444 | } | ||
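/*
 * A matching free helper is not part of this patch (domain ids are never
 * released here); a hypothetical counterpart would clear the bit under
 * the same lock:
 */
static void domain_id_free(int id)
{
	unsigned long flags;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	if (id > 0 && id < MAX_DOMAIN_ID)
		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}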
445 | |||
446 | /* | ||
447 | * Used to reserve address ranges in the aperture (e.g. for exclusion | ||
448 | * ranges). | ||
449 | */ | ||
450 | static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, | ||
451 | unsigned long start_page, | ||
452 | unsigned int pages) | ||
453 | { | ||
454 | unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; | ||
455 | |||
456 | if (start_page + pages > last_page) | ||
457 | pages = last_page - start_page; | ||
458 | |||
459 | set_bit_string(dom->bitmap, start_page, pages); | ||
460 | } | ||
461 | |||
462 | static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) | ||
463 | { | ||
464 | int i, j; | ||
465 | u64 *p1, *p2, *p3; | ||
466 | |||
467 | p1 = dma_dom->domain.pt_root; | ||
468 | |||
469 | if (!p1) | ||
470 | return; | ||
471 | |||
472 | for (i = 0; i < 512; ++i) { | ||
473 | if (!IOMMU_PTE_PRESENT(p1[i])) | ||
474 | continue; | ||
475 | |||
476 | p2 = IOMMU_PTE_PAGE(p1[i]); | ||
477 | for (j = 0; j < 512; ++j) { | ||
478 | if (!IOMMU_PTE_PRESENT(p2[j])) | ||
479 | continue; | ||
480 | p3 = IOMMU_PTE_PAGE(p2[j]); | ||
481 | free_page((unsigned long)p3); | ||
482 | } | ||
483 | |||
484 | free_page((unsigned long)p2); | ||
485 | } | ||
486 | |||
487 | free_page((unsigned long)p1); | ||
488 | } | ||
489 | |||
490 | /* | ||
491 | * Free a domain, only used if something went wrong in the | ||
492 | * allocation path and we need to free an already allocated page table | ||
493 | */ | ||
494 | static void dma_ops_domain_free(struct dma_ops_domain *dom) | ||
495 | { | ||
496 | if (!dom) | ||
497 | return; | ||
498 | |||
499 | dma_ops_free_pagetable(dom); | ||
500 | |||
501 | kfree(dom->pte_pages); | ||
502 | |||
503 | kfree(dom->bitmap); | ||
504 | |||
505 | kfree(dom); | ||
506 | } | ||
507 | |||
508 | /* | ||
509 | * Allocates a new protection domain usable for the dma_ops functions. | ||
510 | * It also initializes the page table and the address allocator data | ||
511 | * structures required for the dma_ops interface | ||
512 | */ | ||
513 | static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, | ||
514 | unsigned order) | ||
515 | { | ||
516 | struct dma_ops_domain *dma_dom; | ||
517 | unsigned i, num_pte_pages; | ||
518 | u64 *l2_pde; | ||
519 | u64 address; | ||
520 | |||
521 | /* | ||
522 | * Currently the DMA aperture must be between 32 MB and 1 GB in size | ||
523 | */ | ||
524 | if ((order < 25) || (order > 30)) | ||
525 | return NULL; | ||
526 | |||
527 | dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); | ||
528 | if (!dma_dom) | ||
529 | return NULL; | ||
530 | |||
531 | spin_lock_init(&dma_dom->domain.lock); | ||
532 | |||
533 | dma_dom->domain.id = domain_id_alloc(); | ||
534 | if (dma_dom->domain.id == 0) | ||
535 | goto free_dma_dom; | ||
536 | dma_dom->domain.mode = PAGE_MODE_3_LEVEL; | ||
537 | dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); | ||
538 | dma_dom->domain.priv = dma_dom; | ||
539 | if (!dma_dom->domain.pt_root) | ||
540 | goto free_dma_dom; | ||
541 | dma_dom->aperture_size = (1ULL << order); | ||
542 | dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), | ||
543 | GFP_KERNEL); | ||
544 | if (!dma_dom->bitmap) | ||
545 | goto free_dma_dom; | ||
546 | /* | ||
547 | * Mark the first page as allocated so we never return 0 as | ||
548 | * a valid DMA address; this allows us to use 0 as the error value | ||
549 | */ | ||
550 | dma_dom->bitmap[0] = 1; | ||
551 | dma_dom->next_bit = 0; | ||
552 | |||
553 | /* Initialize the exclusion range if necessary */ | ||
554 | if (iommu->exclusion_start && | ||
555 | iommu->exclusion_start < dma_dom->aperture_size) { | ||
556 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; | ||
557 | int pages = iommu_num_pages(iommu->exclusion_start, | ||
558 | iommu->exclusion_length); | ||
559 | dma_ops_reserve_addresses(dma_dom, startpage, pages); | ||
560 | } | ||
561 | |||
562 | /* | ||
563 | * At the last step, build the page tables so we don't need to | ||
564 | * allocate page table pages in the dma_ops mapping/unmapping | ||
565 | * path. | ||
566 | */ | ||
567 | num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); | ||
568 | dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), | ||
569 | GFP_KERNEL); | ||
570 | if (!dma_dom->pte_pages) | ||
571 | goto free_dma_dom; | ||
572 | |||
573 | l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
574 | if (l2_pde == NULL) | ||
575 | goto free_dma_dom; | ||
576 | |||
577 | dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); | ||
578 | |||
579 | for (i = 0; i < num_pte_pages; ++i) { | ||
580 | dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); | ||
581 | if (!dma_dom->pte_pages[i]) | ||
582 | goto free_dma_dom; | ||
583 | address = virt_to_phys(dma_dom->pte_pages[i]); | ||
584 | l2_pde[i] = IOMMU_L1_PDE(address); | ||
585 | } | ||
586 | |||
587 | return dma_dom; | ||
588 | |||
589 | free_dma_dom: | ||
590 | dma_ops_domain_free(dma_dom); | ||
591 | |||
592 | return NULL; | ||
593 | } | ||
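/*
 * Usage sketch (hypothetical caller): the driver's default aperture
 * order is 26, i.e. a 64 MB DMA aperture per domain.
 */
struct dma_ops_domain *dma_dom = dma_ops_domain_alloc(iommu, 26);
if (!dma_dom)
	return -ENOMEM; /* failed, or order outside the 32 MB..1 GB window */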
594 | |||
595 | /* | ||
596 | * Find out the protection domain structure for a given PCI device. This | ||
597 | * will give us the pointer to the page table root for example. | ||
598 | */ | ||
599 | static struct protection_domain *domain_for_device(u16 devid) | ||
600 | { | ||
601 | struct protection_domain *dom; | ||
602 | unsigned long flags; | ||
603 | |||
604 | read_lock_irqsave(&amd_iommu_devtable_lock, flags); | ||
605 | dom = amd_iommu_pd_table[devid]; | ||
606 | read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | ||
607 | |||
608 | return dom; | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * If a device is not yet associated with a domain, this function | ||
613 | * assigns it to one and makes the assignment visible to the hardware | ||
614 | */ | ||
615 | static void set_device_domain(struct amd_iommu *iommu, | ||
616 | struct protection_domain *domain, | ||
617 | u16 devid) | ||
618 | { | ||
619 | unsigned long flags; | ||
620 | |||
621 | u64 pte_root = virt_to_phys(domain->pt_root); | ||
622 | |||
623 | pte_root |= (domain->mode & 0x07) << 9; | ||
624 | pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; | ||
625 | |||
626 | write_lock_irqsave(&amd_iommu_devtable_lock, flags); | ||
627 | amd_iommu_dev_table[devid].data[0] = pte_root; | ||
628 | amd_iommu_dev_table[devid].data[1] = pte_root >> 32; | ||
629 | amd_iommu_dev_table[devid].data[2] = domain->id; | ||
630 | |||
631 | amd_iommu_pd_table[devid] = domain; | ||
632 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | ||
633 | |||
634 | iommu_queue_inv_dev_entry(iommu, devid); | ||
635 | |||
636 | iommu->need_sync = 1; | ||
637 | } | ||
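/*
 * Worked example of the device table entry built above (the literal 2
 * in the code corresponds to the translation-valid bit of the entry):
 * for a 3-level page table with pt_root at physical 0x12340000, pte_root
 * is 0x12340000 | (3 << 9) | IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
 * its low and high halves land in data[0] and data[1], and data[2]
 * carries the domain id.
 */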
638 | |||
639 | /***************************************************************************** | ||
640 | * | ||
641 | * The next functions belong to the dma_ops mapping/unmapping code. | ||
642 | * | ||
643 | *****************************************************************************/ | ||
644 | |||
645 | /* | ||
646 | * In the dma_ops path we only have the struct device. This function | ||
647 | * finds the corresponding IOMMU, the protection domain and the | ||
648 | * requestor id for a given device. | ||
649 | * If the device is not yet associated with a domain this is also done | ||
650 | * in this function. | ||
651 | */ | ||
652 | static int get_device_resources(struct device *dev, | ||
653 | struct amd_iommu **iommu, | ||
654 | struct protection_domain **domain, | ||
655 | u16 *bdf) | ||
656 | { | ||
657 | struct dma_ops_domain *dma_dom; | ||
658 | struct pci_dev *pcidev; | ||
659 | u16 _bdf; | ||
660 | |||
661 | BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); | ||
662 | |||
663 | pcidev = to_pci_dev(dev); | ||
664 | _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); | ||
665 | |||
666 | /* device not translated by any IOMMU in the system? */ | ||
667 | if (_bdf > amd_iommu_last_bdf) { | ||
668 | *iommu = NULL; | ||
669 | *domain = NULL; | ||
670 | *bdf = 0xffff; | ||
671 | return 0; | ||
672 | } | ||
673 | |||
674 | *bdf = amd_iommu_alias_table[_bdf]; | ||
675 | |||
676 | *iommu = amd_iommu_rlookup_table[*bdf]; | ||
677 | if (*iommu == NULL) | ||
678 | return 0; | ||
679 | dma_dom = (*iommu)->default_dom; | ||
680 | *domain = domain_for_device(*bdf); | ||
681 | if (*domain == NULL) { | ||
682 | *domain = &dma_dom->domain; | ||
683 | set_device_domain(*iommu, *domain, *bdf); | ||
684 | printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " | ||
685 | "device ", (*domain)->id); | ||
686 | print_devid(_bdf, 1); | ||
687 | } | ||
688 | |||
689 | return 1; | ||
690 | } | ||
691 | |||
692 | /* | ||
693 | * This is the generic map function. It maps one 4kb page at paddr to | ||
694 | * the given address in the DMA address space for the domain. | ||
695 | */ | ||
696 | static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, | ||
697 | struct dma_ops_domain *dom, | ||
698 | unsigned long address, | ||
699 | phys_addr_t paddr, | ||
700 | int direction) | ||
701 | { | ||
702 | u64 *pte, __pte; | ||
703 | |||
704 | WARN_ON(address > dom->aperture_size); | ||
705 | |||
706 | paddr &= PAGE_MASK; | ||
707 | |||
708 | pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; | ||
709 | pte += IOMMU_PTE_L0_INDEX(address); | ||
710 | |||
711 | __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; | ||
712 | |||
713 | if (direction == DMA_TO_DEVICE) | ||
714 | __pte |= IOMMU_PTE_IR; | ||
715 | else if (direction == DMA_FROM_DEVICE) | ||
716 | __pte |= IOMMU_PTE_IW; | ||
717 | else if (direction == DMA_BIDIRECTIONAL) | ||
718 | __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW; | ||
719 | |||
720 | WARN_ON(*pte); | ||
721 | |||
722 | *pte = __pte; | ||
723 | |||
724 | return (dma_addr_t)address; | ||
725 | } | ||
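/*
 * Example of the resulting PTE: for DMA_BIDIRECTIONAL the entry becomes
 * paddr | IOMMU_PTE_P | IOMMU_PTE_FC | IOMMU_PTE_IR | IOMMU_PTE_IW,
 * i.e. present, forced-coherent, and both readable and writable for the
 * device.
 */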
726 | |||
727 | /* | ||
728 | * The generic unmapping function for one page in the DMA address space. | ||
729 | */ | ||
730 | static void dma_ops_domain_unmap(struct amd_iommu *iommu, | ||
731 | struct dma_ops_domain *dom, | ||
732 | unsigned long address) | ||
733 | { | ||
734 | u64 *pte; | ||
735 | |||
736 | if (address >= dom->aperture_size) | ||
737 | return; | ||
738 | |||
739 | WARN_ON(address & 0xfffULL || address > dom->aperture_size); | ||
740 | |||
741 | pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; | ||
742 | pte += IOMMU_PTE_L0_INDEX(address); | ||
743 | |||
744 | WARN_ON(!*pte); | ||
745 | |||
746 | *pte = 0ULL; | ||
747 | } | ||
748 | |||
749 | /* | ||
750 | * This function contains common code for mapping of a physically | ||
751 | * contiguous memory region into DMA address space. It is used by all | ||
752 | * mapping functions provided by this IOMMU driver. | ||
753 | * Must be called with the domain lock held. | ||
754 | */ | ||
755 | static dma_addr_t __map_single(struct device *dev, | ||
756 | struct amd_iommu *iommu, | ||
757 | struct dma_ops_domain *dma_dom, | ||
758 | phys_addr_t paddr, | ||
759 | size_t size, | ||
760 | int dir) | ||
761 | { | ||
762 | dma_addr_t offset = paddr & ~PAGE_MASK; | ||
763 | dma_addr_t address, start; | ||
764 | unsigned int pages; | ||
765 | int i; | ||
766 | |||
767 | pages = iommu_num_pages(paddr, size); | ||
768 | paddr &= PAGE_MASK; | ||
769 | |||
770 | address = dma_ops_alloc_addresses(dev, dma_dom, pages); | ||
771 | if (unlikely(address == bad_dma_address)) | ||
772 | goto out; | ||
773 | |||
774 | start = address; | ||
775 | for (i = 0; i < pages; ++i) { | ||
776 | dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); | ||
777 | paddr += PAGE_SIZE; | ||
778 | start += PAGE_SIZE; | ||
779 | } | ||
780 | address += offset; | ||
781 | |||
782 | out: | ||
783 | return address; | ||
784 | } | ||
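/*
 * Worked example: for paddr = 0x12345678 and size = 0x2000 the offset
 * within the first page is 0x678, iommu_num_pages() rounds
 * 0x678 + 0x2000 up to three whole pages, and the returned handle is
 * the allocated aperture address plus 0x678.
 */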
785 | |||
786 | /* | ||
787 | * Does the reverse of the __map_single function. Must be called with | ||
788 | * the domain lock held too | ||
789 | */ | ||
790 | static void __unmap_single(struct amd_iommu *iommu, | ||
791 | struct dma_ops_domain *dma_dom, | ||
792 | dma_addr_t dma_addr, | ||
793 | size_t size, | ||
794 | int dir) | ||
795 | { | ||
796 | dma_addr_t i, start; | ||
797 | unsigned int pages; | ||
798 | |||
799 | if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) | ||
800 | return; | ||
801 | |||
802 | pages = iommu_num_pages(dma_addr, size); | ||
803 | dma_addr &= PAGE_MASK; | ||
804 | start = dma_addr; | ||
805 | |||
806 | for (i = 0; i < pages; ++i) { | ||
807 | dma_ops_domain_unmap(iommu, dma_dom, start); | ||
808 | start += PAGE_SIZE; | ||
809 | } | ||
810 | |||
811 | dma_ops_free_addresses(dma_dom, dma_addr, pages); | ||
812 | } | ||
813 | |||
814 | /* | ||
815 | * The exported map_single function for dma_ops. | ||
816 | */ | ||
817 | static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, | ||
818 | size_t size, int dir) | ||
819 | { | ||
820 | unsigned long flags; | ||
821 | struct amd_iommu *iommu; | ||
822 | struct protection_domain *domain; | ||
823 | u16 devid; | ||
824 | dma_addr_t addr; | ||
825 | |||
826 | get_device_resources(dev, &iommu, &domain, &devid); | ||
827 | |||
828 | if (iommu == NULL || domain == NULL) | ||
829 | /* device not handled by any AMD IOMMU */ | ||
830 | return (dma_addr_t)paddr; | ||
831 | |||
832 | spin_lock_irqsave(&domain->lock, flags); | ||
833 | addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); | ||
834 | if (addr == bad_dma_address) | ||
835 | goto out; | ||
836 | |||
837 | if (iommu_has_npcache(iommu)) | ||
838 | iommu_flush_pages(iommu, domain->id, addr, size); | ||
839 | |||
840 | if (iommu->need_sync) | ||
841 | iommu_completion_wait(iommu); | ||
842 | |||
843 | out: | ||
844 | spin_unlock_irqrestore(&domain->lock, flags); | ||
845 | |||
846 | return addr; | ||
847 | } | ||
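/*
 * Illustrative driver-side view (pdev, buf and len are hypothetical):
 * once amd_iommu_dma_ops is installed below, a call like
 *
 *	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
 *					   DMA_TO_DEVICE);
 *
 * is routed through dma_ops->map_single and ends up in this function
 * with paddr = virt_to_phys(buf).
 */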
848 | |||
849 | /* | ||
850 | * The exported unmap_single function for dma_ops. | ||
851 | */ | ||
852 | static void unmap_single(struct device *dev, dma_addr_t dma_addr, | ||
853 | size_t size, int dir) | ||
854 | { | ||
855 | unsigned long flags; | ||
856 | struct amd_iommu *iommu; | ||
857 | struct protection_domain *domain; | ||
858 | u16 devid; | ||
859 | |||
860 | if (!get_device_resources(dev, &iommu, &domain, &devid)) | ||
861 | /* device not handled by any AMD IOMMU */ | ||
862 | return; | ||
863 | |||
864 | spin_lock_irqsave(&domain->lock, flags); | ||
865 | |||
866 | __unmap_single(iommu, domain->priv, dma_addr, size, dir); | ||
867 | |||
868 | iommu_flush_pages(iommu, domain->id, dma_addr, size); | ||
869 | |||
870 | if (iommu->need_sync) | ||
871 | iommu_completion_wait(iommu); | ||
872 | |||
873 | spin_unlock_irqrestore(&domain->lock, flags); | ||
874 | } | ||
875 | |||
876 | /* | ||
877 | * This is a special map_sg function which is used if we should map a | ||
878 | * device which is not handled by an AMD IOMMU in the system. | ||
879 | */ | ||
880 | static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, | ||
881 | int nelems, int dir) | ||
882 | { | ||
883 | struct scatterlist *s; | ||
884 | int i; | ||
885 | |||
886 | for_each_sg(sglist, s, nelems, i) { | ||
887 | s->dma_address = (dma_addr_t)sg_phys(s); | ||
888 | s->dma_length = s->length; | ||
889 | } | ||
890 | |||
891 | return nelems; | ||
892 | } | ||
893 | |||
894 | /* | ||
895 | * The exported map_sg function for dma_ops (handles scatter-gather | ||
896 | * lists). | ||
897 | */ | ||
898 | static int map_sg(struct device *dev, struct scatterlist *sglist, | ||
899 | int nelems, int dir) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | struct amd_iommu *iommu; | ||
903 | struct protection_domain *domain; | ||
904 | u16 devid; | ||
905 | int i; | ||
906 | struct scatterlist *s; | ||
907 | phys_addr_t paddr; | ||
908 | int mapped_elems = 0; | ||
909 | |||
910 | get_device_resources(dev, &iommu, &domain, &devid); | ||
911 | |||
912 | if (!iommu || !domain) | ||
913 | return map_sg_no_iommu(dev, sglist, nelems, dir); | ||
914 | |||
915 | spin_lock_irqsave(&domain->lock, flags); | ||
916 | |||
917 | for_each_sg(sglist, s, nelems, i) { | ||
918 | paddr = sg_phys(s); | ||
919 | |||
920 | s->dma_address = __map_single(dev, iommu, domain->priv, | ||
921 | paddr, s->length, dir); | ||
922 | |||
923 | if (s->dma_address) { | ||
924 | s->dma_length = s->length; | ||
925 | mapped_elems++; | ||
926 | } else | ||
927 | goto unmap; | ||
928 | if (iommu_has_npcache(iommu)) | ||
929 | iommu_flush_pages(iommu, domain->id, s->dma_address, | ||
930 | s->dma_length); | ||
931 | } | ||
932 | |||
933 | if (iommu->need_sync) | ||
934 | iommu_completion_wait(iommu); | ||
935 | |||
936 | out: | ||
937 | spin_unlock_irqrestore(&domain->lock, flags); | ||
938 | |||
939 | return mapped_elems; | ||
940 | unmap: | ||
941 | for_each_sg(sglist, s, mapped_elems, i) { | ||
942 | if (s->dma_address) | ||
943 | __unmap_single(iommu, domain->priv, s->dma_address, | ||
944 | s->dma_length, dir); | ||
945 | s->dma_address = s->dma_length = 0; | ||
946 | } | ||
947 | |||
948 | mapped_elems = 0; | ||
949 | |||
950 | goto out; | ||
951 | } | ||
952 | |||
953 | /* | ||
954 | * The exported unmap_sg function for dma_ops (handles scatter-gather | ||
955 | * lists). | ||
956 | */ | ||
957 | static void unmap_sg(struct device *dev, struct scatterlist *sglist, | ||
958 | int nelems, int dir) | ||
959 | { | ||
960 | unsigned long flags; | ||
961 | struct amd_iommu *iommu; | ||
962 | struct protection_domain *domain; | ||
963 | struct scatterlist *s; | ||
964 | u16 devid; | ||
965 | int i; | ||
966 | |||
967 | if (!get_device_resources(dev, &iommu, &domain, &devid)) | ||
968 | return; | ||
969 | |||
970 | spin_lock_irqsave(&domain->lock, flags); | ||
971 | |||
972 | for_each_sg(sglist, s, nelems, i) { | ||
973 | __unmap_single(iommu, domain->priv, s->dma_address, | ||
974 | s->dma_length, dir); | ||
975 | iommu_flush_pages(iommu, domain->id, s->dma_address, | ||
976 | s->dma_length); | ||
977 | s->dma_address = s->dma_length = 0; | ||
978 | } | ||
979 | |||
980 | if (iommu->need_sync) | ||
981 | iommu_completion_wait(iommu); | ||
982 | |||
983 | spin_unlock_irqrestore(&domain->lock, flags); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * The exported alloc_coherent function for dma_ops. | ||
988 | */ | ||
989 | static void *alloc_coherent(struct device *dev, size_t size, | ||
990 | dma_addr_t *dma_addr, gfp_t flag) | ||
991 | { | ||
992 | unsigned long flags; | ||
993 | void *virt_addr; | ||
994 | struct amd_iommu *iommu; | ||
995 | struct protection_domain *domain; | ||
996 | u16 devid; | ||
997 | phys_addr_t paddr; | ||
998 | |||
999 | virt_addr = (void *)__get_free_pages(flag, get_order(size)); | ||
1000 | if (!virt_addr) | ||
1001 | return NULL; | ||
1002 | |||
1003 | memset(virt_addr, 0, size); | ||
1004 | paddr = virt_to_phys(virt_addr); | ||
1005 | |||
1006 | get_device_resources(dev, &iommu, &domain, &devid); | ||
1007 | |||
1008 | if (!iommu || !domain) { | ||
1009 | *dma_addr = (dma_addr_t)paddr; | ||
1010 | return virt_addr; | ||
1011 | } | ||
1012 | |||
1013 | spin_lock_irqsave(&domain->lock, flags); | ||
1014 | |||
1015 | *dma_addr = __map_single(dev, iommu, domain->priv, paddr, | ||
1016 | size, DMA_BIDIRECTIONAL); | ||
1017 | |||
1018 | if (*dma_addr == bad_dma_address) { | ||
1019 | free_pages((unsigned long)virt_addr, get_order(size)); | ||
1020 | virt_addr = NULL; | ||
1021 | goto out; | ||
1022 | } | ||
1023 | |||
1024 | if (iommu_has_npcache(iommu)) | ||
1025 | iommu_flush_pages(iommu, domain->id, *dma_addr, size); | ||
1026 | |||
1027 | if (iommu->need_sync) | ||
1028 | iommu_completion_wait(iommu); | ||
1029 | |||
1030 | out: | ||
1031 | spin_unlock_irqrestore(&domain->lock, flags); | ||
1032 | |||
1033 | return virt_addr; | ||
1034 | } | ||
1035 | |||
1036 | /* | ||
1037 | * The exported free_coherent function for dma_ops. | ||
1038 | * FIXME: fix the generic x86 DMA layer so that it actually calls that | ||
1039 | * function. | ||
1040 | */ | ||
1041 | static void free_coherent(struct device *dev, size_t size, | ||
1042 | void *virt_addr, dma_addr_t dma_addr) | ||
1043 | { | ||
1044 | unsigned long flags; | ||
1045 | struct amd_iommu *iommu; | ||
1046 | struct protection_domain *domain; | ||
1047 | u16 devid; | ||
1048 | |||
1049 | get_device_resources(dev, &iommu, &domain, &devid); | ||
1050 | |||
1051 | if (!iommu || !domain) | ||
1052 | goto free_mem; | ||
1053 | |||
1054 | spin_lock_irqsave(&domain->lock, flags); | ||
1055 | |||
1056 | __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); | ||
1057 | iommu_flush_pages(iommu, domain->id, dma_addr, size); | ||
1058 | |||
1059 | if (iommu->need_sync) | ||
1060 | iommu_completion_wait(iommu); | ||
1061 | |||
1062 | spin_unlock_irqrestore(&domain->lock, flags); | ||
1063 | |||
1064 | free_mem: | ||
1065 | free_pages((unsigned long)virt_addr, get_order(size)); | ||
1066 | } | ||
1067 | |||
1068 | /* | ||
1069 | * The function for pre-allocating protection domains. | ||
1070 | * | ||
1071 | * Once the driver core informs the DMA layer when a driver grabs a | ||
1072 | * device, we won't need to preallocate the protection domains anymore. | ||
1073 | * For now we have to. | ||
1074 | */ | ||
1075 | void prealloc_protection_domains(void) | ||
1076 | { | ||
1077 | struct pci_dev *dev = NULL; | ||
1078 | struct dma_ops_domain *dma_dom; | ||
1079 | struct amd_iommu *iommu; | ||
1080 | int order = amd_iommu_aperture_order; | ||
1081 | u16 devid; | ||
1082 | |||
1083 | while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | ||
1084 | devid = (dev->bus->number << 8) | dev->devfn; | ||
1085 | if (devid > amd_iommu_last_bdf) | ||
1086 | continue; | ||
1087 | devid = amd_iommu_alias_table[devid]; | ||
1088 | if (domain_for_device(devid)) | ||
1089 | continue; | ||
1090 | iommu = amd_iommu_rlookup_table[devid]; | ||
1091 | if (!iommu) | ||
1092 | continue; | ||
1093 | dma_dom = dma_ops_domain_alloc(iommu, order); | ||
1094 | if (!dma_dom) | ||
1095 | continue; | ||
1096 | init_unity_mappings_for_device(dma_dom, devid); | ||
1097 | set_device_domain(iommu, &dma_dom->domain, devid); | ||
1098 | printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", | ||
1099 | dma_dom->domain.id); | ||
1100 | print_devid(devid, 1); | ||
1101 | } | ||
1102 | } | ||
1103 | |||
1104 | static struct dma_mapping_ops amd_iommu_dma_ops = { | ||
1105 | .alloc_coherent = alloc_coherent, | ||
1106 | .free_coherent = free_coherent, | ||
1107 | .map_single = map_single, | ||
1108 | .unmap_single = unmap_single, | ||
1109 | .map_sg = map_sg, | ||
1110 | .unmap_sg = unmap_sg, | ||
1111 | }; | ||
1112 | |||
1113 | /* | ||
1114 | * The function which glues the AMD IOMMU driver into dma_ops. | ||
1115 | */ | ||
1116 | int __init amd_iommu_init_dma_ops(void) | ||
1117 | { | ||
1118 | struct amd_iommu *iommu; | ||
1119 | int order = amd_iommu_aperture_order; | ||
1120 | int ret; | ||
1121 | |||
1122 | /* | ||
1123 | * first allocate a default protection domain for every IOMMU we | ||
1124 | * found in the system. Devices not assigned to any other | ||
1125 | * protection domain will be assigned to the default one. | ||
1126 | */ | ||
1127 | list_for_each_entry(iommu, &amd_iommu_list, list) { | ||
1128 | iommu->default_dom = dma_ops_domain_alloc(iommu, order); | ||
1129 | if (iommu->default_dom == NULL) | ||
1130 | return -ENOMEM; | ||
1131 | ret = iommu_init_unity_mappings(iommu); | ||
1132 | if (ret) | ||
1133 | goto free_domains; | ||
1134 | } | ||
1135 | |||
1136 | /* | ||
1137 | * If device isolation is enabled, pre-allocate the protection | ||
1138 | * domains for each device. | ||
1139 | */ | ||
1140 | if (amd_iommu_isolate) | ||
1141 | prealloc_protection_domains(); | ||
1142 | |||
1143 | iommu_detected = 1; | ||
1144 | force_iommu = 1; | ||
1145 | bad_dma_address = 0; | ||
1146 | #ifdef CONFIG_GART_IOMMU | ||
1147 | gart_iommu_aperture_disabled = 1; | ||
1148 | gart_iommu_aperture = 0; | ||
1149 | #endif | ||
1150 | |||
1151 | /* Finally make our dma_ops visible to device drivers */ | ||
1152 | dma_ops = &amd_iommu_dma_ops; | ||
1153 | |||
1154 | return 0; | ||
1155 | |||
1156 | free_domains: | ||
1157 | |||
1158 | list_for_each_entry(iommu, &amd_iommu_list, list) { | ||
1159 | if (iommu->default_dom) | ||
1160 | dma_ops_domain_free(iommu->default_dom); | ||
1161 | } | ||
1162 | |||
1163 | return ret; | ||
1164 | } | ||
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c new file mode 100644 index 000000000000..d9a9da597e79 --- /dev/null +++ b/arch/x86/kernel/amd_iommu_init.c | |||
@@ -0,0 +1,1060 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. | ||
3 | * Author: Joerg Roedel <joerg.roedel@amd.com> | ||
4 | * Leo Duran <leo.duran@amd.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of the GNU General Public License version 2 as published | ||
8 | * by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/pci.h> | ||
21 | #include <linux/acpi.h> | ||
22 | #include <linux/gfp.h> | ||
23 | #include <linux/list.h> | ||
24 | #include <linux/sysdev.h> | ||
25 | #include <asm/pci-direct.h> | ||
26 | #include <asm/amd_iommu_types.h> | ||
27 | #include <asm/amd_iommu.h> | ||
28 | #include <asm/iommu.h> | ||
29 | |||
30 | /* | ||
31 | * definitions for the ACPI scanning code | ||
32 | */ | ||
33 | #define PCI_BUS(x) (((x) >> 8) & 0xff) | ||
34 | #define IVRS_HEADER_LENGTH 48 | ||
35 | |||
36 | #define ACPI_IVHD_TYPE 0x10 | ||
37 | #define ACPI_IVMD_TYPE_ALL 0x20 | ||
38 | #define ACPI_IVMD_TYPE 0x21 | ||
39 | #define ACPI_IVMD_TYPE_RANGE 0x22 | ||
40 | |||
41 | #define IVHD_DEV_ALL 0x01 | ||
42 | #define IVHD_DEV_SELECT 0x02 | ||
43 | #define IVHD_DEV_SELECT_RANGE_START 0x03 | ||
44 | #define IVHD_DEV_RANGE_END 0x04 | ||
45 | #define IVHD_DEV_ALIAS 0x42 | ||
46 | #define IVHD_DEV_ALIAS_RANGE 0x43 | ||
47 | #define IVHD_DEV_EXT_SELECT 0x46 | ||
48 | #define IVHD_DEV_EXT_SELECT_RANGE 0x47 | ||
49 | |||
50 | #define IVHD_FLAG_HT_TUN_EN 0x01 | ||
51 | #define IVHD_FLAG_PASSPW_EN 0x02 | ||
52 | #define IVHD_FLAG_RESPASSPW_EN 0x04 | ||
53 | #define IVHD_FLAG_ISOC_EN 0x08 | ||
54 | |||
55 | #define IVMD_FLAG_EXCL_RANGE 0x08 | ||
56 | #define IVMD_FLAG_UNITY_MAP 0x01 | ||
57 | |||
58 | #define ACPI_DEVFLAG_INITPASS 0x01 | ||
59 | #define ACPI_DEVFLAG_EXTINT 0x02 | ||
60 | #define ACPI_DEVFLAG_NMI 0x04 | ||
61 | #define ACPI_DEVFLAG_SYSMGT1 0x10 | ||
62 | #define ACPI_DEVFLAG_SYSMGT2 0x20 | ||
63 | #define ACPI_DEVFLAG_LINT0 0x40 | ||
64 | #define ACPI_DEVFLAG_LINT1 0x80 | ||
65 | #define ACPI_DEVFLAG_ATSDIS 0x10000000 | ||
66 | |||
67 | /* | ||
68 | * ACPI table definitions | ||
69 | * | ||
70 | * These data structures are laid over the table to parse the important values | ||
71 | * out of it. | ||
72 | */ | ||
73 | |||
74 | /* | ||
75 | * Structure describing one IOMMU in the ACPI table. Typically followed by | ||
76 | * one or more ivhd_entry structures. | ||
77 | */ | ||
78 | struct ivhd_header { | ||
79 | u8 type; | ||
80 | u8 flags; | ||
81 | u16 length; | ||
82 | u16 devid; | ||
83 | u16 cap_ptr; | ||
84 | u64 mmio_phys; | ||
85 | u16 pci_seg; | ||
86 | u16 info; | ||
87 | u32 reserved; | ||
88 | } __attribute__((packed)); | ||
89 | |||
90 | /* | ||
91 | * A device entry describing which devices a specific IOMMU translates and | ||
92 | * which requestor ids they use. | ||
93 | */ | ||
94 | struct ivhd_entry { | ||
95 | u8 type; | ||
96 | u16 devid; | ||
97 | u8 flags; | ||
98 | u32 ext; | ||
99 | } __attribute__((packed)); | ||
100 | |||
101 | /* | ||
102 | * An AMD IOMMU memory definition structure. It defines things like exclusion | ||
103 | * ranges for devices and regions that should be unity mapped. | ||
104 | */ | ||
105 | struct ivmd_header { | ||
106 | u8 type; | ||
107 | u8 flags; | ||
108 | u16 length; | ||
109 | u16 devid; | ||
110 | u16 aux; | ||
111 | u64 resv; | ||
112 | u64 range_start; | ||
113 | u64 range_length; | ||
114 | } __attribute__((packed)); | ||
115 | |||
116 | static int __initdata amd_iommu_detected; | ||
117 | |||
118 | u16 amd_iommu_last_bdf; /* largest PCI device id we have | ||
119 | to handle */ | ||
120 | LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings | ||
121 | we find in ACPI */ | ||
122 | unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ | ||
123 | int amd_iommu_isolate; /* if 1, device isolation is enabled */ | ||
124 | |||
125 | LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the | ||
126 | system */ | ||
127 | |||
128 | /* | ||
129 | * Pointer to the device table which is shared by all AMD IOMMUs. | ||
130 | * It is indexed by the PCI device id or the HT unit id and contains | ||
131 | * information about the domain the device belongs to as well as the | ||
132 | * page table root pointer. | ||
133 | */ | ||
134 | struct dev_table_entry *amd_iommu_dev_table; | ||
135 | |||
136 | /* | ||
137 | * The alias table is a driver specific data structure which contains the | ||
138 | * mappings of the PCI device ids to the actual requestor ids on the IOMMU. | ||
139 | * More than one device can share the same requestor id. | ||
140 | */ | ||
141 | u16 *amd_iommu_alias_table; | ||
142 | |||
143 | /* | ||
144 | * The rlookup table is used to find the IOMMU which is responsible | ||
145 | * for a specific device. It is also indexed by the PCI device id. | ||
146 | */ | ||
147 | struct amd_iommu **amd_iommu_rlookup_table; | ||
148 | |||
149 | /* | ||
150 | * The pd table (protection domain table) is used to find the protection domain | ||
151 | * data structure a device belongs to. Indexed with the PCI device id too. | ||
152 | */ | ||
153 | struct protection_domain **amd_iommu_pd_table; | ||
154 | |||
155 | /* | ||
156 | * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap | ||
157 | * to know which ones are already in use. | ||
158 | */ | ||
159 | unsigned long *amd_iommu_pd_alloc_bitmap; | ||
160 | |||
161 | static u32 dev_table_size; /* size of the device table */ | ||
162 | static u32 alias_table_size; /* size of the alias table */ | ||
163 | static u32 rlookup_table_size; /* size of the rlookup table */ | ||
164 | |||
165 | static inline void update_last_devid(u16 devid) | ||
166 | { | ||
167 | if (devid > amd_iommu_last_bdf) | ||
168 | amd_iommu_last_bdf = devid; | ||
169 | } | ||
170 | |||
171 | static inline unsigned long tbl_size(int entry_size) | ||
172 | { | ||
173 | unsigned shift = PAGE_SHIFT + | ||
174 | get_order(amd_iommu_last_bdf * entry_size); | ||
175 | |||
176 | return 1UL << shift; | ||
177 | } | ||
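/*
 * Worked example (assuming the 32-byte device table entries this driver
 * uses): with amd_iommu_last_bdf = 0xffff, get_order(0xffff * 32) is 9,
 * so the shift is PAGE_SHIFT + 9 = 21 and the table size is 2 MB.
 */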
178 | |||
179 | /**************************************************************************** | ||
180 | * | ||
181 | * AMD IOMMU MMIO register space handling functions | ||
182 | * | ||
183 | * These functions are used to program the IOMMU device registers in | ||
184 | * MMIO space required for that driver. | ||
185 | * | ||
186 | ****************************************************************************/ | ||
187 | |||
188 | /* | ||
189 | * This function sets the exclusion range in the IOMMU. DMA accesses to the | ||
190 | * exclusion range are passed through untranslated | ||
191 | */ | ||
192 | static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) | ||
193 | { | ||
194 | u64 start = iommu->exclusion_start & PAGE_MASK; | ||
195 | u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; | ||
196 | u64 entry; | ||
197 | |||
198 | if (!iommu->exclusion_start) | ||
199 | return; | ||
200 | |||
201 | entry = start | MMIO_EXCL_ENABLE_MASK; | ||
202 | memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET, | ||
203 | &entry, sizeof(entry)); | ||
204 | |||
205 | entry = limit; | ||
206 | memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET, | ||
207 | &entry, sizeof(entry)); | ||
208 | } | ||
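/*
 * Example: an exclusion range starting at 0x1000000 with length 0x100000
 * programs base = 0x1000000 | MMIO_EXCL_ENABLE_MASK and
 * limit = 0x1100000 into the two MMIO registers above.
 */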
209 | |||
210 | /* Programs the physical address of the device table into the IOMMU hardware */ | ||
211 | static void __init iommu_set_device_table(struct amd_iommu *iommu) | ||
212 | { | ||
213 | u32 entry; | ||
214 | |||
215 | BUG_ON(iommu->mmio_base == NULL); | ||
216 | |||
217 | entry = virt_to_phys(amd_iommu_dev_table); | ||
218 | entry |= (dev_table_size >> 12) - 1; | ||
219 | memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, | ||
220 | &entry, sizeof(entry)); | ||
221 | } | ||
222 | |||
223 | /* Generic functions to enable/disable certain features of the IOMMU. */ | ||
224 | static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) | ||
225 | { | ||
226 | u32 ctrl; | ||
227 | |||
228 | ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); | ||
229 | ctrl |= (1 << bit); | ||
230 | writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); | ||
231 | } | ||
232 | |||
233 | static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) | ||
234 | { | ||
235 | u32 ctrl; | ||
236 | |||
237 | ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); | ||
238 | ctrl &= ~(1 << bit); | ||
239 | writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); | ||
240 | } | ||
241 | |||
242 | /* Function to enable the hardware */ | ||
243 | void __init iommu_enable(struct amd_iommu *iommu) | ||
244 | { | ||
245 | printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); | ||
246 | print_devid(iommu->devid, 0); | ||
247 | printk(" cap 0x%hx\n", iommu->cap_ptr); | ||
248 | |||
249 | iommu_feature_enable(iommu, CONTROL_IOMMU_EN); | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in | ||
254 | * the system has one. | ||
255 | */ | ||
256 | static u8 * __init iommu_map_mmio_space(u64 address) | ||
257 | { | ||
258 | u8 *ret; | ||
259 | |||
260 | if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) | ||
261 | return NULL; | ||
262 | |||
263 | ret = ioremap_nocache(address, MMIO_REGION_LENGTH); | ||
264 | if (ret != NULL) | ||
265 | return ret; | ||
266 | |||
267 | release_mem_region(address, MMIO_REGION_LENGTH); | ||
268 | |||
269 | return NULL; | ||
270 | } | ||
271 | |||
272 | static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) | ||
273 | { | ||
274 | if (iommu->mmio_base) | ||
275 | iounmap(iommu->mmio_base); | ||
276 | release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); | ||
277 | } | ||
278 | |||
279 | /**************************************************************************** | ||
280 | * | ||
281 | * The functions below belong to the first pass of AMD IOMMU ACPI table | ||
282 | * parsing. In this pass we try to find out the highest device id this | ||
283 | * code has to handle. Upon this information the size of the shared data | ||
284 | * structures is determined later. | ||
285 | * | ||
286 | ****************************************************************************/ | ||
287 | |||
288 | /* | ||
289 | * This function reads the last device id the IOMMU has to handle from the PCI | ||
290 | * capability header for this IOMMU | ||
291 | */ | ||
292 | static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) | ||
293 | { | ||
294 | u32 cap; | ||
295 | |||
296 | cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); | ||
297 | update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); | ||
298 | |||
299 | return 0; | ||
300 | } | ||
301 | |||
302 | /* | ||
303 | * After reading the highest device id from the IOMMU PCI capability header | ||
304 | * this function checks whether a higher device id is defined in the ACPI table | ||
305 | */ | ||
306 | static int __init find_last_devid_from_ivhd(struct ivhd_header *h) | ||
307 | { | ||
308 | u8 *p = (void *)h, *end = (void *)h; | ||
309 | struct ivhd_entry *dev; | ||
310 | |||
311 | p += sizeof(*h); | ||
312 | end += h->length; | ||
313 | |||
314 | find_last_devid_on_pci(PCI_BUS(h->devid), | ||
315 | PCI_SLOT(h->devid), | ||
316 | PCI_FUNC(h->devid), | ||
317 | h->cap_ptr); | ||
318 | |||
319 | while (p < end) { | ||
320 | dev = (struct ivhd_entry *)p; | ||
321 | switch (dev->type) { | ||
322 | case IVHD_DEV_SELECT: | ||
323 | case IVHD_DEV_RANGE_END: | ||
324 | case IVHD_DEV_ALIAS: | ||
325 | case IVHD_DEV_EXT_SELECT: | ||
326 | /* all the above subfield types refer to device ids */ | ||
327 | update_last_devid(dev->devid); | ||
328 | break; | ||
329 | default: | ||
330 | break; | ||
331 | } | ||
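		/*
		 * The entry length is encoded in the two most significant
		 * bits of the type byte: 0x04 << (*p >> 6) yields a 4-, 8-,
		 * 16- or 32-byte stride.
		 */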
332 | p += 0x04 << (*p >> 6); | ||
333 | } | ||
334 | |||
335 | WARN_ON(p != end); | ||
336 | |||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | /* | ||
341 | * Iterate over all IVHD entries in the ACPI table and find the highest device | ||
342 | * id which we need to handle. This is the first of three functions which parse | ||
343 | * the ACPI table. So we check the checksum here. | ||
344 | */ | ||
345 | static int __init find_last_devid_acpi(struct acpi_table_header *table) | ||
346 | { | ||
347 | int i; | ||
348 | u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table; | ||
349 | struct ivhd_header *h; | ||
350 | |||
351 | /* | ||
352 | * Validate checksum here so we don't need to do it when | ||
353 | * we actually parse the table | ||
354 | */ | ||
355 | for (i = 0; i < table->length; ++i) | ||
356 | checksum += p[i]; | ||
357 | if (checksum != 0) | ||
358 | /* ACPI table corrupt */ | ||
359 | return -ENODEV; | ||
360 | |||
361 | p += IVRS_HEADER_LENGTH; | ||
362 | |||
363 | end += table->length; | ||
364 | while (p < end) { | ||
365 | h = (struct ivhd_header *)p; | ||
366 | switch (h->type) { | ||
367 | case ACPI_IVHD_TYPE: | ||
368 | find_last_devid_from_ivhd(h); | ||
369 | break; | ||
370 | default: | ||
371 | break; | ||
372 | } | ||
373 | p += h->length; | ||
374 | } | ||
375 | WARN_ON(p != end); | ||
376 | |||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | /**************************************************************************** | ||
381 | * | ||
382 | * The following functions belong to the code path which parses the ACPI table | ||
383 | * the second time. In this ACPI parsing iteration we allocate IOMMU specific | ||
384 | * data structures, initialize the device/alias/rlookup table and also | ||
385 | * basically initialize the hardware. | ||
386 | * | ||
387 | ****************************************************************************/ | ||
388 | |||
389 | /* | ||
390 | * Allocates the command buffer. This buffer is per AMD IOMMU. We can | ||
391 | * write commands to that buffer later and the IOMMU will execute them | ||
392 | * asynchronously | ||
393 | */ | ||
394 | static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) | ||
395 | { | ||
396 | u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, | ||
397 | get_order(CMD_BUFFER_SIZE)); | ||
398 | u64 entry; | ||
399 | |||
400 | if (cmd_buf == NULL) | ||
401 | return NULL; | ||
402 | |||
403 | iommu->cmd_buf_size = CMD_BUFFER_SIZE; | ||
404 | |||
405 | entry = (u64)virt_to_phys(cmd_buf); | ||
406 | entry |= MMIO_CMD_SIZE_512; | ||
407 | memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, | ||
408 | &entry, sizeof(entry)); | ||
409 | |||
410 | iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); | ||
411 | |||
412 | return cmd_buf; | ||
413 | } | ||
414 | |||
415 | static void __init free_command_buffer(struct amd_iommu *iommu) | ||
416 | { | ||
417 | free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); | ||
418 | } | ||
419 | |||
420 | /* sets a specific bit in the device table entry. */ | ||
421 | static void set_dev_entry_bit(u16 devid, u8 bit) | ||
422 | { | ||
423 | int i = (bit >> 5) & 0x07; | ||
424 | int _bit = bit & 0x1f; | ||
425 | |||
426 | amd_iommu_dev_table[devid].data[i] |= (1 << _bit); | ||
427 | } | ||
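/*
 * Worked example: bit 67 selects i = (67 >> 5) & 0x07 = 2 and
 * _bit = 67 & 0x1f = 3, i.e. it sets bit 3 of data[2].
 */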
428 | |||
429 | /* Writes the specific IOMMU for a device into the rlookup table */ | ||
430 | static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) | ||
431 | { | ||
432 | amd_iommu_rlookup_table[devid] = iommu; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * This function takes the device specific flags read from the ACPI | ||
437 | * table and sets up the device table entry with that information | ||
438 | */ | ||
439 | static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, | ||
440 | u16 devid, u32 flags, u32 ext_flags) | ||
441 | { | ||
442 | if (flags & ACPI_DEVFLAG_INITPASS) | ||
443 | set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); | ||
444 | if (flags & ACPI_DEVFLAG_EXTINT) | ||
445 | set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS); | ||
446 | if (flags & ACPI_DEVFLAG_NMI) | ||
447 | set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS); | ||
448 | if (flags & ACPI_DEVFLAG_SYSMGT1) | ||
449 | set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1); | ||
450 | if (flags & ACPI_DEVFLAG_SYSMGT2) | ||
451 | set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2); | ||
452 | if (flags & ACPI_DEVFLAG_LINT0) | ||
453 | set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); | ||
454 | if (flags & ACPI_DEVFLAG_LINT1) | ||
455 | set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); | ||
456 | |||
457 | set_iommu_for_device(iommu, devid); | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * Reads the device exclusion range from ACPI and initializes the | ||
462 | * IOMMU with it | ||
463 | */ | ||
464 | static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) | ||
465 | { | ||
466 | struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; | ||
467 | |||
468 | if (!(m->flags & IVMD_FLAG_EXCL_RANGE)) | ||
469 | return; | ||
470 | |||
471 | if (iommu) { | ||
472 | /* | ||
473 | * We can only configure exclusion ranges per IOMMU, not | ||
474 | * per device. But we can enable the exclusion range per | ||
475 | * device; this is done here | ||
476 | */ | ||
477 | set_dev_entry_bit(m->devid, DEV_ENTRY_EX); | ||
478 | iommu->exclusion_start = m->range_start; | ||
479 | iommu->exclusion_length = m->range_length; | ||
480 | } | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * This function reads some important data from the IOMMU PCI space and | ||
485 | * initializes the driver data structure with it. It reads the hardware | ||
486 | * capabilities and the first/last device entries | ||
487 | */ | ||
488 | static void __init init_iommu_from_pci(struct amd_iommu *iommu) | ||
489 | { | ||
490 | int bus = PCI_BUS(iommu->devid); | ||
491 | int dev = PCI_SLOT(iommu->devid); | ||
492 | int fn = PCI_FUNC(iommu->devid); | ||
493 | int cap_ptr = iommu->cap_ptr; | ||
494 | u32 range; | ||
495 | |||
496 | iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); | ||
497 | |||
498 | range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); | ||
499 | iommu->first_device = calc_devid(MMIO_GET_BUS(range), | ||
500 | MMIO_GET_FD(range)); | ||
501 | iommu->last_device = calc_devid(MMIO_GET_BUS(range), | ||
502 | MMIO_GET_LD(range)); | ||
503 | } | ||
504 | |||
505 | /* | ||
506 | * Takes a pointer to an AMD IOMMU entry in the ACPI table and | ||
507 | * initializes the hardware and our data structures with it. | ||
508 | */ | ||
509 | static void __init init_iommu_from_acpi(struct amd_iommu *iommu, | ||
510 | struct ivhd_header *h) | ||
511 | { | ||
512 | u8 *p = (u8 *)h; | ||
513 | u8 *end = p, flags = 0; | ||
514 | u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; | ||
515 | u32 ext_flags = 0; | ||
516 | bool alias = false; | ||
517 | struct ivhd_entry *e; | ||
518 | |||
519 | /* | ||
520 | * First set the recommended feature enable bits from ACPI | ||
521 | * into the IOMMU control registers | ||
522 | */ | ||
523 | h->flags & IVHD_FLAG_HT_TUN_EN ? | ||
524 | iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : | ||
525 | iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); | ||
526 | |||
527 | h->flags & IVHD_FLAG_PASSPW_EN ? | ||
528 | iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : | ||
529 | iommu_feature_disable(iommu, CONTROL_PASSPW_EN); | ||
530 | |||
531 | h->flags & IVHD_FLAG_RESPASSPW_EN ? | ||
532 | iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : | ||
533 | iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); | ||
534 | |||
535 | h->flags & IVHD_FLAG_ISOC_EN ? | ||
536 | iommu_feature_enable(iommu, CONTROL_ISOC_EN) : | ||
537 | iommu_feature_disable(iommu, CONTROL_ISOC_EN); | ||
538 | |||
539 | /* | ||
540 | * make IOMMU memory accesses cache coherent | ||
541 | */ | ||
542 | iommu_feature_enable(iommu, CONTROL_COHERENT_EN); | ||
543 | |||
544 | /* | ||
545 | * Done. Now parse the device entries | ||
546 | */ | ||
547 | p += sizeof(struct ivhd_header); | ||
548 | end += h->length; | ||
549 | |||
550 | while (p < end) { | ||
551 | e = (struct ivhd_entry *)p; | ||
552 | switch (e->type) { | ||
553 | case IVHD_DEV_ALL: | ||
554 | for (dev_i = iommu->first_device; | ||
555 | dev_i <= iommu->last_device; ++dev_i) | ||
556 | set_dev_entry_from_acpi(iommu, dev_i, | ||
557 | e->flags, 0); | ||
558 | break; | ||
559 | case IVHD_DEV_SELECT: | ||
560 | devid = e->devid; | ||
561 | set_dev_entry_from_acpi(iommu, devid, e->flags, 0); | ||
562 | break; | ||
563 | case IVHD_DEV_SELECT_RANGE_START: | ||
564 | devid_start = e->devid; | ||
565 | flags = e->flags; | ||
566 | ext_flags = 0; | ||
567 | alias = false; | ||
568 | break; | ||
569 | case IVHD_DEV_ALIAS: | ||
570 | devid = e->devid; | ||
571 | devid_to = e->ext >> 8; | ||
572 | set_dev_entry_from_acpi(iommu, devid, e->flags, 0); | ||
573 | amd_iommu_alias_table[devid] = devid_to; | ||
574 | break; | ||
575 | case IVHD_DEV_ALIAS_RANGE: | ||
576 | devid_start = e->devid; | ||
577 | flags = e->flags; | ||
578 | devid_to = e->ext >> 8; | ||
579 | ext_flags = 0; | ||
580 | alias = true; | ||
581 | break; | ||
582 | case IVHD_DEV_EXT_SELECT: | ||
583 | devid = e->devid; | ||
584 | set_dev_entry_from_acpi(iommu, devid, e->flags, | ||
585 | e->ext); | ||
586 | break; | ||
587 | case IVHD_DEV_EXT_SELECT_RANGE: | ||
588 | devid_start = e->devid; | ||
589 | flags = e->flags; | ||
590 | ext_flags = e->ext; | ||
591 | alias = false; | ||
592 | break; | ||
593 | case IVHD_DEV_RANGE_END: | ||
594 | devid = e->devid; | ||
595 | for (dev_i = devid_start; dev_i <= devid; ++dev_i) { | ||
596 | if (alias) | ||
597 | amd_iommu_alias_table[dev_i] = devid_to; | ||
598 | set_dev_entry_from_acpi(iommu, | ||
599 | amd_iommu_alias_table[dev_i], | ||
600 | flags, ext_flags); | ||
601 | } | ||
602 | break; | ||
603 | default: | ||
604 | break; | ||
605 | } | ||
606 | |||
607 | p += 0x04 << (e->type >> 6); | ||
608 | } | ||
609 | } | ||
610 | |||
611 | /* Initializes the device->iommu mapping for the driver */ | ||
612 | static int __init init_iommu_devices(struct amd_iommu *iommu) | ||
613 | { | ||
614 | u16 i; | ||
615 | |||
616 | for (i = iommu->first_device; i <= iommu->last_device; ++i) | ||
617 | set_iommu_for_device(iommu, i); | ||
618 | |||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | static void __init free_iommu_one(struct amd_iommu *iommu) | ||
623 | { | ||
624 | free_command_buffer(iommu); | ||
625 | iommu_unmap_mmio_space(iommu); | ||
626 | } | ||
627 | |||
628 | static void __init free_iommu_all(void) | ||
629 | { | ||
630 | struct amd_iommu *iommu, *next; | ||
631 | |||
632 | list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { | ||
633 | list_del(&iommu->list); | ||
634 | free_iommu_one(iommu); | ||
635 | kfree(iommu); | ||
636 | } | ||
637 | } | ||
638 | |||
639 | /* | ||
640 | * This function glues the initialization of one IOMMU | ||
641 | * together and also allocates the command buffer and programs the | ||
642 | * hardware. It does NOT enable the IOMMU. This is done afterwards. | ||
643 | */ | ||
644 | static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) | ||
645 | { | ||
646 | spin_lock_init(&iommu->lock); | ||
647 | list_add_tail(&iommu->list, &amd_iommu_list); | ||
648 | |||
649 | /* | ||
650 | * Copy data from ACPI table entry to the iommu struct | ||
651 | */ | ||
652 | iommu->devid = h->devid; | ||
653 | iommu->cap_ptr = h->cap_ptr; | ||
654 | iommu->mmio_phys = h->mmio_phys; | ||
655 | iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); | ||
656 | if (!iommu->mmio_base) | ||
657 | return -ENOMEM; | ||
658 | |||
659 | iommu_set_device_table(iommu); | ||
660 | iommu->cmd_buf = alloc_command_buffer(iommu); | ||
661 | if (!iommu->cmd_buf) | ||
662 | return -ENOMEM; | ||
663 | |||
664 | init_iommu_from_pci(iommu); | ||
665 | init_iommu_from_acpi(iommu, h); | ||
666 | init_iommu_devices(iommu); | ||
667 | |||
668 | return 0; | ||
669 | } | ||
670 | |||
671 | /* | ||
672 | * Iterates over all IOMMU entries in the ACPI table, allocates the | ||
673 | * IOMMU structure and initializes it with init_iommu_one() | ||
674 | */ | ||
675 | static int __init init_iommu_all(struct acpi_table_header *table) | ||
676 | { | ||
677 | u8 *p = (u8 *)table, *end = (u8 *)table; | ||
678 | struct ivhd_header *h; | ||
679 | struct amd_iommu *iommu; | ||
680 | int ret; | ||
681 | |||
682 | end += table->length; | ||
683 | p += IVRS_HEADER_LENGTH; | ||
684 | |||
685 | while (p < end) { | ||
686 | h = (struct ivhd_header *)p; | ||
687 | switch (*p) { | ||
688 | case ACPI_IVHD_TYPE: | ||
689 | iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); | ||
690 | if (iommu == NULL) | ||
691 | return -ENOMEM; | ||
692 | ret = init_iommu_one(iommu, h); | ||
693 | if (ret) | ||
694 | return ret; | ||
695 | break; | ||
696 | default: | ||
697 | break; | ||
698 | } | ||
699 | p += h->length; | ||
700 | |||
701 | } | ||
702 | WARN_ON(p != end); | ||
703 | |||
704 | return 0; | ||
705 | } | ||
706 | |||
707 | /**************************************************************************** | ||
708 | * | ||
709 | * The next functions belong to the third pass of parsing the ACPI | ||
710 | * table. In this last pass the memory mapping requirements are | ||
711 | * gathered (like exclusion and unity mapping ranges). | ||
712 | * | ||
713 | ****************************************************************************/ | ||
714 | |||
715 | static void __init free_unity_maps(void) | ||
716 | { | ||
717 | struct unity_map_entry *entry, *next; | ||
718 | |||
719 | list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) { | ||
720 | list_del(&entry->list); | ||
721 | kfree(entry); | ||
722 | } | ||
723 | } | ||
724 | |||
725 | /* called when we find an exclusion range definition in ACPI */ | ||
726 | static int __init init_exclusion_range(struct ivmd_header *m) | ||
727 | { | ||
728 | int i; | ||
729 | |||
730 | switch (m->type) { | ||
731 | case ACPI_IVMD_TYPE: | ||
732 | set_device_exclusion_range(m->devid, m); | ||
733 | break; | ||
734 | case ACPI_IVMD_TYPE_ALL: | ||
735 | for (i = 0; i <= amd_iommu_last_bdf; ++i) | ||
736 | set_device_exclusion_range(i, m); | ||
737 | break; | ||
738 | case ACPI_IVMD_TYPE_RANGE: | ||
739 | for (i = m->devid; i <= m->aux; ++i) | ||
740 | set_device_exclusion_range(i, m); | ||
741 | break; | ||
742 | default: | ||
743 | break; | ||
744 | } | ||
745 | |||
746 | return 0; | ||
747 | } | ||
748 | |||
749 | /* called for unity map ACPI definition */ | ||
750 | static int __init init_unity_map_range(struct ivmd_header *m) | ||
751 | { | ||
752 | struct unity_map_entry *e = NULL; | ||
753 | |||
754 | e = kzalloc(sizeof(*e), GFP_KERNEL); | ||
755 | if (e == NULL) | ||
756 | return -ENOMEM; | ||
757 | |||
758 | switch (m->type) { | ||
759 | default: | ||
760 | case ACPI_IVMD_TYPE: | ||
761 | e->devid_start = e->devid_end = m->devid; | ||
762 | break; | ||
763 | case ACPI_IVMD_TYPE_ALL: | ||
764 | e->devid_start = 0; | ||
765 | e->devid_end = amd_iommu_last_bdf; | ||
766 | break; | ||
767 | case ACPI_IVMD_TYPE_RANGE: | ||
768 | e->devid_start = m->devid; | ||
769 | e->devid_end = m->aux; | ||
770 | break; | ||
771 | } | ||
772 | e->address_start = PAGE_ALIGN(m->range_start); | ||
773 | e->address_end = e->address_start + PAGE_ALIGN(m->range_length); | ||
774 | e->prot = m->flags >> 1; | ||
775 | |||
776 | list_add_tail(&e->list, &amd_iommu_unity_map); | ||
777 | |||
778 | return 0; | ||
779 | } | ||
780 | |||
781 | /* iterates over all memory definitions we find in the ACPI table */ | ||
782 | static int __init init_memory_definitions(struct acpi_table_header *table) | ||
783 | { | ||
784 | u8 *p = (u8 *)table, *end = (u8 *)table; | ||
785 | struct ivmd_header *m; | ||
786 | |||
787 | end += table->length; | ||
788 | p += IVRS_HEADER_LENGTH; | ||
789 | |||
790 | while (p < end) { | ||
791 | m = (struct ivmd_header *)p; | ||
792 | if (m->flags & IVMD_FLAG_EXCL_RANGE) | ||
793 | init_exclusion_range(m); | ||
794 | else if (m->flags & IVMD_FLAG_UNITY_MAP) | ||
795 | init_unity_map_range(m); | ||
796 | |||
797 | p += m->length; | ||
798 | } | ||
799 | |||
800 | return 0; | ||
801 | } | ||
802 | |||
803 | /* | ||
804 | * This function finally enables all IOMMUs found in the system after | ||
805 | * they have been initialized | ||
806 | */ | ||
807 | static void __init enable_iommus(void) | ||
808 | { | ||
809 | struct amd_iommu *iommu; | ||
810 | |||
811 | list_for_each_entry(iommu, &amd_iommu_list, list) { | ||
812 | iommu_set_exclusion_range(iommu); | ||
813 | iommu_enable(iommu); | ||
814 | } | ||
815 | } | ||
816 | |||
817 | /* | ||
818 | * Suspend/Resume support | ||
819 | * disable suspend until real resume is implemented | ||
820 | */ | ||
821 | |||
822 | static int amd_iommu_resume(struct sys_device *dev) | ||
823 | { | ||
824 | return 0; | ||
825 | } | ||
826 | |||
827 | static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) | ||
828 | { | ||
829 | return -EINVAL; | ||
830 | } | ||
831 | |||
832 | static struct sysdev_class amd_iommu_sysdev_class = { | ||
833 | .name = "amd_iommu", | ||
834 | .suspend = amd_iommu_suspend, | ||
835 | .resume = amd_iommu_resume, | ||
836 | }; | ||
837 | |||
838 | static struct sys_device device_amd_iommu = { | ||
839 | .id = 0, | ||
840 | .cls = &amd_iommu_sysdev_class, | ||
841 | }; | ||
842 | |||
843 | /* | ||
844 | * This is the core init function for AMD IOMMU hardware in the system. | ||
845 | * This function is called from the generic x86 DMA layer initialization | ||
846 | * code. | ||
847 | * | ||
848 | * This function basically parses the ACPI table for AMD IOMMU (IVRS) | ||
849 | * three times: | ||
850 | * | ||
851 | * 1st pass) Find the highest PCI device id the driver has to handle. | ||
852 | * Based on this information the sizes of the data | ||
853 | * structures that need to be allocated are determined. | ||
854 | * | ||
855 | * 2nd pass) Initialize the data structures just allocated with the | ||
856 | * information in the ACPI table about available AMD IOMMUs | ||
857 | * in the system. It also maps the PCI devices in the | ||
858 | * system to specific IOMMUs | ||
859 | * | ||
860 | * 3rd pass) After the basic data structures are allocated and | ||
861 | * initialized we update them with information about memory | ||
862 | * remapping requirements parsed out of the ACPI table in | ||
863 | * this last pass. | ||
864 | * | ||
865 | * After that the hardware is initialized and ready to go. In the last | ||
866 | * step we do some Linux specific things like registering the driver in | ||
867 | * the dma_ops interface and initializing the suspend/resume support | ||
868 | * functions. Finally it prints some information about AMD IOMMUs and | ||
869 | * the driver state and enables the hardware. | ||
870 | */ | ||
871 | int __init amd_iommu_init(void) | ||
872 | { | ||
873 | int i, ret = 0; | ||
874 | |||
875 | |||
876 | if (no_iommu) { | ||
877 | printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); | ||
878 | return 0; | ||
879 | } | ||
880 | |||
881 | if (!amd_iommu_detected) | ||
882 | return -ENODEV; | ||
883 | |||
884 | /* | ||
885 | * First parse ACPI tables to find the largest Bus/Dev/Func | ||
886 | * we need to handle. Based on this information, the shared data | ||
887 | * structures for the IOMMUs in the system will be allocated. | ||
888 | */ | ||
889 | if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) | ||
890 | return -ENODEV; | ||
891 | |||
892 | dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); | ||
893 | alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); | ||
894 | rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); | ||
895 | |||
896 | ret = -ENOMEM; | ||
897 | |||
898 | /* Device table - directly used by all IOMMUs */ | ||
899 | amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, | ||
900 | get_order(dev_table_size)); | ||
901 | if (amd_iommu_dev_table == NULL) | ||
902 | goto out; | ||
903 | |||
904 | /* | ||
905 | * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the | ||
906 | * IOMMU sees for that device | ||
907 | */ | ||
908 | amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL, | ||
909 | get_order(alias_table_size)); | ||
910 | if (amd_iommu_alias_table == NULL) | ||
911 | goto free; | ||
912 | |||
913 | /* IOMMU rlookup table - find the IOMMU for a specific device */ | ||
914 | amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, | ||
915 | get_order(rlookup_table_size)); | ||
916 | if (amd_iommu_rlookup_table == NULL) | ||
917 | goto free; | ||
918 | |||
919 | /* | ||
920 | * Protection Domain table - maps devices to protection domains | ||
921 | * This table has the same size as the rlookup_table | ||
922 | */ | ||
923 | amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, | ||
924 | get_order(rlookup_table_size)); | ||
925 | if (amd_iommu_pd_table == NULL) | ||
926 | goto free; | ||
927 | |||
928 | amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( | ||
929 | GFP_KERNEL | __GFP_ZERO, | ||
930 | get_order(MAX_DOMAIN_ID/8)); | ||
931 | if (amd_iommu_pd_alloc_bitmap == NULL) | ||
932 | goto free; | ||
933 | |||
934 | /* | ||
935 | * let each alias entry point to itself | ||
936 | */ | ||
937 | for (i = 0; i <= amd_iommu_last_bdf; ++i) | ||
938 | amd_iommu_alias_table[i] = i; | ||
939 | |||
940 | /* | ||
941 | * never allocate domain 0 because it's used as the non-allocated and | ||
942 | * error value placeholder | ||
943 | */ | ||
944 | amd_iommu_pd_alloc_bitmap[0] = 1; | ||
945 | |||
946 | /* | ||
947 | * now that the data structures are allocated and basically initialized, | ||
948 | * start the real ACPI table scan | ||
949 | */ | ||
950 | ret = -ENODEV; | ||
951 | if (acpi_table_parse("IVRS", init_iommu_all) != 0) | ||
952 | goto free; | ||
953 | |||
954 | if (acpi_table_parse("IVRS", init_memory_definitions) != 0) | ||
955 | goto free; | ||
956 | |||
957 | ret = amd_iommu_init_dma_ops(); | ||
958 | if (ret) | ||
959 | goto free; | ||
960 | |||
961 | ret = sysdev_class_register(&amd_iommu_sysdev_class); | ||
962 | if (ret) | ||
963 | goto free; | ||
964 | |||
965 | ret = sysdev_register(&device_amd_iommu); | ||
966 | if (ret) | ||
967 | goto free; | ||
968 | |||
969 | enable_iommus(); | ||
970 | |||
971 | printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", | ||
972 | (1 << (amd_iommu_aperture_order-20))); | ||
973 | |||
974 | printk(KERN_INFO "AMD IOMMU: device isolation "); | ||
975 | if (amd_iommu_isolate) | ||
976 | printk("enabled\n"); | ||
977 | else | ||
978 | printk("disabled\n"); | ||
979 | |||
980 | out: | ||
981 | return ret; | ||
982 | |||
983 | free: | ||
984 | free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); | ||
985 | |||
986 | free_pages((unsigned long)amd_iommu_pd_table, | ||
987 | get_order(rlookup_table_size)); | ||
988 | |||
989 | free_pages((unsigned long)amd_iommu_rlookup_table, | ||
990 | get_order(rlookup_table_size)); | ||
991 | |||
992 | free_pages((unsigned long)amd_iommu_alias_table, | ||
993 | get_order(alias_table_size)); | ||
994 | |||
995 | free_pages((unsigned long)amd_iommu_dev_table, | ||
996 | get_order(dev_table_size)); | ||
997 | |||
998 | free_iommu_all(); | ||
999 | |||
1000 | free_unity_maps(); | ||
1001 | |||
1002 | goto out; | ||
1003 | } | ||
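The three tbl_size() calls above size the shared tables from the highest BDF found in the first IVRS pass, and the final printk converts a power-of-two byte order to MB via (1 << (order - 20)). A rough standalone sketch of that arithmetic follows; the 32-byte device-table entry size and the page-order rounding are assumptions here, not quoted from this file:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* smallest order such that (PAGE_SIZE << order) >= size */
    static int get_order(unsigned long size)
    {
            int order = 0;
            while ((PAGE_SIZE << order) < size)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned long last_bdf = 0xffff;      /* worst case: bus 255, dev 31, fn 7 */
            unsigned long entries = last_bdf + 1; /* one table entry per BDF */
            unsigned long dev_table_size = entries * 32; /* assumed 32-byte entries */

            printf("device table: %lu KB (order %d)\n",
                   dev_table_size >> 10, get_order(dev_table_size));
            return 0;
    }

With these assumptions the worst case is a 2 MB device table (order 9), which is why the allocations above go through __get_free_pages() with get_order() rather than kmalloc().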
1004 | |||
1005 | /**************************************************************************** | ||
1006 | * | ||
1007 | * Early detect code. This code runs at IOMMU detection time in the DMA | ||
1008 | * layer. It just checks whether there is an IVRS ACPI table to detect | ||
1009 | * AMD IOMMUs. | ||
1010 | * | ||
1011 | ****************************************************************************/ | ||
1012 | static int __init early_amd_iommu_detect(struct acpi_table_header *table) | ||
1013 | { | ||
1014 | return 0; | ||
1015 | } | ||
1016 | |||
1017 | void __init amd_iommu_detect(void) | ||
1018 | { | ||
1019 | if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) | ||
1020 | return; | ||
1021 | |||
1022 | if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { | ||
1023 | iommu_detected = 1; | ||
1024 | amd_iommu_detected = 1; | ||
1025 | #ifdef CONFIG_GART_IOMMU | ||
1026 | gart_iommu_aperture_disabled = 1; | ||
1027 | gart_iommu_aperture = 0; | ||
1028 | #endif | ||
1029 | } | ||
1030 | } | ||
1031 | |||
1032 | /**************************************************************************** | ||
1033 | * | ||
1034 | * Parsing functions for the AMD IOMMU specific kernel command line | ||
1035 | * options. | ||
1036 | * | ||
1037 | ****************************************************************************/ | ||
1038 | |||
1039 | static int __init parse_amd_iommu_options(char *str) | ||
1040 | { | ||
1041 | for (; *str; ++str) { | ||
1042 | if (strcmp(str, "isolate") == 0) | ||
1043 | amd_iommu_isolate = 1; | ||
1044 | } | ||
1045 | |||
1046 | return 1; | ||
1047 | } | ||
1048 | |||
1049 | static int __init parse_amd_iommu_size_options(char *str) | ||
1050 | { | ||
1051 | unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); | ||
1052 | |||
1053 | if ((order > 24) && (order < 31)) | ||
1054 | amd_iommu_aperture_order = order; | ||
1055 | |||
1056 | return 1; | ||
1057 | } | ||
1058 | |||
1059 | __setup("amd_iommu=", parse_amd_iommu_options); | ||
1060 | __setup("amd_iommu_size=", parse_amd_iommu_size_options); | ||
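Both handlers above are wired up with __setup(), so they run during early boot option parsing: "amd_iommu=isolate" flips the isolation flag, and "amd_iommu_size=" accepts the usual K/M/G suffixes via memparse(). A worked check of the size math (get_order() paraphrased, 4K pages assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* smallest order such that (1 << (PAGE_SHIFT + order)) >= size */
    static int get_order(unsigned long size)
    {
            int order = 0;
            while ((1UL << (PAGE_SHIFT + order)) < size)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned long size = 64UL << 20;   /* i.e. "amd_iommu_size=64M" */
            unsigned order = PAGE_SHIFT + get_order(size);

            if ((order > 24) && (order < 31))  /* accepted range: 32M..1G apertures */
                    printf("aperture order %u -> %lu MB\n",
                           order, 1UL << (order - 20));
            return 0;
    }

This prints "aperture order 26 -> 64 MB"; anything outside the 32M..1G window is silently ignored and the default aperture order is kept.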
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 479926d9e004..44e21826db11 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> |
22 | #include <asm/e820.h> | 22 | #include <asm/e820.h> |
23 | #include <asm/io.h> | 23 | #include <asm/io.h> |
24 | #include <asm/iommu.h> | ||
24 | #include <asm/gart.h> | 25 | #include <asm/gart.h> |
25 | #include <asm/pci-direct.h> | 26 | #include <asm/pci-direct.h> |
26 | #include <asm/dma.h> | 27 | #include <asm/dma.h> |
@@ -35,6 +36,18 @@ int fallback_aper_force __initdata; | |||
35 | 36 | ||
36 | int fix_aperture __initdata = 1; | 37 | int fix_aperture __initdata = 1; |
37 | 38 | ||
39 | struct bus_dev_range { | ||
40 | int bus; | ||
41 | int dev_base; | ||
42 | int dev_limit; | ||
43 | }; | ||
44 | |||
45 | static struct bus_dev_range bus_dev_ranges[] __initdata = { | ||
46 | { 0x00, 0x18, 0x20}, | ||
47 | { 0xff, 0x00, 0x20}, | ||
48 | { 0xfe, 0x00, 0x20} | ||
49 | }; | ||
50 | |||
38 | static struct resource gart_resource = { | 51 | static struct resource gart_resource = { |
39 | .name = "GART", | 52 | .name = "GART", |
40 | .flags = IORESOURCE_MEM, | 53 | .flags = IORESOURCE_MEM, |
@@ -55,8 +68,9 @@ static u32 __init allocate_aperture(void) | |||
55 | u32 aper_size; | 68 | u32 aper_size; |
56 | void *p; | 69 | void *p; |
57 | 70 | ||
58 | if (fallback_aper_order > 7) | 71 | /* aper_size should be <= 1G */
59 | fallback_aper_order = 7; | 72 | if (fallback_aper_order > 5) |
73 | fallback_aper_order = 5; | ||
60 | aper_size = (32 * 1024 * 1024) << fallback_aper_order; | 74 | aper_size = (32 * 1024 * 1024) << fallback_aper_order; |
61 | 75 | ||
62 | /* | 76 | /* |
@@ -65,7 +79,20 @@ static u32 __init allocate_aperture(void) | |||
65 | * memory. Unfortunately we cannot move it up because that would | 79 | * memory. Unfortunately we cannot move it up because that would |
66 | * make the IOMMU useless. | 80 | * make the IOMMU useless. |
67 | */ | 81 | */ |
68 | p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); | 82 | /* |
83 | * use 512M as the goal, in case kexec will load a big kernel | ||
84 | * that does in-place decompression and could overlap the | ||
85 | * position of the GART currently in use. | ||
86 | * sequence: | ||
87 | * kernel_small | ||
88 | * ==> kexec (with kdump trigger path or previous doesn't shut down gart) | ||
89 | * ==> kernel_small (gart area becomes e820 reserved) | ||
90 | * ==> kexec (with kdump trigger path or previous doesn't shut down gart) | ||
91 | * ==> kernel_big (uncompressed size will be bigger than 64M or 128M) | ||
92 | * so don't place the GART IOMMU below 512M; leave that space for | ||
93 | * kernel code, to be safe | ||
94 | */ | ||
95 | p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); | ||
69 | if (!p || __pa(p)+aper_size > 0xffffffff) { | 96 | if (!p || __pa(p)+aper_size > 0xffffffff) { |
70 | printk(KERN_ERR | 97 | printk(KERN_ERR |
71 | "Cannot allocate aperture memory hole (%p,%uK)\n", | 98 | "Cannot allocate aperture memory hole (%p,%uK)\n", |
@@ -83,69 +110,53 @@ static u32 __init allocate_aperture(void) | |||
83 | return (u32)__pa(p); | 110 | return (u32)__pa(p); |
84 | } | 111 | } |
85 | 112 | ||
86 | static int __init aperture_valid(u64 aper_base, u32 aper_size) | ||
87 | { | ||
88 | if (!aper_base) | ||
89 | return 0; | ||
90 | |||
91 | if (aper_base + aper_size > 0x100000000UL) { | ||
92 | printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); | ||
93 | return 0; | ||
94 | } | ||
95 | if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { | ||
96 | printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); | ||
97 | return 0; | ||
98 | } | ||
99 | if (aper_size < 64*1024*1024) { | ||
100 | printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | return 1; | ||
105 | } | ||
106 | 113 | ||
107 | /* Find a PCI capability */ | 114 | /* Find a PCI capability */ |
108 | static __u32 __init find_cap(int num, int slot, int func, int cap) | 115 | static u32 __init find_cap(int bus, int slot, int func, int cap) |
109 | { | 116 | { |
110 | int bytes; | 117 | int bytes; |
111 | u8 pos; | 118 | u8 pos; |
112 | 119 | ||
113 | if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & | 120 | if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & |
114 | PCI_STATUS_CAP_LIST)) | 121 | PCI_STATUS_CAP_LIST)) |
115 | return 0; | 122 | return 0; |
116 | 123 | ||
117 | pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); | 124 | pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); |
118 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { | 125 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { |
119 | u8 id; | 126 | u8 id; |
120 | 127 | ||
121 | pos &= ~3; | 128 | pos &= ~3; |
122 | id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); | 129 | id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); |
123 | if (id == 0xff) | 130 | if (id == 0xff) |
124 | break; | 131 | break; |
125 | if (id == cap) | 132 | if (id == cap) |
126 | return pos; | 133 | return pos; |
127 | pos = read_pci_config_byte(num, slot, func, | 134 | pos = read_pci_config_byte(bus, slot, func, |
128 | pos+PCI_CAP_LIST_NEXT); | 135 | pos+PCI_CAP_LIST_NEXT); |
129 | } | 136 | } |
130 | return 0; | 137 | return 0; |
131 | } | 138 | } |
132 | 139 | ||
133 | /* Read a standard AGPv3 bridge header */ | 140 | /* Read a standard AGPv3 bridge header */ |
134 | static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) | 141 | static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) |
135 | { | 142 | { |
136 | u32 apsize; | 143 | u32 apsize; |
137 | u32 apsizereg; | 144 | u32 apsizereg; |
138 | int nbits; | 145 | int nbits; |
139 | u32 aper_low, aper_hi; | 146 | u32 aper_low, aper_hi; |
140 | u64 aper; | 147 | u64 aper; |
148 | u32 old_order; | ||
141 | 149 | ||
142 | printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); | 150 | printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); |
143 | apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); | 151 | apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); |
144 | if (apsizereg == 0xffffffff) { | 152 | if (apsizereg == 0xffffffff) { |
145 | printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); | 153 | printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); |
146 | return 0; | 154 | return 0; |
147 | } | 155 | } |
148 | 156 | ||
157 | /* old_order could be the value from the NB GART setting */ | ||
158 | old_order = *order; | ||
159 | |||
149 | apsize = apsizereg & 0xfff; | 160 | apsize = apsizereg & 0xfff; |
150 | /* Some BIOS use weird encodings not in the AGPv3 table. */ | 161 | /* Some BIOS use weird encodings not in the AGPv3 table. */ |
151 | if (apsize & 0xff) | 162 | if (apsize & 0xff) |
@@ -155,14 +166,26 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) | |||
155 | if ((int)*order < 0) /* < 32MB */ | 166 | if ((int)*order < 0) /* < 32MB */ |
156 | *order = 0; | 167 | *order = 0; |
157 | 168 | ||
158 | aper_low = read_pci_config(num, slot, func, 0x10); | 169 | aper_low = read_pci_config(bus, slot, func, 0x10); |
159 | aper_hi = read_pci_config(num, slot, func, 0x14); | 170 | aper_hi = read_pci_config(bus, slot, func, 0x14); |
160 | aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); | 171 | aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); |
161 | 172 | ||
173 | /* | ||
174 | * On some sick chips, APSIZE is 0. This means it wants 4G | ||
175 | * so let's double-check that order, and let's trust the AMD NB settings: | ||
176 | */ | ||
177 | printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", | ||
178 | aper, 32 << old_order); | ||
179 | if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { | ||
180 | printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", | ||
181 | 32 << *order, apsizereg); | ||
182 | *order = old_order; | ||
183 | } | ||
184 | |||
162 | printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", | 185 | printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", |
163 | aper, 32 << *order, apsizereg); | 186 | aper, 32 << *order, apsizereg); |
164 | 187 | ||
165 | if (!aperture_valid(aper, (32*1024*1024) << *order)) | 188 | if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) |
166 | return 0; | 189 | return 0; |
167 | return (u32)aper; | 190 | return (u32)aper; |
168 | } | 191 | } |
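The base assembled in read_agp() comes from the bridge's 64-bit memory BAR at offsets 0x10/0x14, with the low 22 bits masked off (the aperture is at least 4M aligned). A standalone decode of a sample register pair; the BAR values here are made-up examples:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t aper_low = 0xd0000008;   /* sample low BAR dword */
            uint32_t aper_hi  = 0x00000000;   /* sample high BAR dword */

            /* drop the low 22 bits (flags + sub-4M offset), splice in the high half */
            uint64_t aper = (aper_low & ~((1U << 22) - 1)) |
                            ((uint64_t)aper_hi << 32);

            printf("aperture base = %#llx\n", (unsigned long long)aper); /* 0xd0000000 */
            return 0;
    }

The added overflow check then rejects any base-plus-size combination that crosses 4G and falls back to the order the northbridge already had programmed.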
@@ -180,17 +203,17 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) | |||
180 | * the AGP bridges should always be on their own bus in the HT hierarchy, | 203 | * the AGP bridges should always be on their own bus in the HT hierarchy,
181 | * but do it here for future safety. | 204 | * but do it here for future safety. |
182 | */ | 205 | */ |
183 | static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) | 206 | static u32 __init search_agp_bridge(u32 *order, int *valid_agp) |
184 | { | 207 | { |
185 | int num, slot, func; | 208 | int bus, slot, func; |
186 | 209 | ||
187 | /* Poor man's PCI discovery */ | 210 | /* Poor man's PCI discovery */ |
188 | for (num = 0; num < 256; num++) { | 211 | for (bus = 0; bus < 256; bus++) { |
189 | for (slot = 0; slot < 32; slot++) { | 212 | for (slot = 0; slot < 32; slot++) { |
190 | for (func = 0; func < 8; func++) { | 213 | for (func = 0; func < 8; func++) { |
191 | u32 class, cap; | 214 | u32 class, cap; |
192 | u8 type; | 215 | u8 type; |
193 | class = read_pci_config(num, slot, func, | 216 | class = read_pci_config(bus, slot, func, |
194 | PCI_CLASS_REVISION); | 217 | PCI_CLASS_REVISION); |
195 | if (class == 0xffffffff) | 218 | if (class == 0xffffffff) |
196 | break; | 219 | break; |
@@ -199,17 +222,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) | |||
199 | case PCI_CLASS_BRIDGE_HOST: | 222 | case PCI_CLASS_BRIDGE_HOST: |
200 | case PCI_CLASS_BRIDGE_OTHER: /* needed? */ | 223 | case PCI_CLASS_BRIDGE_OTHER: /* needed? */ |
201 | /* AGP bridge? */ | 224 | /* AGP bridge? */ |
202 | cap = find_cap(num, slot, func, | 225 | cap = find_cap(bus, slot, func, |
203 | PCI_CAP_ID_AGP); | 226 | PCI_CAP_ID_AGP); |
204 | if (!cap) | 227 | if (!cap) |
205 | break; | 228 | break; |
206 | *valid_agp = 1; | 229 | *valid_agp = 1; |
207 | return read_agp(num, slot, func, cap, | 230 | return read_agp(bus, slot, func, cap, |
208 | order); | 231 | order); |
209 | } | 232 | } |
210 | 233 | ||
211 | /* No multi-function device? */ | 234 | /* No multi-function device? */ |
212 | type = read_pci_config_byte(num, slot, func, | 235 | type = read_pci_config_byte(bus, slot, func, |
213 | PCI_HEADER_TYPE); | 236 | PCI_HEADER_TYPE); |
214 | if (!(type & 0x80)) | 237 | if (!(type & 0x80)) |
215 | break; | 238 | break; |
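search_agp_bridge()'s "poor man's PCI discovery" leans on the early read_pci_config*() helpers, which speak PCI configuration mechanism #1 through ports 0xCF8/0xCFC. For reference, a sketch of the CONFIG_ADDRESS encoding those helpers rely on; this is the standard PCI formula, not code from this file:

    #include <stdio.h>
    #include <stdint.h>

    /* CONFIG_ADDRESS layout for mechanism #1 (ports 0xCF8/0xCFC) */
    static uint32_t pci_conf1_addr(int bus, int slot, int func, int reg)
    {
            return 0x80000000u             /* enable bit */
                 | ((uint32_t)bus  << 16)  /* bus number, 0..255 */
                 | ((uint32_t)slot << 11)  /* device/slot, 0..31 */
                 | ((uint32_t)func << 8)   /* function, 0..7 */
                 | (uint32_t)(reg & 0xfc); /* dword-aligned register */
    }

    int main(void)
    {
            /* bus 0, device 0x18, function 3, register 0x90 -> 0x8000c390 */
            printf("CONFIG_ADDRESS = %#x\n", pci_conf1_addr(0, 0x18, 3, 0x90));
            return 0;
    }

The 256 x 32 x 8 loop bounds above follow directly from those field widths.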
@@ -249,36 +272,50 @@ void __init early_gart_iommu_check(void) | |||
249 | * or the BIOS forgot to put it in the reserved area. | 272 | * or the BIOS forgot to put it in the reserved area.
250 | * Try to update e820 to mark that region as reserved. | 273 | * Try to update e820 to mark that region as reserved.
251 | */ | 274 | */ |
252 | int fix, num; | 275 | int i, fix, slot; |
253 | u32 ctl; | 276 | u32 ctl; |
254 | u32 aper_size = 0, aper_order = 0, last_aper_order = 0; | 277 | u32 aper_size = 0, aper_order = 0, last_aper_order = 0; |
255 | u64 aper_base = 0, last_aper_base = 0; | 278 | u64 aper_base = 0, last_aper_base = 0; |
256 | int aper_enabled = 0, last_aper_enabled = 0; | 279 | int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0; |
257 | 280 | ||
258 | if (!early_pci_allowed()) | 281 | if (!early_pci_allowed()) |
259 | return; | 282 | return; |
260 | 283 | ||
284 | /* This is mostly a duplicate of iommu_hole_init */ | ||
261 | fix = 0; | 285 | fix = 0; |
262 | for (num = 24; num < 32; num++) { | 286 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { |
263 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | 287 | int bus; |
264 | continue; | 288 | int dev_base, dev_limit; |
265 | 289 | ||
266 | ctl = read_pci_config(0, num, 3, 0x90); | 290 | bus = bus_dev_ranges[i].bus; |
267 | aper_enabled = ctl & 1; | 291 | dev_base = bus_dev_ranges[i].dev_base; |
268 | aper_order = (ctl >> 1) & 7; | 292 | dev_limit = bus_dev_ranges[i].dev_limit; |
269 | aper_size = (32 * 1024 * 1024) << aper_order; | 293 | |
270 | aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; | 294 | for (slot = dev_base; slot < dev_limit; slot++) { |
271 | aper_base <<= 25; | 295 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) |
272 | 296 | continue; | |
273 | if ((last_aper_order && aper_order != last_aper_order) || | 297 | |
274 | (last_aper_base && aper_base != last_aper_base) || | 298 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); |
275 | (last_aper_enabled && aper_enabled != last_aper_enabled)) { | 299 | aper_enabled = ctl & AMD64_GARTEN; |
276 | fix = 1; | 300 | aper_order = (ctl >> 1) & 7; |
277 | break; | 301 | aper_size = (32 * 1024 * 1024) << aper_order; |
302 | aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; | ||
303 | aper_base <<= 25; | ||
304 | |||
305 | if (last_valid) { | ||
306 | if ((aper_order != last_aper_order) || | ||
307 | (aper_base != last_aper_base) || | ||
308 | (aper_enabled != last_aper_enabled)) { | ||
309 | fix = 1; | ||
310 | break; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | last_aper_order = aper_order; | ||
315 | last_aper_base = aper_base; | ||
316 | last_aper_enabled = aper_enabled; | ||
317 | last_valid = 1; | ||
278 | } | 318 | } |
279 | last_aper_order = aper_order; | ||
280 | last_aper_base = aper_base; | ||
281 | last_aper_enabled = aper_enabled; | ||
282 | } | 319 | } |
283 | 320 | ||
284 | if (!fix && !aper_enabled) | 321 | if (!fix && !aper_enabled) |
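The per-slot body decodes the K8 northbridge GART registers in PCI function 3: bit 0 of AMD64_GARTAPERTURECTL (the old literal 0x90) enables the aperture, bits 3:1 give the order, and AMD64_GARTAPERTUREBASE (the old 0x94) stores address bits 39:25, i.e. the base in 32M units. A standalone decode of sample values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t ctl  = 0x7;    /* sample GARTAPERTURECTL: enabled, order 3 */
            uint32_t base = 0x40;   /* sample GARTAPERTUREBASE */

            int enabled        = ctl & 1;
            unsigned order     = (ctl >> 1) & 7;
            uint64_t aper_size = (32ULL << 20) << order;
            uint64_t aper_base = (uint64_t)(base & 0x7fff) << 25;

            printf("enabled=%d size=%llu MB base=%#llx\n", enabled,
                   (unsigned long long)(aper_size >> 20),
                   (unsigned long long)aper_base);  /* 256 MB at 0x80000000 */
            return 0;
    }

The last_valid flag introduced above also fixes a subtle flaw in the old comparison: a node whose settings merely happened to be zero could previously mask a real mismatch, since the old checks only compared against non-zero last_* values.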
@@ -290,32 +327,46 @@ void __init early_gart_iommu_check(void) | |||
290 | if (gart_fix_e820 && !fix && aper_enabled) { | 327 | if (gart_fix_e820 && !fix && aper_enabled) { |
291 | if (e820_any_mapped(aper_base, aper_base + aper_size, | 328 | if (e820_any_mapped(aper_base, aper_base + aper_size, |
292 | E820_RAM)) { | 329 | E820_RAM)) { |
293 | /* reserved it, so we can resuse it in second kernel */ | 330 | /* reserve it, so we can reuse it in second kernel */ |
294 | printk(KERN_INFO "update e820 for GART\n"); | 331 | printk(KERN_INFO "update e820 for GART\n"); |
295 | add_memory_region(aper_base, aper_size, E820_RESERVED); | 332 | e820_add_region(aper_base, aper_size, E820_RESERVED); |
296 | update_e820(); | 333 | update_e820(); |
297 | } | 334 | } |
298 | return; | ||
299 | } | 335 | } |
300 | 336 | ||
337 | if (!fix) | ||
338 | return; | ||
339 | |||
301 | /* different nodes have different settings; disable them all at first */ | 340 | /* different nodes have different settings; disable them all at first */
302 | for (num = 24; num < 32; num++) { | 341 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { |
303 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | 342 | int bus; |
304 | continue; | 343 | int dev_base, dev_limit; |
344 | |||
345 | bus = bus_dev_ranges[i].bus; | ||
346 | dev_base = bus_dev_ranges[i].dev_base; | ||
347 | dev_limit = bus_dev_ranges[i].dev_limit; | ||
348 | |||
349 | for (slot = dev_base; slot < dev_limit; slot++) { | ||
350 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) | ||
351 | continue; | ||
305 | 352 | ||
306 | ctl = read_pci_config(0, num, 3, 0x90); | 353 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); |
307 | ctl &= ~1; | 354 | ctl &= ~AMD64_GARTEN; |
308 | write_pci_config(0, num, 3, 0x90, ctl); | 355 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); |
356 | } | ||
309 | } | 357 | } |
310 | 358 | ||
311 | } | 359 | } |
312 | 360 | ||
361 | static int __initdata printed_gart_size_msg; | ||
362 | |||
313 | void __init gart_iommu_hole_init(void) | 363 | void __init gart_iommu_hole_init(void) |
314 | { | 364 | { |
365 | u32 agp_aper_base = 0, agp_aper_order = 0; | ||
315 | u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; | 366 | u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; |
316 | u64 aper_base, last_aper_base = 0; | 367 | u64 aper_base, last_aper_base = 0; |
317 | int fix, num, valid_agp = 0; | 368 | int fix, slot, valid_agp = 0; |
318 | int node; | 369 | int i, node; |
319 | 370 | ||
320 | if (gart_iommu_aperture_disabled || !fix_aperture || | 371 | if (gart_iommu_aperture_disabled || !fix_aperture || |
321 | !early_pci_allowed()) | 372 | !early_pci_allowed()) |
@@ -323,38 +374,65 @@ void __init gart_iommu_hole_init(void) | |||
323 | 374 | ||
324 | printk(KERN_INFO "Checking aperture...\n"); | 375 | printk(KERN_INFO "Checking aperture...\n"); |
325 | 376 | ||
377 | if (!fallback_aper_force) | ||
378 | agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); | ||
379 | |||
326 | fix = 0; | 380 | fix = 0; |
327 | node = 0; | 381 | node = 0; |
328 | for (num = 24; num < 32; num++) { | 382 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { |
329 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | 383 | int bus; |
330 | continue; | 384 | int dev_base, dev_limit; |
331 | 385 | ||
332 | iommu_detected = 1; | 386 | bus = bus_dev_ranges[i].bus; |
333 | gart_iommu_aperture = 1; | 387 | dev_base = bus_dev_ranges[i].dev_base; |
334 | 388 | dev_limit = bus_dev_ranges[i].dev_limit; | |
335 | aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; | 389 | |
336 | aper_size = (32 * 1024 * 1024) << aper_order; | 390 | for (slot = dev_base; slot < dev_limit; slot++) { |
337 | aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; | 391 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) |
338 | aper_base <<= 25; | 392 | continue; |
339 | 393 | ||
340 | printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", | 394 | iommu_detected = 1; |
341 | node, aper_base, aper_size >> 20); | 395 | gart_iommu_aperture = 1; |
342 | node++; | 396 | |
343 | 397 | aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; | |
344 | if (!aperture_valid(aper_base, aper_size)) { | 398 | aper_size = (32 * 1024 * 1024) << aper_order; |
345 | fix = 1; | 399 | aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; |
346 | break; | 400 | aper_base <<= 25; |
347 | } | 401 | |
402 | printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", | ||
403 | node, aper_base, aper_size >> 20); | ||
404 | node++; | ||
405 | |||
406 | if (!aperture_valid(aper_base, aper_size, 64<<20)) { | ||
407 | if (valid_agp && agp_aper_base && | ||
408 | agp_aper_base == aper_base && | ||
409 | agp_aper_order == aper_order) { | ||
410 | /* the same between the two settings from NB and AGP */ | ||
411 | if (!no_iommu && | ||
412 | max_pfn > MAX_DMA32_PFN && | ||
413 | !printed_gart_size_msg) { | ||
414 | printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); | ||
415 | printk(KERN_ERR "please increase GART size in your BIOS setup\n"); | ||
416 | printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); | ||
417 | printed_gart_size_msg = 1; | ||
418 | } | ||
419 | } else { | ||
420 | fix = 1; | ||
421 | goto out; | ||
422 | } | ||
423 | } | ||
348 | 424 | ||
349 | if ((last_aper_order && aper_order != last_aper_order) || | 425 | if ((last_aper_order && aper_order != last_aper_order) || |
350 | (last_aper_base && aper_base != last_aper_base)) { | 426 | (last_aper_base && aper_base != last_aper_base)) { |
351 | fix = 1; | 427 | fix = 1; |
352 | break; | 428 | goto out; |
429 | } | ||
430 | last_aper_order = aper_order; | ||
431 | last_aper_base = aper_base; | ||
353 | } | 432 | } |
354 | last_aper_order = aper_order; | ||
355 | last_aper_base = aper_base; | ||
356 | } | 433 | } |
357 | 434 | ||
435 | out: | ||
358 | if (!fix && !fallback_aper_force) { | 436 | if (!fix && !fallback_aper_force) { |
359 | if (last_aper_base) { | 437 | if (last_aper_base) { |
360 | unsigned long n = (32 * 1024 * 1024) << last_aper_order; | 438 | unsigned long n = (32 * 1024 * 1024) << last_aper_order; |
@@ -364,14 +442,16 @@ void __init gart_iommu_hole_init(void) | |||
364 | return; | 442 | return; |
365 | } | 443 | } |
366 | 444 | ||
367 | if (!fallback_aper_force) | 445 | if (!fallback_aper_force) { |
368 | aper_alloc = search_agp_bridge(&aper_order, &valid_agp); | 446 | aper_alloc = agp_aper_base; |
447 | aper_order = agp_aper_order; | ||
448 | } | ||
369 | 449 | ||
370 | if (aper_alloc) { | 450 | if (aper_alloc) { |
371 | /* Got the aperture from the AGP bridge */ | 451 | /* Got the aperture from the AGP bridge */ |
372 | } else if (swiotlb && !valid_agp) { | 452 | } else if (swiotlb && !valid_agp) { |
373 | /* Do nothing */ | 453 | /* Do nothing */ |
374 | } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || | 454 | } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || |
375 | force_iommu || | 455 | force_iommu || |
376 | valid_agp || | 456 | valid_agp || |
377 | fallback_aper_force) { | 457 | fallback_aper_force) { |
@@ -401,16 +481,24 @@ void __init gart_iommu_hole_init(void) | |||
401 | } | 481 | } |
402 | 482 | ||
403 | /* Fix up the north bridges */ | 483 | /* Fix up the north bridges */ |
404 | for (num = 24; num < 32; num++) { | 484 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { |
405 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | 485 | int bus; |
406 | continue; | 486 | int dev_base, dev_limit; |
407 | 487 | ||
408 | /* | 488 | bus = bus_dev_ranges[i].bus; |
409 | * Don't enable translation yet. That is done later. | 489 | dev_base = bus_dev_ranges[i].dev_base; |
410 | * Assume this BIOS didn't initialise the GART so | 490 | dev_limit = bus_dev_ranges[i].dev_limit; |
411 | * just overwrite all previous bits | 491 | for (slot = dev_base; slot < dev_limit; slot++) { |
412 | */ | 492 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) |
413 | write_pci_config(0, num, 3, 0x90, aper_order<<1); | 493 | continue; |
414 | write_pci_config(0, num, 3, 0x94, aper_alloc>>25); | 494 | |
495 | /* Don't enable translation yet. That is done later. | ||
496 | Assume this BIOS didn't initialise the GART so | ||
497 | just overwrite all previous bits */ | ||
498 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); | ||
499 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); | ||
500 | } | ||
415 | } | 501 | } |
502 | |||
503 | set_up_gart_resume(aper_order, aper_alloc); | ||
416 | } | 504 | } |
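All three loops in this file now iterate the same bus_dev_ranges[] table instead of hard-coding bus 0, slots 24-31, presumably to cover northbridges that enumerate on buses 0xff and 0xfe in larger multi-node configurations. A quick enumeration of what the new table actually scans:

    #include <stdio.h>

    struct bus_dev_range { int bus, dev_base, dev_limit; };

    static const struct bus_dev_range ranges[] = {
            { 0x00, 0x18, 0x20 },   /* bus 0, slots 0x18..0x1f (the old range) */
            { 0xff, 0x00, 0x20 },   /* bus 0xff, all 32 slots */
            { 0xfe, 0x00, 0x20 },   /* bus 0xfe, all 32 slots */
    };

    int main(void)
    {
            int total = 0;
            for (unsigned i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
                    total += ranges[i].dev_limit - ranges[i].dev_base;
            printf("candidate northbridge slots scanned: %d\n", total); /* 8+32+32 = 72 */
            return 0;
    }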
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1bdeb6c..d6c898358371 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c | |||
@@ -52,29 +52,40 @@ | |||
52 | 52 | ||
53 | unsigned long mp_lapic_addr; | 53 | unsigned long mp_lapic_addr; |
54 | 54 | ||
55 | DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; | ||
56 | EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); | ||
57 | |||
58 | /* | 55 | /* |
59 | * Knob to control our willingness to enable the local APIC. | 56 | * Knob to control our willingness to enable the local APIC. |
60 | * | 57 | * |
61 | * -1=force-disable, +1=force-enable | 58 | * +1=force-enable |
62 | */ | 59 | */ |
63 | static int enable_local_apic __initdata; | 60 | static int force_enable_local_apic; |
61 | int disable_apic; | ||
64 | 62 | ||
65 | /* Local APIC timer verification ok */ | 63 | /* Local APIC timer verification ok */ |
66 | static int local_apic_timer_verify_ok; | 64 | static int local_apic_timer_verify_ok; |
67 | /* Disable local APIC timer from the kernel commandline or via dmi quirk | 65 | /* Disable local APIC timer from the kernel commandline or via dmi quirk */ |
68 | or using CPU MSR check */ | 66 | static int local_apic_timer_disabled; |
69 | int local_apic_timer_disabled; | ||
70 | /* Local APIC timer works in C2 */ | 67 | /* Local APIC timer works in C2 */ |
71 | int local_apic_timer_c2_ok; | 68 | int local_apic_timer_c2_ok; |
72 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); | 69 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); |
73 | 70 | ||
71 | int first_system_vector = 0xfe; | ||
72 | |||
73 | char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; | ||
74 | |||
74 | /* | 75 | /* |
75 | * Debug level, exported for io_apic.c | 76 | * Debug level, exported for io_apic.c |
76 | */ | 77 | */ |
77 | int apic_verbosity; | 78 | unsigned int apic_verbosity; |
79 | |||
80 | int pic_mode; | ||
81 | |||
82 | /* Have we found an MP table */ | ||
83 | int smp_found_config; | ||
84 | |||
85 | static struct resource lapic_resource = { | ||
86 | .name = "Local APIC", | ||
87 | .flags = IORESOURCE_MEM | IORESOURCE_BUSY, | ||
88 | }; | ||
78 | 89 | ||
79 | static unsigned int calibration_result; | 90 | static unsigned int calibration_result; |
80 | 91 | ||
@@ -166,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void) | |||
166 | /* Level triggered for 82489DX */ | 177 | /* Level triggered for 82489DX */ |
167 | if (!lapic_is_integrated()) | 178 | if (!lapic_is_integrated()) |
168 | v |= APIC_LVT_LEVEL_TRIGGER; | 179 | v |= APIC_LVT_LEVEL_TRIGGER; |
169 | apic_write_around(APIC_LVT0, v); | 180 | apic_write(APIC_LVT0, v); |
170 | } | 181 | } |
171 | 182 | ||
172 | /** | 183 | /** |
@@ -201,9 +212,6 @@ int lapic_get_maxlvt(void) | |||
201 | * this function twice on the boot CPU, once with a bogus timeout | 212 | * this function twice on the boot CPU, once with a bogus timeout |
202 | * value, second time for real. The other (noncalibrating) CPUs | 213 | * value, second time for real. The other (noncalibrating) CPUs |
203 | * call this function only once, with the real, calibrated value. | 214 | * call this function only once, with the real, calibrated value. |
204 | * | ||
205 | * We do reads before writes even if unnecessary, to get around the | ||
206 | * P5 APIC double write bug. | ||
207 | */ | 215 | */ |
208 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | 216 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) |
209 | { | 217 | { |
@@ -218,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
218 | if (!irqen) | 226 | if (!irqen) |
219 | lvtt_value |= APIC_LVT_MASKED; | 227 | lvtt_value |= APIC_LVT_MASKED; |
220 | 228 | ||
221 | apic_write_around(APIC_LVTT, lvtt_value); | 229 | apic_write(APIC_LVTT, lvtt_value); |
222 | 230 | ||
223 | /* | 231 | /* |
224 | * Divide PICLK by 16 | 232 | * Divide PICLK by 16 |
225 | */ | 233 | */ |
226 | tmp_value = apic_read(APIC_TDCR); | 234 | tmp_value = apic_read(APIC_TDCR); |
227 | apic_write_around(APIC_TDCR, (tmp_value | 235 | apic_write(APIC_TDCR, |
228 | & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 236 | (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | |
229 | | APIC_TDR_DIV_16); | 237 | APIC_TDR_DIV_16); |
230 | 238 | ||
231 | if (!oneshot) | 239 | if (!oneshot) |
232 | apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); | 240 | apic_write(APIC_TMICT, clocks / APIC_DIVISOR); |
233 | } | 241 | } |
234 | 242 | ||
235 | /* | 243 | /* |
@@ -238,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
238 | static int lapic_next_event(unsigned long delta, | 246 | static int lapic_next_event(unsigned long delta, |
239 | struct clock_event_device *evt) | 247 | struct clock_event_device *evt) |
240 | { | 248 | { |
241 | apic_write_around(APIC_TMICT, delta); | 249 | apic_write(APIC_TMICT, delta); |
242 | return 0; | 250 | return 0; |
243 | } | 251 | } |
244 | 252 | ||
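The LVTT setup above programs the timer to tick at the APIC bus clock divided by 16 (the TDCR write), and initial counts are scaled down by APIC_DIVISOR to match. Assuming APIC_DIVISOR is 16, as it is in this file, the conversion from calibrated bus clocks to a TMICT count looks like:

    #include <stdio.h>

    #define APIC_DIVISOR 16   /* matches the divide-by-16 programmed in TDCR */

    int main(void)
    {
            unsigned long clocks = 1000000;  /* e.g. calibrated bus clocks per tick */

            /* TMICT counts at bus_clock/16, so scale the request accordingly */
            printf("TMICT initial count = %lu\n", clocks / APIC_DIVISOR);
            return 0;
    }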
@@ -267,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, | |||
267 | case CLOCK_EVT_MODE_SHUTDOWN: | 275 | case CLOCK_EVT_MODE_SHUTDOWN: |
268 | v = apic_read(APIC_LVTT); | 276 | v = apic_read(APIC_LVTT); |
269 | v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | 277 | v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); |
270 | apic_write_around(APIC_LVTT, v); | 278 | apic_write(APIC_LVTT, v); |
271 | break; | 279 | break; |
272 | case CLOCK_EVT_MODE_RESUME: | 280 | case CLOCK_EVT_MODE_RESUME: |
273 | /* Nothing to do here */ | 281 | /* Nothing to do here */ |
@@ -361,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) | |||
361 | } | 369 | } |
362 | } | 370 | } |
363 | 371 | ||
364 | /* | 372 | static int __init calibrate_APIC_clock(void) |
365 | * Setup the boot APIC | ||
366 | * | ||
367 | * Calibrate and verify the result. | ||
368 | */ | ||
369 | void __init setup_boot_APIC_clock(void) | ||
370 | { | 373 | { |
371 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); | 374 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); |
372 | const long pm_100ms = PMTMR_TICKS_PER_SEC/10; | 375 | const long pm_100ms = PMTMR_TICKS_PER_SEC/10; |
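pm_100ms is the number of ACPI PM-timer ticks in 100 ms; the PM timer runs at a fixed 3.579545 MHz, so the calibration below compares roughly 357954 PM ticks against the lapic delta measured over the same interval. The constant works out as:

    #include <stdio.h>

    #define PMTMR_TICKS_PER_SEC 3579545   /* ACPI PM timer: fixed 3.579545 MHz */

    int main(void)
    {
            long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
            printf("PM ticks per 100 ms = %ld\n", pm_100ms);  /* 357954 */
            return 0;
    }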
@@ -376,24 +379,6 @@ void __init setup_boot_APIC_clock(void) | |||
376 | long delta, deltapm; | 379 | long delta, deltapm; |
377 | int pm_referenced = 0; | 380 | int pm_referenced = 0; |
378 | 381 | ||
379 | /* | ||
380 | * The local apic timer can be disabled via the kernel | ||
381 | * commandline or from the CPU detection code. Register the lapic | ||
382 | * timer as a dummy clock event source on SMP systems, so the | ||
383 | * broadcast mechanism is used. On UP systems simply ignore it. | ||
384 | */ | ||
385 | if (local_apic_timer_disabled) { | ||
386 | /* No broadcast on UP ! */ | ||
387 | if (num_possible_cpus() > 1) { | ||
388 | lapic_clockevent.mult = 1; | ||
389 | setup_APIC_timer(); | ||
390 | } | ||
391 | return; | ||
392 | } | ||
393 | |||
394 | apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" | ||
395 | "calibrating APIC timer ...\n"); | ||
396 | |||
397 | local_irq_disable(); | 382 | local_irq_disable(); |
398 | 383 | ||
399 | /* Replace the global interrupt handler */ | 384 | /* Replace the global interrupt handler */ |
@@ -478,8 +463,6 @@ void __init setup_boot_APIC_clock(void) | |||
478 | calibration_result / (1000000 / HZ), | 463 | calibration_result / (1000000 / HZ), |
479 | calibration_result % (1000000 / HZ)); | 464 | calibration_result % (1000000 / HZ)); |
480 | 465 | ||
481 | local_apic_timer_verify_ok = 1; | ||
482 | |||
483 | /* | 466 | /* |
484 | * Do a sanity check on the APIC calibration result | 467 | * Do a sanity check on the APIC calibration result |
485 | */ | 468 | */ |
@@ -487,12 +470,11 @@ void __init setup_boot_APIC_clock(void) | |||
487 | local_irq_enable(); | 470 | local_irq_enable(); |
488 | printk(KERN_WARNING | 471 | printk(KERN_WARNING |
489 | "APIC frequency too slow, disabling apic timer\n"); | 472 | "APIC frequency too slow, disabling apic timer\n"); |
490 | /* No broadcast on UP ! */ | 473 | return -1; |
491 | if (num_possible_cpus() > 1) | ||
492 | setup_APIC_timer(); | ||
493 | return; | ||
494 | } | 474 | } |
495 | 475 | ||
476 | local_apic_timer_verify_ok = 1; | ||
477 | |||
496 | /* We trust the pm timer based calibration */ | 478 | /* We trust the pm timer based calibration */ |
497 | if (!pm_referenced) { | 479 | if (!pm_referenced) { |
498 | apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); | 480 | apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); |
@@ -532,22 +514,55 @@ void __init setup_boot_APIC_clock(void) | |||
532 | if (!local_apic_timer_verify_ok) { | 514 | if (!local_apic_timer_verify_ok) { |
533 | printk(KERN_WARNING | 515 | printk(KERN_WARNING |
534 | "APIC timer disabled due to verification failure.\n"); | 516 | "APIC timer disabled due to verification failure.\n"); |
517 | return -1; | ||
518 | } | ||
519 | |||
520 | return 0; | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * Setup the boot APIC | ||
525 | * | ||
526 | * Calibrate and verify the result. | ||
527 | */ | ||
528 | void __init setup_boot_APIC_clock(void) | ||
529 | { | ||
530 | /* | ||
531 | * The local apic timer can be disabled via the kernel | ||
532 | * commandline or from the CPU detection code. Register the lapic | ||
533 | * timer as a dummy clock event source on SMP systems, so the | ||
534 | * broadcast mechanism is used. On UP systems simply ignore it. | ||
535 | */ | ||
536 | if (local_apic_timer_disabled) { | ||
535 | /* No broadcast on UP ! */ | 537 | /* No broadcast on UP ! */ |
536 | if (num_possible_cpus() == 1) | 538 | if (num_possible_cpus() > 1) { |
537 | return; | 539 | lapic_clockevent.mult = 1; |
538 | } else { | 540 | setup_APIC_timer(); |
539 | /* | 541 | } |
540 | * If nmi_watchdog is set to IO_APIC, we need the | 542 | return; |
541 | * PIT/HPET going. Otherwise register lapic as a dummy | 543 | } |
542 | * device. | 544 | |
543 | */ | 545 | apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" |
544 | if (nmi_watchdog != NMI_IO_APIC) | 546 | "calibrating APIC timer ...\n"); |
545 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; | 547 | |
546 | else | 548 | if (calibrate_APIC_clock()) { |
547 | printk(KERN_WARNING "APIC timer registered as dummy," | 549 | /* No broadcast on UP ! */ |
548 | " due to nmi_watchdog=1!\n"); | 550 | if (num_possible_cpus() > 1) |
551 | setup_APIC_timer(); | ||
552 | return; | ||
549 | } | 553 | } |
550 | 554 | ||
555 | /* | ||
556 | * If nmi_watchdog is set to IO_APIC, we need the | ||
557 | * PIT/HPET going. Otherwise register lapic as a dummy | ||
558 | * device. | ||
559 | */ | ||
560 | if (nmi_watchdog != NMI_IO_APIC) | ||
561 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; | ||
562 | else | ||
563 | printk(KERN_WARNING "APIC timer registered as dummy," | ||
564 | " due to nmi_watchdog=%d!\n", nmi_watchdog); | ||
565 | |||
551 | /* Setup the lapic or request the broadcast */ | 566 | /* Setup the lapic or request the broadcast */ |
552 | setup_APIC_timer(); | 567 | setup_APIC_timer(); |
553 | } | 568 | } |
@@ -682,44 +697,44 @@ void clear_local_APIC(void) | |||
682 | */ | 697 | */ |
683 | if (maxlvt >= 3) { | 698 | if (maxlvt >= 3) { |
684 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ | 699 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ |
685 | apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); | 700 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); |
686 | } | 701 | } |
687 | /* | 702 | /* |
688 | * Careful: we have to set masks only first to deassert | 703 | * Careful: we have to set masks only first to deassert |
689 | * any level-triggered sources. | 704 | * any level-triggered sources. |
690 | */ | 705 | */ |
691 | v = apic_read(APIC_LVTT); | 706 | v = apic_read(APIC_LVTT); |
692 | apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | 707 | apic_write(APIC_LVTT, v | APIC_LVT_MASKED); |
693 | v = apic_read(APIC_LVT0); | 708 | v = apic_read(APIC_LVT0); |
694 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | 709 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); |
695 | v = apic_read(APIC_LVT1); | 710 | v = apic_read(APIC_LVT1); |
696 | apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); | 711 | apic_write(APIC_LVT1, v | APIC_LVT_MASKED); |
697 | if (maxlvt >= 4) { | 712 | if (maxlvt >= 4) { |
698 | v = apic_read(APIC_LVTPC); | 713 | v = apic_read(APIC_LVTPC); |
699 | apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); | 714 | apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); |
700 | } | 715 | } |
701 | 716 | ||
702 | /* let's not touch this if we didn't frob it */ | 717 | /* let's not touch this if we didn't frob it */
703 | #ifdef CONFIG_X86_MCE_P4THERMAL | 718 | #ifdef CONFIG_X86_MCE_P4THERMAL |
704 | if (maxlvt >= 5) { | 719 | if (maxlvt >= 5) { |
705 | v = apic_read(APIC_LVTTHMR); | 720 | v = apic_read(APIC_LVTTHMR); |
706 | apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); | 721 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); |
707 | } | 722 | } |
708 | #endif | 723 | #endif |
709 | /* | 724 | /* |
710 | * Clean APIC state for other OSs: | 725 | * Clean APIC state for other OSs: |
711 | */ | 726 | */ |
712 | apic_write_around(APIC_LVTT, APIC_LVT_MASKED); | 727 | apic_write(APIC_LVTT, APIC_LVT_MASKED); |
713 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED); | 728 | apic_write(APIC_LVT0, APIC_LVT_MASKED); |
714 | apic_write_around(APIC_LVT1, APIC_LVT_MASKED); | 729 | apic_write(APIC_LVT1, APIC_LVT_MASKED); |
715 | if (maxlvt >= 3) | 730 | if (maxlvt >= 3) |
716 | apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); | 731 | apic_write(APIC_LVTERR, APIC_LVT_MASKED); |
717 | if (maxlvt >= 4) | 732 | if (maxlvt >= 4) |
718 | apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); | 733 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); |
719 | 734 | ||
720 | #ifdef CONFIG_X86_MCE_P4THERMAL | 735 | #ifdef CONFIG_X86_MCE_P4THERMAL |
721 | if (maxlvt >= 5) | 736 | if (maxlvt >= 5) |
722 | apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); | 737 | apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); |
723 | #endif | 738 | #endif |
724 | /* Integrated APIC (!82489DX) ? */ | 739 | /* Integrated APIC (!82489DX) ? */ |
725 | if (lapic_is_integrated()) { | 740 | if (lapic_is_integrated()) { |
@@ -745,7 +760,7 @@ void disable_local_APIC(void) | |||
745 | */ | 760 | */ |
746 | value = apic_read(APIC_SPIV); | 761 | value = apic_read(APIC_SPIV); |
747 | value &= ~APIC_SPIV_APIC_ENABLED; | 762 | value &= ~APIC_SPIV_APIC_ENABLED; |
748 | apic_write_around(APIC_SPIV, value); | 763 | apic_write(APIC_SPIV, value); |
749 | 764 | ||
750 | /* | 765 | /* |
751 | * When LAPIC was disabled by the BIOS and enabled by the kernel, | 766 | * When LAPIC was disabled by the BIOS and enabled by the kernel, |
@@ -854,8 +869,8 @@ void __init sync_Arb_IDs(void) | |||
854 | apic_wait_icr_idle(); | 869 | apic_wait_icr_idle(); |
855 | 870 | ||
856 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | 871 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); |
857 | apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | 872 | apic_write(APIC_ICR, |
858 | | APIC_DM_INIT); | 873 | APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); |
859 | } | 874 | } |
860 | 875 | ||
861 | /* | 876 | /* |
@@ -891,16 +906,16 @@ void __init init_bsp_APIC(void) | |||
891 | else | 906 | else |
892 | value |= APIC_SPIV_FOCUS_DISABLED; | 907 | value |= APIC_SPIV_FOCUS_DISABLED; |
893 | value |= SPURIOUS_APIC_VECTOR; | 908 | value |= SPURIOUS_APIC_VECTOR; |
894 | apic_write_around(APIC_SPIV, value); | 909 | apic_write(APIC_SPIV, value); |
895 | 910 | ||
896 | /* | 911 | /* |
897 | * Set up the virtual wire mode. | 912 | * Set up the virtual wire mode. |
898 | */ | 913 | */ |
899 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | 914 | apic_write(APIC_LVT0, APIC_DM_EXTINT); |
900 | value = APIC_DM_NMI; | 915 | value = APIC_DM_NMI; |
901 | if (!lapic_is_integrated()) /* 82489DX */ | 916 | if (!lapic_is_integrated()) /* 82489DX */ |
902 | value |= APIC_LVT_LEVEL_TRIGGER; | 917 | value |= APIC_LVT_LEVEL_TRIGGER; |
903 | apic_write_around(APIC_LVT1, value); | 918 | apic_write(APIC_LVT1, value); |
904 | } | 919 | } |
905 | 920 | ||
906 | static void __cpuinit lapic_setup_esr(void) | 921 | static void __cpuinit lapic_setup_esr(void) |
@@ -915,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void) | |||
915 | 930 | ||
916 | /* enables sending errors */ | 931 | /* enables sending errors */ |
917 | value = ERROR_APIC_VECTOR; | 932 | value = ERROR_APIC_VECTOR; |
918 | apic_write_around(APIC_LVTERR, value); | 933 | apic_write(APIC_LVTERR, value); |
919 | /* | 934 | /* |
920 | * spec says clear errors after enabling vector. | 935 | * spec says clear errors after enabling vector. |
921 | */ | 936 | */ |
@@ -963,7 +978,7 @@ void __cpuinit setup_local_APIC(void) | |||
963 | * Double-check whether this APIC is really registered. | 978 | * Double-check whether this APIC is really registered. |
964 | */ | 979 | */ |
965 | if (!apic_id_registered()) | 980 | if (!apic_id_registered()) |
966 | BUG(); | 981 | WARN_ON_ONCE(1); |
967 | 982 | ||
968 | /* | 983 | /* |
969 | * Intel recommends to set DFR, LDR and TPR before enabling | 984 | * Intel recommends to set DFR, LDR and TPR before enabling |
@@ -978,7 +993,7 @@ void __cpuinit setup_local_APIC(void) | |||
978 | */ | 993 | */ |
979 | value = apic_read(APIC_TASKPRI); | 994 | value = apic_read(APIC_TASKPRI); |
980 | value &= ~APIC_TPRI_MASK; | 995 | value &= ~APIC_TPRI_MASK; |
981 | apic_write_around(APIC_TASKPRI, value); | 996 | apic_write(APIC_TASKPRI, value); |
982 | 997 | ||
983 | /* | 998 | /* |
984 | * After a crash, we no longer service the interrupts and a pending | 999 | * After a crash, we no longer service the interrupts and a pending |
@@ -1036,7 +1051,7 @@ void __cpuinit setup_local_APIC(void) | |||
1036 | * Set spurious IRQ vector | 1051 | * Set spurious IRQ vector |
1037 | */ | 1052 | */ |
1038 | value |= SPURIOUS_APIC_VECTOR; | 1053 | value |= SPURIOUS_APIC_VECTOR; |
1039 | apic_write_around(APIC_SPIV, value); | 1054 | apic_write(APIC_SPIV, value); |
1040 | 1055 | ||
1041 | /* | 1056 | /* |
1042 | * Set up LVT0, LVT1: | 1057 | * Set up LVT0, LVT1: |
@@ -1058,7 +1073,7 @@ void __cpuinit setup_local_APIC(void) | |||
1058 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", | 1073 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", |
1059 | smp_processor_id()); | 1074 | smp_processor_id()); |
1060 | } | 1075 | } |
1061 | apic_write_around(APIC_LVT0, value); | 1076 | apic_write(APIC_LVT0, value); |
1062 | 1077 | ||
1063 | /* | 1078 | /* |
1064 | * only the BP should see the LINT1 NMI signal, obviously. | 1079 | * only the BP should see the LINT1 NMI signal, obviously. |
@@ -1069,7 +1084,7 @@ void __cpuinit setup_local_APIC(void) | |||
1069 | value = APIC_DM_NMI | APIC_LVT_MASKED; | 1084 | value = APIC_DM_NMI | APIC_LVT_MASKED; |
1070 | if (!integrated) /* 82489DX */ | 1085 | if (!integrated) /* 82489DX */ |
1071 | value |= APIC_LVT_LEVEL_TRIGGER; | 1086 | value |= APIC_LVT_LEVEL_TRIGGER; |
1072 | apic_write_around(APIC_LVT1, value); | 1087 | apic_write(APIC_LVT1, value); |
1073 | } | 1088 | } |
1074 | 1089 | ||
1075 | void __cpuinit end_local_APIC_setup(void) | 1090 | void __cpuinit end_local_APIC_setup(void) |
@@ -1080,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void) | |||
1080 | /* Disable the local apic timer */ | 1095 | /* Disable the local apic timer */ |
1081 | value = apic_read(APIC_LVTT); | 1096 | value = apic_read(APIC_LVTT); |
1082 | value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | 1097 | value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); |
1083 | apic_write_around(APIC_LVTT, value); | 1098 | apic_write(APIC_LVTT, value); |
1084 | 1099 | ||
1085 | setup_apic_nmi_watchdog(NULL); | 1100 | setup_apic_nmi_watchdog(NULL); |
1086 | apic_pm_activate(); | 1101 | apic_pm_activate(); |
@@ -1094,7 +1109,7 @@ static int __init detect_init_APIC(void) | |||
1094 | u32 h, l, features; | 1109 | u32 h, l, features; |
1095 | 1110 | ||
1096 | /* Disabled by kernel option? */ | 1111 | /* Disabled by kernel option? */ |
1097 | if (enable_local_apic < 0) | 1112 | if (disable_apic) |
1098 | return -1; | 1113 | return -1; |
1099 | 1114 | ||
1100 | switch (boot_cpu_data.x86_vendor) { | 1115 | switch (boot_cpu_data.x86_vendor) { |
@@ -1117,7 +1132,7 @@ static int __init detect_init_APIC(void) | |||
1117 | * Over-ride BIOS and try to enable the local APIC only if | 1132 | * Over-ride BIOS and try to enable the local APIC only if |
1118 | * "lapic" specified. | 1133 | * "lapic" specified. |
1119 | */ | 1134 | */ |
1120 | if (enable_local_apic <= 0) { | 1135 | if (!force_enable_local_apic) { |
1121 | printk(KERN_INFO "Local APIC disabled by BIOS -- " | 1136 | printk(KERN_INFO "Local APIC disabled by BIOS -- " |
1122 | "you can enable it with \"lapic\"\n"); | 1137 | "you can enable it with \"lapic\"\n"); |
1123 | return -1; | 1138 | return -1; |
@@ -1154,9 +1169,6 @@ static int __init detect_init_APIC(void) | |||
1154 | if (l & MSR_IA32_APICBASE_ENABLE) | 1169 | if (l & MSR_IA32_APICBASE_ENABLE) |
1155 | mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; | 1170 | mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; |
1156 | 1171 | ||
1157 | if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED) | ||
1158 | nmi_watchdog = NMI_LOCAL_APIC; | ||
1159 | |||
1160 | printk(KERN_INFO "Found and enabled local APIC!\n"); | 1172 | printk(KERN_INFO "Found and enabled local APIC!\n"); |
1161 | 1173 | ||
1162 | apic_pm_activate(); | 1174 | apic_pm_activate(); |
@@ -1195,36 +1207,6 @@ void __init init_apic_mappings(void) | |||
1195 | if (boot_cpu_physical_apicid == -1U) | 1207 | if (boot_cpu_physical_apicid == -1U) |
1196 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); | 1208 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); |
1197 | 1209 | ||
1198 | #ifdef CONFIG_X86_IO_APIC | ||
1199 | { | ||
1200 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | ||
1201 | int i; | ||
1202 | |||
1203 | for (i = 0; i < nr_ioapics; i++) { | ||
1204 | if (smp_found_config) { | ||
1205 | ioapic_phys = mp_ioapics[i].mpc_apicaddr; | ||
1206 | if (!ioapic_phys) { | ||
1207 | printk(KERN_ERR | ||
1208 | "WARNING: bogus zero IO-APIC " | ||
1209 | "address found in MPTABLE, " | ||
1210 | "disabling IO/APIC support!\n"); | ||
1211 | smp_found_config = 0; | ||
1212 | skip_ioapic_setup = 1; | ||
1213 | goto fake_ioapic_page; | ||
1214 | } | ||
1215 | } else { | ||
1216 | fake_ioapic_page: | ||
1217 | ioapic_phys = (unsigned long) | ||
1218 | alloc_bootmem_pages(PAGE_SIZE); | ||
1219 | ioapic_phys = __pa(ioapic_phys); | ||
1220 | } | ||
1221 | set_fixmap_nocache(idx, ioapic_phys); | ||
1222 | printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", | ||
1223 | __fix_to_virt(idx), ioapic_phys); | ||
1224 | idx++; | ||
1225 | } | ||
1226 | } | ||
1227 | #endif | ||
1228 | } | 1210 | } |
1229 | 1211 | ||
1230 | /* | 1212 | /* |
@@ -1236,9 +1218,6 @@ int apic_version[MAX_APICS]; | |||
1236 | 1218 | ||
1237 | int __init APIC_init_uniprocessor(void) | 1219 | int __init APIC_init_uniprocessor(void) |
1238 | { | 1220 | { |
1239 | if (enable_local_apic < 0) | ||
1240 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); | ||
1241 | |||
1242 | if (!smp_found_config && !cpu_has_apic) | 1221 | if (!smp_found_config && !cpu_has_apic) |
1243 | return -1; | 1222 | return -1; |
1244 | 1223 | ||
@@ -1265,10 +1244,14 @@ int __init APIC_init_uniprocessor(void) | |||
1265 | #ifdef CONFIG_CRASH_DUMP | 1244 | #ifdef CONFIG_CRASH_DUMP |
1266 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); | 1245 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); |
1267 | #endif | 1246 | #endif |
1268 | phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); | 1247 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); |
1269 | 1248 | ||
1270 | setup_local_APIC(); | 1249 | setup_local_APIC(); |
1271 | 1250 | ||
1251 | #ifdef CONFIG_X86_IO_APIC | ||
1252 | if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) | ||
1253 | #endif | ||
1254 | localise_nmi_watchdog(); | ||
1272 | end_local_APIC_setup(); | 1255 | end_local_APIC_setup(); |
1273 | #ifdef CONFIG_X86_IO_APIC | 1256 | #ifdef CONFIG_X86_IO_APIC |
1274 | if (smp_found_config) | 1257 | if (smp_found_config) |
@@ -1351,13 +1334,17 @@ void __init smp_intr_init(void) | |||
1351 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | 1334 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper |
1352 | * IPI, driven by wakeup. | 1335 | * IPI, driven by wakeup. |
1353 | */ | 1336 | */ |
1354 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | 1337 | alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); |
1355 | 1338 | ||
1356 | /* IPI for invalidation */ | 1339 | /* IPI for invalidation */ |
1357 | set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | 1340 | alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); |
1358 | 1341 | ||
1359 | /* IPI for generic function call */ | 1342 | /* IPI for generic function call */ |
1360 | set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | 1343 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); |
1344 | |||
1345 | /* IPI for single call function */ | ||
1346 | set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, | ||
1347 | call_function_single_interrupt); | ||
1361 | } | 1348 | } |
1362 | #endif | 1349 | #endif |
1363 | 1350 | ||
@@ -1370,15 +1357,15 @@ void __init apic_intr_init(void) | |||
1370 | smp_intr_init(); | 1357 | smp_intr_init(); |
1371 | #endif | 1358 | #endif |
1372 | /* self generated IPI for local APIC timer */ | 1359 | /* self generated IPI for local APIC timer */ |
1373 | set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | 1360 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); |
1374 | 1361 | ||
1375 | /* IPI vectors for APIC spurious and error interrupts */ | 1362 | /* IPI vectors for APIC spurious and error interrupts */ |
1376 | set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 1363 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
1377 | set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 1364 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
1378 | 1365 | ||
1379 | /* thermal monitor LVT interrupt */ | 1366 | /* thermal monitor LVT interrupt */ |
1380 | #ifdef CONFIG_X86_MCE_P4THERMAL | 1367 | #ifdef CONFIG_X86_MCE_P4THERMAL |
1381 | set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | 1368 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); |
1382 | #endif | 1369 | #endif |
1383 | } | 1370 | } |
1384 | 1371 | ||
@@ -1433,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) | |||
1433 | value &= ~APIC_VECTOR_MASK; | 1420 | value &= ~APIC_VECTOR_MASK; |
1434 | value |= APIC_SPIV_APIC_ENABLED; | 1421 | value |= APIC_SPIV_APIC_ENABLED; |
1435 | value |= 0xf; | 1422 | value |= 0xf; |
1436 | apic_write_around(APIC_SPIV, value); | 1423 | apic_write(APIC_SPIV, value); |
1437 | 1424 | ||
1438 | if (!virt_wire_setup) { | 1425 | if (!virt_wire_setup) { |
1439 | /* | 1426 | /* |
@@ -1446,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) | |||
1446 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | 1433 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); |
1447 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | 1434 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; |
1448 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); | 1435 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); |
1449 | apic_write_around(APIC_LVT0, value); | 1436 | apic_write(APIC_LVT0, value); |
1450 | } else { | 1437 | } else { |
1451 | /* Disable LVT0 */ | 1438 | /* Disable LVT0 */ |
1452 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED); | 1439 | apic_write(APIC_LVT0, APIC_LVT_MASKED); |
1453 | } | 1440 | } |
1454 | 1441 | ||
1455 | /* | 1442 | /* |
@@ -1463,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) | |||
1463 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | 1450 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); |
1464 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | 1451 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; |
1465 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | 1452 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); |
1466 | apic_write_around(APIC_LVT1, value); | 1453 | apic_write(APIC_LVT1, value); |
1467 | } | 1454 | } |
1468 | } | 1455 | } |
1469 | 1456 | ||
@@ -1513,6 +1500,9 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1513 | */ | 1500 | */ |
1514 | cpu = 0; | 1501 | cpu = 0; |
1515 | 1502 | ||
1503 | if (apicid > max_physical_apicid) | ||
1504 | max_physical_apicid = apicid; | ||
1505 | |||
1516 | /* | 1506 | /* |
1517 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y | 1507 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y |
1518 | * but we need to work other dependencies like SMP_SUSPEND etc | 1508 | * but we need to work other dependencies like SMP_SUSPEND etc |
@@ -1520,7 +1510,7 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1520 | * if (CPU_HOTPLUG_ENABLED || num_processors > 8) | 1510 | * if (CPU_HOTPLUG_ENABLED || num_processors > 8) |
1521 | * - Ashok Raj <ashok.raj@intel.com> | 1511 | * - Ashok Raj <ashok.raj@intel.com> |
1522 | */ | 1512 | */ |
1523 | if (num_processors > 8) { | 1513 | if (max_physical_apicid >= 8) { |
1524 | switch (boot_cpu_data.x86_vendor) { | 1514 | switch (boot_cpu_data.x86_vendor) { |
1525 | case X86_VENDOR_INTEL: | 1515 | case X86_VENDOR_INTEL: |
1526 | if (!APIC_XAPIC(version)) { | 1516 | if (!APIC_XAPIC(version)) { |
@@ -1534,9 +1524,9 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1534 | } | 1524 | } |
1535 | #ifdef CONFIG_SMP | 1525 | #ifdef CONFIG_SMP |
1536 | /* are we being called early in kernel startup? */ | 1526 | /* are we being called early in kernel startup? */ |
1537 | if (x86_cpu_to_apicid_early_ptr) { | 1527 | if (early_per_cpu_ptr(x86_cpu_to_apicid)) { |
1538 | u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; | 1528 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); |
1539 | u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; | 1529 | u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); |
1540 | 1530 | ||
1541 | cpu_to_apicid[cpu] = apicid; | 1531 | cpu_to_apicid[cpu] = apicid; |
1542 | bios_cpu_apicid[cpu] = apicid; | 1532 | bios_cpu_apicid[cpu] = apicid; |
@@ -1703,15 +1693,15 @@ static void apic_pm_activate(void) { } | |||
1703 | */ | 1693 | */ |
1704 | static int __init parse_lapic(char *arg) | 1694 | static int __init parse_lapic(char *arg) |
1705 | { | 1695 | { |
1706 | enable_local_apic = 1; | 1696 | force_enable_local_apic = 1; |
1707 | return 0; | 1697 | return 0; |
1708 | } | 1698 | } |
1709 | early_param("lapic", parse_lapic); | 1699 | early_param("lapic", parse_lapic); |
1710 | 1700 | ||
1711 | static int __init parse_nolapic(char *arg) | 1701 | static int __init parse_nolapic(char *arg) |
1712 | { | 1702 | { |
1713 | enable_local_apic = -1; | 1703 | disable_apic = 1; |
1714 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); | 1704 | setup_clear_cpu_cap(X86_FEATURE_APIC); |
1715 | return 0; | 1705 | return 0; |
1716 | } | 1706 | } |
1717 | early_param("nolapic", parse_nolapic); | 1707 | early_param("nolapic", parse_nolapic); |
@@ -1740,3 +1730,21 @@ static int __init apic_set_verbosity(char *str) | |||
1740 | } | 1730 | } |
1741 | __setup("apic=", apic_set_verbosity); | 1731 | __setup("apic=", apic_set_verbosity); |
1742 | 1732 | ||
1733 | static int __init lapic_insert_resource(void) | ||
1734 | { | ||
1735 | if (!apic_phys) | ||
1736 | return -1; | ||
1737 | |||
1738 | /* Put local APIC into the resource map. */ | ||
1739 | lapic_resource.start = apic_phys; | ||
1740 | lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; | ||
1741 | insert_resource(&iomem_resource, &lapic_resource); | ||
1742 | |||
1743 | return 0; | ||
1744 | } | ||
1745 | |||
1746 | /* | ||
1747 | * We need to call insert_resource() after e820_reserve_resources(), | ||
1748 | * which uses request_resource() | ||
1749 | */ | ||
1750 | late_initcall(lapic_insert_resource); | ||
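The late_initcall() ordering above matters: e820_reserve_resources() claims its ranges with request_resource(), and insert_resource() can then slot the local APIC page underneath an already-claimed parent range. A minimal sketch of the same pattern, with hypothetical names, an assumed base address, and assumed resource flags:

    #include <linux/init.h>
    #include <linux/ioport.h>
    #include <linux/mm.h>

    /* Hypothetical MMIO window; IORESOURCE_MEM | IORESOURCE_BUSY is an
       assumed choice of flags, not taken from this hunk. */
    static struct resource example_resource = {
            .name  = "Example MMIO",
            .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
    };

    static int __init example_insert_resource(void)
    {
            example_resource.start = 0xfee00000;    /* assumed base */
            example_resource.end   = example_resource.start + PAGE_SIZE - 1;

            /*
             * insert_resource() fits the region into the iomem tree even
             * when an earlier request_resource() already claimed an
             * enclosing range, hence the late_initcall() ordering.
             */
            return insert_resource(&iomem_resource, &example_resource);
    }
    late_initcall(example_insert_resource);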
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd0dc29..7f1f030da7ee 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c | |||
@@ -43,7 +43,7 @@ | |||
43 | #include <mach_ipi.h> | 43 | #include <mach_ipi.h> |
44 | #include <mach_apic.h> | 44 | #include <mach_apic.h> |
45 | 45 | ||
46 | int disable_apic_timer __cpuinitdata; | 46 | static int disable_apic_timer __cpuinitdata; |
47 | static int apic_calibrate_pmtmr __initdata; | 47 | static int apic_calibrate_pmtmr __initdata; |
48 | int disable_apic; | 48 | int disable_apic; |
49 | 49 | ||
@@ -54,7 +54,10 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); | |||
54 | /* | 54 | /* |
55 | * Debug level, exported for io_apic.c | 55 | * Debug level, exported for io_apic.c |
56 | */ | 56 | */ |
57 | int apic_verbosity; | 57 | unsigned int apic_verbosity; |
58 | |||
59 | /* Have we found an MP table */ | ||
60 | int smp_found_config; | ||
58 | 61 | ||
59 | static struct resource lapic_resource = { | 62 | static struct resource lapic_resource = { |
60 | .name = "Local APIC", | 63 | .name = "Local APIC", |
@@ -87,9 +90,6 @@ static unsigned long apic_phys; | |||
87 | 90 | ||
88 | unsigned long mp_lapic_addr; | 91 | unsigned long mp_lapic_addr; |
89 | 92 | ||
90 | DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; | ||
91 | EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); | ||
92 | |||
93 | unsigned int __cpuinitdata maxcpus = NR_CPUS; | 93 | unsigned int __cpuinitdata maxcpus = NR_CPUS; |
94 | /* | 94 | /* |
95 | * Get the LAPIC version | 95 | * Get the LAPIC version |
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void) | |||
314 | 314 | ||
315 | #define TICK_COUNT 100000000 | 315 | #define TICK_COUNT 100000000 |
316 | 316 | ||
317 | static void __init calibrate_APIC_clock(void) | 317 | static int __init calibrate_APIC_clock(void) |
318 | { | 318 | { |
319 | unsigned apic, apic_start; | 319 | unsigned apic, apic_start; |
320 | unsigned long tsc, tsc_start; | 320 | unsigned long tsc, tsc_start; |
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void) | |||
368 | clockevent_delta2ns(0xF, &lapic_clockevent); | 368 | clockevent_delta2ns(0xF, &lapic_clockevent); |
369 | 369 | ||
370 | calibration_result = result / HZ; | 370 | calibration_result = result / HZ; |
371 | |||
372 | /* | ||
373 | * Do a sanity check on the APIC calibration result | ||
374 | */ | ||
375 | if (calibration_result < (1000000 / HZ)) { | ||
376 | printk(KERN_WARNING | ||
377 | "APIC frequency too slow, disabling apic timer\n"); | ||
378 | return -1; | ||
379 | } | ||
380 | |||
381 | return 0; | ||
371 | } | 382 | } |
372 | 383 | ||
373 | /* | 384 | /* |
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void) | |||
394 | } | 405 | } |
395 | 406 | ||
396 | printk(KERN_INFO "Using local APIC timer interrupts.\n"); | 407 | printk(KERN_INFO "Using local APIC timer interrupts.\n"); |
397 | calibrate_APIC_clock(); | 408 | if (calibrate_APIC_clock()) { |
398 | |||
399 | /* | ||
400 | * Do a sanity check on the APIC calibration result | ||
401 | */ | ||
402 | if (calibration_result < (1000000 / HZ)) { | ||
403 | printk(KERN_WARNING | ||
404 | "APIC frequency too slow, disabling apic timer\n"); | ||
405 | /* No broadcast on UP ! */ | 409 | /* No broadcast on UP ! */ |
406 | if (num_possible_cpus() > 1) | 410 | if (num_possible_cpus() > 1) |
407 | setup_APIC_timer(); | 411 | setup_APIC_timer(); |
@@ -417,37 +421,13 @@ void __init setup_boot_APIC_clock(void) | |||
417 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; | 421 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; |
418 | else | 422 | else |
419 | printk(KERN_WARNING "APIC timer registered as dummy," | 423 | printk(KERN_WARNING "APIC timer registered as dummy," |
420 | " due to nmi_watchdog=1!\n"); | 424 | " due to nmi_watchdog=%d!\n", nmi_watchdog); |
421 | 425 | ||
422 | setup_APIC_timer(); | 426 | setup_APIC_timer(); |
423 | } | 427 | } |
424 | 428 | ||
425 | /* | ||
426 | * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the | ||
427 | * C1E flag only in the secondary CPU, so when we detect the wreckage | ||
428 | * we already have enabled the boot CPU local apic timer. Check, if | ||
429 | * disable_apic_timer is set and the DUMMY flag is cleared. If yes, | ||
430 | * set the DUMMY flag again and force the broadcast mode in the | ||
431 | * clockevents layer. | ||
432 | */ | ||
433 | static void __cpuinit check_boot_apic_timer_broadcast(void) | ||
434 | { | ||
435 | if (!disable_apic_timer || | ||
436 | (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) | ||
437 | return; | ||
438 | |||
439 | printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); | ||
440 | lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; | ||
441 | |||
442 | local_irq_enable(); | ||
443 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, | ||
444 | &boot_cpu_physical_apicid); | ||
445 | local_irq_disable(); | ||
446 | } | ||
447 | |||
448 | void __cpuinit setup_secondary_APIC_clock(void) | 429 | void __cpuinit setup_secondary_APIC_clock(void) |
449 | { | 430 | { |
450 | check_boot_apic_timer_broadcast(); | ||
451 | setup_APIC_timer(); | 431 | setup_APIC_timer(); |
452 | } | 432 | } |
453 | 433 | ||
@@ -850,7 +830,6 @@ static void __cpuinit lapic_setup_esr(void) | |||
850 | void __cpuinit end_local_APIC_setup(void) | 830 | void __cpuinit end_local_APIC_setup(void) |
851 | { | 831 | { |
852 | lapic_setup_esr(); | 832 | lapic_setup_esr(); |
853 | nmi_watchdog_default(); | ||
854 | setup_apic_nmi_watchdog(NULL); | 833 | setup_apic_nmi_watchdog(NULL); |
855 | apic_pm_activate(); | 834 | apic_pm_activate(); |
856 | } | 835 | } |
@@ -875,7 +854,7 @@ static int __init detect_init_APIC(void) | |||
875 | 854 | ||
876 | void __init early_init_lapic_mapping(void) | 855 | void __init early_init_lapic_mapping(void) |
877 | { | 856 | { |
878 | unsigned long apic_phys; | 857 | unsigned long phys_addr; |
879 | 858 | ||
880 | /* | 859 | /* |
881 | * If no local APIC can be found then go out | 860 | * If no local APIC can be found then go out |
@@ -884,11 +863,11 @@ void __init early_init_lapic_mapping(void) | |||
884 | if (!smp_found_config) | 863 | if (!smp_found_config) |
885 | return; | 864 | return; |
886 | 865 | ||
887 | apic_phys = mp_lapic_addr; | 866 | phys_addr = mp_lapic_addr; |
888 | 867 | ||
889 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | 868 | set_fixmap_nocache(FIX_APIC_BASE, phys_addr); |
890 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", | 869 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", |
891 | APIC_BASE, apic_phys); | 870 | APIC_BASE, phys_addr); |
892 | 871 | ||
893 | /* | 872 | /* |
894 | * Fetch the APIC ID of the BSP in case we have a | 873 | * Fetch the APIC ID of the BSP in case we have a |
@@ -942,7 +921,9 @@ int __init APIC_init_uniprocessor(void) | |||
942 | 921 | ||
943 | verify_local_APIC(); | 922 | verify_local_APIC(); |
944 | 923 | ||
945 | phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); | 924 | connect_bsp_APIC(); |
925 | |||
926 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); | ||
946 | apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); | 927 | apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); |
947 | 928 | ||
948 | setup_local_APIC(); | 929 | setup_local_APIC(); |
@@ -954,6 +935,8 @@ int __init APIC_init_uniprocessor(void) | |||
954 | if (!skip_ioapic_setup && nr_ioapics) | 935 | if (!skip_ioapic_setup && nr_ioapics) |
955 | enable_IO_APIC(); | 936 | enable_IO_APIC(); |
956 | 937 | ||
938 | if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) | ||
939 | localise_nmi_watchdog(); | ||
957 | end_local_APIC_setup(); | 940 | end_local_APIC_setup(); |
958 | 941 | ||
959 | if (smp_found_config && !skip_ioapic_setup && nr_ioapics) | 942 | if (smp_found_config && !skip_ioapic_setup && nr_ioapics) |
@@ -1021,6 +1004,14 @@ asmlinkage void smp_error_interrupt(void) | |||
1021 | irq_exit(); | 1004 | irq_exit(); |
1022 | } | 1005 | } |
1023 | 1006 | ||
1007 | /** | ||
1008 | * connect_bsp_APIC - attach the APIC to the interrupt system | ||
1009 | */ | ||
1010 | void __init connect_bsp_APIC(void) | ||
1011 | { | ||
1012 | enable_apic_mode(); | ||
1013 | } | ||
1014 | |||
1024 | void disconnect_bsp_APIC(int virt_wire_setup) | 1015 | void disconnect_bsp_APIC(int virt_wire_setup) |
1025 | { | 1016 | { |
1026 | /* Go back to Virtual Wire compatibility mode */ | 1017 | /* Go back to Virtual Wire compatibility mode */ |
@@ -1090,10 +1081,13 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1090 | */ | 1081 | */ |
1091 | cpu = 0; | 1082 | cpu = 0; |
1092 | } | 1083 | } |
1084 | if (apicid > max_physical_apicid) | ||
1085 | max_physical_apicid = apicid; | ||
1086 | |||
1093 | /* are we being called early in kernel startup? */ | 1087 | /* are we being called early in kernel startup? */ |
1094 | if (x86_cpu_to_apicid_early_ptr) { | 1088 | if (early_per_cpu_ptr(x86_cpu_to_apicid)) { |
1095 | u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; | 1089 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); |
1096 | u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; | 1090 | u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); |
1097 | 1091 | ||
1098 | cpu_to_apicid[cpu] = apicid; | 1092 | cpu_to_apicid[cpu] = apicid; |
1099 | bios_cpu_apicid[cpu] = apicid; | 1093 | bios_cpu_apicid[cpu] = apicid; |
@@ -1269,7 +1263,7 @@ __cpuinit int apic_is_clustered_box(void) | |||
1269 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) | 1263 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) |
1270 | return 0; | 1264 | return 0; |
1271 | 1265 | ||
1272 | bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; | 1266 | bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); |
1273 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); | 1267 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); |
1274 | 1268 | ||
1275 | for (i = 0; i < NR_CPUS; i++) { | 1269 | for (i = 0; i < NR_CPUS; i++) { |
@@ -1347,7 +1341,7 @@ early_param("apic", apic_set_verbosity); | |||
1347 | static __init int setup_disableapic(char *str) | 1341 | static __init int setup_disableapic(char *str) |
1348 | { | 1342 | { |
1349 | disable_apic = 1; | 1343 | disable_apic = 1; |
1350 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); | 1344 | setup_clear_cpu_cap(X86_FEATURE_APIC); |
1351 | return 0; | 1345 | return 0; |
1352 | } | 1346 | } |
1353 | early_param("disableapic", setup_disableapic); | 1347 | early_param("disableapic", setup_disableapic); |
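The calibrate_APIC_clock() change earlier in this file folds the post-hoc sanity check into the function itself, so the caller branches on a return status instead of inspecting calibration_result afterwards. A standalone illustration of the refactored shape (HZ and the tick count are assumed sample values, not measurements):

    #include <stdio.h>

    #define HZ 250                  /* assumed kernel config value */

    static long calibration_result;

    /* Returns 0 on success, -1 if the measured rate is implausible. */
    static int calibrate_apic_clock(long result)
    {
            calibration_result = result / HZ;

            /* Below 1,000,000 ticks per second the timer is unusable. */
            if (calibration_result < (1000000 / HZ)) {
                    fprintf(stderr,
                            "APIC frequency too slow, disabling apic timer\n");
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            if (calibrate_apic_clock(50000000))     /* assumed sample */
                    return 1;
            printf("calibration_result = %ld\n", calibration_result);
            return 0;
    }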
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9290e29013..9ee24e6bc4b0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -204,6 +204,7 @@ | |||
204 | #include <linux/module.h> | 204 | #include <linux/module.h> |
205 | 205 | ||
206 | #include <linux/poll.h> | 206 | #include <linux/poll.h> |
207 | #include <linux/smp_lock.h> | ||
207 | #include <linux/types.h> | 208 | #include <linux/types.h> |
208 | #include <linux/stddef.h> | 209 | #include <linux/stddef.h> |
209 | #include <linux/timer.h> | 210 | #include <linux/timer.h> |
@@ -218,7 +219,6 @@ | |||
218 | #include <linux/time.h> | 219 | #include <linux/time.h> |
219 | #include <linux/sched.h> | 220 | #include <linux/sched.h> |
220 | #include <linux/pm.h> | 221 | #include <linux/pm.h> |
221 | #include <linux/pm_legacy.h> | ||
222 | #include <linux/capability.h> | 222 | #include <linux/capability.h> |
223 | #include <linux/device.h> | 223 | #include <linux/device.h> |
224 | #include <linux/kernel.h> | 224 | #include <linux/kernel.h> |
@@ -228,6 +228,7 @@ | |||
228 | #include <linux/suspend.h> | 228 | #include <linux/suspend.h> |
229 | #include <linux/kthread.h> | 229 | #include <linux/kthread.h> |
230 | #include <linux/jiffies.h> | 230 | #include <linux/jiffies.h> |
231 | #include <linux/smp_lock.h> | ||
231 | 232 | ||
232 | #include <asm/system.h> | 233 | #include <asm/system.h> |
233 | #include <asm/uaccess.h> | 234 | #include <asm/uaccess.h> |
@@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender) | |||
1149 | as->event_tail = 0; | 1150 | as->event_tail = 0; |
1150 | } | 1151 | } |
1151 | as->events[as->event_head] = event; | 1152 | as->events[as->event_head] = event; |
1152 | if ((!as->suser) || (!as->writer)) | 1153 | if (!as->suser || !as->writer) |
1153 | continue; | 1154 | continue; |
1154 | switch (event) { | 1155 | switch (event) { |
1155 | case APM_SYS_SUSPEND: | 1156 | case APM_SYS_SUSPEND: |
@@ -1211,9 +1212,9 @@ static int suspend(int vetoable) | |||
1211 | if (err != APM_SUCCESS) | 1212 | if (err != APM_SUCCESS) |
1212 | apm_error("suspend", err); | 1213 | apm_error("suspend", err); |
1213 | err = (err == APM_SUCCESS) ? 0 : -EIO; | 1214 | err = (err == APM_SUCCESS) ? 0 : -EIO; |
1214 | device_power_up(); | 1215 | device_power_up(PMSG_RESUME); |
1215 | local_irq_enable(); | 1216 | local_irq_enable(); |
1216 | device_resume(); | 1217 | device_resume(PMSG_RESUME); |
1217 | queue_event(APM_NORMAL_RESUME, NULL); | 1218 | queue_event(APM_NORMAL_RESUME, NULL); |
1218 | spin_lock(&user_list_lock); | 1219 | spin_lock(&user_list_lock); |
1219 | for (as = user_list; as != NULL; as = as->next) { | 1220 | for (as = user_list; as != NULL; as = as->next) { |
@@ -1238,7 +1239,7 @@ static void standby(void) | |||
1238 | apm_error("standby", err); | 1239 | apm_error("standby", err); |
1239 | 1240 | ||
1240 | local_irq_disable(); | 1241 | local_irq_disable(); |
1241 | device_power_up(); | 1242 | device_power_up(PMSG_RESUME); |
1242 | local_irq_enable(); | 1243 | local_irq_enable(); |
1243 | } | 1244 | } |
1244 | 1245 | ||
@@ -1324,7 +1325,7 @@ static void check_events(void) | |||
1324 | ignore_bounce = 1; | 1325 | ignore_bounce = 1; |
1325 | if ((event != APM_NORMAL_RESUME) | 1326 | if ((event != APM_NORMAL_RESUME) |
1326 | || (ignore_normal_resume == 0)) { | 1327 | || (ignore_normal_resume == 0)) { |
1327 | device_resume(); | 1328 | device_resume(PMSG_RESUME); |
1328 | queue_event(event, NULL); | 1329 | queue_event(event, NULL); |
1329 | } | 1330 | } |
1330 | ignore_normal_resume = 0; | 1331 | ignore_normal_resume = 0; |
@@ -1396,7 +1397,7 @@ static void apm_mainloop(void) | |||
1396 | 1397 | ||
1397 | static int check_apm_user(struct apm_user *as, const char *func) | 1398 | static int check_apm_user(struct apm_user *as, const char *func) |
1398 | { | 1399 | { |
1399 | if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { | 1400 | if (as == NULL || as->magic != APM_BIOS_MAGIC) { |
1400 | printk(KERN_ERR "apm: %s passed bad filp\n", func); | 1401 | printk(KERN_ERR "apm: %s passed bad filp\n", func); |
1401 | return 1; | 1402 | return 1; |
1402 | } | 1403 | } |
@@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait) | |||
1459 | return 0; | 1460 | return 0; |
1460 | } | 1461 | } |
1461 | 1462 | ||
1462 | static int do_ioctl(struct inode *inode, struct file *filp, | 1463 | static long do_ioctl(struct file *filp, u_int cmd, u_long arg) |
1463 | u_int cmd, u_long arg) | ||
1464 | { | 1464 | { |
1465 | struct apm_user *as; | 1465 | struct apm_user *as; |
1466 | int ret; | ||
1466 | 1467 | ||
1467 | as = filp->private_data; | 1468 | as = filp->private_data; |
1468 | if (check_apm_user(as, "ioctl")) | 1469 | if (check_apm_user(as, "ioctl")) |
1469 | return -EIO; | 1470 | return -EIO; |
1470 | if ((!as->suser) || (!as->writer)) | 1471 | if (!as->suser || !as->writer) |
1471 | return -EPERM; | 1472 | return -EPERM; |
1472 | switch (cmd) { | 1473 | switch (cmd) { |
1473 | case APM_IOC_STANDBY: | 1474 | case APM_IOC_STANDBY: |
1475 | lock_kernel(); | ||
1474 | if (as->standbys_read > 0) { | 1476 | if (as->standbys_read > 0) { |
1475 | as->standbys_read--; | 1477 | as->standbys_read--; |
1476 | as->standbys_pending--; | 1478 | as->standbys_pending--; |
@@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp, | |||
1479 | queue_event(APM_USER_STANDBY, as); | 1481 | queue_event(APM_USER_STANDBY, as); |
1480 | if (standbys_pending <= 0) | 1482 | if (standbys_pending <= 0) |
1481 | standby(); | 1483 | standby(); |
1484 | unlock_kernel(); | ||
1482 | break; | 1485 | break; |
1483 | case APM_IOC_SUSPEND: | 1486 | case APM_IOC_SUSPEND: |
1487 | lock_kernel(); | ||
1484 | if (as->suspends_read > 0) { | 1488 | if (as->suspends_read > 0) { |
1485 | as->suspends_read--; | 1489 | as->suspends_read--; |
1486 | as->suspends_pending--; | 1490 | as->suspends_pending--; |
@@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp, | |||
1488 | } else | 1492 | } else |
1489 | queue_event(APM_USER_SUSPEND, as); | 1493 | queue_event(APM_USER_SUSPEND, as); |
1490 | if (suspends_pending <= 0) { | 1494 | if (suspends_pending <= 0) { |
1491 | return suspend(1); | 1495 | ret = suspend(1); |
1492 | } else { | 1496 | } else { |
1493 | as->suspend_wait = 1; | 1497 | as->suspend_wait = 1; |
1494 | wait_event_interruptible(apm_suspend_waitqueue, | 1498 | wait_event_interruptible(apm_suspend_waitqueue, |
1495 | as->suspend_wait == 0); | 1499 | as->suspend_wait == 0); |
1496 | return as->suspend_result; | 1500 | ret = as->suspend_result; |
1497 | } | 1501 | } |
1498 | break; | 1502 | unlock_kernel(); |
1503 | return ret; | ||
1499 | default: | 1504 | default: |
1500 | return -EINVAL; | 1505 | return -ENOTTY; |
1501 | } | 1506 | } |
1502 | return 0; | 1507 | return 0; |
1503 | } | 1508 | } |
@@ -1544,10 +1549,12 @@ static int do_open(struct inode *inode, struct file *filp) | |||
1544 | { | 1549 | { |
1545 | struct apm_user *as; | 1550 | struct apm_user *as; |
1546 | 1551 | ||
1552 | lock_kernel(); | ||
1547 | as = kmalloc(sizeof(*as), GFP_KERNEL); | 1553 | as = kmalloc(sizeof(*as), GFP_KERNEL); |
1548 | if (as == NULL) { | 1554 | if (as == NULL) { |
1549 | printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", | 1555 | printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", |
1550 | sizeof(*as)); | 1556 | sizeof(*as)); |
1557 | unlock_kernel(); | ||
1551 | return -ENOMEM; | 1558 | return -ENOMEM; |
1552 | } | 1559 | } |
1553 | as->magic = APM_BIOS_MAGIC; | 1560 | as->magic = APM_BIOS_MAGIC; |
@@ -1569,6 +1576,7 @@ static int do_open(struct inode *inode, struct file *filp) | |||
1569 | user_list = as; | 1576 | user_list = as; |
1570 | spin_unlock(&user_list_lock); | 1577 | spin_unlock(&user_list_lock); |
1571 | filp->private_data = as; | 1578 | filp->private_data = as; |
1579 | unlock_kernel(); | ||
1572 | return 0; | 1580 | return 0; |
1573 | } | 1581 | } |
1574 | 1582 | ||
@@ -1860,7 +1868,7 @@ static const struct file_operations apm_bios_fops = { | |||
1860 | .owner = THIS_MODULE, | 1868 | .owner = THIS_MODULE, |
1861 | .read = do_read, | 1869 | .read = do_read, |
1862 | .poll = do_poll, | 1870 | .poll = do_poll, |
1863 | .ioctl = do_ioctl, | 1871 | .unlocked_ioctl = do_ioctl, |
1864 | .open = do_open, | 1872 | .open = do_open, |
1865 | .release = do_release, | 1873 | .release = do_release, |
1866 | }; | 1874 | }; |
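The apm_32.c hunks above convert the driver from the old .ioctl hook to .unlocked_ioctl: the handler signature loses the inode argument, the Big Kernel Lock is taken explicitly with lock_kernel()/unlock_kernel() around the commands that still need it, and unknown commands now return -ENOTTY rather than -EINVAL. A minimal sketch of the same conversion, with a hypothetical command number and helper:

    #include <linux/errno.h>
    #include <linux/fs.h>
    #include <linux/ioctl.h>
    #include <linux/module.h>
    #include <linux/smp_lock.h>

    #define EXAMPLE_IOC_DOIT _IO('E', 0)    /* hypothetical command */

    static long do_the_work(unsigned long arg)
    {
            return 0;                       /* hypothetical helper */
    }

    static long example_ioctl(struct file *filp, unsigned int cmd,
                              unsigned long arg)
    {
            long ret;

            switch (cmd) {
            case EXAMPLE_IOC_DOIT:
                    lock_kernel();          /* BKL taken explicitly now */
                    ret = do_the_work(arg);
                    unlock_kernel();
                    break;
            default:
                    return -ENOTTY;         /* convention for unknown ioctls */
            }
            return ret;
    }

    static const struct file_operations example_fops = {
            .owner          = THIS_MODULE,
            .unlocked_ioctl = example_ioctl,
    };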
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 92588083950f..6649d09ad88f 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -111,7 +111,7 @@ void foo(void) | |||
111 | OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); | 111 | OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); |
112 | OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); | 112 | OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); |
113 | OFFSET(PV_CPU_iret, pv_cpu_ops, iret); | 113 | OFFSET(PV_CPU_iret, pv_cpu_ops, iret); |
114 | OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); | 114 | OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); |
115 | OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); | 115 | OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); |
116 | #endif | 116 | #endif |
117 | 117 | ||
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f126c05d6170..aa89387006fe 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -18,6 +18,8 @@ | |||
18 | #include <asm/ia32.h> | 18 | #include <asm/ia32.h> |
19 | #include <asm/bootparam.h> | 19 | #include <asm/bootparam.h> |
20 | 20 | ||
21 | #include <xen/interface/xen.h> | ||
22 | |||
21 | #define __NO_STUBS 1 | 23 | #define __NO_STUBS 1 |
22 | #undef __SYSCALL | 24 | #undef __SYSCALL |
23 | #undef _ASM_X86_64_UNISTD_H_ | 25 | #undef _ASM_X86_64_UNISTD_H_ |
@@ -34,7 +36,7 @@ int main(void) | |||
34 | ENTRY(pid); | 36 | ENTRY(pid); |
35 | BLANK(); | 37 | BLANK(); |
36 | #undef ENTRY | 38 | #undef ENTRY |
37 | #define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) | 39 | #define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry)) |
38 | ENTRY(flags); | 40 | ENTRY(flags); |
39 | ENTRY(addr_limit); | 41 | ENTRY(addr_limit); |
40 | ENTRY(preempt_count); | 42 | ENTRY(preempt_count); |
@@ -61,8 +63,11 @@ int main(void) | |||
61 | OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); | 63 | OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); |
62 | OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); | 64 | OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); |
63 | OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); | 65 | OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); |
66 | OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); | ||
64 | OFFSET(PV_CPU_iret, pv_cpu_ops, iret); | 67 | OFFSET(PV_CPU_iret, pv_cpu_ops, iret); |
65 | OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); | 68 | OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); |
69 | OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); | ||
70 | OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); | ||
66 | OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); | 71 | OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); |
67 | OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); | 72 | OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); |
68 | #endif | 73 | #endif |
@@ -128,5 +133,14 @@ int main(void) | |||
128 | OFFSET(BP_loadflags, boot_params, hdr.loadflags); | 133 | OFFSET(BP_loadflags, boot_params, hdr.loadflags); |
129 | OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); | 134 | OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); |
130 | OFFSET(BP_version, boot_params, hdr.version); | 135 | OFFSET(BP_version, boot_params, hdr.version); |
136 | |||
137 | BLANK(); | ||
138 | DEFINE(PAGE_SIZE_asm, PAGE_SIZE); | ||
139 | #ifdef CONFIG_XEN | ||
140 | BLANK(); | ||
141 | OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); | ||
142 | OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); | ||
143 | #undef ENTRY | ||
144 | #endif | ||
131 | return 0; | 145 | return 0; |
132 | } | 146 | } |
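asm-offsets_64.c is never linked into the kernel; it is compiled to assembly, and each OFFSET()/DEFINE() invocation plants a "->" marker that the build scripts rewrite into a #define usable from .S files, which is why the threadinfo_ to TI_ rename above is purely a rename of generated constants. A simplified, self-contained demonstration of the technique (the struct and symbol names are invented; build with gcc -S and inspect the output):

    #include <stddef.h>

    struct thread_info_demo {
            unsigned long flags;
            int           preempt_count;
    };

    /* Emit "->SYM value" into the generated assembly; a sed pass in
       the real build turns these lines into "#define SYM value". */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " : : "i" (val))
    #define OFFSET(sym, str, mem) \
            DEFINE(sym, offsetof(struct str, mem))

    void foo(void)
    {
            OFFSET(TI_flags, thread_info_demo, flags);
            OFFSET(TI_preempt_count, thread_info_demo, preempt_count);
    }

Any assembly consumer of the old threadinfo_* names presumably moves to the TI_* names in lockstep with this rename.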
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c new file mode 100644 index 000000000000..c639bd55391c --- /dev/null +++ b/arch/x86/kernel/bios_uv.c | |||
@@ -0,0 +1,48 @@ | |||
1 | /* | ||
2 | * BIOS run time interface routines. | ||
3 | * | ||
4 | * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | |||
21 | #include <asm/uv/bios.h> | ||
22 | |||
23 | const char * | ||
24 | x86_bios_strerror(long status) | ||
25 | { | ||
26 | const char *str; | ||
27 | switch (status) { | ||
28 | case 0: str = "Call completed without error"; break; | ||
29 | case -1: str = "Not implemented"; break; | ||
30 | case -2: str = "Invalid argument"; break; | ||
31 | case -3: str = "Call completed with error"; break; | ||
32 | default: str = "Unknown BIOS status code"; break; | ||
33 | } | ||
34 | return str; | ||
35 | } | ||
36 | |||
37 | long | ||
38 | x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, | ||
39 | unsigned long *drift_info) | ||
40 | { | ||
41 | struct uv_bios_retval isrv; | ||
42 | |||
43 | BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); | ||
44 | *ticks_per_second = isrv.v0; | ||
45 | *drift_info = isrv.v1; | ||
46 | return isrv.status; | ||
47 | } | ||
48 | EXPORT_SYMBOL_GPL(x86_bios_freq_base); | ||
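A hedged sketch of how a caller might use the two routines added above; BIOS_FREQ_BASE_REALTIME_CLOCK is an assumed selector for the 'which' argument and is not defined in this hunk:

    #include <linux/kernel.h>
    #include <asm/uv/bios.h>

    static void report_rtc_freq(void)
    {
            unsigned long ticks_per_second, drift_info;
            long status;

            status = x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
                                        &ticks_per_second, &drift_info);
            if (status)
                    printk(KERN_ERR "UV BIOS call failed: %s\n",
                           x86_bios_strerror(status));
            else
                    printk(KERN_INFO "RTC base: %lu ticks/s\n",
                           ticks_per_second);
    }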
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a0c6f8190887..ee76eaad3001 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -6,11 +6,15 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o | |||
6 | obj-y += proc.o feature_names.o | 6 | obj-y += proc.o feature_names.o |
7 | 7 | ||
8 | obj-$(CONFIG_X86_32) += common.o bugs.o | 8 | obj-$(CONFIG_X86_32) += common.o bugs.o |
9 | obj-$(CONFIG_X86_64) += common_64.o bugs_64.o | ||
9 | obj-$(CONFIG_X86_32) += amd.o | 10 | obj-$(CONFIG_X86_32) += amd.o |
11 | obj-$(CONFIG_X86_64) += amd_64.o | ||
10 | obj-$(CONFIG_X86_32) += cyrix.o | 12 | obj-$(CONFIG_X86_32) += cyrix.o |
11 | obj-$(CONFIG_X86_32) += centaur.o | 13 | obj-$(CONFIG_X86_32) += centaur.o |
14 | obj-$(CONFIG_X86_64) += centaur_64.o | ||
12 | obj-$(CONFIG_X86_32) += transmeta.o | 15 | obj-$(CONFIG_X86_32) += transmeta.o |
13 | obj-$(CONFIG_X86_32) += intel.o | 16 | obj-$(CONFIG_X86_32) += intel.o |
17 | obj-$(CONFIG_X86_64) += intel_64.o | ||
14 | obj-$(CONFIG_X86_32) += umc.o | 18 | obj-$(CONFIG_X86_32) += umc.o |
15 | 19 | ||
16 | obj-$(CONFIG_X86_MCE) += mcheck/ | 20 | obj-$(CONFIG_X86_MCE) += mcheck/ |
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index c2e1ce33c7cb..84a8220a6072 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c | |||
@@ -1,9 +1,7 @@ | |||
1 | |||
2 | /* | 1 | /* |
3 | * Routines to identify additional cpu features that are scattered in | 2 | * Routines to identify additional cpu features that are scattered in
4 | * cpuid space. | 3 | * cpuid space. |
5 | */ | 4 | */ |
6 | |||
7 | #include <linux/cpu.h> | 5 | #include <linux/cpu.h> |
8 | 6 | ||
9 | #include <asm/pat.h> | 7 | #include <asm/pat.h> |
@@ -53,19 +51,20 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | |||
53 | #ifdef CONFIG_X86_PAT | 51 | #ifdef CONFIG_X86_PAT |
54 | void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) | 52 | void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) |
55 | { | 53 | { |
54 | if (!cpu_has_pat) | ||
55 | pat_disable("PAT not supported by CPU."); | ||
56 | |||
56 | switch (c->x86_vendor) { | 57 | switch (c->x86_vendor) { |
57 | case X86_VENDOR_AMD: | ||
58 | if (c->x86 >= 0xf && c->x86 <= 0x11) | ||
59 | return; | ||
60 | break; | ||
61 | case X86_VENDOR_INTEL: | 58 | case X86_VENDOR_INTEL: |
62 | if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) | 59 | if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) |
63 | return; | 60 | return; |
64 | break; | 61 | break; |
62 | case X86_VENDOR_AMD: | ||
63 | case X86_VENDOR_CENTAUR: | ||
64 | case X86_VENDOR_TRANSMETA: | ||
65 | return; | ||
65 | } | 66 | } |
66 | 67 | ||
67 | pat_disable(cpu_has_pat ? | 68 | pat_disable("PAT disabled. Not yet verified on this CPU type."); |
68 | "PAT disabled. Not yet verified on this CPU type." : | ||
69 | "PAT not supported by CPU."); | ||
70 | } | 69 | } |
71 | #endif | 70 | #endif |
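The reordered validate_pat_support() now tests the CPUID feature bit first and only then consults the vendor whitelist. The bit in question is CPUID leaf 1, EDX bit 16; a userspace check of the same bit, assuming GCC's <cpuid.h>:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return 1;

            /* PAT support is reported in CPUID.01H:EDX bit 16. */
            printf("PAT %ssupported by this CPU\n",
                   (edx & (1u << 16)) ? "" : "not ");
            return 0;
    }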
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 245866828294..cae9cabc3031 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -24,45 +24,6 @@ | |||
24 | extern void vide(void); | 24 | extern void vide(void); |
25 | __asm__(".align 4\nvide: ret"); | 25 | __asm__(".align 4\nvide: ret"); |
26 | 26 | ||
27 | #ifdef CONFIG_X86_LOCAL_APIC | ||
28 | #define ENABLE_C1E_MASK 0x18000000 | ||
29 | #define CPUID_PROCESSOR_SIGNATURE 1 | ||
30 | #define CPUID_XFAM 0x0ff00000 | ||
31 | #define CPUID_XFAM_K8 0x00000000 | ||
32 | #define CPUID_XFAM_10H 0x00100000 | ||
33 | #define CPUID_XFAM_11H 0x00200000 | ||
34 | #define CPUID_XMOD 0x000f0000 | ||
35 | #define CPUID_XMOD_REV_F 0x00040000 | ||
36 | |||
37 | /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ | ||
38 | static __cpuinit int amd_apic_timer_broken(void) | ||
39 | { | ||
40 | u32 lo, hi; | ||
41 | u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); | ||
42 | switch (eax & CPUID_XFAM) { | ||
43 | case CPUID_XFAM_K8: | ||
44 | if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) | ||
45 | break; | ||
46 | case CPUID_XFAM_10H: | ||
47 | case CPUID_XFAM_11H: | ||
48 | rdmsr(MSR_K8_ENABLE_C1E, lo, hi); | ||
49 | if (lo & ENABLE_C1E_MASK) { | ||
50 | if (smp_processor_id() != boot_cpu_physical_apicid) | ||
51 | printk(KERN_INFO "AMD C1E detected late. " | ||
52 | " Force timer broadcast.\n"); | ||
53 | return 1; | ||
54 | } | ||
55 | break; | ||
56 | default: | ||
57 | /* err on the side of caution */ | ||
58 | return 1; | ||
59 | } | ||
60 | return 0; | ||
61 | } | ||
62 | #endif | ||
63 | |||
64 | int force_mwait __cpuinitdata; | ||
65 | |||
66 | static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | 27 | static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) |
67 | { | 28 | { |
68 | if (cpuid_eax(0x80000000) >= 0x80000007) { | 29 | if (cpuid_eax(0x80000000) >= 0x80000007) { |
@@ -297,11 +258,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
297 | num_cache_leaves = 3; | 258 | num_cache_leaves = 3; |
298 | } | 259 | } |
299 | 260 | ||
300 | #ifdef CONFIG_X86_LOCAL_APIC | ||
301 | if (amd_apic_timer_broken()) | ||
302 | local_apic_timer_disabled = 1; | ||
303 | #endif | ||
304 | |||
305 | /* K6s reports MCEs but don't actually have all the MSRs */ | 261 | /* K6s reports MCEs but don't actually have all the MSRs */ |
306 | if (c->x86 < 6) | 262 | if (c->x86 < 6) |
307 | clear_cpu_cap(c, X86_FEATURE_MCE); | 263 | clear_cpu_cap(c, X86_FEATURE_MCE); |
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c new file mode 100644 index 000000000000..d1692b2a41ff --- /dev/null +++ b/arch/x86/kernel/cpu/amd_64.c | |||
@@ -0,0 +1,224 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/mm.h> | ||
3 | |||
4 | #include <asm/numa_64.h> | ||
5 | #include <asm/mmconfig.h> | ||
6 | #include <asm/cacheflush.h> | ||
7 | |||
8 | #include <mach_apic.h> | ||
9 | |||
10 | #include "cpu.h" | ||
11 | |||
12 | int force_mwait __cpuinitdata; | ||
13 | |||
14 | #ifdef CONFIG_NUMA | ||
15 | static int __cpuinit nearby_node(int apicid) | ||
16 | { | ||
17 | int i, node; | ||
18 | |||
19 | for (i = apicid - 1; i >= 0; i--) { | ||
20 | node = apicid_to_node[i]; | ||
21 | if (node != NUMA_NO_NODE && node_online(node)) | ||
22 | return node; | ||
23 | } | ||
24 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | ||
25 | node = apicid_to_node[i]; | ||
26 | if (node != NUMA_NO_NODE && node_online(node)) | ||
27 | return node; | ||
28 | } | ||
29 | return first_node(node_online_map); /* Shouldn't happen */ | ||
30 | } | ||
31 | #endif | ||
32 | |||
33 | /* | ||
34 | * On an AMD dual core setup the lower bits of the APIC id distinguish the cores. | ||
35 | * Assumes number of cores is a power of two. | ||
36 | */ | ||
37 | static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | ||
38 | { | ||
39 | #ifdef CONFIG_SMP | ||
40 | unsigned bits; | ||
41 | #ifdef CONFIG_NUMA | ||
42 | int cpu = smp_processor_id(); | ||
43 | int node = 0; | ||
44 | unsigned apicid = hard_smp_processor_id(); | ||
45 | #endif | ||
46 | bits = c->x86_coreid_bits; | ||
47 | |||
48 | /* Low order bits define the core id (index of core in socket) */ | ||
49 | c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); | ||
50 | /* Convert the initial APIC ID into the socket ID */ | ||
51 | c->phys_proc_id = c->initial_apicid >> bits; | ||
52 | |||
53 | #ifdef CONFIG_NUMA | ||
54 | node = c->phys_proc_id; | ||
55 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | ||
56 | node = apicid_to_node[apicid]; | ||
57 | if (!node_online(node)) { | ||
58 | /* Two possibilities here: | ||
59 | - The CPU is missing memory and no node was created. | ||
60 | In that case try picking one from a nearby CPU | ||
61 | - The APIC IDs differ from the HyperTransport node IDs | ||
62 | which the K8 northbridge parsing fills in. | ||
63 | Assume they are all increased by a constant offset, | ||
64 | but in the same order as the HT nodeids. | ||
65 | If that doesn't result in a usable node fall back to the | ||
66 | path for the previous case. */ | ||
67 | |||
68 | int ht_nodeid = c->initial_apicid; | ||
69 | |||
70 | if (ht_nodeid >= 0 && | ||
71 | apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | ||
72 | node = apicid_to_node[ht_nodeid]; | ||
73 | /* Pick a nearby node */ | ||
74 | if (!node_online(node)) | ||
75 | node = nearby_node(apicid); | ||
76 | } | ||
77 | numa_set_node(cpu, node); | ||
78 | |||
79 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
80 | #endif | ||
81 | #endif | ||
82 | } | ||
83 | |||
84 | static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) | ||
85 | { | ||
86 | #ifdef CONFIG_SMP | ||
87 | unsigned bits, ecx; | ||
88 | |||
89 | /* Multi core CPU? */ | ||
90 | if (c->extended_cpuid_level < 0x80000008) | ||
91 | return; | ||
92 | |||
93 | ecx = cpuid_ecx(0x80000008); | ||
94 | |||
95 | c->x86_max_cores = (ecx & 0xff) + 1; | ||
96 | |||
97 | /* CPU telling us the core id bits shift? */ | ||
98 | bits = (ecx >> 12) & 0xF; | ||
99 | |||
100 | /* Otherwise recompute */ | ||
101 | if (bits == 0) { | ||
102 | while ((1 << bits) < c->x86_max_cores) | ||
103 | bits++; | ||
104 | } | ||
105 | |||
106 | c->x86_coreid_bits = bits; | ||
107 | |||
108 | #endif | ||
109 | } | ||
110 | |||
111 | static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | ||
112 | { | ||
113 | early_init_amd_mc(c); | ||
114 | |||
115 | /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | ||
116 | if (c->x86_power & (1<<8)) | ||
117 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
118 | |||
119 | set_cpu_cap(c, X86_FEATURE_SYSCALL32); | ||
120 | } | ||
121 | |||
122 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | ||
123 | { | ||
124 | unsigned level; | ||
125 | |||
126 | #ifdef CONFIG_SMP | ||
127 | unsigned long value; | ||
128 | |||
129 | /* | ||
130 | * Disable TLB flush filter by setting HWCR.FFDIS on K8 | ||
131 | * bit 6 of msr C001_0015 | ||
132 | * | ||
133 | * Errata 63 for SH-B3 steppings | ||
134 | * Errata 122 for all steppings (F+ have it disabled by default) | ||
135 | */ | ||
136 | if (c->x86 == 0xf) { | ||
137 | rdmsrl(MSR_K8_HWCR, value); | ||
138 | value |= 1 << 6; | ||
139 | wrmsrl(MSR_K8_HWCR, value); | ||
140 | } | ||
141 | #endif | ||
142 | |||
143 | /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; | ||
144 | 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */ | ||
145 | clear_cpu_cap(c, 0*32+31); | ||
146 | |||
147 | /* On C+ stepping K8 rep microcode works well for copy/memset */ | ||
148 | if (c->x86 == 0xf) { | ||
149 | level = cpuid_eax(1); | ||
150 | if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) | ||
151 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
152 | } | ||
153 | if (c->x86 == 0x10 || c->x86 == 0x11) | ||
154 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
155 | |||
156 | /* Enable workaround for FXSAVE leak */ | ||
157 | if (c->x86 >= 6) | ||
158 | set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); | ||
159 | |||
160 | level = get_model_name(c); | ||
161 | if (!level) { | ||
162 | switch (c->x86) { | ||
163 | case 0xf: | ||
164 | /* Should distinguish Models here, but this is only | ||
165 | a fallback anyway. */ | ||
166 | strcpy(c->x86_model_id, "Hammer"); | ||
167 | break; | ||
168 | } | ||
169 | } | ||
170 | display_cacheinfo(c); | ||
171 | |||
172 | /* Multi core CPU? */ | ||
173 | if (c->extended_cpuid_level >= 0x80000008) | ||
174 | amd_detect_cmp(c); | ||
175 | |||
176 | if (c->extended_cpuid_level >= 0x80000006 && | ||
177 | (cpuid_edx(0x80000006) & 0xf000)) | ||
178 | num_cache_leaves = 4; | ||
179 | else | ||
180 | num_cache_leaves = 3; | ||
181 | |||
182 | if (c->x86 >= 0xf && c->x86 <= 0x11) | ||
183 | set_cpu_cap(c, X86_FEATURE_K8); | ||
184 | |||
185 | /* MFENCE stops RDTSC speculation */ | ||
186 | set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); | ||
187 | |||
188 | if (c->x86 == 0x10) { | ||
189 | /* do this for boot cpu */ | ||
190 | if (c == &boot_cpu_data) | ||
191 | check_enable_amd_mmconf_dmi(); | ||
192 | |||
193 | fam10h_check_enable_mmcfg(); | ||
194 | } | ||
195 | |||
196 | if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { | ||
197 | unsigned long long tseg; | ||
198 | |||
199 | /* | ||
200 | * Split up direct mapping around the TSEG SMM area. | ||
201 | * Don't do it for gbpages because there seems very little | ||
202 | * benefit in doing so. | ||
203 | */ | ||
204 | if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { | ||
205 | printk(KERN_DEBUG "tseg: %010llx\n", tseg); | ||
206 | if ((tseg>>PMD_SHIFT) < | ||
207 | (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || | ||
208 | ((tseg>>PMD_SHIFT) < | ||
209 | (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && | ||
210 | (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) | ||
211 | set_memory_4k((unsigned long)__va(tseg), 1); | ||
212 | } | ||
213 | } | ||
214 | } | ||
215 | |||
216 | static struct cpu_dev amd_cpu_dev __cpuinitdata = { | ||
217 | .c_vendor = "AMD", | ||
218 | .c_ident = { "AuthenticAMD" }, | ||
219 | .c_early_init = early_init_amd, | ||
220 | .c_init = init_amd, | ||
221 | }; | ||
222 | |||
223 | cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); | ||
224 | |||
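The core/socket split in amd_detect_cmp() above is pure bit arithmetic on the initial APIC ID. The same arithmetic in isolation, with assumed sample values:

    #include <stdio.h>

    int main(void)
    {
            unsigned int apicid = 0x5;  /* assumed initial APIC ID */
            unsigned int bits   = 1;    /* x86_coreid_bits on a dual-core part */

            /* The low 'bits' bits index the core within the socket... */
            unsigned int cpu_core_id  = apicid & ((1 << bits) - 1);
            /* ...and the remaining high bits identify the socket. */
            unsigned int phys_proc_id = apicid >> bits;

            printf("APIC ID %#x -> socket %u, core %u\n",
                   apicid, phys_proc_id, cpu_core_id);
            return 0;
    }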
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 170d2f5523b2..c9b58a806e85 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c | |||
@@ -59,8 +59,12 @@ static void __init check_fpu(void) | |||
59 | return; | 59 | return; |
60 | } | 60 | } |
61 | 61 | ||
62 | /* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ | 62 | /* |
63 | /* Test for the divl bug.. */ | 63 | * trap_init() enabled FXSR and company _before_ testing for FP |
64 | * problems here. | ||
65 | * | ||
66 | * Test for the divl bug.. | ||
67 | */ | ||
64 | __asm__("fninit\n\t" | 68 | __asm__("fninit\n\t" |
65 | "fldl %1\n\t" | 69 | "fldl %1\n\t" |
66 | "fdivl %2\n\t" | 70 | "fdivl %2\n\t" |
@@ -108,10 +112,15 @@ static void __init check_popad(void) | |||
108 | "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " | 112 | "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " |
109 | : "=&a" (res) | 113 | : "=&a" (res) |
110 | : "d" (inp) | 114 | : "d" (inp) |
111 | : "ecx", "edi" ); | 115 | : "ecx", "edi"); |
112 | /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ | 116 | /* |
113 | if (res != 12345678) printk( "Buggy.\n" ); | 117 | * If this fails, it means that any user program may lock the |
114 | else printk( "OK.\n" ); | 118 | * CPU hard. Too bad. |
119 | */ | ||
120 | if (res != 12345678) | ||
121 | printk("Buggy.\n"); | ||
122 | else | ||
123 | printk("OK.\n"); | ||
115 | #endif | 124 | #endif |
116 | } | 125 | } |
117 | 126 | ||
@@ -122,13 +131,7 @@ static void __init check_popad(void) | |||
122 | * (for due to lack of "invlpg" and working WP on a i386) | 131 | * (for due to lack of "invlpg" and working WP on a i386) |
123 | * - In order to run on anything without a TSC, we need to be | 132 | * - In order to run on anything without a TSC, we need to be |
124 | * compiled for a i486. | 133 | * compiled for a i486. |
125 | * - In order to support the local APIC on a buggy Pentium machine, | 134 | */ |
126 | * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, | ||
127 | * which happens implicitly if compiled for a Pentium or lower | ||
128 | * (unless an advanced selection of CPU features is used) as an | ||
129 | * otherwise config implies a properly working local APIC without | ||
130 | * the need to do extra reads from the APIC. | ||
131 | */ | ||
132 | 135 | ||
133 | static void __init check_config(void) | 136 | static void __init check_config(void) |
134 | { | 137 | { |
@@ -137,25 +140,11 @@ static void __init check_config(void) | |||
137 | * i486+ only features! (WP works in supervisor mode and the | 140 | * i486+ only features! (WP works in supervisor mode and the |
138 | * new "invlpg" and "bswap" instructions) | 141 | * new "invlpg" and "bswap" instructions) |
139 | */ | 142 | */ |
140 | #if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) | 143 | #if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ |
144 | defined(CONFIG_X86_BSWAP) | ||
141 | if (boot_cpu_data.x86 == 3) | 145 | if (boot_cpu_data.x86 == 3) |
142 | panic("Kernel requires i486+ for 'invlpg' and other features"); | 146 | panic("Kernel requires i486+ for 'invlpg' and other features"); |
143 | #endif | 147 | #endif |
144 | |||
145 | /* | ||
146 | * If we were told we had a good local APIC, check for buggy Pentia, | ||
147 | * i.e. all B steppings and the C2 stepping of P54C when using their | ||
148 | * integrated APIC (see 11AP erratum in "Pentium Processor | ||
149 | * Specification Update"). | ||
150 | */ | ||
151 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) | ||
152 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL | ||
153 | && cpu_has_apic | ||
154 | && boot_cpu_data.x86 == 5 | ||
155 | && boot_cpu_data.x86_model == 2 | ||
156 | && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) | ||
157 | panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); | ||
158 | #endif | ||
159 | } | 148 | } |
160 | 149 | ||
161 | 150 | ||
@@ -170,6 +159,7 @@ void __init check_bugs(void) | |||
170 | check_fpu(); | 159 | check_fpu(); |
171 | check_hlt(); | 160 | check_hlt(); |
172 | check_popad(); | 161 | check_popad(); |
173 | init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); | 162 | init_utsname()->machine[1] = |
163 | '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); | ||
174 | alternative_instructions(); | 164 | alternative_instructions(); |
175 | } | 165 | } |
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 9a3ed0649d4e..9a3ed0649d4e 100644 --- a/arch/x86/kernel/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c | |||
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c new file mode 100644 index 000000000000..1d181c40e2e1 --- /dev/null +++ b/arch/x86/kernel/cpu/centaur_64.c | |||
@@ -0,0 +1,35 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/smp.h> | ||
3 | |||
4 | #include <asm/cpufeature.h> | ||
5 | #include <asm/processor.h> | ||
6 | |||
7 | #include "cpu.h" | ||
8 | |||
9 | static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) | ||
10 | { | ||
11 | if (c->x86 == 0x6 && c->x86_model >= 0xf) | ||
12 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
13 | |||
14 | set_cpu_cap(c, X86_FEATURE_SYSENTER32); | ||
15 | } | ||
16 | |||
17 | static void __cpuinit init_centaur(struct cpuinfo_x86 *c) | ||
18 | { | ||
19 | if (c->x86 == 0x6 && c->x86_model >= 0xf) { | ||
20 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
21 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
22 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
23 | } | ||
24 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | ||
25 | } | ||
26 | |||
27 | static struct cpu_dev centaur_cpu_dev __cpuinitdata = { | ||
28 | .c_vendor = "Centaur", | ||
29 | .c_ident = { "CentaurHauls" }, | ||
30 | .c_early_init = early_init_centaur, | ||
31 | .c_init = init_centaur, | ||
32 | }; | ||
33 | |||
34 | cpu_vendor_dev_register(X86_VENDOR_CENTAUR, ¢aur_cpu_dev); | ||
35 | |||
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d0463a946247..80ab20d4fa39 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -427,7 +427,7 @@ __setup("serialnumber", x86_serial_nr_setup); | |||
427 | /* | 427 | /* |
428 | * This does the hard work of actually picking apart the CPU stuff... | 428 | * This does the hard work of actually picking apart the CPU stuff... |
429 | */ | 429 | */ |
430 | void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | 430 | static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) |
431 | { | 431 | { |
432 | int i; | 432 | int i; |
433 | 433 | ||
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c new file mode 100644 index 000000000000..dd6e3f15017e --- /dev/null +++ b/arch/x86/kernel/cpu/common_64.c | |||
@@ -0,0 +1,670 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <linux/bootmem.h> | ||
6 | #include <linux/bitops.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/kgdb.h> | ||
9 | #include <linux/topology.h> | ||
10 | #include <linux/delay.h> | ||
11 | #include <linux/smp.h> | ||
12 | #include <linux/percpu.h> | ||
13 | #include <asm/i387.h> | ||
14 | #include <asm/msr.h> | ||
15 | #include <asm/io.h> | ||
16 | #include <asm/linkage.h> | ||
17 | #include <asm/mmu_context.h> | ||
18 | #include <asm/mtrr.h> | ||
19 | #include <asm/mce.h> | ||
20 | #include <asm/pat.h> | ||
21 | #include <asm/numa.h> | ||
22 | #ifdef CONFIG_X86_LOCAL_APIC | ||
23 | #include <asm/mpspec.h> | ||
24 | #include <asm/apic.h> | ||
25 | #include <mach_apic.h> | ||
26 | #endif | ||
27 | #include <asm/pda.h> | ||
28 | #include <asm/pgtable.h> | ||
29 | #include <asm/processor.h> | ||
30 | #include <asm/desc.h> | ||
31 | #include <asm/atomic.h> | ||
32 | #include <asm/proto.h> | ||
33 | #include <asm/sections.h> | ||
34 | #include <asm/setup.h> | ||
35 | #include <asm/genapic.h> | ||
36 | |||
37 | #include "cpu.h" | ||
38 | |||
39 | /* We need valid kernel segments for data and code in long mode too | ||
40 | * IRET will check the segment types kkeil 2000/10/28 | ||
41 | * Also sysret mandates a special GDT layout | ||
42 | */ | ||
43 | /* The TLS descriptors are currently at a different place compared to i386. | ||
44 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
45 | DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { | ||
46 | [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, | ||
47 | [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, | ||
48 | [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, | ||
49 | [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, | ||
50 | [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, | ||
51 | [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, | ||
52 | } }; | ||
53 | EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); | ||
54 | |||
55 | __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; | ||
56 | |||
57 | /* Current gdt points %fs at the "master" per-cpu area: after this, | ||
58 | * it's on the real one. */ | ||
59 | void switch_to_new_gdt(void) | ||
60 | { | ||
61 | struct desc_ptr gdt_descr; | ||
62 | |||
63 | gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); | ||
64 | gdt_descr.size = GDT_SIZE - 1; | ||
65 | load_gdt(&gdt_descr); | ||
66 | } | ||
67 | |||
68 | struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; | ||
69 | |||
70 | static void __cpuinit default_init(struct cpuinfo_x86 *c) | ||
71 | { | ||
72 | display_cacheinfo(c); | ||
73 | } | ||
74 | |||
75 | static struct cpu_dev __cpuinitdata default_cpu = { | ||
76 | .c_init = default_init, | ||
77 | .c_vendor = "Unknown", | ||
78 | }; | ||
79 | static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; | ||
80 | |||
81 | int __cpuinit get_model_name(struct cpuinfo_x86 *c) | ||
82 | { | ||
83 | unsigned int *v; | ||
84 | |||
85 | if (c->extended_cpuid_level < 0x80000004) | ||
86 | return 0; | ||
87 | |||
88 | v = (unsigned int *) c->x86_model_id; | ||
89 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | ||
90 | cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | ||
91 | cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | ||
92 | c->x86_model_id[48] = 0; | ||
93 | return 1; | ||
94 | } | ||
95 | |||
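get_model_name() above assembles the 48-byte processor brand string from CPUID leaves 0x80000002-0x80000004, sixteen bytes per leaf. A userspace equivalent, assuming GCC's <cpuid.h>:

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int v[12];
            char name[49];

            if (__get_cpuid_max(0x80000000, NULL) < 0x80000004)
                    return 1;       /* brand string not supported */

            /* Each leaf returns 16 bytes of the string in EAX..EDX. */
            __get_cpuid(0x80000002, &v[0], &v[1], &v[2],  &v[3]);
            __get_cpuid(0x80000003, &v[4], &v[5], &v[6],  &v[7]);
            __get_cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);

            memcpy(name, v, 48);
            name[48] = '\0';
            printf("%s\n", name);
            return 0;
    }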
96 | |||
97 | void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | ||
98 | { | ||
99 | unsigned int n, dummy, ebx, ecx, edx; | ||
100 | |||
101 | n = c->extended_cpuid_level; | ||
102 | |||
103 | if (n >= 0x80000005) { | ||
104 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | ||
105 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " | ||
106 | "D cache %dK (%d bytes/line)\n", | ||
107 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | ||
108 | c->x86_cache_size = (ecx>>24) + (edx>>24); | ||
109 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
110 | c->x86_tlbsize = 0; | ||
111 | } | ||
112 | |||
113 | if (n >= 0x80000006) { | ||
114 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | ||
115 | ecx = cpuid_ecx(0x80000006); | ||
116 | c->x86_cache_size = ecx >> 16; | ||
117 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
118 | |||
119 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | ||
120 | c->x86_cache_size, ecx & 0xFF); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | void __cpuinit detect_ht(struct cpuinfo_x86 *c) | ||
125 | { | ||
126 | #ifdef CONFIG_SMP | ||
127 | u32 eax, ebx, ecx, edx; | ||
128 | int index_msb, core_bits; | ||
129 | |||
130 | cpuid(1, &eax, &ebx, &ecx, &edx); | ||
131 | |||
132 | |||
133 | if (!cpu_has(c, X86_FEATURE_HT)) | ||
134 | return; | ||
135 | if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) | ||
136 | goto out; | ||
137 | |||
138 | smp_num_siblings = (ebx & 0xff0000) >> 16; | ||
139 | |||
140 | if (smp_num_siblings == 1) { | ||
141 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | ||
142 | } else if (smp_num_siblings > 1) { | ||
143 | |||
144 | if (smp_num_siblings > NR_CPUS) { | ||
145 | printk(KERN_WARNING "CPU: Unsupported number of " | ||
146 | "siblings %d", smp_num_siblings); | ||
147 | smp_num_siblings = 1; | ||
148 | return; | ||
149 | } | ||
150 | |||
151 | index_msb = get_count_order(smp_num_siblings); | ||
152 | c->phys_proc_id = phys_pkg_id(index_msb); | ||
153 | |||
154 | smp_num_siblings = smp_num_siblings / c->x86_max_cores; | ||
155 | |||
156 | index_msb = get_count_order(smp_num_siblings); | ||
157 | |||
158 | core_bits = get_count_order(c->x86_max_cores); | ||
159 | |||
160 | c->cpu_core_id = phys_pkg_id(index_msb) & | ||
161 | ((1 << core_bits) - 1); | ||
162 | } | ||
163 | out: | ||
164 | if ((c->x86_max_cores * smp_num_siblings) > 1) { | ||
165 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | ||
166 | c->phys_proc_id); | ||
167 | printk(KERN_INFO "CPU: Processor Core ID: %d\n", | ||
168 | c->cpu_core_id); | ||
169 | } | ||
170 | |||
171 | #endif | ||
172 | } | ||
173 | |||
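detect_ht() above recovers package and core IDs from the HT sibling count using get_count_order(), i.e. the smallest n with (1 << n) >= count. The same arithmetic in isolation, treating phys_pkg_id() as a plain right shift of the APIC ID (an assumption made for illustration):

    #include <stdio.h>

    /* Stand-in for the kernel's get_count_order(). */
    static int count_order(unsigned int count)
    {
            int order = 0;

            while ((1u << order) < count)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned int smp_num_siblings = 4;   /* assumed: 2 cores x 2 threads */
            unsigned int x86_max_cores    = 2;
            unsigned int apicid           = 0x7; /* assumed APIC ID */

            int index_msb = count_order(smp_num_siblings);
            unsigned int phys_proc_id = apicid >> index_msb;

            unsigned int threads_per_core = smp_num_siblings / x86_max_cores;
            int core_bits = count_order(x86_max_cores);
            unsigned int cpu_core_id =
                    (apicid >> count_order(threads_per_core)) &
                    ((1 << core_bits) - 1);

            printf("APIC ID %#x -> package %u, core %u\n",
                   apicid, phys_proc_id, cpu_core_id);
            return 0;
    }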
174 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | ||
175 | { | ||
176 | char *v = c->x86_vendor_id; | ||
177 | int i; | ||
178 | static int printed; | ||
179 | |||
180 | for (i = 0; i < X86_VENDOR_NUM; i++) { | ||
181 | if (cpu_devs[i]) { | ||
182 | if (!strcmp(v, cpu_devs[i]->c_ident[0]) || | ||
183 | (cpu_devs[i]->c_ident[1] && | ||
184 | !strcmp(v, cpu_devs[i]->c_ident[1]))) { | ||
185 | c->x86_vendor = i; | ||
186 | this_cpu = cpu_devs[i]; | ||
187 | return; | ||
188 | } | ||
189 | } | ||
190 | } | ||
191 | if (!printed) { | ||
192 | printed++; | ||
193 | printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); | ||
194 | printk(KERN_ERR "CPU: Your system may be unstable.\n"); | ||
195 | } | ||
196 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
197 | } | ||
198 | |||
199 | static void __init early_cpu_support_print(void) | ||
200 | { | ||
201 | int i, j; | ||
202 | struct cpu_dev *cpu_devx; | ||
203 | |||
204 | printk("KERNEL supported cpus:\n"); | ||
205 | for (i = 0; i < X86_VENDOR_NUM; i++) { | ||
206 | cpu_devx = cpu_devs[i]; | ||
207 | if (!cpu_devx) | ||
208 | continue; | ||
209 | for (j = 0; j < 2; j++) { | ||
210 | if (!cpu_devx->c_ident[j]) | ||
211 | continue; | ||
212 | printk(" %s %s\n", cpu_devx->c_vendor, | ||
213 | cpu_devx->c_ident[j]); | ||
214 | } | ||
215 | } | ||
216 | } | ||
217 | |||
218 | static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); | ||
219 | |||
220 | void __init early_cpu_init(void) | ||
221 | { | ||
222 | struct cpu_vendor_dev *cvdev; | ||
223 | |||
224 | for (cvdev = __x86cpuvendor_start ; | ||
225 | cvdev < __x86cpuvendor_end ; | ||
226 | cvdev++) | ||
227 | cpu_devs[cvdev->vendor] = cvdev->cpu_dev; | ||
228 | early_cpu_support_print(); | ||
229 | early_identify_cpu(&boot_cpu_data); | ||
230 | } | ||
231 | |||
232 | /* Do some early cpuid on the boot CPU to get some parameters that are | ||
233 | needed before check_bugs. Everything advanced is in identify_cpu | ||
234 | below. */ | ||
235 | static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) | ||
236 | { | ||
237 | u32 tfms, xlvl; | ||
238 | |||
239 | c->loops_per_jiffy = loops_per_jiffy; | ||
240 | c->x86_cache_size = -1; | ||
241 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
242 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ | ||
243 | c->x86_vendor_id[0] = '\0'; /* Unset */ | ||
244 | c->x86_model_id[0] = '\0'; /* Unset */ | ||
245 | c->x86_clflush_size = 64; | ||
246 | c->x86_cache_alignment = c->x86_clflush_size; | ||
247 | c->x86_max_cores = 1; | ||
248 | c->x86_coreid_bits = 0; | ||
249 | c->extended_cpuid_level = 0; | ||
250 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | ||
251 | |||
252 | /* Get vendor name */ | ||
253 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | ||
254 | (unsigned int *)&c->x86_vendor_id[0], | ||
255 | (unsigned int *)&c->x86_vendor_id[8], | ||
256 | (unsigned int *)&c->x86_vendor_id[4]); | ||
257 | |||
258 | get_cpu_vendor(c); | ||
259 | |||
260 | /* Initialize the standard set of capabilities */ | ||
261 | /* Note that the vendor-specific code below might override */ | ||
262 | |||
263 | /* Intel-defined flags: level 0x00000001 */ | ||
264 | if (c->cpuid_level >= 0x00000001) { | ||
265 | __u32 misc; | ||
266 | cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], | ||
267 | &c->x86_capability[0]); | ||
268 | c->x86 = (tfms >> 8) & 0xf; | ||
269 | c->x86_model = (tfms >> 4) & 0xf; | ||
270 | c->x86_mask = tfms & 0xf; | ||
271 | if (c->x86 == 0xf) | ||
272 | c->x86 += (tfms >> 20) & 0xff; | ||
273 | if (c->x86 >= 0x6) | ||
274 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | ||
275 | if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) | ||
276 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | ||
277 | } else { | ||
278 | /* Have CPUID level 0 only - unheard of */ | ||
279 | c->x86 = 4; | ||
280 | } | ||
281 | |||
282 | c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; | ||
283 | #ifdef CONFIG_SMP | ||
284 | c->phys_proc_id = c->initial_apicid; | ||
285 | #endif | ||
286 | /* AMD-defined flags: level 0x80000001 */ | ||
287 | xlvl = cpuid_eax(0x80000000); | ||
288 | c->extended_cpuid_level = xlvl; | ||
289 | if ((xlvl & 0xffff0000) == 0x80000000) { | ||
290 | if (xlvl >= 0x80000001) { | ||
291 | c->x86_capability[1] = cpuid_edx(0x80000001); | ||
292 | c->x86_capability[6] = cpuid_ecx(0x80000001); | ||
293 | } | ||
294 | if (xlvl >= 0x80000004) | ||
295 | get_model_name(c); /* Default name */ | ||
296 | } | ||
297 | |||
298 | /* Transmeta-defined flags: level 0x80860001 */ | ||
299 | xlvl = cpuid_eax(0x80860000); | ||
300 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
301 | /* Don't set extended_cpuid_level here for now, to avoid confusion. */ | ||
302 | if (xlvl >= 0x80860001) | ||
303 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
304 | } | ||
305 | |||
306 | if (c->extended_cpuid_level >= 0x80000007) | ||
307 | c->x86_power = cpuid_edx(0x80000007); | ||
308 | |||
309 | if (c->extended_cpuid_level >= 0x80000008) { | ||
310 | u32 eax = cpuid_eax(0x80000008); | ||
311 | |||
312 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
313 | c->x86_phys_bits = eax & 0xff; | ||
314 | } | ||
315 | |||
316 | if (c->x86_vendor != X86_VENDOR_UNKNOWN && | ||
317 | cpu_devs[c->x86_vendor]->c_early_init) | ||
318 | cpu_devs[c->x86_vendor]->c_early_init(c); | ||
319 | |||
320 | validate_pat_support(c); | ||
321 | } | ||
322 | |||
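The family/model/stepping decode in early_identify_cpu() follows the usual extended-family/extended-model rules. A small worked sketch with a hypothetical CPUID.1:EAX signature:

#include <stdio.h>

int main(void)
{
	unsigned int tfms = 0x000006fb;	/* hypothetical signature */
	unsigned int family = (tfms >> 8) & 0xf;
	unsigned int model = (tfms >> 4) & 0xf;
	unsigned int stepping = tfms & 0xf;

	if (family == 0xf)			/* extended family field */
		family += (tfms >> 20) & 0xff;
	if (family >= 0x6)			/* extended model field */
		model += ((tfms >> 16) & 0xf) << 4;

	/* -> family 0x6 model 0xf stepping 0xb */
	printf("family %#x model %#x stepping %#x\n", family, model, stepping);
	return 0;
}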
323 | /* | ||
324 | * This does the hard work of actually picking apart the CPU stuff... | ||
325 | */ | ||
326 | static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | ||
327 | { | ||
328 | int i; | ||
329 | |||
330 | early_identify_cpu(c); | ||
331 | |||
332 | init_scattered_cpuid_features(c); | ||
333 | |||
334 | c->apicid = phys_pkg_id(0); | ||
335 | |||
336 | /* | ||
337 | * Vendor-specific initialization. In this section we | ||
338 | * canonicalize the feature flags: if the CPU supports | ||
339 | * features that CPUID doesn't report, if CPUID claims | ||
340 | * incorrect flags, or if there are other bugs, we | ||
341 | * handle them here. | ||
342 | * | ||
343 | * At the end of this section, c->x86_capability better | ||
344 | * indicate the features this CPU genuinely supports! | ||
345 | */ | ||
346 | if (this_cpu->c_init) | ||
347 | this_cpu->c_init(c); | ||
348 | |||
349 | detect_ht(c); | ||
350 | |||
351 | /* | ||
352 | * On SMP, boot_cpu_data holds the common feature set between | ||
353 | * all CPUs; so make sure that we indicate which features are | ||
354 | * common between the CPUs. The first time this routine gets | ||
355 | * executed, c == &boot_cpu_data. | ||
356 | */ | ||
357 | if (c != &boot_cpu_data) { | ||
358 | /* AND the already accumulated flags with these */ | ||
359 | for (i = 0; i < NCAPINTS; i++) | ||
360 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | ||
361 | } | ||
362 | |||
363 | /* Clear all flags overridden by options */ | ||
364 | for (i = 0; i < NCAPINTS; i++) | ||
365 | c->x86_capability[i] &= ~cleared_cpu_caps[i]; | ||
366 | |||
367 | #ifdef CONFIG_X86_MCE | ||
368 | mcheck_init(c); | ||
369 | #endif | ||
370 | select_idle_routine(c); | ||
371 | |||
372 | #ifdef CONFIG_NUMA | ||
373 | numa_add_cpu(smp_processor_id()); | ||
374 | #endif | ||
375 | |||
376 | } | ||
377 | |||
378 | void __cpuinit identify_boot_cpu(void) | ||
379 | { | ||
380 | identify_cpu(&boot_cpu_data); | ||
381 | } | ||
382 | |||
383 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | ||
384 | { | ||
385 | BUG_ON(c == &boot_cpu_data); | ||
386 | identify_cpu(c); | ||
387 | mtrr_ap_init(); | ||
388 | } | ||
389 | |||
390 | static __init int setup_noclflush(char *arg) | ||
391 | { | ||
392 | setup_clear_cpu_cap(X86_FEATURE_CLFLSH); | ||
393 | return 1; | ||
394 | } | ||
395 | __setup("noclflush", setup_noclflush); | ||
396 | |||
397 | void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | ||
398 | { | ||
399 | if (c->x86_model_id[0]) | ||
400 | printk(KERN_CONT "%s", c->x86_model_id); | ||
401 | |||
402 | if (c->x86_mask || c->cpuid_level >= 0) | ||
403 | printk(KERN_CONT " stepping %02x\n", c->x86_mask); | ||
404 | else | ||
405 | printk(KERN_CONT "\n"); | ||
406 | } | ||
407 | |||
408 | static __init int setup_disablecpuid(char *arg) | ||
409 | { | ||
410 | int bit; | ||
411 | if (get_option(&arg, &bit) && bit < NCAPINTS*32) | ||
412 | setup_clear_cpu_cap(bit); | ||
413 | else | ||
414 | return 0; | ||
415 | return 1; | ||
416 | } | ||
417 | __setup("clearcpuid=", setup_disablecpuid); | ||
418 | |||
419 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | ||
420 | |||
421 | struct x8664_pda **_cpu_pda __read_mostly; | ||
422 | EXPORT_SYMBOL(_cpu_pda); | ||
423 | |||
424 | struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | ||
425 | |||
426 | char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; | ||
427 | |||
428 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | ||
429 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | ||
430 | |||
431 | static int do_not_nx __cpuinitdata; | ||
432 | |||
433 | /* noexec=on|off | ||
434 | Control non-executable mappings for 64-bit processes. | ||
435 | |||
436 | on Enable (default) | ||
437 | off Disable | ||
438 | */ | ||
439 | static int __init nonx_setup(char *str) | ||
440 | { | ||
441 | if (!str) | ||
442 | return -EINVAL; | ||
443 | if (!strncmp(str, "on", 2)) { | ||
444 | __supported_pte_mask |= _PAGE_NX; | ||
445 | do_not_nx = 0; | ||
446 | } else if (!strncmp(str, "off", 3)) { | ||
447 | do_not_nx = 1; | ||
448 | __supported_pte_mask &= ~_PAGE_NX; | ||
449 | } | ||
450 | return 0; | ||
451 | } | ||
452 | early_param("noexec", nonx_setup); | ||
453 | |||
454 | int force_personality32; | ||
455 | |||
456 | /* noexec32=on|off | ||
457 | Control the non-executable heap for 32-bit processes. | ||
458 | To control the stack too, use noexec=off | ||
459 | |||
460 | on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) | ||
461 | off PROT_READ implies PROT_EXEC | ||
462 | */ | ||
463 | static int __init nonx32_setup(char *str) | ||
464 | { | ||
465 | if (!strcmp(str, "on")) | ||
466 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
467 | else if (!strcmp(str, "off")) | ||
468 | force_personality32 |= READ_IMPLIES_EXEC; | ||
469 | return 1; | ||
470 | } | ||
471 | __setup("noexec32=", nonx32_setup); | ||
472 | |||
473 | void pda_init(int cpu) | ||
474 | { | ||
475 | struct x8664_pda *pda = cpu_pda(cpu); | ||
476 | |||
477 | /* Set up data that may be needed by __get_free_pages early */ | ||
478 | loadsegment(fs, 0); | ||
479 | loadsegment(gs, 0); | ||
480 | /* Memory clobbers used to order PDA accesses */ | ||
481 | mb(); | ||
482 | wrmsrl(MSR_GS_BASE, pda); | ||
483 | mb(); | ||
484 | |||
485 | pda->cpunumber = cpu; | ||
486 | pda->irqcount = -1; | ||
487 | pda->kernelstack = (unsigned long)stack_thread_info() - | ||
488 | PDA_STACKOFFSET + THREAD_SIZE; | ||
489 | pda->active_mm = &init_mm; | ||
490 | pda->mmu_state = 0; | ||
491 | |||
492 | if (cpu == 0) { | ||
493 | /* others are initialized in smpboot.c */ | ||
494 | pda->pcurrent = &init_task; | ||
495 | pda->irqstackptr = boot_cpu_stack; | ||
496 | } else { | ||
497 | pda->irqstackptr = (char *) | ||
498 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
499 | if (!pda->irqstackptr) | ||
500 | panic("cannot allocate irqstack for cpu %d", cpu); | ||
501 | |||
502 | if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) | ||
503 | pda->nodenumber = cpu_to_node(cpu); | ||
504 | } | ||
505 | |||
506 | pda->irqstackptr += IRQSTACKSIZE-64; | ||
507 | } | ||
508 | |||
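Once pda_init() has pointed MSR_GS_BASE at the PDA, a per-CPU field is one gs-relative load away. The sketch below mimics that idiom in user space under loud assumptions: x86-64 Linux only, arch_prctl(ARCH_SET_GS) standing in for the wrmsrl above, and pda_example being an illustrative subset rather than the real struct x8664_pda layout.

#define _GNU_SOURCE
#include <stdio.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

struct pda_example {			/* illustrative subset only */
	int cpunumber;
};

static struct pda_example pda = { .cpunumber = 7 };

int main(void)
{
	int cpu;

	/* Point the GS base at our fake PDA, as pda_init() does with
	 * MSR_GS_BASE in the kernel. */
	syscall(SYS_arch_prctl, ARCH_SET_GS, &pda);

	/* gs-relative load, the core of the kernel's pda accessors */
	asm("movl %%gs:%c1, %0"
	    : "=r" (cpu)
	    : "i" (offsetof(struct pda_example, cpunumber)));

	printf("cpunumber via gs = %d\n", cpu);	/* -> 7 */
	return 0;
}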
509 | char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + | ||
510 | DEBUG_STKSZ] __page_aligned_bss; | ||
511 | |||
512 | extern asmlinkage void ignore_sysret(void); | ||
513 | |||
514 | /* May not be marked __init: used by software suspend */ | ||
515 | void syscall_init(void) | ||
516 | { | ||
517 | /* | ||
518 | * LSTAR and STAR live in a somewhat strange symbiosis: | ||
519 | * they both write to the same internal register. STAR allows | ||
520 | * setting CS/DS, but only a 32-bit target; LSTAR sets the 64-bit rip. | ||
521 | */ | ||
522 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
523 | wrmsrl(MSR_LSTAR, system_call); | ||
524 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
525 | |||
526 | #ifdef CONFIG_IA32_EMULATION | ||
527 | syscall32_cpu_init(); | ||
528 | #endif | ||
529 | |||
530 | /* Flags to clear on syscall */ | ||
531 | wrmsrl(MSR_SYSCALL_MASK, | ||
532 | X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); | ||
533 | } | ||
534 | |||
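For the MSR_STAR write above: SYSRET's CS/SS base selector lives in bits 63:48 and SYSCALL's in bits 47:32. A quick sketch of the value being built; the selector numbers are assumed stand-ins for __USER32_CS and __KERNEL_CS, not values taken from this tree.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t user32_cs = 0x23;	/* assumed stand-in for __USER32_CS */
	uint64_t kernel_cs = 0x10;	/* assumed stand-in for __KERNEL_CS */

	/* SYSRET base in bits 63:48, SYSCALL base in bits 47:32 */
	uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

	printf("MSR_STAR = %#018llx\n", (unsigned long long)star);
	/* -> 0x0023001000000000 */
	return 0;
}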
535 | void __cpuinit check_efer(void) | ||
536 | { | ||
537 | unsigned long efer; | ||
538 | |||
539 | rdmsrl(MSR_EFER, efer); | ||
540 | if (!(efer & EFER_NX) || do_not_nx) | ||
541 | __supported_pte_mask &= ~_PAGE_NX; | ||
542 | } | ||
543 | |||
544 | unsigned long kernel_eflags; | ||
545 | |||
546 | /* | ||
547 | * Copies of the original ist values from the tss are only accessed during | ||
548 | * debugging; no special alignment is required. | ||
549 | */ | ||
550 | DEFINE_PER_CPU(struct orig_ist, orig_ist); | ||
551 | |||
552 | /* | ||
553 | * cpu_init() initializes state that is per-CPU. Some data is already | ||
554 | * initialized (naturally) in the bootstrap process, such as the GDT | ||
555 | * and IDT. We reload them nevertheless; this function acts as a | ||
556 | * 'CPU state barrier': nothing should get across. | ||
557 | * A lot of state is already set up in PDA init. | ||
558 | */ | ||
559 | void __cpuinit cpu_init(void) | ||
560 | { | ||
561 | int cpu = stack_smp_processor_id(); | ||
562 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
563 | struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); | ||
564 | unsigned long v; | ||
565 | char *estacks = NULL; | ||
566 | struct task_struct *me; | ||
567 | int i; | ||
568 | |||
569 | /* CPU 0 is initialised in head64.c */ | ||
570 | if (cpu != 0) | ||
571 | pda_init(cpu); | ||
572 | else | ||
573 | estacks = boot_exception_stacks; | ||
574 | |||
575 | me = current; | ||
576 | |||
577 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
578 | panic("CPU#%d already initialized!\n", cpu); | ||
579 | |||
580 | printk(KERN_INFO "Initializing CPU#%d\n", cpu); | ||
581 | |||
582 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
583 | |||
584 | /* | ||
585 | * Initialize the per-CPU GDT with the boot GDT, | ||
586 | * and set up the GDT descriptor: | ||
587 | */ | ||
588 | |||
589 | switch_to_new_gdt(); | ||
590 | load_idt((const struct desc_ptr *)&idt_descr); | ||
591 | |||
592 | memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); | ||
593 | syscall_init(); | ||
594 | |||
595 | wrmsrl(MSR_FS_BASE, 0); | ||
596 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
597 | barrier(); | ||
598 | |||
599 | check_efer(); | ||
600 | |||
601 | /* | ||
602 | * set up and load the per-CPU TSS | ||
603 | */ | ||
604 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
605 | static const unsigned int order[N_EXCEPTION_STACKS] = { | ||
606 | [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, | ||
607 | [DEBUG_STACK - 1] = DEBUG_STACK_ORDER | ||
608 | }; | ||
609 | if (cpu) { | ||
610 | estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); | ||
611 | if (!estacks) | ||
612 | panic("Cannot allocate exception stack %ld %d\n", | ||
613 | v, cpu); | ||
614 | } | ||
615 | estacks += PAGE_SIZE << order[v]; | ||
616 | orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; | ||
617 | } | ||
618 | |||
619 | t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
620 | /* | ||
621 | * <= is required because the CPU will access up to | ||
622 | * 8 bits beyond the end of the IO permission bitmap. | ||
623 | */ | ||
624 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
625 | t->io_bitmap[i] = ~0UL; | ||
626 | |||
627 | atomic_inc(&init_mm.mm_count); | ||
628 | me->active_mm = &init_mm; | ||
629 | if (me->mm) | ||
630 | BUG(); | ||
631 | enter_lazy_tlb(&init_mm, me); | ||
632 | |||
633 | load_sp0(t, ¤t->thread); | ||
634 | set_tss_desc(cpu, t); | ||
635 | load_TR_desc(); | ||
636 | load_LDT(&init_mm.context); | ||
637 | |||
638 | #ifdef CONFIG_KGDB | ||
639 | /* | ||
640 | * If kgdb is connected, no debug regs should be altered. This | ||
641 | * is only applicable when KGDB and a KGDB I/O module are built | ||
642 | * into the kernel and you are using early debugging with | ||
643 | * kgdbwait. KGDB will control the kernel HW breakpoint registers. | ||
644 | */ | ||
645 | if (kgdb_connected && arch_kgdb_ops.correct_hw_break) | ||
646 | arch_kgdb_ops.correct_hw_break(); | ||
647 | else { | ||
648 | #endif | ||
649 | /* | ||
650 | * Clear all 6 debug registers: | ||
651 | */ | ||
652 | |||
653 | set_debugreg(0UL, 0); | ||
654 | set_debugreg(0UL, 1); | ||
655 | set_debugreg(0UL, 2); | ||
656 | set_debugreg(0UL, 3); | ||
657 | set_debugreg(0UL, 6); | ||
658 | set_debugreg(0UL, 7); | ||
659 | #ifdef CONFIG_KGDB | ||
660 | /* If kgdb is connected, no debug regs should be altered. */ | ||
661 | } | ||
662 | #endif | ||
663 | |||
664 | fpu_init(); | ||
665 | |||
666 | raw_local_save_flags(kernel_eflags); | ||
667 | |||
668 | if (is_uv_system()) | ||
669 | uv_cpu_init(); | ||
670 | } | ||
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 783691b2a738..4d894e8565fe 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h | |||
@@ -1,3 +1,6 @@ | |||
1 | #ifndef ARCH_X86_CPU_H | ||
2 | |||
3 | #define ARCH_X86_CPU_H | ||
1 | 4 | ||
2 | struct cpu_model_info { | 5 | struct cpu_model_info { |
3 | int vendor; | 6 | int vendor; |
@@ -36,3 +39,5 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; | |||
36 | 39 | ||
37 | extern int get_model_name(struct cpuinfo_x86 *c); | 40 | extern int get_model_name(struct cpuinfo_x86 *c); |
38 | extern void display_cacheinfo(struct cpuinfo_x86 *c); | 41 | extern void display_cacheinfo(struct cpuinfo_x86 *c); |
42 | |||
43 | #endif | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index b0c8208df9fa..dd097b835839 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd) | |||
202 | cpumask_t saved_mask = current->cpus_allowed; | 202 | cpumask_t saved_mask = current->cpus_allowed; |
203 | unsigned int i; | 203 | unsigned int i; |
204 | 204 | ||
205 | for_each_cpu_mask(i, cmd->mask) { | 205 | for_each_cpu_mask_nr(i, cmd->mask) { |
206 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); | 206 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); |
207 | do_drv_write(cmd); | 207 | do_drv_write(cmd); |
208 | } | 208 | } |
@@ -451,7 +451,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
451 | 451 | ||
452 | freqs.old = perf->states[perf->state].core_frequency * 1000; | 452 | freqs.old = perf->states[perf->state].core_frequency * 1000; |
453 | freqs.new = data->freq_table[next_state].frequency; | 453 | freqs.new = data->freq_table[next_state].frequency; |
454 | for_each_cpu_mask(i, cmd.mask) { | 454 | for_each_cpu_mask_nr(i, cmd.mask) { |
455 | freqs.cpu = i; | 455 | freqs.cpu = i; |
456 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 456 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
457 | } | 457 | } |
@@ -466,7 +466,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
466 | } | 466 | } |
467 | } | 467 | } |
468 | 468 | ||
469 | for_each_cpu_mask(i, cmd.mask) { | 469 | for_each_cpu_mask_nr(i, cmd.mask) { |
470 | freqs.cpu = i; | 470 | freqs.cpu = i; |
471 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 471 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
472 | } | 472 | } |
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index f03e9153618e..965ea52767ac 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | |||
@@ -26,9 +26,10 @@ | |||
26 | #define NFORCE2_SAFE_DISTANCE 50 | 26 | #define NFORCE2_SAFE_DISTANCE 50 |
27 | 27 | ||
28 | /* Delay in ms between FSB changes */ | 28 | /* Delay in ms between FSB changes */ |
29 | //#define NFORCE2_DELAY 10 | 29 | /* #define NFORCE2_DELAY 10 */ |
30 | 30 | ||
31 | /* nforce2_chipset: | 31 | /* |
32 | * nforce2_chipset: | ||
32 | * FSB is changed using the chipset | 33 | * FSB is changed using the chipset |
33 | */ | 34 | */ |
34 | static struct pci_dev *nforce2_chipset_dev; | 35 | static struct pci_dev *nforce2_chipset_dev; |
@@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev; | |||
36 | /* fid: | 37 | /* fid: |
37 | * multiplier * 10 | 38 | * multiplier * 10 |
38 | */ | 39 | */ |
39 | static int fid = 0; | 40 | static int fid; |
40 | 41 | ||
41 | /* min_fsb, max_fsb: | 42 | /* min_fsb, max_fsb: |
42 | * minimum and maximum FSB (= FSB at boot time) | 43 | * minimum and maximum FSB (= FSB at boot time) |
43 | */ | 44 | */ |
44 | static int min_fsb = 0; | 45 | static int min_fsb; |
45 | static int max_fsb = 0; | 46 | static int max_fsb; |
46 | 47 | ||
47 | MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); | 48 | MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); |
48 | MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); | 49 | MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); |
@@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444); | |||
53 | 54 | ||
54 | MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); | 55 | MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); |
55 | MODULE_PARM_DESC(min_fsb, | 56 | MODULE_PARM_DESC(min_fsb, |
56 | "Minimum FSB to use, if not defined: current FSB - 50"); | 57 | "Minimum FSB to use, if not defined: current FSB - 50"); |
57 | 58 | ||
58 | #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg) | 59 | #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg) |
59 | 60 | ||
@@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb) | |||
139 | 140 | ||
140 | /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ | 141 | /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ |
141 | nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, | 142 | nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, |
142 | 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL); | 143 | 0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL); |
143 | if (!nforce2_sub5) | 144 | if (!nforce2_sub5) |
144 | return 0; | 145 | return 0; |
145 | 146 | ||
@@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb) | |||
147 | fsb /= 1000000; | 148 | fsb /= 1000000; |
148 | 149 | ||
149 | /* Check if PLL register is already set */ | 150 | /* Check if PLL register is already set */ |
150 | pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); | 151 | pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp); |
151 | 152 | ||
152 | if(bootfsb || !temp) | 153 | if (bootfsb || !temp) |
153 | return fsb; | 154 | return fsb; |
154 | 155 | ||
155 | /* Use PLL register FSB value */ | 156 | /* Use PLL register FSB value */ |
156 | pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp); | 157 | pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp); |
157 | fsb = nforce2_calc_fsb(temp); | 158 | fsb = nforce2_calc_fsb(temp); |
158 | 159 | ||
159 | return fsb; | 160 | return fsb; |
@@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb) | |||
184 | } | 185 | } |
185 | 186 | ||
186 | /* First write? Then set actual value */ | 187 | /* First write? Then set actual value */ |
187 | pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); | 188 | pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp); |
188 | if (!temp) { | 189 | if (!temp) { |
189 | pll = nforce2_calc_pll(tfsb); | 190 | pll = nforce2_calc_pll(tfsb); |
190 | 191 | ||
@@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb) | |||
210 | tfsb--; | 211 | tfsb--; |
211 | 212 | ||
212 | /* Calculate the PLL reg. value */ | 213 | /* Calculate the PLL reg. value */ |
213 | if ((pll = nforce2_calc_pll(tfsb)) == -1) | 214 | pll = nforce2_calc_pll(tfsb); |
215 | if (pll == -1) | ||
214 | return -EINVAL; | 216 | return -EINVAL; |
215 | 217 | ||
216 | nforce2_write_pll(pll); | 218 | nforce2_write_pll(pll); |
@@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu) | |||
249 | static int nforce2_target(struct cpufreq_policy *policy, | 251 | static int nforce2_target(struct cpufreq_policy *policy, |
250 | unsigned int target_freq, unsigned int relation) | 252 | unsigned int target_freq, unsigned int relation) |
251 | { | 253 | { |
252 | // unsigned long flags; | 254 | /* unsigned long flags; */ |
253 | struct cpufreq_freqs freqs; | 255 | struct cpufreq_freqs freqs; |
254 | unsigned int target_fsb; | 256 | unsigned int target_fsb; |
255 | 257 | ||
@@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy, | |||
271 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 273 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
272 | 274 | ||
273 | /* Disable IRQs */ | 275 | /* Disable IRQs */ |
274 | //local_irq_save(flags); | 276 | /* local_irq_save(flags); */ |
275 | 277 | ||
276 | if (nforce2_set_fsb(target_fsb) < 0) | 278 | if (nforce2_set_fsb(target_fsb) < 0) |
277 | printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n", | 279 | printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n", |
278 | target_fsb); | 280 | target_fsb); |
279 | else | 281 | else |
280 | dprintk("Changed FSB successfully to %d\n", | 282 | dprintk("Changed FSB successfully to %d\n", |
281 | target_fsb); | 283 | target_fsb); |
282 | 284 | ||
283 | /* Enable IRQs */ | 285 | /* Enable IRQs */ |
284 | //local_irq_restore(flags); | 286 | /* local_irq_restore(flags); */ |
285 | 287 | ||
286 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 288 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
287 | 289 | ||
@@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy) | |||
302 | policy->max = (fsb_pol_max + 1) * fid * 100; | 304 | policy->max = (fsb_pol_max + 1) * fid * 100; |
303 | 305 | ||
304 | cpufreq_verify_within_limits(policy, | 306 | cpufreq_verify_within_limits(policy, |
305 | policy->cpuinfo.min_freq, | 307 | policy->cpuinfo.min_freq, |
306 | policy->cpuinfo.max_freq); | 308 | policy->cpuinfo.max_freq); |
307 | return 0; | 309 | return 0; |
308 | } | 310 | } |
309 | 311 | ||
@@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy) | |||
347 | /* Set maximum FSB to FSB at boot time */ | 349 | /* Set maximum FSB to FSB at boot time */ |
348 | max_fsb = nforce2_fsb_read(1); | 350 | max_fsb = nforce2_fsb_read(1); |
349 | 351 | ||
350 | if(!max_fsb) | 352 | if (!max_fsb) |
351 | return -EIO; | 353 | return -EIO; |
352 | 354 | ||
353 | if (!min_fsb) | 355 | if (!min_fsb) |
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 199e4e05e5dc..f1685fb91fbd 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | |||
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, | |||
122 | return 0; | 122 | return 0; |
123 | 123 | ||
124 | /* notifiers */ | 124 | /* notifiers */ |
125 | for_each_cpu_mask(i, policy->cpus) { | 125 | for_each_cpu_mask_nr(i, policy->cpus) { |
126 | freqs.cpu = i; | 126 | freqs.cpu = i; |
127 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 127 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
128 | } | 128 | } |
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, | |||
130 | /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software | 130 | /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software |
131 | * Developer's Manual, Volume 3 | 131 | * Developer's Manual, Volume 3 |
132 | */ | 132 | */ |
133 | for_each_cpu_mask(i, policy->cpus) | 133 | for_each_cpu_mask_nr(i, policy->cpus) |
134 | cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); | 134 | cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); |
135 | 135 | ||
136 | /* notifiers */ | 136 | /* notifiers */ |
137 | for_each_cpu_mask(i, policy->cpus) { | 137 | for_each_cpu_mask_nr(i, policy->cpus) { |
138 | freqs.cpu = i; | 138 | freqs.cpu = i; |
139 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 139 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
140 | } | 140 | } |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h index f8a63b3664e3..35fb4eaf6e1c 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h | |||
@@ -1,5 +1,4 @@ | |||
1 | /* | 1 | /* |
2 | * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $ | ||
3 | * (C) 2003 Dave Jones. | 2 | * (C) 2003 Dave Jones. |
4 | * | 3 | * |
5 | * Licensed under the terms of the GNU GPL License version 2. | 4 | * Licensed under the terms of the GNU GPL License version 2. |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 206791eb46e3..c45ca6d4dce1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -966,7 +966,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i | |||
966 | freqs.old = find_khz_freq_from_fid(data->currfid); | 966 | freqs.old = find_khz_freq_from_fid(data->currfid); |
967 | freqs.new = find_khz_freq_from_fid(fid); | 967 | freqs.new = find_khz_freq_from_fid(fid); |
968 | 968 | ||
969 | for_each_cpu_mask(i, *(data->available_cores)) { | 969 | for_each_cpu_mask_nr(i, *(data->available_cores)) { |
970 | freqs.cpu = i; | 970 | freqs.cpu = i; |
971 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 971 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
972 | } | 972 | } |
@@ -974,7 +974,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i | |||
974 | res = transition_fid_vid(data, fid, vid); | 974 | res = transition_fid_vid(data, fid, vid); |
975 | freqs.new = find_khz_freq_from_fid(data->currfid); | 975 | freqs.new = find_khz_freq_from_fid(data->currfid); |
976 | 976 | ||
977 | for_each_cpu_mask(i, *(data->available_cores)) { | 977 | for_each_cpu_mask_nr(i, *(data->available_cores)) { |
978 | freqs.cpu = i; | 978 | freqs.cpu = i; |
979 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 979 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
980 | } | 980 | } |
@@ -997,7 +997,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i | |||
997 | freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); | 997 | freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); |
998 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); | 998 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); |
999 | 999 | ||
1000 | for_each_cpu_mask(i, *(data->available_cores)) { | 1000 | for_each_cpu_mask_nr(i, *(data->available_cores)) { |
1001 | freqs.cpu = i; | 1001 | freqs.cpu = i; |
1002 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 1002 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
1003 | } | 1003 | } |
@@ -1005,7 +1005,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i | |||
1005 | res = transition_pstate(data, pstate); | 1005 | res = transition_pstate(data, pstate); |
1006 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); | 1006 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); |
1007 | 1007 | ||
1008 | for_each_cpu_mask(i, *(data->available_cores)) { | 1008 | for_each_cpu_mask_nr(i, *(data->available_cores)) { |
1009 | freqs.cpu = i; | 1009 | freqs.cpu = i; |
1010 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 1010 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
1011 | } | 1011 | } |
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 908dd347c67e..15e13c01cc36 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | |||
@@ -28,7 +28,8 @@ | |||
28 | #define PFX "speedstep-centrino: " | 28 | #define PFX "speedstep-centrino: " |
29 | #define MAINTAINER "cpufreq@lists.linux.org.uk" | 29 | #define MAINTAINER "cpufreq@lists.linux.org.uk" |
30 | 30 | ||
31 | #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) | 31 | #define dprintk(msg...) \ |
32 | cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) | ||
32 | 33 | ||
33 | #define INTEL_MSR_RANGE (0xffff) | 34 | #define INTEL_MSR_RANGE (0xffff) |
34 | 35 | ||
@@ -66,11 +67,12 @@ struct cpu_model | |||
66 | 67 | ||
67 | struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ | 68 | struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ |
68 | }; | 69 | }; |
69 | static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); | 70 | static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, |
71 | const struct cpu_id *x); | ||
70 | 72 | ||
71 | /* Operating points for current CPU */ | 73 | /* Operating points for current CPU */ |
72 | static struct cpu_model *centrino_model[NR_CPUS]; | 74 | static DEFINE_PER_CPU(struct cpu_model *, centrino_model); |
73 | static const struct cpu_id *centrino_cpu[NR_CPUS]; | 75 | static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); |
74 | 76 | ||
75 | static struct cpufreq_driver centrino_driver; | 77 | static struct cpufreq_driver centrino_driver; |
76 | 78 | ||
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) | |||
255 | return -ENOENT; | 257 | return -ENOENT; |
256 | } | 258 | } |
257 | 259 | ||
258 | centrino_model[policy->cpu] = model; | 260 | per_cpu(centrino_model, policy->cpu) = model; |
259 | 261 | ||
260 | dprintk("found \"%s\": max frequency: %dkHz\n", | 262 | dprintk("found \"%s\": max frequency: %dkHz\n", |
261 | model->model_name, model->max_freq); | 263 | model->model_name, model->max_freq); |
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) | |||
264 | } | 266 | } |
265 | 267 | ||
266 | #else | 268 | #else |
267 | static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } | 269 | static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) |
270 | { | ||
271 | return -ENODEV; | ||
272 | } | ||
268 | #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ | 273 | #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ |
269 | 274 | ||
270 | static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) | 275 | static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, |
276 | const struct cpu_id *x) | ||
271 | { | 277 | { |
272 | if ((c->x86 == x->x86) && | 278 | if ((c->x86 == x->x86) && |
273 | (c->x86_model == x->x86_model) && | 279 | (c->x86_model == x->x86_model) && |
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) | |||
286 | * for centrino, as some DSDTs are buggy. | 292 | * for centrino, as some DSDTs are buggy. |
287 | * Ideally, this can be done using the acpi_data structure. | 293 | * Ideally, this can be done using the acpi_data structure. |
288 | */ | 294 | */ |
289 | if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || | 295 | if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || |
290 | (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || | 296 | (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || |
291 | (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { | 297 | (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { |
292 | msr = (msr >> 8) & 0xff; | 298 | msr = (msr >> 8) & 0xff; |
293 | return msr * 100000; | 299 | return msr * 100000; |
294 | } | 300 | } |
295 | 301 | ||
296 | if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) | 302 | if ((!per_cpu(centrino_model, cpu)) || |
303 | (!per_cpu(centrino_model, cpu)->op_points)) | ||
297 | return 0; | 304 | return 0; |
298 | 305 | ||
299 | msr &= 0xffff; | 306 | msr &= 0xffff; |
300 | for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { | 307 | for (i = 0; |
301 | if (msr == centrino_model[cpu]->op_points[i].index) | 308 | per_cpu(centrino_model, cpu)->op_points[i].frequency |
302 | return centrino_model[cpu]->op_points[i].frequency; | 309 | != CPUFREQ_TABLE_END; |
310 | i++) { | ||
311 | if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) | ||
312 | return per_cpu(centrino_model, cpu)-> | ||
313 | op_points[i].frequency; | ||
303 | } | 314 | } |
304 | if (failsafe) | 315 | if (failsafe) |
305 | return centrino_model[cpu]->op_points[i-1].frequency; | 316 | return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; |
306 | else | 317 | else |
307 | return 0; | 318 | return 0; |
308 | } | 319 | } |
@@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) | |||
347 | int i; | 358 | int i; |
348 | 359 | ||
349 | /* Only Intel makes Enhanced Speedstep-capable CPUs */ | 360 | /* Only Intel makes Enhanced Speedstep-capable CPUs */ |
350 | if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) | 361 | if (cpu->x86_vendor != X86_VENDOR_INTEL || |
362 | !cpu_has(cpu, X86_FEATURE_EST)) | ||
351 | return -ENODEV; | 363 | return -ENODEV; |
352 | 364 | ||
353 | if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) | 365 | if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) |
@@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) | |||
361 | break; | 373 | break; |
362 | 374 | ||
363 | if (i != N_IDS) | 375 | if (i != N_IDS) |
364 | centrino_cpu[policy->cpu] = &cpu_ids[i]; | 376 | per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; |
365 | 377 | ||
366 | if (!centrino_cpu[policy->cpu]) { | 378 | if (!per_cpu(centrino_cpu, policy->cpu)) { |
367 | dprintk("found unsupported CPU with " | 379 | dprintk("found unsupported CPU with " |
368 | "Enhanced SpeedStep: send /proc/cpuinfo to " | 380 | "Enhanced SpeedStep: send /proc/cpuinfo to " |
369 | MAINTAINER "\n"); | 381 | MAINTAINER "\n"); |
@@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) | |||
386 | /* check to see if it stuck */ | 398 | /* check to see if it stuck */ |
387 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | 399 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); |
388 | if (!(l & (1<<16))) { | 400 | if (!(l & (1<<16))) { |
389 | printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); | 401 | printk(KERN_INFO PFX |
402 | "couldn't enable Enhanced SpeedStep\n"); | ||
390 | return -ENODEV; | 403 | return -ENODEV; |
391 | } | 404 | } |
392 | } | 405 | } |
393 | 406 | ||
394 | freq = get_cur_freq(policy->cpu); | 407 | freq = get_cur_freq(policy->cpu); |
395 | 408 | policy->cpuinfo.transition_latency = 10000; | |
396 | policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ | 409 | /* 10uS transition latency */ |
397 | policy->cur = freq; | 410 | policy->cur = freq; |
398 | 411 | ||
399 | dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); | 412 | dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); |
400 | 413 | ||
401 | ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); | 414 | ret = cpufreq_frequency_table_cpuinfo(policy, |
415 | per_cpu(centrino_model, policy->cpu)->op_points); | ||
402 | if (ret) | 416 | if (ret) |
403 | return (ret); | 417 | return (ret); |
404 | 418 | ||
405 | cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); | 419 | cpufreq_frequency_table_get_attr( |
420 | per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); | ||
406 | 421 | ||
407 | return 0; | 422 | return 0; |
408 | } | 423 | } |
@@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) | |||
411 | { | 426 | { |
412 | unsigned int cpu = policy->cpu; | 427 | unsigned int cpu = policy->cpu; |
413 | 428 | ||
414 | if (!centrino_model[cpu]) | 429 | if (!per_cpu(centrino_model, cpu)) |
415 | return -ENODEV; | 430 | return -ENODEV; |
416 | 431 | ||
417 | cpufreq_frequency_table_put_attr(cpu); | 432 | cpufreq_frequency_table_put_attr(cpu); |
418 | 433 | ||
419 | centrino_model[cpu] = NULL; | 434 | per_cpu(centrino_model, cpu) = NULL; |
420 | 435 | ||
421 | return 0; | 436 | return 0; |
422 | } | 437 | } |
@@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) | |||
430 | */ | 445 | */ |
431 | static int centrino_verify (struct cpufreq_policy *policy) | 446 | static int centrino_verify (struct cpufreq_policy *policy) |
432 | { | 447 | { |
433 | return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); | 448 | return cpufreq_frequency_table_verify(policy, |
449 | per_cpu(centrino_model, policy->cpu)->op_points); | ||
434 | } | 450 | } |
435 | 451 | ||
436 | /** | 452 | /** |
437 | * centrino_setpolicy - set a new CPUFreq policy | 453 | * centrino_setpolicy - set a new CPUFreq policy |
438 | * @policy: new policy | 454 | * @policy: new policy |
439 | * @target_freq: the target frequency | 455 | * @target_freq: the target frequency |
440 | * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) | 456 | * @relation: how that frequency relates to achieved frequency |
457 | * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) | ||
441 | * | 458 | * |
442 | * Sets a new CPUFreq policy. | 459 | * Sets a new CPUFreq policy. |
443 | */ | 460 | */ |
461 | struct allmasks { | ||
462 | cpumask_t online_policy_cpus; | ||
463 | cpumask_t saved_mask; | ||
464 | cpumask_t set_mask; | ||
465 | cpumask_t covered_cpus; | ||
466 | }; | ||
467 | |||
444 | static int centrino_target (struct cpufreq_policy *policy, | 468 | static int centrino_target (struct cpufreq_policy *policy, |
445 | unsigned int target_freq, | 469 | unsigned int target_freq, |
446 | unsigned int relation) | 470 | unsigned int relation) |
@@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
448 | unsigned int newstate = 0; | 472 | unsigned int newstate = 0; |
449 | unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; | 473 | unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; |
450 | struct cpufreq_freqs freqs; | 474 | struct cpufreq_freqs freqs; |
451 | cpumask_t online_policy_cpus; | ||
452 | cpumask_t saved_mask; | ||
453 | cpumask_t set_mask; | ||
454 | cpumask_t covered_cpus; | ||
455 | int retval = 0; | 475 | int retval = 0; |
456 | unsigned int j, k, first_cpu, tmp; | 476 | unsigned int j, k, first_cpu, tmp; |
457 | 477 | CPUMASK_ALLOC(allmasks); | |
458 | if (unlikely(centrino_model[cpu] == NULL)) | 478 | CPUMASK_PTR(online_policy_cpus, allmasks); |
459 | return -ENODEV; | 479 | CPUMASK_PTR(saved_mask, allmasks); |
480 | CPUMASK_PTR(set_mask, allmasks); | ||
481 | CPUMASK_PTR(covered_cpus, allmasks); | ||
482 | |||
483 | if (unlikely(allmasks == NULL)) | ||
484 | return -ENOMEM; | ||
485 | |||
486 | if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { | ||
487 | retval = -ENODEV; | ||
488 | goto out; | ||
489 | } | ||
460 | 490 | ||
461 | if (unlikely(cpufreq_frequency_table_target(policy, | 491 | if (unlikely(cpufreq_frequency_table_target(policy, |
462 | centrino_model[cpu]->op_points, | 492 | per_cpu(centrino_model, cpu)->op_points, |
463 | target_freq, | 493 | target_freq, |
464 | relation, | 494 | relation, |
465 | &newstate))) { | 495 | &newstate))) { |
466 | return -EINVAL; | 496 | retval = -EINVAL; |
497 | goto out; | ||
467 | } | 498 | } |
468 | 499 | ||
469 | #ifdef CONFIG_HOTPLUG_CPU | 500 | #ifdef CONFIG_HOTPLUG_CPU |
470 | /* cpufreq holds the hotplug lock, so we are safe from here on */ | 501 | /* cpufreq holds the hotplug lock, so we are safe from here on */ |
471 | cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); | 502 | cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus); |
472 | #else | 503 | #else |
473 | online_policy_cpus = policy->cpus; | 504 | *online_policy_cpus = policy->cpus; |
474 | #endif | 505 | #endif |
475 | 506 | ||
476 | saved_mask = current->cpus_allowed; | 507 | *saved_mask = current->cpus_allowed; |
477 | first_cpu = 1; | 508 | first_cpu = 1; |
478 | cpus_clear(covered_cpus); | 509 | cpus_clear(*covered_cpus); |
479 | for_each_cpu_mask(j, online_policy_cpus) { | 510 | for_each_cpu_mask_nr(j, *online_policy_cpus) { |
480 | /* | 511 | /* |
481 | * Support for SMP systems. | 512 | * Support for SMP systems. |
482 | * Make sure we are running on CPU that wants to change freq | 513 | * Make sure we are running on CPU that wants to change freq |
483 | */ | 514 | */ |
484 | cpus_clear(set_mask); | 515 | cpus_clear(*set_mask); |
485 | if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) | 516 | if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) |
486 | cpus_or(set_mask, set_mask, online_policy_cpus); | 517 | cpus_or(*set_mask, *set_mask, *online_policy_cpus); |
487 | else | 518 | else |
488 | cpu_set(j, set_mask); | 519 | cpu_set(j, *set_mask); |
489 | 520 | ||
490 | set_cpus_allowed_ptr(current, &set_mask); | 521 | set_cpus_allowed_ptr(current, set_mask); |
491 | preempt_disable(); | 522 | preempt_disable(); |
492 | if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { | 523 | if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { |
493 | dprintk("couldn't limit to CPUs in this domain\n"); | 524 | dprintk("couldn't limit to CPUs in this domain\n"); |
494 | retval = -EAGAIN; | 525 | retval = -EAGAIN; |
495 | if (first_cpu) { | 526 | if (first_cpu) { |
@@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
500 | break; | 531 | break; |
501 | } | 532 | } |
502 | 533 | ||
503 | msr = centrino_model[cpu]->op_points[newstate].index; | 534 | msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; |
504 | 535 | ||
505 | if (first_cpu) { | 536 | if (first_cpu) { |
506 | rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); | 537 | rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); |
@@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
517 | dprintk("target=%dkHz old=%d new=%d msr=%04x\n", | 548 | dprintk("target=%dkHz old=%d new=%d msr=%04x\n", |
518 | target_freq, freqs.old, freqs.new, msr); | 549 | target_freq, freqs.old, freqs.new, msr); |
519 | 550 | ||
520 | for_each_cpu_mask(k, online_policy_cpus) { | 551 | for_each_cpu_mask_nr(k, *online_policy_cpus) { |
521 | freqs.cpu = k; | 552 | freqs.cpu = k; |
522 | cpufreq_notify_transition(&freqs, | 553 | cpufreq_notify_transition(&freqs, |
523 | CPUFREQ_PRECHANGE); | 554 | CPUFREQ_PRECHANGE); |
@@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
536 | break; | 567 | break; |
537 | } | 568 | } |
538 | 569 | ||
539 | cpu_set(j, covered_cpus); | 570 | cpu_set(j, *covered_cpus); |
540 | preempt_enable(); | 571 | preempt_enable(); |
541 | } | 572 | } |
542 | 573 | ||
543 | for_each_cpu_mask(k, online_policy_cpus) { | 574 | for_each_cpu_mask_nr(k, *online_policy_cpus) { |
544 | freqs.cpu = k; | 575 | freqs.cpu = k; |
545 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 576 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
546 | } | 577 | } |
@@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
553 | * Best effort undo.. | 584 | * Best effort undo.. |
554 | */ | 585 | */ |
555 | 586 | ||
556 | if (!cpus_empty(covered_cpus)) { | 587 | if (!cpus_empty(*covered_cpus)) |
557 | for_each_cpu_mask(j, covered_cpus) { | 588 | for_each_cpu_mask_nr(j, *covered_cpus) { |
558 | set_cpus_allowed_ptr(current, | 589 | set_cpus_allowed_ptr(current, |
559 | &cpumask_of_cpu(j)); | 590 | &cpumask_of_cpu(j)); |
560 | wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); | 591 | wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); |
561 | } | 592 | } |
562 | } | ||
563 | 593 | ||
564 | tmp = freqs.new; | 594 | tmp = freqs.new; |
565 | freqs.new = freqs.old; | 595 | freqs.new = freqs.old; |
566 | freqs.old = tmp; | 596 | freqs.old = tmp; |
567 | for_each_cpu_mask(j, online_policy_cpus) { | 597 | for_each_cpu_mask_nr(j, *online_policy_cpus) { |
568 | freqs.cpu = j; | 598 | freqs.cpu = j; |
569 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 599 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
570 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 600 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
571 | } | 601 | } |
572 | } | 602 | } |
573 | set_cpus_allowed_ptr(current, &saved_mask); | 603 | set_cpus_allowed_ptr(current, saved_mask); |
574 | return 0; | 604 | retval = 0; |
605 | goto out; | ||
575 | 606 | ||
576 | migrate_end: | 607 | migrate_end: |
577 | preempt_enable(); | 608 | preempt_enable(); |
578 | set_cpus_allowed_ptr(current, &saved_mask); | 609 | set_cpus_allowed_ptr(current, saved_mask); |
579 | return 0; | 610 | out: |
611 | CPUMASK_FREE(allmasks); | ||
612 | return retval; | ||
580 | } | 613 | } |
581 | 614 | ||
582 | static struct freq_attr* centrino_attr[] = { | 615 | static struct freq_attr* centrino_attr[] = { |
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 1b50244b1fdf..191f7263c61d 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | |||
@@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy, | |||
279 | 279 | ||
280 | cpus_allowed = current->cpus_allowed; | 280 | cpus_allowed = current->cpus_allowed; |
281 | 281 | ||
282 | for_each_cpu_mask(i, policy->cpus) { | 282 | for_each_cpu_mask_nr(i, policy->cpus) { |
283 | freqs.cpu = i; | 283 | freqs.cpu = i; |
284 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 284 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
285 | } | 285 | } |
@@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy, | |||
292 | /* allow to be run on all CPUs */ | 292 | /* allow to be run on all CPUs */ |
293 | set_cpus_allowed_ptr(current, &cpus_allowed); | 293 | set_cpus_allowed_ptr(current, &cpus_allowed); |
294 | 294 | ||
295 | for_each_cpu_mask(i, policy->cpus) { | 295 | for_each_cpu_mask_nr(i, policy->cpus) { |
296 | freqs.cpu = i; | 296 | freqs.cpu = i; |
297 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 297 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
298 | } | 298 | } |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe9224c51d37..b75f2569b8f8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -226,6 +226,20 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |||
226 | 226 | ||
227 | if (cpu_has_bts) | 227 | if (cpu_has_bts) |
228 | ds_init_intel(c); | 228 | ds_init_intel(c); |
229 | |||
230 | /* | ||
231 | * See if we have a good local APIC by checking for buggy Pentia, | ||
232 | * i.e. all B steppings and the C2 stepping of P54C when using their | ||
233 | * integrated APIC (see 11AP erratum in "Pentium Processor | ||
234 | * Specification Update"). | ||
235 | */ | ||
236 | if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && | ||
237 | (c->x86_mask < 0x6 || c->x86_mask == 0xb)) | ||
238 | set_cpu_cap(c, X86_FEATURE_11AP); | ||
239 | |||
240 | #ifdef CONFIG_X86_NUMAQ | ||
241 | numaq_tsc_disable(); | ||
242 | #endif | ||
229 | } | 243 | } |
230 | 244 | ||
231 | static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) | 245 | static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) |
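The 11AP test added above packs family and model into one value, so 0x520 means family 5, model 2 (the P54C). A quick sanity check of that encoding; the program is only illustrative:

#include <stdio.h>

int main(void)
{
	unsigned int family = 5, model = 2;	/* Pentium P54C */
	unsigned int packed = family << 8 | model << 4;

	printf("packed = %#x\n", packed);	/* -> 0x520 */
	/* B steppings (mask < 0x6) and the C2 stepping (mask == 0xb)
	 * are the ones hit by the 11AP erratum. */
	return 0;
}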
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c new file mode 100644 index 000000000000..1019c58d39f0 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_64.c | |||
@@ -0,0 +1,95 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/smp.h> | ||
3 | #include <asm/processor.h> | ||
4 | #include <asm/ptrace.h> | ||
5 | #include <asm/topology.h> | ||
6 | #include <asm/numa_64.h> | ||
7 | |||
8 | #include "cpu.h" | ||
9 | |||
10 | static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) | ||
11 | { | ||
12 | if ((c->x86 == 0xf && c->x86_model >= 0x03) || | ||
13 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) | ||
14 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
15 | |||
16 | set_cpu_cap(c, X86_FEATURE_SYSENTER32); | ||
17 | } | ||
18 | |||
19 | /* | ||
20 | * find out the number of processor cores on the die | ||
21 | */ | ||
22 | static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) | ||
23 | { | ||
24 | unsigned int eax, t; | ||
25 | |||
26 | if (c->cpuid_level < 4) | ||
27 | return 1; | ||
28 | |||
29 | cpuid_count(4, 0, &eax, &t, &t, &t); | ||
30 | |||
31 | if (eax & 0x1f) | ||
32 | return ((eax >> 26) + 1); | ||
33 | else | ||
34 | return 1; | ||
35 | } | ||
36 | |||
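intel_num_cpu_cores() relies on CPUID leaf 4 (deterministic cache parameters): if EAX reports any cache type at all, bits 31:26 plus one give the maximum number of cores per package. A hedged user-space sketch, assuming a reasonably recent GCC/Clang <cpuid.h> for __get_cpuid_count:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 4, subleaf 0; EAX[4:0] == 0 would mean "no cache info" */
	if (!__get_cpuid_count(4, 0, &eax, &ebx, &ecx, &edx) || !(eax & 0x1f)) {
		printf("no leaf-4 cache info; assume 1 core\n");
		return 0;
	}
	printf("max cores per package: %u\n", (eax >> 26) + 1);
	return 0;
}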
37 | static void __cpuinit srat_detect_node(void) | ||
38 | { | ||
39 | #ifdef CONFIG_NUMA | ||
40 | unsigned node; | ||
41 | int cpu = smp_processor_id(); | ||
42 | int apicid = hard_smp_processor_id(); | ||
43 | |||
44 | /* For now, don't do the funky fallback heuristics that the | ||
45 | AMD version employs. */ | ||
46 | node = apicid_to_node[apicid]; | ||
47 | if (node == NUMA_NO_NODE || !node_online(node)) | ||
48 | node = first_node(node_online_map); | ||
49 | numa_set_node(cpu, node); | ||
50 | |||
51 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
52 | #endif | ||
53 | } | ||
54 | |||
55 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) | ||
56 | { | ||
57 | init_intel_cacheinfo(c); | ||
58 | if (c->cpuid_level > 9) { | ||
59 | unsigned eax = cpuid_eax(10); | ||
60 | /* Check for version and the number of counters */ | ||
61 | if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) | ||
62 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | ||
63 | } | ||
64 | |||
65 | if (cpu_has_ds) { | ||
66 | unsigned int l1, l2; | ||
67 | rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); | ||
68 | if (!(l1 & (1<<11))) | ||
69 | set_cpu_cap(c, X86_FEATURE_BTS); | ||
70 | if (!(l1 & (1<<12))) | ||
71 | set_cpu_cap(c, X86_FEATURE_PEBS); | ||
72 | } | ||
73 | |||
74 | |||
75 | if (cpu_has_bts) | ||
76 | ds_init_intel(c); | ||
77 | |||
78 | if (c->x86 == 15) | ||
79 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
80 | if (c->x86 == 6) | ||
81 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
82 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | ||
83 | c->x86_max_cores = intel_num_cpu_cores(c); | ||
84 | |||
85 | srat_detect_node(); | ||
86 | } | ||
87 | |||
88 | static struct cpu_dev intel_cpu_dev __cpuinitdata = { | ||
89 | .c_vendor = "Intel", | ||
90 | .c_ident = { "GenuineIntel" }, | ||
91 | .c_early_init = early_init_intel, | ||
92 | .c_init = init_intel, | ||
93 | }; | ||
94 | cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); | ||
95 | |||
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 26d615dcb149..6b0a10b002f1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata = | |||
62 | { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ | 62 | { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ |
63 | { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ | 63 | { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ |
64 | { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ | 64 | { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ |
65 | { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ | ||
65 | { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ | 66 | { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ |
66 | { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ | 67 | { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ |
67 | { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ | 68 | { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ |
@@ -488,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) | |||
488 | int sibling; | 489 | int sibling; |
489 | 490 | ||
490 | this_leaf = CPUID4_INFO_IDX(cpu, index); | 491 | this_leaf = CPUID4_INFO_IDX(cpu, index); |
491 | for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { | 492 | for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { |
492 | sibling_leaf = CPUID4_INFO_IDX(sibling, index); | 493 | sibling_leaf = CPUID4_INFO_IDX(sibling, index); |
493 | cpu_clear(cpu, sibling_leaf->shared_cpu_map); | 494 | cpu_clear(cpu, sibling_leaf->shared_cpu_map); |
494 | } | 495 | } |
@@ -779,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
779 | } | 780 | } |
780 | kobject_put(per_cpu(cache_kobject, cpu)); | 781 | kobject_put(per_cpu(cache_kobject, cpu)); |
781 | cpuid4_cache_sysfs_exit(cpu); | 782 | cpuid4_cache_sysfs_exit(cpu); |
782 | break; | 783 | return retval; |
783 | } | 784 | } |
784 | kobject_uevent(&(this_object->kobj), KOBJ_ADD); | 785 | kobject_uevent(&(this_object->kobj), KOBJ_ADD); |
785 | } | 786 | } |
786 | if (!retval) | 787 | cpu_set(cpu, cache_dev_map); |
787 | cpu_set(cpu, cache_dev_map); | ||
788 | 788 | ||
789 | kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); | 789 | kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); |
790 | return retval; | 790 | return 0; |
791 | } | 791 | } |
792 | 792 | ||
793 | static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) | 793 | static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) |
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index e633c9c2b764..f390c9f66351 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c | |||
@@ -9,23 +9,23 @@ | |||
9 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
10 | #include <linux/smp.h> | 10 | #include <linux/smp.h> |
11 | 11 | ||
12 | #include <asm/processor.h> | 12 | #include <asm/processor.h> |
13 | #include <asm/system.h> | 13 | #include <asm/system.h> |
14 | #include <asm/msr.h> | 14 | #include <asm/msr.h> |
15 | 15 | ||
16 | #include "mce.h" | 16 | #include "mce.h" |
17 | 17 | ||
18 | /* Machine Check Handler For AMD Athlon/Duron */ | 18 | /* Machine Check Handler For AMD Athlon/Duron */ |
19 | static void k7_machine_check(struct pt_regs * regs, long error_code) | 19 | static void k7_machine_check(struct pt_regs *regs, long error_code) |
20 | { | 20 | { |
21 | int recover=1; | 21 | int recover = 1; |
22 | u32 alow, ahigh, high, low; | 22 | u32 alow, ahigh, high, low; |
23 | u32 mcgstl, mcgsth; | 23 | u32 mcgstl, mcgsth; |
24 | int i; | 24 | int i; |
25 | 25 | ||
26 | rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 26 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
27 | if (mcgstl & (1<<0)) /* Recoverable ? */ | 27 | if (mcgstl & (1<<0)) /* Recoverable ? */ |
28 | recover=0; | 28 | recover = 0; |
29 | 29 | ||
30 | printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", | 30 | printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", |
31 | smp_processor_id(), mcgsth, mcgstl); | 31 | smp_processor_id(), mcgsth, mcgstl); |
@@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code) | |||
60 | } | 60 | } |
61 | 61 | ||
62 | if (recover&2) | 62 | if (recover&2) |
63 | panic ("CPU context corrupt"); | 63 | panic("CPU context corrupt"); |
64 | if (recover&1) | 64 | if (recover&1) |
65 | panic ("Unable to continue"); | 65 | panic("Unable to continue"); |
66 | printk (KERN_EMERG "Attempting to continue.\n"); | 66 | printk(KERN_EMERG "Attempting to continue.\n"); |
67 | mcgstl &= ~(1<<2); | 67 | mcgstl &= ~(1<<2); |
68 | wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); | 68 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
69 | } | 69 | } |
70 | 70 | ||
71 | 71 | ||
@@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) | |||
81 | machine_check_vector = k7_machine_check; | 81 | machine_check_vector = k7_machine_check; |
82 | wmb(); | 82 | wmb(); |
83 | 83 | ||
84 | printk (KERN_INFO "Intel machine check architecture supported.\n"); | 84 | printk(KERN_INFO "Intel machine check architecture supported.\n"); |
85 | rdmsr (MSR_IA32_MCG_CAP, l, h); | 85 | rdmsr(MSR_IA32_MCG_CAP, l, h); |
86 | if (l & (1<<8)) /* Control register present ? */ | 86 | if (l & (1<<8)) /* Control register present ? */ |
87 | wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 87 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
88 | nr_mce_banks = l & 0xff; | 88 | nr_mce_banks = l & 0xff; |
89 | 89 | ||
90 | /* Clear status for MC index 0 separately, we don't touch CTL, | 90 | /* Clear status for MC index 0 separately, we don't touch CTL, |
91 | * as some K7 Athlons cause spurious MCEs when it's enabled. */ | 91 | * as some K7 Athlons cause spurious MCEs when it's enabled. */
92 | if (boot_cpu_data.x86 == 6) { | 92 | if (boot_cpu_data.x86 == 6) { |
93 | wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); | 93 | wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); |
94 | i = 1; | 94 | i = 1; |
95 | } else | 95 | } else |
96 | i = 0; | 96 | i = 0; |
97 | for (; i<nr_mce_banks; i++) { | 97 | for (; i < nr_mce_banks; i++) { |
98 | wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); | 98 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); |
99 | wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); | 99 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); |
100 | } | 100 | } |
101 | 101 | ||
102 | set_in_cr4 (X86_CR4_MCE); | 102 | set_in_cr4(X86_CR4_MCE); |
103 | printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", | 103 | printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", |
104 | smp_processor_id()); | 104 | smp_processor_id()); |
105 | } | 105 | } |
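The rdmsr()/wrmsr() calls cleaned up above address each 64-bit MSR as two 32-bit halves (low, high); the K7 bank loop writes all-ones into each MCi_CTL to enable every error type and clears each MCi_STATUS. A userspace model of the half-word interface (the real macros compile down to the rdmsr/wrmsr instructions):

    #include <stdio.h>
    #include <stdint.h>

    /* model: a 64-bit MSR accessed as (low, high) 32-bit halves */
    static uint64_t fake_msr;

    static void wrmsr_model(uint32_t lo, uint32_t hi)
    {
            fake_msr = ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
            wrmsr_model(0xffffffff, 0xffffffff); /* enable all error types in MCi_CTL */
            printf("MCi_CTL = %#llx\n", (unsigned long long)fake_msr);
            wrmsr_model(0x0, 0x0);               /* clear MCi_STATUS */
            printf("MCi_STATUS = %#llx\n", (unsigned long long)fake_msr);
            return 0;
    }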
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c068ae0..65a339678ece 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/types.h> | 9 | #include <linux/types.h> |
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
12 | #include <linux/smp_lock.h> | ||
12 | #include <linux/string.h> | 13 | #include <linux/string.h> |
13 | #include <linux/rcupdate.h> | 14 | #include <linux/rcupdate.h> |
14 | #include <linux/kallsyms.h> | 15 | #include <linux/kallsyms.h> |
@@ -31,7 +32,7 @@ | |||
31 | #include <asm/idle.h> | 32 | #include <asm/idle.h> |
32 | 33 | ||
33 | #define MISC_MCELOG_MINOR 227 | 34 | #define MISC_MCELOG_MINOR 227 |
34 | #define NR_BANKS 6 | 35 | #define NR_SYSFS_BANKS 6 |
35 | 36 | ||
36 | atomic_t mce_entry; | 37 | atomic_t mce_entry; |
37 | 38 | ||
@@ -46,7 +47,7 @@ static int mce_dont_init; | |||
46 | */ | 47 | */ |
47 | static int tolerant = 1; | 48 | static int tolerant = 1; |
48 | static int banks; | 49 | static int banks; |
49 | static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; | 50 | static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; |
50 | static unsigned long notify_user; | 51 | static unsigned long notify_user; |
51 | static int rip_msr; | 52 | static int rip_msr; |
52 | static int mce_bootlog = -1; | 53 | static int mce_bootlog = -1; |
@@ -209,7 +210,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
209 | barrier(); | 210 | barrier(); |
210 | 211 | ||
211 | for (i = 0; i < banks; i++) { | 212 | for (i = 0; i < banks; i++) { |
212 | if (!bank[i]) | 213 | if (i < NR_SYSFS_BANKS && !bank[i]) |
213 | continue; | 214 | continue; |
214 | 215 | ||
215 | m.misc = 0; | 216 | m.misc = 0; |

@@ -363,7 +364,7 @@ static void mcheck_check_cpu(void *info) | |||
363 | 364 | ||
364 | static void mcheck_timer(struct work_struct *work) | 365 | static void mcheck_timer(struct work_struct *work) |
365 | { | 366 | { |
366 | on_each_cpu(mcheck_check_cpu, NULL, 1, 1); | 367 | on_each_cpu(mcheck_check_cpu, NULL, 1); |
367 | 368 | ||
368 | /* | 369 | /* |
369 | * Alert userspace if needed. If we logged an MCE, reduce the | 370 | * Alert userspace if needed. If we logged an MCE, reduce the |
@@ -444,9 +445,10 @@ static void mce_init(void *dummy) | |||
444 | 445 | ||
445 | rdmsrl(MSR_IA32_MCG_CAP, cap); | 446 | rdmsrl(MSR_IA32_MCG_CAP, cap); |
446 | banks = cap & 0xff; | 447 | banks = cap & 0xff; |
447 | if (banks > NR_BANKS) { | 448 | if (banks > MCE_EXTENDED_BANK) { |
448 | printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); | 449 | banks = MCE_EXTENDED_BANK; |
449 | banks = NR_BANKS; | 450 | printk(KERN_INFO "MCE: warning: using only %d banks\n", |
451 | MCE_EXTENDED_BANK); | ||
450 | } | 452 | } |
451 | /* Use accurate RIP reporting if available. */ | 453 | /* Use accurate RIP reporting if available. */ |
452 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | 454 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) |
@@ -462,7 +464,11 @@ static void mce_init(void *dummy) | |||
462 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 464 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
463 | 465 | ||
464 | for (i = 0; i < banks; i++) { | 466 | for (i = 0; i < banks; i++) { |
465 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | 467 | if (i < NR_SYSFS_BANKS) |
468 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
469 | else | ||
470 | wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); | ||
471 | |||
466 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 472 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); |
467 | } | 473 | } |
468 | } | 474 | } |
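The loop above is where the NR_BANKS to NR_SYSFS_BANKS split pays off: the hardware bank count read from MCG_CAP no longer has to fit the sysfs-tunable array, because banks at or past NR_SYSFS_BANKS are simply enabled with ~0UL. A runnable model of that split:

    #include <stdio.h>

    #define NR_SYSFS_BANKS 6

    int main(void)
    {
            int banks = 10;                        /* e.g. from MCG_CAP & 0xff */
            unsigned long bank[NR_SYSFS_BANKS] = { /* sysfs-tunable controls */
                    ~0UL, ~0UL, ~0UL, ~0UL, ~0UL, ~0UL };

            for (int i = 0; i < banks; i++) {
                    unsigned long ctl = (i < NR_SYSFS_BANKS) ? bank[i] : ~0UL;
                    printf("MC%d_CTL <- %#lx\n", i, ctl);
            }
            return 0;
    }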
@@ -527,10 +533,12 @@ static int open_exclu; /* already open exclusive? */ | |||
527 | 533 | ||
528 | static int mce_open(struct inode *inode, struct file *file) | 534 | static int mce_open(struct inode *inode, struct file *file) |
529 | { | 535 | { |
536 | lock_kernel(); | ||
530 | spin_lock(&mce_state_lock); | 537 | spin_lock(&mce_state_lock); |
531 | 538 | ||
532 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { | 539 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { |
533 | spin_unlock(&mce_state_lock); | 540 | spin_unlock(&mce_state_lock); |
541 | unlock_kernel(); | ||
534 | return -EBUSY; | 542 | return -EBUSY; |
535 | } | 543 | } |
536 | 544 | ||
@@ -539,6 +547,7 @@ static int mce_open(struct inode *inode, struct file *file) | |||
539 | open_count++; | 547 | open_count++; |
540 | 548 | ||
541 | spin_unlock(&mce_state_lock); | 549 | spin_unlock(&mce_state_lock); |
550 | unlock_kernel(); | ||
542 | 551 | ||
543 | return nonseekable_open(inode, file); | 552 | return nonseekable_open(inode, file); |
544 | } | 553 | } |
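The lock_kernel()/unlock_kernel() pair added to mce_open() is the 2.6.27-era BKL pushdown: the VFS stopped taking the Big Kernel Lock around every ->open(), so character devices that might still rely on it now take it themselves, and every exit path must drop it. The shape of the pattern, modeled with a pthread mutex standing in for the BKL:

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t bkl = PTHREAD_MUTEX_INITIALIZER; /* stands in for the BKL */
    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static int open_exclu;

    static int open_sketch(void)
    {
            pthread_mutex_lock(&bkl);           /* was implicit in the VFS before */
            pthread_mutex_lock(&state_lock);
            if (open_exclu) {
                    pthread_mutex_unlock(&state_lock);
                    pthread_mutex_unlock(&bkl); /* every exit path must drop it */
                    return -1;                  /* ~ -EBUSY */
            }
            open_exclu = 1;
            pthread_mutex_unlock(&state_lock);
            pthread_mutex_unlock(&bkl);
            return 0;
    }

    int main(void) { printf("%d %d\n", open_sketch(), open_sketch()); return 0; }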
@@ -571,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | |||
571 | char __user *buf = ubuf; | 580 | char __user *buf = ubuf; |
572 | int i, err; | 581 | int i, err; |
573 | 582 | ||
574 | cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); | 583 | cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); |
575 | if (!cpu_tsc) | 584 | if (!cpu_tsc) |
576 | return -ENOMEM; | 585 | return -ENOMEM; |
577 | 586 | ||
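Sizing the cpu_tsc buffer by nr_cpu_ids instead of NR_CPUS allocates only for CPU ids that can actually exist on the running machine; with NR_CPUS=4096 but eight possible CPUs, that is 64 bytes instead of 32 KB. A small model of the saving:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 4096          /* compile-time worst case */

    int main(void)
    {
            int nr_cpu_ids = 8;   /* runtime-detected on this box */

            long *cpu_tsc = malloc(nr_cpu_ids * sizeof(long));
            if (!cpu_tsc)
                    return 1;
            printf("allocated %zu bytes instead of %zu\n",
                   nr_cpu_ids * sizeof(long), NR_CPUS * sizeof(long));
            free(cpu_tsc);
            return 0;
    }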
@@ -612,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | |||
612 | * Collect entries that were still getting written before the | 621 | * Collect entries that were still getting written before the |
613 | * synchronize. | 622 | * synchronize. |
614 | */ | 623 | */ |
615 | on_each_cpu(collect_tscs, cpu_tsc, 1, 1); | 624 | on_each_cpu(collect_tscs, cpu_tsc, 1); |
616 | for (i = next; i < MCE_LOG_LEN; i++) { | 625 | for (i = next; i < MCE_LOG_LEN; i++) { |
617 | if (mcelog.entry[i].finished && | 626 | if (mcelog.entry[i].finished && |
618 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | 627 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { |
@@ -737,7 +746,7 @@ static void mce_restart(void) | |||
737 | if (next_interval) | 746 | if (next_interval) |
738 | cancel_delayed_work(&mcheck_work); | 747 | cancel_delayed_work(&mcheck_work); |
739 | /* Timer race is harmless here */ | 748 | /* Timer race is harmless here */ |
740 | on_each_cpu(mce_init, NULL, 1, 1); | 749 | on_each_cpu(mce_init, NULL, 1); |
741 | next_interval = check_interval * HZ; | 750 | next_interval = check_interval * HZ; |
742 | if (next_interval) | 751 | if (next_interval) |
743 | schedule_delayed_work(&mcheck_work, | 752 | schedule_delayed_work(&mcheck_work, |
@@ -753,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce); | |||
753 | 762 | ||
754 | /* Why are there no generic functions for this? */ | 763 | /* Why are there no generic functions for this? */ |
755 | #define ACCESSOR(name, var, start) \ | 764 | #define ACCESSOR(name, var, start) \ |
756 | static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ | 765 | static ssize_t show_ ## name(struct sys_device *s, \ |
766 | struct sysdev_attribute *attr, \ | ||
767 | char *buf) { \ | ||
757 | return sprintf(buf, "%lx\n", (unsigned long)var); \ | 768 | return sprintf(buf, "%lx\n", (unsigned long)var); \ |
758 | } \ | 769 | } \ |
759 | static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ | 770 | static ssize_t set_ ## name(struct sys_device *s, \ |
771 | struct sysdev_attribute *attr, \ | ||
772 | const char *buf, size_t siz) { \ | ||
760 | char *end; \ | 773 | char *end; \ |
761 | unsigned long new = simple_strtoul(buf, &end, 0); \ | 774 | unsigned long new = simple_strtoul(buf, &end, 0); \ |
762 | if (end == buf) return -EINVAL; \ | 775 | if (end == buf) return -EINVAL; \ |
@@ -766,7 +779,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce); | |||
766 | } \ | 779 | } \ |
767 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | 780 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); |
768 | 781 | ||
769 | /* TBD should generate these dynamically based on number of available banks */ | 782 | /* |
783 | * TBD should generate these dynamically based on number of available banks. | ||
784 | * Have only 6 control banks in /sysfs until then. | ||
785 | */ | ||
770 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | 786 | ACCESSOR(bank0ctl,bank[0],mce_restart()) |
771 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | 787 | ACCESSOR(bank1ctl,bank[1],mce_restart()) |
772 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | 788 | ACCESSOR(bank2ctl,bank[2],mce_restart()) |
@@ -774,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart()) | |||
774 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | 790 | ACCESSOR(bank4ctl,bank[4],mce_restart()) |
775 | ACCESSOR(bank5ctl,bank[5],mce_restart()) | 791 | ACCESSOR(bank5ctl,bank[5],mce_restart()) |
776 | 792 | ||
777 | static ssize_t show_trigger(struct sys_device *s, char *buf) | 793 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, |
794 | char *buf) | ||
778 | { | 795 | { |
779 | strcpy(buf, trigger); | 796 | strcpy(buf, trigger); |
780 | strcat(buf, "\n"); | 797 | strcat(buf, "\n"); |
781 | return strlen(trigger) + 1; | 798 | return strlen(trigger) + 1; |
782 | } | 799 | } |
783 | 800 | ||
784 | static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) | 801 | static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, |
802 | const char *buf,size_t siz) | ||
785 | { | 803 | { |
786 | char *p; | 804 | char *p; |
787 | int len; | 805 | int len; |
@@ -794,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) | |||
794 | } | 812 | } |
795 | 813 | ||
796 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | 814 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); |
797 | ACCESSOR(tolerant,tolerant,) | 815 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); |
798 | ACCESSOR(check_interval,check_interval,mce_restart()) | 816 | ACCESSOR(check_interval,check_interval,mce_restart()) |
799 | static struct sysdev_attribute *mce_attributes[] = { | 817 | static struct sysdev_attribute *mce_attributes[] = { |
800 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | 818 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, |
801 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, | 819 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, |
802 | &attr_tolerant, &attr_check_interval, &attr_trigger, | 820 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, |
803 | NULL | 821 | NULL |
804 | }; | 822 | }; |
805 | 823 | ||
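Two sysdev interface changes meet in the hunks above: show/store callbacks now receive a struct sysdev_attribute pointer, and the integer-valued "tolerant" file moves from the local ACCESSOR macro to SYSDEV_INT_ATTR, whose embedded attribute is then referenced as attr_tolerant.attr in the attribute array. The extra argument lets one generic handler serve many files; a userspace model of that idea (structures simplified, not the kernel's types):

    #include <stdio.h>

    /* one generic handler serves many attributes by inspecting
     * the attribute it was invoked for */
    struct attr_sketch {
            const char *name;
            long *value;
    };

    static long tolerant = 1;
    static struct attr_sketch attr_tolerant = { "tolerant", &tolerant };

    static int show_int(struct attr_sketch *attr, char *buf, int len)
    {
            return snprintf(buf, len, "%ld\n", *attr->value);
    }

    int main(void)
    {
            char buf[32];
            show_int(&attr_tolerant, buf, sizeof(buf));
            printf("%s = %s", attr_tolerant.name, buf);
            return 0;
    }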
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 7c9a813e1193..88736cadbaa6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c | |||
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
527 | if (err) | 527 | if (err) |
528 | goto out_free; | 528 | goto out_free; |
529 | 529 | ||
530 | for_each_cpu_mask(i, b->cpus) { | 530 | for_each_cpu_mask_nr(i, b->cpus) { |
531 | if (i == cpu) | 531 | if (i == cpu) |
532 | continue; | 532 | continue; |
533 | 533 | ||
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) | |||
617 | #endif | 617 | #endif |
618 | 618 | ||
619 | /* remove all sibling symlinks before unregistering */ | 619 | /* remove all sibling symlinks before unregistering */ |
620 | for_each_cpu_mask(i, b->cpus) { | 620 | for_each_cpu_mask_nr(i, b->cpus) { |
621 | if (i == cpu) | 621 | if (i == cpu) |
622 | continue; | 622 | continue; |
623 | 623 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 00ccb6c14ec2..cc1fccdd31e0 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c | |||
@@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); | |||
59 | 59 | ||
60 | static void mce_work_fn(struct work_struct *work) | 60 | static void mce_work_fn(struct work_struct *work) |
61 | { | 61 | { |
62 | on_each_cpu(mce_checkregs, NULL, 1, 1); | 62 | on_each_cpu(mce_checkregs, NULL, 1); |
63 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); | 63 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); |
64 | } | 64 | } |
65 | 65 | ||
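The only change to non-fatal.c, like the other on_each_cpu() call sites in this series, drops the old "retry" argument: in 2.6.27 the helper's signature became on_each_cpu(func, info, wait). A userspace model of the new three-argument call:

    #include <stdio.h>

    /* model of the new 3-argument on_each_cpu(func, info, wait) */
    static void on_each_cpu_model(void (*func)(void *), void *info, int wait)
    {
            for (int cpu = 0; cpu < 4; cpu++)   /* pretend: run func on 4 CPUs */
                    func(info);
            (void)wait;                         /* 1 = wait until all CPUs finish */
    }

    static void mce_checkregs_model(void *info)
    {
            (void)info;
            puts("poll MCE banks");
    }

    int main(void)
    {
            /* was: on_each_cpu(mce_checkregs, NULL, 1, 1) before the
             * retry argument was removed */
            on_each_cpu_model(mce_checkregs_model, NULL, 1);
            return 0;
    }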
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index cb03345554a5..9b60fce09f75 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c | |||
@@ -8,7 +8,7 @@ | |||
8 | #include <linux/interrupt.h> | 8 | #include <linux/interrupt.h> |
9 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
10 | 10 | ||
11 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 12 | #include <asm/system.h> |
13 | #include <asm/msr.h> | 13 | #include <asm/msr.h> |
14 | #include <asm/apic.h> | 14 | #include <asm/apic.h> |
@@ -32,12 +32,12 @@ struct intel_mce_extended_msrs { | |||
32 | /* u32 *reserved[]; */ | 32 | /* u32 *reserved[]; */ |
33 | }; | 33 | }; |
34 | 34 | ||
35 | static int mce_num_extended_msrs = 0; | 35 | static int mce_num_extended_msrs; |
36 | 36 | ||
37 | 37 | ||
38 | #ifdef CONFIG_X86_MCE_P4THERMAL | 38 | #ifdef CONFIG_X86_MCE_P4THERMAL |
39 | static void unexpected_thermal_interrupt(struct pt_regs *regs) | 39 | static void unexpected_thermal_interrupt(struct pt_regs *regs) |
40 | { | 40 | { |
41 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", | 41 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", |
42 | smp_processor_id()); | 42 | smp_processor_id()); |
43 | add_taint(TAINT_MACHINE_CHECK); | 43 | add_taint(TAINT_MACHINE_CHECK); |
@@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) | |||
83 | * be some SMM goo which handles it, so we can't even put a handler | 83 | * be some SMM goo which handles it, so we can't even put a handler |
84 | * since it might be delivered via SMI already -zwanem. | 84 | * since it might be delivered via SMI already -zwanem. |
85 | */ | 85 | */ |
86 | rdmsr (MSR_IA32_MISC_ENABLE, l, h); | 86 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); |
87 | h = apic_read(APIC_LVTTHMR); | 87 | h = apic_read(APIC_LVTTHMR); |
88 | if ((l & (1<<3)) && (h & APIC_DM_SMI)) { | 88 | if ((l & (1<<3)) && (h & APIC_DM_SMI)) { |
89 | printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", | 89 | printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", |
@@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) | |||
91 | return; /* -EBUSY */ | 91 | return; /* -EBUSY */ |
92 | } | 92 | } |
93 | 93 | ||
94 | /* check whether a vector already exists, temporarily masked? */ | 94 | /* check whether a vector already exists, temporarily masked? */ |
95 | if (h & APIC_VECTOR_MASK) { | 95 | if (h & APIC_VECTOR_MASK) { |
96 | printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " | 96 | printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " |
97 | "installed\n", | 97 | "installed\n", |
@@ -102,20 +102,20 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) | |||
102 | /* The temperature transition interrupt handler setup */ | 102 | /* The temperature transition interrupt handler setup */ |
103 | h = THERMAL_APIC_VECTOR; /* our delivery vector */ | 103 | h = THERMAL_APIC_VECTOR; /* our delivery vector */ |
104 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ | 104 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ |
105 | apic_write_around(APIC_LVTTHMR, h); | 105 | apic_write(APIC_LVTTHMR, h); |
106 | 106 | ||
107 | rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); | 107 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); |
108 | wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); | 108 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); |
109 | 109 | ||
110 | /* ok we're good to go... */ | 110 | /* ok we're good to go... */ |
111 | vendor_thermal_interrupt = intel_thermal_interrupt; | 111 | vendor_thermal_interrupt = intel_thermal_interrupt; |
112 | |||
113 | rdmsr (MSR_IA32_MISC_ENABLE, l, h); | ||
114 | wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); | ||
115 | 112 | ||
116 | l = apic_read (APIC_LVTTHMR); | 113 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); |
117 | apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | 114 | wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); |
118 | printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); | 115 | |
116 | l = apic_read(APIC_LVTTHMR); | ||
117 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
118 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); | ||
119 | 119 | ||
120 | /* enable thermal throttle processing */ | 120 | /* enable thermal throttle processing */ |
121 | atomic_set(&therm_throt_en, 1); | 121 | atomic_set(&therm_throt_en, 1); |
@@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) | |||
129 | { | 129 | { |
130 | u32 h; | 130 | u32 h; |
131 | 131 | ||
132 | rdmsr (MSR_IA32_MCG_EAX, r->eax, h); | 132 | rdmsr(MSR_IA32_MCG_EAX, r->eax, h); |
133 | rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); | 133 | rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); |
134 | rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); | 134 | rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); |
135 | rdmsr (MSR_IA32_MCG_EDX, r->edx, h); | 135 | rdmsr(MSR_IA32_MCG_EDX, r->edx, h); |
136 | rdmsr (MSR_IA32_MCG_ESI, r->esi, h); | 136 | rdmsr(MSR_IA32_MCG_ESI, r->esi, h); |
137 | rdmsr (MSR_IA32_MCG_EDI, r->edi, h); | 137 | rdmsr(MSR_IA32_MCG_EDI, r->edi, h); |
138 | rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); | 138 | rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); |
139 | rdmsr (MSR_IA32_MCG_ESP, r->esp, h); | 139 | rdmsr(MSR_IA32_MCG_ESP, r->esp, h); |
140 | rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); | 140 | rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); |
141 | rdmsr (MSR_IA32_MCG_EIP, r->eip, h); | 141 | rdmsr(MSR_IA32_MCG_EIP, r->eip, h); |
142 | } | 142 | } |
143 | 143 | ||
144 | static void intel_machine_check(struct pt_regs * regs, long error_code) | 144 | static void intel_machine_check(struct pt_regs *regs, long error_code) |
145 | { | 145 | { |
146 | int recover=1; | 146 | int recover = 1; |
147 | u32 alow, ahigh, high, low; | 147 | u32 alow, ahigh, high, low; |
148 | u32 mcgstl, mcgsth; | 148 | u32 mcgstl, mcgsth; |
149 | int i; | 149 | int i; |
150 | 150 | ||
151 | rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | 151 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
152 | if (mcgstl & (1<<0)) /* Recoverable ? */ | 152 | if (mcgstl & (1<<0)) /* Recoverable ? */ |
153 | recover=0; | 153 | recover = 0; |
154 | 154 | ||
155 | printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", | 155 | printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", |
156 | smp_processor_id(), mcgsth, mcgstl); | 156 | smp_processor_id(), mcgsth, mcgstl); |
@@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code) | |||
191 | } | 191 | } |
192 | 192 | ||
193 | if (recover & 2) | 193 | if (recover & 2) |
194 | panic ("CPU context corrupt"); | 194 | panic("CPU context corrupt"); |
195 | if (recover & 1) | 195 | if (recover & 1) |
196 | panic ("Unable to continue"); | 196 | panic("Unable to continue"); |
197 | 197 | ||
198 | printk(KERN_EMERG "Attempting to continue.\n"); | 198 | printk(KERN_EMERG "Attempting to continue.\n"); |
199 | /* | 199 | /* |
200 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not | 200 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not |
201 | * recoverable/continuable. This will allow BIOS to look at the MSRs | 201 | * recoverable/continuable. This will allow BIOS to look at the MSRs
202 | * for errors if the OS could not log the error. | 202 | * for errors if the OS could not log the error. |
203 | */ | 203 | */ |
204 | for (i=0; i<nr_mce_banks; i++) { | 204 | for (i = 0; i < nr_mce_banks; i++) { |
205 | u32 msr; | 205 | u32 msr; |
206 | msr = MSR_IA32_MC0_STATUS+i*4; | 206 | msr = MSR_IA32_MC0_STATUS+i*4; |
207 | rdmsr (msr, low, high); | 207 | rdmsr(msr, low, high); |
208 | if (high&(1<<31)) { | 208 | if (high&(1<<31)) { |
209 | /* Clear it */ | 209 | /* Clear it */ |
210 | wrmsr(msr, 0UL, 0UL); | 210 | wrmsr(msr, 0UL, 0UL); |
@@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code) | |||
214 | } | 214 | } |
215 | } | 215 | } |
216 | mcgstl &= ~(1<<2); | 216 | mcgstl &= ~(1<<2); |
217 | wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); | 217 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); |
218 | } | 218 | } |
219 | 219 | ||
220 | 220 | ||
@@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c) | |||
222 | { | 222 | { |
223 | u32 l, h; | 223 | u32 l, h; |
224 | int i; | 224 | int i; |
225 | 225 | ||
226 | machine_check_vector = intel_machine_check; | 226 | machine_check_vector = intel_machine_check; |
227 | wmb(); | 227 | wmb(); |
228 | 228 | ||
229 | printk (KERN_INFO "Intel machine check architecture supported.\n"); | 229 | printk(KERN_INFO "Intel machine check architecture supported.\n"); |
230 | rdmsr (MSR_IA32_MCG_CAP, l, h); | 230 | rdmsr(MSR_IA32_MCG_CAP, l, h); |
231 | if (l & (1<<8)) /* Control register present ? */ | 231 | if (l & (1<<8)) /* Control register present ? */ |
232 | wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 232 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
233 | nr_mce_banks = l & 0xff; | 233 | nr_mce_banks = l & 0xff; |
234 | 234 | ||
235 | for (i=0; i<nr_mce_banks; i++) { | 235 | for (i = 0; i < nr_mce_banks; i++) { |
236 | wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); | 236 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); |
237 | wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); | 237 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); |
238 | } | 238 | } |
239 | 239 | ||
240 | set_in_cr4 (X86_CR4_MCE); | 240 | set_in_cr4(X86_CR4_MCE); |
241 | printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", | 241 | printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", |
242 | smp_processor_id()); | 242 | smp_processor_id()); |
243 | 243 | ||
244 | /* Check for P4/Xeon extended MCE MSRs */ | 244 | /* Check for P4/Xeon extended MCE MSRs */ |
245 | rdmsr (MSR_IA32_MCG_CAP, l, h); | 245 | rdmsr(MSR_IA32_MCG_CAP, l, h); |
246 | if (l & (1<<9)) {/* MCG_EXT_P */ | 246 | if (l & (1<<9)) {/* MCG_EXT_P */ |
247 | mce_num_extended_msrs = (l >> 16) & 0xff; | 247 | mce_num_extended_msrs = (l >> 16) & 0xff; |
248 | printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" | 248 | printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" |
249 | " available\n", | 249 | " available\n", |
250 | smp_processor_id(), mce_num_extended_msrs); | 250 | smp_processor_id(), mce_num_extended_msrs); |
251 | 251 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1f4cc48c14c6..d5ae2243f0b9 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0); | |||
35 | 35 | ||
36 | #define define_therm_throt_sysdev_show_func(name) \ | 36 | #define define_therm_throt_sysdev_show_func(name) \ |
37 | static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ | 37 | static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ |
38 | struct sysdev_attribute *attr, \ | ||
38 | char *buf) \ | 39 | char *buf) \ |
39 | { \ | 40 | { \ |
40 | unsigned int cpu = dev->id; \ | 41 | unsigned int cpu = dev->id; \ |
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5d241ce94a44..509bd3d9eacd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = { | |||
37 | static unsigned long smp_changes_mask; | 37 | static unsigned long smp_changes_mask; |
38 | static struct mtrr_state mtrr_state = {}; | 38 | static struct mtrr_state mtrr_state = {}; |
39 | static int mtrr_state_set; | 39 | static int mtrr_state_set; |
40 | static u64 tom2; | 40 | u64 mtrr_tom2; |
41 | 41 | ||
42 | #undef MODULE_PARAM_PREFIX | 42 | #undef MODULE_PARAM_PREFIX |
43 | #define MODULE_PARAM_PREFIX "mtrr." | 43 | #define MODULE_PARAM_PREFIX "mtrr." |
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
139 | } | 139 | } |
140 | } | 140 | } |
141 | 141 | ||
142 | if (tom2) { | 142 | if (mtrr_tom2) { |
143 | if (start >= (1ULL<<32) && (end < tom2)) | 143 | if (start >= (1ULL<<32) && (end < mtrr_tom2)) |
144 | return MTRR_TYPE_WRBACK; | 144 | return MTRR_TYPE_WRBACK; |
145 | } | 145 | } |
146 | 146 | ||
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) | |||
158 | rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); | 158 | rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); |
159 | } | 159 | } |
160 | 160 | ||
161 | /* fill the MSR pair relating to a var range */ | ||
162 | void fill_mtrr_var_range(unsigned int index, | ||
163 | u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) | ||
164 | { | ||
165 | struct mtrr_var_range *vr; | ||
166 | |||
167 | vr = mtrr_state.var_ranges; | ||
168 | |||
169 | vr[index].base_lo = base_lo; | ||
170 | vr[index].base_hi = base_hi; | ||
171 | vr[index].mask_lo = mask_lo; | ||
172 | vr[index].mask_hi = mask_hi; | ||
173 | } | ||
174 | |||
161 | static void | 175 | static void |
162 | get_fixed_ranges(mtrr_type * frs) | 176 | get_fixed_ranges(mtrr_type * frs) |
163 | { | 177 | { |
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void) | |||
213 | mtrr_state.enabled = (lo & 0xc00) >> 10; | 227 | mtrr_state.enabled = (lo & 0xc00) >> 10; |
214 | 228 | ||
215 | if (amd_special_default_mtrr()) { | 229 | if (amd_special_default_mtrr()) { |
216 | unsigned lo, hi; | 230 | unsigned low, high; |
217 | /* TOP_MEM2 */ | 231 | /* TOP_MEM2 */ |
218 | rdmsr(MSR_K8_TOP_MEM2, lo, hi); | 232 | rdmsr(MSR_K8_TOP_MEM2, low, high); |
219 | tom2 = hi; | 233 | mtrr_tom2 = high; |
220 | tom2 <<= 32; | 234 | mtrr_tom2 <<= 32; |
221 | tom2 |= lo; | 235 | mtrr_tom2 |= low; |
222 | tom2 &= 0xffffff8000000ULL; | 236 | mtrr_tom2 &= 0xffffff800000ULL; |
223 | } | 237 | } |
224 | if (mtrr_show) { | 238 | if (mtrr_show) { |
225 | int high_width; | 239 | int high_width; |
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void) | |||
251 | else | 265 | else |
252 | printk(KERN_INFO "MTRR %u disabled\n", i); | 266 | printk(KERN_INFO "MTRR %u disabled\n", i); |
253 | } | 267 | } |
254 | if (tom2) { | 268 | if (mtrr_tom2) { |
255 | printk(KERN_INFO "TOM2: %016llx aka %lldM\n", | 269 | printk(KERN_INFO "TOM2: %016llx aka %lldM\n", |
256 | tom2, tom2>>20); | 270 | mtrr_tom2, mtrr_tom2>>20); |
257 | } | 271 | } |
258 | } | 272 | } |
259 | mtrr_state_set = 1; | 273 | mtrr_state_set = 1; |
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) | |||
328 | 342 | ||
329 | if (lo != msrwords[0] || hi != msrwords[1]) { | 343 | if (lo != msrwords[0] || hi != msrwords[1]) { |
330 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | 344 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && |
331 | boot_cpu_data.x86 == 15 && | 345 | (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) && |
332 | ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) | 346 | ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) |
333 | k8_enable_fixed_iorrs(); | 347 | k8_enable_fixed_iorrs(); |
334 | mtrr_wrmsr(msr, msrwords[0], msrwords[1]); | 348 | mtrr_wrmsr(msr, msrwords[0], msrwords[1]); |
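mtrr/generic.c above also exports tom2 as mtrr_tom2 (the cleanup code in main.c reads it) and corrects the TOM2 mask from 0xffffff8000000ULL to 0xffffff800000ULL, i.e. bits 47:23 of AMD's TOP_MEM2 MSR. A worked composition of the 64-bit value from the two rdmsr halves:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* rdmsr(MSR_K8_TOP_MEM2, low, high) returns two 32-bit halves */
            uint32_t low = 0x80000000, high = 0x1;   /* example: TOM2 = 6 GB */

            uint64_t tom2 = high;
            tom2 <<= 32;
            tom2 |= low;
            tom2 &= 0xffffff800000ULL;   /* keep bits 47:23 (8 MB aligned) */

            printf("TOM2 = %#llx aka %lluM\n",
                   (unsigned long long)tom2, (unsigned long long)(tom2 >> 20));
            return 0;
    }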
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6a1e278d9323..6f23969c8faf 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/cpu.h> | 38 | #include <linux/cpu.h> |
39 | #include <linux/mutex.h> | 39 | #include <linux/mutex.h> |
40 | #include <linux/sort.h> | ||
40 | 41 | ||
41 | #include <asm/e820.h> | 42 | #include <asm/e820.h> |
42 | #include <asm/mtrr.h> | 43 | #include <asm/mtrr.h> |
@@ -222,7 +223,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, | |||
222 | atomic_set(&data.gate,0); | 223 | atomic_set(&data.gate,0); |
223 | 224 | ||
224 | /* Start the ball rolling on other CPUs */ | 225 | /* Start the ball rolling on other CPUs */ |
225 | if (smp_call_function(ipi_handler, &data, 1, 0) != 0) | 226 | if (smp_call_function(ipi_handler, &data, 0) != 0) |
226 | panic("mtrr: timed out waiting for other CPUs\n"); | 227 | panic("mtrr: timed out waiting for other CPUs\n"); |
227 | 228 | ||
228 | local_irq_save(flags); | 229 | local_irq_save(flags); |
@@ -609,6 +610,787 @@ static struct sysdev_driver mtrr_sysdev_driver = { | |||
609 | .resume = mtrr_restore, | 610 | .resume = mtrr_restore, |
610 | }; | 611 | }; |
611 | 612 | ||
613 | /* should be related to MTRR_VAR_RANGES nums */ | ||
614 | #define RANGE_NUM 256 | ||
615 | |||
616 | struct res_range { | ||
617 | unsigned long start; | ||
618 | unsigned long end; | ||
619 | }; | ||
620 | |||
621 | static int __init | ||
622 | add_range(struct res_range *range, int nr_range, unsigned long start, | ||
623 | unsigned long end) | ||
624 | { | ||
625 | /* out of slots */ | ||
626 | if (nr_range >= RANGE_NUM) | ||
627 | return nr_range; | ||
628 | |||
629 | range[nr_range].start = start; | ||
630 | range[nr_range].end = end; | ||
631 | |||
632 | nr_range++; | ||
633 | |||
634 | return nr_range; | ||
635 | } | ||
636 | |||
637 | static int __init | ||
638 | add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, | ||
639 | unsigned long end) | ||
640 | { | ||
641 | int i; | ||
642 | |||
643 | /* try to merge it with old one */ | ||
644 | for (i = 0; i < nr_range; i++) { | ||
645 | unsigned long final_start, final_end; | ||
646 | unsigned long common_start, common_end; | ||
647 | |||
648 | if (!range[i].end) | ||
649 | continue; | ||
650 | |||
651 | common_start = max(range[i].start, start); | ||
652 | common_end = min(range[i].end, end); | ||
653 | if (common_start > common_end + 1) | ||
654 | continue; | ||
655 | |||
656 | final_start = min(range[i].start, start); | ||
657 | final_end = max(range[i].end, end); | ||
658 | |||
659 | range[i].start = final_start; | ||
660 | range[i].end = final_end; | ||
661 | return nr_range; | ||
662 | } | ||
663 | |||
664 | /* need to add that */ | ||
665 | return add_range(range, nr_range, start, end); | ||
666 | } | ||
667 | |||
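add_range_with_merge() above grows an existing slot whenever the incoming range overlaps or touches it; the "common_start > common_end + 1" test is what lets adjacent ranges fuse. A runnable worked example of the adjacency case:

    #include <stdio.h>

    /* existing [0x100, 0x1ff], incoming [0x200, 0x2ff]: adjacent, so merge */
    int main(void)
    {
            unsigned long s1 = 0x100, e1 = 0x1ff;   /* existing range */
            unsigned long s2 = 0x200, e2 = 0x2ff;   /* incoming range */

            unsigned long common_start = s1 > s2 ? s1 : s2;   /* max of starts */
            unsigned long common_end   = e1 < e2 ? e1 : e2;   /* min of ends */

            if (common_start > common_end + 1)
                    puts("disjoint: append a new slot");
            else
                    printf("merge into [%#lx, %#lx]\n",
                           s1 < s2 ? s1 : s2, e1 > e2 ? e1 : e2);
            return 0;
    }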
668 | static void __init | ||
669 | subtract_range(struct res_range *range, unsigned long start, unsigned long end) | ||
670 | { | ||
671 | int i, j; | ||
672 | |||
673 | for (j = 0; j < RANGE_NUM; j++) { | ||
674 | if (!range[j].end) | ||
675 | continue; | ||
676 | |||
677 | if (start <= range[j].start && end >= range[j].end) { | ||
678 | range[j].start = 0; | ||
679 | range[j].end = 0; | ||
680 | continue; | ||
681 | } | ||
682 | |||
683 | if (start <= range[j].start && end < range[j].end && | ||
684 | range[j].start < end + 1) { | ||
685 | range[j].start = end + 1; | ||
686 | continue; | ||
687 | } | ||
688 | |||
689 | |||
690 | if (start > range[j].start && end >= range[j].end && | ||
691 | range[j].end > start - 1) { | ||
692 | range[j].end = start - 1; | ||
693 | continue; | ||
694 | } | ||
695 | |||
696 | if (start > range[j].start && end < range[j].end) { | ||
697 | /* find the new spare */ | ||
698 | for (i = 0; i < RANGE_NUM; i++) { | ||
699 | if (range[i].end == 0) | ||
700 | break; | ||
701 | } | ||
702 | if (i < RANGE_NUM) { | ||
703 | range[i].end = range[j].end; | ||
704 | range[i].start = end + 1; | ||
705 | } else { | ||
706 | printk(KERN_ERR "ran out of slots in ranges\n"); | ||
707 | } | ||
708 | range[j].end = start - 1; | ||
709 | continue; | ||
710 | } | ||
711 | } | ||
712 | } | ||
713 | |||
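subtract_range() above distinguishes four cases: the subtracted span swallows an entry (zero it), clips its head, clips its tail, or punches a hole in the middle, which splits the entry and consumes a spare slot for the right-hand piece. A worked middle-hole example:

    #include <stdio.h>

    int main(void)
    {
            /* entry [0x000, 0xfff], subtract [0x400, 0x7ff]: middle-hole case */
            unsigned long rs = 0x000, re = 0xfff;
            unsigned long start = 0x400, end = 0x7ff;

            if (start > rs && end < re) {
                    /* left piece keeps the slot, right piece needs a spare */
                    printf("left:  [%#lx, %#lx]\n", rs, start - 1);
                    printf("right: [%#lx, %#lx]\n", end + 1, re);
            }
            return 0;
    }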
714 | static int __init cmp_range(const void *x1, const void *x2) | ||
715 | { | ||
716 | const struct res_range *r1 = x1; | ||
717 | const struct res_range *r2 = x2; | ||
718 | long start1, start2; | ||
719 | |||
720 | start1 = r1->start; | ||
721 | start2 = r2->start; | ||
722 | |||
723 | return start1 - start2; | ||
724 | } | ||
725 | |||
726 | struct var_mtrr_range_state { | ||
727 | unsigned long base_pfn; | ||
728 | unsigned long size_pfn; | ||
729 | mtrr_type type; | ||
730 | }; | ||
731 | |||
732 | struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; | ||
733 | static int __initdata debug_print; | ||
734 | |||
735 | static int __init | ||
736 | x86_get_mtrr_mem_range(struct res_range *range, int nr_range, | ||
737 | unsigned long extra_remove_base, | ||
738 | unsigned long extra_remove_size) | ||
739 | { | ||
740 | unsigned long i, base, size; | ||
741 | mtrr_type type; | ||
742 | |||
743 | for (i = 0; i < num_var_ranges; i++) { | ||
744 | type = range_state[i].type; | ||
745 | if (type != MTRR_TYPE_WRBACK) | ||
746 | continue; | ||
747 | base = range_state[i].base_pfn; | ||
748 | size = range_state[i].size_pfn; | ||
749 | nr_range = add_range_with_merge(range, nr_range, base, | ||
750 | base + size - 1); | ||
751 | } | ||
752 | if (debug_print) { | ||
753 | printk(KERN_DEBUG "After WB checking\n"); | ||
754 | for (i = 0; i < nr_range; i++) | ||
755 | printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", | ||
756 | range[i].start, range[i].end + 1); | ||
757 | } | ||
758 | |||
759 | /* take out UC ranges */ | ||
760 | for (i = 0; i < num_var_ranges; i++) { | ||
761 | type = range_state[i].type; | ||
762 | if (type != MTRR_TYPE_UNCACHABLE) | ||
763 | continue; | ||
764 | size = range_state[i].size_pfn; | ||
765 | if (!size) | ||
766 | continue; | ||
767 | base = range_state[i].base_pfn; | ||
768 | subtract_range(range, base, base + size - 1); | ||
769 | } | ||
770 | if (extra_remove_size) | ||
771 | subtract_range(range, extra_remove_base, | ||
772 | extra_remove_base + extra_remove_size - 1); | ||
773 | |||
774 | /* get new range num */ | ||
775 | nr_range = 0; | ||
776 | for (i = 0; i < RANGE_NUM; i++) { | ||
777 | if (!range[i].end) | ||
778 | continue; | ||
779 | nr_range++; | ||
780 | } | ||
781 | if (debug_print) { | ||
782 | printk(KERN_DEBUG "After UC checking\n"); | ||
783 | for (i = 0; i < nr_range; i++) | ||
784 | printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", | ||
785 | range[i].start, range[i].end + 1); | ||
786 | } | ||
787 | |||
788 | /* sort the ranges */ | ||
789 | sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); | ||
790 | if (debug_print) { | ||
791 | printk(KERN_DEBUG "After sorting\n"); | ||
792 | for (i = 0; i < nr_range; i++) | ||
793 | printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", | ||
794 | range[i].start, range[i].end + 1); | ||
795 | } | ||
796 | |||
797 | /* clear those that are not used */ | ||
798 | for (i = nr_range; i < RANGE_NUM; i++) | ||
799 | memset(&range[i], 0, sizeof(range[i])); | ||
800 | |||
801 | return nr_range; | ||
802 | } | ||
803 | |||
804 | static struct res_range __initdata range[RANGE_NUM]; | ||
805 | |||
806 | #ifdef CONFIG_MTRR_SANITIZER | ||
807 | |||
808 | static unsigned long __init sum_ranges(struct res_range *range, int nr_range) | ||
809 | { | ||
810 | unsigned long sum; | ||
811 | int i; | ||
812 | |||
813 | sum = 0; | ||
814 | for (i = 0; i < nr_range; i++) | ||
815 | sum += range[i].end + 1 - range[i].start; | ||
816 | |||
817 | return sum; | ||
818 | } | ||
819 | |||
820 | static int enable_mtrr_cleanup __initdata = | ||
821 | CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; | ||
822 | |||
823 | static int __init disable_mtrr_cleanup_setup(char *str) | ||
824 | { | ||
825 | if (enable_mtrr_cleanup != -1) | ||
826 | enable_mtrr_cleanup = 0; | ||
827 | return 0; | ||
828 | } | ||
829 | early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); | ||
830 | |||
831 | static int __init enable_mtrr_cleanup_setup(char *str) | ||
832 | { | ||
833 | if (enable_mtrr_cleanup != -1) | ||
834 | enable_mtrr_cleanup = 1; | ||
835 | return 0; | ||
836 | } | ||
837 | early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); | ||
838 | |||
839 | struct var_mtrr_state { | ||
840 | unsigned long range_startk; | ||
841 | unsigned long range_sizek; | ||
842 | unsigned long chunk_sizek; | ||
843 | unsigned long gran_sizek; | ||
844 | unsigned int reg; | ||
845 | }; | ||
846 | |||
847 | static void __init | ||
848 | set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, | ||
849 | unsigned char type, unsigned int address_bits) | ||
850 | { | ||
851 | u32 base_lo, base_hi, mask_lo, mask_hi; | ||
852 | u64 base, mask; | ||
853 | |||
854 | if (!sizek) { | ||
855 | fill_mtrr_var_range(reg, 0, 0, 0, 0); | ||
856 | return; | ||
857 | } | ||
858 | |||
859 | mask = (1ULL << address_bits) - 1; | ||
860 | mask &= ~((((u64)sizek) << 10) - 1); | ||
861 | |||
862 | base = ((u64)basek) << 10; | ||
863 | |||
864 | base |= type; | ||
865 | mask |= 0x800; | ||
866 | |||
867 | base_lo = base & ((1ULL<<32) - 1); | ||
868 | base_hi = base >> 32; | ||
869 | |||
870 | mask_lo = mask & ((1ULL<<32) - 1); | ||
871 | mask_hi = mask >> 32; | ||
872 | |||
873 | fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); | ||
874 | } | ||
875 | |||
876 | static void __init | ||
877 | save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, | ||
878 | unsigned char type) | ||
879 | { | ||
880 | range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); | ||
881 | range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); | ||
882 | range_state[reg].type = type; | ||
883 | } | ||
884 | |||
885 | static void __init | ||
886 | set_var_mtrr_all(unsigned int address_bits) | ||
887 | { | ||
888 | unsigned long basek, sizek; | ||
889 | unsigned char type; | ||
890 | unsigned int reg; | ||
891 | |||
892 | for (reg = 0; reg < num_var_ranges; reg++) { | ||
893 | basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); | ||
894 | sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); | ||
895 | type = range_state[reg].type; | ||
896 | |||
897 | set_var_mtrr(reg, basek, sizek, type, address_bits); | ||
898 | } | ||
899 | } | ||
900 | |||
901 | static unsigned int __init | ||
902 | range_to_mtrr(unsigned int reg, unsigned long range_startk, | ||
903 | unsigned long range_sizek, unsigned char type) | ||
904 | { | ||
905 | if (!range_sizek || (reg >= num_var_ranges)) | ||
906 | return reg; | ||
907 | |||
908 | while (range_sizek) { | ||
909 | unsigned long max_align, align; | ||
910 | unsigned long sizek; | ||
911 | |||
912 | /* Compute the maximum size I can make a range */ | ||
913 | if (range_startk) | ||
914 | max_align = ffs(range_startk) - 1; | ||
915 | else | ||
916 | max_align = 32; | ||
917 | align = fls(range_sizek) - 1; | ||
918 | if (align > max_align) | ||
919 | align = max_align; | ||
920 | |||
921 | sizek = 1 << align; | ||
922 | if (debug_print) | ||
923 | printk(KERN_DEBUG "Setting variable MTRR %d, " | ||
924 | "base: %ldMB, range: %ldMB, type %s\n", | ||
925 | reg, range_startk >> 10, sizek >> 10, | ||
926 | (type == MTRR_TYPE_UNCACHABLE)?"UC": | ||
927 | ((type == MTRR_TYPE_WRBACK)?"WB":"Other") | ||
928 | ); | ||
929 | save_var_mtrr(reg++, range_startk, sizek, type); | ||
930 | range_startk += sizek; | ||
931 | range_sizek -= sizek; | ||
932 | if (reg >= num_var_ranges) | ||
933 | break; | ||
934 | } | ||
935 | return reg; | ||
936 | } | ||
937 | |||
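range_to_mtrr() above greedily decomposes a range into power-of-two MTRRs: each iteration takes the largest block that is both aligned to the current base (via ffs) and no larger than what remains (via fls). A runnable worked example, with fls modeled locally; 1.5 GB at base 1 GB becomes one 1 GB MTRR plus one 512 MB MTRR:

    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    static int fls_model(unsigned long x)   /* 1-based index of highest set bit */
    {
            int r = 0;
            while (x) { x >>= 1; r++; }
            return r;
    }

    int main(void)
    {
            unsigned long startk = 1024UL << 10;   /* base 1 GB, in KB */
            unsigned long sizek  = 1536UL << 10;   /* size 1.5 GB, in KB */

            while (sizek) {
                    int max_align = startk ? ffs((int)startk) - 1 : 32;
                    int align = fls_model(sizek) - 1;
                    if (align > max_align)
                            align = max_align;
                    unsigned long chunk = 1UL << align;
                    printf("MTRR: base %luM, size %luM\n",
                           startk >> 10, chunk >> 10);
                    startk += chunk;
                    sizek -= chunk;
            }
            return 0;
    }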
938 | static unsigned __init | ||
939 | range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, | ||
940 | unsigned long sizek) | ||
941 | { | ||
942 | unsigned long hole_basek, hole_sizek; | ||
943 | unsigned long second_basek, second_sizek; | ||
944 | unsigned long range0_basek, range0_sizek; | ||
945 | unsigned long range_basek, range_sizek; | ||
946 | unsigned long chunk_sizek; | ||
947 | unsigned long gran_sizek; | ||
948 | |||
949 | hole_basek = 0; | ||
950 | hole_sizek = 0; | ||
951 | second_basek = 0; | ||
952 | second_sizek = 0; | ||
953 | chunk_sizek = state->chunk_sizek; | ||
954 | gran_sizek = state->gran_sizek; | ||
955 | |||
956 | /* align to gran size; prevent small blocks from using up MTRRs */ | ||
957 | range_basek = ALIGN(state->range_startk, gran_sizek); | ||
958 | if ((range_basek > basek) && basek) | ||
959 | return second_sizek; | ||
960 | state->range_sizek -= (range_basek - state->range_startk); | ||
961 | range_sizek = ALIGN(state->range_sizek, gran_sizek); | ||
962 | |||
963 | while (range_sizek > state->range_sizek) { | ||
964 | range_sizek -= gran_sizek; | ||
965 | if (!range_sizek) | ||
966 | return 0; | ||
967 | } | ||
968 | state->range_sizek = range_sizek; | ||
969 | |||
970 | /* try to append some small hole */ | ||
971 | range0_basek = state->range_startk; | ||
972 | range0_sizek = ALIGN(state->range_sizek, chunk_sizek); | ||
973 | if (range0_sizek == state->range_sizek) { | ||
974 | if (debug_print) | ||
975 | printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", | ||
976 | range0_basek<<10, | ||
977 | (range0_basek + state->range_sizek)<<10); | ||
978 | state->reg = range_to_mtrr(state->reg, range0_basek, | ||
979 | state->range_sizek, MTRR_TYPE_WRBACK); | ||
980 | return 0; | ||
981 | } | ||
982 | |||
983 | range0_sizek -= chunk_sizek; | ||
984 | if (range0_sizek && sizek) { | ||
985 | while (range0_basek + range0_sizek > (basek + sizek)) { | ||
986 | range0_sizek -= chunk_sizek; | ||
987 | if (!range0_sizek) | ||
988 | break; | ||
989 | } | ||
990 | } | ||
991 | |||
992 | if (range0_sizek) { | ||
993 | if (debug_print) | ||
994 | printk(KERN_DEBUG "range0: %016lx - %016lx\n", | ||
995 | range0_basek<<10, | ||
996 | (range0_basek + range0_sizek)<<10); | ||
997 | state->reg = range_to_mtrr(state->reg, range0_basek, | ||
998 | range0_sizek, MTRR_TYPE_WRBACK); | ||
999 | |||
1000 | } | ||
1001 | |||
1002 | range_basek = range0_basek + range0_sizek; | ||
1003 | range_sizek = chunk_sizek; | ||
1004 | |||
1005 | if (range_basek + range_sizek > basek && | ||
1006 | range_basek + range_sizek <= (basek + sizek)) { | ||
1007 | /* one hole */ | ||
1008 | second_basek = basek; | ||
1009 | second_sizek = range_basek + range_sizek - basek; | ||
1010 | } | ||
1011 | |||
1012 | /* if this is the last piece, only one hole near the end is possible */ | ||
1013 | if ((second_basek || !basek) && | ||
1014 | range_sizek - (state->range_sizek - range0_sizek) - second_sizek < | ||
1015 | (chunk_sizek >> 1)) { | ||
1016 | /* | ||
1017 | * one hole in the middle (second_sizek is 0) or near the end | ||
1018 | * (second_sizek is not 0) | ||
1019 | */ | ||
1020 | hole_sizek = range_sizek - (state->range_sizek - range0_sizek) | ||
1021 | - second_sizek; | ||
1022 | hole_basek = range_basek + range_sizek - hole_sizek | ||
1023 | - second_sizek; | ||
1024 | } else { | ||
1025 | /* fallback for big hole, or several holes */ | ||
1026 | range_sizek = state->range_sizek - range0_sizek; | ||
1027 | second_basek = 0; | ||
1028 | second_sizek = 0; | ||
1029 | } | ||
1030 | |||
1031 | if (debug_print) | ||
1032 | printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, | ||
1033 | (range_basek + range_sizek)<<10); | ||
1034 | state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, | ||
1035 | MTRR_TYPE_WRBACK); | ||
1036 | if (hole_sizek) { | ||
1037 | if (debug_print) | ||
1038 | printk(KERN_DEBUG "hole: %016lx - %016lx\n", | ||
1039 | hole_basek<<10, (hole_basek + hole_sizek)<<10); | ||
1040 | state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, | ||
1041 | MTRR_TYPE_UNCACHABLE); | ||
1042 | |||
1043 | } | ||
1044 | |||
1045 | return second_sizek; | ||
1046 | } | ||
1047 | |||
1048 | static void __init | ||
1049 | set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, | ||
1050 | unsigned long size_pfn) | ||
1051 | { | ||
1052 | unsigned long basek, sizek; | ||
1053 | unsigned long second_sizek = 0; | ||
1054 | |||
1055 | if (state->reg >= num_var_ranges) | ||
1056 | return; | ||
1057 | |||
1058 | basek = base_pfn << (PAGE_SHIFT - 10); | ||
1059 | sizek = size_pfn << (PAGE_SHIFT - 10); | ||
1060 | |||
1061 | /* See if I can merge with the last range */ | ||
1062 | if ((basek <= 1024) || | ||
1063 | (state->range_startk + state->range_sizek == basek)) { | ||
1064 | unsigned long endk = basek + sizek; | ||
1065 | state->range_sizek = endk - state->range_startk; | ||
1066 | return; | ||
1067 | } | ||
1068 | /* Write the range mtrrs */ | ||
1069 | if (state->range_sizek != 0) | ||
1070 | second_sizek = range_to_mtrr_with_hole(state, basek, sizek); | ||
1071 | |||
1072 | /* Allocate an msr */ | ||
1073 | state->range_startk = basek + second_sizek; | ||
1074 | state->range_sizek = sizek - second_sizek; | ||
1075 | } | ||
1076 | |||
1077 | /* minimum size of an mtrr block that can take a hole */ | ||
1078 | static u64 mtrr_chunk_size __initdata = (256ULL<<20); | ||
1079 | |||
1080 | static int __init parse_mtrr_chunk_size_opt(char *p) | ||
1081 | { | ||
1082 | if (!p) | ||
1083 | return -EINVAL; | ||
1084 | mtrr_chunk_size = memparse(p, &p); | ||
1085 | return 0; | ||
1086 | } | ||
1087 | early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); | ||
1088 | |||
1089 | /* granularity of an mtrr block */ | ||
1090 | static u64 mtrr_gran_size __initdata; | ||
1091 | |||
1092 | static int __init parse_mtrr_gran_size_opt(char *p) | ||
1093 | { | ||
1094 | if (!p) | ||
1095 | return -EINVAL; | ||
1096 | mtrr_gran_size = memparse(p, &p); | ||
1097 | return 0; | ||
1098 | } | ||
1099 | early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); | ||
1100 | |||
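Both knobs are parsed with memparse(), so they accept the usual K/M/G suffixes on the kernel command line. For example, together with the cleanup-enable parameter registered above, a boot line along the lines of

    enable_mtrr_cleanup mtrr_gran_size=64M mtrr_chunk_size=128M

would pre-seed the sanitizer with a fixed granularity and chunk size instead of letting it search for one.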
1101 | static int nr_mtrr_spare_reg __initdata = | ||
1102 | CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; | ||
1103 | |||
1104 | static int __init parse_mtrr_spare_reg(char *arg) | ||
1105 | { | ||
1106 | if (arg) | ||
1107 | nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); | ||
1108 | return 0; | ||
1109 | } | ||
1110 | |||
1111 | early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); | ||
1112 | |||
1113 | static int __init | ||
1114 | x86_setup_var_mtrrs(struct res_range *range, int nr_range, | ||
1115 | u64 chunk_size, u64 gran_size) | ||
1116 | { | ||
1117 | struct var_mtrr_state var_state; | ||
1118 | int i; | ||
1119 | int num_reg; | ||
1120 | |||
1121 | var_state.range_startk = 0; | ||
1122 | var_state.range_sizek = 0; | ||
1123 | var_state.reg = 0; | ||
1124 | var_state.chunk_sizek = chunk_size >> 10; | ||
1125 | var_state.gran_sizek = gran_size >> 10; | ||
1126 | |||
1127 | memset(range_state, 0, sizeof(range_state)); | ||
1128 | |||
1129 | /* Write the range etc */ | ||
1130 | for (i = 0; i < nr_range; i++) | ||
1131 | set_var_mtrr_range(&var_state, range[i].start, | ||
1132 | range[i].end - range[i].start + 1); | ||
1133 | |||
1134 | /* Write the last range */ | ||
1135 | if (var_state.range_sizek != 0) | ||
1136 | range_to_mtrr_with_hole(&var_state, 0, 0); | ||
1137 | |||
1138 | num_reg = var_state.reg; | ||
1139 | /* Clear out the extra MTRR's */ | ||
1140 | while (var_state.reg < num_var_ranges) { | ||
1141 | save_var_mtrr(var_state.reg, 0, 0, 0); | ||
1142 | var_state.reg++; | ||
1143 | } | ||
1144 | |||
1145 | return num_reg; | ||
1146 | } | ||
1147 | |||
1148 | struct mtrr_cleanup_result { | ||
1149 | unsigned long gran_sizek; | ||
1150 | unsigned long chunk_sizek; | ||
1151 | unsigned long lose_cover_sizek; | ||
1152 | unsigned int num_reg; | ||
1153 | int bad; | ||
1154 | }; | ||
1155 | |||
1156 | /* | ||
1157 | * gran_size: 1M, 2M, ..., 2G (12 values) | ||
1158 | * chunk_size: gran_size, ..., 4G (13 down to 2 values per gran_size) | ||
1159 | * so we need (2+13)*12/2 = 90 result slots | ||
1160 | */ | ||
1161 | #define NUM_RESULT 90 | ||
1162 | #define PSHIFT (PAGE_SHIFT - 10) | ||
1163 | |||
1164 | static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; | ||
1165 | static struct res_range __initdata range_new[RANGE_NUM]; | ||
1166 | static unsigned long __initdata min_loss_pfn[RANGE_NUM]; | ||
1167 | |||
1168 | static int __init mtrr_cleanup(unsigned address_bits) | ||
1169 | { | ||
1170 | unsigned long extra_remove_base, extra_remove_size; | ||
1171 | unsigned long i, base, size, def, dummy; | ||
1172 | mtrr_type type; | ||
1173 | int nr_range, nr_range_new; | ||
1174 | u64 chunk_size, gran_size; | ||
1175 | unsigned long range_sums, range_sums_new; | ||
1176 | int index_good; | ||
1177 | int num_reg_good; | ||
1178 | |||
1179 | /* extra one for all 0 */ | ||
1180 | int num[MTRR_NUM_TYPES + 1]; | ||
1181 | |||
1182 | if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) | ||
1183 | return 0; | ||
1184 | rdmsr(MTRRdefType_MSR, def, dummy); | ||
1185 | def &= 0xff; | ||
1186 | if (def != MTRR_TYPE_UNCACHABLE) | ||
1187 | return 0; | ||
1188 | |||
1189 | /* get it and store it aside */ | ||
1190 | memset(range_state, 0, sizeof(range_state)); | ||
1191 | for (i = 0; i < num_var_ranges; i++) { | ||
1192 | mtrr_if->get(i, &base, &size, &type); | ||
1193 | range_state[i].base_pfn = base; | ||
1194 | range_state[i].size_pfn = size; | ||
1195 | range_state[i].type = type; | ||
1196 | } | ||
1197 | |||
1198 | /* check entries number */ | ||
1199 | memset(num, 0, sizeof(num)); | ||
1200 | for (i = 0; i < num_var_ranges; i++) { | ||
1201 | type = range_state[i].type; | ||
1202 | size = range_state[i].size_pfn; | ||
1203 | if (type >= MTRR_NUM_TYPES) | ||
1204 | continue; | ||
1205 | if (!size) | ||
1206 | type = MTRR_NUM_TYPES; | ||
1207 | num[type]++; | ||
1208 | } | ||
1209 | |||
1210 | /* check if we got UC entries */ | ||
1211 | if (!num[MTRR_TYPE_UNCACHABLE]) | ||
1212 | return 0; | ||
1213 | |||
1214 | /* check if we only had WB and UC */ | ||
1215 | if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != | ||
1216 | num_var_ranges - num[MTRR_NUM_TYPES]) | ||
1217 | return 0; | ||
1218 | |||
1219 | memset(range, 0, sizeof(range)); | ||
1220 | extra_remove_size = 0; | ||
1221 | if (mtrr_tom2) { | ||
1222 | extra_remove_base = 1 << (32 - PAGE_SHIFT); | ||
1223 | extra_remove_size = | ||
1224 | (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; | ||
1225 | } | ||
1226 | nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, | ||
1227 | extra_remove_size); | ||
1228 | range_sums = sum_ranges(range, nr_range); | ||
1229 | printk(KERN_INFO "total RAM covered: %ldM\n", | ||
1230 | range_sums >> (20 - PAGE_SHIFT)); | ||
1231 | |||
1232 | if (mtrr_chunk_size && mtrr_gran_size) { | ||
1233 | int num_reg; | ||
1234 | |||
1235 | debug_print = 1; | ||
1236 | /* convert ranges to var ranges state */ | ||
1237 | num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, | ||
1238 | mtrr_gran_size); | ||
1239 | |||
1240 | /* we got new setting in range_state, check it */ | ||
1241 | memset(range_new, 0, sizeof(range_new)); | ||
1242 | nr_range_new = x86_get_mtrr_mem_range(range_new, 0, | ||
1243 | extra_remove_base, | ||
1244 | extra_remove_size); | ||
1245 | range_sums_new = sum_ranges(range_new, nr_range_new); | ||
1246 | |||
1247 | i = 0; | ||
1248 | result[i].chunk_sizek = mtrr_chunk_size >> 10; | ||
1249 | result[i].gran_sizek = mtrr_gran_size >> 10; | ||
1250 | result[i].num_reg = num_reg; | ||
1251 | if (range_sums < range_sums_new) { | ||
1252 | result[i].lose_cover_sizek = | ||
1253 | (range_sums_new - range_sums) << PSHIFT; | ||
1254 | result[i].bad = 1; | ||
1255 | } else | ||
1256 | result[i].lose_cover_sizek = | ||
1257 | (range_sums - range_sums_new) << PSHIFT; | ||
1258 | |||
1259 | printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", | ||
1260 | result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, | ||
1261 | result[i].chunk_sizek >> 10); | ||
1262 | printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", | ||
1263 | result[i].num_reg, result[i].bad?"-":"", | ||
1264 | result[i].lose_cover_sizek >> 10); | ||
1265 | if (!result[i].bad) { | ||
1266 | set_var_mtrr_all(address_bits); | ||
1267 | return 1; | ||
1268 | } | ||
1269 | printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " | ||
1270 | "will find optimal one\n"); | ||
1271 | debug_print = 0; | ||
1272 | memset(result, 0, sizeof(result[0])); | ||
1273 | } | ||
1274 | |||
1275 | i = 0; | ||
1276 | memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); | ||
1277 | memset(result, 0, sizeof(result)); | ||
1278 | for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { | ||
1279 | for (chunk_size = gran_size; chunk_size < (1ULL<<33); | ||
1280 | chunk_size <<= 1) { | ||
1281 | int num_reg; | ||
1282 | |||
1283 | if (debug_print) | ||
1284 | printk(KERN_INFO | ||
1285 | "\ngran_size: %lldM chunk_size_size: %lldM\n", | ||
1286 | gran_size >> 20, chunk_size >> 20); | ||
1287 | if (i >= NUM_RESULT) | ||
1288 | continue; | ||
1289 | |||
1290 | /* convert ranges to var ranges state */ | ||
1291 | num_reg = x86_setup_var_mtrrs(range, nr_range, | ||
1292 | chunk_size, gran_size); | ||
1293 | |||
1294 | /* we got new setting in range_state, check it */ | ||
1295 | memset(range_new, 0, sizeof(range_new)); | ||
1296 | nr_range_new = x86_get_mtrr_mem_range(range_new, 0, | ||
1297 | extra_remove_base, extra_remove_size); | ||
1298 | range_sums_new = sum_ranges(range_new, nr_range_new); | ||
1299 | |||
1300 | result[i].chunk_sizek = chunk_size >> 10; | ||
1301 | result[i].gran_sizek = gran_size >> 10; | ||
1302 | result[i].num_reg = num_reg; | ||
1303 | if (range_sums < range_sums_new) { | ||
1304 | result[i].lose_cover_sizek = | ||
1305 | (range_sums_new - range_sums) << PSHIFT; | ||
1306 | result[i].bad = 1; | ||
1307 | } else | ||
1308 | result[i].lose_cover_sizek = | ||
1309 | (range_sums - range_sums_new) << PSHIFT; | ||
1310 | |||
1311 | /* double check it */ | ||
1312 | if (!result[i].bad && !result[i].lose_cover_sizek) { | ||
1313 | if (nr_range_new != nr_range || | ||
1314 | memcmp(range, range_new, sizeof(range))) | ||
1315 | result[i].bad = 1; | ||
1316 | } | ||
1317 | |||
1318 | if (!result[i].bad && (range_sums - range_sums_new < | ||
1319 | min_loss_pfn[num_reg])) { | ||
1320 | min_loss_pfn[num_reg] = | ||
1321 | range_sums - range_sums_new; | ||
1322 | } | ||
1323 | i++; | ||
1324 | } | ||
1325 | } | ||
1326 | |||
1327 | /* print out all */ | ||
1328 | for (i = 0; i < NUM_RESULT; i++) { | ||
1329 | printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", | ||
1330 | result[i].bad ? "*BAD* " : " ", result[i].gran_sizek >> 10, | ||
1331 | result[i].chunk_sizek >> 10); | ||
1332 | printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", | ||
1333 | result[i].num_reg, result[i].bad ? "-" : "", | ||
1334 | result[i].lose_cover_sizek >> 10); | ||
1335 | } | ||
1336 | |||
1337 | /* try to find the optimal index */ | ||
1338 | if (nr_mtrr_spare_reg >= num_var_ranges) | ||
1339 | nr_mtrr_spare_reg = num_var_ranges - 1; | ||
1340 | num_reg_good = -1; | ||
1341 | for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { | ||
1342 | if (!min_loss_pfn[i]) { | ||
1343 | num_reg_good = i; | ||
1344 | break; | ||
1345 | } | ||
1346 | } | ||
1347 | |||
1348 | index_good = -1; | ||
1349 | if (num_reg_good != -1) { | ||
1350 | for (i = 0; i < NUM_RESULT; i++) { | ||
1351 | if (!result[i].bad && | ||
1352 | result[i].num_reg == num_reg_good && | ||
1353 | !result[i].lose_cover_sizek) { | ||
1354 | index_good = i; | ||
1355 | break; | ||
1356 | } | ||
1357 | } | ||
1358 | } | ||
1359 | |||
1360 | if (index_good != -1) { | ||
1361 | printk(KERN_INFO "Found optimal setting for mtrr cleanup\n"); | ||
1362 | i = index_good; | ||
1363 | printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", | ||
1364 | result[i].gran_sizek >> 10, | ||
1365 | result[i].chunk_sizek >> 10); | ||
1366 | printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", | ||
1367 | result[i].num_reg, | ||
1368 | result[i].lose_cover_sizek >> 10); | ||
1369 | /* convert ranges to var ranges state */ | ||
1370 | chunk_size = result[i].chunk_sizek; | ||
1371 | chunk_size <<= 10; | ||
1372 | gran_size = result[i].gran_sizek; | ||
1373 | gran_size <<= 10; | ||
1374 | debug_print = 1; | ||
1375 | x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); | ||
1376 | set_var_mtrr_all(address_bits); | ||
1377 | return 1; | ||
1378 | } | ||
1379 | |||
1380 | printk(KERN_INFO "mtrr_cleanup: cannot find an optimal value\n"); | ||
1381 | printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); | ||
1382 | |||
1383 | return 0; | ||
1384 | } | ||
1385 | #else | ||
1386 | static int __init mtrr_cleanup(unsigned address_bits) | ||
1387 | { | ||
1388 | return 0; | ||
1389 | } | ||
1390 | #endif | ||
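The search that closes out mtrr_cleanup() above is a plain brute-force sweep: gran_size doubles from 1MB to 2GB, chunk_size doubles from gran_size up to 4GB, and the candidate with the smallest RAM-coverage loss wins. Below is a minimal standalone sketch of that search pattern; evaluate() is a hypothetical stand-in for the kernel's x86_setup_var_mtrrs()/sum_ranges() cost measurement, not the real function.

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for "lay out MTRRs, measure lost coverage" */
static uint64_t evaluate(uint64_t chunk, uint64_t gran)
{
	return (chunk ^ gran) & 0xfffff;	/* dummy cost */
}

int main(void)
{
	uint64_t gran, chunk, best_cost = UINT64_MAX;
	uint64_t best_gran = 0, best_chunk = 0;

	/* gran_size: 1MB..2GB, chunk_size: gran..4GB, both doubling */
	for (gran = 1ULL << 20; gran < (1ULL << 32); gran <<= 1) {
		for (chunk = gran; chunk < (1ULL << 33); chunk <<= 1) {
			uint64_t cost = evaluate(chunk, gran);
			if (cost < best_cost) {
				best_cost = cost;
				best_gran = gran;
				best_chunk = chunk;
			}
		}
	}
	printf("gran %lluM chunk %lluM cost %llu\n",
	       (unsigned long long)(best_gran >> 20),
	       (unsigned long long)(best_chunk >> 20),
	       (unsigned long long)best_cost);
	return 0;
}

The search space is tiny (12 x up to 14 candidates), which is why an exhaustive sweep is acceptable in __init code.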
1391 | |||
1392 | static int __initdata changed_by_mtrr_cleanup; | ||
1393 | |||
612 | static int disable_mtrr_trim; | 1394 | static int disable_mtrr_trim; |
613 | 1395 | ||
614 | static int __init disable_mtrr_trim_setup(char *str) | 1396 | static int __init disable_mtrr_trim_setup(char *str) |
@@ -648,6 +1430,19 @@ int __init amd_special_default_mtrr(void) | |||
648 | return 0; | 1430 | return 0; |
649 | } | 1431 | } |
650 | 1432 | ||
1433 | static u64 __init real_trim_memory(unsigned long start_pfn, | ||
1434 | unsigned long limit_pfn) | ||
1435 | { | ||
1436 | u64 trim_start, trim_size; | ||
1437 | trim_start = start_pfn; | ||
1438 | trim_start <<= PAGE_SHIFT; | ||
1439 | trim_size = limit_pfn; | ||
1440 | trim_size <<= PAGE_SHIFT; | ||
1441 | trim_size -= trim_start; | ||
1442 | |||
1443 | return e820_update_range(trim_start, trim_size, E820_RAM, | ||
1444 | E820_RESERVED); | ||
1445 | } | ||
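real_trim_memory() widens each pfn to u64 before shifting, so ranges above 4GB don't overflow on 32-bit builds, and then hands the byte range to e820_update_range(). A quick sketch of the same conversion with made-up pfn values:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long start_pfn = 0x100000;	/* 4GB boundary at 4K pages */
	unsigned long limit_pfn = 0x140000;	/* 5GB */
	uint64_t trim_start = (uint64_t)start_pfn << PAGE_SHIFT;
	uint64_t trim_size  = ((uint64_t)limit_pfn << PAGE_SHIFT) - trim_start;

	printf("trim %lluMB starting at 0x%llx\n",
	       (unsigned long long)(trim_size >> 20),
	       (unsigned long long)trim_start);
	return 0;
}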
651 | /** | 1446 | /** |
652 | * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs | 1447 | * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs |
653 | * @end_pfn: ending page frame number | 1448 | * @end_pfn: ending page frame number |
@@ -663,8 +1458,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
663 | { | 1458 | { |
664 | unsigned long i, base, size, highest_pfn = 0, def, dummy; | 1459 | unsigned long i, base, size, highest_pfn = 0, def, dummy; |
665 | mtrr_type type; | 1460 | mtrr_type type; |
666 | u64 trim_start, trim_size; | 1461 | int nr_range; |
1462 | u64 total_trim_size; | ||
667 | 1463 | ||
1464 | /* extra one for all 0 */ | ||
1465 | int num[MTRR_NUM_TYPES + 1]; | ||
668 | /* | 1466 | /* |
669 | * Make sure we only trim uncachable memory on machines that | 1467 | * Make sure we only trim uncachable memory on machines that |
670 | * support the Intel MTRR architecture: | 1468 | * support the Intel MTRR architecture: |
@@ -676,14 +1474,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
676 | if (def != MTRR_TYPE_UNCACHABLE) | 1474 | if (def != MTRR_TYPE_UNCACHABLE) |
677 | return 0; | 1475 | return 0; |
678 | 1476 | ||
679 | if (amd_special_default_mtrr()) | 1477 | /* get it and store it aside */ |
680 | return 0; | 1478 | memset(range_state, 0, sizeof(range_state)); |
1479 | for (i = 0; i < num_var_ranges; i++) { | ||
1480 | mtrr_if->get(i, &base, &size, &type); | ||
1481 | range_state[i].base_pfn = base; | ||
1482 | range_state[i].size_pfn = size; | ||
1483 | range_state[i].type = type; | ||
1484 | } | ||
681 | 1485 | ||
682 | /* Find highest cached pfn */ | 1486 | /* Find highest cached pfn */ |
683 | for (i = 0; i < num_var_ranges; i++) { | 1487 | for (i = 0; i < num_var_ranges; i++) { |
684 | mtrr_if->get(i, &base, &size, &type); | 1488 | type = range_state[i].type; |
685 | if (type != MTRR_TYPE_WRBACK) | 1489 | if (type != MTRR_TYPE_WRBACK) |
686 | continue; | 1490 | continue; |
1491 | base = range_state[i].base_pfn; | ||
1492 | size = range_state[i].size_pfn; | ||
687 | if (highest_pfn < base + size) | 1493 | if (highest_pfn < base + size) |
688 | highest_pfn = base + size; | 1494 | highest_pfn = base + size; |
689 | } | 1495 | } |
@@ -698,22 +1504,65 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
698 | return 0; | 1504 | return 0; |
699 | } | 1505 | } |
700 | 1506 | ||
701 | if (highest_pfn < end_pfn) { | 1507 | /* check entries number */ |
1508 | memset(num, 0, sizeof(num)); | ||
1509 | for (i = 0; i < num_var_ranges; i++) { | ||
1510 | type = range_state[i].type; | ||
1511 | if (type >= MTRR_NUM_TYPES) | ||
1512 | continue; | ||
1513 | size = range_state[i].size_pfn; | ||
1514 | if (!size) | ||
1515 | type = MTRR_NUM_TYPES; | ||
1516 | num[type]++; | ||
1517 | } | ||
1518 | |||
1519 | /* no entry for WB? */ | ||
1520 | if (!num[MTRR_TYPE_WRBACK]) | ||
1521 | return 0; | ||
1522 | |||
1523 | /* check if we only had WB and UC */ | ||
1524 | if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != | ||
1525 | num_var_ranges - num[MTRR_NUM_TYPES]) | ||
1526 | return 0; | ||
1527 | |||
1528 | memset(range, 0, sizeof(range)); | ||
1529 | nr_range = 0; | ||
1530 | if (mtrr_tom2) { | ||
1531 | range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); | ||
1532 | range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; | ||
1533 | if (highest_pfn < range[nr_range].end + 1) | ||
1534 | highest_pfn = range[nr_range].end + 1; | ||
1535 | nr_range++; | ||
1536 | } | ||
1537 | nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); | ||
1538 | |||
1539 | total_trim_size = 0; | ||
1540 | /* check the head */ | ||
1541 | if (range[0].start) | ||
1542 | total_trim_size += real_trim_memory(0, range[0].start); | ||
1543 | /* check the holes */ | ||
1544 | for (i = 0; i < nr_range - 1; i++) { | ||
1545 | if (range[i].end + 1 < range[i+1].start) | ||
1546 | total_trim_size += real_trim_memory(range[i].end + 1, | ||
1547 | range[i+1].start); | ||
1548 | } | ||
1549 | /* check the top */ | ||
1550 | i = nr_range - 1; | ||
1551 | if (range[i].end + 1 < end_pfn) | ||
1552 | total_trim_size += real_trim_memory(range[i].end + 1, | ||
1553 | end_pfn); | ||
1554 | |||
1555 | if (total_trim_size) { | ||
702 | printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" | 1556 | printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" |
703 | " all of memory, losing %luMB of RAM.\n", | 1557 | " all of memory, losing %lluMB of RAM.\n", |
704 | (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); | 1558 | total_trim_size >> 20); |
705 | 1559 | ||
706 | WARN_ON(1); | 1560 | if (!changed_by_mtrr_cleanup) |
1561 | WARN_ON(1); | ||
707 | 1562 | ||
708 | printk(KERN_INFO "update e820 for mtrr\n"); | 1563 | printk(KERN_INFO "update e820 for mtrr\n"); |
709 | trim_start = highest_pfn; | ||
710 | trim_start <<= PAGE_SHIFT; | ||
711 | trim_size = end_pfn; | ||
712 | trim_size <<= PAGE_SHIFT; | ||
713 | trim_size -= trim_start; | ||
714 | update_memory_range(trim_start, trim_size, E820_RAM, | ||
715 | E820_RESERVED); | ||
716 | update_e820(); | 1564 | update_e820(); |
1565 | |||
717 | return 1; | 1566 | return 1; |
718 | } | 1567 | } |
719 | 1568 | ||
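Once the variable ranges have been collapsed into a sorted, non-overlapping range[] array, the trim logic above reduces to three checks: RAM below range[0].start (the head), RAM in the gaps between consecutive ranges (the holes), and RAM from the last range up to end_pfn (the top). A hedged standalone sketch of that walk; struct res_range and the pfn values here are invented for illustration:

#include <stdio.h>

struct res_range { unsigned long start, end; };	/* inclusive pfns */

int main(void)
{
	struct res_range range[] = { { 16, 255 }, { 512, 1023 } };
	int nr_range = 2, i;
	unsigned long end_pfn = 2048, trimmed = 0;

	if (range[0].start)				/* the head */
		trimmed += range[0].start;
	for (i = 0; i < nr_range - 1; i++)		/* the holes */
		if (range[i].end + 1 < range[i + 1].start)
			trimmed += range[i + 1].start - (range[i].end + 1);
	i = nr_range - 1;				/* the top */
	if (range[i].end + 1 < end_pfn)
		trimmed += end_pfn - (range[i].end + 1);

	printf("trimmed %lu pages\n", trimmed);
	return 0;
}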
@@ -729,18 +1578,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
729 | */ | 1578 | */ |
730 | void __init mtrr_bp_init(void) | 1579 | void __init mtrr_bp_init(void) |
731 | { | 1580 | { |
1581 | u32 phys_addr; | ||
732 | init_ifs(); | 1582 | init_ifs(); |
733 | 1583 | ||
1584 | phys_addr = 32; | ||
1585 | |||
734 | if (cpu_has_mtrr) { | 1586 | if (cpu_has_mtrr) { |
735 | mtrr_if = &generic_mtrr_ops; | 1587 | mtrr_if = &generic_mtrr_ops; |
736 | size_or_mask = 0xff000000; /* 36 bits */ | 1588 | size_or_mask = 0xff000000; /* 36 bits */ |
737 | size_and_mask = 0x00f00000; | 1589 | size_and_mask = 0x00f00000; |
1590 | phys_addr = 36; | ||
738 | 1591 | ||
739 | /* This is an AMD-specific MSR, but we assume (hope?) that | 1592 | /* This is an AMD-specific MSR, but we assume (hope?) that |
740 | Intel will implement it too when they extend the address | 1593 | Intel will implement it too when they extend the address |
741 | bus of the Xeon. */ | 1594 | bus of the Xeon. */ |
742 | if (cpuid_eax(0x80000000) >= 0x80000008) { | 1595 | if (cpuid_eax(0x80000000) >= 0x80000008) { |
743 | u32 phys_addr; | ||
744 | phys_addr = cpuid_eax(0x80000008) & 0xff; | 1596 | phys_addr = cpuid_eax(0x80000008) & 0xff; |
745 | /* CPUID workaround for Intel 0F33/0F34 CPU */ | 1597 | /* CPUID workaround for Intel 0F33/0F34 CPU */ |
746 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | 1598 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && |
@@ -758,6 +1610,7 @@ void __init mtrr_bp_init(void) | |||
758 | don't support PAE */ | 1610 | don't support PAE */ |
759 | size_or_mask = 0xfff00000; /* 32 bits */ | 1611 | size_or_mask = 0xfff00000; /* 32 bits */ |
760 | size_and_mask = 0; | 1612 | size_and_mask = 0; |
1613 | phys_addr = 32; | ||
761 | } | 1614 | } |
762 | } else { | 1615 | } else { |
763 | switch (boot_cpu_data.x86_vendor) { | 1616 | switch (boot_cpu_data.x86_vendor) { |
@@ -791,8 +1644,15 @@ void __init mtrr_bp_init(void) | |||
791 | if (mtrr_if) { | 1644 | if (mtrr_if) { |
792 | set_num_var_ranges(); | 1645 | set_num_var_ranges(); |
793 | init_table(); | 1646 | init_table(); |
794 | if (use_intel()) | 1647 | if (use_intel()) { |
795 | get_mtrr_state(); | 1648 | get_mtrr_state(); |
1649 | |||
1650 | if (mtrr_cleanup(phys_addr)) { | ||
1651 | changed_by_mtrr_cleanup = 1; | ||
1652 | mtrr_if->set_all(); | ||
1653 | } | ||
1654 | |||
1655 | } | ||
796 | } | 1656 | } |
797 | } | 1657 | } |
798 | 1658 | ||
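mtrr_bp_init() now derives the phys_addr width that mtrr_cleanup() needs: it defaults to 36 bits for generic MTRR CPUs and, when CPUID leaf 0x80000008 exists, takes the real physical-address width from bits 7:0 of EAX. A minimal userspace sketch of the same query, assuming GCC/Clang on x86 and the <cpuid.h> intrinsics:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, phys_addr = 36;

	if (__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) &&
	    eax >= 0x80000008) {
		__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
		phys_addr = eax & 0xff;		/* bits 7:0 = phys bits */
	}
	printf("physical address bits: %u\n", phys_addr);
	return 0;
}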
@@ -822,16 +1682,17 @@ void mtrr_ap_init(void) | |||
822 | */ | 1682 | */ |
823 | void mtrr_save_state(void) | 1683 | void mtrr_save_state(void) |
824 | { | 1684 | { |
825 | smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); | 1685 | smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); |
826 | } | 1686 | } |
827 | 1687 | ||
828 | static int __init mtrr_init_finialize(void) | 1688 | static int __init mtrr_init_finialize(void) |
829 | { | 1689 | { |
830 | if (!mtrr_if) | 1690 | if (!mtrr_if) |
831 | return 0; | 1691 | return 0; |
832 | if (use_intel()) | 1692 | if (use_intel()) { |
833 | mtrr_state_warn(); | 1693 | if (!changed_by_mtrr_cleanup) |
834 | else { | 1694 | mtrr_state_warn(); |
1695 | } else { | ||
835 | /* These CPUs don't have MTRRs and don't seem to support SMP. They have | 1696 | /* These CPUs don't have MTRRs and don't seem to support SMP. They have |
836 | * specific drivers, we use a tricky method to support | 1697 | * specific drivers, we use a tricky method to support |
837 | * suspend/resume for them. | 1698 | * suspend/resume for them. |
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2cc77eb6fea3..2dc4ec656b23 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h | |||
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt); | |||
81 | void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); | 81 | void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); |
82 | void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); | 82 | void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); |
83 | 83 | ||
84 | void fill_mtrr_var_range(unsigned int index, | ||
85 | u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); | ||
84 | void get_mtrr_state(void); | 86 | void get_mtrr_state(void); |
85 | 87 | ||
86 | extern void set_mtrr_ops(struct mtrr_ops * ops); | 88 | extern void set_mtrr_ops(struct mtrr_ops * ops); |
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if; | |||
92 | #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) | 94 | #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) |
93 | 95 | ||
94 | extern unsigned int num_var_ranges; | 96 | extern unsigned int num_var_ranges; |
97 | extern u64 mtrr_tom2; | ||
95 | 98 | ||
96 | void mtrr_state_warn(void); | 99 | void mtrr_state_warn(void); |
97 | const char *mtrr_attrib_to_str(int x); | 100 | const char *mtrr_attrib_to_str(int x); |
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f9ae93adffe5..de7439f82b92 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -1,11 +1,15 @@ | |||
1 | /* local apic based NMI watchdog for various CPUs. | 1 | /* |
2 | This file also handles reservation of performance counters for coordination | 2 | * local apic based NMI watchdog for various CPUs. |
3 | with other users (like oprofile). | 3 | * |
4 | 4 | * This file also handles reservation of performance counters for coordination | |
5 | Note that these events normally don't tick when the CPU idles. This means | 5 | * with other users (like oprofile). |
6 | the frequency varies with CPU load. | 6 | * |
7 | 7 | * Note that these events normally don't tick when the CPU idles. This means | |
8 | Original code for K7/P6 written by Keith Owens */ | 8 | * the frequency varies with CPU load. |
9 | * | ||
10 | * Original code for K7/P6 written by Keith Owens | ||
11 | * | ||
12 | */ | ||
9 | 13 | ||
10 | #include <linux/percpu.h> | 14 | #include <linux/percpu.h> |
11 | #include <linux/module.h> | 15 | #include <linux/module.h> |
@@ -36,12 +40,16 @@ struct wd_ops { | |||
36 | 40 | ||
37 | static const struct wd_ops *wd_ops; | 41 | static const struct wd_ops *wd_ops; |
38 | 42 | ||
39 | /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its | 43 | /* |
40 | * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) | 44 | * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its |
45 | * offset from MSR_P4_BSU_ESCR0. | ||
46 | * | ||
47 | * It will be the max for all platforms (for now) | ||
41 | */ | 48 | */ |
42 | #define NMI_MAX_COUNTER_BITS 66 | 49 | #define NMI_MAX_COUNTER_BITS 66 |
43 | 50 | ||
44 | /* perfctr_nmi_owner tracks the ownership of the perfctr registers: | 51 | /* |
52 | * perfctr_nmi_owner tracks the ownership of the perfctr registers: | ||
45 | * evtsel_nmi_owner tracks the ownership of the event selection | 53 | * evtsel_nmi_owner tracks the ownership of the event selection |
46 | * - different performance counters/ event selection may be reserved for | 54 | * - different performance counters/ event selection may be reserved for |
47 | * different subsystems this reservation system just tries to coordinate | 55 | * different subsystems this reservation system just tries to coordinate |
@@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) | |||
73 | return 0; | 81 | return 0; |
74 | } | 82 | } |
75 | 83 | ||
76 | /* converts an msr to an appropriate reservation bit */ | 84 | /* |
77 | /* returns the bit offset of the event selection register */ | 85 | * converts an msr to an appropriate reservation bit |
86 | * returns the bit offset of the event selection register | ||
87 | */ | ||
78 | static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) | 88 | static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) |
79 | { | 89 | { |
80 | /* returns the bit offset of the event selection register */ | 90 | /* returns the bit offset of the event selection register */ |
@@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) | |||
114 | 124 | ||
115 | return (!test_bit(counter, perfctr_nmi_owner)); | 125 | return (!test_bit(counter, perfctr_nmi_owner)); |
116 | } | 126 | } |
127 | EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); | ||
117 | 128 | ||
118 | int reserve_perfctr_nmi(unsigned int msr) | 129 | int reserve_perfctr_nmi(unsigned int msr) |
119 | { | 130 | { |
@@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr) | |||
128 | return 1; | 139 | return 1; |
129 | return 0; | 140 | return 0; |
130 | } | 141 | } |
142 | EXPORT_SYMBOL(reserve_perfctr_nmi); | ||
131 | 143 | ||
132 | void release_perfctr_nmi(unsigned int msr) | 144 | void release_perfctr_nmi(unsigned int msr) |
133 | { | 145 | { |
@@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr) | |||
140 | 152 | ||
141 | clear_bit(counter, perfctr_nmi_owner); | 153 | clear_bit(counter, perfctr_nmi_owner); |
142 | } | 154 | } |
155 | EXPORT_SYMBOL(release_perfctr_nmi); | ||
143 | 156 | ||
144 | int reserve_evntsel_nmi(unsigned int msr) | 157 | int reserve_evntsel_nmi(unsigned int msr) |
145 | { | 158 | { |
@@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr) | |||
154 | return 1; | 167 | return 1; |
155 | return 0; | 168 | return 0; |
156 | } | 169 | } |
170 | EXPORT_SYMBOL(reserve_evntsel_nmi); | ||
157 | 171 | ||
158 | void release_evntsel_nmi(unsigned int msr) | 172 | void release_evntsel_nmi(unsigned int msr) |
159 | { | 173 | { |
@@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr) | |||
166 | 180 | ||
167 | clear_bit(counter, evntsel_nmi_owner); | 181 | clear_bit(counter, evntsel_nmi_owner); |
168 | } | 182 | } |
169 | |||
170 | EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); | ||
171 | EXPORT_SYMBOL(reserve_perfctr_nmi); | ||
172 | EXPORT_SYMBOL(release_perfctr_nmi); | ||
173 | EXPORT_SYMBOL(reserve_evntsel_nmi); | ||
174 | EXPORT_SYMBOL(release_evntsel_nmi); | 183 | EXPORT_SYMBOL(release_evntsel_nmi); |
175 | 184 | ||
176 | void disable_lapic_nmi_watchdog(void) | 185 | void disable_lapic_nmi_watchdog(void) |
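The reserve/release helpers whose exports move above implement a simple ownership bitmap: each perfctr/evntsel MSR maps to a bit, and an atomic test-and-set coordinates the competing users (NMI watchdog, oprofile). A hedged userspace analogue, using C11 atomics in place of the kernel's bitmap ops:

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong perfctr_owner;	/* one bit per counter */

static int reserve_counter(unsigned int bit)
{
	unsigned long mask = 1UL << bit;
	/* set the bit; reservation fails if it was already set */
	return !(atomic_fetch_or(&perfctr_owner, mask) & mask);
}

static void release_counter(unsigned int bit)
{
	atomic_fetch_and(&perfctr_owner, ~(1UL << bit));
}

int main(void)
{
	printf("first reserve: %d\n", reserve_counter(0));	/* 1 */
	printf("second reserve: %d\n", reserve_counter(0));	/* 0 */
	release_counter(0);
	printf("after release: %d\n", reserve_counter(0));	/* 1 */
	return 0;
}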
@@ -180,8 +189,10 @@ void disable_lapic_nmi_watchdog(void) | |||
180 | if (atomic_read(&nmi_active) <= 0) | 189 | if (atomic_read(&nmi_active) <= 0) |
181 | return; | 190 | return; |
182 | 191 | ||
183 | on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); | 192 | on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); |
184 | wd_ops->unreserve(); | 193 | |
194 | if (wd_ops) | ||
195 | wd_ops->unreserve(); | ||
185 | 196 | ||
186 | BUG_ON(atomic_read(&nmi_active) != 0); | 197 | BUG_ON(atomic_read(&nmi_active) != 0); |
187 | } | 198 | } |
@@ -202,7 +213,7 @@ void enable_lapic_nmi_watchdog(void) | |||
202 | return; | 213 | return; |
203 | } | 214 | } |
204 | 215 | ||
205 | on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); | 216 | on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); |
206 | touch_nmi_watchdog(); | 217 | touch_nmi_watchdog(); |
207 | } | 218 | } |
208 | 219 | ||
@@ -232,31 +243,32 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) | |||
232 | return retval; | 243 | return retval; |
233 | } | 244 | } |
234 | 245 | ||
235 | static void | 246 | static void write_watchdog_counter(unsigned int perfctr_msr, |
236 | write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) | 247 | const char *descr, unsigned nmi_hz) |
237 | { | 248 | { |
238 | u64 count = (u64)cpu_khz * 1000; | 249 | u64 count = (u64)cpu_khz * 1000; |
239 | 250 | ||
240 | do_div(count, nmi_hz); | 251 | do_div(count, nmi_hz); |
241 | if (descr) | 252 | if (descr) |
242 | Dprintk("setting %s to -0x%08Lx\n", descr, count); | 253 | pr_debug("setting %s to -0x%08Lx\n", descr, count); |
243 | wrmsrl(perfctr_msr, 0 - count); | 254 | wrmsrl(perfctr_msr, 0 - count); |
244 | } | 255 | } |
245 | 256 | ||
246 | static void write_watchdog_counter32(unsigned int perfctr_msr, | 257 | static void write_watchdog_counter32(unsigned int perfctr_msr, |
247 | const char *descr, unsigned nmi_hz) | 258 | const char *descr, unsigned nmi_hz) |
248 | { | 259 | { |
249 | u64 count = (u64)cpu_khz * 1000; | 260 | u64 count = (u64)cpu_khz * 1000; |
250 | 261 | ||
251 | do_div(count, nmi_hz); | 262 | do_div(count, nmi_hz); |
252 | if(descr) | 263 | if(descr) |
253 | Dprintk("setting %s to -0x%08Lx\n", descr, count); | 264 | pr_debug("setting %s to -0x%08Lx\n", descr, count); |
254 | wrmsr(perfctr_msr, (u32)(-count), 0); | 265 | wrmsr(perfctr_msr, (u32)(-count), 0); |
255 | } | 266 | } |
256 | 267 | ||
257 | /* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface | 268 | /* |
258 | nicely stable so there is not much variety */ | 269 | * AMD K7/K8/Family10h/Family11h support. |
259 | 270 | * AMD keeps this interface nicely stable so there is not much variety | |
271 | */ | ||
260 | #define K7_EVNTSEL_ENABLE (1 << 22) | 272 | #define K7_EVNTSEL_ENABLE (1 << 22) |
261 | #define K7_EVNTSEL_INT (1 << 20) | 273 | #define K7_EVNTSEL_INT (1 << 20) |
262 | #define K7_EVNTSEL_OS (1 << 17) | 274 | #define K7_EVNTSEL_OS (1 << 17) |
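The write_watchdog_counter() helpers above program the perfctr with the negated tick budget, so the counter overflows (and raises the NMI) after roughly cpu_khz * 1000 / nmi_hz cycles. The arithmetic in isolation, with an assumed 2 GHz CPU:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cpu_khz = 2000000;	/* assumed 2 GHz CPU */
	unsigned nmi_hz = 1;		/* one watchdog tick per second */
	uint64_t count = cpu_khz * 1000 / nmi_hz;

	/* the MSR is written with -count so it counts up to overflow */
	printf("preload: -0x%08llx (overflow after %llu cycles)\n",
	       (unsigned long long)count, (unsigned long long)count);
	return 0;
}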
@@ -289,7 +301,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) | |||
289 | 301 | ||
290 | wd->perfctr_msr = perfctr_msr; | 302 | wd->perfctr_msr = perfctr_msr; |
291 | wd->evntsel_msr = evntsel_msr; | 303 | wd->evntsel_msr = evntsel_msr; |
292 | wd->cccr_msr = 0; //unused | 304 | wd->cccr_msr = 0; /* unused */ |
293 | return 1; | 305 | return 1; |
294 | } | 306 | } |
295 | 307 | ||
@@ -325,18 +337,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | |||
325 | } | 337 | } |
326 | 338 | ||
327 | static const struct wd_ops k7_wd_ops = { | 339 | static const struct wd_ops k7_wd_ops = { |
328 | .reserve = single_msr_reserve, | 340 | .reserve = single_msr_reserve, |
329 | .unreserve = single_msr_unreserve, | 341 | .unreserve = single_msr_unreserve, |
330 | .setup = setup_k7_watchdog, | 342 | .setup = setup_k7_watchdog, |
331 | .rearm = single_msr_rearm, | 343 | .rearm = single_msr_rearm, |
332 | .stop = single_msr_stop_watchdog, | 344 | .stop = single_msr_stop_watchdog, |
333 | .perfctr = MSR_K7_PERFCTR0, | 345 | .perfctr = MSR_K7_PERFCTR0, |
334 | .evntsel = MSR_K7_EVNTSEL0, | 346 | .evntsel = MSR_K7_EVNTSEL0, |
335 | .checkbit = 1ULL<<47, | 347 | .checkbit = 1ULL << 47, |
336 | }; | 348 | }; |
337 | 349 | ||
338 | /* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ | 350 | /* |
339 | 351 | * Intel Model 6 (PPro+,P2,P3,P-M,Core1) | |
352 | */ | ||
340 | #define P6_EVNTSEL0_ENABLE (1 << 22) | 353 | #define P6_EVNTSEL0_ENABLE (1 << 22) |
341 | #define P6_EVNTSEL_INT (1 << 20) | 354 | #define P6_EVNTSEL_INT (1 << 20) |
342 | #define P6_EVNTSEL_OS (1 << 17) | 355 | #define P6_EVNTSEL_OS (1 << 17) |
@@ -372,52 +385,58 @@ static int setup_p6_watchdog(unsigned nmi_hz) | |||
372 | 385 | ||
373 | wd->perfctr_msr = perfctr_msr; | 386 | wd->perfctr_msr = perfctr_msr; |
374 | wd->evntsel_msr = evntsel_msr; | 387 | wd->evntsel_msr = evntsel_msr; |
375 | wd->cccr_msr = 0; //unused | 388 | wd->cccr_msr = 0; /* unused */ |
376 | return 1; | 389 | return 1; |
377 | } | 390 | } |
378 | 391 | ||
379 | static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | 392 | static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) |
380 | { | 393 | { |
381 | /* P6-based Pentium M needs to re-unmask | 394 | /* |
395 | * P6-based Pentium M needs to re-unmask | ||
382 | * the apic vector but it doesn't hurt | 396 | * the apic vector but it doesn't hurt |
383 | * other P6 variants. | 397 | * other P6 variants. |
384 | * ArchPerfmon/Core Duo also needs this */ | 398 | * ArchPerfmon/Core Duo also needs this |
399 | */ | ||
385 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 400 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
401 | |||
386 | /* P6/ARCH_PERFMON has 32 bit counter write */ | 402 | /* P6/ARCH_PERFMON has 32 bit counter write */ |
387 | write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); | 403 | write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); |
388 | } | 404 | } |
389 | 405 | ||
390 | static const struct wd_ops p6_wd_ops = { | 406 | static const struct wd_ops p6_wd_ops = { |
391 | .reserve = single_msr_reserve, | 407 | .reserve = single_msr_reserve, |
392 | .unreserve = single_msr_unreserve, | 408 | .unreserve = single_msr_unreserve, |
393 | .setup = setup_p6_watchdog, | 409 | .setup = setup_p6_watchdog, |
394 | .rearm = p6_rearm, | 410 | .rearm = p6_rearm, |
395 | .stop = single_msr_stop_watchdog, | 411 | .stop = single_msr_stop_watchdog, |
396 | .perfctr = MSR_P6_PERFCTR0, | 412 | .perfctr = MSR_P6_PERFCTR0, |
397 | .evntsel = MSR_P6_EVNTSEL0, | 413 | .evntsel = MSR_P6_EVNTSEL0, |
398 | .checkbit = 1ULL<<39, | 414 | .checkbit = 1ULL << 39, |
399 | }; | 415 | }; |
400 | 416 | ||
401 | /* Intel P4 performance counters. By far the most complicated of all. */ | 417 | /* |
402 | 418 | * Intel P4 performance counters. | |
403 | #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) | 419 | * By far the most complicated of all. |
404 | #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) | 420 | */ |
405 | #define P4_ESCR_OS (1<<3) | 421 | #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) |
406 | #define P4_ESCR_USR (1<<2) | 422 | #define P4_ESCR_EVENT_SELECT(N) ((N) << 25) |
407 | #define P4_CCCR_OVF_PMI0 (1<<26) | 423 | #define P4_ESCR_OS (1 << 3) |
408 | #define P4_CCCR_OVF_PMI1 (1<<27) | 424 | #define P4_ESCR_USR (1 << 2) |
409 | #define P4_CCCR_THRESHOLD(N) ((N)<<20) | 425 | #define P4_CCCR_OVF_PMI0 (1 << 26) |
410 | #define P4_CCCR_COMPLEMENT (1<<19) | 426 | #define P4_CCCR_OVF_PMI1 (1 << 27) |
411 | #define P4_CCCR_COMPARE (1<<18) | 427 | #define P4_CCCR_THRESHOLD(N) ((N) << 20) |
412 | #define P4_CCCR_REQUIRED (3<<16) | 428 | #define P4_CCCR_COMPLEMENT (1 << 19) |
413 | #define P4_CCCR_ESCR_SELECT(N) ((N)<<13) | 429 | #define P4_CCCR_COMPARE (1 << 18) |
414 | #define P4_CCCR_ENABLE (1<<12) | 430 | #define P4_CCCR_REQUIRED (3 << 16) |
415 | #define P4_CCCR_OVF (1<<31) | 431 | #define P4_CCCR_ESCR_SELECT(N) ((N) << 13) |
416 | 432 | #define P4_CCCR_ENABLE (1 << 12) | |
417 | /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter | 433 | #define P4_CCCR_OVF (1 << 31) |
418 | CRU_ESCR0 (with any non-null event selector) through a complemented | ||
419 | max threshold. [IA32-Vol3, Section 14.9.9] */ | ||
420 | 434 | ||
435 | /* | ||
436 | * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter | ||
437 | * CRU_ESCR0 (with any non-null event selector) through a complemented | ||
438 | * max threshold. [IA32-Vol3, Section 14.9.9] | ||
439 | */ | ||
421 | static int setup_p4_watchdog(unsigned nmi_hz) | 440 | static int setup_p4_watchdog(unsigned nmi_hz) |
422 | { | 441 | { |
423 | unsigned int perfctr_msr, evntsel_msr, cccr_msr; | 442 | unsigned int perfctr_msr, evntsel_msr, cccr_msr; |
@@ -442,7 +461,8 @@ static int setup_p4_watchdog(unsigned nmi_hz) | |||
442 | #endif | 461 | #endif |
443 | ht_num = 0; | 462 | ht_num = 0; |
444 | 463 | ||
445 | /* performance counters are shared resources | 464 | /* |
465 | * performance counters are shared resources | ||
446 | * assign each hyperthread its own set | 466 | * assign each hyperthread its own set |
447 | * (re-use the ESCR0 register, seems safe | 467 | * (re-use the ESCR0 register, seems safe |
448 | * and keeps the cccr_val the same) | 468 | * and keeps the cccr_val the same) |
@@ -540,20 +560,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | |||
540 | } | 560 | } |
541 | 561 | ||
542 | static const struct wd_ops p4_wd_ops = { | 562 | static const struct wd_ops p4_wd_ops = { |
543 | .reserve = p4_reserve, | 563 | .reserve = p4_reserve, |
544 | .unreserve = p4_unreserve, | 564 | .unreserve = p4_unreserve, |
545 | .setup = setup_p4_watchdog, | 565 | .setup = setup_p4_watchdog, |
546 | .rearm = p4_rearm, | 566 | .rearm = p4_rearm, |
547 | .stop = stop_p4_watchdog, | 567 | .stop = stop_p4_watchdog, |
548 | /* RED-PEN this is wrong for the other sibling */ | 568 | /* RED-PEN this is wrong for the other sibling */ |
549 | .perfctr = MSR_P4_BPU_PERFCTR0, | 569 | .perfctr = MSR_P4_BPU_PERFCTR0, |
550 | .evntsel = MSR_P4_BSU_ESCR0, | 570 | .evntsel = MSR_P4_BSU_ESCR0, |
551 | .checkbit = 1ULL<<39, | 571 | .checkbit = 1ULL << 39, |
552 | }; | 572 | }; |
553 | 573 | ||
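With the P4 defines reformatted above, it is easier to see how setup_p4_watchdog() composes its CCCR value: select an ESCR, enable the counter, and use complement+compare against the max threshold so every cycle counts (the IQ_COUNTER0-as-clock trick from the comment that follows). A sketch of the value construction only, no MSR access; the select/threshold operands mirror what the watchdog plausibly uses and are assumptions here:

#include <stdio.h>

#define P4_CCCR_OVF_PMI0	(1 << 26)
#define P4_CCCR_THRESHOLD(N)	((N) << 20)
#define P4_CCCR_COMPLEMENT	(1 << 19)
#define P4_CCCR_COMPARE		(1 << 18)
#define P4_CCCR_REQUIRED	(3 << 16)
#define P4_CCCR_ESCR_SELECT(N)	((N) << 13)
#define P4_CCCR_ENABLE		(1 << 12)

int main(void)
{
	/* filter CRU_ESCR0 (select 4) through a complemented max threshold */
	unsigned int cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_THRESHOLD(15) |
				P4_CCCR_COMPLEMENT | P4_CCCR_COMPARE |
				P4_CCCR_REQUIRED | P4_CCCR_ESCR_SELECT(4) |
				P4_CCCR_ENABLE;

	printf("IQ_CCCR0 = 0x%08x\n", cccr_val);
	return 0;
}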
554 | /* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully | 574 | /* |
555 | all future Intel CPUs. */ | 575 | * Watchdog using the Intel architected PerfMon. |
556 | 576 | * Used for Core2 and hopefully all future Intel CPUs. | |
577 | */ | ||
557 | #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL | 578 | #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL |
558 | #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK | 579 | #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK |
559 | 580 | ||
@@ -599,19 +620,19 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) | |||
599 | 620 | ||
600 | wd->perfctr_msr = perfctr_msr; | 621 | wd->perfctr_msr = perfctr_msr; |
601 | wd->evntsel_msr = evntsel_msr; | 622 | wd->evntsel_msr = evntsel_msr; |
602 | wd->cccr_msr = 0; //unused | 623 | wd->cccr_msr = 0; /* unused */ |
603 | intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); | 624 | intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); |
604 | return 1; | 625 | return 1; |
605 | } | 626 | } |
606 | 627 | ||
607 | static struct wd_ops intel_arch_wd_ops __read_mostly = { | 628 | static struct wd_ops intel_arch_wd_ops __read_mostly = { |
608 | .reserve = single_msr_reserve, | 629 | .reserve = single_msr_reserve, |
609 | .unreserve = single_msr_unreserve, | 630 | .unreserve = single_msr_unreserve, |
610 | .setup = setup_intel_arch_watchdog, | 631 | .setup = setup_intel_arch_watchdog, |
611 | .rearm = p6_rearm, | 632 | .rearm = p6_rearm, |
612 | .stop = single_msr_stop_watchdog, | 633 | .stop = single_msr_stop_watchdog, |
613 | .perfctr = MSR_ARCH_PERFMON_PERFCTR1, | 634 | .perfctr = MSR_ARCH_PERFMON_PERFCTR1, |
614 | .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, | 635 | .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, |
615 | }; | 636 | }; |
616 | 637 | ||
617 | static void probe_nmi_watchdog(void) | 638 | static void probe_nmi_watchdog(void) |
@@ -624,8 +645,10 @@ static void probe_nmi_watchdog(void) | |||
624 | wd_ops = &k7_wd_ops; | 645 | wd_ops = &k7_wd_ops; |
625 | break; | 646 | break; |
626 | case X86_VENDOR_INTEL: | 647 | case X86_VENDOR_INTEL: |
627 | /* Work around Core Duo (Yonah) errata AE49 where perfctr1 | 648 | /* |
628 | doesn't have a working enable bit. */ | 649 | * Work around Core Duo (Yonah) errata AE49 where perfctr1 |
650 | * doesn't have a working enable bit. | ||
651 | */ | ||
629 | if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { | 652 | if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { |
630 | intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; | 653 | intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; |
631 | intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; | 654 | intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; |
@@ -636,7 +659,7 @@ static void probe_nmi_watchdog(void) | |||
636 | } | 659 | } |
637 | switch (boot_cpu_data.x86) { | 660 | switch (boot_cpu_data.x86) { |
638 | case 6: | 661 | case 6: |
639 | if (boot_cpu_data.x86_model > 0xd) | 662 | if (boot_cpu_data.x86_model > 13) |
640 | return; | 663 | return; |
641 | 664 | ||
642 | wd_ops = &p6_wd_ops; | 665 | wd_ops = &p6_wd_ops; |
@@ -697,10 +720,11 @@ int lapic_wd_event(unsigned nmi_hz) | |||
697 | { | 720 | { |
698 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | 721 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); |
699 | u64 ctr; | 722 | u64 ctr; |
723 | |||
700 | rdmsrl(wd->perfctr_msr, ctr); | 724 | rdmsrl(wd->perfctr_msr, ctr); |
701 | if (ctr & wd_ops->checkbit) { /* perfctr still running? */ | 725 | if (ctr & wd_ops->checkbit) /* perfctr still running? */ |
702 | return 0; | 726 | return 0; |
703 | } | 727 | |
704 | wd_ops->rearm(wd, nmi_hz); | 728 | wd_ops->rearm(wd, nmi_hz); |
705 | return 1; | 729 | return 1; |
706 | } | 730 | } |
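lapic_wd_event() distinguishes "our" NMI from stray ones by reading the counter back: while counting up from the negative preload, the checkbit (the counter's top implemented bit) stays set; once it clears, the counter overflowed and must be rearmed. The test in isolation, assuming the K7's 48-bit counters:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t checkbit = 1ULL << 47;		/* K7: 48-bit counters */
	uint64_t running  = (uint64_t)-250000 & ((1ULL << 48) - 1);
	uint64_t wrapped  = 3;			/* just past overflow */

	printf("running: %s\n", (running & checkbit) ? "yes" : "no");
	printf("wrapped: %s\n", (wrapped & checkbit) ? "yes" : "no");
	return 0;
}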
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 0d0d9057e7c0..a26c480b9491 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c | |||
@@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) | |||
160 | { | 160 | { |
161 | if (*pos == 0) /* just in case, cpu 0 is not the first */ | 161 | if (*pos == 0) /* just in case, cpu 0 is not the first */ |
162 | *pos = first_cpu(cpu_online_map); | 162 | *pos = first_cpu(cpu_online_map); |
163 | if ((*pos) < NR_CPUS && cpu_online(*pos)) | 163 | if ((*pos) < nr_cpu_ids && cpu_online(*pos)) |
164 | return &cpu_data(*pos); | 164 | return &cpu_data(*pos); |
165 | return NULL; | 165 | return NULL; |
166 | } | 166 | } |
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index daff52a62248..14b11b3be31c 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/init.h> | 33 | #include <linux/init.h> |
34 | #include <linux/poll.h> | 34 | #include <linux/poll.h> |
35 | #include <linux/smp.h> | 35 | #include <linux/smp.h> |
36 | #include <linux/smp_lock.h> | ||
36 | #include <linux/major.h> | 37 | #include <linux/major.h> |
37 | #include <linux/fs.h> | 38 | #include <linux/fs.h> |
38 | #include <linux/smp_lock.h> | 39 | #include <linux/smp_lock.h> |
@@ -95,7 +96,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, | |||
95 | for (; count; count -= 16) { | 96 | for (; count; count -= 16) { |
96 | cmd.eax = pos; | 97 | cmd.eax = pos; |
97 | cmd.ecx = pos >> 32; | 98 | cmd.ecx = pos >> 32; |
98 | smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); | 99 | smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); |
99 | if (copy_to_user(tmp, &cmd, 16)) | 100 | if (copy_to_user(tmp, &cmd, 16)) |
100 | return -EFAULT; | 101 | return -EFAULT; |
101 | tmp += 16; | 102 | tmp += 16; |
@@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, | |||
107 | 108 | ||
108 | static int cpuid_open(struct inode *inode, struct file *file) | 109 | static int cpuid_open(struct inode *inode, struct file *file) |
109 | { | 110 | { |
110 | unsigned int cpu = iminor(file->f_path.dentry->d_inode); | 111 | unsigned int cpu; |
111 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 112 | struct cpuinfo_x86 *c; |
112 | 113 | int ret = 0; | |
113 | if (cpu >= NR_CPUS || !cpu_online(cpu)) | 114 | |
114 | return -ENXIO; /* No such CPU */ | 115 | lock_kernel(); |
116 | |||
117 | cpu = iminor(file->f_path.dentry->d_inode); | ||
118 | if (cpu >= NR_CPUS || !cpu_online(cpu)) { | ||
119 | ret = -ENXIO; /* No such CPU */ | ||
120 | goto out; | ||
121 | } | ||
122 | c = &cpu_data(cpu); | ||
115 | if (c->cpuid_level < 0) | 123 | if (c->cpuid_level < 0) |
116 | return -EIO; /* CPUID not supported */ | 124 | ret = -EIO; /* CPUID not supported */ |
117 | 125 | out: | |
118 | return 0; | 126 | unlock_kernel(); |
127 | return ret; | ||
119 | } | 128 | } |
120 | 129 | ||
121 | /* | 130 | /* |
@@ -132,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu) | |||
132 | { | 141 | { |
133 | struct device *dev; | 142 | struct device *dev; |
134 | 143 | ||
135 | dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), | 144 | dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), |
136 | "cpu%d", cpu); | 145 | NULL, "cpu%d", cpu); |
137 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 146 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
138 | } | 147 | } |
139 | 148 | ||
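The cpuid_read() path above documents the /dev/cpu/N/cpuid contract: the 64-bit file offset encodes the leaf (low 32 bits select EAX, high 32 bits ECX), the query is bounced to the target CPU with smp_call_function_single(), and each 16-byte read returns EAX/EBX/ECX/EDX. A hedged userspace consumer under those assumptions (register order taken from struct cpuid_regs):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];
	/* leaf 0 of CPU 0; the high half of the offset would select ECX */
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0 || pread(fd, regs, 16, 0) != 16) {
		perror("cpuid");
		return 1;
	}
	printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
	       regs[0], regs[1], regs[2], regs[3]);
	close(fd);
	return 0;
}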
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c new file mode 100644 index 000000000000..9af89078f7bb --- /dev/null +++ b/arch/x86/kernel/e820.c | |||
@@ -0,0 +1,1365 @@ | |||
1 | /* | ||
2 | * Handle the memory map. | ||
3 | * The functions here do the job until bootmem takes over. | ||
4 | * | ||
5 | * Getting sanitize_e820_map() in sync with i386 version by applying change: | ||
6 | * - Provisions for empty E820 memory regions (reported by certain BIOSes). | ||
7 | * Alex Achenbach <xela@slit.de>, December 2002. | ||
8 | * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/bootmem.h> | ||
15 | #include <linux/ioport.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/kexec.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/pfn.h> | ||
21 | #include <linux/suspend.h> | ||
22 | #include <linux/firmware-map.h> | ||
23 | |||
24 | #include <asm/pgtable.h> | ||
25 | #include <asm/page.h> | ||
26 | #include <asm/e820.h> | ||
27 | #include <asm/proto.h> | ||
28 | #include <asm/setup.h> | ||
29 | #include <asm/trampoline.h> | ||
30 | |||
31 | /* | ||
32 | * The e820 map is the map that gets modified e.g. with command line parameters | ||
33 | * and that is also registered with modifications in the kernel resource tree | ||
34 | * with the iomem_resource as parent. | ||
35 | * | ||
36 | * The e820_saved is directly saved after the BIOS-provided memory map is | ||
37 | * copied. It doesn't get modified afterwards. It's registered for the | ||
38 | * /sys/firmware/memmap interface. | ||
39 | * | ||
40 | * That memory map is not modified and is used as base for kexec. The kexec'd | ||
41 | * kernel should get the same memory map as the firmware provides. Then the | ||
42 | * user can e.g. boot the original kernel with mem=1G while still booting the | ||
43 | * next kernel with full memory. | ||
44 | */ | ||
45 | struct e820map e820; | ||
46 | struct e820map e820_saved; | ||
47 | |||
48 | /* For PCI or other memory-mapped resources */ | ||
49 | unsigned long pci_mem_start = 0xaeedbabe; | ||
50 | #ifdef CONFIG_PCI | ||
51 | EXPORT_SYMBOL(pci_mem_start); | ||
52 | #endif | ||
53 | |||
54 | /* | ||
55 | * This function checks if any part of the range <start,end> is mapped | ||
56 | * with type. | ||
57 | */ | ||
58 | int | ||
59 | e820_any_mapped(u64 start, u64 end, unsigned type) | ||
60 | { | ||
61 | int i; | ||
62 | |||
63 | for (i = 0; i < e820.nr_map; i++) { | ||
64 | struct e820entry *ei = &e820.map[i]; | ||
65 | |||
66 | if (type && ei->type != type) | ||
67 | continue; | ||
68 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
69 | continue; | ||
70 | return 1; | ||
71 | } | ||
72 | return 0; | ||
73 | } | ||
74 | EXPORT_SYMBOL_GPL(e820_any_mapped); | ||
75 | |||
76 | /* | ||
77 | * This function checks if the entire range <start,end> is mapped with type. | ||
78 | * | ||
79 | * Note: this function only works correctly if the e820 table is sorted and | ||
80 | * non-overlapping, which is the case. | ||
81 | */ | ||
82 | int __init e820_all_mapped(u64 start, u64 end, unsigned type) | ||
83 | { | ||
84 | int i; | ||
85 | |||
86 | for (i = 0; i < e820.nr_map; i++) { | ||
87 | struct e820entry *ei = &e820.map[i]; | ||
88 | |||
89 | if (type && ei->type != type) | ||
90 | continue; | ||
91 | /* does the region (partially) overlap the current range? */ | ||
92 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
93 | continue; | ||
94 | |||
95 | /* if the region is at the beginning of <start,end> we move | ||
96 | * start to the end of the region, since coverage holds up to there | ||
97 | */ | ||
98 | if (ei->addr <= start) | ||
99 | start = ei->addr + ei->size; | ||
100 | /* | ||
101 | * if start is now at or beyond end, we're done, full | ||
102 | * coverage | ||
103 | */ | ||
104 | if (start >= end) | ||
105 | return 1; | ||
106 | } | ||
107 | return 0; | ||
108 | } | ||
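Both helpers use the standard half-open interval test: entry [addr, addr+size) misses [start, end) exactly when addr >= end or addr+size <= start; e820_all_mapped() additionally advances start across each matching entry until it reaches end. The core predicate, extracted into a standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* 1 if [a_start, a_end) and [b_start, b_end) share any byte */
static int overlaps(uint64_t a_start, uint64_t a_end,
		    uint64_t b_start, uint64_t b_end)
{
	return !(a_start >= b_end || a_end <= b_start);
}

int main(void)
{
	printf("%d\n", overlaps(0x1000, 0x2000, 0x1800, 0x3000));	/* 1 */
	printf("%d\n", overlaps(0x1000, 0x2000, 0x2000, 0x3000));	/* 0 */
	return 0;
}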
109 | |||
110 | /* | ||
111 | * Add a memory region to the kernel e820 map. | ||
112 | */ | ||
113 | void __init e820_add_region(u64 start, u64 size, int type) | ||
114 | { | ||
115 | int x = e820.nr_map; | ||
116 | |||
117 | if (x == ARRAY_SIZE(e820.map)) { | ||
118 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
119 | return; | ||
120 | } | ||
121 | |||
122 | e820.map[x].addr = start; | ||
123 | e820.map[x].size = size; | ||
124 | e820.map[x].type = type; | ||
125 | e820.nr_map++; | ||
126 | } | ||
127 | |||
128 | void __init e820_print_map(char *who) | ||
129 | { | ||
130 | int i; | ||
131 | |||
132 | for (i = 0; i < e820.nr_map; i++) { | ||
133 | printk(KERN_INFO " %s: %016Lx - %016Lx ", who, | ||
134 | (unsigned long long) e820.map[i].addr, | ||
135 | (unsigned long long) | ||
136 | (e820.map[i].addr + e820.map[i].size)); | ||
137 | switch (e820.map[i].type) { | ||
138 | case E820_RAM: | ||
139 | case E820_RESERVED_KERN: | ||
140 | printk(KERN_CONT "(usable)\n"); | ||
141 | break; | ||
142 | case E820_RESERVED: | ||
143 | printk(KERN_CONT "(reserved)\n"); | ||
144 | break; | ||
145 | case E820_ACPI: | ||
146 | printk(KERN_CONT "(ACPI data)\n"); | ||
147 | break; | ||
148 | case E820_NVS: | ||
149 | printk(KERN_CONT "(ACPI NVS)\n"); | ||
150 | break; | ||
151 | default: | ||
152 | printk(KERN_CONT "type %u\n", e820.map[i].type); | ||
153 | break; | ||
154 | } | ||
155 | } | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * Sanitize the BIOS e820 map. | ||
160 | * | ||
161 | * Some e820 responses include overlapping entries. The following | ||
162 | * replaces the original e820 map with a new one, removing overlaps, | ||
163 | * and resolving conflicting memory types in favor of highest | ||
164 | * numbered type. | ||
165 | * | ||
166 | * The input parameter biosmap points to an array of 'struct | ||
167 | * e820entry' which on entry has elements in the range [0, *pnr_map) | ||
168 | * valid, and which has space for up to max_nr_map entries. | ||
169 | * On return, the resulting sanitized e820 map entries will be | ||
170 | * overwritten in the same location, starting at biosmap. | ||
171 | * | ||
172 | * The integer pointed to by pnr_map must be valid on entry (the | ||
173 | * current number of valid entries located at biosmap) and will | ||
174 | * be updated on return, with the new number of valid entries | ||
175 | * (at most max_nr_map). | ||
176 | * | ||
177 | * The return value from sanitize_e820_map() is zero if it | ||
178 | * successfully 'sanitized' the map entries passed in, and is -1 | ||
179 | * if it did nothing, which can happen if either of (1) it was | ||
180 | * only passed one map entry, or (2) any of the input map entries | ||
181 | * were invalid (start + size < start, meaning that the size was | ||
182 | * so big the described memory range wrapped around through zero.) | ||
183 | * | ||
184 | * Visually we're performing the following | ||
185 | * (1,2,3,4 = memory types)... | ||
186 | * | ||
187 | * Sample memory map (w/overlaps): | ||
188 | * ____22__________________ | ||
189 | * ______________________4_ | ||
190 | * ____1111________________ | ||
191 | * _44_____________________ | ||
192 | * 11111111________________ | ||
193 | * ____________________33__ | ||
194 | * ___________44___________ | ||
195 | * __________33333_________ | ||
196 | * ______________22________ | ||
197 | * ___________________2222_ | ||
198 | * _________111111111______ | ||
199 | * _____________________11_ | ||
200 | * _________________4______ | ||
201 | * | ||
202 | * Sanitized equivalent (no overlap): | ||
203 | * 1_______________________ | ||
204 | * _44_____________________ | ||
205 | * ___1____________________ | ||
206 | * ____22__________________ | ||
207 | * ______11________________ | ||
208 | * _________1______________ | ||
209 | * __________3_____________ | ||
210 | * ___________44___________ | ||
211 | * _____________33_________ | ||
212 | * _______________2________ | ||
213 | * ________________1_______ | ||
214 | * _________________4______ | ||
215 | * ___________________2____ | ||
216 | * ____________________33__ | ||
217 | * ______________________4_ | ||
218 | */ | ||
219 | |||
220 | int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, | ||
221 | int *pnr_map) | ||
222 | { | ||
223 | struct change_member { | ||
224 | struct e820entry *pbios; /* pointer to original bios entry */ | ||
225 | unsigned long long addr; /* address for this change point */ | ||
226 | }; | ||
227 | static struct change_member change_point_list[2*E820_X_MAX] __initdata; | ||
228 | static struct change_member *change_point[2*E820_X_MAX] __initdata; | ||
229 | static struct e820entry *overlap_list[E820_X_MAX] __initdata; | ||
230 | static struct e820entry new_bios[E820_X_MAX] __initdata; | ||
231 | struct change_member *change_tmp; | ||
232 | unsigned long current_type, last_type; | ||
233 | unsigned long long last_addr; | ||
234 | int chgidx, still_changing; | ||
235 | int overlap_entries; | ||
236 | int new_bios_entry; | ||
237 | int old_nr, new_nr, chg_nr; | ||
238 | int i; | ||
239 | |||
240 | /* if there's only one memory region, don't bother */ | ||
241 | if (*pnr_map < 2) | ||
242 | return -1; | ||
243 | |||
244 | old_nr = *pnr_map; | ||
245 | BUG_ON(old_nr > max_nr_map); | ||
246 | |||
247 | /* bail out if we find any unreasonable addresses in bios map */ | ||
248 | for (i = 0; i < old_nr; i++) | ||
249 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | ||
250 | return -1; | ||
251 | |||
252 | /* create pointers for initial change-point information (for sorting) */ | ||
253 | for (i = 0; i < 2 * old_nr; i++) | ||
254 | change_point[i] = &change_point_list[i]; | ||
255 | |||
256 | /* record all known change-points (starting and ending addresses), | ||
257 | omitting those that are for empty memory regions */ | ||
258 | chgidx = 0; | ||
259 | for (i = 0; i < old_nr; i++) { | ||
260 | if (biosmap[i].size != 0) { | ||
261 | change_point[chgidx]->addr = biosmap[i].addr; | ||
262 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
263 | change_point[chgidx]->addr = biosmap[i].addr + | ||
264 | biosmap[i].size; | ||
265 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
266 | } | ||
267 | } | ||
268 | chg_nr = chgidx; | ||
269 | |||
270 | /* sort change-point list by memory addresses (low -> high) */ | ||
271 | still_changing = 1; | ||
272 | while (still_changing) { | ||
273 | still_changing = 0; | ||
274 | for (i = 1; i < chg_nr; i++) { | ||
275 | unsigned long long curaddr, lastaddr; | ||
276 | unsigned long long curpbaddr, lastpbaddr; | ||
277 | |||
278 | curaddr = change_point[i]->addr; | ||
279 | lastaddr = change_point[i - 1]->addr; | ||
280 | curpbaddr = change_point[i]->pbios->addr; | ||
281 | lastpbaddr = change_point[i - 1]->pbios->addr; | ||
282 | |||
283 | /* | ||
284 | * swap entries, when: | ||
285 | * | ||
286 | * curaddr > lastaddr or | ||
287 | * curaddr == lastaddr and curaddr == curpbaddr and | ||
288 | * lastaddr != lastpbaddr | ||
289 | */ | ||
290 | if (curaddr < lastaddr || | ||
291 | (curaddr == lastaddr && curaddr == curpbaddr && | ||
292 | lastaddr != lastpbaddr)) { | ||
293 | change_tmp = change_point[i]; | ||
294 | change_point[i] = change_point[i-1]; | ||
295 | change_point[i-1] = change_tmp; | ||
296 | still_changing = 1; | ||
297 | } | ||
298 | } | ||
299 | } | ||
300 | |||
301 | /* create a new bios memory map, removing overlaps */ | ||
302 | overlap_entries = 0; /* number of entries in the overlap table */ | ||
303 | new_bios_entry = 0; /* index for creating new bios map entries */ | ||
304 | last_type = 0; /* start with undefined memory type */ | ||
305 | last_addr = 0; /* start with 0 as last starting address */ | ||
306 | |||
307 | /* loop through change-points, determining the effect on the new bios map */ | ||
308 | for (chgidx = 0; chgidx < chg_nr; chgidx++) { | ||
309 | /* keep track of all overlapping bios entries */ | ||
310 | if (change_point[chgidx]->addr == | ||
311 | change_point[chgidx]->pbios->addr) { | ||
312 | /* | ||
313 | * add map entry to overlap list (> 1 entry | ||
314 | * implies an overlap) | ||
315 | */ | ||
316 | overlap_list[overlap_entries++] = | ||
317 | change_point[chgidx]->pbios; | ||
318 | } else { | ||
319 | /* | ||
320 | * remove entry from list (order independent, | ||
321 | * so swap with last) | ||
322 | */ | ||
323 | for (i = 0; i < overlap_entries; i++) { | ||
324 | if (overlap_list[i] == | ||
325 | change_point[chgidx]->pbios) | ||
326 | overlap_list[i] = | ||
327 | overlap_list[overlap_entries-1]; | ||
328 | } | ||
329 | overlap_entries--; | ||
330 | } | ||
331 | /* | ||
332 | * if there are overlapping entries, decide which | ||
333 | * "type" to use (larger value takes precedence -- | ||
334 | * 1=usable, 2,3,4,4+=unusable) | ||
335 | */ | ||
336 | current_type = 0; | ||
337 | for (i = 0; i < overlap_entries; i++) | ||
338 | if (overlap_list[i]->type > current_type) | ||
339 | current_type = overlap_list[i]->type; | ||
340 | /* | ||
341 | * continue building up new bios map based on this | ||
342 | * information | ||
343 | */ | ||
344 | if (current_type != last_type) { | ||
345 | if (last_type != 0) { | ||
346 | new_bios[new_bios_entry].size = | ||
347 | change_point[chgidx]->addr - last_addr; | ||
348 | /* | ||
349 | * move forward only if the new size | ||
350 | * was non-zero | ||
351 | */ | ||
352 | if (new_bios[new_bios_entry].size != 0) | ||
353 | /* | ||
354 | * no more space left for new | ||
355 | * bios entries ? | ||
356 | */ | ||
357 | if (++new_bios_entry >= max_nr_map) | ||
358 | break; | ||
359 | } | ||
360 | if (current_type != 0) { | ||
361 | new_bios[new_bios_entry].addr = | ||
362 | change_point[chgidx]->addr; | ||
363 | new_bios[new_bios_entry].type = current_type; | ||
364 | last_addr = change_point[chgidx]->addr; | ||
365 | } | ||
366 | last_type = current_type; | ||
367 | } | ||
368 | } | ||
369 | /* retain count for new bios entries */ | ||
370 | new_nr = new_bios_entry; | ||
371 | |||
372 | /* copy new bios mapping into original location */ | ||
373 | memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); | ||
374 | *pnr_map = new_nr; | ||
375 | |||
376 | return 0; | ||
377 | } | ||
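sanitize_e820_map() is a classic sweep over 2N "change points" (each entry contributes its start and its end address): walking them in address order while tracking which entries currently overlap lets the highest type win in every overlapping region. A compressed standalone sketch of the same idea on a toy map; for brevity it recomputes the covering set per segment rather than maintaining the incremental overlap list:

#include <stdio.h>

struct entry { unsigned long addr, size; int type; };

int main(void)
{
	/* toy map: usable RAM 0-100 overlapped by reserved 50-70 */
	struct entry map[] = { { 0, 100, 1 }, { 50, 20, 2 } };
	unsigned long points[] = { 0, 50, 70, 100 };	/* sorted starts/ends */
	int n = 2, i, j;

	for (i = 0; i + 1 < 4; i++) {
		int cur = 0;
		/* highest type among entries covering [points[i], points[i+1]) */
		for (j = 0; j < n; j++)
			if (map[j].addr <= points[i] &&
			    points[i] < map[j].addr + map[j].size &&
			    map[j].type > cur)
				cur = map[j].type;
		if (cur)
			printf("%lu-%lu type %d\n",
			       points[i], points[i + 1], cur);
	}
	return 0;
}

Run on the toy map this prints three non-overlapping segments, with the reserved type (2) winning in the overlapped middle, exactly the precedence rule the comment block above illustrates.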
378 | |||
379 | static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) | ||
380 | { | ||
381 | while (nr_map) { | ||
382 | u64 start = biosmap->addr; | ||
383 | u64 size = biosmap->size; | ||
384 | u64 end = start + size; | ||
385 | u32 type = biosmap->type; | ||
386 | |||
387 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
388 | if (start > end) | ||
389 | return -1; | ||
390 | |||
391 | e820_add_region(start, size, type); | ||
392 | |||
393 | biosmap++; | ||
394 | nr_map--; | ||
395 | } | ||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * Copy the BIOS e820 map into a safe place. | ||
401 | * | ||
402 | * Sanity-check it while we're at it.. | ||
403 | * | ||
404 | * If we're lucky and live on a modern system, the setup code | ||
405 | * will have given us a memory map that we can use to properly | ||
406 | * set up memory. If we aren't, we'll fake a memory map. | ||
407 | */ | ||
408 | static int __init append_e820_map(struct e820entry *biosmap, int nr_map) | ||
409 | { | ||
410 | /* Only one memory region (or negative)? Ignore it */ | ||
411 | if (nr_map < 2) | ||
412 | return -1; | ||
413 | |||
414 | return __append_e820_map(biosmap, nr_map); | ||
415 | } | ||
416 | |||
417 | static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, | ||
418 | u64 size, unsigned old_type, | ||
419 | unsigned new_type) | ||
420 | { | ||
421 | int i; | ||
422 | u64 real_updated_size = 0; | ||
423 | |||
424 | BUG_ON(old_type == new_type); | ||
425 | |||
426 | if (size > (ULLONG_MAX - start)) | ||
427 | size = ULLONG_MAX - start; | ||
428 | |||
429 | for (i = 0; i < e820x->nr_map; i++) { | ||
430 | struct e820entry *ei = &e820x->map[i]; | ||
431 | u64 final_start, final_end; | ||
432 | if (ei->type != old_type) | ||
433 | continue; | ||
434 | /* totally covered? */ | ||
435 | if (ei->addr >= start && | ||
436 | (ei->addr + ei->size) <= (start + size)) { | ||
437 | ei->type = new_type; | ||
438 | real_updated_size += ei->size; | ||
439 | continue; | ||
440 | } | ||
441 | /* partially covered */ | ||
442 | final_start = max(start, ei->addr); | ||
443 | final_end = min(start + size, ei->addr + ei->size); | ||
444 | if (final_start >= final_end) | ||
445 | continue; | ||
446 | e820_add_region(final_start, final_end - final_start, | ||
447 | new_type); | ||
448 | real_updated_size += final_end - final_start; | ||
449 | |||
450 | ei->size -= final_end - final_start; | ||
451 | if (ei->addr < final_start) | ||
452 | continue; | ||
453 | ei->addr = final_end; | ||
454 | } | ||
455 | return real_updated_size; | ||
456 | } | ||
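e820_update_range_map() handles three cases per entry: fully inside the update window (retype in place), disjoint (skip), or partial overlap (add a new entry of new_type for the intersection, then shrink the old entry, moving its base only when the overlap was at the front). A sketch of the partial-overlap arithmetic with invented addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* entry [0x200, 0x600), update window [0x400, 0x800) */
	uint64_t ei_addr = 0x200, ei_size = 0x400;
	uint64_t start = 0x400, size = 0x400;
	uint64_t ei_end = ei_addr + ei_size, end = start + size;
	uint64_t final_start = start > ei_addr ? start : ei_addr;
	uint64_t final_end = end < ei_end ? end : ei_end;

	if (final_start < final_end) {
		printf("retyped chunk: [0x%llx, 0x%llx)\n",
		       (unsigned long long)final_start,
		       (unsigned long long)final_end);
		ei_size -= final_end - final_start;
		if (ei_addr >= final_start)	/* overlap at the front */
			ei_addr = final_end;
		printf("old entry now: [0x%llx, +0x%llx)\n",
		       (unsigned long long)ei_addr,
		       (unsigned long long)ei_size);
	}
	return 0;
}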
457 | |||
458 | u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, | ||
459 | unsigned new_type) | ||
460 | { | ||
461 | return e820_update_range_map(&e820, start, size, old_type, new_type); | ||
462 | } | ||
463 | |||
464 | static u64 __init e820_update_range_saved(u64 start, u64 size, | ||
465 | unsigned old_type, unsigned new_type) | ||
466 | { | ||
467 | return e820_update_range_map(&e820_saved, start, size, old_type, | ||
468 | new_type); | ||
469 | } | ||
470 | |||
471 | /* make e820 not cover the range */ | ||
472 | u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, | ||
473 | int checktype) | ||
474 | { | ||
475 | int i; | ||
476 | u64 real_removed_size = 0; | ||
477 | |||
478 | if (size > (ULLONG_MAX - start)) | ||
479 | size = ULLONG_MAX - start; | ||
480 | |||
481 | for (i = 0; i < e820.nr_map; i++) { | ||
482 | struct e820entry *ei = &e820.map[i]; | ||
483 | u64 final_start, final_end; | ||
484 | |||
485 | if (checktype && ei->type != old_type) | ||
486 | continue; | ||
487 | /* totally covered? */ | ||
488 | if (ei->addr >= start && | ||
489 | (ei->addr + ei->size) <= (start + size)) { | ||
490 | real_removed_size += ei->size; | ||
491 | memset(ei, 0, sizeof(struct e820entry)); | ||
492 | continue; | ||
493 | } | ||
494 | /* partially covered */ | ||
495 | final_start = max(start, ei->addr); | ||
496 | final_end = min(start + size, ei->addr + ei->size); | ||
497 | if (final_start >= final_end) | ||
498 | continue; | ||
499 | real_removed_size += final_end - final_start; | ||
500 | |||
501 | ei->size -= final_end - final_start; | ||
502 | if (ei->addr < final_start) | ||
503 | continue; | ||
504 | ei->addr = final_end; | ||
505 | } | ||
506 | return real_removed_size; | ||
507 | } | ||
508 | |||
509 | void __init update_e820(void) | ||
510 | { | ||
511 | int nr_map; | ||
512 | |||
513 | nr_map = e820.nr_map; | ||
514 | if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) | ||
515 | return; | ||
516 | e820.nr_map = nr_map; | ||
517 | printk(KERN_INFO "modified physical RAM map:\n"); | ||
518 | e820_print_map("modified"); | ||
519 | } | ||
520 | static void __init update_e820_saved(void) | ||
521 | { | ||
522 | int nr_map; | ||
523 | |||
524 | nr_map = e820_saved.nr_map; | ||
525 | if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) | ||
526 | return; | ||
527 | e820_saved.nr_map = nr_map; | ||
528 | } | ||
529 | #define MAX_GAP_END 0x100000000ull | ||
530 | /* | ||
531 | * Search for a gap in the e820 memory space from start_addr to end_addr. | ||
532 | */ | ||
533 | __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, | ||
534 | unsigned long start_addr, unsigned long long end_addr) | ||
535 | { | ||
536 | unsigned long long last; | ||
537 | int i = e820.nr_map; | ||
538 | int found = 0; | ||
539 | |||
540 | last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; | ||
541 | |||
542 | while (--i >= 0) { | ||
543 | unsigned long long start = e820.map[i].addr; | ||
544 | unsigned long long end = start + e820.map[i].size; | ||
545 | |||
546 | if (end < start_addr) | ||
547 | continue; | ||
548 | |||
549 | /* | ||
550 | * Since "last" is at most 4GB, we know we'll | ||
551 | * fit in 32 bits if this condition is true | ||
552 | */ | ||
553 | if (last > end) { | ||
554 | unsigned long gap = last - end; | ||
555 | |||
556 | if (gap >= *gapsize) { | ||
557 | *gapsize = gap; | ||
558 | *gapstart = end; | ||
559 | found = 1; | ||
560 | } | ||
561 | } | ||
562 | if (start < last) | ||
563 | last = start; | ||
564 | } | ||
565 | return found; | ||
566 | } | ||
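A worked pass with made-up entries (the map is assumed sorted ascending, as it is after sanitize_e820_map()): given RAM at [1 MB, 2 GB) and [3 GB, 3.5 GB), the backward walk starts with last = 4 GB; the [3 GB, 3.5 GB) entry records the gap [3.5 GB, 4 GB) and lowers last to 3 GB, then the [1 MB, 2 GB) entry records the larger gap [2 GB, 3 GB), which is what the caller gets back.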
567 | |||
568 | /* | ||
569 | * Search for the biggest gap in the low 32 bits of the e820 | ||
570 | * memory space. We pass this space to PCI so that it can assign | ||
571 | * MMIO resources in it for hotplug or unconfigured devices. | ||
572 | * Hopefully the BIOS left enough space. | ||
573 | */ | ||
574 | __init void e820_setup_gap(void) | ||
575 | { | ||
576 | unsigned long gapstart, gapsize, round; | ||
577 | int found; | ||
578 | |||
579 | gapstart = 0x10000000; | ||
580 | gapsize = 0x400000; | ||
581 | found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); | ||
582 | |||
583 | #ifdef CONFIG_X86_64 | ||
584 | if (!found) { | ||
585 | gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; | ||
586 | printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " | ||
587 | "address range\n" | ||
588 | KERN_ERR "PCI: Unassigned devices with 32bit resource " | ||
589 | "registers may break!\n"); | ||
590 | } | ||
591 | #endif | ||
592 | |||
593 | /* | ||
594 | * See how much we want to round up: start off with | ||
595 | * rounding to the next 1MB area. | ||
596 | */ | ||
597 | round = 0x100000; | ||
598 | while ((gapsize >> 4) > round) | ||
599 | round += round; | ||
600 | /* Fun with two's complement */ | ||
601 | pci_mem_start = (gapstart + round) & -round; | ||
602 | |||
603 | printk(KERN_INFO | ||
604 | "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | ||
605 | pci_mem_start, gapstart, gapsize); | ||
606 | } | ||
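The final rounding leans on two's complement: for a power-of-two round, -round equals ~(round - 1), so (gapstart + round) & -round rounds gapstart up to the next round boundary. A minimal standalone sketch of that logic (plain userspace C, gap values made up for illustration):

	#include <stdio.h>

	int main(void)
	{
		unsigned long gapstart = 0xdff00000UL;	/* hypothetical gap start   */
		unsigned long gapsize  = 0x20000000UL;	/* hypothetical 512 MB gap  */
		unsigned long round    = 0x100000UL;	/* start with 1 MB rounding */

		/* grow the rounding until it reaches gapsize/16, as above */
		while ((gapsize >> 4) > round)
			round += round;

		/* -round == ~(round - 1), so this rounds up to a round boundary */
		printf("pci_mem_start = %#lx\n", (gapstart + round) & -round);
		return 0;				/* prints 0xe0000000 */
	}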
607 | |||
608 | /** | ||
609 | * Because of the size limitation of struct boot_params, only the first | ||
610 | * 128 E820 memory entries are passed to the kernel via | ||
611 | * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node | ||
612 | * of the linked list of struct setup_data, which is parsed here. | ||
613 | */ | ||
614 | void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) | ||
615 | { | ||
616 | u32 map_len; | ||
617 | int entries; | ||
618 | struct e820entry *extmap; | ||
619 | |||
620 | entries = sdata->len / sizeof(struct e820entry); | ||
621 | map_len = sdata->len + sizeof(struct setup_data); | ||
622 | if (map_len > PAGE_SIZE) | ||
623 | sdata = early_ioremap(pa_data, map_len); | ||
624 | extmap = (struct e820entry *)(sdata->data); | ||
625 | __append_e820_map(extmap, entries); | ||
626 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | ||
627 | if (map_len > PAGE_SIZE) | ||
628 | early_iounmap(sdata, map_len); | ||
629 | printk(KERN_INFO "extended physical RAM map:\n"); | ||
630 | e820_print_map("extended"); | ||
631 | } | ||
632 | |||
633 | #if defined(CONFIG_X86_64) || \ | ||
634 | (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) | ||
635 | /** | ||
636 | * Find the ranges of physical addresses that do not correspond to | ||
637 | * e820 RAM areas and mark the corresponding pages as nosave for | ||
638 | * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). | ||
639 | * | ||
640 | * This function requires the e820 map to be sorted and without any | ||
641 | * overlapping entries and assumes the first e820 area to be RAM. | ||
642 | */ | ||
643 | void __init e820_mark_nosave_regions(unsigned long limit_pfn) | ||
644 | { | ||
645 | int i; | ||
646 | unsigned long pfn; | ||
647 | |||
648 | pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); | ||
649 | for (i = 1; i < e820.nr_map; i++) { | ||
650 | struct e820entry *ei = &e820.map[i]; | ||
651 | |||
652 | if (pfn < PFN_UP(ei->addr)) | ||
653 | register_nosave_region(pfn, PFN_UP(ei->addr)); | ||
654 | |||
655 | pfn = PFN_DOWN(ei->addr + ei->size); | ||
656 | if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) | ||
657 | register_nosave_region(PFN_UP(ei->addr), pfn); | ||
658 | |||
659 | if (pfn >= limit_pfn) | ||
660 | break; | ||
661 | } | ||
662 | } | ||
663 | #endif | ||
664 | |||
665 | /* | ||
666 | * Early reserved memory areas. | ||
667 | */ | ||
668 | #define MAX_EARLY_RES 20 | ||
669 | |||
670 | struct early_res { | ||
671 | u64 start, end; | ||
672 | char name[16]; | ||
673 | char overlap_ok; | ||
674 | }; | ||
675 | static struct early_res early_res[MAX_EARLY_RES] __initdata = { | ||
676 | { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ | ||
677 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) | ||
678 | { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, | ||
679 | #endif | ||
680 | #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) | ||
681 | /* | ||
682 | * But first pinch a few for the stack/trampoline stuff | ||
683 | * FIXME: Don't need the extra page at 4K, but need to fix | ||
684 | * trampoline before removing it. (see the GDT stuff) | ||
685 | */ | ||
686 | { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, | ||
687 | /* | ||
688 | * Has to be in very low memory so we can execute | ||
689 | * real-mode AP code. | ||
690 | */ | ||
691 | { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, | ||
692 | #endif | ||
693 | {} | ||
694 | }; | ||
695 | |||
696 | static int __init find_overlapped_early(u64 start, u64 end) | ||
697 | { | ||
698 | int i; | ||
699 | struct early_res *r; | ||
700 | |||
701 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
702 | r = &early_res[i]; | ||
703 | if (end > r->start && start < r->end) | ||
704 | break; | ||
705 | } | ||
706 | |||
707 | return i; | ||
708 | } | ||
709 | |||
710 | /* | ||
711 | * Drop the i-th range from the early reservation map, | ||
712 | * by copying any higher ranges down one over it, and | ||
713 | * clearing what had been the last slot. | ||
714 | */ | ||
715 | static void __init drop_range(int i) | ||
716 | { | ||
717 | int j; | ||
718 | |||
719 | for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) | ||
720 | ; | ||
721 | |||
722 | memmove(&early_res[i], &early_res[i + 1], | ||
723 | (j - 1 - i) * sizeof(struct early_res)); | ||
724 | |||
725 | early_res[j - 1].end = 0; | ||
726 | } | ||
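drop_range() compacts the array in place: find one past the last used slot, slide everything above slot i down by one entry, then clear what had been the tail. A self-contained model of the same compaction (userspace C, hypothetical ranges; end == 0 marks an unused slot):

	#include <stdio.h>
	#include <string.h>

	struct res { unsigned long start, end; };

	static void drop(struct res *r, int max, int i)
	{
		int j;

		for (j = i + 1; j < max && r[j].end; j++)
			;				/* j: one past last used slot */
		memmove(&r[i], &r[i + 1], (j - 1 - i) * sizeof(*r));
		r[j - 1].end = 0;			/* clear the old last slot */
	}

	int main(void)
	{
		struct res r[4] = { { 0, 1 }, { 2, 3 }, { 4, 5 }, { 0, 0 } };
		int i;

		drop(r, 4, 1);				/* remove [2,3) */
		for (i = 0; i < 4; i++)
			printf("[%lu,%lu) ", r[i].start, r[i].end);
		printf("\n");	/* [0,1) [4,5) [4,0) [0,0) -- slot 2 now unused */
		return 0;
	}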
727 | |||
728 | /* | ||
729 | * Split any existing ranges that: | ||
730 | * 1) are marked 'overlap_ok', and | ||
731 | * 2) overlap with the stated range [start, end) | ||
732 | * into whatever portion (if any) of the existing range is entirely | ||
733 | * below or entirely above the stated range. Drop the portion | ||
734 | * of the existing range that overlaps with the stated range, | ||
735 | * which will allow the caller of this routine to then add that | ||
736 | * stated range without conflicting with any existing range. | ||
737 | */ | ||
738 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
739 | { | ||
740 | int i; | ||
741 | struct early_res *r; | ||
742 | u64 lower_start, lower_end; | ||
743 | u64 upper_start, upper_end; | ||
744 | char name[16]; | ||
745 | |||
746 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
747 | r = &early_res[i]; | ||
748 | |||
749 | /* Continue past non-overlapping ranges */ | ||
750 | if (end <= r->start || start >= r->end) | ||
751 | continue; | ||
752 | |||
753 | /* | ||
754 | * Leave non-ok overlaps as is; let caller | ||
755 | * panic "Overlapping early reservations" | ||
756 | * when it hits this overlap. | ||
757 | */ | ||
758 | if (!r->overlap_ok) | ||
759 | return; | ||
760 | |||
761 | /* | ||
762 | * We have an ok overlap. We will drop it from the early | ||
763 | * reservation map, and add back in any non-overlapping | ||
764 | * portions (lower or upper) as separate, overlap_ok, | ||
765 | * non-overlapping ranges. | ||
766 | */ | ||
767 | |||
768 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
769 | strncpy(name, r->name, sizeof(name) - 1); | ||
770 | |||
771 | lower_start = lower_end = 0; | ||
772 | upper_start = upper_end = 0; | ||
773 | if (r->start < start) { | ||
774 | lower_start = r->start; | ||
775 | lower_end = start; | ||
776 | } | ||
777 | if (r->end > end) { | ||
778 | upper_start = end; | ||
779 | upper_end = r->end; | ||
780 | } | ||
781 | |||
782 | /* 2. Drop the original ok overlapping range */ | ||
783 | drop_range(i); | ||
784 | |||
785 | i--; /* resume for-loop on copied down entry */ | ||
786 | |||
787 | /* 3. Add back in any non-overlapping ranges. */ | ||
788 | if (lower_end) | ||
789 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
790 | if (upper_end) | ||
791 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
792 | } | ||
793 | } | ||
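A worked example with made-up addresses: if an overlap_ok range [0x1000, 0x9000) collides with a new reservation of [0x3000, 0x5000), the code above computes the lower piece [0x1000, 0x3000) and the upper piece [0x5000, 0x9000), drops the original range, and re-adds the two pieces as overlap_ok, after which the caller's reservation of [0x3000, 0x5000) goes in without conflict.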
794 | |||
795 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
796 | int overlap_ok) | ||
797 | { | ||
798 | int i; | ||
799 | struct early_res *r; | ||
800 | |||
801 | i = find_overlapped_early(start, end); | ||
802 | if (i >= MAX_EARLY_RES) | ||
803 | panic("Too many early reservations"); | ||
804 | r = &early_res[i]; | ||
805 | if (r->end) | ||
806 | panic("Overlapping early reservations " | ||
807 | "%llx-%llx %s to %llx-%llx %s\n", | ||
808 | start, end - 1, name?name:"", r->start, | ||
809 | r->end - 1, r->name); | ||
810 | r->start = start; | ||
811 | r->end = end; | ||
812 | r->overlap_ok = overlap_ok; | ||
813 | if (name) | ||
814 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
815 | } | ||
816 | |||
817 | /* | ||
818 | * A few early reservations come here. | ||
819 | * | ||
820 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
821 | * is ok for these reservations to overlap an earlier reservation. | ||
822 | * Rather it means that it is ok for subsequent reservations to | ||
823 | * overlap this one. | ||
824 | * | ||
825 | * Use this entry point to reserve early ranges when you are doing | ||
826 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
827 | * just in case, and don't mind a subsequent overlapping reservation | ||
828 | * that is known to be needed. | ||
829 | * | ||
830 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
831 | * It would be needed if we had two colliding 'overlap_ok' | ||
832 | * reservations, so that the second such would not panic on the | ||
833 | * overlap with the first. We don't have any such as of this | ||
834 | * writing, but might as well tolerate such if it happens in | ||
835 | * the future. | ||
836 | */ | ||
837 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
838 | { | ||
839 | drop_overlaps_that_are_ok(start, end); | ||
840 | __reserve_early(start, end, name, 1); | ||
841 | } | ||
842 | |||
843 | /* | ||
844 | * Most early reservations come here. | ||
845 | * | ||
846 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
847 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
848 | * range without risk of panic'ing on an overlapping overlap_ok | ||
849 | * early reservation. | ||
850 | */ | ||
851 | void __init reserve_early(u64 start, u64 end, char *name) | ||
852 | { | ||
853 | drop_overlaps_that_are_ok(start, end); | ||
854 | __reserve_early(start, end, name, 0); | ||
855 | } | ||
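A hedged usage sketch of the two entry points (addresses are illustrative, not taken from real callers):

	/*
	 * reserve_early_overlap_ok(0x8000, 0x10000, "EBDA (paranoid)");
	 * reserve_early(0x9000, 0xa000, "MP-table");
	 *	-- fine: the overlap_ok range is split around [0x9000, 0xa000)
	 *	   before the new reservation goes in.
	 *
	 * reserve_early(0x9000, 0xa000, "MP-table");
	 * reserve_early(0x9800, 0xa800, "other");
	 *	-- panics "Overlapping early reservations": the first range
	 *	   was not registered overlap_ok.
	 */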
856 | |||
857 | void __init free_early(u64 start, u64 end) | ||
858 | { | ||
859 | struct early_res *r; | ||
860 | int i; | ||
861 | |||
862 | i = find_overlapped_early(start, end); | ||
863 | r = &early_res[i]; | ||
864 | if (i >= MAX_EARLY_RES || r->end != end || r->start != start) | ||
865 | panic("free_early on not reserved area: %llx-%llx!", | ||
866 | start, end - 1); | ||
867 | |||
868 | drop_range(i); | ||
869 | } | ||
870 | |||
871 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
872 | { | ||
873 | int i, count; | ||
874 | u64 final_start, final_end; | ||
875 | |||
876 | count = 0; | ||
877 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) | ||
878 | count++; | ||
879 | |||
880 | printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
881 | count, start, end); | ||
882 | for (i = 0; i < count; i++) { | ||
883 | struct early_res *r = &early_res[i]; | ||
884 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
885 | r->start, r->end, r->name); | ||
886 | final_start = max(start, r->start); | ||
887 | final_end = min(end, r->end); | ||
888 | if (final_start >= final_end) { | ||
889 | printk(KERN_CONT "\n"); | ||
890 | continue; | ||
891 | } | ||
892 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
893 | final_start, final_end); | ||
894 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
895 | BOOTMEM_DEFAULT); | ||
896 | } | ||
897 | } | ||
898 | |||
899 | /* Check for already reserved areas */ | ||
900 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
901 | { | ||
902 | int i; | ||
903 | u64 addr = *addrp; | ||
904 | int changed = 0; | ||
905 | struct early_res *r; | ||
906 | again: | ||
907 | i = find_overlapped_early(addr, addr + size); | ||
908 | r = &early_res[i]; | ||
909 | if (i < MAX_EARLY_RES && r->end) { | ||
910 | *addrp = addr = round_up(r->end, align); | ||
911 | changed = 1; | ||
912 | goto again; | ||
913 | } | ||
914 | return changed; | ||
915 | } | ||
916 | |||
917 | /* Check for already reserved areas */ | ||
918 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
919 | { | ||
920 | int i; | ||
921 | u64 addr = *addrp, last; | ||
922 | u64 size = *sizep; | ||
923 | int changed = 0; | ||
924 | again: | ||
925 | last = addr + size; | ||
926 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
927 | struct early_res *r = &early_res[i]; | ||
928 | if (last > r->start && addr < r->start) { | ||
929 | size = r->start - addr; | ||
930 | changed = 1; | ||
931 | goto again; | ||
932 | } | ||
933 | if (last > r->end && addr < r->end) { | ||
934 | addr = round_up(r->end, align); | ||
935 | size = last - addr; | ||
936 | changed = 1; | ||
937 | goto again; | ||
938 | } | ||
939 | if (last <= r->end && addr >= r->start) { | ||
940 | (*sizep)++; | ||
941 | return 0; | ||
942 | } | ||
943 | } | ||
944 | if (changed) { | ||
945 | *addrp = addr; | ||
946 | *sizep = size; | ||
947 | } | ||
948 | return changed; | ||
949 | } | ||
950 | |||
951 | /* | ||
952 | * Find a free area with specified alignment in a specific range. | ||
953 | */ | ||
954 | u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) | ||
955 | { | ||
956 | int i; | ||
957 | |||
958 | for (i = 0; i < e820.nr_map; i++) { | ||
959 | struct e820entry *ei = &e820.map[i]; | ||
960 | u64 addr, last; | ||
961 | u64 ei_last; | ||
962 | |||
963 | if (ei->type != E820_RAM) | ||
964 | continue; | ||
965 | addr = round_up(ei->addr, align); | ||
966 | ei_last = ei->addr + ei->size; | ||
967 | if (addr < start) | ||
968 | addr = round_up(start, align); | ||
969 | if (addr >= ei_last) | ||
970 | continue; | ||
971 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
972 | ; | ||
973 | last = addr + size; | ||
974 | if (last > ei_last) | ||
975 | continue; | ||
976 | if (last > end) | ||
977 | continue; | ||
978 | return addr; | ||
979 | } | ||
980 | return -1ULL; | ||
981 | } | ||
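A sketch of the typical call pattern, with hypothetical table name and sizes. find_e820_area() only steers around existing early reservations (via bad_addr()); the caller must reserve the result itself so later searches see it:

	/*
	 * u64 addr = find_e820_area(0x100000, 0x40000000, 0x4000, PAGE_SIZE);
	 * if (addr == -1ULL)
	 *	panic("cannot allocate early table");
	 * reserve_early(addr, addr + 0x4000, "MY-TABLE");
	 */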
982 | |||
983 | /* | ||
984 | * Find the next free range after start; its size is returned in *sizep | ||
985 | */ | ||
986 | u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) | ||
987 | { | ||
988 | int i; | ||
989 | |||
990 | for (i = 0; i < e820.nr_map; i++) { | ||
991 | struct e820entry *ei = &e820.map[i]; | ||
992 | u64 addr, last; | ||
993 | u64 ei_last; | ||
994 | |||
995 | if (ei->type != E820_RAM) | ||
996 | continue; | ||
997 | addr = round_up(ei->addr, align); | ||
998 | ei_last = ei->addr + ei->size; | ||
999 | if (addr < start) | ||
1000 | addr = round_up(start, align); | ||
1001 | if (addr >= ei_last) | ||
1002 | continue; | ||
1003 | *sizep = ei_last - addr; | ||
1004 | while (bad_addr_size(&addr, sizep, align) && | ||
1005 | addr + *sizep <= ei_last) | ||
1006 | ; | ||
1007 | last = addr + *sizep; | ||
1008 | if (last > ei_last) | ||
1009 | continue; | ||
1010 | return addr; | ||
1011 | } | ||
1012 | return -1ULL; | ||
1013 | |||
1014 | } | ||
1015 | |||
1016 | /* | ||
1017 | * Pre-allocate 4K and reserve it in the e820 map | ||
1018 | */ | ||
1019 | u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) | ||
1020 | { | ||
1021 | u64 size = 0; | ||
1022 | u64 addr; | ||
1023 | u64 start; | ||
1024 | |||
1025 | start = startt; | ||
1026 | while (size < sizet) | ||
1027 | start = find_e820_area_size(start, &size, align); | ||
1028 | |||
1029 | if (size < sizet) | ||
1030 | return 0; | ||
1031 | |||
1032 | addr = round_down(start + size - sizet, align); | ||
1033 | e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); | ||
1034 | e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); | ||
1035 | printk(KERN_INFO "update e820 for early_reserve_e820\n"); | ||
1036 | update_e820(); | ||
1037 | update_e820_saved(); | ||
1038 | |||
1039 | return addr; | ||
1040 | } | ||
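A hypothetical caller, e.g. code that needs a page of scratch RAM at or above 1 MB (the error message is illustrative):

	/*
	 * u64 mem = early_reserve_e820(0x100000, PAGE_SIZE, PAGE_SIZE);
	 * if (!mem)
	 *	printk(KERN_ERR "cannot allocate scratch page\n");
	 *
	 * On success the page is already E820_RESERVED in both e820 and
	 * e820_saved, so nothing else will hand it out.
	 */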
1041 | |||
1042 | #ifdef CONFIG_X86_32 | ||
1043 | # ifdef CONFIG_X86_PAE | ||
1044 | # define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) | ||
1045 | # else | ||
1046 | # define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) | ||
1047 | # endif | ||
1048 | #else /* CONFIG_X86_32 */ | ||
1049 | # define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT | ||
1050 | #endif | ||
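Assuming 4 KB pages (PAGE_SHIFT = 12), these limits work out to 1ULL<<20 page frames (4 GB of physical address space) without PAE, 1ULL<<24 page frames (64 GB) with PAE, and on 64-bit whatever MAXMEM allows for the kernel's direct mapping.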
1051 | |||
1052 | /* | ||
1053 | * Find the highest page frame number we have available | ||
1054 | */ | ||
1055 | static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) | ||
1056 | { | ||
1057 | int i; | ||
1058 | unsigned long last_pfn = 0; | ||
1059 | unsigned long max_arch_pfn = MAX_ARCH_PFN; | ||
1060 | |||
1061 | for (i = 0; i < e820.nr_map; i++) { | ||
1062 | struct e820entry *ei = &e820.map[i]; | ||
1063 | unsigned long start_pfn; | ||
1064 | unsigned long end_pfn; | ||
1065 | |||
1066 | if (ei->type != type) | ||
1067 | continue; | ||
1068 | |||
1069 | start_pfn = ei->addr >> PAGE_SHIFT; | ||
1070 | end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; | ||
1071 | |||
1072 | if (start_pfn >= limit_pfn) | ||
1073 | continue; | ||
1074 | if (end_pfn > limit_pfn) { | ||
1075 | last_pfn = limit_pfn; | ||
1076 | break; | ||
1077 | } | ||
1078 | if (end_pfn > last_pfn) | ||
1079 | last_pfn = end_pfn; | ||
1080 | } | ||
1081 | |||
1082 | if (last_pfn > max_arch_pfn) | ||
1083 | last_pfn = max_arch_pfn; | ||
1084 | |||
1085 | printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", | ||
1086 | last_pfn, max_arch_pfn); | ||
1087 | return last_pfn; | ||
1088 | } | ||
1089 | unsigned long __init e820_end_of_ram_pfn(void) | ||
1090 | { | ||
1091 | return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); | ||
1092 | } | ||
1093 | |||
1094 | unsigned long __init e820_end_of_low_ram_pfn(void) | ||
1095 | { | ||
1096 | return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); | ||
1097 | } | ||
1098 | /* | ||
1099 | * Find the part of the e820 entry *ei that lies in the range | ||
1100 | * [start_pfn, last_pfn) and return it in ei_startpfn and ei_endpfn. | ||
1101 | */ | ||
1102 | int __init e820_find_active_region(const struct e820entry *ei, | ||
1103 | unsigned long start_pfn, | ||
1104 | unsigned long last_pfn, | ||
1105 | unsigned long *ei_startpfn, | ||
1106 | unsigned long *ei_endpfn) | ||
1107 | { | ||
1108 | u64 align = PAGE_SIZE; | ||
1109 | |||
1110 | *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; | ||
1111 | *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; | ||
1112 | |||
1113 | /* Skip map entries smaller than a page */ | ||
1114 | if (*ei_startpfn >= *ei_endpfn) | ||
1115 | return 0; | ||
1116 | |||
1117 | /* Skip if map is outside the node */ | ||
1118 | if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || | ||
1119 | *ei_startpfn >= last_pfn) | ||
1120 | return 0; | ||
1121 | |||
1122 | /* Check for overlaps */ | ||
1123 | if (*ei_startpfn < start_pfn) | ||
1124 | *ei_startpfn = start_pfn; | ||
1125 | if (*ei_endpfn > last_pfn) | ||
1126 | *ei_endpfn = last_pfn; | ||
1127 | |||
1128 | return 1; | ||
1129 | } | ||
1130 | |||
1131 | /* Walk the e820 map and register active regions within a node */ | ||
1132 | void __init e820_register_active_regions(int nid, unsigned long start_pfn, | ||
1133 | unsigned long last_pfn) | ||
1134 | { | ||
1135 | unsigned long ei_startpfn; | ||
1136 | unsigned long ei_endpfn; | ||
1137 | int i; | ||
1138 | |||
1139 | for (i = 0; i < e820.nr_map; i++) | ||
1140 | if (e820_find_active_region(&e820.map[i], | ||
1141 | start_pfn, last_pfn, | ||
1142 | &ei_startpfn, &ei_endpfn)) | ||
1143 | add_active_range(nid, ei_startpfn, ei_endpfn); | ||
1144 | } | ||
1145 | |||
1146 | /* | ||
1147 | * Find the hole size (in bytes) in the memory range. | ||
1148 | * @start: starting address of the memory range to scan | ||
1149 | * @end: ending address of the memory range to scan | ||
1150 | */ | ||
1151 | u64 __init e820_hole_size(u64 start, u64 end) | ||
1152 | { | ||
1153 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
1154 | unsigned long last_pfn = end >> PAGE_SHIFT; | ||
1155 | unsigned long ei_startpfn, ei_endpfn, ram = 0; | ||
1156 | int i; | ||
1157 | |||
1158 | for (i = 0; i < e820.nr_map; i++) { | ||
1159 | if (e820_find_active_region(&e820.map[i], | ||
1160 | start_pfn, last_pfn, | ||
1161 | &ei_startpfn, &ei_endpfn)) | ||
1162 | ram += ei_endpfn - ei_startpfn; | ||
1163 | } | ||
1164 | return end - start - ((u64)ram << PAGE_SHIFT); | ||
1165 | } | ||
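For example (figures made up): scanning [0, 4 GB) over a map whose page-aligned RAM entries cover 3.5 GB of that span yields 4 GB - 3.5 GB = 512 MB (0x20000000 bytes) of hole.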
1166 | |||
1167 | static void early_panic(char *msg) | ||
1168 | { | ||
1169 | early_printk(msg); | ||
1170 | panic(msg); | ||
1171 | } | ||
1172 | |||
1173 | static int userdef __initdata; | ||
1174 | |||
1175 | /* "mem=nopentium" disables the 4MB page tables. */ | ||
1176 | static int __init parse_memopt(char *p) | ||
1177 | { | ||
1178 | u64 mem_size; | ||
1179 | |||
1180 | if (!p) | ||
1181 | return -EINVAL; | ||
1182 | |||
1183 | #ifdef CONFIG_X86_32 | ||
1184 | if (!strcmp(p, "nopentium")) { | ||
1185 | setup_clear_cpu_cap(X86_FEATURE_PSE); | ||
1186 | return 0; | ||
1187 | } | ||
1188 | #endif | ||
1189 | |||
1190 | userdef = 1; | ||
1191 | mem_size = memparse(p, &p); | ||
1192 | e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); | ||
1193 | |||
1194 | return 0; | ||
1195 | } | ||
1196 | early_param("mem", parse_memopt); | ||
1197 | |||
1198 | static int __init parse_memmap_opt(char *p) | ||
1199 | { | ||
1200 | char *oldp; | ||
1201 | u64 start_at, mem_size; | ||
1202 | |||
1203 | if (!p) | ||
1204 | return -EINVAL; | ||
1205 | |||
1206 | if (!strcmp(p, "exactmap")) { | ||
1207 | #ifdef CONFIG_CRASH_DUMP | ||
1208 | /* | ||
1209 | * If we are doing a crash dump, we still need to know | ||
1210 | * the real mem size before original memory map is | ||
1211 | * reset. | ||
1212 | */ | ||
1213 | saved_max_pfn = e820_end_of_ram_pfn(); | ||
1214 | #endif | ||
1215 | e820.nr_map = 0; | ||
1216 | userdef = 1; | ||
1217 | return 0; | ||
1218 | } | ||
1219 | |||
1220 | oldp = p; | ||
1221 | mem_size = memparse(p, &p); | ||
1222 | if (p == oldp) | ||
1223 | return -EINVAL; | ||
1224 | |||
1225 | userdef = 1; | ||
1226 | if (*p == '@') { | ||
1227 | start_at = memparse(p+1, &p); | ||
1228 | e820_add_region(start_at, mem_size, E820_RAM); | ||
1229 | } else if (*p == '#') { | ||
1230 | start_at = memparse(p+1, &p); | ||
1231 | e820_add_region(start_at, mem_size, E820_ACPI); | ||
1232 | } else if (*p == '$') { | ||
1233 | start_at = memparse(p+1, &p); | ||
1234 | e820_add_region(start_at, mem_size, E820_RESERVED); | ||
1235 | } else | ||
1236 | e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); | ||
1237 | |||
1238 | return *p == '\0' ? 0 : -EINVAL; | ||
1239 | } | ||
1240 | early_param("memmap", parse_memmap_opt); | ||
1241 | |||
1242 | void __init finish_e820_parsing(void) | ||
1243 | { | ||
1244 | if (userdef) { | ||
1245 | int nr = e820.nr_map; | ||
1246 | |||
1247 | if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) | ||
1248 | early_panic("Invalid user supplied memory map"); | ||
1249 | e820.nr_map = nr; | ||
1250 | |||
1251 | printk(KERN_INFO "user-defined physical RAM map:\n"); | ||
1252 | e820_print_map("user"); | ||
1253 | } | ||
1254 | } | ||
1255 | |||
1256 | static inline const char *e820_type_to_string(int e820_type) | ||
1257 | { | ||
1258 | switch (e820_type) { | ||
1259 | case E820_RESERVED_KERN: | ||
1260 | case E820_RAM: return "System RAM"; | ||
1261 | case E820_ACPI: return "ACPI Tables"; | ||
1262 | case E820_NVS: return "ACPI Non-volatile Storage"; | ||
1263 | default: return "reserved"; | ||
1264 | } | ||
1265 | } | ||
1266 | |||
1267 | /* | ||
1268 | * Mark e820 reserved areas as busy for the resource manager. | ||
1269 | */ | ||
1270 | void __init e820_reserve_resources(void) | ||
1271 | { | ||
1272 | int i; | ||
1273 | struct resource *res; | ||
1274 | u64 end; | ||
1275 | |||
1276 | res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); | ||
1277 | for (i = 0; i < e820.nr_map; i++) { | ||
1278 | end = e820.map[i].addr + e820.map[i].size - 1; | ||
1279 | #ifndef CONFIG_RESOURCES_64BIT | ||
1280 | if (end > 0x100000000ULL) { | ||
1281 | res++; | ||
1282 | continue; | ||
1283 | } | ||
1284 | #endif | ||
1285 | res->name = e820_type_to_string(e820.map[i].type); | ||
1286 | res->start = e820.map[i].addr; | ||
1287 | res->end = end; | ||
1288 | |||
1289 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
1290 | insert_resource(&iomem_resource, res); | ||
1291 | res++; | ||
1292 | } | ||
1293 | |||
1294 | for (i = 0; i < e820_saved.nr_map; i++) { | ||
1295 | struct e820entry *entry = &e820_saved.map[i]; | ||
1296 | firmware_map_add_early(entry->addr, | ||
1297 | entry->addr + entry->size - 1, | ||
1298 | e820_type_to_string(entry->type)); | ||
1299 | } | ||
1300 | } | ||
1301 | |||
1302 | char *__init default_machine_specific_memory_setup(void) | ||
1303 | { | ||
1304 | char *who = "BIOS-e820"; | ||
1305 | int new_nr; | ||
1306 | /* | ||
1307 | * Try to copy the BIOS-supplied E820-map. | ||
1308 | * | ||
1309 | * Otherwise fake a memory map; one section from 0k->640k, | ||
1310 | * the next section from 1mb->appropriate_mem_k | ||
1311 | */ | ||
1312 | new_nr = boot_params.e820_entries; | ||
1313 | sanitize_e820_map(boot_params.e820_map, | ||
1314 | ARRAY_SIZE(boot_params.e820_map), | ||
1315 | &new_nr); | ||
1316 | boot_params.e820_entries = new_nr; | ||
1317 | if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) | ||
1318 | < 0) { | ||
1319 | u64 mem_size; | ||
1320 | |||
1321 | /* compare results from other methods and take the greater */ | ||
1322 | if (boot_params.alt_mem_k | ||
1323 | < boot_params.screen_info.ext_mem_k) { | ||
1324 | mem_size = boot_params.screen_info.ext_mem_k; | ||
1325 | who = "BIOS-88"; | ||
1326 | } else { | ||
1327 | mem_size = boot_params.alt_mem_k; | ||
1328 | who = "BIOS-e801"; | ||
1329 | } | ||
1330 | |||
1331 | e820.nr_map = 0; | ||
1332 | e820_add_region(0, LOWMEMSIZE(), E820_RAM); | ||
1333 | e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); | ||
1334 | } | ||
1335 | |||
1336 | /* In case someone cares... */ | ||
1337 | return who; | ||
1338 | } | ||
1339 | |||
1340 | char *__init __attribute__((weak)) machine_specific_memory_setup(void) | ||
1341 | { | ||
1342 | if (x86_quirks->arch_memory_setup) { | ||
1343 | char *who = x86_quirks->arch_memory_setup(); | ||
1344 | |||
1345 | if (who) | ||
1346 | return who; | ||
1347 | } | ||
1348 | return default_machine_specific_memory_setup(); | ||
1349 | } | ||
1350 | |||
1351 | /* Overridden in paravirt.c if CONFIG_PARAVIRT */ | ||
1352 | char * __init __attribute__((weak)) memory_setup(void) | ||
1353 | { | ||
1354 | return machine_specific_memory_setup(); | ||
1355 | } | ||
1356 | |||
1357 | void __init setup_memory_map(void) | ||
1358 | { | ||
1359 | char *who; | ||
1360 | |||
1361 | who = memory_setup(); | ||
1362 | memcpy(&e820_saved, &e820, sizeof(struct e820map)); | ||
1363 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
1364 | e820_print_map(who); | ||
1365 | } | ||
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c deleted file mode 100644 index ed733e7cf4e6..000000000000 --- a/arch/x86/kernel/e820_32.c +++ /dev/null | |||
@@ -1,775 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/bootmem.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <linux/kexec.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/pfn.h> | ||
11 | #include <linux/uaccess.h> | ||
12 | #include <linux/suspend.h> | ||
13 | |||
14 | #include <asm/pgtable.h> | ||
15 | #include <asm/page.h> | ||
16 | #include <asm/e820.h> | ||
17 | #include <asm/setup.h> | ||
18 | |||
19 | struct e820map e820; | ||
20 | struct change_member { | ||
21 | struct e820entry *pbios; /* pointer to original bios entry */ | ||
22 | unsigned long long addr; /* address for this change point */ | ||
23 | }; | ||
24 | static struct change_member change_point_list[2*E820MAX] __initdata; | ||
25 | static struct change_member *change_point[2*E820MAX] __initdata; | ||
26 | static struct e820entry *overlap_list[E820MAX] __initdata; | ||
27 | static struct e820entry new_bios[E820MAX] __initdata; | ||
28 | /* For PCI or other memory-mapped resources */ | ||
29 | unsigned long pci_mem_start = 0x10000000; | ||
30 | #ifdef CONFIG_PCI | ||
31 | EXPORT_SYMBOL(pci_mem_start); | ||
32 | #endif | ||
33 | extern int user_defined_memmap; | ||
34 | |||
35 | static struct resource system_rom_resource = { | ||
36 | .name = "System ROM", | ||
37 | .start = 0xf0000, | ||
38 | .end = 0xfffff, | ||
39 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
40 | }; | ||
41 | |||
42 | static struct resource extension_rom_resource = { | ||
43 | .name = "Extension ROM", | ||
44 | .start = 0xe0000, | ||
45 | .end = 0xeffff, | ||
46 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
47 | }; | ||
48 | |||
49 | static struct resource adapter_rom_resources[] = { { | ||
50 | .name = "Adapter ROM", | ||
51 | .start = 0xc8000, | ||
52 | .end = 0, | ||
53 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
54 | }, { | ||
55 | .name = "Adapter ROM", | ||
56 | .start = 0, | ||
57 | .end = 0, | ||
58 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
59 | }, { | ||
60 | .name = "Adapter ROM", | ||
61 | .start = 0, | ||
62 | .end = 0, | ||
63 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
64 | }, { | ||
65 | .name = "Adapter ROM", | ||
66 | .start = 0, | ||
67 | .end = 0, | ||
68 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
69 | }, { | ||
70 | .name = "Adapter ROM", | ||
71 | .start = 0, | ||
72 | .end = 0, | ||
73 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
74 | }, { | ||
75 | .name = "Adapter ROM", | ||
76 | .start = 0, | ||
77 | .end = 0, | ||
78 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
79 | } }; | ||
80 | |||
81 | static struct resource video_rom_resource = { | ||
82 | .name = "Video ROM", | ||
83 | .start = 0xc0000, | ||
84 | .end = 0xc7fff, | ||
85 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
86 | }; | ||
87 | |||
88 | #define ROMSIGNATURE 0xaa55 | ||
89 | |||
90 | static int __init romsignature(const unsigned char *rom) | ||
91 | { | ||
92 | const unsigned short * const ptr = (const unsigned short *)rom; | ||
93 | unsigned short sig; | ||
94 | |||
95 | return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; | ||
96 | } | ||
97 | |||
98 | static int __init romchecksum(const unsigned char *rom, unsigned long length) | ||
99 | { | ||
100 | unsigned char sum, c; | ||
101 | |||
102 | for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) | ||
103 | sum += c; | ||
104 | return !length && !sum; | ||
105 | } | ||
106 | |||
107 | static void __init probe_roms(void) | ||
108 | { | ||
109 | const unsigned char *rom; | ||
110 | unsigned long start, length, upper; | ||
111 | unsigned char c; | ||
112 | int i; | ||
113 | |||
114 | /* video rom */ | ||
115 | upper = adapter_rom_resources[0].start; | ||
116 | for (start = video_rom_resource.start; start < upper; start += 2048) { | ||
117 | rom = isa_bus_to_virt(start); | ||
118 | if (!romsignature(rom)) | ||
119 | continue; | ||
120 | |||
121 | video_rom_resource.start = start; | ||
122 | |||
123 | if (probe_kernel_address(rom + 2, c) != 0) | ||
124 | continue; | ||
125 | |||
126 | /* 0 < length <= 0x7f * 512, historically */ | ||
127 | length = c * 512; | ||
128 | |||
129 | /* if checksum okay, trust length byte */ | ||
130 | if (length && romchecksum(rom, length)) | ||
131 | video_rom_resource.end = start + length - 1; | ||
132 | |||
133 | request_resource(&iomem_resource, &video_rom_resource); | ||
134 | break; | ||
135 | } | ||
136 | |||
137 | start = (video_rom_resource.end + 1 + 2047) & ~2047UL; | ||
138 | if (start < upper) | ||
139 | start = upper; | ||
140 | |||
141 | /* system rom */ | ||
142 | request_resource(&iomem_resource, &system_rom_resource); | ||
143 | upper = system_rom_resource.start; | ||
144 | |||
145 | /* check for extension rom (ignore length byte!) */ | ||
146 | rom = isa_bus_to_virt(extension_rom_resource.start); | ||
147 | if (romsignature(rom)) { | ||
148 | length = extension_rom_resource.end - extension_rom_resource.start + 1; | ||
149 | if (romchecksum(rom, length)) { | ||
150 | request_resource(&iomem_resource, &extension_rom_resource); | ||
151 | upper = extension_rom_resource.start; | ||
152 | } | ||
153 | } | ||
154 | |||
155 | /* check for adapter roms on 2k boundaries */ | ||
156 | for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { | ||
157 | rom = isa_bus_to_virt(start); | ||
158 | if (!romsignature(rom)) | ||
159 | continue; | ||
160 | |||
161 | if (probe_kernel_address(rom + 2, c) != 0) | ||
162 | continue; | ||
163 | |||
164 | /* 0 < length <= 0x7f * 512, historically */ | ||
165 | length = c * 512; | ||
166 | |||
167 | /* but accept any length that fits if checksum okay */ | ||
168 | if (!length || start + length > upper || !romchecksum(rom, length)) | ||
169 | continue; | ||
170 | |||
171 | adapter_rom_resources[i].start = start; | ||
172 | adapter_rom_resources[i].end = start + length - 1; | ||
173 | request_resource(&iomem_resource, &adapter_rom_resources[i]); | ||
174 | |||
175 | start = adapter_rom_resources[i++].end & ~2047UL; | ||
176 | } | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * Request address space for all standard RAM and ROM resources | ||
181 | * and also for regions reported as reserved by the e820. | ||
182 | */ | ||
183 | void __init init_iomem_resources(struct resource *code_resource, | ||
184 | struct resource *data_resource, | ||
185 | struct resource *bss_resource) | ||
186 | { | ||
187 | int i; | ||
188 | |||
189 | probe_roms(); | ||
190 | for (i = 0; i < e820.nr_map; i++) { | ||
191 | struct resource *res; | ||
192 | #ifndef CONFIG_RESOURCES_64BIT | ||
193 | if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) | ||
194 | continue; | ||
195 | #endif | ||
196 | res = kzalloc(sizeof(struct resource), GFP_ATOMIC); | ||
197 | switch (e820.map[i].type) { | ||
198 | case E820_RAM: res->name = "System RAM"; break; | ||
199 | case E820_ACPI: res->name = "ACPI Tables"; break; | ||
200 | case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | ||
201 | default: res->name = "reserved"; | ||
202 | } | ||
203 | res->start = e820.map[i].addr; | ||
204 | res->end = res->start + e820.map[i].size - 1; | ||
205 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
206 | if (request_resource(&iomem_resource, res)) { | ||
207 | kfree(res); | ||
208 | continue; | ||
209 | } | ||
210 | if (e820.map[i].type == E820_RAM) { | ||
211 | /* | ||
212 | * We don't know which RAM region contains kernel data, | ||
213 | * so we try it repeatedly and let the resource manager | ||
214 | * test it. | ||
215 | */ | ||
216 | request_resource(res, code_resource); | ||
217 | request_resource(res, data_resource); | ||
218 | request_resource(res, bss_resource); | ||
219 | #ifdef CONFIG_KEXEC | ||
220 | if (crashk_res.start != crashk_res.end) | ||
221 | request_resource(res, &crashk_res); | ||
222 | #endif | ||
223 | } | ||
224 | } | ||
225 | } | ||
226 | |||
227 | #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) | ||
228 | /** | ||
229 | * e820_mark_nosave_regions - Find the ranges of physical addresses that do not | ||
230 | * correspond to e820 RAM areas and mark the corresponding pages as nosave for | ||
231 | * hibernation. | ||
232 | * | ||
233 | * This function requires the e820 map to be sorted and without any | ||
234 | * overlapping entries and assumes the first e820 area to be RAM. | ||
235 | */ | ||
236 | void __init e820_mark_nosave_regions(void) | ||
237 | { | ||
238 | int i; | ||
239 | unsigned long pfn; | ||
240 | |||
241 | pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); | ||
242 | for (i = 1; i < e820.nr_map; i++) { | ||
243 | struct e820entry *ei = &e820.map[i]; | ||
244 | |||
245 | if (pfn < PFN_UP(ei->addr)) | ||
246 | register_nosave_region(pfn, PFN_UP(ei->addr)); | ||
247 | |||
248 | pfn = PFN_DOWN(ei->addr + ei->size); | ||
249 | if (ei->type != E820_RAM) | ||
250 | register_nosave_region(PFN_UP(ei->addr), pfn); | ||
251 | |||
252 | if (pfn >= max_low_pfn) | ||
253 | break; | ||
254 | } | ||
255 | } | ||
256 | #endif | ||
257 | |||
258 | void __init add_memory_region(unsigned long long start, | ||
259 | unsigned long long size, int type) | ||
260 | { | ||
261 | int x; | ||
262 | |||
263 | x = e820.nr_map; | ||
264 | |||
265 | if (x == E820MAX) { | ||
266 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
267 | return; | ||
268 | } | ||
269 | |||
270 | e820.map[x].addr = start; | ||
271 | e820.map[x].size = size; | ||
272 | e820.map[x].type = type; | ||
273 | e820.nr_map++; | ||
274 | } /* add_memory_region */ | ||
275 | |||
276 | /* | ||
277 | * Sanitize the BIOS e820 map. | ||
278 | * | ||
279 | * Some e820 responses include overlapping entries. The following | ||
280 | * replaces the original e820 map with a new one, removing overlaps. | ||
281 | * | ||
282 | */ | ||
283 | int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | ||
284 | { | ||
285 | struct change_member *change_tmp; | ||
286 | unsigned long current_type, last_type; | ||
287 | unsigned long long last_addr; | ||
288 | int chgidx, still_changing; | ||
289 | int overlap_entries; | ||
290 | int new_bios_entry; | ||
291 | int old_nr, new_nr, chg_nr; | ||
292 | int i; | ||
293 | |||
294 | /* | ||
295 | Visually we're performing the following (1,2,3,4 = memory types)... | ||
296 | |||
297 | Sample memory map (w/overlaps): | ||
298 | ____22__________________ | ||
299 | ______________________4_ | ||
300 | ____1111________________ | ||
301 | _44_____________________ | ||
302 | 11111111________________ | ||
303 | ____________________33__ | ||
304 | ___________44___________ | ||
305 | __________33333_________ | ||
306 | ______________22________ | ||
307 | ___________________2222_ | ||
308 | _________111111111______ | ||
309 | _____________________11_ | ||
310 | _________________4______ | ||
311 | |||
312 | Sanitized equivalent (no overlap): | ||
313 | 1_______________________ | ||
314 | _44_____________________ | ||
315 | ___1____________________ | ||
316 | ____22__________________ | ||
317 | ______11________________ | ||
318 | _________1______________ | ||
319 | __________3_____________ | ||
320 | ___________44___________ | ||
321 | _____________33_________ | ||
322 | _______________2________ | ||
323 | ________________1_______ | ||
324 | _________________4______ | ||
325 | ___________________2____ | ||
326 | ____________________33__ | ||
327 | ______________________4_ | ||
328 | */ | ||
329 | /* if there's only one memory region, don't bother */ | ||
330 | if (*pnr_map < 2) { | ||
331 | return -1; | ||
332 | } | ||
333 | |||
334 | old_nr = *pnr_map; | ||
335 | |||
336 | /* bail out if we find any unreasonable addresses in bios map */ | ||
337 | for (i=0; i<old_nr; i++) | ||
338 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { | ||
339 | return -1; | ||
340 | } | ||
341 | |||
342 | /* create pointers for initial change-point information (for sorting) */ | ||
343 | for (i=0; i < 2*old_nr; i++) | ||
344 | change_point[i] = &change_point_list[i]; | ||
345 | |||
346 | /* record all known change-points (starting and ending addresses), | ||
347 | omitting those that are for empty memory regions */ | ||
348 | chgidx = 0; | ||
349 | for (i=0; i < old_nr; i++) { | ||
350 | if (biosmap[i].size != 0) { | ||
351 | change_point[chgidx]->addr = biosmap[i].addr; | ||
352 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
353 | change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | ||
354 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
355 | } | ||
356 | } | ||
357 | chg_nr = chgidx; /* true number of change-points */ | ||
358 | |||
359 | /* sort change-point list by memory addresses (low -> high) */ | ||
360 | still_changing = 1; | ||
361 | while (still_changing) { | ||
362 | still_changing = 0; | ||
363 | for (i=1; i < chg_nr; i++) { | ||
364 | /* if <current_addr> > <last_addr>, swap */ | ||
365 | /* or, if current=<start_addr> & last=<end_addr>, swap */ | ||
366 | if ((change_point[i]->addr < change_point[i-1]->addr) || | ||
367 | ((change_point[i]->addr == change_point[i-1]->addr) && | ||
368 | (change_point[i]->addr == change_point[i]->pbios->addr) && | ||
369 | (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | ||
370 | ) | ||
371 | { | ||
372 | change_tmp = change_point[i]; | ||
373 | change_point[i] = change_point[i-1]; | ||
374 | change_point[i-1] = change_tmp; | ||
375 | still_changing=1; | ||
376 | } | ||
377 | } | ||
378 | } | ||
379 | |||
380 | /* create a new bios memory map, removing overlaps */ | ||
381 | overlap_entries=0; /* number of entries in the overlap table */ | ||
382 | new_bios_entry=0; /* index for creating new bios map entries */ | ||
383 | last_type = 0; /* start with undefined memory type */ | ||
384 | last_addr = 0; /* start with 0 as last starting address */ | ||
385 | /* loop through change-points, determining effect on the new bios map */ | ||
386 | for (chgidx=0; chgidx < chg_nr; chgidx++) | ||
387 | { | ||
388 | /* keep track of all overlapping bios entries */ | ||
389 | if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | ||
390 | { | ||
391 | /* add map entry to overlap list (> 1 entry implies an overlap) */ | ||
392 | overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | ||
393 | } | ||
394 | else | ||
395 | { | ||
396 | /* remove entry from list (order independent, so swap with last) */ | ||
397 | for (i=0; i<overlap_entries; i++) | ||
398 | { | ||
399 | if (overlap_list[i] == change_point[chgidx]->pbios) | ||
400 | overlap_list[i] = overlap_list[overlap_entries-1]; | ||
401 | } | ||
402 | overlap_entries--; | ||
403 | } | ||
404 | /* if there are overlapping entries, decide which "type" to use */ | ||
405 | /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | ||
406 | current_type = 0; | ||
407 | for (i=0; i<overlap_entries; i++) | ||
408 | if (overlap_list[i]->type > current_type) | ||
409 | current_type = overlap_list[i]->type; | ||
410 | /* continue building up new bios map based on this information */ | ||
411 | if (current_type != last_type) { | ||
412 | if (last_type != 0) { | ||
413 | new_bios[new_bios_entry].size = | ||
414 | change_point[chgidx]->addr - last_addr; | ||
415 | /* move forward only if the new size was non-zero */ | ||
416 | if (new_bios[new_bios_entry].size != 0) | ||
417 | if (++new_bios_entry >= E820MAX) | ||
418 | break; /* no more space left for new bios entries */ | ||
419 | } | ||
420 | if (current_type != 0) { | ||
421 | new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | ||
422 | new_bios[new_bios_entry].type = current_type; | ||
423 | last_addr=change_point[chgidx]->addr; | ||
424 | } | ||
425 | last_type = current_type; | ||
426 | } | ||
427 | } | ||
428 | new_nr = new_bios_entry; /* retain count for new bios entries */ | ||
429 | |||
430 | /* copy new bios mapping into original location */ | ||
431 | memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | ||
432 | *pnr_map = new_nr; | ||
433 | |||
434 | return 0; | ||
435 | } | ||
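A worked example of the change-point pass, with a made-up two-entry map:

	input:    [0x0000, 0x3000) type 1 (RAM)
	          [0x2000, 0x4000) type 2 (reserved)

	change points, sorted:  0x0000(start,1) 0x2000(start,2)
	                        0x3000(end,1)   0x4000(end,2)

	walking them, the highest type among the currently-overlapping
	entries wins, so the new map becomes:

	          [0x0000, 0x2000) type 1
	          [0x2000, 0x4000) type 2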
436 | |||
437 | /* | ||
438 | * Copy the BIOS e820 map into a safe place. | ||
439 | * | ||
440 | * Sanity-check it while we're at it.. | ||
441 | * | ||
442 | * If we're lucky and live on a modern system, the setup code | ||
443 | * will have given us a memory map that we can use to properly | ||
444 | * set up memory. If we aren't, we'll fake a memory map. | ||
445 | * | ||
446 | * We check to see that the memory map contains at least 2 elements | ||
447 | * before we'll use it, because the detection code in setup.S may | ||
448 | * not be perfect and most every PC known to man has two memory | ||
449 | * regions: one from 0 to 640k, and one from 1mb up. (The IBM | ||
450 | * thinkpad 560x, for example, does not cooperate with the memory | ||
451 | * detection code.) | ||
452 | */ | ||
453 | int __init copy_e820_map(struct e820entry *biosmap, int nr_map) | ||
454 | { | ||
455 | /* Only one memory region (or negative)? Ignore it */ | ||
456 | if (nr_map < 2) | ||
457 | return -1; | ||
458 | |||
459 | do { | ||
460 | u64 start = biosmap->addr; | ||
461 | u64 size = biosmap->size; | ||
462 | u64 end = start + size; | ||
463 | u32 type = biosmap->type; | ||
464 | |||
465 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
466 | if (start > end) | ||
467 | return -1; | ||
468 | |||
469 | add_memory_region(start, size, type); | ||
470 | } while (biosmap++, --nr_map); | ||
471 | |||
472 | return 0; | ||
473 | } | ||
474 | |||
475 | /* | ||
476 | * Find the highest page frame number we have available | ||
477 | */ | ||
478 | void __init propagate_e820_map(void) | ||
479 | { | ||
480 | int i; | ||
481 | |||
482 | max_pfn = 0; | ||
483 | |||
484 | for (i = 0; i < e820.nr_map; i++) { | ||
485 | unsigned long start, end; | ||
486 | /* RAM? */ | ||
487 | if (e820.map[i].type != E820_RAM) | ||
488 | continue; | ||
489 | start = PFN_UP(e820.map[i].addr); | ||
490 | end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); | ||
491 | if (start >= end) | ||
492 | continue; | ||
493 | if (end > max_pfn) | ||
494 | max_pfn = end; | ||
495 | memory_present(0, start, end); | ||
496 | } | ||
497 | } | ||
498 | |||
499 | /* | ||
500 | * Register fully available low RAM pages with the bootmem allocator. | ||
501 | */ | ||
502 | void __init register_bootmem_low_pages(unsigned long max_low_pfn) | ||
503 | { | ||
504 | int i; | ||
505 | |||
506 | for (i = 0; i < e820.nr_map; i++) { | ||
507 | unsigned long curr_pfn, last_pfn, size; | ||
508 | /* | ||
509 | * Reserve usable low memory | ||
510 | */ | ||
511 | if (e820.map[i].type != E820_RAM) | ||
512 | continue; | ||
513 | /* | ||
514 | * We are rounding up the start address of usable memory: | ||
515 | */ | ||
516 | curr_pfn = PFN_UP(e820.map[i].addr); | ||
517 | if (curr_pfn >= max_low_pfn) | ||
518 | continue; | ||
519 | /* | ||
520 | * ... and at the end of the usable range downwards: | ||
521 | */ | ||
522 | last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); | ||
523 | |||
524 | if (last_pfn > max_low_pfn) | ||
525 | last_pfn = max_low_pfn; | ||
526 | |||
527 | /* | ||
528 | * .. finally, did all the rounding and playing | ||
529 | * around just make the area go away? | ||
530 | */ | ||
531 | if (last_pfn <= curr_pfn) | ||
532 | continue; | ||
533 | |||
534 | size = last_pfn - curr_pfn; | ||
535 | free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); | ||
536 | } | ||
537 | } | ||
538 | |||
539 | void __init e820_register_memory(void) | ||
540 | { | ||
541 | unsigned long gapstart, gapsize, round; | ||
542 | unsigned long long last; | ||
543 | int i; | ||
544 | |||
545 | /* | ||
546 | * Search for the biggest gap in the low 32 bits of the e820 | ||
547 | * memory space. | ||
548 | */ | ||
549 | last = 0x100000000ull; | ||
550 | gapstart = 0x10000000; | ||
551 | gapsize = 0x400000; | ||
552 | i = e820.nr_map; | ||
553 | while (--i >= 0) { | ||
554 | unsigned long long start = e820.map[i].addr; | ||
555 | unsigned long long end = start + e820.map[i].size; | ||
556 | |||
557 | /* | ||
558 | * Since "last" is at most 4GB, we know we'll | ||
559 | * fit in 32 bits if this condition is true | ||
560 | */ | ||
561 | if (last > end) { | ||
562 | unsigned long gap = last - end; | ||
563 | |||
564 | if (gap > gapsize) { | ||
565 | gapsize = gap; | ||
566 | gapstart = end; | ||
567 | } | ||
568 | } | ||
569 | if (start < last) | ||
570 | last = start; | ||
571 | } | ||
572 | |||
573 | /* | ||
574 | * See how much we want to round up: start off with | ||
575 | * rounding to the next 1MB area. | ||
576 | */ | ||
577 | round = 0x100000; | ||
578 | while ((gapsize >> 4) > round) | ||
579 | round += round; | ||
580 | /* Fun with two's complement */ | ||
581 | pci_mem_start = (gapstart + round) & -round; | ||
582 | |||
583 | printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", | ||
584 | pci_mem_start, gapstart, gapsize); | ||
585 | } | ||
586 | |||
587 | void __init print_memory_map(char *who) | ||
588 | { | ||
589 | int i; | ||
590 | |||
591 | for (i = 0; i < e820.nr_map; i++) { | ||
592 | printk(" %s: %016Lx - %016Lx ", who, | ||
593 | e820.map[i].addr, | ||
594 | e820.map[i].addr + e820.map[i].size); | ||
595 | switch (e820.map[i].type) { | ||
596 | case E820_RAM: printk("(usable)\n"); | ||
597 | break; | ||
598 | case E820_RESERVED: | ||
599 | printk("(reserved)\n"); | ||
600 | break; | ||
601 | case E820_ACPI: | ||
602 | printk("(ACPI data)\n"); | ||
603 | break; | ||
604 | case E820_NVS: | ||
605 | printk("(ACPI NVS)\n"); | ||
606 | break; | ||
607 | default: printk("type %u\n", e820.map[i].type); | ||
608 | break; | ||
609 | } | ||
610 | } | ||
611 | } | ||
612 | |||
613 | void __init limit_regions(unsigned long long size) | ||
614 | { | ||
615 | unsigned long long current_addr; | ||
616 | int i; | ||
617 | |||
618 | print_memory_map("limit_regions start"); | ||
619 | for (i = 0; i < e820.nr_map; i++) { | ||
620 | current_addr = e820.map[i].addr + e820.map[i].size; | ||
621 | if (current_addr < size) | ||
622 | continue; | ||
623 | |||
624 | if (e820.map[i].type != E820_RAM) | ||
625 | continue; | ||
626 | |||
627 | if (e820.map[i].addr >= size) { | ||
628 | /* | ||
629 | * This region starts past the end of the | ||
630 | * requested size, skip it completely. | ||
631 | */ | ||
632 | e820.nr_map = i; | ||
633 | } else { | ||
634 | e820.nr_map = i + 1; | ||
635 | e820.map[i].size -= current_addr - size; | ||
636 | } | ||
637 | print_memory_map("limit_regions endfor"); | ||
638 | return; | ||
639 | } | ||
640 | print_memory_map("limit_regions endfunc"); | ||
641 | } | ||
642 | |||
643 | /* | ||
644 | * This function checks if any part of the range <start,end> is mapped | ||
645 | * with type. | ||
646 | */ | ||
647 | int | ||
648 | e820_any_mapped(u64 start, u64 end, unsigned type) | ||
649 | { | ||
650 | int i; | ||
651 | for (i = 0; i < e820.nr_map; i++) { | ||
652 | const struct e820entry *ei = &e820.map[i]; | ||
653 | if (type && ei->type != type) | ||
654 | continue; | ||
655 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
656 | continue; | ||
657 | return 1; | ||
658 | } | ||
659 | return 0; | ||
660 | } | ||
661 | EXPORT_SYMBOL_GPL(e820_any_mapped); | ||
662 | |||
663 | /* | ||
664 | * This function checks if the entire range <start,end> is mapped with type. | ||
665 | * | ||
666 | * Note: this function only works correctly if the e820 table is sorted | ||
667 | * and non-overlapping, which is the case. | ||
668 | */ | ||
669 | int __init | ||
670 | e820_all_mapped(unsigned long s, unsigned long e, unsigned type) | ||
671 | { | ||
672 | u64 start = s; | ||
673 | u64 end = e; | ||
674 | int i; | ||
675 | for (i = 0; i < e820.nr_map; i++) { | ||
676 | struct e820entry *ei = &e820.map[i]; | ||
677 | if (type && ei->type != type) | ||
678 | continue; | ||
679 | /* does the region overlap (at least partly) the current range? */ | ||
680 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
681 | continue; | ||
682 | /* if the region covers the beginning of <start,end>, move start | ||
683 | * to the end of the region: everything up to there is covered | ||
684 | */ | ||
685 | if (ei->addr <= start) | ||
686 | start = ei->addr + ei->size; | ||
687 | /* if start is now at or beyond end, we're done, full | ||
688 | * coverage */ | ||
689 | if (start >= end) | ||
690 | return 1; /* we're done */ | ||
691 | } | ||
692 | return 0; | ||
693 | } | ||
694 | |||
695 | static int __init parse_memmap(char *arg) | ||
696 | { | ||
697 | if (!arg) | ||
698 | return -EINVAL; | ||
699 | |||
700 | if (strcmp(arg, "exactmap") == 0) { | ||
701 | #ifdef CONFIG_CRASH_DUMP | ||
702 | /* If we are doing a crash dump, we | ||
703 | * still need to know the real mem | ||
704 | * size before original memory map is | ||
705 | * reset. | ||
706 | */ | ||
707 | propagate_e820_map(); | ||
708 | saved_max_pfn = max_pfn; | ||
709 | #endif | ||
710 | e820.nr_map = 0; | ||
711 | user_defined_memmap = 1; | ||
712 | } else { | ||
713 | /* If the user specifies memory size, we | ||
714 | * limit the BIOS-provided memory map to | ||
715 | * that size. exactmap can be used to specify | ||
716 | * the exact map. mem=number can be used to | ||
717 | * trim the existing memory map. | ||
718 | */ | ||
719 | unsigned long long start_at, mem_size; | ||
720 | |||
721 | mem_size = memparse(arg, &arg); | ||
722 | if (*arg == '@') { | ||
723 | start_at = memparse(arg+1, &arg); | ||
724 | add_memory_region(start_at, mem_size, E820_RAM); | ||
725 | } else if (*arg == '#') { | ||
726 | start_at = memparse(arg+1, &arg); | ||
727 | add_memory_region(start_at, mem_size, E820_ACPI); | ||
728 | } else if (*arg == '$') { | ||
729 | start_at = memparse(arg+1, &arg); | ||
730 | add_memory_region(start_at, mem_size, E820_RESERVED); | ||
731 | } else { | ||
732 | limit_regions(mem_size); | ||
733 | user_defined_memmap = 1; | ||
734 | } | ||
735 | } | ||
736 | return 0; | ||
737 | } | ||
738 | early_param("memmap", parse_memmap); | ||
739 | void __init update_memory_range(u64 start, u64 size, unsigned old_type, | ||
740 | unsigned new_type) | ||
741 | { | ||
742 | int i; | ||
743 | |||
744 | BUG_ON(old_type == new_type); | ||
745 | |||
746 | for (i = 0; i < e820.nr_map; i++) { | ||
747 | struct e820entry *ei = &e820.map[i]; | ||
748 | u64 final_start, final_end; | ||
749 | if (ei->type != old_type) | ||
750 | continue; | ||
751 | /* totally covered? */ | ||
752 | if (ei->addr >= start && ei->size <= size) { | ||
753 | ei->type = new_type; | ||
754 | continue; | ||
755 | } | ||
756 | /* partially covered */ | ||
757 | final_start = max(start, ei->addr); | ||
758 | final_end = min(start + size, ei->addr + ei->size); | ||
759 | if (final_start >= final_end) | ||
760 | continue; | ||
761 | add_memory_region(final_start, final_end - final_start, | ||
762 | new_type); | ||
763 | } | ||
764 | } | ||
765 | void __init update_e820(void) | ||
766 | { | ||
767 | u8 nr_map; | ||
768 | |||
769 | nr_map = e820.nr_map; | ||
770 | if (sanitize_e820_map(e820.map, &nr_map)) | ||
771 | return; | ||
772 | e820.nr_map = nr_map; | ||
773 | printk(KERN_INFO "modified physical RAM map:\n"); | ||
774 | print_memory_map("modified"); | ||
775 | } | ||
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c deleted file mode 100644 index 124480c0008d..000000000000 --- a/arch/x86/kernel/e820_64.c +++ /dev/null | |||
@@ -1,952 +0,0 @@ | |||
1 | /* | ||
2 | * Handle the memory map. | ||
3 | * The functions here do the job until bootmem takes over. | ||
4 | * | ||
5 | * Getting sanitize_e820_map() in sync with i386 version by applying change: | ||
6 | * - Provisions for empty E820 memory regions (reported by certain BIOSes). | ||
7 | * Alex Achenbach <xela@slit.de>, December 2002. | ||
8 | * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/bootmem.h> | ||
15 | #include <linux/ioport.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/kexec.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/suspend.h> | ||
21 | #include <linux/pfn.h> | ||
22 | |||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/page.h> | ||
25 | #include <asm/e820.h> | ||
26 | #include <asm/proto.h> | ||
27 | #include <asm/setup.h> | ||
28 | #include <asm/sections.h> | ||
29 | #include <asm/kdebug.h> | ||
30 | #include <asm/trampoline.h> | ||
31 | |||
32 | struct e820map e820; | ||
33 | |||
34 | /* | ||
35 | * PFN of last memory page. | ||
36 | */ | ||
37 | unsigned long end_pfn; | ||
38 | |||
39 | /* | ||
40 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. | ||
41 | * The direct mapping extends to max_pfn_mapped, so that we can directly access | ||
42 | * apertures, ACPI and other tables without having to play with fixmaps. | ||
43 | */ | ||
44 | unsigned long max_pfn_mapped; | ||
45 | |||
46 | /* | ||
47 | * Last pfn which the user wants to use. | ||
48 | */ | ||
49 | static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; | ||
50 | |||
51 | /* | ||
52 | * Early reserved memory areas. | ||
53 | */ | ||
54 | #define MAX_EARLY_RES 20 | ||
55 | |||
56 | struct early_res { | ||
57 | unsigned long start, end; | ||
58 | char name[16]; | ||
59 | }; | ||
60 | static struct early_res early_res[MAX_EARLY_RES] __initdata = { | ||
61 | { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ | ||
62 | #ifdef CONFIG_X86_TRAMPOLINE | ||
63 | { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, | ||
64 | #endif | ||
65 | {} | ||
66 | }; | ||
67 | |||
68 | void __init reserve_early(unsigned long start, unsigned long end, char *name) | ||
69 | { | ||
70 | int i; | ||
71 | struct early_res *r; | ||
72 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
73 | r = &early_res[i]; | ||
74 | if (end > r->start && start < r->end) | ||
75 | panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n", | ||
76 | start, end - 1, name?name:"", r->start, r->end - 1, r->name); | ||
77 | } | ||
78 | if (i >= MAX_EARLY_RES) | ||
79 | panic("Too many early reservations"); | ||
80 | r = &early_res[i]; | ||
81 | r->start = start; | ||
82 | r->end = end; | ||
83 | if (name) | ||
84 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
85 | } | ||
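The overlap test above, end > r->start && start < r->end, is the standard intersection check for half-open [start, end) ranges: two ranges intersect exactly when each begins before the other ends. A self-contained illustration with a hypothetical helper and demo values:

#include <assert.h>

/* Two half-open ranges [s1, e1) and [s2, e2) overlap iff each starts
 * before the other ends -- the same test reserve_early() uses above. */
static int ranges_overlap(unsigned long s1, unsigned long e1,
                          unsigned long s2, unsigned long e2)
{
        return e1 > s2 && s1 < e2;
}

int main(void)
{
        assert( ranges_overlap(0x1000, 0x2000, 0x1800, 0x3000)); /* partial */
        assert( ranges_overlap(0x1000, 0x2000, 0x1200, 0x1400)); /* nested  */
        assert(!ranges_overlap(0x1000, 0x2000, 0x2000, 0x3000)); /* abutting */
        return 0;
}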
86 | |||
87 | void __init free_early(unsigned long start, unsigned long end) | ||
88 | { | ||
89 | struct early_res *r; | ||
90 | int i, j; | ||
91 | |||
92 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
93 | r = &early_res[i]; | ||
94 | if (start == r->start && end == r->end) | ||
95 | break; | ||
96 | } | ||
97 | if (i >= MAX_EARLY_RES || !early_res[i].end) | ||
98 | panic("free_early on non-reserved area: %lx-%lx!", start, end); | ||
99 | |||
100 | for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) | ||
101 | ; | ||
102 | |||
103 | memmove(&early_res[i], &early_res[i + 1], | ||
104 | (j - 1 - i) * sizeof(struct early_res)); | ||
105 | |||
106 | early_res[j - 1].end = 0; | ||
107 | } | ||
108 | |||
109 | void __init early_res_to_bootmem(unsigned long start, unsigned long end) | ||
110 | { | ||
111 | int i; | ||
112 | unsigned long final_start, final_end; | ||
113 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
114 | struct early_res *r = &early_res[i]; | ||
115 | final_start = max(start, r->start); | ||
116 | final_end = min(end, r->end); | ||
117 | if (final_start >= final_end) | ||
118 | continue; | ||
119 | printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, | ||
120 | final_start, final_end - 1, r->name); | ||
121 | reserve_bootmem_generic(final_start, final_end - final_start); | ||
122 | } | ||
123 | } | ||
124 | |||
125 | /* Check for already reserved areas */ | ||
126 | static inline int __init | ||
127 | bad_addr(unsigned long *addrp, unsigned long size, unsigned long align) | ||
128 | { | ||
129 | int i; | ||
130 | unsigned long addr = *addrp, last; | ||
131 | int changed = 0; | ||
132 | again: | ||
133 | last = addr + size; | ||
134 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
135 | struct early_res *r = &early_res[i]; | ||
136 | if (last >= r->start && addr < r->end) { | ||
137 | *addrp = addr = round_up(r->end, align); | ||
138 | changed = 1; | ||
139 | goto again; | ||
140 | } | ||
141 | } | ||
142 | return changed; | ||
143 | } | ||
144 | |||
145 | /* Check for already reserved areas */ | ||
146 | static inline int __init | ||
147 | bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align) | ||
148 | { | ||
149 | int i; | ||
150 | unsigned long addr = *addrp, last; | ||
151 | unsigned long size = *sizep; | ||
152 | int changed = 0; | ||
153 | again: | ||
154 | last = addr + size; | ||
155 | for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { | ||
156 | struct early_res *r = &early_res[i]; | ||
157 | if (last > r->start && addr < r->start) { | ||
158 | size = r->start - addr; | ||
159 | changed = 1; | ||
160 | goto again; | ||
161 | } | ||
162 | if (last > r->end && addr < r->end) { | ||
163 | addr = round_up(r->end, align); | ||
164 | size = last - addr; | ||
165 | changed = 1; | ||
166 | goto again; | ||
167 | } | ||
168 | if (last <= r->end && addr >= r->start) { | ||
169 | (*sizep)++; | ||
170 | return 0; | ||
171 | } | ||
172 | } | ||
173 | if (changed) { | ||
174 | *addrp = addr; | ||
175 | *sizep = size; | ||
176 | } | ||
177 | return changed; | ||
178 | } | ||
179 | /* | ||
180 | * This function checks if any part of the range <start,end> is mapped | ||
181 | * with type. | ||
182 | */ | ||
183 | int | ||
184 | e820_any_mapped(unsigned long start, unsigned long end, unsigned type) | ||
185 | { | ||
186 | int i; | ||
187 | |||
188 | for (i = 0; i < e820.nr_map; i++) { | ||
189 | struct e820entry *ei = &e820.map[i]; | ||
190 | |||
191 | if (type && ei->type != type) | ||
192 | continue; | ||
193 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
194 | continue; | ||
195 | return 1; | ||
196 | } | ||
197 | return 0; | ||
198 | } | ||
199 | EXPORT_SYMBOL_GPL(e820_any_mapped); | ||
200 | |||
201 | /* | ||
202 | * This function checks if the entire range <start,end> is mapped with type. | ||
203 | * | ||
204 | * Note: this function only works correctly if the e820 table is sorted | ||
205 | * and non-overlapping, which is the case. | ||
206 | */ | ||
207 | int __init e820_all_mapped(unsigned long start, unsigned long end, | ||
208 | unsigned type) | ||
209 | { | ||
210 | int i; | ||
211 | |||
212 | for (i = 0; i < e820.nr_map; i++) { | ||
213 | struct e820entry *ei = &e820.map[i]; | ||
214 | |||
215 | if (type && ei->type != type) | ||
216 | continue; | ||
217 | /* does the region (partially) overlap the current region? */ | ||
218 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
219 | continue; | ||
220 | |||
221 | /* if the region begins at the start of <start,end>, move start | ||
222 | * past it, since the range is covered up to that point | ||
223 | */ | ||
224 | if (ei->addr <= start) | ||
225 | start = ei->addr + ei->size; | ||
226 | /* | ||
227 | * if start is now at or beyond end, we're done, full | ||
228 | * coverage | ||
229 | */ | ||
230 | if (start >= end) | ||
231 | return 1; | ||
232 | } | ||
233 | return 0; | ||
234 | } | ||
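Since the map is sorted and non-overlapping, the loop above can simply slide start forward through every entry that covers it; coverage is proven the moment start reaches end. A compact userspace model of that sweep, under the same sortedness assumption (entry layout and values are demo-only):

#include <stdio.h>

struct range { unsigned long long addr, size; };

/* Model of the sweep above: valid only for a sorted, non-overlapping map. */
static int all_mapped(const struct range *map, int n,
                      unsigned long long start, unsigned long long end)
{
        int i;

        for (i = 0; i < n; i++) {
                if (map[i].addr >= end || map[i].addr + map[i].size <= start)
                        continue;               /* no overlap with <start,end> */
                if (map[i].addr <= start)       /* covered up to entry's end */
                        start = map[i].addr + map[i].size;
                if (start >= end)
                        return 1;               /* full coverage proven */
        }
        return 0;
}

int main(void)
{
        struct range map[] = { { 0x0, 0x1000 }, { 0x1000, 0x4000 } };

        printf("%d\n", all_mapped(map, 2, 0x800, 0x3000)); /* 1: contiguous  */
        printf("%d\n", all_mapped(map, 2, 0x800, 0x6000)); /* 0: runs past end */
        return 0;
}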
235 | |||
236 | /* | ||
237 | * Find a free area with specified alignment in a specific range. | ||
238 | */ | ||
239 | unsigned long __init find_e820_area(unsigned long start, unsigned long end, | ||
240 | unsigned long size, unsigned long align) | ||
241 | { | ||
242 | int i; | ||
243 | |||
244 | for (i = 0; i < e820.nr_map; i++) { | ||
245 | struct e820entry *ei = &e820.map[i]; | ||
246 | unsigned long addr, last; | ||
247 | unsigned long ei_last; | ||
248 | |||
249 | if (ei->type != E820_RAM) | ||
250 | continue; | ||
251 | addr = round_up(ei->addr, align); | ||
252 | ei_last = ei->addr + ei->size; | ||
253 | if (addr < start) | ||
254 | addr = round_up(start, align); | ||
255 | if (addr >= ei_last) | ||
256 | continue; | ||
257 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
258 | ; | ||
259 | last = addr + size; | ||
260 | if (last > ei_last) | ||
261 | continue; | ||
262 | if (last > end) | ||
263 | continue; | ||
264 | return addr; | ||
265 | } | ||
266 | return -1UL; | ||
267 | } | ||
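find_e820_area() is in effect a first-fit allocator over the RAM entries: align up within each entry, step past early reservations via bad_addr(), and take the first position where size bytes still fit below end. A trimmed userspace model that omits the reservation step (hypothetical names and values):

#include <stdio.h>

struct range { unsigned long long addr, size; };

#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

/* First-fit search: earliest aligned address >= start where `size`
 * bytes fit wholly inside one RAM range and below `end`. */
static unsigned long long find_area(const struct range *ram, int n,
                                    unsigned long long start,
                                    unsigned long long end,
                                    unsigned long long size,
                                    unsigned long long align)
{
        int i;

        for (i = 0; i < n; i++) {
                unsigned long long addr = ROUND_UP(ram[i].addr, align);
                unsigned long long last = ram[i].addr + ram[i].size;

                if (addr < start)
                        addr = ROUND_UP(start, align);
                if (addr >= last || addr + size > last || addr + size > end)
                        continue;       /* entry too small or too high */
                return addr;
        }
        return -1ULL;
}

int main(void)
{
        struct range ram[] = { { 0x1000, 0x1000 }, { 0x100000, 0x100000 } };

        /* 64 KiB-aligned 4 KiB allocation: the first entry is too small. */
        printf("%#llx\n", find_area(ram, 2, 0, -1ULL, 0x1000, 0x10000));
        return 0;
}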
268 | |||
269 | /* | ||
270 | * Find next free range after *start | ||
271 | */ | ||
272 | unsigned long __init find_e820_area_size(unsigned long start, | ||
273 | unsigned long *sizep, | ||
274 | unsigned long align) | ||
275 | { | ||
276 | int i; | ||
277 | |||
278 | for (i = 0; i < e820.nr_map; i++) { | ||
279 | struct e820entry *ei = &e820.map[i]; | ||
280 | unsigned long addr, last; | ||
281 | unsigned long ei_last; | ||
282 | |||
283 | if (ei->type != E820_RAM) | ||
284 | continue; | ||
285 | addr = round_up(ei->addr, align); | ||
286 | ei_last = ei->addr + ei->size; | ||
287 | if (addr < start) | ||
288 | addr = round_up(start, align); | ||
289 | if (addr >= ei_last) | ||
290 | continue; | ||
291 | *sizep = ei_last - addr; | ||
292 | while (bad_addr_size(&addr, sizep, align) && | ||
293 | addr + *sizep <= ei_last) | ||
294 | ; | ||
295 | last = addr + *sizep; | ||
296 | if (last > ei_last) | ||
297 | continue; | ||
298 | return addr; | ||
299 | } | ||
300 | return -1UL; | ||
301 | |||
302 | } | ||
303 | /* | ||
304 | * Find the highest page frame number we have available | ||
305 | */ | ||
306 | unsigned long __init e820_end_of_ram(void) | ||
307 | { | ||
308 | unsigned long end_pfn; | ||
309 | |||
310 | end_pfn = find_max_pfn_with_active_regions(); | ||
311 | |||
312 | if (end_pfn > max_pfn_mapped) | ||
313 | max_pfn_mapped = end_pfn; | ||
314 | if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT) | ||
315 | max_pfn_mapped = MAXMEM>>PAGE_SHIFT; | ||
316 | if (end_pfn > end_user_pfn) | ||
317 | end_pfn = end_user_pfn; | ||
318 | if (end_pfn > max_pfn_mapped) | ||
319 | end_pfn = max_pfn_mapped; | ||
320 | |||
321 | printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped); | ||
322 | return end_pfn; | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Mark e820 reserved areas as busy for the resource manager. | ||
327 | */ | ||
328 | void __init e820_reserve_resources(void) | ||
329 | { | ||
330 | int i; | ||
331 | struct resource *res; | ||
332 | |||
333 | res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); | ||
334 | for (i = 0; i < e820.nr_map; i++) { | ||
335 | switch (e820.map[i].type) { | ||
336 | case E820_RAM: res->name = "System RAM"; break; | ||
337 | case E820_ACPI: res->name = "ACPI Tables"; break; | ||
338 | case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | ||
339 | default: res->name = "reserved"; | ||
340 | } | ||
341 | res->start = e820.map[i].addr; | ||
342 | res->end = res->start + e820.map[i].size - 1; | ||
343 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
344 | insert_resource(&iomem_resource, res); | ||
345 | res++; | ||
346 | } | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * Find the ranges of physical addresses that do not correspond to | ||
351 | * e820 RAM areas and mark the corresponding pages as nosave for software | ||
352 | * suspend and suspend to RAM. | ||
353 | * | ||
354 | * This function requires the e820 map to be sorted and without any | ||
355 | * overlapping entries and assumes the first e820 area to be RAM. | ||
356 | */ | ||
357 | void __init e820_mark_nosave_regions(void) | ||
358 | { | ||
359 | int i; | ||
360 | unsigned long paddr; | ||
361 | |||
362 | paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); | ||
363 | for (i = 1; i < e820.nr_map; i++) { | ||
364 | struct e820entry *ei = &e820.map[i]; | ||
365 | |||
366 | if (paddr < ei->addr) | ||
367 | register_nosave_region(PFN_DOWN(paddr), | ||
368 | PFN_UP(ei->addr)); | ||
369 | |||
370 | paddr = round_down(ei->addr + ei->size, PAGE_SIZE); | ||
371 | if (ei->type != E820_RAM) | ||
372 | register_nosave_region(PFN_UP(ei->addr), | ||
373 | PFN_DOWN(paddr)); | ||
374 | |||
375 | if (paddr >= (end_pfn << PAGE_SHIFT)) | ||
376 | break; | ||
377 | } | ||
378 | } | ||
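The nosave bookkeeping above relies on the page-frame helpers from <linux/pfn.h>: PFN_UP rounds a physical address up to a whole frame, PFN_DOWN truncates. A sketch of the standard definitions, assuming 4 KiB pages:

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Sketch of the <linux/pfn.h> helpers used above. */
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) /* round up */
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)                   /* truncate */
#define PFN_PHYS(x) ((unsigned long long)(x) << PAGE_SHIFT)

/* e.g. PFN_UP(0x1001) == 2, PFN_DOWN(0x1fff) == 1 */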
379 | |||
380 | /* | ||
381 | * Finds an active region in the address range from start_pfn to end_pfn and | ||
382 | * returns its range in ei_startpfn and ei_endpfn for the e820 entry. | ||
383 | */ | ||
384 | static int __init e820_find_active_region(const struct e820entry *ei, | ||
385 | unsigned long start_pfn, | ||
386 | unsigned long end_pfn, | ||
387 | unsigned long *ei_startpfn, | ||
388 | unsigned long *ei_endpfn) | ||
389 | { | ||
390 | *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; | ||
391 | *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; | ||
392 | |||
393 | /* Skip map entries smaller than a page */ | ||
394 | if (*ei_startpfn >= *ei_endpfn) | ||
395 | return 0; | ||
396 | |||
397 | /* Check if max_pfn_mapped should be updated */ | ||
398 | if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped) | ||
399 | max_pfn_mapped = *ei_endpfn; | ||
400 | |||
401 | /* Skip if map is outside the node */ | ||
402 | if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || | ||
403 | *ei_startpfn >= end_pfn) | ||
404 | return 0; | ||
405 | |||
406 | /* Check for overlaps */ | ||
407 | if (*ei_startpfn < start_pfn) | ||
408 | *ei_startpfn = start_pfn; | ||
409 | if (*ei_endpfn > end_pfn) | ||
410 | *ei_endpfn = end_pfn; | ||
411 | |||
412 | /* Obey end_user_pfn to save on memmap */ | ||
413 | if (*ei_startpfn >= end_user_pfn) | ||
414 | return 0; | ||
415 | if (*ei_endpfn > end_user_pfn) | ||
416 | *ei_endpfn = end_user_pfn; | ||
417 | |||
418 | return 1; | ||
419 | } | ||
420 | |||
421 | /* Walk the e820 map and register active regions within a node */ | ||
422 | void __init | ||
423 | e820_register_active_regions(int nid, unsigned long start_pfn, | ||
424 | unsigned long end_pfn) | ||
425 | { | ||
426 | unsigned long ei_startpfn; | ||
427 | unsigned long ei_endpfn; | ||
428 | int i; | ||
429 | |||
430 | for (i = 0; i < e820.nr_map; i++) | ||
431 | if (e820_find_active_region(&e820.map[i], | ||
432 | start_pfn, end_pfn, | ||
433 | &ei_startpfn, &ei_endpfn)) | ||
434 | add_active_range(nid, ei_startpfn, ei_endpfn); | ||
435 | } | ||
436 | |||
437 | /* | ||
438 | * Add a memory region to the kernel e820 map. | ||
439 | */ | ||
440 | void __init add_memory_region(unsigned long start, unsigned long size, int type) | ||
441 | { | ||
442 | int x = e820.nr_map; | ||
443 | |||
444 | if (x == E820MAX) { | ||
445 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
446 | return; | ||
447 | } | ||
448 | |||
449 | e820.map[x].addr = start; | ||
450 | e820.map[x].size = size; | ||
451 | e820.map[x].type = type; | ||
452 | e820.nr_map++; | ||
453 | } | ||
454 | |||
455 | /* | ||
456 | * Find the hole size (in bytes) in the memory range. | ||
457 | * @start: starting address of the memory range to scan | ||
458 | * @end: ending address of the memory range to scan | ||
459 | */ | ||
460 | unsigned long __init e820_hole_size(unsigned long start, unsigned long end) | ||
461 | { | ||
462 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
463 | unsigned long end_pfn = end >> PAGE_SHIFT; | ||
464 | unsigned long ei_startpfn, ei_endpfn, ram = 0; | ||
465 | int i; | ||
466 | |||
467 | for (i = 0; i < e820.nr_map; i++) { | ||
468 | if (e820_find_active_region(&e820.map[i], | ||
469 | start_pfn, end_pfn, | ||
470 | &ei_startpfn, &ei_endpfn)) | ||
471 | ram += ei_endpfn - ei_startpfn; | ||
472 | } | ||
473 | return end - start - (ram << PAGE_SHIFT); | ||
474 | } | ||
475 | |||
476 | static void __init e820_print_map(char *who) | ||
477 | { | ||
478 | int i; | ||
479 | |||
480 | for (i = 0; i < e820.nr_map; i++) { | ||
481 | printk(KERN_INFO " %s: %016Lx - %016Lx ", who, | ||
482 | (unsigned long long) e820.map[i].addr, | ||
483 | (unsigned long long) | ||
484 | (e820.map[i].addr + e820.map[i].size)); | ||
485 | switch (e820.map[i].type) { | ||
486 | case E820_RAM: | ||
487 | printk(KERN_CONT "(usable)\n"); | ||
488 | break; | ||
489 | case E820_RESERVED: | ||
490 | printk(KERN_CONT "(reserved)\n"); | ||
491 | break; | ||
492 | case E820_ACPI: | ||
493 | printk(KERN_CONT "(ACPI data)\n"); | ||
494 | break; | ||
495 | case E820_NVS: | ||
496 | printk(KERN_CONT "(ACPI NVS)\n"); | ||
497 | break; | ||
498 | default: | ||
499 | printk(KERN_CONT "type %u\n", e820.map[i].type); | ||
500 | break; | ||
501 | } | ||
502 | } | ||
503 | } | ||
504 | |||
505 | /* | ||
506 | * Sanitize the BIOS e820 map. | ||
507 | * | ||
508 | * Some e820 responses include overlapping entries. The following | ||
509 | * replaces the original e820 map with a new one, removing overlaps. | ||
510 | * | ||
511 | */ | ||
512 | static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) | ||
513 | { | ||
514 | struct change_member { | ||
515 | struct e820entry *pbios; /* pointer to original bios entry */ | ||
516 | unsigned long long addr; /* address for this change point */ | ||
517 | }; | ||
518 | static struct change_member change_point_list[2*E820MAX] __initdata; | ||
519 | static struct change_member *change_point[2*E820MAX] __initdata; | ||
520 | static struct e820entry *overlap_list[E820MAX] __initdata; | ||
521 | static struct e820entry new_bios[E820MAX] __initdata; | ||
522 | struct change_member *change_tmp; | ||
523 | unsigned long current_type, last_type; | ||
524 | unsigned long long last_addr; | ||
525 | int chgidx, still_changing; | ||
526 | int overlap_entries; | ||
527 | int new_bios_entry; | ||
528 | int old_nr, new_nr, chg_nr; | ||
529 | int i; | ||
530 | |||
531 | /* | ||
532 | Visually we're performing the following | ||
533 | (1,2,3,4 = memory types)... | ||
534 | |||
535 | Sample memory map (w/overlaps): | ||
536 | ____22__________________ | ||
537 | ______________________4_ | ||
538 | ____1111________________ | ||
539 | _44_____________________ | ||
540 | 11111111________________ | ||
541 | ____________________33__ | ||
542 | ___________44___________ | ||
543 | __________33333_________ | ||
544 | ______________22________ | ||
545 | ___________________2222_ | ||
546 | _________111111111______ | ||
547 | _____________________11_ | ||
548 | _________________4______ | ||
549 | |||
550 | Sanitized equivalent (no overlap): | ||
551 | 1_______________________ | ||
552 | _44_____________________ | ||
553 | ___1____________________ | ||
554 | ____22__________________ | ||
555 | ______11________________ | ||
556 | _________1______________ | ||
557 | __________3_____________ | ||
558 | ___________44___________ | ||
559 | _____________33_________ | ||
560 | _______________2________ | ||
561 | ________________1_______ | ||
562 | _________________4______ | ||
563 | ___________________2____ | ||
564 | ____________________33__ | ||
565 | ______________________4_ | ||
566 | */ | ||
567 | |||
568 | /* if there's only one memory region, don't bother */ | ||
569 | if (*pnr_map < 2) | ||
570 | return -1; | ||
571 | |||
572 | old_nr = *pnr_map; | ||
573 | |||
574 | /* bail out if we find any unreasonable addresses in bios map */ | ||
575 | for (i = 0; i < old_nr; i++) | ||
576 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | ||
577 | return -1; | ||
578 | |||
579 | /* create pointers for initial change-point information (for sorting) */ | ||
580 | for (i = 0; i < 2 * old_nr; i++) | ||
581 | change_point[i] = &change_point_list[i]; | ||
582 | |||
583 | /* record all known change-points (starting and ending addresses), | ||
584 | omitting those that are for empty memory regions */ | ||
585 | chgidx = 0; | ||
586 | for (i = 0; i < old_nr; i++) { | ||
587 | if (biosmap[i].size != 0) { | ||
588 | change_point[chgidx]->addr = biosmap[i].addr; | ||
589 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
590 | change_point[chgidx]->addr = biosmap[i].addr + | ||
591 | biosmap[i].size; | ||
592 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
593 | } | ||
594 | } | ||
595 | chg_nr = chgidx; | ||
596 | |||
597 | /* sort change-point list by memory addresses (low -> high) */ | ||
598 | still_changing = 1; | ||
599 | while (still_changing) { | ||
600 | still_changing = 0; | ||
601 | for (i = 1; i < chg_nr; i++) { | ||
602 | unsigned long long curaddr, lastaddr; | ||
603 | unsigned long long curpbaddr, lastpbaddr; | ||
604 | |||
605 | curaddr = change_point[i]->addr; | ||
606 | lastaddr = change_point[i - 1]->addr; | ||
607 | curpbaddr = change_point[i]->pbios->addr; | ||
608 | lastpbaddr = change_point[i - 1]->pbios->addr; | ||
609 | |||
610 | /* | ||
611 | * swap entries, when: | ||
612 | * | ||
613 | * curaddr > lastaddr or | ||
614 | * curaddr == lastaddr and curaddr == curpbaddr and | ||
615 | * lastaddr != lastpbaddr | ||
616 | */ | ||
617 | if (curaddr < lastaddr || | ||
618 | (curaddr == lastaddr && curaddr == curpbaddr && | ||
619 | lastaddr != lastpbaddr)) { | ||
620 | change_tmp = change_point[i]; | ||
621 | change_point[i] = change_point[i-1]; | ||
622 | change_point[i-1] = change_tmp; | ||
623 | still_changing = 1; | ||
624 | } | ||
625 | } | ||
626 | } | ||
627 | |||
628 | /* create a new bios memory map, removing overlaps */ | ||
629 | overlap_entries = 0; /* number of entries in the overlap table */ | ||
630 | new_bios_entry = 0; /* index for creating new bios map entries */ | ||
631 | last_type = 0; /* start with undefined memory type */ | ||
632 | last_addr = 0; /* start with 0 as last starting address */ | ||
633 | |||
634 | /* loop through change-points, determining the effect on the new bios map */ | ||
635 | for (chgidx = 0; chgidx < chg_nr; chgidx++) { | ||
636 | /* keep track of all overlapping bios entries */ | ||
637 | if (change_point[chgidx]->addr == | ||
638 | change_point[chgidx]->pbios->addr) { | ||
639 | /* | ||
640 | * add map entry to overlap list (> 1 entry | ||
641 | * implies an overlap) | ||
642 | */ | ||
643 | overlap_list[overlap_entries++] = | ||
644 | change_point[chgidx]->pbios; | ||
645 | } else { | ||
646 | /* | ||
647 | * remove entry from list (order independent, | ||
648 | * so swap with last) | ||
649 | */ | ||
650 | for (i = 0; i < overlap_entries; i++) { | ||
651 | if (overlap_list[i] == | ||
652 | change_point[chgidx]->pbios) | ||
653 | overlap_list[i] = | ||
654 | overlap_list[overlap_entries-1]; | ||
655 | } | ||
656 | overlap_entries--; | ||
657 | } | ||
658 | /* | ||
659 | * if there are overlapping entries, decide which | ||
660 | * "type" to use (larger value takes precedence -- | ||
661 | * 1=usable, 2,3,4,4+=unusable) | ||
662 | */ | ||
663 | current_type = 0; | ||
664 | for (i = 0; i < overlap_entries; i++) | ||
665 | if (overlap_list[i]->type > current_type) | ||
666 | current_type = overlap_list[i]->type; | ||
667 | /* | ||
668 | * continue building up new bios map based on this | ||
669 | * information | ||
670 | */ | ||
671 | if (current_type != last_type) { | ||
672 | if (last_type != 0) { | ||
673 | new_bios[new_bios_entry].size = | ||
674 | change_point[chgidx]->addr - last_addr; | ||
675 | /* | ||
676 | * move forward only if the new size | ||
677 | * was non-zero | ||
678 | */ | ||
679 | if (new_bios[new_bios_entry].size != 0) | ||
680 | /* | ||
681 | * no more space left for new | ||
682 | * bios entries? | ||
683 | */ | ||
684 | if (++new_bios_entry >= E820MAX) | ||
685 | break; | ||
686 | } | ||
687 | if (current_type != 0) { | ||
688 | new_bios[new_bios_entry].addr = | ||
689 | change_point[chgidx]->addr; | ||
690 | new_bios[new_bios_entry].type = current_type; | ||
691 | last_addr = change_point[chgidx]->addr; | ||
692 | } | ||
693 | last_type = current_type; | ||
694 | } | ||
695 | } | ||
696 | /* retain count for new bios entries */ | ||
697 | new_nr = new_bios_entry; | ||
698 | |||
699 | /* copy new bios mapping into original location */ | ||
700 | memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); | ||
701 | *pnr_map = new_nr; | ||
702 | |||
703 | return 0; | ||
704 | } | ||
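The diagram boils down to a classic sweep over sorted change points: every entry contributes a start and an end event, and between events the effective type is the maximum over the currently open entries. A condensed userspace rendering of the same idea, using qsort instead of the bubble sort above (demo map, hypothetical names):

#include <stdio.h>
#include <stdlib.h>

struct ent { unsigned long long addr, size; unsigned type; };
struct cp  { unsigned long long addr; const struct ent *e; };

static int cmp_cp(const void *a, const void *b)
{
        const struct cp *x = a, *y = b;
        int xs, ys;

        if (x->addr != y->addr)
                return x->addr < y->addr ? -1 : 1;
        xs = (x->addr == x->e->addr);   /* is x a start event? */
        ys = (y->addr == y->e->addr);
        return ys - xs;                 /* start events sort before ends */
}

int main(void)
{
        struct ent map[] = {            /* overlapping input */
                { 0x0,    0x9000, 1 },  /* usable RAM */
                { 0x6000, 0x4000, 2 },  /* reserved, overlaps the RAM tail */
        };
        const struct ent *open[2];
        struct cp cps[4];
        unsigned long long last_addr = 0;
        unsigned last_type = 0;
        int i, j, n = 0, nopen = 0;

        for (i = 0; i < 2; i++) {       /* two change points per entry */
                cps[n].addr = map[i].addr;               cps[n++].e = &map[i];
                cps[n].addr = map[i].addr + map[i].size; cps[n++].e = &map[i];
        }
        qsort(cps, n, sizeof(*cps), cmp_cp);

        for (i = 0; i < n; i++) {
                unsigned cur = 0;

                if (cps[i].addr == cps[i].e->addr) {     /* start event */
                        open[nopen++] = cps[i].e;
                } else {                                 /* end event */
                        for (j = 0; j < nopen; j++)
                                if (open[j] == cps[i].e)
                                        open[j] = open[--nopen];
                }
                for (j = 0; j < nopen; j++)              /* largest type wins */
                        if (open[j]->type > cur)
                                cur = open[j]->type;
                if (cur != last_type) {
                        if (last_type)
                                printf("[%#llx-%#llx) type %u\n",
                                       last_addr, cps[i].addr, last_type);
                        last_addr = cps[i].addr;
                        last_type = cur;
                }
        }
        return 0;
}

On the demo map this prints [0-0x6000) type 1 then [0x6000-0xa000) type 2: the reserved entry has taken precedence over the RAM tail it overlapped.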
705 | |||
706 | /* | ||
707 | * Copy the BIOS e820 map into a safe place. | ||
708 | * | ||
709 | * Sanity-check it while we're at it... | ||
710 | * | ||
711 | * If we're lucky and live on a modern system, the setup code | ||
712 | * will have given us a memory map that we can use to properly | ||
713 | * set up memory. If we aren't, we'll fake a memory map. | ||
714 | */ | ||
715 | static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) | ||
716 | { | ||
717 | /* Only one memory region (or negative)? Ignore it */ | ||
718 | if (nr_map < 2) | ||
719 | return -1; | ||
720 | |||
721 | do { | ||
722 | u64 start = biosmap->addr; | ||
723 | u64 size = biosmap->size; | ||
724 | u64 end = start + size; | ||
725 | u32 type = biosmap->type; | ||
726 | |||
727 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
728 | if (start > end) | ||
729 | return -1; | ||
730 | |||
731 | add_memory_region(start, size, type); | ||
732 | } while (biosmap++, --nr_map); | ||
733 | return 0; | ||
734 | } | ||
735 | |||
736 | static void early_panic(char *msg) | ||
737 | { | ||
738 | early_printk(msg); | ||
739 | panic(msg); | ||
740 | } | ||
741 | |||
742 | /* The return type is non-void only for x86 32-bit compat */ | ||
743 | char * __init machine_specific_memory_setup(void) | ||
744 | { | ||
745 | char *who = "BIOS-e820"; | ||
746 | /* | ||
747 | * Try to copy the BIOS-supplied E820-map. | ||
748 | * | ||
749 | * Otherwise fake a memory map; one section from 0k->640k, | ||
750 | * the next section from 1mb->appropriate_mem_k | ||
751 | */ | ||
752 | sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); | ||
753 | if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) | ||
754 | early_panic("Cannot find a valid memory map"); | ||
755 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
756 | e820_print_map(who); | ||
757 | |||
758 | /* In case someone cares... */ | ||
759 | return who; | ||
760 | } | ||
761 | |||
762 | static int __init parse_memopt(char *p) | ||
763 | { | ||
764 | if (!p) | ||
765 | return -EINVAL; | ||
766 | end_user_pfn = memparse(p, &p); | ||
767 | end_user_pfn >>= PAGE_SHIFT; | ||
768 | return 0; | ||
769 | } | ||
770 | early_param("mem", parse_memopt); | ||
771 | |||
772 | static int userdef __initdata; | ||
773 | |||
774 | static int __init parse_memmap_opt(char *p) | ||
775 | { | ||
776 | char *oldp; | ||
777 | unsigned long long start_at, mem_size; | ||
778 | |||
779 | if (!strcmp(p, "exactmap")) { | ||
780 | #ifdef CONFIG_CRASH_DUMP | ||
781 | /* | ||
782 | * If we are doing a crash dump, we still need to know | ||
783 | * the real memory size before the original memory map is | ||
784 | * reset. | ||
785 | */ | ||
786 | e820_register_active_regions(0, 0, -1UL); | ||
787 | saved_max_pfn = e820_end_of_ram(); | ||
788 | remove_all_active_ranges(); | ||
789 | #endif | ||
790 | max_pfn_mapped = 0; | ||
791 | e820.nr_map = 0; | ||
792 | userdef = 1; | ||
793 | return 0; | ||
794 | } | ||
795 | |||
796 | oldp = p; | ||
797 | mem_size = memparse(p, &p); | ||
798 | if (p == oldp) | ||
799 | return -EINVAL; | ||
800 | |||
801 | userdef = 1; | ||
802 | if (*p == '@') { | ||
803 | start_at = memparse(p+1, &p); | ||
804 | add_memory_region(start_at, mem_size, E820_RAM); | ||
805 | } else if (*p == '#') { | ||
806 | start_at = memparse(p+1, &p); | ||
807 | add_memory_region(start_at, mem_size, E820_ACPI); | ||
808 | } else if (*p == '$') { | ||
809 | start_at = memparse(p+1, &p); | ||
810 | add_memory_region(start_at, mem_size, E820_RESERVED); | ||
811 | } else { | ||
812 | end_user_pfn = (mem_size >> PAGE_SHIFT); | ||
813 | } | ||
814 | return *p == '\0' ? 0 : -EINVAL; | ||
815 | } | ||
816 | early_param("memmap", parse_memmap_opt); | ||
817 | |||
818 | void __init finish_e820_parsing(void) | ||
819 | { | ||
820 | if (userdef) { | ||
821 | char nr = e820.nr_map; | ||
822 | |||
823 | if (sanitize_e820_map(e820.map, &nr) < 0) | ||
824 | early_panic("Invalid user supplied memory map"); | ||
825 | e820.nr_map = nr; | ||
826 | |||
827 | printk(KERN_INFO "user-defined physical RAM map:\n"); | ||
828 | e820_print_map("user"); | ||
829 | } | ||
830 | } | ||
831 | |||
832 | void __init update_memory_range(u64 start, u64 size, unsigned old_type, | ||
833 | unsigned new_type) | ||
834 | { | ||
835 | int i; | ||
836 | |||
837 | BUG_ON(old_type == new_type); | ||
838 | |||
839 | for (i = 0; i < e820.nr_map; i++) { | ||
840 | struct e820entry *ei = &e820.map[i]; | ||
841 | u64 final_start, final_end; | ||
842 | if (ei->type != old_type) | ||
843 | continue; | ||
844 | /* totally covered? */ | ||
845 | if (ei->addr >= start && ei->addr + ei->size <= start + size) { | ||
846 | ei->type = new_type; | ||
847 | continue; | ||
848 | } | ||
849 | /* partially covered */ | ||
850 | final_start = max(start, ei->addr); | ||
851 | final_end = min(start + size, ei->addr + ei->size); | ||
852 | if (final_start >= final_end) | ||
853 | continue; | ||
854 | add_memory_region(final_start, final_end - final_start, | ||
855 | new_type); | ||
856 | } | ||
857 | } | ||
858 | |||
859 | void __init update_e820(void) | ||
860 | { | ||
861 | u8 nr_map; | ||
862 | |||
863 | nr_map = e820.nr_map; | ||
864 | if (sanitize_e820_map(e820.map, &nr_map)) | ||
865 | return; | ||
866 | e820.nr_map = nr_map; | ||
867 | printk(KERN_INFO "modified physical RAM map:\n"); | ||
868 | e820_print_map("modified"); | ||
869 | } | ||
870 | |||
871 | unsigned long pci_mem_start = 0xaeedbabe; | ||
872 | EXPORT_SYMBOL(pci_mem_start); | ||
873 | |||
874 | /* | ||
875 | * Search for the biggest gap in the low 32 bits of the e820 | ||
876 | * memory space. We pass this space to PCI so that it can assign | ||
877 | * MMIO resources for hotplug or unconfigured devices in it. | ||
878 | * Hopefully the BIOS left enough space. | ||
879 | */ | ||
880 | __init void e820_setup_gap(void) | ||
881 | { | ||
882 | unsigned long gapstart, gapsize, round; | ||
883 | unsigned long last; | ||
884 | int i; | ||
885 | int found = 0; | ||
886 | |||
887 | last = 0x100000000ull; | ||
888 | gapstart = 0x10000000; | ||
889 | gapsize = 0x400000; | ||
890 | i = e820.nr_map; | ||
891 | while (--i >= 0) { | ||
892 | unsigned long long start = e820.map[i].addr; | ||
893 | unsigned long long end = start + e820.map[i].size; | ||
894 | |||
895 | /* | ||
896 | * Since "last" is at most 4GB, we know we'll | ||
897 | * fit in 32 bits if this condition is true | ||
898 | */ | ||
899 | if (last > end) { | ||
900 | unsigned long gap = last - end; | ||
901 | |||
902 | if (gap > gapsize) { | ||
903 | gapsize = gap; | ||
904 | gapstart = end; | ||
905 | found = 1; | ||
906 | } | ||
907 | } | ||
908 | if (start < last) | ||
909 | last = start; | ||
910 | } | ||
911 | |||
912 | if (!found) { | ||
913 | gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; | ||
914 | printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " | ||
915 | "address range\n" | ||
916 | KERN_ERR "PCI: Unassigned devices with 32bit resource " | ||
917 | "registers may break!\n"); | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * See how much we want to round up: start off with | ||
922 | * rounding to the next 1MB area. | ||
923 | */ | ||
924 | round = 0x100000; | ||
925 | while ((gapsize >> 4) > round) | ||
926 | round += round; | ||
927 | /* Fun with two's complement */ | ||
928 | pci_mem_start = (gapstart + round) & -round; | ||
929 | |||
930 | printk(KERN_INFO | ||
931 | "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | ||
932 | pci_mem_start, gapstart, gapsize); | ||
933 | } | ||
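The closing (gapstart + round) & -round is the usual power-of-two round-up: for a power-of-two round, -round in two's complement is an all-ones mask with the low bits clear, so the sum truncates to a multiple of round strictly above gapstart (note the + round rather than + round - 1, so an already-aligned start still moves up one unit). In isolation:

#include <stdio.h>

/* (x + round) & -round, for power-of-two round: -round is a mask with
 * the low log2(round) bits clear, so the sum truncates down to a
 * multiple of round -- always strictly above x, matching the
 * e820_setup_gap() code above. */
static unsigned long align_above(unsigned long x, unsigned long round)
{
        return (x + round) & -round;
}

int main(void)
{
        printf("%#lx\n", align_above(0x180000, 0x100000)); /* 0x200000 */
        printf("%#lx\n", align_above(0x100000, 0x100000)); /* 0x200000 */
        return 0;
}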
934 | |||
935 | int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) | ||
936 | { | ||
937 | int i; | ||
938 | |||
939 | if (slot < 0 || slot >= e820.nr_map) | ||
940 | return -1; | ||
941 | for (i = slot; i < e820.nr_map; i++) { | ||
942 | if (e820.map[i].type != E820_RAM) | ||
943 | continue; | ||
944 | break; | ||
945 | } | ||
946 | if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) | ||
947 | return -1; | ||
948 | *addr = e820.map[i].addr; | ||
949 | *size = min_t(u64, e820.map[i].size + e820.map[i].addr, | ||
950 | max_pfn << PAGE_SHIFT) - *addr; | ||
951 | return i + 1; | ||
952 | } | ||
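arch_get_ram_range() is written as a resumable iterator: it returns the next slot to pass back in, or -1 once the RAM entries (clamped to max_pfn) are exhausted. A hedged usage sketch of a hypothetical caller:

/* Hypothetical caller: visit every RAM range via the iterator above. */
static void __init walk_ram_ranges(void)
{
        u64 addr, size;
        int slot = 0;

        while ((slot = arch_get_ram_range(slot, &addr, &size)) >= 0)
                printk(KERN_DEBUG "RAM range: %#llx..%#llx\n",
                       addr, addr + size - 1);
}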
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 9f51e1ea9e82..4353cf5e6fac 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c | |||
@@ -16,10 +16,7 @@ | |||
16 | #include <asm/dma.h> | 16 | #include <asm/dma.h> |
17 | #include <asm/io_apic.h> | 17 | #include <asm/io_apic.h> |
18 | #include <asm/apic.h> | 18 | #include <asm/apic.h> |
19 | 19 | #include <asm/iommu.h> | |
20 | #ifdef CONFIG_GART_IOMMU | ||
21 | #include <asm/gart.h> | ||
22 | #endif | ||
23 | 20 | ||
24 | static void __init fix_hypertransport_config(int num, int slot, int func) | 21 | static void __init fix_hypertransport_config(int num, int slot, int func) |
25 | { | 22 | { |
@@ -50,7 +47,7 @@ static void __init fix_hypertransport_config(int num, int slot, int func) | |||
50 | static void __init via_bugs(int num, int slot, int func) | 47 | static void __init via_bugs(int num, int slot, int func) |
51 | { | 48 | { |
52 | #ifdef CONFIG_GART_IOMMU | 49 | #ifdef CONFIG_GART_IOMMU |
53 | if ((end_pfn > MAX_DMA32_PFN || force_iommu) && | 50 | if ((max_pfn > MAX_DMA32_PFN || force_iommu) && |
54 | !gart_iommu_aperture_allowed) { | 51 | !gart_iommu_aperture_allowed) { |
55 | printk(KERN_INFO | 52 | printk(KERN_INFO |
56 | "Looks like a VIA chipset. Disabling IOMMU." | 53 | "Looks like a VIA chipset. Disabling IOMMU." |
@@ -98,17 +95,6 @@ static void __init nvidia_bugs(int num, int slot, int func) | |||
98 | 95 | ||
99 | } | 96 | } |
100 | 97 | ||
101 | static void __init ati_bugs(int num, int slot, int func) | ||
102 | { | ||
103 | #ifdef CONFIG_X86_IO_APIC | ||
104 | if (timer_over_8254 == 1) { | ||
105 | timer_over_8254 = 0; | ||
106 | printk(KERN_INFO | ||
107 | "ATI board detected. Disabling timer routing over 8254.\n"); | ||
108 | } | ||
109 | #endif | ||
110 | } | ||
111 | |||
112 | #define QFLAG_APPLY_ONCE 0x1 | 98 | #define QFLAG_APPLY_ONCE 0x1 |
113 | #define QFLAG_APPLIED 0x2 | 99 | #define QFLAG_APPLIED 0x2 |
114 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) | 100 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) |
@@ -126,14 +112,23 @@ static struct chipset early_qrk[] __initdata = { | |||
126 | PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, | 112 | PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, |
127 | { PCI_VENDOR_ID_VIA, PCI_ANY_ID, | 113 | { PCI_VENDOR_ID_VIA, PCI_ANY_ID, |
128 | PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, | 114 | PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, |
129 | { PCI_VENDOR_ID_ATI, PCI_ANY_ID, | ||
130 | PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs }, | ||
131 | { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, | 115 | { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, |
132 | PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, | 116 | PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, |
133 | {} | 117 | {} |
134 | }; | 118 | }; |
135 | 119 | ||
136 | static void __init check_dev_quirk(int num, int slot, int func) | 120 | /** |
121 | * check_dev_quirk - apply early quirks to a given PCI device | ||
122 | * @num: bus number | ||
123 | * @slot: slot number | ||
124 | * @func: PCI function | ||
125 | * | ||
126 | * Check the vendor & device ID against the early quirks table. | ||
127 | * | ||
128 | * If the device is single function, let early_quirks() know so we don't | ||
129 | * poke at this device again. | ||
130 | */ | ||
131 | static int __init check_dev_quirk(int num, int slot, int func) | ||
137 | { | 132 | { |
138 | u16 class; | 133 | u16 class; |
139 | u16 vendor; | 134 | u16 vendor; |
@@ -144,7 +139,7 @@ static void __init check_dev_quirk(int num, int slot, int func) | |||
144 | class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); | 139 | class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); |
145 | 140 | ||
146 | if (class == 0xffff) | 141 | if (class == 0xffff) |
147 | return; | 142 | return -1; /* no class, treat as single function */ |
148 | 143 | ||
149 | vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); | 144 | vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); |
150 | 145 | ||
@@ -167,7 +162,9 @@ static void __init check_dev_quirk(int num, int slot, int func) | |||
167 | type = read_pci_config_byte(num, slot, func, | 162 | type = read_pci_config_byte(num, slot, func, |
168 | PCI_HEADER_TYPE); | 163 | PCI_HEADER_TYPE); |
169 | if (!(type & 0x80)) | 164 | if (!(type & 0x80)) |
170 | return; | 165 | return -1; |
166 | |||
167 | return 0; | ||
171 | } | 168 | } |
172 | 169 | ||
173 | void __init early_quirks(void) | 170 | void __init early_quirks(void) |
@@ -180,6 +177,9 @@ void __init early_quirks(void) | |||
180 | /* Poor man's PCI discovery */ | 177 | /* Poor man's PCI discovery */ |
181 | for (num = 0; num < 32; num++) | 178 | for (num = 0; num < 32; num++) |
182 | for (slot = 0; slot < 32; slot++) | 179 | for (slot = 0; slot < 32; slot++) |
183 | for (func = 0; func < 8; func++) | 180 | for (func = 0; func < 8; func++) { |
184 | check_dev_quirk(num, slot, func); | 181 | /* Only probe function 0 on single fn devices */ |
182 | if (check_dev_quirk(num, slot, func)) | ||
183 | break; | ||
184 | } | ||
185 | } | 185 | } |
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 643fd861b724..ff9e7350da54 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c | |||
@@ -196,7 +196,7 @@ static struct console simnow_console = { | |||
196 | static struct console *early_console = &early_vga_console; | 196 | static struct console *early_console = &early_vga_console; |
197 | static int early_console_initialized; | 197 | static int early_console_initialized; |
198 | 198 | ||
199 | void early_printk(const char *fmt, ...) | 199 | asmlinkage void early_printk(const char *fmt, ...) |
200 | { | 200 | { |
201 | char buf[512]; | 201 | char buf[512]; |
202 | int n; | 202 | int n; |
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 77d424cf68b3..06cc8d4254b1 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
@@ -64,6 +64,17 @@ static int __init setup_noefi(char *arg) | |||
64 | } | 64 | } |
65 | early_param("noefi", setup_noefi); | 65 | early_param("noefi", setup_noefi); |
66 | 66 | ||
67 | int add_efi_memmap; | ||
68 | EXPORT_SYMBOL(add_efi_memmap); | ||
69 | |||
70 | static int __init setup_add_efi_memmap(char *arg) | ||
71 | { | ||
72 | add_efi_memmap = 1; | ||
73 | return 0; | ||
74 | } | ||
75 | early_param("add_efi_memmap", setup_add_efi_memmap); | ||
76 | |||
77 | |||
67 | static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) | 78 | static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) |
68 | { | 79 | { |
69 | return efi_call_virt2(get_time, tm, tc); | 80 | return efi_call_virt2(get_time, tm, tc); |
@@ -213,6 +224,50 @@ unsigned long efi_get_time(void) | |||
213 | eft.minute, eft.second); | 224 | eft.minute, eft.second); |
214 | } | 225 | } |
215 | 226 | ||
227 | /* | ||
228 | * Tell the kernel about the EFI memory map. This might include | ||
229 | * more than the max 128 entries that can fit in the e820 legacy | ||
230 | * (zeropage) memory map. | ||
231 | */ | ||
232 | |||
233 | static void __init do_add_efi_memmap(void) | ||
234 | { | ||
235 | void *p; | ||
236 | |||
237 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
238 | efi_memory_desc_t *md = p; | ||
239 | unsigned long long start = md->phys_addr; | ||
240 | unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; | ||
241 | int e820_type; | ||
242 | |||
243 | if (md->attribute & EFI_MEMORY_WB) | ||
244 | e820_type = E820_RAM; | ||
245 | else | ||
246 | e820_type = E820_RESERVED; | ||
247 | e820_add_region(start, size, e820_type); | ||
248 | } | ||
249 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | ||
250 | } | ||
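Note that the walk above advances by memmap.desc_size rather than sizeof(efi_memory_desc_t): firmware may hand back descriptors larger than the structure the kernel knows, so plain array indexing would mis-stride. The same pattern in a standalone demo (made-up descriptor layout and stride):

#include <stdio.h>
#include <string.h>

struct desc { unsigned long long phys, pages; unsigned type; };

int main(void)
{
        /* Pretend the firmware appends 8 vendor bytes per descriptor,
         * so the true stride exceeds sizeof(struct desc). */
        const size_t desc_size = sizeof(struct desc) + 8;
        unsigned char buf[3 * (sizeof(struct desc) + 8)] = { 0 };
        unsigned char *p;
        struct desc d = { 0x100000, 256, 7 };

        for (p = buf; p < buf + sizeof(buf); p += desc_size)
                memcpy(p, &d, sizeof(d));       /* write at the true stride */

        /* Walk by desc_size, never by sizeof(struct desc). */
        for (p = buf; p < buf + sizeof(buf); p += desc_size) {
                memcpy(&d, p, sizeof(d));
                printf("phys=%#llx pages=%llu type=%u\n",
                       d.phys, d.pages, d.type);
        }
        return 0;
}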
251 | |||
252 | void __init efi_reserve_early(void) | ||
253 | { | ||
254 | unsigned long pmap; | ||
255 | |||
256 | #ifdef CONFIG_X86_32 | ||
257 | pmap = boot_params.efi_info.efi_memmap; | ||
258 | #else | ||
259 | pmap = (boot_params.efi_info.efi_memmap | | ||
260 | ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); | ||
261 | #endif | ||
262 | memmap.phys_map = (void *)pmap; | ||
263 | memmap.nr_map = boot_params.efi_info.efi_memmap_size / | ||
264 | boot_params.efi_info.efi_memdesc_size; | ||
265 | memmap.desc_version = boot_params.efi_info.efi_memdesc_version; | ||
266 | memmap.desc_size = boot_params.efi_info.efi_memdesc_size; | ||
267 | reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size, | ||
268 | "EFI memmap"); | ||
269 | } | ||
270 | |||
216 | #if EFI_DEBUG | 271 | #if EFI_DEBUG |
217 | static void __init print_efi_memmap(void) | 272 | static void __init print_efi_memmap(void) |
218 | { | 273 | { |
@@ -244,19 +299,11 @@ void __init efi_init(void) | |||
244 | 299 | ||
245 | #ifdef CONFIG_X86_32 | 300 | #ifdef CONFIG_X86_32 |
246 | efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; | 301 | efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; |
247 | memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; | ||
248 | #else | 302 | #else |
249 | efi_phys.systab = (efi_system_table_t *) | 303 | efi_phys.systab = (efi_system_table_t *) |
250 | (boot_params.efi_info.efi_systab | | 304 | (boot_params.efi_info.efi_systab | |
251 | ((__u64)boot_params.efi_info.efi_systab_hi<<32)); | 305 | ((__u64)boot_params.efi_info.efi_systab_hi<<32)); |
252 | memmap.phys_map = (void *) | ||
253 | (boot_params.efi_info.efi_memmap | | ||
254 | ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); | ||
255 | #endif | 306 | #endif |
256 | memmap.nr_map = boot_params.efi_info.efi_memmap_size / | ||
257 | boot_params.efi_info.efi_memdesc_size; | ||
258 | memmap.desc_version = boot_params.efi_info.efi_memdesc_version; | ||
259 | memmap.desc_size = boot_params.efi_info.efi_memdesc_size; | ||
260 | 307 | ||
261 | efi.systab = early_ioremap((unsigned long)efi_phys.systab, | 308 | efi.systab = early_ioremap((unsigned long)efi_phys.systab, |
262 | sizeof(efi_system_table_t)); | 309 | sizeof(efi_system_table_t)); |
@@ -370,6 +417,8 @@ void __init efi_init(void) | |||
370 | if (memmap.desc_size != sizeof(efi_memory_desc_t)) | 417 | if (memmap.desc_size != sizeof(efi_memory_desc_t)) |
371 | printk(KERN_WARNING "Kernel-defined memdesc" | 418 | printk(KERN_WARNING "Kernel-defined memdesc" |
372 | "doesn't match the one from EFI!\n"); | 419 | "doesn't match the one from EFI!\n"); |
420 | if (add_efi_memmap) | ||
421 | do_add_efi_memmap(); | ||
373 | 422 | ||
374 | /* Setup for EFI runtime service */ | 423 | /* Setup for EFI runtime service */ |
375 | reboot_type = BOOT_EFI; | 424 | reboot_type = BOOT_EFI; |
@@ -424,7 +473,7 @@ void __init efi_enter_virtual_mode(void) | |||
424 | size = md->num_pages << EFI_PAGE_SHIFT; | 473 | size = md->num_pages << EFI_PAGE_SHIFT; |
425 | end = md->phys_addr + size; | 474 | end = md->phys_addr + size; |
426 | 475 | ||
427 | if (PFN_UP(end) <= max_pfn_mapped) | 476 | if (PFN_UP(end) <= max_low_pfn_mapped) |
428 | va = __va(md->phys_addr); | 477 | va = __va(md->phys_addr); |
429 | else | 478 | else |
430 | va = efi_ioremap(md->phys_addr, size); | 479 | va = efi_ioremap(md->phys_addr, size); |
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index 5d23d85624d4..4b63c8e1f13b 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c | |||
@@ -49,13 +49,13 @@ void efi_call_phys_prelog(void) | |||
49 | local_irq_save(efi_rt_eflags); | 49 | local_irq_save(efi_rt_eflags); |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * If I don't have PSE, I should just duplicate two entries in page | 52 | * If I don't have PAE, I should just duplicate two entries in page |
53 | * directory. If I have PSE, I just need to duplicate one entry in | 53 | * directory. If I have PAE, I just need to duplicate one entry in |
54 | * page directory. | 54 | * page directory. |
55 | */ | 55 | */ |
56 | cr4 = read_cr4(); | 56 | cr4 = read_cr4(); |
57 | 57 | ||
58 | if (cr4 & X86_CR4_PSE) { | 58 | if (cr4 & X86_CR4_PAE) { |
59 | efi_bak_pg_dir_pointer[0].pgd = | 59 | efi_bak_pg_dir_pointer[0].pgd = |
60 | swapper_pg_dir[pgd_index(0)].pgd; | 60 | swapper_pg_dir[pgd_index(0)].pgd; |
61 | swapper_pg_dir[0].pgd = | 61 | swapper_pg_dir[0].pgd = |
@@ -93,7 +93,7 @@ void efi_call_phys_epilog(void) | |||
93 | 93 | ||
94 | cr4 = read_cr4(); | 94 | cr4 = read_cr4(); |
95 | 95 | ||
96 | if (cr4 & X86_CR4_PSE) { | 96 | if (cr4 & X86_CR4_PAE) { |
97 | swapper_pg_dir[pgd_index(0)].pgd = | 97 | swapper_pg_dir[pgd_index(0)].pgd = |
98 | efi_bak_pg_dir_pointer[0].pgd; | 98 | efi_bak_pg_dir_pointer[0].pgd; |
99 | } else { | 99 | } else { |
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index d0060fdcccac..652c5287215f 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c | |||
@@ -97,13 +97,7 @@ void __init efi_call_phys_epilog(void) | |||
97 | early_runtime_code_mapping_set_exec(0); | 97 | early_runtime_code_mapping_set_exec(0); |
98 | } | 98 | } |
99 | 99 | ||
100 | void __init efi_reserve_bootmem(void) | 100 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) |
101 | { | ||
102 | reserve_bootmem_generic((unsigned long)memmap.phys_map, | ||
103 | memmap.nr_map * memmap.desc_size); | ||
104 | } | ||
105 | |||
106 | void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) | ||
107 | { | 101 | { |
108 | static unsigned pages_mapped __initdata; | 102 | static unsigned pages_mapped __initdata; |
109 | unsigned i, pages; | 103 | unsigned i, pages; |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c778e4fa55a2..109792bc7cfa 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -51,14 +51,25 @@ | |||
51 | #include <asm/percpu.h> | 51 | #include <asm/percpu.h> |
52 | #include <asm/dwarf2.h> | 52 | #include <asm/dwarf2.h> |
53 | #include <asm/processor-flags.h> | 53 | #include <asm/processor-flags.h> |
54 | #include "irq_vectors.h" | 54 | #include <asm/ftrace.h> |
55 | #include <asm/irq_vectors.h> | ||
56 | |||
57 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ | ||
58 | #include <linux/elf-em.h> | ||
59 | #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) | ||
60 | #define __AUDIT_ARCH_LE 0x40000000 | ||
61 | |||
62 | #ifndef CONFIG_AUDITSYSCALL | ||
63 | #define sysenter_audit syscall_trace_entry | ||
64 | #define sysexit_audit syscall_exit_work | ||
65 | #endif | ||
55 | 66 | ||
56 | /* | 67 | /* |
57 | * We use macros for low-level operations which need to be overridden | 68 | * We use macros for low-level operations which need to be overridden |
58 | * for paravirtualization. The following will never clobber any registers: | 69 | * for paravirtualization. The following will never clobber any registers: |
59 | * INTERRUPT_RETURN (aka. "iret") | 70 | * INTERRUPT_RETURN (aka. "iret") |
60 | * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") | 71 | * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") |
61 | * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). | 72 | * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). |
62 | * | 73 | * |
63 | * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must | 74 | * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must |
64 | * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). | 75 | * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). |
@@ -331,8 +342,9 @@ sysenter_past_esp: | |||
331 | GET_THREAD_INFO(%ebp) | 342 | GET_THREAD_INFO(%ebp) |
332 | 343 | ||
333 | /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | 344 | /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ |
334 | testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | 345 | testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) |
335 | jnz syscall_trace_entry | 346 | jnz sysenter_audit |
347 | sysenter_do_call: | ||
336 | cmpl $(nr_syscalls), %eax | 348 | cmpl $(nr_syscalls), %eax |
337 | jae syscall_badsys | 349 | jae syscall_badsys |
338 | call *sys_call_table(,%eax,4) | 350 | call *sys_call_table(,%eax,4) |
@@ -342,14 +354,54 @@ sysenter_past_esp: | |||
342 | TRACE_IRQS_OFF | 354 | TRACE_IRQS_OFF |
343 | movl TI_flags(%ebp), %ecx | 355 | movl TI_flags(%ebp), %ecx |
344 | testw $_TIF_ALLWORK_MASK, %cx | 356 | testw $_TIF_ALLWORK_MASK, %cx |
345 | jne syscall_exit_work | 357 | jne sysexit_audit |
358 | sysenter_exit: | ||
346 | /* if something modifies registers it must also disable sysexit */ | 359 | /* if something modifies registers it must also disable sysexit */ |
347 | movl PT_EIP(%esp), %edx | 360 | movl PT_EIP(%esp), %edx |
348 | movl PT_OLDESP(%esp), %ecx | 361 | movl PT_OLDESP(%esp), %ecx |
349 | xorl %ebp,%ebp | 362 | xorl %ebp,%ebp |
350 | TRACE_IRQS_ON | 363 | TRACE_IRQS_ON |
351 | 1: mov PT_FS(%esp), %fs | 364 | 1: mov PT_FS(%esp), %fs |
352 | ENABLE_INTERRUPTS_SYSCALL_RET | 365 | ENABLE_INTERRUPTS_SYSEXIT |
366 | |||
367 | #ifdef CONFIG_AUDITSYSCALL | ||
368 | sysenter_audit: | ||
369 | testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | ||
370 | jnz syscall_trace_entry | ||
371 | addl $4,%esp | ||
372 | CFI_ADJUST_CFA_OFFSET -4 | ||
373 | /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ | ||
374 | /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ | ||
375 | /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ | ||
376 | movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ | ||
377 | movl %eax,%edx /* 2nd arg: syscall number */ | ||
378 | movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ | ||
379 | call audit_syscall_entry | ||
380 | pushl %ebx | ||
381 | CFI_ADJUST_CFA_OFFSET 4 | ||
382 | movl PT_EAX(%esp),%eax /* reload syscall number */ | ||
383 | jmp sysenter_do_call | ||
384 | |||
385 | sysexit_audit: | ||
386 | testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx | ||
387 | jne syscall_exit_work | ||
388 | TRACE_IRQS_ON | ||
389 | ENABLE_INTERRUPTS(CLBR_ANY) | ||
390 | movl %eax,%edx /* second arg, syscall return value */ | ||
391 | cmpl $0,%eax /* is it < 0? */ | ||
392 | setl %al /* 1 if so, 0 if not */ | ||
393 | movzbl %al,%eax /* zero-extend that */ | ||
394 | inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ | ||
395 | call audit_syscall_exit | ||
396 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
397 | TRACE_IRQS_OFF | ||
398 | movl TI_flags(%ebp), %ecx | ||
399 | testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx | ||
400 | jne syscall_exit_work | ||
401 | movl PT_EAX(%esp),%eax /* reload syscall return value */ | ||
402 | jmp sysenter_exit | ||
403 | #endif | ||
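The setl/movzbl/inc sequence above encodes the syscall result for the audit call: setl yields 0 or 1 from the sign of %eax, and the increment maps that to AUDITSC_SUCCESS (1) or AUDITSC_FAILURE (2). A C model of the same computation, assuming the two-argument audit_syscall_exit() of this era:

#include <linux/audit.h>        /* AUDITSC_SUCCESS, AUDITSC_FAILURE */

/* C model of the setl/movzbl/inc sequence above: derive the audit
 * result code from the sign of the syscall return value. */
static void sysexit_audit_model(long retval)
{
        int result = (retval < 0) ? AUDITSC_FAILURE : AUDITSC_SUCCESS;

        audit_syscall_exit(result, retval);
}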
404 | |||
353 | CFI_ENDPROC | 405 | CFI_ENDPROC |
354 | .pushsection .fixup,"ax" | 406 | .pushsection .fixup,"ax" |
355 | 2: movl $0,PT_FS(%esp) | 407 | 2: movl $0,PT_FS(%esp) |
@@ -369,7 +421,7 @@ ENTRY(system_call) | |||
369 | GET_THREAD_INFO(%ebp) | 421 | GET_THREAD_INFO(%ebp) |
370 | # system call tracing in operation / emulation | 422 | # system call tracing in operation / emulation |
371 | /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | 423 | /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ |
372 | testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | 424 | testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) |
373 | jnz syscall_trace_entry | 425 | jnz syscall_trace_entry |
374 | cmpl $(nr_syscalls), %eax | 426 | cmpl $(nr_syscalls), %eax |
375 | jae syscall_badsys | 427 | jae syscall_badsys |
@@ -382,10 +434,6 @@ syscall_exit: | |||
382 | # setting need_resched or sigpending | 434 | # setting need_resched or sigpending |
383 | # between sampling and the iret | 435 | # between sampling and the iret |
384 | TRACE_IRQS_OFF | 436 | TRACE_IRQS_OFF |
385 | testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit | ||
386 | jz no_singlestep | ||
387 | orl $_TIF_SINGLESTEP,TI_flags(%ebp) | ||
388 | no_singlestep: | ||
389 | movl TI_flags(%ebp), %ecx | 437 | movl TI_flags(%ebp), %ecx |
390 | testw $_TIF_ALLWORK_MASK, %cx # current->work | 438 | testw $_TIF_ALLWORK_MASK, %cx # current->work |
391 | jne syscall_exit_work | 439 | jne syscall_exit_work |
@@ -513,12 +561,8 @@ END(work_pending) | |||
513 | syscall_trace_entry: | 561 | syscall_trace_entry: |
514 | movl $-ENOSYS,PT_EAX(%esp) | 562 | movl $-ENOSYS,PT_EAX(%esp) |
515 | movl %esp, %eax | 563 | movl %esp, %eax |
516 | xorl %edx,%edx | 564 | call syscall_trace_enter |
517 | call do_syscall_trace | 565 | /* What it returned is what we'll actually use. */ |
518 | cmpl $0, %eax | ||
519 | jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, | ||
520 | # so must skip actual syscall | ||
521 | movl PT_ORIG_EAX(%esp), %eax | ||
522 | cmpl $(nr_syscalls), %eax | 566 | cmpl $(nr_syscalls), %eax |
523 | jnae syscall_call | 567 | jnae syscall_call |
524 | jmp syscall_exit | 568 | jmp syscall_exit |
@@ -527,14 +571,13 @@ END(syscall_trace_entry) | |||
527 | # perform syscall exit tracing | 571 | # perform syscall exit tracing |
528 | ALIGN | 572 | ALIGN |
529 | syscall_exit_work: | 573 | syscall_exit_work: |
530 | testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl | 574 | testb $_TIF_WORK_SYSCALL_EXIT, %cl |
531 | jz work_pending | 575 | jz work_pending |
532 | TRACE_IRQS_ON | 576 | TRACE_IRQS_ON |
533 | ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call | 577 | ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call |
534 | # schedule() instead | 578 | # schedule() instead |
535 | movl %esp, %eax | 579 | movl %esp, %eax |
536 | movl $1, %edx | 580 | call syscall_trace_leave |
537 | call do_syscall_trace | ||
538 | jmp resume_userspace | 581 | jmp resume_userspace |
539 | END(syscall_exit_work) | 582 | END(syscall_exit_work) |
540 | CFI_ENDPROC | 583 | CFI_ENDPROC |
@@ -874,10 +917,10 @@ ENTRY(native_iret) | |||
874 | .previous | 917 | .previous |
875 | END(native_iret) | 918 | END(native_iret) |
876 | 919 | ||
877 | ENTRY(native_irq_enable_syscall_ret) | 920 | ENTRY(native_irq_enable_sysexit) |
878 | sti | 921 | sti |
879 | sysexit | 922 | sysexit |
880 | END(native_irq_enable_syscall_ret) | 923 | END(native_irq_enable_sysexit) |
881 | #endif | 924 | #endif |
882 | 925 | ||
883 | KPROBE_ENTRY(int3) | 926 | KPROBE_ENTRY(int3) |
@@ -1023,7 +1066,9 @@ ENDPROC(kernel_thread_helper) | |||
1023 | ENTRY(xen_sysenter_target) | 1066 | ENTRY(xen_sysenter_target) |
1024 | RING0_INT_FRAME | 1067 | RING0_INT_FRAME |
1025 | addl $5*4, %esp /* remove xen-provided frame */ | 1068 | addl $5*4, %esp /* remove xen-provided frame */ |
1069 | CFI_ADJUST_CFA_OFFSET -5*4 | ||
1026 | jmp sysenter_past_esp | 1070 | jmp sysenter_past_esp |
1071 | CFI_ENDPROC | ||
1027 | 1072 | ||
1028 | ENTRY(xen_hypervisor_callback) | 1073 | ENTRY(xen_hypervisor_callback) |
1029 | CFI_STARTPROC | 1074 | CFI_STARTPROC |
@@ -1110,6 +1155,77 @@ ENDPROC(xen_failsafe_callback) | |||
1110 | 1155 | ||
1111 | #endif /* CONFIG_XEN */ | 1156 | #endif /* CONFIG_XEN */ |
1112 | 1157 | ||
1158 | #ifdef CONFIG_FTRACE | ||
1159 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
1160 | |||
1161 | ENTRY(mcount) | ||
1162 | pushl %eax | ||
1163 | pushl %ecx | ||
1164 | pushl %edx | ||
1165 | movl 0xc(%esp), %eax | ||
1166 | subl $MCOUNT_INSN_SIZE, %eax | ||
1167 | |||
1168 | .globl mcount_call | ||
1169 | mcount_call: | ||
1170 | call ftrace_stub | ||
1171 | |||
1172 | popl %edx | ||
1173 | popl %ecx | ||
1174 | popl %eax | ||
1175 | |||
1176 | ret | ||
1177 | END(mcount) | ||
1178 | |||
1179 | ENTRY(ftrace_caller) | ||
1180 | pushl %eax | ||
1181 | pushl %ecx | ||
1182 | pushl %edx | ||
1183 | movl 0xc(%esp), %eax | ||
1184 | movl 0x4(%ebp), %edx | ||
1185 | subl $MCOUNT_INSN_SIZE, %eax | ||
1186 | |||
1187 | .globl ftrace_call | ||
1188 | ftrace_call: | ||
1189 | call ftrace_stub | ||
1190 | |||
1191 | popl %edx | ||
1192 | popl %ecx | ||
1193 | popl %eax | ||
1194 | |||
1195 | .globl ftrace_stub | ||
1196 | ftrace_stub: | ||
1197 | ret | ||
1198 | END(ftrace_caller) | ||
1199 | |||
1200 | #else /* ! CONFIG_DYNAMIC_FTRACE */ | ||
1201 | |||
1202 | ENTRY(mcount) | ||
1203 | cmpl $ftrace_stub, ftrace_trace_function | ||
1204 | jnz trace | ||
1205 | .globl ftrace_stub | ||
1206 | ftrace_stub: | ||
1207 | ret | ||
1208 | |||
1209 | /* taken from glibc */ | ||
1210 | trace: | ||
1211 | pushl %eax | ||
1212 | pushl %ecx | ||
1213 | pushl %edx | ||
1214 | movl 0xc(%esp), %eax | ||
1215 | movl 0x4(%ebp), %edx | ||
1216 | subl $MCOUNT_INSN_SIZE, %eax | ||
1217 | |||
1218 | call *ftrace_trace_function | ||
1219 | |||
1220 | popl %edx | ||
1221 | popl %ecx | ||
1222 | popl %eax | ||
1223 | |||
1224 | jmp ftrace_stub | ||
1225 | END(mcount) | ||
1226 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
1227 | #endif /* CONFIG_FTRACE */ | ||
1228 | |||
1113 | .section .rodata,"a" | 1229 | .section .rodata,"a" |
1114 | #include "syscall_table_32.S" | 1230 | #include "syscall_table_32.S" |
1115 | 1231 | ||
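The 32-bit mcount stubs above save only the caller-clobbered registers (%eax, %ecx, %edx), recover the call site by subtracting MCOUNT_INSN_SIZE from the return address, and then dispatch either through the patchable ftrace_call site or through the ftrace_trace_function pointer. Below is a minimal sketch of a callback that pointer could target; the ftrace_func_t signature and the struct ftrace_ops / register_ftrace_function() registration shape are assumptions drawn from the ftrace core of this era, not part of this patch.

#include <linux/ftrace.h>
#include <linux/kernel.h>

static unsigned long my_hits;

/*
 * Assumed era signature: void (*ftrace_func_t)(unsigned long ip,
 * unsigned long parent_ip); ip is the patched call site, parent_ip
 * the caller's return address (loaded from 0x4(%ebp) above).
 */
static void notrace my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/*
	 * Keep this notrace and side-effect free: anything called from
	 * here that is itself compiled with -pg re-enters via mcount.
	 */
	my_hits++;
}

static struct ftrace_ops my_trace_ops = {	/* shape assumed from the ftrace core */
	.func = my_trace_func,
};

/* register_ftrace_function(&my_trace_ops) would make the stubs above
 * reach my_trace_func, either through ftrace_trace_function or, with
 * CONFIG_DYNAMIC_FTRACE, through the patched ftrace_call site. */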
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df522a7..89434d439605 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -51,16 +51,127 @@ | |||
51 | #include <asm/page.h> | 51 | #include <asm/page.h> |
52 | #include <asm/irqflags.h> | 52 | #include <asm/irqflags.h> |
53 | #include <asm/paravirt.h> | 53 | #include <asm/paravirt.h> |
54 | #include <asm/ftrace.h> | ||
55 | |||
56 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ | ||
57 | #include <linux/elf-em.h> | ||
58 | #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) | ||
59 | #define __AUDIT_ARCH_64BIT 0x80000000 | ||
60 | #define __AUDIT_ARCH_LE 0x40000000 | ||
54 | 61 | ||
55 | .code64 | 62 | .code64 |
56 | 63 | ||
64 | #ifdef CONFIG_FTRACE | ||
65 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
66 | ENTRY(mcount) | ||
67 | |||
68 | subq $0x38, %rsp | ||
69 | movq %rax, (%rsp) | ||
70 | movq %rcx, 8(%rsp) | ||
71 | movq %rdx, 16(%rsp) | ||
72 | movq %rsi, 24(%rsp) | ||
73 | movq %rdi, 32(%rsp) | ||
74 | movq %r8, 40(%rsp) | ||
75 | movq %r9, 48(%rsp) | ||
76 | |||
77 | movq 0x38(%rsp), %rdi | ||
78 | subq $MCOUNT_INSN_SIZE, %rdi | ||
79 | |||
80 | .globl mcount_call | ||
81 | mcount_call: | ||
82 | call ftrace_stub | ||
83 | |||
84 | movq 48(%rsp), %r9 | ||
85 | movq 40(%rsp), %r8 | ||
86 | movq 32(%rsp), %rdi | ||
87 | movq 24(%rsp), %rsi | ||
88 | movq 16(%rsp), %rdx | ||
89 | movq 8(%rsp), %rcx | ||
90 | movq (%rsp), %rax | ||
91 | addq $0x38, %rsp | ||
92 | |||
93 | retq | ||
94 | END(mcount) | ||
95 | |||
96 | ENTRY(ftrace_caller) | ||
97 | |||
98 | /* taken from glibc */ | ||
99 | subq $0x38, %rsp | ||
100 | movq %rax, (%rsp) | ||
101 | movq %rcx, 8(%rsp) | ||
102 | movq %rdx, 16(%rsp) | ||
103 | movq %rsi, 24(%rsp) | ||
104 | movq %rdi, 32(%rsp) | ||
105 | movq %r8, 40(%rsp) | ||
106 | movq %r9, 48(%rsp) | ||
107 | |||
108 | movq 0x38(%rsp), %rdi | ||
109 | movq 8(%rbp), %rsi | ||
110 | subq $MCOUNT_INSN_SIZE, %rdi | ||
111 | |||
112 | .globl ftrace_call | ||
113 | ftrace_call: | ||
114 | call ftrace_stub | ||
115 | |||
116 | movq 48(%rsp), %r9 | ||
117 | movq 40(%rsp), %r8 | ||
118 | movq 32(%rsp), %rdi | ||
119 | movq 24(%rsp), %rsi | ||
120 | movq 16(%rsp), %rdx | ||
121 | movq 8(%rsp), %rcx | ||
122 | movq (%rsp), %rax | ||
123 | addq $0x38, %rsp | ||
124 | |||
125 | .globl ftrace_stub | ||
126 | ftrace_stub: | ||
127 | retq | ||
128 | END(ftrace_caller) | ||
129 | |||
130 | #else /* ! CONFIG_DYNAMIC_FTRACE */ | ||
131 | ENTRY(mcount) | ||
132 | cmpq $ftrace_stub, ftrace_trace_function | ||
133 | jnz trace | ||
134 | .globl ftrace_stub | ||
135 | ftrace_stub: | ||
136 | retq | ||
137 | |||
138 | trace: | ||
139 | /* taken from glibc */ | ||
140 | subq $0x38, %rsp | ||
141 | movq %rax, (%rsp) | ||
142 | movq %rcx, 8(%rsp) | ||
143 | movq %rdx, 16(%rsp) | ||
144 | movq %rsi, 24(%rsp) | ||
145 | movq %rdi, 32(%rsp) | ||
146 | movq %r8, 40(%rsp) | ||
147 | movq %r9, 48(%rsp) | ||
148 | |||
149 | movq 0x38(%rsp), %rdi | ||
150 | movq 8(%rbp), %rsi | ||
151 | subq $MCOUNT_INSN_SIZE, %rdi | ||
152 | |||
153 | call *ftrace_trace_function | ||
154 | |||
155 | movq 48(%rsp), %r9 | ||
156 | movq 40(%rsp), %r8 | ||
157 | movq 32(%rsp), %rdi | ||
158 | movq 24(%rsp), %rsi | ||
159 | movq 16(%rsp), %rdx | ||
160 | movq 8(%rsp), %rcx | ||
161 | movq (%rsp), %rax | ||
162 | addq $0x38, %rsp | ||
163 | |||
164 | jmp ftrace_stub | ||
165 | END(mcount) | ||
166 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
167 | #endif /* CONFIG_FTRACE */ | ||
168 | |||
57 | #ifndef CONFIG_PREEMPT | 169 | #ifndef CONFIG_PREEMPT |
58 | #define retint_kernel retint_restore_args | 170 | #define retint_kernel retint_restore_args |
59 | #endif | 171 | #endif |
60 | 172 | ||
61 | #ifdef CONFIG_PARAVIRT | 173 | #ifdef CONFIG_PARAVIRT |
62 | ENTRY(native_irq_enable_syscall_ret) | 174 | ENTRY(native_usergs_sysret64) |
63 | movq %gs:pda_oldrsp,%rsp | ||
64 | swapgs | 175 | swapgs |
65 | sysretq | 176 | sysretq |
66 | #endif /* CONFIG_PARAVIRT */ | 177 | #endif /* CONFIG_PARAVIRT */ |
@@ -104,7 +215,7 @@ ENTRY(native_irq_enable_syscall_ret) | |||
104 | .macro FAKE_STACK_FRAME child_rip | 215 | .macro FAKE_STACK_FRAME child_rip |
105 | /* push in order ss, rsp, eflags, cs, rip */ | 216 | /* push in order ss, rsp, eflags, cs, rip */ |
106 | xorl %eax, %eax | 217 | xorl %eax, %eax |
107 | pushq %rax /* ss */ | 218 | pushq $__KERNEL_DS /* ss */ |
108 | CFI_ADJUST_CFA_OFFSET 8 | 219 | CFI_ADJUST_CFA_OFFSET 8 |
109 | /*CFI_REL_OFFSET ss,0*/ | 220 | /*CFI_REL_OFFSET ss,0*/ |
110 | pushq %rax /* rsp */ | 221 | pushq %rax /* rsp */ |
@@ -169,13 +280,13 @@ ENTRY(ret_from_fork) | |||
169 | CFI_ADJUST_CFA_OFFSET -4 | 280 | CFI_ADJUST_CFA_OFFSET -4 |
170 | call schedule_tail | 281 | call schedule_tail |
171 | GET_THREAD_INFO(%rcx) | 282 | GET_THREAD_INFO(%rcx) |
172 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) | 283 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) |
173 | jnz rff_trace | 284 | jnz rff_trace |
174 | rff_action: | 285 | rff_action: |
175 | RESTORE_REST | 286 | RESTORE_REST |
176 | testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? | 287 | testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? |
177 | je int_ret_from_sys_call | 288 | je int_ret_from_sys_call |
178 | testl $_TIF_IA32,threadinfo_flags(%rcx) | 289 | testl $_TIF_IA32,TI_flags(%rcx) |
179 | jnz int_ret_from_sys_call | 290 | jnz int_ret_from_sys_call |
180 | RESTORE_TOP_OF_STACK %rdi,ARGOFFSET | 291 | RESTORE_TOP_OF_STACK %rdi,ARGOFFSET |
181 | jmp ret_from_sys_call | 292 | jmp ret_from_sys_call |
@@ -244,8 +355,9 @@ ENTRY(system_call_after_swapgs) | |||
244 | movq %rcx,RIP-ARGOFFSET(%rsp) | 355 | movq %rcx,RIP-ARGOFFSET(%rsp) |
245 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | 356 | CFI_REL_OFFSET rip,RIP-ARGOFFSET |
246 | GET_THREAD_INFO(%rcx) | 357 | GET_THREAD_INFO(%rcx) |
247 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) | 358 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) |
248 | jnz tracesys | 359 | jnz tracesys |
360 | system_call_fastpath: | ||
249 | cmpq $__NR_syscall_max,%rax | 361 | cmpq $__NR_syscall_max,%rax |
250 | ja badsys | 362 | ja badsys |
251 | movq %r10,%rcx | 363 | movq %r10,%rcx |
@@ -263,7 +375,7 @@ sysret_check: | |||
263 | GET_THREAD_INFO(%rcx) | 375 | GET_THREAD_INFO(%rcx) |
264 | DISABLE_INTERRUPTS(CLBR_NONE) | 376 | DISABLE_INTERRUPTS(CLBR_NONE) |
265 | TRACE_IRQS_OFF | 377 | TRACE_IRQS_OFF |
266 | movl threadinfo_flags(%rcx),%edx | 378 | movl TI_flags(%rcx),%edx |
267 | andl %edi,%edx | 379 | andl %edi,%edx |
268 | jnz sysret_careful | 380 | jnz sysret_careful |
269 | CFI_REMEMBER_STATE | 381 | CFI_REMEMBER_STATE |
@@ -275,7 +387,8 @@ sysret_check: | |||
275 | CFI_REGISTER rip,rcx | 387 | CFI_REGISTER rip,rcx |
276 | RESTORE_ARGS 0,-ARG_SKIP,1 | 388 | RESTORE_ARGS 0,-ARG_SKIP,1 |
277 | /*CFI_REGISTER rflags,r11*/ | 389 | /*CFI_REGISTER rflags,r11*/ |
278 | ENABLE_INTERRUPTS_SYSCALL_RET | 390 | movq %gs:pda_oldrsp, %rsp |
391 | USERGS_SYSRET64 | ||
279 | 392 | ||
280 | CFI_RESTORE_STATE | 393 | CFI_RESTORE_STATE |
281 | /* Handle reschedules */ | 394 | /* Handle reschedules */ |
@@ -296,16 +409,16 @@ sysret_careful: | |||
296 | sysret_signal: | 409 | sysret_signal: |
297 | TRACE_IRQS_ON | 410 | TRACE_IRQS_ON |
298 | ENABLE_INTERRUPTS(CLBR_NONE) | 411 | ENABLE_INTERRUPTS(CLBR_NONE) |
299 | testl $_TIF_DO_NOTIFY_MASK,%edx | 412 | #ifdef CONFIG_AUDITSYSCALL |
300 | jz 1f | 413 | bt $TIF_SYSCALL_AUDIT,%edx |
301 | 414 | jc sysret_audit | |
302 | /* Really a signal */ | 415 | #endif |
303 | /* edx: work flags (arg3) */ | 416 | /* edx: work flags (arg3) */ |
304 | leaq do_notify_resume(%rip),%rax | 417 | leaq do_notify_resume(%rip),%rax |
305 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | 418 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 |
306 | xorl %esi,%esi # oldset -> arg2 | 419 | xorl %esi,%esi # oldset -> arg2 |
307 | call ptregscall_common | 420 | call ptregscall_common |
308 | 1: movl $_TIF_NEED_RESCHED,%edi | 421 | movl $_TIF_WORK_MASK,%edi |
309 | /* Use IRET because user could have changed frame. This | 422 | /* Use IRET because user could have changed frame. This |
310 | works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ | 423 | works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ |
311 | DISABLE_INTERRUPTS(CLBR_NONE) | 424 | DISABLE_INTERRUPTS(CLBR_NONE) |
@@ -316,14 +429,56 @@ badsys: | |||
316 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | 429 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) |
317 | jmp ret_from_sys_call | 430 | jmp ret_from_sys_call |
318 | 431 | ||
432 | #ifdef CONFIG_AUDITSYSCALL | ||
433 | /* | ||
434 | * Fast path for syscall audit without full syscall trace. | ||
435 | * We just call audit_syscall_entry() directly, and then | ||
436 | * jump back to the normal fast path. | ||
437 | */ | ||
438 | auditsys: | ||
439 | movq %r10,%r9 /* 6th arg: 4th syscall arg */ | ||
440 | movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ | ||
441 | movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ | ||
442 | movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ | ||
443 | movq %rax,%rsi /* 2nd arg: syscall number */ | ||
444 | movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ | ||
445 | call audit_syscall_entry | ||
446 | LOAD_ARGS 0 /* reload call-clobbered registers */ | ||
447 | jmp system_call_fastpath | ||
448 | |||
449 | /* | ||
450 | * Return fast path for syscall audit. Call audit_syscall_exit() | ||
451 | * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT | ||
452 | * masked off. | ||
453 | */ | ||
454 | sysret_audit: | ||
455 | movq %rax,%rsi /* second arg, syscall return value */ | ||
456 | cmpq $0,%rax /* is it < 0? */ | ||
457 | setl %al /* 1 if so, 0 if not */ | ||
458 | movzbl %al,%edi /* zero-extend that into %edi */ | ||
459 | inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ | ||
460 | call audit_syscall_exit | ||
461 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi | ||
462 | jmp sysret_check | ||
463 | #endif /* CONFIG_AUDITSYSCALL */ | ||
464 | |||
319 | /* Do syscall tracing */ | 465 | /* Do syscall tracing */ |
320 | tracesys: | 466 | tracesys: |
467 | #ifdef CONFIG_AUDITSYSCALL | ||
468 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) | ||
469 | jz auditsys | ||
470 | #endif | ||
321 | SAVE_REST | 471 | SAVE_REST |
322 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ | 472 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ |
323 | FIXUP_TOP_OF_STACK %rdi | 473 | FIXUP_TOP_OF_STACK %rdi |
324 | movq %rsp,%rdi | 474 | movq %rsp,%rdi |
325 | call syscall_trace_enter | 475 | call syscall_trace_enter |
326 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | 476 | /* |
477 | * Reload arg registers from stack in case ptrace changed them. | ||
478 | * We don't reload %rax because syscall_trace_enter() returned | ||
479 | * the value it wants us to use in the table lookup. | ||
480 | */ | ||
481 | LOAD_ARGS ARGOFFSET, 1 | ||
327 | RESTORE_REST | 482 | RESTORE_REST |
328 | cmpq $__NR_syscall_max,%rax | 483 | cmpq $__NR_syscall_max,%rax |
329 | ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ | 484 | ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ |
@@ -337,6 +492,7 @@ tracesys: | |||
337 | * Has correct top of stack, but partial stack frame. | 492 | * Has correct top of stack, but partial stack frame. |
338 | */ | 493 | */ |
339 | .globl int_ret_from_sys_call | 494 | .globl int_ret_from_sys_call |
495 | .globl int_with_check | ||
340 | int_ret_from_sys_call: | 496 | int_ret_from_sys_call: |
341 | DISABLE_INTERRUPTS(CLBR_NONE) | 497 | DISABLE_INTERRUPTS(CLBR_NONE) |
342 | TRACE_IRQS_OFF | 498 | TRACE_IRQS_OFF |
@@ -347,10 +503,10 @@ int_ret_from_sys_call: | |||
347 | int_with_check: | 503 | int_with_check: |
348 | LOCKDEP_SYS_EXIT_IRQ | 504 | LOCKDEP_SYS_EXIT_IRQ |
349 | GET_THREAD_INFO(%rcx) | 505 | GET_THREAD_INFO(%rcx) |
350 | movl threadinfo_flags(%rcx),%edx | 506 | movl TI_flags(%rcx),%edx |
351 | andl %edi,%edx | 507 | andl %edi,%edx |
352 | jnz int_careful | 508 | jnz int_careful |
353 | andl $~TS_COMPAT,threadinfo_status(%rcx) | 509 | andl $~TS_COMPAT,TI_status(%rcx) |
354 | jmp retint_swapgs | 510 | jmp retint_swapgs |
355 | 511 | ||
356 | /* Either reschedule or signal or syscall exit tracking needed. */ | 512 | /* Either reschedule or signal or syscall exit tracking needed. */ |
@@ -376,7 +532,7 @@ int_very_careful: | |||
376 | ENABLE_INTERRUPTS(CLBR_NONE) | 532 | ENABLE_INTERRUPTS(CLBR_NONE) |
377 | SAVE_REST | 533 | SAVE_REST |
378 | /* Check for syscall exit trace */ | 534 | /* Check for syscall exit trace */ |
379 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx | 535 | testl $_TIF_WORK_SYSCALL_EXIT,%edx |
380 | jz int_signal | 536 | jz int_signal |
381 | pushq %rdi | 537 | pushq %rdi |
382 | CFI_ADJUST_CFA_OFFSET 8 | 538 | CFI_ADJUST_CFA_OFFSET 8 |
@@ -384,7 +540,7 @@ int_very_careful: | |||
384 | call syscall_trace_leave | 540 | call syscall_trace_leave |
385 | popq %rdi | 541 | popq %rdi |
386 | CFI_ADJUST_CFA_OFFSET -8 | 542 | CFI_ADJUST_CFA_OFFSET -8 |
387 | andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi | 543 | andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi |
388 | jmp int_restore_rest | 544 | jmp int_restore_rest |
389 | 545 | ||
390 | int_signal: | 546 | int_signal: |
@@ -393,7 +549,7 @@ int_signal: | |||
393 | movq %rsp,%rdi # &ptregs -> arg1 | 549 | movq %rsp,%rdi # &ptregs -> arg1 |
394 | xorl %esi,%esi # oldset -> arg2 | 550 | xorl %esi,%esi # oldset -> arg2 |
395 | call do_notify_resume | 551 | call do_notify_resume |
396 | 1: movl $_TIF_NEED_RESCHED,%edi | 552 | 1: movl $_TIF_WORK_MASK,%edi |
397 | int_restore_rest: | 553 | int_restore_rest: |
398 | RESTORE_REST | 554 | RESTORE_REST |
399 | DISABLE_INTERRUPTS(CLBR_NONE) | 555 | DISABLE_INTERRUPTS(CLBR_NONE) |
@@ -420,7 +576,6 @@ END(\label) | |||
420 | PTREGSCALL stub_clone, sys_clone, %r8 | 576 | PTREGSCALL stub_clone, sys_clone, %r8 |
421 | PTREGSCALL stub_fork, sys_fork, %rdi | 577 | PTREGSCALL stub_fork, sys_fork, %rdi |
422 | PTREGSCALL stub_vfork, sys_vfork, %rdi | 578 | PTREGSCALL stub_vfork, sys_vfork, %rdi |
423 | PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx | ||
424 | PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx | 579 | PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx |
425 | PTREGSCALL stub_iopl, sys_iopl, %rsi | 580 | PTREGSCALL stub_iopl, sys_iopl, %rsi |
426 | 581 | ||
@@ -559,7 +714,7 @@ retint_with_reschedule: | |||
559 | movl $_TIF_WORK_MASK,%edi | 714 | movl $_TIF_WORK_MASK,%edi |
560 | retint_check: | 715 | retint_check: |
561 | LOCKDEP_SYS_EXIT_IRQ | 716 | LOCKDEP_SYS_EXIT_IRQ |
562 | movl threadinfo_flags(%rcx),%edx | 717 | movl TI_flags(%rcx),%edx |
563 | andl %edi,%edx | 718 | andl %edi,%edx |
564 | CFI_REMEMBER_STATE | 719 | CFI_REMEMBER_STATE |
565 | jnz retint_careful | 720 | jnz retint_careful |
@@ -647,17 +802,16 @@ retint_signal: | |||
647 | RESTORE_REST | 802 | RESTORE_REST |
648 | DISABLE_INTERRUPTS(CLBR_NONE) | 803 | DISABLE_INTERRUPTS(CLBR_NONE) |
649 | TRACE_IRQS_OFF | 804 | TRACE_IRQS_OFF |
650 | movl $_TIF_NEED_RESCHED,%edi | ||
651 | GET_THREAD_INFO(%rcx) | 805 | GET_THREAD_INFO(%rcx) |
652 | jmp retint_check | 806 | jmp retint_with_reschedule |
653 | 807 | ||
654 | #ifdef CONFIG_PREEMPT | 808 | #ifdef CONFIG_PREEMPT |
655 | /* Returning to kernel space. Check if we need preemption */ | 809 | /* Returning to kernel space. Check if we need preemption */ |
656 | /* rcx: threadinfo. interrupts off. */ | 810 | /* rcx: threadinfo. interrupts off. */ |
657 | ENTRY(retint_kernel) | 811 | ENTRY(retint_kernel) |
658 | cmpl $0,threadinfo_preempt_count(%rcx) | 812 | cmpl $0,TI_preempt_count(%rcx) |
659 | jnz retint_restore_args | 813 | jnz retint_restore_args |
660 | bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) | 814 | bt $TIF_NEED_RESCHED,TI_flags(%rcx) |
661 | jnc retint_restore_args | 815 | jnc retint_restore_args |
662 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | 816 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ |
663 | jnc retint_restore_args | 817 | jnc retint_restore_args |
@@ -711,6 +865,9 @@ END(invalidate_interrupt\num) | |||
711 | ENTRY(call_function_interrupt) | 865 | ENTRY(call_function_interrupt) |
712 | apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt | 866 | apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt |
713 | END(call_function_interrupt) | 867 | END(call_function_interrupt) |
868 | ENTRY(call_function_single_interrupt) | ||
869 | apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt | ||
870 | END(call_function_single_interrupt) | ||
714 | ENTRY(irq_move_cleanup_interrupt) | 871 | ENTRY(irq_move_cleanup_interrupt) |
715 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt | 872 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt |
716 | END(irq_move_cleanup_interrupt) | 873 | END(irq_move_cleanup_interrupt) |
@@ -720,6 +877,10 @@ ENTRY(apic_timer_interrupt) | |||
720 | apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt | 877 | apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt |
721 | END(apic_timer_interrupt) | 878 | END(apic_timer_interrupt) |
722 | 879 | ||
880 | ENTRY(uv_bau_message_intr1) | ||
881 | apicinterrupt 220,uv_bau_message_interrupt | ||
882 | END(uv_bau_message_intr1) | ||
883 | |||
723 | ENTRY(error_interrupt) | 884 | ENTRY(error_interrupt) |
724 | apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt | 885 | apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt |
725 | END(error_interrupt) | 886 | END(error_interrupt) |
@@ -733,6 +894,7 @@ END(spurious_interrupt) | |||
733 | */ | 894 | */ |
734 | .macro zeroentry sym | 895 | .macro zeroentry sym |
735 | INTR_FRAME | 896 | INTR_FRAME |
897 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
736 | pushq $0 /* push error code/oldrax */ | 898 | pushq $0 /* push error code/oldrax */ |
737 | CFI_ADJUST_CFA_OFFSET 8 | 899 | CFI_ADJUST_CFA_OFFSET 8 |
738 | pushq %rax /* push real oldrax to the rdi slot */ | 900 | pushq %rax /* push real oldrax to the rdi slot */ |
@@ -745,6 +907,7 @@ END(spurious_interrupt) | |||
745 | 907 | ||
746 | .macro errorentry sym | 908 | .macro errorentry sym |
747 | XCPT_FRAME | 909 | XCPT_FRAME |
910 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
748 | pushq %rax | 911 | pushq %rax |
749 | CFI_ADJUST_CFA_OFFSET 8 | 912 | CFI_ADJUST_CFA_OFFSET 8 |
750 | CFI_REL_OFFSET rax,0 | 913 | CFI_REL_OFFSET rax,0 |
@@ -814,7 +977,7 @@ paranoid_restore\trace: | |||
814 | jmp irq_return | 977 | jmp irq_return |
815 | paranoid_userspace\trace: | 978 | paranoid_userspace\trace: |
816 | GET_THREAD_INFO(%rcx) | 979 | GET_THREAD_INFO(%rcx) |
817 | movl threadinfo_flags(%rcx),%ebx | 980 | movl TI_flags(%rcx),%ebx |
818 | andl $_TIF_WORK_MASK,%ebx | 981 | andl $_TIF_WORK_MASK,%ebx |
819 | jz paranoid_swapgs\trace | 982 | jz paranoid_swapgs\trace |
820 | movq %rsp,%rdi /* &pt_regs */ | 983 | movq %rsp,%rdi /* &pt_regs */ |
@@ -912,7 +1075,7 @@ error_exit: | |||
912 | testl %eax,%eax | 1075 | testl %eax,%eax |
913 | jne retint_kernel | 1076 | jne retint_kernel |
914 | LOCKDEP_SYS_EXIT_IRQ | 1077 | LOCKDEP_SYS_EXIT_IRQ |
915 | movl threadinfo_flags(%rcx),%edx | 1078 | movl TI_flags(%rcx),%edx |
916 | movl $_TIF_WORK_MASK,%edi | 1079 | movl $_TIF_WORK_MASK,%edi |
917 | andl %edi,%edx | 1080 | andl %edi,%edx |
918 | jnz retint_careful | 1081 | jnz retint_careful |
@@ -926,11 +1089,11 @@ error_kernelspace: | |||
926 | iret run with kernel gs again, so don't set the user space flag. | 1089 | iret run with kernel gs again, so don't set the user space flag. |
928 | B stepping K8s sometimes report a truncated RIP for IRET | 1090 | B stepping K8s sometimes report a truncated RIP for IRET |
928 | exceptions returning to compat mode. Check for these here too. */ | 1091 | exceptions returning to compat mode. Check for these here too. */ |
929 | leaq irq_return(%rip),%rbp | 1092 | leaq irq_return(%rip),%rcx |
930 | cmpq %rbp,RIP(%rsp) | 1093 | cmpq %rcx,RIP(%rsp) |
931 | je error_swapgs | 1094 | je error_swapgs |
932 | movl %ebp,%ebp /* zero extend */ | 1095 | movl %ecx,%ecx /* zero extend */ |
933 | cmpq %rbp,RIP(%rsp) | 1096 | cmpq %rcx,RIP(%rsp) |
934 | je error_swapgs | 1097 | je error_swapgs |
935 | cmpq $gs_change,RIP(%rsp) | 1098 | cmpq $gs_change,RIP(%rsp) |
936 | je error_swapgs | 1099 | je error_swapgs |
@@ -939,7 +1102,7 @@ KPROBE_END(error_entry) | |||
939 | 1102 | ||
940 | /* Reload gs selector with exception handling */ | 1103 | /* Reload gs selector with exception handling */ |
941 | /* edi: new selector */ | 1104 | /* edi: new selector */ |
942 | ENTRY(load_gs_index) | 1105 | ENTRY(native_load_gs_index) |
943 | CFI_STARTPROC | 1106 | CFI_STARTPROC |
944 | pushf | 1107 | pushf |
945 | CFI_ADJUST_CFA_OFFSET 8 | 1108 | CFI_ADJUST_CFA_OFFSET 8 |
@@ -953,7 +1116,7 @@ gs_change: | |||
953 | CFI_ADJUST_CFA_OFFSET -8 | 1116 | CFI_ADJUST_CFA_OFFSET -8 |
954 | ret | 1117 | ret |
955 | CFI_ENDPROC | 1118 | CFI_ENDPROC |
956 | ENDPROC(load_gs_index) | 1119 | ENDPROC(native_load_gs_index) |
957 | 1120 | ||
958 | .section __ex_table,"a" | 1121 | .section __ex_table,"a" |
959 | .align 8 | 1122 | .align 8 |
@@ -1075,6 +1238,7 @@ END(device_not_available) | |||
1075 | /* runs on exception stack */ | 1238 | /* runs on exception stack */ |
1076 | KPROBE_ENTRY(debug) | 1239 | KPROBE_ENTRY(debug) |
1077 | INTR_FRAME | 1240 | INTR_FRAME |
1241 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
1078 | pushq $0 | 1242 | pushq $0 |
1079 | CFI_ADJUST_CFA_OFFSET 8 | 1243 | CFI_ADJUST_CFA_OFFSET 8 |
1080 | paranoidentry do_debug, DEBUG_STACK | 1244 | paranoidentry do_debug, DEBUG_STACK |
@@ -1084,6 +1248,7 @@ KPROBE_END(debug) | |||
1084 | /* runs on exception stack */ | 1248 | /* runs on exception stack */ |
1085 | KPROBE_ENTRY(nmi) | 1249 | KPROBE_ENTRY(nmi) |
1086 | INTR_FRAME | 1250 | INTR_FRAME |
1251 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
1087 | pushq $-1 | 1252 | pushq $-1 |
1088 | CFI_ADJUST_CFA_OFFSET 8 | 1253 | CFI_ADJUST_CFA_OFFSET 8 |
1089 | paranoidentry do_nmi, 0, 0 | 1254 | paranoidentry do_nmi, 0, 0 |
@@ -1097,6 +1262,7 @@ KPROBE_END(nmi) | |||
1097 | 1262 | ||
1098 | KPROBE_ENTRY(int3) | 1263 | KPROBE_ENTRY(int3) |
1099 | INTR_FRAME | 1264 | INTR_FRAME |
1265 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
1100 | pushq $0 | 1266 | pushq $0 |
1101 | CFI_ADJUST_CFA_OFFSET 8 | 1267 | CFI_ADJUST_CFA_OFFSET 8 |
1102 | paranoidentry do_int3, DEBUG_STACK | 1268 | paranoidentry do_int3, DEBUG_STACK |
@@ -1120,13 +1286,10 @@ ENTRY(coprocessor_segment_overrun) | |||
1120 | zeroentry do_coprocessor_segment_overrun | 1286 | zeroentry do_coprocessor_segment_overrun |
1121 | END(coprocessor_segment_overrun) | 1287 | END(coprocessor_segment_overrun) |
1122 | 1288 | ||
1123 | ENTRY(reserved) | ||
1124 | zeroentry do_reserved | ||
1125 | END(reserved) | ||
1126 | |||
1127 | /* runs on exception stack */ | 1289 | /* runs on exception stack */ |
1128 | ENTRY(double_fault) | 1290 | ENTRY(double_fault) |
1129 | XCPT_FRAME | 1291 | XCPT_FRAME |
1292 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
1130 | paranoidentry do_double_fault | 1293 | paranoidentry do_double_fault |
1131 | jmp paranoid_exit1 | 1294 | jmp paranoid_exit1 |
1132 | CFI_ENDPROC | 1295 | CFI_ENDPROC |
@@ -1143,6 +1306,7 @@ END(segment_not_present) | |||
1143 | /* runs on exception stack */ | 1306 | /* runs on exception stack */ |
1144 | ENTRY(stack_segment) | 1307 | ENTRY(stack_segment) |
1145 | XCPT_FRAME | 1308 | XCPT_FRAME |
1309 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
1146 | paranoidentry do_stack_segment | 1310 | paranoidentry do_stack_segment |
1147 | jmp paranoid_exit1 | 1311 | jmp paranoid_exit1 |
1148 | CFI_ENDPROC | 1312 | CFI_ENDPROC |
@@ -1168,6 +1332,7 @@ END(spurious_interrupt_bug) | |||
1168 | /* runs on exception stack */ | 1332 | /* runs on exception stack */ |
1169 | ENTRY(machine_check) | 1333 | ENTRY(machine_check) |
1170 | INTR_FRAME | 1334 | INTR_FRAME |
1335 | PARAVIRT_ADJUST_EXCEPTION_FRAME | ||
1171 | pushq $0 | 1336 | pushq $0 |
1172 | CFI_ADJUST_CFA_OFFSET 8 | 1337 | CFI_ADJUST_CFA_OFFSET 8 |
1173 | paranoidentry do_machine_check | 1338 | paranoidentry do_machine_check |
@@ -1202,3 +1367,103 @@ KPROBE_ENTRY(ignore_sysret) | |||
1202 | sysret | 1367 | sysret |
1203 | CFI_ENDPROC | 1368 | CFI_ENDPROC |
1204 | ENDPROC(ignore_sysret) | 1369 | ENDPROC(ignore_sysret) |
1370 | |||
1371 | #ifdef CONFIG_XEN | ||
1372 | ENTRY(xen_hypervisor_callback) | ||
1373 | zeroentry xen_do_hypervisor_callback | ||
1374 | END(xen_hypervisor_callback) | ||
1375 | |||
1376 | /* | ||
1377 | # A note on the "critical region" in our callback handler. | ||
1378 | # We want to avoid stacking callback handlers due to events occurring | ||
1379 | # during handling of the last event. To do this, we keep events disabled | ||
1380 | # until we've done all processing. HOWEVER, we must enable events before | ||
1381 | # popping the stack frame (can't be done atomically) and so it would still | ||
1382 | # be possible to get enough handler activations to overflow the stack. | ||
1383 | # Although unlikely, bugs of that kind are hard to track down, so we'd | ||
1384 | # like to avoid the possibility. | ||
1385 | # So, on entry to the handler we detect whether we interrupted an | ||
1386 | # existing activation in its critical region -- if so, we pop the current | ||
1387 | # activation and restart the handler using the previous one. | ||
1388 | */ | ||
1389 | ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) | ||
1390 | CFI_STARTPROC | ||
1391 | /* Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will | ||
1392 | see the correct pointer to the pt_regs */ | ||
1393 | movq %rdi, %rsp # we don't return, adjust the stack frame | ||
1394 | CFI_ENDPROC | ||
1395 | CFI_DEFAULT_STACK | ||
1396 | 11: incl %gs:pda_irqcount | ||
1397 | movq %rsp,%rbp | ||
1398 | CFI_DEF_CFA_REGISTER rbp | ||
1399 | cmovzq %gs:pda_irqstackptr,%rsp | ||
1400 | pushq %rbp # backlink for old unwinder | ||
1401 | call xen_evtchn_do_upcall | ||
1402 | popq %rsp | ||
1403 | CFI_DEF_CFA_REGISTER rsp | ||
1404 | decl %gs:pda_irqcount | ||
1405 | jmp error_exit | ||
1406 | CFI_ENDPROC | ||
1407 | END(do_hypervisor_callback) | ||
1408 | |||
1409 | /* | ||
1410 | # Hypervisor uses this for application faults while it executes. | ||
1411 | # We get here for two reasons: | ||
1412 | # 1. Fault while reloading DS, ES, FS or GS | ||
1413 | # 2. Fault while executing IRET | ||
1414 | # Category 1 we do not need to fix up as Xen has already reloaded all segment | ||
1415 | # registers that could be reloaded and zeroed the others. | ||
1416 | # Category 2 we fix up by killing the current process. We cannot use the | ||
1417 | # normal Linux return path in this case because if we use the IRET hypercall | ||
1418 | # to pop the stack frame we end up in an infinite loop of failsafe callbacks. | ||
1419 | # We distinguish between categories by comparing each saved segment register | ||
1420 | # with its current contents: any discrepancy means we are in category 1. | ||
1421 | */ | ||
1422 | ENTRY(xen_failsafe_callback) | ||
1423 | framesz = (RIP-0x30) /* workaround buggy gas */ | ||
1424 | _frame framesz | ||
1425 | CFI_REL_OFFSET rcx, 0 | ||
1426 | CFI_REL_OFFSET r11, 8 | ||
1427 | movw %ds,%cx | ||
1428 | cmpw %cx,0x10(%rsp) | ||
1429 | CFI_REMEMBER_STATE | ||
1430 | jne 1f | ||
1431 | movw %es,%cx | ||
1432 | cmpw %cx,0x18(%rsp) | ||
1433 | jne 1f | ||
1434 | movw %fs,%cx | ||
1435 | cmpw %cx,0x20(%rsp) | ||
1436 | jne 1f | ||
1437 | movw %gs,%cx | ||
1438 | cmpw %cx,0x28(%rsp) | ||
1439 | jne 1f | ||
1440 | /* All segments match their saved values => Category 2 (Bad IRET). */ | ||
1441 | movq (%rsp),%rcx | ||
1442 | CFI_RESTORE rcx | ||
1443 | movq 8(%rsp),%r11 | ||
1444 | CFI_RESTORE r11 | ||
1445 | addq $0x30,%rsp | ||
1446 | CFI_ADJUST_CFA_OFFSET -0x30 | ||
1447 | pushq $0 | ||
1448 | CFI_ADJUST_CFA_OFFSET 8 | ||
1449 | pushq %r11 | ||
1450 | CFI_ADJUST_CFA_OFFSET 8 | ||
1451 | pushq %rcx | ||
1452 | CFI_ADJUST_CFA_OFFSET 8 | ||
1453 | jmp general_protection | ||
1454 | CFI_RESTORE_STATE | ||
1455 | 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ | ||
1456 | movq (%rsp),%rcx | ||
1457 | CFI_RESTORE rcx | ||
1458 | movq 8(%rsp),%r11 | ||
1459 | CFI_RESTORE r11 | ||
1460 | addq $0x30,%rsp | ||
1461 | CFI_ADJUST_CFA_OFFSET -0x30 | ||
1462 | pushq $0 | ||
1463 | CFI_ADJUST_CFA_OFFSET 8 | ||
1464 | SAVE_ALL | ||
1465 | jmp error_exit | ||
1466 | CFI_ENDPROC | ||
1467 | END(xen_failsafe_callback) | ||
1468 | |||
1469 | #endif /* CONFIG_XEN */ | ||
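The sysret_audit path above turns the syscall return value in %rax into the first argument of audit_syscall_exit() with a cmpq/setl/movzbl/inc sequence. A C rendering of that mapping, as a sketch (the AUDITSC_SUCCESS=1 / AUDITSC_FAILURE=2 values come from the inline comment, not from this patch's headers):

/* AUDITSC_SUCCESS = 1, AUDITSC_FAILURE = 2, per the inline comment. */
static inline int audit_result(long rax)
{
	/*
	 * cmpq $0,%rax; setl %al    -> 1 if rax is negative (an -errno)
	 * movzbl %al,%edi; inc %edi -> 0 becomes 1, 1 becomes 2
	 */
	return (rax < 0) + 1;
}

/* The stub then issues, in effect:
 *	audit_syscall_exit(audit_result(rax), rax);
 * before rejoining sysret_check with TIF_SYSCALL_AUDIT masked out. */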
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 000000000000..ab115cd15fdf --- /dev/null +++ b/arch/x86/kernel/ftrace.c | |||
@@ -0,0 +1,141 @@ | |||
1 | /* | ||
2 | * Code for replacing ftrace calls with jumps. | ||
3 | * | ||
4 | * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> | ||
5 | * | ||
6 | * Thanks goes to Ingo Molnar, for suggesting the idea. | ||
7 | * Mathieu Desnoyers, for suggesting postponing the modifications. | ||
8 | * Arjan van de Ven, for keeping me straight, and explaining to me | ||
9 | * the dangers of modifying code on the run. | ||
10 | */ | ||
11 | |||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/hardirq.h> | ||
14 | #include <linux/ftrace.h> | ||
15 | #include <linux/percpu.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/list.h> | ||
18 | |||
19 | #include <asm/alternative.h> | ||
20 | #include <asm/ftrace.h> | ||
21 | |||
22 | |||
23 | /* Long is fine, even if it is only 4 bytes ;-) */ | ||
24 | static long *ftrace_nop; | ||
25 | |||
26 | union ftrace_code_union { | ||
27 | char code[MCOUNT_INSN_SIZE]; | ||
28 | struct { | ||
29 | char e8; | ||
30 | int offset; | ||
31 | } __attribute__((packed)); | ||
32 | }; | ||
33 | |||
34 | |||
35 | static int notrace ftrace_calc_offset(long ip, long addr) | ||
36 | { | ||
37 | return (int)(addr - ip); | ||
38 | } | ||
39 | |||
40 | notrace unsigned char *ftrace_nop_replace(void) | ||
41 | { | ||
42 | return (char *)ftrace_nop; | ||
43 | } | ||
44 | |||
45 | notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) | ||
46 | { | ||
47 | static union ftrace_code_union calc; | ||
48 | |||
49 | calc.e8 = 0xe8; | ||
50 | calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); | ||
51 | |||
52 | /* | ||
53 | * No locking needed, this must be called via kstop_machine | ||
54 | * which in essence is like running on a uniprocessor machine. | ||
55 | */ | ||
56 | return calc.code; | ||
57 | } | ||
58 | |||
59 | notrace int | ||
60 | ftrace_modify_code(unsigned long ip, unsigned char *old_code, | ||
61 | unsigned char *new_code) | ||
62 | { | ||
63 | unsigned replaced; | ||
64 | unsigned old = *(unsigned *)old_code; /* 4 bytes */ | ||
65 | unsigned new = *(unsigned *)new_code; /* 4 bytes */ | ||
66 | unsigned char newch = new_code[4]; | ||
67 | int faulted = 0; | ||
68 | |||
69 | /* | ||
70 | * Note: Due to modules and __init, code can | ||
71 | * disappear and change, so we need to protect against faulting | ||
72 | * as well as code changing. | ||
73 | * | ||
74 | * No real locking needed, this code is run through | ||
75 | * kstop_machine. | ||
76 | */ | ||
77 | asm volatile ( | ||
78 | "1: lock\n" | ||
79 | " cmpxchg %3, (%2)\n" | ||
80 | " jnz 2f\n" | ||
81 | " movb %b4, 4(%2)\n" | ||
82 | "2:\n" | ||
83 | ".section .fixup, \"ax\"\n" | ||
84 | "3: movl $1, %0\n" | ||
85 | " jmp 2b\n" | ||
86 | ".previous\n" | ||
87 | _ASM_EXTABLE(1b, 3b) | ||
88 | : "=r"(faulted), "=a"(replaced) | ||
89 | : "r"(ip), "r"(new), "c"(newch), | ||
90 | "0"(faulted), "a"(old) | ||
91 | : "memory"); | ||
92 | sync_core(); | ||
93 | |||
94 | if (replaced != old && replaced != new) | ||
95 | faulted = 2; | ||
96 | |||
97 | return faulted; | ||
98 | } | ||
99 | |||
100 | notrace int ftrace_update_ftrace_func(ftrace_func_t func) | ||
101 | { | ||
102 | unsigned long ip = (unsigned long)(&ftrace_call); | ||
103 | unsigned char old[MCOUNT_INSN_SIZE], *new; | ||
104 | int ret; | ||
105 | |||
106 | memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); | ||
107 | new = ftrace_call_replace(ip, (unsigned long)func); | ||
108 | ret = ftrace_modify_code(ip, old, new); | ||
109 | |||
110 | return ret; | ||
111 | } | ||
112 | |||
113 | notrace int ftrace_mcount_set(unsigned long *data) | ||
114 | { | ||
115 | unsigned long ip = (long)(&mcount_call); | ||
116 | unsigned long *addr = data; | ||
117 | unsigned char old[MCOUNT_INSN_SIZE], *new; | ||
118 | |||
119 | /* | ||
120 | * Replace the mcount stub with a pointer to the | ||
121 | * ip recorder function. | ||
122 | */ | ||
123 | memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); | ||
124 | new = ftrace_call_replace(ip, *addr); | ||
125 | *addr = ftrace_modify_code(ip, old, new); | ||
126 | |||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | int __init ftrace_dyn_arch_init(void *data) | ||
131 | { | ||
132 | const unsigned char *const *noptable = find_nop_table(); | ||
133 | |||
134 | /* This is running in kstop_machine */ | ||
135 | |||
136 | ftrace_mcount_set(data); | ||
137 | |||
138 | ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; | ||
139 | |||
140 | return 0; | ||
141 | } | ||
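ftrace_call_replace() above packs a one-byte opcode and a four-byte displacement into MCOUNT_INSN_SIZE bytes; on x86 that is the e8 rel32 near-call encoding, with the displacement measured from the end of the instruction (hence the ip + MCOUNT_INSN_SIZE in ftrace_calc_offset()). A standalone user-space sketch of the same encoding, with arbitrary sample addresses:

#include <stdio.h>
#include <string.h>

#define MCOUNT_INSN_SIZE 5	/* sizeof(call rel32) on x86 */

static void make_call_insn(unsigned char buf[MCOUNT_INSN_SIZE],
			   unsigned long ip, unsigned long target)
{
	/* Displacement is relative to the end of the call instruction. */
	int offset = (int)(target - (ip + MCOUNT_INSN_SIZE));

	buf[0] = 0xe8;			/* near call opcode */
	memcpy(buf + 1, &offset, sizeof(offset));
}

int main(void)
{
	unsigned char buf[MCOUNT_INSN_SIZE];
	int i;

	/* e.g. a call site at 0x1000 redirected to a tracer at 0x2000 */
	make_call_insn(buf, 0x1000, 0x2000);
	for (i = 0; i < MCOUNT_INSN_SIZE; i++)
		printf("%02x ", buf[i]);
	printf("\n");			/* prints: e8 fb 0f 00 00 */
	return 0;
}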
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index cbaaf69bedb2..1fa8be5bd217 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c | |||
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void) | |||
51 | else | 51 | else |
52 | #endif | 52 | #endif |
53 | 53 | ||
54 | if (num_possible_cpus() <= 8) | 54 | if (max_physical_apicid < 8) |
55 | genapic = &apic_flat; | 55 | genapic = &apic_flat; |
56 | else | 56 | else |
57 | genapic = &apic_physflat; | 57 | genapic = &apic_physflat; |
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 1a9c68845ee8..786548a62d38 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c | |||
@@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) | |||
168 | * May as well be the first. | 168 | * May as well be the first. |
169 | */ | 169 | */ |
170 | cpu = first_cpu(cpumask); | 170 | cpu = first_cpu(cpumask); |
171 | if ((unsigned)cpu < NR_CPUS) | 171 | if ((unsigned)cpu < nr_cpu_ids) |
172 | return per_cpu(x86_cpu_to_apicid, cpu); | 172 | return per_cpu(x86_cpu_to_apicid, cpu); |
173 | else | 173 | else |
174 | return BAD_APICID; | 174 | return BAD_APICID; |
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index ebf13908a743..2cfcbded888a 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c | |||
@@ -5,9 +5,10 @@ | |||
5 | * | 5 | * |
6 | * SGI UV APIC functions (note: not an Intel compatible APIC) | 6 | * SGI UV APIC functions (note: not an Intel compatible APIC) |
7 | * | 7 | * |
8 | * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved. | 8 | * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/kernel.h> | ||
11 | #include <linux/threads.h> | 12 | #include <linux/threads.h> |
12 | #include <linux/cpumask.h> | 13 | #include <linux/cpumask.h> |
13 | #include <linux/string.h> | 14 | #include <linux/string.h> |
@@ -20,8 +21,10 @@ | |||
20 | #include <asm/smp.h> | 21 | #include <asm/smp.h> |
21 | #include <asm/ipi.h> | 22 | #include <asm/ipi.h> |
22 | #include <asm/genapic.h> | 23 | #include <asm/genapic.h> |
24 | #include <asm/pgtable.h> | ||
23 | #include <asm/uv/uv_mmrs.h> | 25 | #include <asm/uv/uv_mmrs.h> |
24 | #include <asm/uv/uv_hub.h> | 26 | #include <asm/uv/uv_hub.h> |
27 | #include <asm/uv/bios.h> | ||
25 | 28 | ||
26 | DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); | 29 | DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); |
27 | EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); | 30 | EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); |
@@ -38,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade); | |||
38 | short uv_possible_blades; | 41 | short uv_possible_blades; |
39 | EXPORT_SYMBOL_GPL(uv_possible_blades); | 42 | EXPORT_SYMBOL_GPL(uv_possible_blades); |
40 | 43 | ||
44 | unsigned long sn_rtc_cycles_per_second; | ||
45 | EXPORT_SYMBOL(sn_rtc_cycles_per_second); | ||
46 | |||
41 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | 47 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ |
42 | 48 | ||
43 | static cpumask_t uv_target_cpus(void) | 49 | static cpumask_t uv_target_cpus(void) |
@@ -55,44 +61,44 @@ static cpumask_t uv_vector_allocation_domain(int cpu) | |||
55 | int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) | 61 | int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) |
56 | { | 62 | { |
57 | unsigned long val; | 63 | unsigned long val; |
58 | int nasid; | 64 | int pnode; |
59 | 65 | ||
60 | nasid = uv_apicid_to_nasid(phys_apicid); | 66 | pnode = uv_apicid_to_pnode(phys_apicid); |
61 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | | 67 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | |
62 | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | | 68 | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | |
63 | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | | 69 | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | |
64 | APIC_DM_INIT; | 70 | APIC_DM_INIT; |
65 | uv_write_global_mmr64(nasid, UVH_IPI_INT, val); | 71 | uv_write_global_mmr64(pnode, UVH_IPI_INT, val); |
66 | mdelay(10); | 72 | mdelay(10); |
67 | 73 | ||
68 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | | 74 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | |
69 | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | | 75 | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | |
70 | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | | 76 | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | |
71 | APIC_DM_STARTUP; | 77 | APIC_DM_STARTUP; |
72 | uv_write_global_mmr64(nasid, UVH_IPI_INT, val); | 78 | uv_write_global_mmr64(pnode, UVH_IPI_INT, val); |
73 | return 0; | 79 | return 0; |
74 | } | 80 | } |
75 | 81 | ||
76 | static void uv_send_IPI_one(int cpu, int vector) | 82 | static void uv_send_IPI_one(int cpu, int vector) |
77 | { | 83 | { |
78 | unsigned long val, apicid, lapicid; | 84 | unsigned long val, apicid, lapicid; |
79 | int nasid; | 85 | int pnode; |
80 | 86 | ||
81 | apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ | 87 | apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ |
82 | lapicid = apicid & 0x3f; /* ZZZ macro needed */ | 88 | lapicid = apicid & 0x3f; /* ZZZ macro needed */ |
83 | nasid = uv_apicid_to_nasid(apicid); | 89 | pnode = uv_apicid_to_pnode(apicid); |
84 | val = | 90 | val = |
85 | (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << | 91 | (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << |
86 | UVH_IPI_INT_APIC_ID_SHFT) | | 92 | UVH_IPI_INT_APIC_ID_SHFT) | |
87 | (vector << UVH_IPI_INT_VECTOR_SHFT); | 93 | (vector << UVH_IPI_INT_VECTOR_SHFT); |
88 | uv_write_global_mmr64(nasid, UVH_IPI_INT, val); | 94 | uv_write_global_mmr64(pnode, UVH_IPI_INT, val); |
89 | } | 95 | } |
90 | 96 | ||
91 | static void uv_send_IPI_mask(cpumask_t mask, int vector) | 97 | static void uv_send_IPI_mask(cpumask_t mask, int vector) |
92 | { | 98 | { |
93 | unsigned int cpu; | 99 | unsigned int cpu; |
94 | 100 | ||
95 | for (cpu = 0; cpu < NR_CPUS; ++cpu) | 101 | for_each_possible_cpu(cpu) |
96 | if (cpu_isset(cpu, mask)) | 102 | if (cpu_isset(cpu, mask)) |
97 | uv_send_IPI_one(cpu, vector); | 103 | uv_send_IPI_one(cpu, vector); |
98 | } | 104 | } |
@@ -126,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) | |||
126 | * May as well be the first. | 132 | * May as well be the first. |
127 | */ | 133 | */ |
128 | cpu = first_cpu(cpumask); | 134 | cpu = first_cpu(cpumask); |
129 | if ((unsigned)cpu < NR_CPUS) | 135 | if ((unsigned)cpu < nr_cpu_ids) |
130 | return per_cpu(x86_cpu_to_apicid, cpu); | 136 | return per_cpu(x86_cpu_to_apicid, cpu); |
131 | else | 137 | else |
132 | return BAD_APICID; | 138 | return BAD_APICID; |
@@ -159,39 +165,163 @@ struct genapic apic_x2apic_uv_x = { | |||
159 | .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ | 165 | .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ |
160 | }; | 166 | }; |
161 | 167 | ||
162 | static __cpuinit void set_x2apic_extra_bits(int nasid) | 168 | static __cpuinit void set_x2apic_extra_bits(int pnode) |
163 | { | 169 | { |
164 | __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6); | 170 | __get_cpu_var(x2apic_extra_bits) = (pnode << 6); |
165 | } | 171 | } |
166 | 172 | ||
167 | /* | 173 | /* |
168 | * Called on boot cpu. | 174 | * Called on boot cpu. |
169 | */ | 175 | */ |
176 | static __init int boot_pnode_to_blade(int pnode) | ||
177 | { | ||
178 | int blade; | ||
179 | |||
180 | for (blade = 0; blade < uv_num_possible_blades(); blade++) | ||
181 | if (pnode == uv_blade_info[blade].pnode) | ||
182 | return blade; | ||
183 | BUG(); | ||
184 | } | ||
185 | |||
186 | struct redir_addr { | ||
187 | unsigned long redirect; | ||
188 | unsigned long alias; | ||
189 | }; | ||
190 | |||
191 | #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT | ||
192 | |||
193 | static __initdata struct redir_addr redir_addrs[] = { | ||
194 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, | ||
195 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, | ||
196 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, | ||
197 | }; | ||
198 | |||
199 | static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) | ||
200 | { | ||
201 | union uvh_si_alias0_overlay_config_u alias; | ||
202 | union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; | ||
203 | int i; | ||
204 | |||
205 | for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { | ||
206 | alias.v = uv_read_local_mmr(redir_addrs[i].alias); | ||
207 | if (alias.s.base == 0) { | ||
208 | *size = (1UL << alias.s.m_alias); | ||
209 | redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); | ||
210 | *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; | ||
211 | return; | ||
212 | } | ||
213 | } | ||
214 | BUG(); | ||
215 | } | ||
216 | |||
217 | static __init void map_low_mmrs(void) | ||
218 | { | ||
219 | init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); | ||
220 | init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); | ||
221 | } | ||
222 | |||
223 | enum map_type {map_wb, map_uc}; | ||
224 | |||
225 | static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) | ||
226 | { | ||
227 | unsigned long bytes, paddr; | ||
228 | |||
229 | paddr = base << shift; | ||
230 | bytes = (1UL << shift); | ||
231 | printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, | ||
232 | paddr + bytes); | ||
233 | if (map_type == map_uc) | ||
234 | init_extra_mapping_uc(paddr, bytes); | ||
235 | else | ||
236 | init_extra_mapping_wb(paddr, bytes); | ||
237 | |||
238 | } | ||
239 | static __init void map_gru_high(int max_pnode) | ||
240 | { | ||
241 | union uvh_rh_gam_gru_overlay_config_mmr_u gru; | ||
242 | int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
243 | |||
244 | gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); | ||
245 | if (gru.s.enable) | ||
246 | map_high("GRU", gru.s.base, shift, map_wb); | ||
247 | } | ||
248 | |||
249 | static __init void map_config_high(int max_pnode) | ||
250 | { | ||
251 | union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; | ||
252 | int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
253 | |||
254 | cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); | ||
255 | if (cfg.s.enable) | ||
256 | map_high("CONFIG", cfg.s.base, shift, map_uc); | ||
257 | } | ||
258 | |||
259 | static __init void map_mmr_high(int max_pnode) | ||
260 | { | ||
261 | union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; | ||
262 | int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
263 | |||
264 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); | ||
265 | if (mmr.s.enable) | ||
266 | map_high("MMR", mmr.s.base, shift, map_uc); | ||
267 | } | ||
268 | |||
269 | static __init void map_mmioh_high(int max_pnode) | ||
270 | { | ||
271 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; | ||
272 | int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
273 | |||
274 | mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); | ||
275 | if (mmioh.s.enable) | ||
276 | map_high("MMIOH", mmioh.s.base, shift, map_uc); | ||
277 | } | ||
278 | |||
279 | static __init void uv_rtc_init(void) | ||
280 | { | ||
281 | long status, ticks_per_sec, drift; | ||
282 | |||
283 | status = | ||
284 | x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, | ||
285 | &drift); | ||
286 | if (status != 0 || ticks_per_sec < 100000) { | ||
287 | printk(KERN_WARNING | ||
288 | "unable to determine platform RTC clock frequency, " | ||
289 | "guessing.\n"); | ||
290 | /* BIOS gives wrong value for clock freq. so guess */ | ||
291 | sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; | ||
292 | } else | ||
293 | sn_rtc_cycles_per_second = ticks_per_sec; | ||
294 | } | ||
295 | |||
170 | static __init void uv_system_init(void) | 296 | static __init void uv_system_init(void) |
171 | { | 297 | { |
172 | union uvh_si_addr_map_config_u m_n_config; | 298 | union uvh_si_addr_map_config_u m_n_config; |
173 | int bytes, nid, cpu, lcpu, nasid, last_nasid, blade; | 299 | union uvh_node_id_u node_id; |
174 | unsigned long mmr_base; | 300 | unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; |
301 | int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; | ||
302 | int max_pnode = 0; | ||
303 | unsigned long mmr_base, present; | ||
304 | |||
305 | map_low_mmrs(); | ||
175 | 306 | ||
176 | m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); | 307 | m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); |
308 | m_val = m_n_config.s.m_skt; | ||
309 | n_val = m_n_config.s.n_skt; | ||
177 | mmr_base = | 310 | mmr_base = |
178 | uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & | 311 | uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & |
179 | ~UV_MMR_ENABLE; | 312 | ~UV_MMR_ENABLE; |
180 | printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); | 313 | printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); |
181 | 314 | ||
182 | last_nasid = -1; | 315 | for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) |
183 | for_each_possible_cpu(cpu) { | 316 | uv_possible_blades += |
184 | nid = cpu_to_node(cpu); | 317 | hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); |
185 | nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); | ||
186 | if (nasid != last_nasid) | ||
187 | uv_possible_blades++; | ||
188 | last_nasid = nasid; | ||
189 | } | ||
190 | printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); | 318 | printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); |
191 | 319 | ||
192 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); | 320 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); |
193 | uv_blade_info = alloc_bootmem_pages(bytes); | 321 | uv_blade_info = alloc_bootmem_pages(bytes); |
194 | 322 | ||
323 | get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); | ||
324 | |||
195 | bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); | 325 | bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); |
196 | uv_node_to_blade = alloc_bootmem_pages(bytes); | 326 | uv_node_to_blade = alloc_bootmem_pages(bytes); |
197 | memset(uv_node_to_blade, 255, bytes); | 327 | memset(uv_node_to_blade, 255, bytes); |
@@ -200,43 +330,64 @@ static __init void uv_system_init(void) | |||
200 | uv_cpu_to_blade = alloc_bootmem_pages(bytes); | 330 | uv_cpu_to_blade = alloc_bootmem_pages(bytes); |
201 | memset(uv_cpu_to_blade, 255, bytes); | 331 | memset(uv_cpu_to_blade, 255, bytes); |
202 | 332 | ||
203 | last_nasid = -1; | 333 | blade = 0; |
204 | blade = -1; | 334 | for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { |
205 | lcpu = -1; | 335 | present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); |
206 | for_each_possible_cpu(cpu) { | 336 | for (j = 0; j < 64; j++) { |
207 | nid = cpu_to_node(cpu); | 337 | if (!test_bit(j, &present)) |
208 | nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); | 338 | continue; |
209 | if (nasid != last_nasid) { | 339 | uv_blade_info[blade].pnode = (i * 64 + j); |
210 | blade++; | 340 | uv_blade_info[blade].nr_possible_cpus = 0; |
211 | lcpu = -1; | ||
212 | uv_blade_info[blade].nr_posible_cpus = 0; | ||
213 | uv_blade_info[blade].nr_online_cpus = 0; | 341 | uv_blade_info[blade].nr_online_cpus = 0; |
342 | blade++; | ||
214 | } | 343 | } |
215 | last_nasid = nasid; | 344 | } |
216 | lcpu++; | 345 | |
346 | node_id.v = uv_read_local_mmr(UVH_NODE_ID); | ||
347 | gnode_upper = (((unsigned long)node_id.s.node_id) & | ||
348 | ~((1 << n_val) - 1)) << m_val; | ||
217 | 349 | ||
218 | uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt; | 350 | uv_rtc_init(); |
219 | uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt; | 351 | |
352 | for_each_present_cpu(cpu) { | ||
353 | nid = cpu_to_node(cpu); | ||
354 | pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); | ||
355 | blade = boot_pnode_to_blade(pnode); | ||
356 | lcpu = uv_blade_info[blade].nr_possible_cpus; | ||
357 | uv_blade_info[blade].nr_possible_cpus++; | ||
358 | |||
359 | uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; | ||
360 | uv_cpu_hub_info(cpu)->lowmem_remap_top = | ||
361 | lowmem_redir_base + lowmem_redir_size; | ||
362 | uv_cpu_hub_info(cpu)->m_val = m_val; | ||
363 | uv_cpu_hub_info(cpu)->n_val = m_val; | ||
220 | uv_cpu_hub_info(cpu)->numa_blade_id = blade; | 364 | uv_cpu_hub_info(cpu)->numa_blade_id = blade; |
221 | uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; | 365 | uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; |
222 | uv_cpu_hub_info(cpu)->local_nasid = nasid; | 366 | uv_cpu_hub_info(cpu)->pnode = pnode; |
223 | uv_cpu_hub_info(cpu)->gnode_upper = | 367 | uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1; |
224 | nasid & ~((1 << uv_hub_info->n_val) - 1); | 368 | uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; |
369 | uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; | ||
225 | uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; | 370 | uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; |
226 | uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ | 371 | uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ |
227 | uv_blade_info[blade].nasid = nasid; | ||
228 | uv_blade_info[blade].nr_posible_cpus++; | ||
229 | uv_node_to_blade[nid] = blade; | 372 | uv_node_to_blade[nid] = blade; |
230 | uv_cpu_to_blade[cpu] = blade; | 373 | uv_cpu_to_blade[cpu] = blade; |
374 | max_pnode = max(pnode, max_pnode); | ||
231 | 375 | ||
232 | printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n", | 376 | printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " |
233 | cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid); | 377 | "lcpu %d, blade %d\n", |
234 | printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade); | 378 | cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, |
379 | lcpu, blade); | ||
235 | } | 380 | } |
381 | |||
382 | map_gru_high(max_pnode); | ||
383 | map_mmr_high(max_pnode); | ||
384 | map_config_high(max_pnode); | ||
385 | map_mmioh_high(max_pnode); | ||
236 | } | 386 | } |
237 | 387 | ||
238 | /* | 388 | /* |
239 | * Called on each cpu to initialize the per_cpu UV data area. | 389 | * Called on each cpu to initialize the per_cpu UV data area. |
390 | * ZZZ hotplug not supported yet | ||
240 | */ | 391 | */ |
241 | void __cpuinit uv_cpu_init(void) | 392 | void __cpuinit uv_cpu_init(void) |
242 | { | 393 | { |
@@ -246,5 +397,5 @@ void __cpuinit uv_cpu_init(void) | |||
246 | uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; | 397 | uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; |
247 | 398 | ||
248 | if (get_uv_system_type() == UV_NON_UNIQUE_APIC) | 399 | if (get_uv_system_type() == UV_NON_UNIQUE_APIC) |
249 | set_x2apic_extra_bits(uv_hub_info->local_nasid); | 400 | set_x2apic_extra_bits(uv_hub_info->pnode); |
250 | } | 401 | } |
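uv_system_init() above now sizes and enumerates blades from the UVH_NODE_PRESENT_TABLE bitmaps: each 64-bit word carries one bit per pnode, hweight64() counts the set bits, and a bit's position gives pnode = i * 64 + j. A user-space sketch of that enumeration, with the MMR reads replaced by a plain array (the two-word depth and the bit patterns are arbitrary stand-ins):

#include <stdio.h>

#define TABLE_DEPTH 2	/* stand-in for UVH_NODE_PRESENT_TABLE_DEPTH */

static int popcount64(unsigned long long v)
{
	int n = 0;
	for (; v; v &= v - 1)	/* clear lowest set bit, like hweight64() */
		n++;
	return n;
}

int main(void)
{
	unsigned long long present[TABLE_DEPTH] = { 0x5ULL, 0x1ULL };
	int i, j, blades = 0;

	for (i = 0; i < TABLE_DEPTH; i++)
		blades += popcount64(present[i]);
	printf("%d blades\n", blades);		/* 3 */

	for (i = 0; i < TABLE_DEPTH; i++)
		for (j = 0; j < 64; j++)
			if (present[i] & (1ULL << j))
				printf("blade -> pnode %d\n", i * 64 + j);
	return 0;
}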
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index e8edd63ab000..9b08e852fd1a 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c | |||
@@ -166,6 +166,8 @@ int geode_has_vsa2(void) | |||
166 | static int has_vsa2 = -1; | 166 | static int has_vsa2 = -1; |
167 | 167 | ||
168 | if (has_vsa2 == -1) { | 168 | if (has_vsa2 == -1) { |
169 | u16 val; | ||
170 | |||
169 | /* | 171 | /* |
170 | * The VSA has virtual registers that we can query for a | 172 | * The VSA has virtual registers that we can query for a |
171 | * signature. | 173 | * signature. |
@@ -173,7 +175,8 @@ int geode_has_vsa2(void) | |||
173 | outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); | 175 | outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); |
174 | outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); | 176 | outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); |
175 | 177 | ||
176 | has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG); | 178 | val = inw(VSA_VRC_DATA); |
179 | has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG); | ||
177 | } | 180 | } |
178 | 181 | ||
179 | return has_vsa2; | 182 | return has_vsa2; |
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c new file mode 100644 index 000000000000..3e66bd364a9d --- /dev/null +++ b/arch/x86/kernel/head.c | |||
@@ -0,0 +1,55 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/init.h> | ||
3 | |||
4 | #include <asm/setup.h> | ||
5 | #include <asm/bios_ebda.h> | ||
6 | |||
7 | #define BIOS_LOWMEM_KILOBYTES 0x413 | ||
8 | |||
9 | /* | ||
10 | * The BIOS places the EBDA/XBDA at the top of conventional | ||
11 | * memory, and usually decreases the reported amount of | ||
12 | * conventional memory (int 0x12) too. This also contains a | ||
13 | * workaround for Dell systems that neglect to reserve EBDA. | ||
14 | * The same workaround also avoids a problem with the AMD768MPX | ||
15 | * chipset: reserve a page before VGA to prevent PCI prefetch | ||
16 | * into it (errata #56). Usually the page is reserved anyways, | ||
17 | * unless you have no PS/2 mouse plugged in. | ||
18 | */ | ||
19 | void __init reserve_ebda_region(void) | ||
20 | { | ||
21 | unsigned int lowmem, ebda_addr; | ||
22 | |||
23 | /* To determine the position of the EBDA and the */ | ||
24 | /* end of conventional memory, we need to look at */ | ||
25 | /* the BIOS data area. In a paravirtual environment */ | ||
26 | /* that area is absent. We'll just have to assume */ | ||
27 | /* that the paravirt case can handle memory setup */ | ||
28 | /* correctly, without our help. */ | ||
29 | if (paravirt_enabled()) | ||
30 | return; | ||
31 | |||
32 | /* end of low (conventional) memory */ | ||
33 | lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); | ||
34 | lowmem <<= 10; | ||
35 | |||
36 | /* start of EBDA area */ | ||
37 | ebda_addr = get_bios_ebda(); | ||
38 | |||
39 | /* Fixup: bios puts an EBDA in the top 64K segment */ | ||
40 | /* of conventional memory, but does not adjust lowmem. */ | ||
41 | if ((lowmem - ebda_addr) <= 0x10000) | ||
42 | lowmem = ebda_addr; | ||
43 | |||
44 | /* Fixup: bios does not report an EBDA at all. */ | ||
45 | /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ | ||
46 | if ((ebda_addr == 0) && (lowmem >= 0x9f000)) | ||
47 | lowmem = 0x9f000; | ||
48 | |||
49 | /* Paranoia: should never happen, but... */ | ||
50 | if ((lowmem == 0) || (lowmem >= 0x100000)) | ||
51 | lowmem = 0x9f000; | ||
52 | |||
53 | /* reserve all memory between lowmem and the 1MB mark */ | ||
54 | reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); | ||
55 | } | ||
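For a concrete walk through reserve_ebda_region() above: with a typical 639 KiB of conventional memory reported in the BIOS word at 0x413 and the EBDA occupying the top of it, the shift and the 64K fixup collapse to reserving everything from the EBDA base up to the 1MB mark. A user-space sketch with those sample values (the 639/0x9fc00 figures are illustrative, not from the patch):

#include <stdio.h>

int main(void)
{
	unsigned int lowmem = 639;	/* BIOS word at 0x413, in KiB */
	unsigned int ebda_addr = 0x9fc00;

	lowmem <<= 10;			/* 639 KiB -> 0x9fc00 */

	/* EBDA sits in the top 64K of conventional memory: trust it. */
	if ((lowmem - ebda_addr) <= 0x10000)
		lowmem = ebda_addr;

	printf("reserve 0x%x - 0x100000\n", lowmem);	/* 0x9fc00 */
	return 0;
}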
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3db059058927..fa1d25dd83e3 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c | |||
@@ -8,7 +8,34 @@ | |||
8 | #include <linux/init.h> | 8 | #include <linux/init.h> |
9 | #include <linux/start_kernel.h> | 9 | #include <linux/start_kernel.h> |
10 | 10 | ||
11 | #include <asm/setup.h> | ||
12 | #include <asm/sections.h> | ||
13 | #include <asm/e820.h> | ||
14 | #include <asm/bios_ebda.h> | ||
15 | |||
11 | void __init i386_start_kernel(void) | 16 | void __init i386_start_kernel(void) |
12 | { | 17 | { |
18 | reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); | ||
19 | |||
20 | #ifdef CONFIG_BLK_DEV_INITRD | ||
21 | /* Reserve INITRD */ | ||
22 | if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { | ||
23 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; | ||
24 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; | ||
25 | u64 ramdisk_end = ramdisk_image + ramdisk_size; | ||
26 | reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); | ||
27 | } | ||
28 | #endif | ||
29 | reserve_early(init_pg_tables_start, init_pg_tables_end, | ||
30 | "INIT_PG_TABLE"); | ||
31 | |||
32 | reserve_ebda_region(); | ||
33 | |||
34 | /* | ||
35 | * At this point everything still needed from the boot loader | ||
36 | * or BIOS or kernel text should be early reserved or marked not | ||
37 | * RAM in e820. All other memory is free game. | ||
38 | */ | ||
39 | |||
13 | start_kernel(); | 40 | start_kernel(); |
14 | } | 41 | } |
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b8aa84..1b318e903bf6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c | |||
@@ -25,6 +25,27 @@ | |||
25 | #include <asm/e820.h> | 25 | #include <asm/e820.h> |
26 | #include <asm/bios_ebda.h> | 26 | #include <asm/bios_ebda.h> |
27 | 27 | ||
28 | /* boot cpu pda */ | ||
29 | static struct x8664_pda _boot_cpu_pda __read_mostly; | ||
30 | |||
31 | #ifdef CONFIG_SMP | ||
32 | /* | ||
33 | * We install an empty cpu_pda pointer table to indicate to early users | ||
34 | * (numa_set_node) that the cpu_pda pointer table for cpus other than | ||
35 | * the boot cpu is not yet setup. | ||
36 | */ | ||
37 | static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; | ||
38 | #else | ||
39 | static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; | ||
40 | #endif | ||
41 | |||
42 | void __init x86_64_init_pda(void) | ||
43 | { | ||
44 | _cpu_pda = __cpu_pda; | ||
45 | cpu_pda(0) = &_boot_cpu_pda; | ||
46 | pda_init(0); | ||
47 | } | ||
48 | |||
28 | static void __init zap_identity_mappings(void) | 49 | static void __init zap_identity_mappings(void) |
29 | { | 50 | { |
30 | pgd_t *pgd = pgd_offset_k(0UL); | 51 | pgd_t *pgd = pgd_offset_k(0UL); |
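
What x86_64_init_pda() sets up is a one-slot bootstrap of the per-CPU PDA pointer table: only the boot CPU's entry is populated, so early users can test whether a given CPU's PDA exists yet. A toy user-space model (struct and names illustrative, not the kernel's definitions):

#include <stdio.h>

struct pda { int cpunumber; };

#define NR_CPUS 4
static struct pda boot_cpu_pda;
static struct pda *cpu_pda_table[NR_CPUS];

int main(void)
{
	/* x86_64_init_pda() equivalent: only slot 0 is populated early */
	cpu_pda_table[0] = &boot_cpu_pda;
	cpu_pda_table[0]->cpunumber = 0;

	/* early users can now ask "is this CPU's PDA set up yet?" */
	for (int i = 0; i < NR_CPUS; i++)
		printf("cpu %d pda %s\n", i,
		       cpu_pda_table[i] ? "ready" : "not yet");
	return 0;
}
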
@@ -51,74 +72,6 @@ static void __init copy_bootdata(char *real_mode_data) | |||
51 | } | 72 | } |
52 | } | 73 | } |
53 | 74 | ||
54 | #define BIOS_LOWMEM_KILOBYTES 0x413 | ||
55 | |||
56 | /* | ||
57 | * The BIOS places the EBDA/XBDA at the top of conventional | ||
58 | * memory, and usually decreases the reported amount of | ||
59 | * conventional memory (int 0x12) too. This also contains a | ||
60 | * workaround for Dell systems that neglect to reserve EBDA. | ||
61 | * The same workaround also avoids a problem with the AMD768MPX | ||
62 | * chipset: reserve a page before VGA to prevent PCI prefetch | ||
63 | * into it (errata #56). Usually the page is reserved anyways, | ||
64 | * unless you have no PS/2 mouse plugged in. | ||
65 | */ | ||
66 | static void __init reserve_ebda_region(void) | ||
67 | { | ||
68 | unsigned int lowmem, ebda_addr; | ||
69 | |||
70 | /* To determine the position of the EBDA and the */ | ||
71 | /* end of conventional memory, we need to look at */ | ||
72 | /* the BIOS data area. In a paravirtual environment */ | ||
73 | /* that area is absent. We'll just have to assume */ | ||
74 | /* that the paravirt case can handle memory setup */ | ||
75 | /* correctly, without our help. */ | ||
76 | if (paravirt_enabled()) | ||
77 | return; | ||
78 | |||
79 | /* end of low (conventional) memory */ | ||
80 | lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); | ||
81 | lowmem <<= 10; | ||
82 | |||
83 | /* start of EBDA area */ | ||
84 | ebda_addr = get_bios_ebda(); | ||
85 | |||
86 | /* Fixup: bios puts an EBDA in the top 64K segment */ | ||
87 | /* of conventional memory, but does not adjust lowmem. */ | ||
88 | if ((lowmem - ebda_addr) <= 0x10000) | ||
89 | lowmem = ebda_addr; | ||
90 | |||
91 | /* Fixup: bios does not report an EBDA at all. */ | ||
92 | /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ | ||
93 | if ((ebda_addr == 0) && (lowmem >= 0x9f000)) | ||
94 | lowmem = 0x9f000; | ||
95 | |||
96 | /* Paranoia: should never happen, but... */ | ||
97 | if ((lowmem == 0) || (lowmem >= 0x100000)) | ||
98 | lowmem = 0x9f000; | ||
99 | |||
100 | /* reserve all memory between lowmem and the 1MB mark */ | ||
101 | reserve_early(lowmem, 0x100000, "BIOS reserved"); | ||
102 | } | ||
103 | |||
104 | static void __init reserve_setup_data(void) | ||
105 | { | ||
106 | struct setup_data *data; | ||
107 | unsigned long pa_data; | ||
108 | char buf[32]; | ||
109 | |||
110 | if (boot_params.hdr.version < 0x0209) | ||
111 | return; | ||
112 | pa_data = boot_params.hdr.setup_data; | ||
113 | while (pa_data) { | ||
114 | data = early_ioremap(pa_data, sizeof(*data)); | ||
115 | sprintf(buf, "setup data %x", data->type); | ||
116 | reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); | ||
117 | pa_data = data->next; | ||
118 | early_iounmap(data, sizeof(*data)); | ||
119 | } | ||
120 | } | ||
121 | |||
122 | void __init x86_64_start_kernel(char * real_mode_data) | 75 | void __init x86_64_start_kernel(char * real_mode_data) |
123 | { | 76 | { |
124 | int i; | 77 | int i; |
@@ -156,10 +109,15 @@ void __init x86_64_start_kernel(char * real_mode_data) | |||
156 | 109 | ||
157 | early_printk("Kernel alive\n"); | 110 | early_printk("Kernel alive\n"); |
158 | 111 | ||
159 | for (i = 0; i < NR_CPUS; i++) | 112 | x86_64_init_pda(); |
160 | cpu_pda(i) = &boot_cpu_pda[i]; | ||
161 | 113 | ||
162 | pda_init(0); | 114 | early_printk("Kernel really alive\n"); |
115 | |||
116 | x86_64_start_reservations(real_mode_data); | ||
117 | } | ||
118 | |||
119 | void __init x86_64_start_reservations(char *real_mode_data) | ||
120 | { | ||
163 | copy_bootdata(__va(real_mode_data)); | 121 | copy_bootdata(__va(real_mode_data)); |
164 | 122 | ||
165 | reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); | 123 | reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); |
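
The point of splitting x86_64_start_kernel() is that an alternate entry path (the xen-head.S include that appears further down in head_64.S suggests a paravirt guest) can skip the bare-metal early stage and enter at the reservations stage. A stubbed-out sketch of the resulting call order; bodies are placeholder prints, only the structure mirrors the diff:

#include <stdio.h>

static void x86_64_start_reservations(char *real_mode_data)
{
	printf("copy_bootdata(%s); reserve text/initrd/ebda; start_kernel()\n",
	       real_mode_data);
}

static void x86_64_start_kernel(char *real_mode_data)
{
	printf("zap identity mappings, init early IDT, init PDA\n");
	x86_64_start_reservations(real_mode_data);
}

int main(void)
{
	/* native boot runs both stages; a paravirt entry could call
	 * x86_64_start_reservations() directly */
	x86_64_start_kernel("boot_params");
	return 0;
}
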
@@ -175,7 +133,6 @@ void __init x86_64_start_kernel(char * real_mode_data) | |||
175 | #endif | 133 | #endif |
176 | 134 | ||
177 | reserve_ebda_region(); | 135 | reserve_ebda_region(); |
178 | reserve_setup_data(); | ||
179 | 136 | ||
180 | /* | 137 | /* |
181 | * At this point everything still needed from the boot loader | 138 | * At this point everything still needed from the boot loader |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f7357cc0162c..a7010c3a377a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -194,6 +194,7 @@ default_entry: | |||
194 | xorl %ebx,%ebx /* %ebx is kept at zero */ | 194 | xorl %ebx,%ebx /* %ebx is kept at zero */ |
195 | 195 | ||
196 | movl $pa(pg0), %edi | 196 | movl $pa(pg0), %edi |
197 | movl %edi, pa(init_pg_tables_start) | ||
197 | movl $pa(swapper_pg_pmd), %edx | 198 | movl $pa(swapper_pg_pmd), %edx |
198 | movl $PTE_ATTR, %eax | 199 | movl $PTE_ATTR, %eax |
199 | 10: | 200 | 10: |
@@ -219,6 +220,8 @@ default_entry: | |||
219 | jb 10b | 220 | jb 10b |
220 | 1: | 221 | 1: |
221 | movl %edi,pa(init_pg_tables_end) | 222 | movl %edi,pa(init_pg_tables_end) |
223 | shrl $12, %eax | ||
224 | movl %eax, pa(max_pfn_mapped) | ||
222 | 225 | ||
223 | /* Do early initialization of the fixmap area */ | 226 | /* Do early initialization of the fixmap area */ |
224 | movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax | 227 | movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax |
@@ -228,6 +231,7 @@ default_entry: | |||
228 | page_pde_offset = (__PAGE_OFFSET >> 20); | 231 | page_pde_offset = (__PAGE_OFFSET >> 20); |
229 | 232 | ||
230 | movl $pa(pg0), %edi | 233 | movl $pa(pg0), %edi |
234 | movl %edi, pa(init_pg_tables_start) | ||
231 | movl $pa(swapper_pg_dir), %edx | 235 | movl $pa(swapper_pg_dir), %edx |
232 | movl $PTE_ATTR, %eax | 236 | movl $PTE_ATTR, %eax |
233 | 10: | 237 | 10: |
@@ -249,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
249 | cmpl %ebp,%eax | 253 | cmpl %ebp,%eax |
250 | jb 10b | 254 | jb 10b |
251 | movl %edi,pa(init_pg_tables_end) | 255 | movl %edi,pa(init_pg_tables_end) |
256 | shrl $12, %eax | ||
257 | movl %eax, pa(max_pfn_mapped) | ||
252 | 258 | ||
253 | /* Do early initialization of the fixmap area */ | 259 | /* Do early initialization of the fixmap area */ |
254 | movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax | 260 | movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax |
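
Both hunks store the same derived value: the end address of the freshly built page tables, shifted right by 12 to convert a byte address into a page frame number. In C terms (PAGE_SHIFT of 12 for 4K pages):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* what "shrl $12, %eax" computes: byte address -> page frame number */
	unsigned long end_of_mapped = 0x00800000;	/* e.g. 8 MB mapped */
	unsigned long max_pfn_mapped = end_of_mapped >> PAGE_SHIFT;

	printf("max_pfn_mapped = %lu (%#lx)\n", max_pfn_mapped, max_pfn_mapped);
	return 0;	/* 2048 pages for 8 MB */
}
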
@@ -446,10 +452,10 @@ is386: movl $2,%ecx # set MP | |||
446 | je 1f | 452 | je 1f |
447 | movl $(__KERNEL_PERCPU), %eax | 453 | movl $(__KERNEL_PERCPU), %eax |
448 | movl %eax,%fs # set this cpu's percpu | 454 | movl %eax,%fs # set this cpu's percpu |
449 | jmp initialize_secondary # all other CPUs call initialize_secondary | 455 | movl (stack_start), %esp |
450 | 1: | 456 | 1: |
451 | #endif /* CONFIG_SMP */ | 457 | #endif /* CONFIG_SMP */ |
452 | jmp i386_start_kernel | 458 | jmp *(initial_code) |
453 | 459 | ||
454 | /* | 460 | /* |
455 | * We depend on ET to be correct. This checks for 287/387. | 461 | * We depend on ET to be correct. This checks for 287/387. |
@@ -592,6 +598,11 @@ ignore_int: | |||
592 | #endif | 598 | #endif |
593 | iret | 599 | iret |
594 | 600 | ||
601 | .section .cpuinit.data,"wa" | ||
602 | .align 4 | ||
603 | ENTRY(initial_code) | ||
604 | .long i386_start_kernel | ||
605 | |||
595 | .section .text | 606 | .section .text |
596 | /* | 607 | /* |
597 | * Real beginning of normal "text" segment | 608 | * Real beginning of normal "text" segment |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 10a1955bb1d1..db3280afe886 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/page.h> | 18 | #include <asm/page.h> |
19 | #include <asm/msr.h> | 19 | #include <asm/msr.h> |
20 | #include <asm/cache.h> | 20 | #include <asm/cache.h> |
21 | #include <asm/processor-flags.h> | ||
21 | 22 | ||
22 | #ifdef CONFIG_PARAVIRT | 23 | #ifdef CONFIG_PARAVIRT |
23 | #include <asm/asm-offsets.h> | 24 | #include <asm/asm-offsets.h> |
@@ -31,6 +32,13 @@ | |||
31 | * | 32 | * |
32 | */ | 33 | */ |
33 | 34 | ||
35 | #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) | ||
36 | |||
37 | L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) | ||
38 | L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) | ||
39 | L4_START_KERNEL = pgd_index(__START_KERNEL_map) | ||
40 | L3_START_KERNEL = pud_index(__START_KERNEL_map) | ||
41 | |||
34 | .text | 42 | .text |
35 | .section .text.head | 43 | .section .text.head |
36 | .code64 | 44 | .code64 |
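
These symbolic indices replace the magic 258 and 511 offsets patched out further down. They can be checked by hand; the sketch below redoes the arithmetic for the layout of this era, assuming __PAGE_OFFSET = 0xffff810000000000 and __START_KERNEL_map = 0xffffffff80000000 (values inferred from the old hard-coded 258/511, not stated in the diff itself):

#include <stdio.h>

#define PGDIR_SHIFT	39
#define PUD_SHIFT	30
#define PTRS_PER	512

static unsigned int idx(unsigned long long va, int shift)
{
	return (unsigned int)((va >> shift) & (PTRS_PER - 1));
}

int main(void)
{
	unsigned long long page_offset = 0xffff810000000000ULL;
	unsigned long long kernel_map  = 0xffffffff80000000ULL;

	printf("L4_PAGE_OFFSET  = %u\n", idx(page_offset, PGDIR_SHIFT)); /* 258 */
	printf("L4_START_KERNEL = %u\n", idx(kernel_map, PGDIR_SHIFT));  /* 511 */
	printf("L3_START_KERNEL = %u\n", idx(kernel_map, PUD_SHIFT));    /* 510 */
	return 0;
}
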
@@ -76,8 +84,8 @@ startup_64: | |||
76 | /* Fixup the physical addresses in the page table | 84 | /* Fixup the physical addresses in the page table |
77 | */ | 85 | */ |
78 | addq %rbp, init_level4_pgt + 0(%rip) | 86 | addq %rbp, init_level4_pgt + 0(%rip) |
79 | addq %rbp, init_level4_pgt + (258*8)(%rip) | 87 | addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) |
80 | addq %rbp, init_level4_pgt + (511*8)(%rip) | 88 | addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) |
81 | 89 | ||
82 | addq %rbp, level3_ident_pgt + 0(%rip) | 90 | addq %rbp, level3_ident_pgt + 0(%rip) |
83 | 91 | ||
@@ -128,7 +136,7 @@ ident_complete: | |||
128 | /* Fixup phys_base */ | 136 | /* Fixup phys_base */ |
129 | addq %rbp, phys_base(%rip) | 137 | addq %rbp, phys_base(%rip) |
130 | 138 | ||
131 | #ifdef CONFIG_SMP | 139 | #ifdef CONFIG_X86_TRAMPOLINE |
132 | addq %rbp, trampoline_level4_pgt + 0(%rip) | 140 | addq %rbp, trampoline_level4_pgt + 0(%rip) |
133 | addq %rbp, trampoline_level4_pgt + (511*8)(%rip) | 141 | addq %rbp, trampoline_level4_pgt + (511*8)(%rip) |
134 | #endif | 142 | #endif |
@@ -154,9 +162,7 @@ ENTRY(secondary_startup_64) | |||
154 | */ | 162 | */ |
155 | 163 | ||
156 | /* Enable PAE mode and PGE */ | 164 | /* Enable PAE mode and PGE */ |
157 | xorq %rax, %rax | 165 | movl $(X86_CR4_PAE | X86_CR4_PGE), %eax |
158 | btsq $5, %rax | ||
159 | btsq $7, %rax | ||
160 | movq %rax, %cr4 | 166 | movq %rax, %cr4 |
161 | 167 | ||
162 | /* Setup early boot stage 4 level pagetables. */ | 168 | /* Setup early boot stage 4 level pagetables. */ |
@@ -184,19 +190,15 @@ ENTRY(secondary_startup_64) | |||
184 | 1: wrmsr /* Make changes effective */ | 190 | 1: wrmsr /* Make changes effective */ |
185 | 191 | ||
186 | /* Setup cr0 */ | 192 | /* Setup cr0 */ |
187 | #define CR0_PM 1 /* protected mode */ | 193 | #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ |
188 | #define CR0_MP (1<<1) | 194 | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ |
189 | #define CR0_ET (1<<4) | 195 | X86_CR0_PG) |
190 | #define CR0_NE (1<<5) | 196 | movl $CR0_STATE, %eax |
191 | #define CR0_WP (1<<16) | ||
192 | #define CR0_AM (1<<18) | ||
193 | #define CR0_PAGING (1<<31) | ||
194 | movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax | ||
195 | /* Make changes effective */ | 197 | /* Make changes effective */ |
196 | movq %rax, %cr0 | 198 | movq %rax, %cr0 |
197 | 199 | ||
198 | /* Setup a boot time stack */ | 200 | /* Setup a boot time stack */ |
199 | movq init_rsp(%rip),%rsp | 201 | movq stack_start(%rip),%rsp |
200 | 202 | ||
201 | /* zero EFLAGS after setting rsp */ | 203 | /* zero EFLAGS after setting rsp */ |
202 | pushq $0 | 204 | pushq $0 |
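
The named X86_CR0_* / X86_CR4_* flags from <asm/processor-flags.h> encode exactly the bits the removed literals spelled out. The bit positions are architectural, so the resulting register values are safe to recompute:

#include <stdio.h>

#define X86_CR0_PE	(1u << 0)
#define X86_CR0_MP	(1u << 1)
#define X86_CR0_ET	(1u << 4)
#define X86_CR0_NE	(1u << 5)
#define X86_CR0_WP	(1u << 16)
#define X86_CR0_AM	(1u << 18)
#define X86_CR0_PG	(1u << 31)

#define X86_CR4_PAE	(1u << 5)
#define X86_CR4_PGE	(1u << 7)

int main(void)
{
	unsigned int cr0 = X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | X86_CR0_NE |
			   X86_CR0_WP | X86_CR0_AM | X86_CR0_PG;
	unsigned int cr4 = X86_CR4_PAE | X86_CR4_PGE;

	printf("CR0_STATE = %#010x\n", cr0);	/* 0x80050033 */
	printf("cr4       = %#x\n", cr4);	/* 0xa0: bits 5 (PAE) and 7 (PGE) */
	return 0;
}
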
@@ -208,7 +210,7 @@ ENTRY(secondary_startup_64) | |||
208 | * addresses where we're currently running. We have to do that here | 210 | * addresses where we're currently running. We have to do that here
209 | * because in 32bit we couldn't load a 64bit linear address. | 211 | * because in 32bit we couldn't load a 64bit linear address. |
210 | */ | 212 | */ |
211 | lgdt cpu_gdt_descr(%rip) | 213 | lgdt early_gdt_descr(%rip) |
212 | 214 | ||
213 | /* set up data segments. actually 0 would do too */ | 215 | /* set up data segments. actually 0 would do too */ |
214 | movl $__KERNEL_DS,%eax | 216 | movl $__KERNEL_DS,%eax |
@@ -257,8 +259,9 @@ ENTRY(secondary_startup_64) | |||
257 | .quad x86_64_start_kernel | 259 | .quad x86_64_start_kernel |
258 | __FINITDATA | 260 | __FINITDATA |
259 | 261 | ||
260 | ENTRY(init_rsp) | 262 | ENTRY(stack_start) |
261 | .quad init_thread_union+THREAD_SIZE-8 | 263 | .quad init_thread_union+THREAD_SIZE-8 |
264 | .word 0 | ||
262 | 265 | ||
263 | bad_address: | 266 | bad_address: |
264 | jmp bad_address | 267 | jmp bad_address |
@@ -327,11 +330,11 @@ early_idt_ripmsg: | |||
327 | ENTRY(name) | 330 | ENTRY(name) |
328 | 331 | ||
329 | /* Automate the creation of 1 to 1 mapping pmd entries */ | 332 | /* Automate the creation of 1 to 1 mapping pmd entries */ |
330 | #define PMDS(START, PERM, COUNT) \ | 333 | #define PMDS(START, PERM, COUNT) \ |
331 | i = 0 ; \ | 334 | i = 0 ; \ |
332 | .rept (COUNT) ; \ | 335 | .rept (COUNT) ; \ |
333 | .quad (START) + (i << 21) + (PERM) ; \ | 336 | .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ |
334 | i = i + 1 ; \ | 337 | i = i + 1 ; \ |
335 | .endr | 338 | .endr |
336 | 339 | ||
337 | /* | 340 | /* |
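
PMDS() stamps out identity-mapping PMD entries at 2 MB strides; deriving the stride from PMD_SHIFT rather than a literal 21 keeps the macro correct if the paging constants ever change. A C rendering of the expansion (the permission value below is a placeholder for illustration, not necessarily the real __PAGE_KERNEL_LARGE_EXEC bits):

#include <stdio.h>

#define PMD_SHIFT 21	/* 2 MB large pages */

/*
 * C rendering of the PMDS(START, PERM, COUNT) assembler macro:
 * entry i maps START + i * 2MB with permission bits PERM.
 */
int main(void)
{
	unsigned long long start = 0, perm = 0x1e3;	/* assumed flags */

	for (int i = 0; i < 4; i++)
		printf("pmd[%d] = %#llx\n", i,
		       start + ((unsigned long long)i << PMD_SHIFT) + perm);
	return 0;
}
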
@@ -342,9 +345,9 @@ ENTRY(name) | |||
342 | */ | 345 | */ |
343 | NEXT_PAGE(init_level4_pgt) | 346 | NEXT_PAGE(init_level4_pgt) |
344 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | 347 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
345 | .fill 257,8,0 | 348 | .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 |
346 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | 349 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
347 | .fill 252,8,0 | 350 | .org init_level4_pgt + L4_START_KERNEL*8, 0 |
348 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | 351 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
349 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE | 352 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
350 | 353 | ||
@@ -353,7 +356,7 @@ NEXT_PAGE(level3_ident_pgt) | |||
353 | .fill 511,8,0 | 356 | .fill 511,8,0 |
354 | 357 | ||
355 | NEXT_PAGE(level3_kernel_pgt) | 358 | NEXT_PAGE(level3_kernel_pgt) |
356 | .fill 510,8,0 | 359 | .fill L3_START_KERNEL,8,0 |
357 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ | 360 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ |
358 | .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE | 361 | .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE |
359 | .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE | 362 | .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE |
@@ -384,7 +387,7 @@ NEXT_PAGE(level2_kernel_pgt) | |||
384 | * If you want to increase this then increase MODULES_VADDR | 387 | * If you want to increase this then increase MODULES_VADDR |
385 | * too.) | 388 | * too.) |
386 | */ | 389 | */ |
387 | PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, | 390 | PMDS(0, __PAGE_KERNEL_LARGE_EXEC, |
388 | KERNEL_IMAGE_SIZE/PMD_SIZE) | 391 | KERNEL_IMAGE_SIZE/PMD_SIZE) |
389 | 392 | ||
390 | NEXT_PAGE(level2_spare_pgt) | 393 | NEXT_PAGE(level2_spare_pgt) |
@@ -395,54 +398,17 @@ NEXT_PAGE(level2_spare_pgt) | |||
395 | 398 | ||
396 | .data | 399 | .data |
397 | .align 16 | 400 | .align 16 |
398 | .globl cpu_gdt_descr | 401 | .globl early_gdt_descr |
399 | cpu_gdt_descr: | 402 | early_gdt_descr: |
400 | .word gdt_end-cpu_gdt_table-1 | 403 | .word GDT_ENTRIES*8-1 |
401 | gdt: | 404 | .quad per_cpu__gdt_page |
402 | .quad cpu_gdt_table | ||
403 | #ifdef CONFIG_SMP | ||
404 | .rept NR_CPUS-1 | ||
405 | .word 0 | ||
406 | .quad 0 | ||
407 | .endr | ||
408 | #endif | ||
409 | 405 | ||
410 | ENTRY(phys_base) | 406 | ENTRY(phys_base) |
411 | /* This must match the first entry in level2_kernel_pgt */ | 407 | /* This must match the first entry in level2_kernel_pgt */ |
412 | .quad 0x0000000000000000 | 408 | .quad 0x0000000000000000 |
413 | 409 | ||
414 | /* We need valid kernel segments for data and code in long mode too | 410 | #include "../../x86/xen/xen-head.S" |
415 | * IRET will check the segment types kkeil 2000/10/28 | ||
416 | * Also sysret mandates a special GDT layout | ||
417 | */ | ||
418 | |||
419 | .section .data.page_aligned, "aw" | ||
420 | .align PAGE_SIZE | ||
421 | |||
422 | /* The TLS descriptors are currently at a different place compared to i386. | ||
423 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
424 | 411 | ||
425 | ENTRY(cpu_gdt_table) | ||
426 | .quad 0x0000000000000000 /* NULL descriptor */ | ||
427 | .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ | ||
428 | .quad 0x00af9b000000ffff /* __KERNEL_CS */ | ||
429 | .quad 0x00cf93000000ffff /* __KERNEL_DS */ | ||
430 | .quad 0x00cffb000000ffff /* __USER32_CS */ | ||
431 | .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ | ||
432 | .quad 0x00affb000000ffff /* __USER_CS */ | ||
433 | .quad 0x0 /* unused */ | ||
434 | .quad 0,0 /* TSS */ | ||
435 | .quad 0,0 /* LDT */ | ||
436 | .quad 0,0,0 /* three TLS descriptors */ | ||
437 | .quad 0x0000f40000000000 /* node/CPU stored in limit */ | ||
438 | gdt_end: | ||
439 | /* asm/segment.h:GDT_ENTRIES must match this */ | ||
440 | /* This should be a multiple of the cache line size */ | ||
441 | /* GDTs of other CPUs are now dynamically allocated */ | ||
442 | |||
443 | /* zero the remaining page */ | ||
444 | .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 | ||
445 | |||
446 | .section .bss, "aw", @nobits | 412 | .section .bss, "aw", @nobits |
447 | .align L1_CACHE_BYTES | 413 | .align L1_CACHE_BYTES |
448 | ENTRY(idt_table) | 414 | ENTRY(idt_table) |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 9b5cfcdfc426..ad2b15a1334d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -17,7 +17,7 @@ | |||
17 | 17 | ||
18 | /* FSEC = 10^-15 | 18 | /* FSEC = 10^-15 |
19 | NSEC = 10^-9 */ | 19 | NSEC = 10^-9 */ |
20 | #define FSEC_PER_NSEC 1000000 | 20 | #define FSEC_PER_NSEC 1000000L |
21 | 21 | ||
22 | /* | 22 | /* |
23 | * HPET address is set in acpi/boot.c, when an ACPI entry exists | 23 | * HPET address is set in acpi/boot.c, when an ACPI entry exists |
@@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a) | |||
36 | } | 36 | } |
37 | 37 | ||
38 | #ifdef CONFIG_X86_64 | 38 | #ifdef CONFIG_X86_64 |
39 | |||
40 | #include <asm/pgtable.h> | 39 | #include <asm/pgtable.h> |
41 | 40 | #endif | |
42 | static inline void hpet_set_mapping(void) | ||
43 | { | ||
44 | set_fixmap_nocache(FIX_HPET_BASE, hpet_address); | ||
45 | __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); | ||
46 | hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); | ||
47 | } | ||
48 | |||
49 | static inline void hpet_clear_mapping(void) | ||
50 | { | ||
51 | hpet_virt_address = NULL; | ||
52 | } | ||
53 | |||
54 | #else | ||
55 | 41 | ||
56 | static inline void hpet_set_mapping(void) | 42 | static inline void hpet_set_mapping(void) |
57 | { | 43 | { |
58 | hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); | 44 | hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); |
45 | #ifdef CONFIG_X86_64 | ||
46 | __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); | ||
47 | #endif | ||
59 | } | 48 | } |
60 | 49 | ||
61 | static inline void hpet_clear_mapping(void) | 50 | static inline void hpet_clear_mapping(void) |
@@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void) | |||
63 | iounmap(hpet_virt_address); | 52 | iounmap(hpet_virt_address); |
64 | hpet_virt_address = NULL; | 53 | hpet_virt_address = NULL; |
65 | } | 54 | } |
66 | #endif | ||
67 | 55 | ||
68 | /* | 56 | /* |
69 | * HPET command line enable / disable | 57 | * HPET command line enable / disable |
@@ -206,20 +194,19 @@ static void hpet_enable_legacy_int(void) | |||
206 | 194 | ||
207 | static void hpet_legacy_clockevent_register(void) | 195 | static void hpet_legacy_clockevent_register(void) |
208 | { | 196 | { |
209 | uint64_t hpet_freq; | ||
210 | |||
211 | /* Start HPET legacy interrupts */ | 197 | /* Start HPET legacy interrupts */ |
212 | hpet_enable_legacy_int(); | 198 | hpet_enable_legacy_int(); |
213 | 199 | ||
214 | /* | 200 | /* |
215 | * The period is a femto seconds value. We need to calculate the | 201 | * The mult factor is defined as (include/linux/clockchips.h) |
216 | * scaled math multiplication factor for nanosecond to hpet tick | 202 | * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h) |
217 | * conversion. | 203 | * hpet_period is in units of femtoseconds (per cycle), so |
204 | * mult/2^shift = cyc/ns = 10^6/hpet_period | ||
205 | * mult = (10^6 * 2^shift)/hpet_period | ||
206 | * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period | ||
218 | */ | 207 | */ |
219 | hpet_freq = 1000000000000000ULL; | 208 | hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC, |
220 | do_div(hpet_freq, hpet_period); | 209 | hpet_period, hpet_clockevent.shift); |
221 | hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, | ||
222 | NSEC_PER_SEC, hpet_clockevent.shift); | ||
223 | /* Calculate the min / max delta */ | 210 | /* Calculate the min / max delta */ |
224 | hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, | 211 | hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, |
225 | &hpet_clockevent); | 212 | &hpet_clockevent); |
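
Plugging in a typical value makes the new formula concrete. A 14.31818 MHz HPET has hpet_period of roughly 69841279 femtoseconds per cycle; with a clockevent shift of 32 (an assumption — the shift is not shown in this hunk), the computation is:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* mult = (FSEC_PER_NSEC << shift) / hpet_period */
	uint64_t fsec_per_nsec = 1000000;	/* 10^6 fsec in a nsec */
	uint64_t hpet_period = 69841279;	/* ~14.31818 MHz HPET, fsec/cyc */
	int shift = 32;				/* assumed clockevent shift */

	uint64_t mult = (fsec_per_nsec << shift) / hpet_period;
	printf("clockevent mult = %llu\n",	/* ~61.5 million */
	       (unsigned long long)mult);
	return 0;
}
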
@@ -324,7 +311,7 @@ static struct clocksource clocksource_hpet = { | |||
324 | 311 | ||
325 | static int hpet_clocksource_register(void) | 312 | static int hpet_clocksource_register(void) |
326 | { | 313 | { |
327 | u64 tmp, start, now; | 314 | u64 start, now; |
328 | cycle_t t1; | 315 | cycle_t t1; |
329 | 316 | ||
330 | /* Start the counter */ | 317 | /* Start the counter */ |
@@ -351,21 +338,15 @@ static int hpet_clocksource_register(void) | |||
351 | return -ENODEV; | 338 | return -ENODEV; |
352 | } | 339 | } |
353 | 340 | ||
354 | /* Initialize and register HPET clocksource | 341 | /* |
355 | * | 342 | * The definition of mult is (include/linux/clocksource.h) |
356 | * hpet period is in femto seconds per cycle | 343 | * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc |
357 | * so we need to convert this to ns/cyc units | 344 | * so we first need to convert hpet_period to ns/cyc units: |
358 | * approximated by mult/2^shift | 345 | * mult/2^shift = ns/cyc = hpet_period/10^6 |
359 | * | 346 | * mult = (hpet_period * 2^shift)/10^6 |
360 | * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift | 347 | * mult = (hpet_period << shift)/FSEC_PER_NSEC |
361 | * fsec/cyc * 1ns/1000000fsec * 2^shift = mult | ||
362 | * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult | ||
363 | * (fsec/cyc << shift)/1000000 = mult | ||
364 | * (hpet_period << shift)/FSEC_PER_NSEC = mult | ||
365 | */ | 348 | */ |
366 | tmp = (u64)hpet_period << HPET_SHIFT; | 349 | clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); |
367 | do_div(tmp, FSEC_PER_NSEC); | ||
368 | clocksource_hpet.mult = (u32)tmp; | ||
369 | 350 | ||
370 | clocksource_register(&clocksource_hpet); | 351 | clocksource_register(&clocksource_hpet); |
371 | 352 | ||
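
The clocksource direction is the reciprocal conversion (ns per cycle rather than cycles per ns), with its own shift; HPET_SHIFT is 22 in this file. Same sample period:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* mult = (hpet_period << HPET_SHIFT) / FSEC_PER_NSEC */
	uint64_t hpet_period = 69841279;	/* fsec per cycle, as above */
	uint64_t fsec_per_nsec = 1000000;
	int hpet_shift = 22;			/* HPET_SHIFT */

	uint64_t mult = (hpet_period << hpet_shift) / fsec_per_nsec;
	printf("clocksource mult = %llu\n",	/* ~293 million */
	       (unsigned long long)mult);
	return 0;
}
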
@@ -487,7 +468,7 @@ void hpet_disable(void) | |||
487 | #define RTC_NUM_INTS 1 | 468 | #define RTC_NUM_INTS 1 |
488 | 469 | ||
489 | static unsigned long hpet_rtc_flags; | 470 | static unsigned long hpet_rtc_flags; |
490 | static unsigned long hpet_prev_update_sec; | 471 | static int hpet_prev_update_sec; |
491 | static struct rtc_time hpet_alarm_time; | 472 | static struct rtc_time hpet_alarm_time; |
492 | static unsigned long hpet_pie_count; | 473 | static unsigned long hpet_pie_count; |
493 | static unsigned long hpet_t1_cmp; | 474 | static unsigned long hpet_t1_cmp; |
@@ -594,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) | |||
594 | 575 | ||
595 | hpet_rtc_flags |= bit_mask; | 576 | hpet_rtc_flags |= bit_mask; |
596 | 577 | ||
578 | if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) | ||
579 | hpet_prev_update_sec = -1; | ||
580 | |||
597 | if (!oldbits) | 581 | if (!oldbits) |
598 | hpet_rtc_timer_init(); | 582 | hpet_rtc_timer_init(); |
599 | 583 | ||
@@ -671,7 +655,7 @@ static void hpet_rtc_timer_reinit(void) | |||
671 | if (hpet_rtc_flags & RTC_PIE) | 655 | if (hpet_rtc_flags & RTC_PIE) |
672 | hpet_pie_count += lost_ints; | 656 | hpet_pie_count += lost_ints; |
673 | if (printk_ratelimit()) | 657 | if (printk_ratelimit()) |
674 | printk(KERN_WARNING "rtc: lost %d interrupts\n", | 658 | printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", |
675 | lost_ints); | 659 | lost_ints); |
676 | } | 660 | } |
677 | } | 661 | } |
@@ -689,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) | |||
689 | 673 | ||
690 | if (hpet_rtc_flags & RTC_UIE && | 674 | if (hpet_rtc_flags & RTC_UIE && |
691 | curr_time.tm_sec != hpet_prev_update_sec) { | 675 | curr_time.tm_sec != hpet_prev_update_sec) { |
692 | rtc_int_flag = RTC_UF; | 676 | if (hpet_prev_update_sec >= 0) |
677 | rtc_int_flag = RTC_UF; | ||
693 | hpet_prev_update_sec = curr_time.tm_sec; | 678 | hpet_prev_update_sec = curr_time.tm_sec; |
694 | } | 679 | } |
695 | 680 | ||
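
Making hpet_prev_update_sec a signed int allows -1 as a "just enabled" sentinel: the first interrupt after RTC_UIE is switched on only records the current second instead of reporting a spurious update. A compressed model of that state machine (names are illustrative):

#include <stdio.h>

static int prev_update_sec = -1;	/* set to -1 when UIE is enabled */

static int uie_tick(int cur_sec)
{
	int update = 0;

	if (cur_sec != prev_update_sec) {
		if (prev_update_sec >= 0)
			update = 1;	/* real second rollover */
		prev_update_sec = cur_sec;
	}
	return update;
}

int main(void)
{
	int secs[] = { 41, 41, 42, 43 };

	for (int i = 0; i < 4; i++)
		printf("sec %d -> update=%d\n", secs[i], uie_tick(secs[i]));
	return 0;	/* the first tick at 41 is swallowed; 42 and 43 report */
}
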
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index deb43785e923..dd7ebee446af 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c | |||
@@ -1,7 +1,14 @@ | |||
1 | #include <linux/module.h> | 1 | #include <linux/module.h> |
2 | |||
2 | #include <asm/checksum.h> | 3 | #include <asm/checksum.h> |
3 | #include <asm/desc.h> | ||
4 | #include <asm/pgtable.h> | 4 | #include <asm/pgtable.h> |
5 | #include <asm/desc.h> | ||
6 | #include <asm/ftrace.h> | ||
7 | |||
8 | #ifdef CONFIG_FTRACE | ||
9 | /* mcount is defined in assembly */ | ||
10 | EXPORT_SYMBOL(mcount); | ||
11 | #endif | ||
5 | 12 | ||
6 | /* Networking helper routines. */ | 13 | /* Networking helper routines. */ |
7 | EXPORT_SYMBOL(csum_partial_copy_generic); | 14 | EXPORT_SYMBOL(csum_partial_copy_generic); |
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259.c index fe631967d625..dc92b49d9204 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259.c | |||
@@ -1,8 +1,10 @@ | |||
1 | #include <linux/linkage.h> | ||
1 | #include <linux/errno.h> | 2 | #include <linux/errno.h> |
2 | #include <linux/signal.h> | 3 | #include <linux/signal.h> |
3 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
4 | #include <linux/ioport.h> | 5 | #include <linux/ioport.h> |
5 | #include <linux/interrupt.h> | 6 | #include <linux/interrupt.h> |
7 | #include <linux/timex.h> | ||
6 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
7 | #include <linux/random.h> | 9 | #include <linux/random.h> |
8 | #include <linux/init.h> | 10 | #include <linux/init.h> |
@@ -10,10 +12,12 @@ | |||
10 | #include <linux/sysdev.h> | 12 | #include <linux/sysdev.h> |
11 | #include <linux/bitops.h> | 13 | #include <linux/bitops.h> |
12 | 14 | ||
15 | #include <asm/acpi.h> | ||
13 | #include <asm/atomic.h> | 16 | #include <asm/atomic.h> |
14 | #include <asm/system.h> | 17 | #include <asm/system.h> |
15 | #include <asm/io.h> | 18 | #include <asm/io.h> |
16 | #include <asm/timer.h> | 19 | #include <asm/timer.h> |
20 | #include <asm/hw_irq.h> | ||
17 | #include <asm/pgtable.h> | 21 | #include <asm/pgtable.h> |
18 | #include <asm/delay.h> | 22 | #include <asm/delay.h> |
19 | #include <asm/desc.h> | 23 | #include <asm/desc.h> |
@@ -32,7 +36,7 @@ static int i8259A_auto_eoi; | |||
32 | DEFINE_SPINLOCK(i8259A_lock); | 36 | DEFINE_SPINLOCK(i8259A_lock); |
33 | static void mask_and_ack_8259A(unsigned int); | 37 | static void mask_and_ack_8259A(unsigned int); |
34 | 38 | ||
35 | static struct irq_chip i8259A_chip = { | 39 | struct irq_chip i8259A_chip = { |
36 | .name = "XT-PIC", | 40 | .name = "XT-PIC", |
37 | .mask = disable_8259A_irq, | 41 | .mask = disable_8259A_irq, |
38 | .disable = disable_8259A_irq, | 42 | .disable = disable_8259A_irq, |
@@ -125,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq) | |||
125 | int irqmask = 1<<irq; | 129 | int irqmask = 1<<irq; |
126 | 130 | ||
127 | if (irq < 8) { | 131 | if (irq < 8) { |
128 | outb(0x0B,PIC_MASTER_CMD); /* ISR register */ | 132 | outb(0x0B, PIC_MASTER_CMD); /* ISR register */ |
129 | value = inb(PIC_MASTER_CMD) & irqmask; | 133 | value = inb(PIC_MASTER_CMD) & irqmask; |
130 | outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ | 134 | outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */ |
131 | return value; | 135 | return value; |
132 | } | 136 | } |
133 | outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ | 137 | outb(0x0B, PIC_SLAVE_CMD); /* ISR register */ |
134 | value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); | 138 | value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); |
135 | outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ | 139 | outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ |
136 | return value; | 140 | return value; |
137 | } | 141 | } |
138 | 142 | ||
@@ -171,12 +175,14 @@ handle_real_irq: | |||
171 | if (irq & 8) { | 175 | if (irq & 8) { |
172 | inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ | 176 | inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ |
173 | outb(cached_slave_mask, PIC_SLAVE_IMR); | 177 | outb(cached_slave_mask, PIC_SLAVE_IMR); |
174 | outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ | 178 | /* 'Specific EOI' to slave */ |
175 | outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ | 179 | outb(0x60+(irq&7), PIC_SLAVE_CMD); |
180 | /* 'Specific EOI' to master-IRQ2 */ | ||
181 | outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); | ||
176 | } else { | 182 | } else { |
177 | inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ | 183 | inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ |
178 | outb(cached_master_mask, PIC_MASTER_IMR); | 184 | outb(cached_master_mask, PIC_MASTER_IMR); |
179 | outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ | 185 | outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI' to master */
180 | } | 186 | } |
181 | spin_unlock_irqrestore(&i8259A_lock, flags); | 187 | spin_unlock_irqrestore(&i8259A_lock, flags); |
182 | return; | 188 | return; |
@@ -199,7 +205,8 @@ spurious_8259A_irq: | |||
199 | * lets ACK and report it. [once per IRQ] | 205 | * lets ACK and report it. [once per IRQ] |
200 | */ | 206 | */ |
201 | if (!(spurious_irq_mask & irqmask)) { | 207 | if (!(spurious_irq_mask & irqmask)) { |
202 | printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); | 208 | printk(KERN_DEBUG |
209 | "spurious 8259A interrupt: IRQ%d.\n", irq); | ||
203 | spurious_irq_mask |= irqmask; | 210 | spurious_irq_mask |= irqmask; |
204 | } | 211 | } |
205 | atomic_inc(&irq_err_count); | 212 | atomic_inc(&irq_err_count); |
@@ -290,17 +297,28 @@ void init_8259A(int auto_eoi) | |||
290 | * outb_pic - this has to work on a wide range of PC hardware. | 297 | * outb_pic - this has to work on a wide range of PC hardware. |
291 | */ | 298 | */ |
292 | outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ | 299 | outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ |
293 | outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ | 300 | |
294 | outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ | 301 | /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64, |
302 | to 0x20-0x27 on i386 */ | ||
303 | outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); | ||
304 | |||
305 | /* 8259A-1 (the master) has a slave on IR2 */ | ||
306 | outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); | ||
307 | |||
295 | if (auto_eoi) /* master does Auto EOI */ | 308 | if (auto_eoi) /* master does Auto EOI */ |
296 | outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); | 309 | outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); |
297 | else /* master expects normal EOI */ | 310 | else /* master expects normal EOI */ |
298 | outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); | 311 | outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); |
299 | 312 | ||
300 | outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ | 313 | outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ |
301 | outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ | 314 | |
302 | outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ | 315 | /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ |
303 | outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ | 316 | outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); |
317 | /* 8259A-2 is a slave on master's IR2 */ | ||
318 | outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); | ||
319 | /* (slave's support for AEOI in flat mode is to be investigated) */ | ||
320 | outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); | ||
321 | |||
304 | if (auto_eoi) | 322 | if (auto_eoi) |
305 | /* | 323 | /* |
306 | * In AEOI mode we just have to mask the interrupt | 324 | * In AEOI mode we just have to mask the interrupt |
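
With the two copies merged, the only 32/64-bit difference left in the ICW sequence is the vector base written in ICW2 (0x20 on i386, 0x30 on x86-64 in this era). A sketch of the shared sequence; port numbers are the standard PC values, and the ICW4 byte of 0x01 (8086 mode) stands in for MASTER/SLAVE_ICW4_DEFAULT:

#include <stdio.h>

static void pic_remap(unsigned char base)
{
	struct { unsigned port, val; const char *what; } seq[] = {
		{ 0x20, 0x11,     "ICW1: master init" },
		{ 0x21, base,     "ICW2: master vector base (IRQ0_VECTOR)" },
		{ 0x21, 1u << 2,  "ICW3: slave on IR2" },
		{ 0x21, 0x01,     "ICW4: 8086 mode" },
		{ 0xa0, 0x11,     "ICW1: slave init" },
		{ 0xa1, base + 8, "ICW2: slave vector base (IRQ8_VECTOR)" },
		{ 0xa1, 0x02,     "ICW3: cascade identity" },
		{ 0xa1, 0x01,     "ICW4: 8086 mode" },
	};

	for (unsigned i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
		printf("outb(0x%02x, 0x%02x)  /* %s */\n",
		       seq[i].val, seq[i].port, seq[i].what);
}

int main(void)
{
	pic_remap(0x30);	/* the x86-64 layout */
	return 0;
}
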
@@ -317,93 +335,3 @@ void init_8259A(int auto_eoi) | |||
317 | 335 | ||
318 | spin_unlock_irqrestore(&i8259A_lock, flags); | 336 | spin_unlock_irqrestore(&i8259A_lock, flags); |
319 | } | 337 | } |
320 | |||
321 | /* | ||
322 | * Note that on a 486, we don't want to do a SIGFPE on an irq13 | ||
323 | * as the irq is unreliable, and exception 16 works correctly | ||
324 | * (ie as explained in the intel literature). On a 386, you | ||
325 | * can't use exception 16 due to bad IBM design, so we have to | ||
326 | * rely on the less exact irq13. | ||
327 | * | ||
328 | * Careful.. Not only is IRQ13 unreliable, but it also | ||
329 | * leads to races. IBM designers who came up with it should | ||
330 | * be shot. | ||
331 | */ | ||
332 | |||
333 | |||
334 | static irqreturn_t math_error_irq(int cpl, void *dev_id) | ||
335 | { | ||
336 | extern void math_error(void __user *); | ||
337 | outb(0,0xF0); | ||
338 | if (ignore_fpu_irq || !boot_cpu_data.hard_math) | ||
339 | return IRQ_NONE; | ||
340 | math_error((void __user *)get_irq_regs()->ip); | ||
341 | return IRQ_HANDLED; | ||
342 | } | ||
343 | |||
344 | /* | ||
345 | * New motherboards sometimes make IRQ 13 be a PCI interrupt, | ||
346 | * so allow interrupt sharing. | ||
347 | */ | ||
348 | static struct irqaction fpu_irq = { | ||
349 | .handler = math_error_irq, | ||
350 | .mask = CPU_MASK_NONE, | ||
351 | .name = "fpu", | ||
352 | }; | ||
353 | |||
354 | void __init init_ISA_irqs (void) | ||
355 | { | ||
356 | int i; | ||
357 | |||
358 | #ifdef CONFIG_X86_LOCAL_APIC | ||
359 | init_bsp_APIC(); | ||
360 | #endif | ||
361 | init_8259A(0); | ||
362 | |||
363 | /* | ||
364 | * 16 old-style INTA-cycle interrupts: | ||
365 | */ | ||
366 | for (i = 0; i < 16; i++) { | ||
367 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
368 | handle_level_irq, "XT"); | ||
369 | } | ||
370 | } | ||
371 | |||
372 | /* Overridden in paravirt.c */ | ||
373 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
374 | |||
375 | void __init native_init_IRQ(void) | ||
376 | { | ||
377 | int i; | ||
378 | |||
379 | /* all the set up before the call gates are initialised */ | ||
380 | pre_intr_init_hook(); | ||
381 | |||
382 | /* | ||
383 | * Cover the whole vector space, no vector can escape | ||
384 | * us. (some of these will be overridden and become | ||
385 | * 'special' SMP interrupts) | ||
386 | */ | ||
387 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
388 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
389 | if (i >= NR_IRQS) | ||
390 | break; | ||
391 | /* SYSCALL_VECTOR was reserved in trap_init. */ | ||
392 | if (!test_bit(vector, used_vectors)) | ||
393 | set_intr_gate(vector, interrupt[i]); | ||
394 | } | ||
395 | |||
396 | /* setup after call gates are initialised (usually add in | ||
397 | * the architecture specific gates) | ||
398 | */ | ||
399 | intr_init_hook(); | ||
400 | |||
401 | /* | ||
402 | * External FPU? Set up irq13 if so, for | ||
403 | * original braindamaged IBM FERR coupling. | ||
404 | */ | ||
405 | if (boot_cpu_data.hard_math && !cpu_has_fpu) | ||
406 | setup_irq(FPU_IRQ, &fpu_irq); | ||
407 | |||
408 | irq_ctx_init(smp_processor_id()); | ||
409 | } | ||
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c deleted file mode 100644 index fa57a1568508..000000000000 --- a/arch/x86/kernel/i8259_64.c +++ /dev/null | |||
@@ -1,512 +0,0 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/signal.h> | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/interrupt.h> | ||
7 | #include <linux/timex.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/random.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/kernel_stat.h> | ||
12 | #include <linux/sysdev.h> | ||
13 | #include <linux/bitops.h> | ||
14 | |||
15 | #include <asm/acpi.h> | ||
16 | #include <asm/atomic.h> | ||
17 | #include <asm/system.h> | ||
18 | #include <asm/io.h> | ||
19 | #include <asm/hw_irq.h> | ||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/delay.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/apic.h> | ||
24 | #include <asm/i8259.h> | ||
25 | |||
26 | /* | ||
27 | * Common place to define all x86 IRQ vectors | ||
28 | * | ||
29 | * This builds up the IRQ handler stubs using some ugly macros in irq.h | ||
30 | * | ||
31 | * These macros create the low-level assembly IRQ routines that save | ||
32 | * register context and call do_IRQ(). do_IRQ() then does all the | ||
33 | * operations that are needed to keep the AT (or SMP IOAPIC) | ||
34 | * interrupt-controller happy. | ||
35 | */ | ||
36 | |||
37 | #define BI(x,y) \ | ||
38 | BUILD_IRQ(x##y) | ||
39 | |||
40 | #define BUILD_16_IRQS(x) \ | ||
41 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
42 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
43 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
44 | BI(x,c) BI(x,d) BI(x,e) BI(x,f) | ||
45 | |||
46 | /* | ||
47 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
48 | * (these are usually mapped to vectors 0x30-0x3f) | ||
49 | */ | ||
50 | |||
51 | /* | ||
52 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
53 | * are unused but an SMP system is supposed to have enough memory ... | ||
54 | * sometimes (mostly wrt. hw bugs) we get corrupted vectors all | ||
55 | * across the spectrum, so we really want to be prepared to get all | ||
56 | * of these. Plus, more powerful systems might have more than 64 | ||
57 | * IO-APIC registers. | ||
58 | * | ||
59 | * (these are usually mapped into the 0x30-0xff vector range) | ||
60 | */ | ||
61 | BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) | ||
62 | BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) | ||
63 | BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) | ||
64 | BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) | ||
65 | |||
66 | #undef BUILD_16_IRQS | ||
67 | #undef BI | ||
68 | |||
69 | |||
70 | #define IRQ(x,y) \ | ||
71 | IRQ##x##y##_interrupt | ||
72 | |||
73 | #define IRQLIST_16(x) \ | ||
74 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
75 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
76 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
77 | IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) | ||
78 | |||
79 | /* for the irq vectors */ | ||
80 | static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { | ||
81 | IRQLIST_16(0x2), IRQLIST_16(0x3), | ||
82 | IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), | ||
83 | IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), | ||
84 | IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) | ||
85 | }; | ||
86 | |||
87 | #undef IRQ | ||
88 | #undef IRQLIST_16 | ||
89 | |||
90 | /* | ||
91 | * This is the 'legacy' 8259A Programmable Interrupt Controller, | ||
92 | * present in the majority of PC/AT boxes. | ||
93 | * plus some generic x86 specific things if generic specifics makes | ||
94 | * any sense at all. | ||
95 | * this file should become arch/i386/kernel/irq.c when the old irq.c | ||
96 | * moves to arch independent land | ||
97 | */ | ||
98 | |||
99 | static int i8259A_auto_eoi; | ||
100 | DEFINE_SPINLOCK(i8259A_lock); | ||
101 | static void mask_and_ack_8259A(unsigned int); | ||
102 | |||
103 | static struct irq_chip i8259A_chip = { | ||
104 | .name = "XT-PIC", | ||
105 | .mask = disable_8259A_irq, | ||
106 | .disable = disable_8259A_irq, | ||
107 | .unmask = enable_8259A_irq, | ||
108 | .mask_ack = mask_and_ack_8259A, | ||
109 | }; | ||
110 | |||
111 | /* | ||
112 | * 8259A PIC functions to handle ISA devices: | ||
113 | */ | ||
114 | |||
115 | /* | ||
116 | * This contains the irq mask for both 8259A irq controllers, | ||
117 | */ | ||
118 | unsigned int cached_irq_mask = 0xffff; | ||
119 | |||
120 | /* | ||
121 | * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) | ||
122 | * boards the timer interrupt is not really connected to any IO-APIC pin, | ||
123 | * it's fed to the master 8259A's IR0 line only. | ||
124 | * | ||
125 | * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. | ||
126 | * this 'mixed mode' IRQ handling costs nothing because it's only used | ||
127 | * at IRQ setup time. | ||
128 | */ | ||
129 | unsigned long io_apic_irqs; | ||
130 | |||
131 | void disable_8259A_irq(unsigned int irq) | ||
132 | { | ||
133 | unsigned int mask = 1 << irq; | ||
134 | unsigned long flags; | ||
135 | |||
136 | spin_lock_irqsave(&i8259A_lock, flags); | ||
137 | cached_irq_mask |= mask; | ||
138 | if (irq & 8) | ||
139 | outb(cached_slave_mask, PIC_SLAVE_IMR); | ||
140 | else | ||
141 | outb(cached_master_mask, PIC_MASTER_IMR); | ||
142 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
143 | } | ||
144 | |||
145 | void enable_8259A_irq(unsigned int irq) | ||
146 | { | ||
147 | unsigned int mask = ~(1 << irq); | ||
148 | unsigned long flags; | ||
149 | |||
150 | spin_lock_irqsave(&i8259A_lock, flags); | ||
151 | cached_irq_mask &= mask; | ||
152 | if (irq & 8) | ||
153 | outb(cached_slave_mask, PIC_SLAVE_IMR); | ||
154 | else | ||
155 | outb(cached_master_mask, PIC_MASTER_IMR); | ||
156 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
157 | } | ||
158 | |||
159 | int i8259A_irq_pending(unsigned int irq) | ||
160 | { | ||
161 | unsigned int mask = 1<<irq; | ||
162 | unsigned long flags; | ||
163 | int ret; | ||
164 | |||
165 | spin_lock_irqsave(&i8259A_lock, flags); | ||
166 | if (irq < 8) | ||
167 | ret = inb(PIC_MASTER_CMD) & mask; | ||
168 | else | ||
169 | ret = inb(PIC_SLAVE_CMD) & (mask >> 8); | ||
170 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
171 | |||
172 | return ret; | ||
173 | } | ||
174 | |||
175 | void make_8259A_irq(unsigned int irq) | ||
176 | { | ||
177 | disable_irq_nosync(irq); | ||
178 | io_apic_irqs &= ~(1<<irq); | ||
179 | set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, | ||
180 | "XT"); | ||
181 | enable_irq(irq); | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * This function assumes to be called rarely. Switching between | ||
186 | * 8259A registers is slow. | ||
187 | * This has to be protected by the irq controller spinlock | ||
188 | * before being called. | ||
189 | */ | ||
190 | static inline int i8259A_irq_real(unsigned int irq) | ||
191 | { | ||
192 | int value; | ||
193 | int irqmask = 1<<irq; | ||
194 | |||
195 | if (irq < 8) { | ||
196 | outb(0x0B,PIC_MASTER_CMD); /* ISR register */ | ||
197 | value = inb(PIC_MASTER_CMD) & irqmask; | ||
198 | outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ | ||
199 | return value; | ||
200 | } | ||
201 | outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ | ||
202 | value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); | ||
203 | outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ | ||
204 | return value; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * Careful! The 8259A is a fragile beast, it pretty | ||
209 | * much _has_ to be done exactly like this (mask it | ||
210 | * first, _then_ send the EOI, and the order of EOI | ||
211 | * to the two 8259s is important!) | ||
212 | */ | ||
213 | static void mask_and_ack_8259A(unsigned int irq) | ||
214 | { | ||
215 | unsigned int irqmask = 1 << irq; | ||
216 | unsigned long flags; | ||
217 | |||
218 | spin_lock_irqsave(&i8259A_lock, flags); | ||
219 | /* | ||
220 | * Lightweight spurious IRQ detection. We do not want | ||
221 | * to overdo spurious IRQ handling - it's usually a sign | ||
222 | * of hardware problems, so we only do the checks we can | ||
223 | * do without slowing down good hardware unnecessarily. | ||
224 | * | ||
225 | * Note that IRQ7 and IRQ15 (the two spurious IRQs | ||
226 | * usually resulting from the 8259A-1|2 PICs) occur | ||
227 | * even if the IRQ is masked in the 8259A. Thus we | ||
228 | * can check spurious 8259A IRQs without doing the | ||
229 | * quite slow i8259A_irq_real() call for every IRQ. | ||
230 | * This does not cover 100% of spurious interrupts, | ||
231 | * but should be enough to warn the user that there | ||
232 | * is something bad going on ... | ||
233 | */ | ||
234 | if (cached_irq_mask & irqmask) | ||
235 | goto spurious_8259A_irq; | ||
236 | cached_irq_mask |= irqmask; | ||
237 | |||
238 | handle_real_irq: | ||
239 | if (irq & 8) { | ||
240 | inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ | ||
241 | outb(cached_slave_mask, PIC_SLAVE_IMR); | ||
242 | /* 'Specific EOI' to slave */ | ||
243 | outb(0x60+(irq&7),PIC_SLAVE_CMD); | ||
244 | /* 'Specific EOI' to master-IRQ2 */ | ||
245 | outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); | ||
246 | } else { | ||
247 | inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ | ||
248 | outb(cached_master_mask, PIC_MASTER_IMR); | ||
249 | /* 'Specific EOI' to master */ | ||
250 | outb(0x60+irq,PIC_MASTER_CMD); | ||
251 | } | ||
252 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
253 | return; | ||
254 | |||
255 | spurious_8259A_irq: | ||
256 | /* | ||
257 | * this is the slow path - should happen rarely. | ||
258 | */ | ||
259 | if (i8259A_irq_real(irq)) | ||
260 | /* | ||
261 | * oops, the IRQ _is_ in service according to the | ||
262 | * 8259A - not spurious, go handle it. | ||
263 | */ | ||
264 | goto handle_real_irq; | ||
265 | |||
266 | { | ||
267 | static int spurious_irq_mask; | ||
268 | /* | ||
269 | * At this point we can be sure the IRQ is spurious, | ||
270 | * lets ACK and report it. [once per IRQ] | ||
271 | */ | ||
272 | if (!(spurious_irq_mask & irqmask)) { | ||
273 | printk(KERN_DEBUG | ||
274 | "spurious 8259A interrupt: IRQ%d.\n", irq); | ||
275 | spurious_irq_mask |= irqmask; | ||
276 | } | ||
277 | atomic_inc(&irq_err_count); | ||
278 | /* | ||
279 | * Theoretically we do not have to handle this IRQ, | ||
280 | * but in Linux this does not cause problems and is | ||
281 | * simpler for us. | ||
282 | */ | ||
283 | goto handle_real_irq; | ||
284 | } | ||
285 | } | ||
286 | |||
287 | static char irq_trigger[2]; | ||
288 | /** | ||
289 | * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ | ||
290 | */ | ||
291 | static void restore_ELCR(char *trigger) | ||
292 | { | ||
293 | outb(trigger[0], 0x4d0); | ||
294 | outb(trigger[1], 0x4d1); | ||
295 | } | ||
296 | |||
297 | static void save_ELCR(char *trigger) | ||
298 | { | ||
299 | /* IRQ 0,1,2,8,13 are marked as reserved */ | ||
300 | trigger[0] = inb(0x4d0) & 0xF8; | ||
301 | trigger[1] = inb(0x4d1) & 0xDE; | ||
302 | } | ||
303 | |||
304 | static int i8259A_resume(struct sys_device *dev) | ||
305 | { | ||
306 | init_8259A(i8259A_auto_eoi); | ||
307 | restore_ELCR(irq_trigger); | ||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | static int i8259A_suspend(struct sys_device *dev, pm_message_t state) | ||
312 | { | ||
313 | save_ELCR(irq_trigger); | ||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | static int i8259A_shutdown(struct sys_device *dev) | ||
318 | { | ||
319 | /* Put the i8259A into a quiescent state that | ||
320 | * the kernel initialization code can get it | ||
321 | * out of. | ||
322 | */ | ||
323 | outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ | ||
324 | outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | static struct sysdev_class i8259_sysdev_class = { | ||
329 | .name = "i8259", | ||
330 | .suspend = i8259A_suspend, | ||
331 | .resume = i8259A_resume, | ||
332 | .shutdown = i8259A_shutdown, | ||
333 | }; | ||
334 | |||
335 | static struct sys_device device_i8259A = { | ||
336 | .id = 0, | ||
337 | .cls = &i8259_sysdev_class, | ||
338 | }; | ||
339 | |||
340 | static int __init i8259A_init_sysfs(void) | ||
341 | { | ||
342 | int error = sysdev_class_register(&i8259_sysdev_class); | ||
343 | if (!error) | ||
344 | error = sysdev_register(&device_i8259A); | ||
345 | return error; | ||
346 | } | ||
347 | |||
348 | device_initcall(i8259A_init_sysfs); | ||
349 | |||
350 | void init_8259A(int auto_eoi) | ||
351 | { | ||
352 | unsigned long flags; | ||
353 | |||
354 | i8259A_auto_eoi = auto_eoi; | ||
355 | |||
356 | spin_lock_irqsave(&i8259A_lock, flags); | ||
357 | |||
358 | outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ | ||
359 | outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ | ||
360 | |||
361 | /* | ||
362 | * outb_pic - this has to work on a wide range of PC hardware. | ||
363 | */ | ||
364 | outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ | ||
365 | /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ | ||
366 | outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); | ||
367 | /* 8259A-1 (the master) has a slave on IR2 */ | ||
368 | outb_pic(0x04, PIC_MASTER_IMR); | ||
369 | if (auto_eoi) /* master does Auto EOI */ | ||
370 | outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); | ||
371 | else /* master expects normal EOI */ | ||
372 | outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); | ||
373 | |||
374 | outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ | ||
375 | /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ | ||
376 | outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); | ||
377 | /* 8259A-2 is a slave on master's IR2 */ | ||
378 | outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); | ||
379 | /* (slave's support for AEOI in flat mode is to be investigated) */ | ||
380 | outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); | ||
381 | |||
382 | if (auto_eoi) | ||
383 | /* | ||
384 | * In AEOI mode we just have to mask the interrupt | ||
385 | * when acking. | ||
386 | */ | ||
387 | i8259A_chip.mask_ack = disable_8259A_irq; | ||
388 | else | ||
389 | i8259A_chip.mask_ack = mask_and_ack_8259A; | ||
390 | |||
391 | udelay(100); /* wait for 8259A to initialize */ | ||
392 | |||
393 | outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ | ||
394 | outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ | ||
395 | |||
396 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
397 | } | ||
398 | |||
399 | |||
400 | |||
401 | |||
402 | /* | ||
403 | * IRQ2 is cascade interrupt to second interrupt controller | ||
404 | */ | ||
405 | |||
406 | static struct irqaction irq2 = { | ||
407 | .handler = no_action, | ||
408 | .mask = CPU_MASK_NONE, | ||
409 | .name = "cascade", | ||
410 | }; | ||
411 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | ||
412 | [0 ... IRQ0_VECTOR - 1] = -1, | ||
413 | [IRQ0_VECTOR] = 0, | ||
414 | [IRQ1_VECTOR] = 1, | ||
415 | [IRQ2_VECTOR] = 2, | ||
416 | [IRQ3_VECTOR] = 3, | ||
417 | [IRQ4_VECTOR] = 4, | ||
418 | [IRQ5_VECTOR] = 5, | ||
419 | [IRQ6_VECTOR] = 6, | ||
420 | [IRQ7_VECTOR] = 7, | ||
421 | [IRQ8_VECTOR] = 8, | ||
422 | [IRQ9_VECTOR] = 9, | ||
423 | [IRQ10_VECTOR] = 10, | ||
424 | [IRQ11_VECTOR] = 11, | ||
425 | [IRQ12_VECTOR] = 12, | ||
426 | [IRQ13_VECTOR] = 13, | ||
427 | [IRQ14_VECTOR] = 14, | ||
428 | [IRQ15_VECTOR] = 15, | ||
429 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | ||
430 | }; | ||
431 | |||
432 | void __init init_ISA_irqs (void) | ||
433 | { | ||
434 | int i; | ||
435 | |||
436 | init_bsp_APIC(); | ||
437 | init_8259A(0); | ||
438 | |||
439 | for (i = 0; i < NR_IRQS; i++) { | ||
440 | irq_desc[i].status = IRQ_DISABLED; | ||
441 | irq_desc[i].action = NULL; | ||
442 | irq_desc[i].depth = 1; | ||
443 | |||
444 | if (i < 16) { | ||
445 | /* | ||
446 | * 16 old-style INTA-cycle interrupts: | ||
447 | */ | ||
448 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
449 | handle_level_irq, "XT"); | ||
450 | } else { | ||
451 | /* | ||
452 | * 'high' PCI IRQs filled in on demand | ||
453 | */ | ||
454 | irq_desc[i].chip = &no_irq_chip; | ||
455 | } | ||
456 | } | ||
457 | } | ||
458 | |||
459 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
460 | |||
461 | void __init native_init_IRQ(void) | ||
462 | { | ||
463 | int i; | ||
464 | |||
465 | init_ISA_irqs(); | ||
466 | /* | ||
467 | * Cover the whole vector space, no vector can escape | ||
468 | * us. (some of these will be overridden and become | ||
469 | * 'special' SMP interrupts) | ||
470 | */ | ||
471 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
472 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
473 | if (vector != IA32_SYSCALL_VECTOR) | ||
474 | set_intr_gate(vector, interrupt[i]); | ||
475 | } | ||
476 | |||
477 | #ifdef CONFIG_SMP | ||
478 | /* | ||
479 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
480 | * IPI, driven by wakeup. | ||
481 | */ | ||
482 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
483 | |||
484 | /* IPIs for invalidation */ | ||
485 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); | ||
486 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); | ||
487 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); | ||
488 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); | ||
489 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); | ||
490 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); | ||
491 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); | ||
492 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); | ||
493 | |||
494 | /* IPI for generic function call */ | ||
495 | set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
496 | |||
497 | /* Low priority IPI to cleanup after moving an irq */ | ||
498 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | ||
499 | #endif | ||
500 | set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
501 | set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | ||
502 | |||
503 | /* self generated IPI for local APIC timer */ | ||
504 | set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
505 | |||
506 | /* IPI vectors for APIC spurious and error interrupts */ | ||
507 | set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
508 | set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
509 | |||
510 | if (!acpi_ioapic) | ||
511 | setup_irq(2, &irq2); | ||
512 | } | ||
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 4dc8600d9d20..de9aa0e3a9c5 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/init.h> | 25 | #include <linux/init.h> |
26 | #include <linux/delay.h> | 26 | #include <linux/delay.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/bootmem.h> | ||
28 | #include <linux/mc146818rtc.h> | 29 | #include <linux/mc146818rtc.h> |
29 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> |
30 | #include <linux/acpi.h> | 31 | #include <linux/acpi.h> |
@@ -58,7 +59,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | |||
58 | static DEFINE_SPINLOCK(ioapic_lock); | 59 | static DEFINE_SPINLOCK(ioapic_lock); |
59 | static DEFINE_SPINLOCK(vector_lock); | 60 | static DEFINE_SPINLOCK(vector_lock); |
60 | 61 | ||
61 | int timer_over_8254 __initdata = 1; | 62 | int timer_through_8259 __initdata; |
62 | 63 | ||
63 | /* | 64 | /* |
64 | * Is the SiS APIC rmw bug present ? | 65 | * Is the SiS APIC rmw bug present ? |
@@ -72,15 +73,21 @@ int sis_apic_bug = -1; | |||
72 | int nr_ioapic_registers[MAX_IO_APICS]; | 73 | int nr_ioapic_registers[MAX_IO_APICS]; |
73 | 74 | ||
74 | /* I/O APIC entries */ | 75 | /* I/O APIC entries */ |
75 | struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | 76 | struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; |
76 | int nr_ioapics; | 77 | int nr_ioapics; |
77 | 78 | ||
78 | /* MP IRQ source entries */ | 79 | /* MP IRQ source entries */ |
79 | struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | 80 | struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; |
80 | 81 | ||
81 | /* # of MP IRQ source entries */ | 82 | /* # of MP IRQ source entries */ |
82 | int mp_irq_entries; | 83 | int mp_irq_entries; |
83 | 84 | ||
85 | #if defined (CONFIG_MCA) || defined (CONFIG_EISA) | ||
86 | int mp_bus_id_to_type[MAX_MP_BUSSES]; | ||
87 | #endif | ||
88 | |||
89 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | ||
90 | |||
84 | static int disable_timer_pin_1 __initdata; | 91 | static int disable_timer_pin_1 __initdata; |
85 | 92 | ||
86 | /* | 93 | /* |
@@ -110,7 +117,7 @@ struct io_apic { | |||
110 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) | 117 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) |
111 | { | 118 | { |
112 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) | 119 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) |
113 | + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); | 120 | + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); |
114 | } | 121 | } |
115 | 122 | ||
116 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) | 123 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) |
@@ -239,7 +246,7 @@ static void __init replace_pin_at_irq(unsigned int irq, | |||
239 | } | 246 | } |
240 | } | 247 | } |
241 | 248 | ||
242 | static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) | 249 | static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) |
243 | { | 250 | { |
244 | struct irq_pin_list *entry = irq_2_pin + irq; | 251 | struct irq_pin_list *entry = irq_2_pin + irq; |
245 | unsigned int pin, reg; | 252 | unsigned int pin, reg; |
@@ -259,30 +266,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign | |||
259 | } | 266 | } |
260 | 267 | ||
261 | /* mask = 1 */ | 268 | /* mask = 1 */ |
262 | static void __mask_IO_APIC_irq (unsigned int irq) | 269 | static void __mask_IO_APIC_irq(unsigned int irq) |
263 | { | 270 | { |
264 | __modify_IO_APIC_irq(irq, 0x00010000, 0); | 271 | __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); |
265 | } | 272 | } |
266 | 273 | ||
267 | /* mask = 0 */ | 274 | /* mask = 0 */ |
268 | static void __unmask_IO_APIC_irq (unsigned int irq) | 275 | static void __unmask_IO_APIC_irq(unsigned int irq) |
269 | { | 276 | { |
270 | __modify_IO_APIC_irq(irq, 0, 0x00010000); | 277 | __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); |
271 | } | 278 | } |
272 | 279 | ||
273 | /* mask = 1, trigger = 0 */ | 280 | /* mask = 1, trigger = 0 */ |
274 | static void __mask_and_edge_IO_APIC_irq (unsigned int irq) | 281 | static void __mask_and_edge_IO_APIC_irq(unsigned int irq) |
275 | { | 282 | { |
276 | __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); | 283 | __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, |
284 | IO_APIC_REDIR_LEVEL_TRIGGER); | ||
277 | } | 285 | } |
278 | 286 | ||
279 | /* mask = 0, trigger = 1 */ | 287 | /* mask = 0, trigger = 1 */ |
280 | static void __unmask_and_level_IO_APIC_irq (unsigned int irq) | 288 | static void __unmask_and_level_IO_APIC_irq(unsigned int irq) |
281 | { | 289 | { |
282 | __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); | 290 | __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, |
291 | IO_APIC_REDIR_MASKED); | ||
283 | } | 292 | } |
284 | 293 | ||
285 | static void mask_IO_APIC_irq (unsigned int irq) | 294 | static void mask_IO_APIC_irq(unsigned int irq) |
286 | { | 295 | { |
287 | unsigned long flags; | 296 | unsigned long flags; |
288 | 297 | ||
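The rewritten helpers above replace the bare 0x00010000/0x00008000 literals with named redirection-entry bits. A stand-alone sketch of the same read-modify-write pattern, mirroring the enable/disable semantics of __modify_IO_APIC_irq() and using the bit values carried over from the removed literals:

#include <stdio.h>

#define IO_APIC_REDIR_MASKED            0x00010000      /* bit 16: mask */
#define IO_APIC_REDIR_LEVEL_TRIGGER     0x00008000      /* bit 15: level */

/* clear the 'disable' bits, then set the 'enable' bits */
static unsigned long modify(unsigned long reg, unsigned long enable,
                            unsigned long disable)
{
        reg &= ~disable;
        reg |= enable;
        return reg;
}

int main(void)
{
        unsigned long reg = 0;

        reg = modify(reg, IO_APIC_REDIR_MASKED, 0);             /* mask */
        printf("masked:         %#010lx\n", reg);
        reg = modify(reg, IO_APIC_REDIR_LEVEL_TRIGGER,
                     IO_APIC_REDIR_MASKED);             /* unmask + level */
        printf("unmasked+level: %#010lx\n", reg);
        return 0;
}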
@@ -291,7 +300,7 @@ static void mask_IO_APIC_irq (unsigned int irq) | |||
291 | spin_unlock_irqrestore(&ioapic_lock, flags); | 300 | spin_unlock_irqrestore(&ioapic_lock, flags); |
292 | } | 301 | } |
293 | 302 | ||
294 | static void unmask_IO_APIC_irq (unsigned int irq) | 303 | static void unmask_IO_APIC_irq(unsigned int irq) |
295 | { | 304 | { |
296 | unsigned long flags; | 305 | unsigned long flags; |
297 | 306 | ||
@@ -303,7 +312,7 @@ static void unmask_IO_APIC_irq (unsigned int irq) | |||
303 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | 312 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) |
304 | { | 313 | { |
305 | struct IO_APIC_route_entry entry; | 314 | struct IO_APIC_route_entry entry; |
306 | 315 | ||
307 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ | 316 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ |
308 | entry = ioapic_read_entry(apic, pin); | 317 | entry = ioapic_read_entry(apic, pin); |
309 | if (entry.delivery_mode == dest_SMI) | 318 | if (entry.delivery_mode == dest_SMI) |
@@ -315,7 +324,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | |||
315 | ioapic_mask_entry(apic, pin); | 324 | ioapic_mask_entry(apic, pin); |
316 | } | 325 | } |
317 | 326 | ||
318 | static void clear_IO_APIC (void) | 327 | static void clear_IO_APIC(void) |
319 | { | 328 | { |
320 | int apic, pin; | 329 | int apic, pin; |
321 | 330 | ||
@@ -332,7 +341,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) | |||
332 | struct irq_pin_list *entry = irq_2_pin + irq; | 341 | struct irq_pin_list *entry = irq_2_pin + irq; |
333 | unsigned int apicid_value; | 342 | unsigned int apicid_value; |
334 | cpumask_t tmp; | 343 | cpumask_t tmp; |
335 | 344 | ||
336 | cpus_and(tmp, cpumask, cpu_online_map); | 345 | cpus_and(tmp, cpumask, cpu_online_map); |
337 | if (cpus_empty(tmp)) | 346 | if (cpus_empty(tmp)) |
338 | tmp = TARGET_CPUS; | 347 | tmp = TARGET_CPUS; |
@@ -361,7 +370,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) | |||
361 | # include <linux/kernel_stat.h> /* kstat */ | 370 | # include <linux/kernel_stat.h> /* kstat */ |
362 | # include <linux/slab.h> /* kmalloc() */ | 371 | # include <linux/slab.h> /* kmalloc() */ |
363 | # include <linux/timer.h> | 372 | # include <linux/timer.h> |
364 | 373 | ||
365 | #define IRQBALANCE_CHECK_ARCH -999 | 374 | #define IRQBALANCE_CHECK_ARCH -999 |
366 | #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) | 375 | #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) |
367 | #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) | 376 | #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) |
@@ -373,14 +382,14 @@ static int physical_balance __read_mostly; | |||
373 | static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; | 382 | static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; |
374 | 383 | ||
375 | static struct irq_cpu_info { | 384 | static struct irq_cpu_info { |
376 | unsigned long * last_irq; | 385 | unsigned long *last_irq; |
377 | unsigned long * irq_delta; | 386 | unsigned long *irq_delta; |
378 | unsigned long irq; | 387 | unsigned long irq; |
379 | } irq_cpu_data[NR_CPUS]; | 388 | } irq_cpu_data[NR_CPUS]; |
380 | 389 | ||
381 | #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) | 390 | #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) |
382 | #define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) | 391 | #define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) |
383 | #define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) | 392 | #define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) |
384 | 393 | ||
385 | #define IDLE_ENOUGH(cpu,now) \ | 394 | #define IDLE_ENOUGH(cpu,now) \ |
386 | (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) | 395 | (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) |
@@ -419,8 +428,8 @@ inside: | |||
419 | if (cpu == -1) | 428 | if (cpu == -1) |
420 | cpu = NR_CPUS-1; | 429 | cpu = NR_CPUS-1; |
421 | } | 430 | } |
422 | } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || | 431 | } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || |
423 | (search_idle && !IDLE_ENOUGH(cpu,now))); | 432 | (search_idle && !IDLE_ENOUGH(cpu, now))); |
424 | 433 | ||
425 | return cpu; | 434 | return cpu; |
426 | } | 435 | } |
@@ -430,15 +439,14 @@ static inline void balance_irq(int cpu, int irq) | |||
430 | unsigned long now = jiffies; | 439 | unsigned long now = jiffies; |
431 | cpumask_t allowed_mask; | 440 | cpumask_t allowed_mask; |
432 | unsigned int new_cpu; | 441 | unsigned int new_cpu; |
433 | 442 | ||
434 | if (irqbalance_disabled) | 443 | if (irqbalance_disabled) |
435 | return; | 444 | return; |
436 | 445 | ||
437 | cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); | 446 | cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); |
438 | new_cpu = move(cpu, allowed_mask, now, 1); | 447 | new_cpu = move(cpu, allowed_mask, now, 1); |
439 | if (cpu != new_cpu) { | 448 | if (cpu != new_cpu) |
440 | set_pending_irq(irq, cpumask_of_cpu(new_cpu)); | 449 | set_pending_irq(irq, cpumask_of_cpu(new_cpu)); |
441 | } | ||
442 | } | 450 | } |
443 | 451 | ||
444 | static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) | 452 | static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) |
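balance_irq() above delegates target selection to move(), which scans CPU ids in a circle until it finds an online CPU inside the allowed mask. A simplified stand-alone sketch of that scan, with plain bitmask CPU sets and no idle check, so a hypothetical reduction of move() rather than its real body:

#include <stdio.h>

#define NR_CPUS 8

/* walk downward from 'cpu', wrapping at -1, until an online+allowed CPU */
static int next_cpu(int cpu, unsigned int online, unsigned int allowed)
{
        do {
                if (--cpu < 0)
                        cpu = NR_CPUS - 1;
        } while (!((online >> cpu) & 1) || !((allowed >> cpu) & 1));
        return cpu;
}

int main(void)
{
        /* CPUs 0-3 online; the IRQ's affinity allows CPUs 1 and 3 only */
        printf("IRQ moves to CPU %d\n", next_cpu(2, 0x0f, 0x0a));
        return 0;
}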
@@ -450,14 +458,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) | |||
450 | if (!irq_desc[j].action) | 458 | if (!irq_desc[j].action) |
451 | continue; | 459 | continue; |
452 | /* Is it a significant load ? */ | 460 | /* Is it a significant load ? */ |
453 | if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < | 461 | if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < |
454 | useful_load_threshold) | 462 | useful_load_threshold) |
455 | continue; | 463 | continue; |
456 | balance_irq(i, j); | 464 | balance_irq(i, j); |
457 | } | 465 | } |
458 | } | 466 | } |
459 | balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, | 467 | balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, |
460 | balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); | 468 | balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); |
461 | return; | 469 | return; |
462 | } | 470 | } |
463 | 471 | ||
@@ -486,22 +494,22 @@ static void do_irq_balance(void) | |||
486 | /* Is this an active IRQ or balancing disabled ? */ | 494 | /* Is this an active IRQ or balancing disabled ? */ |
487 | if (!irq_desc[j].action || irq_balancing_disabled(j)) | 495 | if (!irq_desc[j].action || irq_balancing_disabled(j)) |
488 | continue; | 496 | continue; |
489 | if ( package_index == i ) | 497 | if (package_index == i) |
490 | IRQ_DELTA(package_index,j) = 0; | 498 | IRQ_DELTA(package_index, j) = 0; |
491 | /* Determine the total count per processor per IRQ */ | 499 | /* Determine the total count per processor per IRQ */ |
492 | value_now = (unsigned long) kstat_cpu(i).irqs[j]; | 500 | value_now = (unsigned long) kstat_cpu(i).irqs[j]; |
493 | 501 | ||
494 | /* Determine the activity per processor per IRQ */ | 502 | /* Determine the activity per processor per IRQ */ |
495 | delta = value_now - LAST_CPU_IRQ(i,j); | 503 | delta = value_now - LAST_CPU_IRQ(i, j); |
496 | 504 | ||
497 | /* Update last_cpu_irq[][] for the next time */ | 505 | /* Update last_cpu_irq[][] for the next time */ |
498 | LAST_CPU_IRQ(i,j) = value_now; | 506 | LAST_CPU_IRQ(i, j) = value_now; |
499 | 507 | ||
500 | /* Ignore IRQs whose rate is less than the clock */ | 508 | /* Ignore IRQs whose rate is less than the clock */ |
501 | if (delta < useful_load_threshold) | 509 | if (delta < useful_load_threshold) |
502 | continue; | 510 | continue; |
503 | /* update the load for the processor or package total */ | 511 | /* update the load for the processor or package total */ |
504 | IRQ_DELTA(package_index,j) += delta; | 512 | IRQ_DELTA(package_index, j) += delta; |
505 | 513 | ||
506 | /* Keep track of the higher numbered sibling as well */ | 514 | /* Keep track of the higher numbered sibling as well */ |
507 | if (i != package_index) | 515 | if (i != package_index) |
@@ -527,7 +535,8 @@ static void do_irq_balance(void) | |||
527 | max_cpu_irq = ULONG_MAX; | 535 | max_cpu_irq = ULONG_MAX; |
528 | 536 | ||
529 | tryanothercpu: | 537 | tryanothercpu: |
530 | /* Look for heaviest loaded processor. | 538 | /* |
539 | * Look for heaviest loaded processor. | ||
531 | * We may come back to get the next heaviest loaded processor. | 540 | * We may come back to get the next heaviest loaded processor. |
532 | * Skip processors with trivial loads. | 541 | * Skip processors with trivial loads. |
533 | */ | 542 | */ |
@@ -536,7 +545,7 @@ tryanothercpu: | |||
536 | for_each_online_cpu(i) { | 545 | for_each_online_cpu(i) { |
537 | if (i != CPU_TO_PACKAGEINDEX(i)) | 546 | if (i != CPU_TO_PACKAGEINDEX(i)) |
538 | continue; | 547 | continue; |
539 | if (max_cpu_irq <= CPU_IRQ(i)) | 548 | if (max_cpu_irq <= CPU_IRQ(i)) |
540 | continue; | 549 | continue; |
541 | if (tmp_cpu_irq < CPU_IRQ(i)) { | 550 | if (tmp_cpu_irq < CPU_IRQ(i)) { |
542 | tmp_cpu_irq = CPU_IRQ(i); | 551 | tmp_cpu_irq = CPU_IRQ(i); |
@@ -545,8 +554,9 @@ tryanothercpu: | |||
545 | } | 554 | } |
546 | 555 | ||
547 | if (tmp_loaded == -1) { | 556 | if (tmp_loaded == -1) { |
548 | /* In the case of small number of heavy interrupt sources, | 557 | /* |
549 | * loading some of the cpus too much. We use Ingo's original | 558 | * In the case of small number of heavy interrupt sources, |
559 | * loading some of the cpus too much. We use Ingo's original | ||
550 | * approach to rotate them around. | 560 | * approach to rotate them around. |
551 | */ | 561 | */ |
552 | if (!first_attempt && imbalance >= useful_load_threshold) { | 562 | if (!first_attempt && imbalance >= useful_load_threshold) { |
@@ -555,13 +565,14 @@ tryanothercpu: | |||
555 | } | 565 | } |
556 | goto not_worth_the_effort; | 566 | goto not_worth_the_effort; |
557 | } | 567 | } |
558 | 568 | ||
559 | first_attempt = 0; /* heaviest search */ | 569 | first_attempt = 0; /* heaviest search */ |
560 | max_cpu_irq = tmp_cpu_irq; /* load */ | 570 | max_cpu_irq = tmp_cpu_irq; /* load */ |
561 | max_loaded = tmp_loaded; /* processor */ | 571 | max_loaded = tmp_loaded; /* processor */ |
562 | imbalance = (max_cpu_irq - min_cpu_irq) / 2; | 572 | imbalance = (max_cpu_irq - min_cpu_irq) / 2; |
563 | 573 | ||
564 | /* if imbalance is less than approx 10% of max load, then | 574 | /* |
575 | * if imbalance is less than approx 10% of max load, then | ||
565 | * observe diminishing returns action. - quit | 576 | * observe diminishing returns action. - quit |
566 | */ | 577 | */ |
567 | if (imbalance < (max_cpu_irq >> 3)) | 578 | if (imbalance < (max_cpu_irq >> 3)) |
@@ -577,26 +588,25 @@ tryanotherirq: | |||
577 | /* Is this an active IRQ? */ | 588 | /* Is this an active IRQ? */ |
578 | if (!irq_desc[j].action) | 589 | if (!irq_desc[j].action) |
579 | continue; | 590 | continue; |
580 | if (imbalance <= IRQ_DELTA(max_loaded,j)) | 591 | if (imbalance <= IRQ_DELTA(max_loaded, j)) |
581 | continue; | 592 | continue; |
582 | /* Try to find the IRQ that is closest to the imbalance | 593 | /* Try to find the IRQ that is closest to the imbalance |
583 | * without going over. | 594 | * without going over. |
584 | */ | 595 | */ |
585 | if (move_this_load < IRQ_DELTA(max_loaded,j)) { | 596 | if (move_this_load < IRQ_DELTA(max_loaded, j)) { |
586 | move_this_load = IRQ_DELTA(max_loaded,j); | 597 | move_this_load = IRQ_DELTA(max_loaded, j); |
587 | selected_irq = j; | 598 | selected_irq = j; |
588 | } | 599 | } |
589 | } | 600 | } |
590 | if (selected_irq == -1) { | 601 | if (selected_irq == -1) |
591 | goto tryanothercpu; | 602 | goto tryanothercpu; |
592 | } | ||
593 | 603 | ||
594 | imbalance = move_this_load; | 604 | imbalance = move_this_load; |
595 | 605 | ||
596 | /* For physical_balance case, we accumulated both load | 606 | /* For physical_balance case, we accumulated both load |
597 | * values in the one of the siblings cpu_irq[], | 607 | * values in the one of the siblings cpu_irq[], |
598 | * to use the same code for physical and logical processors | 608 | * to use the same code for physical and logical processors |
599 | * as much as possible. | 609 | * as much as possible. |
600 | * | 610 | * |
601 | * NOTE: the cpu_irq[] array holds the sum of the load for | 611 | * NOTE: the cpu_irq[] array holds the sum of the load for |
602 | * sibling A and sibling B in the slot for the lowest numbered | 612 | * sibling A and sibling B in the slot for the lowest numbered |
@@ -625,11 +635,11 @@ tryanotherirq: | |||
625 | /* mark for change destination */ | 635 | /* mark for change destination */ |
626 | set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); | 636 | set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); |
627 | 637 | ||
628 | /* Since we made a change, come back sooner to | 638 | /* Since we made a change, come back sooner to |
629 | * check for more variation. | 639 | * check for more variation. |
630 | */ | 640 | */ |
631 | balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, | 641 | balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, |
632 | balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); | 642 | balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); |
633 | return; | 643 | return; |
634 | } | 644 | } |
635 | goto tryanotherirq; | 645 | goto tryanotherirq; |
@@ -640,7 +650,7 @@ not_worth_the_effort: | |||
640 | * upward | 650 | * upward |
641 | */ | 651 | */ |
642 | balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, | 652 | balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, |
643 | balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); | 653 | balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); |
644 | return; | 654 | return; |
645 | } | 655 | } |
646 | 656 | ||
@@ -679,13 +689,13 @@ static int __init balanced_irq_init(void) | |||
679 | cpumask_t tmp; | 689 | cpumask_t tmp; |
680 | 690 | ||
681 | cpus_shift_right(tmp, cpu_online_map, 2); | 691 | cpus_shift_right(tmp, cpu_online_map, 2); |
682 | c = &boot_cpu_data; | 692 | c = &boot_cpu_data; |
683 | /* When not overwritten by the command line ask subarchitecture. */ | 693 | /* When not overwritten by the command line ask subarchitecture. */ |
684 | if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) | 694 | if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) |
685 | irqbalance_disabled = NO_BALANCE_IRQ; | 695 | irqbalance_disabled = NO_BALANCE_IRQ; |
686 | if (irqbalance_disabled) | 696 | if (irqbalance_disabled) |
687 | return 0; | 697 | return 0; |
688 | 698 | ||
689 | /* disable irqbalance completely if there is only one processor online */ | 699 | /* disable irqbalance completely if there is only one processor online */ |
690 | if (num_online_cpus() < 2) { | 700 | if (num_online_cpus() < 2) { |
691 | irqbalance_disabled = 1; | 701 | irqbalance_disabled = 1; |
@@ -699,16 +709,14 @@ static int __init balanced_irq_init(void) | |||
699 | physical_balance = 1; | 709 | physical_balance = 1; |
700 | 710 | ||
701 | for_each_online_cpu(i) { | 711 | for_each_online_cpu(i) { |
702 | irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | 712 | irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); |
703 | irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | 713 | irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); |
704 | if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { | 714 | if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { |
705 | printk(KERN_ERR "balanced_irq_init: out of memory"); | 715 | printk(KERN_ERR "balanced_irq_init: out of memory"); |
706 | goto failed; | 716 | goto failed; |
707 | } | 717 | } |
708 | memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); | ||
709 | memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); | ||
710 | } | 718 | } |
711 | 719 | ||
712 | printk(KERN_INFO "Starting balanced_irq\n"); | 720 | printk(KERN_INFO "Starting balanced_irq\n"); |
713 | if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) | 721 | if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) |
714 | return 0; | 722 | return 0; |
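The kzalloc() conversion above folds the kmalloc()-plus-memset() pair into a single zeroing allocation. A user-space analogue using calloc(); NR_IRQS is a placeholder value here, not the kernel's configuration-dependent one:

#include <stdlib.h>
#include <string.h>

#define NR_IRQS 224     /* illustrative only */

int main(void)
{
        /* before: allocate, then clear by hand */
        unsigned long *old = malloc(sizeof(unsigned long) * NR_IRQS);
        if (old)
                memset(old, 0, sizeof(unsigned long) * NR_IRQS);

        /* after: one call that returns zeroed memory, like kzalloc() */
        unsigned long *new_buf = calloc(NR_IRQS, sizeof(unsigned long));

        free(old);
        free(new_buf);
        return 0;
}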
@@ -748,7 +756,7 @@ void send_IPI_self(int vector) | |||
748 | /* | 756 | /* |
749 | * Send the IPI. The write to APIC_ICR fires this off. | 757 | * Send the IPI. The write to APIC_ICR fires this off. |
750 | */ | 758 | */ |
751 | apic_write_around(APIC_ICR, cfg); | 759 | apic_write(APIC_ICR, cfg); |
752 | } | 760 | } |
753 | #endif /* !CONFIG_SMP */ | 761 | #endif /* !CONFIG_SMP */ |
754 | 762 | ||
@@ -801,10 +809,10 @@ static int find_irq_entry(int apic, int pin, int type) | |||
801 | int i; | 809 | int i; |
802 | 810 | ||
803 | for (i = 0; i < mp_irq_entries; i++) | 811 | for (i = 0; i < mp_irq_entries; i++) |
804 | if (mp_irqs[i].mpc_irqtype == type && | 812 | if (mp_irqs[i].mp_irqtype == type && |
805 | (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | 813 | (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || |
806 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | 814 | mp_irqs[i].mp_dstapic == MP_APIC_ALL) && |
807 | mp_irqs[i].mpc_dstirq == pin) | 815 | mp_irqs[i].mp_dstirq == pin) |
808 | return i; | 816 | return i; |
809 | 817 | ||
810 | return -1; | 818 | return -1; |
@@ -818,13 +826,13 @@ static int __init find_isa_irq_pin(int irq, int type) | |||
818 | int i; | 826 | int i; |
819 | 827 | ||
820 | for (i = 0; i < mp_irq_entries; i++) { | 828 | for (i = 0; i < mp_irq_entries; i++) { |
821 | int lbus = mp_irqs[i].mpc_srcbus; | 829 | int lbus = mp_irqs[i].mp_srcbus; |
822 | 830 | ||
823 | if (test_bit(lbus, mp_bus_not_pci) && | 831 | if (test_bit(lbus, mp_bus_not_pci) && |
824 | (mp_irqs[i].mpc_irqtype == type) && | 832 | (mp_irqs[i].mp_irqtype == type) && |
825 | (mp_irqs[i].mpc_srcbusirq == irq)) | 833 | (mp_irqs[i].mp_srcbusirq == irq)) |
826 | 834 | ||
827 | return mp_irqs[i].mpc_dstirq; | 835 | return mp_irqs[i].mp_dstirq; |
828 | } | 836 | } |
829 | return -1; | 837 | return -1; |
830 | } | 838 | } |
@@ -834,17 +842,17 @@ static int __init find_isa_irq_apic(int irq, int type) | |||
834 | int i; | 842 | int i; |
835 | 843 | ||
836 | for (i = 0; i < mp_irq_entries; i++) { | 844 | for (i = 0; i < mp_irq_entries; i++) { |
837 | int lbus = mp_irqs[i].mpc_srcbus; | 845 | int lbus = mp_irqs[i].mp_srcbus; |
838 | 846 | ||
839 | if (test_bit(lbus, mp_bus_not_pci) && | 847 | if (test_bit(lbus, mp_bus_not_pci) && |
840 | (mp_irqs[i].mpc_irqtype == type) && | 848 | (mp_irqs[i].mp_irqtype == type) && |
841 | (mp_irqs[i].mpc_srcbusirq == irq)) | 849 | (mp_irqs[i].mp_srcbusirq == irq)) |
842 | break; | 850 | break; |
843 | } | 851 | } |
844 | if (i < mp_irq_entries) { | 852 | if (i < mp_irq_entries) { |
845 | int apic; | 853 | int apic; |
846 | for(apic = 0; apic < nr_ioapics; apic++) { | 854 | for (apic = 0; apic < nr_ioapics; apic++) { |
847 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) | 855 | if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) |
848 | return apic; | 856 | return apic; |
849 | } | 857 | } |
850 | } | 858 | } |
@@ -864,28 +872,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | |||
864 | 872 | ||
865 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " | 873 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " |
866 | "slot:%d, pin:%d.\n", bus, slot, pin); | 874 | "slot:%d, pin:%d.\n", bus, slot, pin); |
867 | if (mp_bus_id_to_pci_bus[bus] == -1) { | 875 | if (test_bit(bus, mp_bus_not_pci)) { |
868 | printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | 876 | printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); |
869 | return -1; | 877 | return -1; |
870 | } | 878 | } |
871 | for (i = 0; i < mp_irq_entries; i++) { | 879 | for (i = 0; i < mp_irq_entries; i++) { |
872 | int lbus = mp_irqs[i].mpc_srcbus; | 880 | int lbus = mp_irqs[i].mp_srcbus; |
873 | 881 | ||
874 | for (apic = 0; apic < nr_ioapics; apic++) | 882 | for (apic = 0; apic < nr_ioapics; apic++) |
875 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | 883 | if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || |
876 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | 884 | mp_irqs[i].mp_dstapic == MP_APIC_ALL) |
877 | break; | 885 | break; |
878 | 886 | ||
879 | if (!test_bit(lbus, mp_bus_not_pci) && | 887 | if (!test_bit(lbus, mp_bus_not_pci) && |
880 | !mp_irqs[i].mpc_irqtype && | 888 | !mp_irqs[i].mp_irqtype && |
881 | (bus == lbus) && | 889 | (bus == lbus) && |
882 | (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | 890 | (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { |
883 | int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | 891 | int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq); |
884 | 892 | ||
885 | if (!(apic || IO_APIC_IRQ(irq))) | 893 | if (!(apic || IO_APIC_IRQ(irq))) |
886 | continue; | 894 | continue; |
887 | 895 | ||
888 | if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | 896 | if (pin == (mp_irqs[i].mp_srcbusirq & 3)) |
889 | return irq; | 897 | return irq; |
890 | /* | 898 | /* |
891 | * Use the first all-but-pin matching entry as a | 899 | * Use the first all-but-pin matching entry as a |
@@ -900,7 +908,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | |||
900 | EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); | 908 | EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); |
901 | 909 | ||
902 | /* | 910 | /* |
903 | * This function currently is only a helper for the i386 smp boot process where | 911 | * This function currently is only a helper for the i386 smp boot process where |
904 | * we need to reprogram the ioredtbls to cater for the cpus which have come online | 912 | * we need to reprogram the ioredtbls to cater for the cpus which have come online |
905 | * so mask in all cases should simply be TARGET_CPUS | 913 | * so mask in all cases should simply be TARGET_CPUS |
906 | */ | 914 | */ |
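IO_APIC_get_PCI_irq_vector() above matches MP-table entries by unpacking mp_srcbusirq: the PCI device slot lives in bits 2-6 and the interrupt pin (INTA..INTD) in bits 0-1, as the shift-and-mask expressions in the hunk show. A small stand-alone decoder; the packed value is made up for the demo:

#include <stdio.h>

int main(void)
{
        unsigned int srcbusirq = (0x12 << 2) | 2;       /* slot 0x12, INTC */

        printf("slot %#x, pin INT%c\n",
               (srcbusirq >> 2) & 0x1f, 'A' + (int)(srcbusirq & 3));
        return 0;
}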
@@ -952,7 +960,7 @@ static int EISA_ELCR(unsigned int irq) | |||
952 | * EISA conforming in the MP table, that means its trigger type must | 960 | * EISA conforming in the MP table, that means its trigger type must |
953 | * be read in from the ELCR */ | 961 | * be read in from the ELCR */ |
954 | 962 | ||
955 | #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) | 963 | #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) |
956 | #define default_EISA_polarity(idx) default_ISA_polarity(idx) | 964 | #define default_EISA_polarity(idx) default_ISA_polarity(idx) |
957 | 965 | ||
958 | /* PCI interrupts are always polarity one level triggered, | 966 | /* PCI interrupts are always polarity one level triggered, |
@@ -969,118 +977,115 @@ static int EISA_ELCR(unsigned int irq) | |||
969 | 977 | ||
970 | static int MPBIOS_polarity(int idx) | 978 | static int MPBIOS_polarity(int idx) |
971 | { | 979 | { |
972 | int bus = mp_irqs[idx].mpc_srcbus; | 980 | int bus = mp_irqs[idx].mp_srcbus; |
973 | int polarity; | 981 | int polarity; |
974 | 982 | ||
975 | /* | 983 | /* |
976 | * Determine IRQ line polarity (high active or low active): | 984 | * Determine IRQ line polarity (high active or low active): |
977 | */ | 985 | */ |
978 | switch (mp_irqs[idx].mpc_irqflag & 3) | 986 | switch (mp_irqs[idx].mp_irqflag & 3) { |
987 | case 0: /* conforms, ie. bus-type dependent polarity */ | ||
979 | { | 988 | { |
980 | case 0: /* conforms, ie. bus-type dependent polarity */ | 989 | polarity = test_bit(bus, mp_bus_not_pci)? |
981 | { | 990 | default_ISA_polarity(idx): |
982 | polarity = test_bit(bus, mp_bus_not_pci)? | 991 | default_PCI_polarity(idx); |
983 | default_ISA_polarity(idx): | 992 | break; |
984 | default_PCI_polarity(idx); | 993 | } |
985 | break; | 994 | case 1: /* high active */ |
986 | } | 995 | { |
987 | case 1: /* high active */ | 996 | polarity = 0; |
988 | { | 997 | break; |
989 | polarity = 0; | 998 | } |
990 | break; | 999 | case 2: /* reserved */ |
991 | } | 1000 | { |
992 | case 2: /* reserved */ | 1001 | printk(KERN_WARNING "broken BIOS!!\n"); |
993 | { | 1002 | polarity = 1; |
994 | printk(KERN_WARNING "broken BIOS!!\n"); | 1003 | break; |
995 | polarity = 1; | 1004 | } |
996 | break; | 1005 | case 3: /* low active */ |
997 | } | 1006 | { |
998 | case 3: /* low active */ | 1007 | polarity = 1; |
999 | { | 1008 | break; |
1000 | polarity = 1; | 1009 | } |
1001 | break; | 1010 | default: /* invalid */ |
1002 | } | 1011 | { |
1003 | default: /* invalid */ | 1012 | printk(KERN_WARNING "broken BIOS!!\n"); |
1004 | { | 1013 | polarity = 1; |
1005 | printk(KERN_WARNING "broken BIOS!!\n"); | 1014 | break; |
1006 | polarity = 1; | 1015 | } |
1007 | break; | ||
1008 | } | ||
1009 | } | 1016 | } |
1010 | return polarity; | 1017 | return polarity; |
1011 | } | 1018 | } |
1012 | 1019 | ||
1013 | static int MPBIOS_trigger(int idx) | 1020 | static int MPBIOS_trigger(int idx) |
1014 | { | 1021 | { |
1015 | int bus = mp_irqs[idx].mpc_srcbus; | 1022 | int bus = mp_irqs[idx].mp_srcbus; |
1016 | int trigger; | 1023 | int trigger; |
1017 | 1024 | ||
1018 | /* | 1025 | /* |
1019 | * Determine IRQ trigger mode (edge or level sensitive): | 1026 | * Determine IRQ trigger mode (edge or level sensitive): |
1020 | */ | 1027 | */ |
1021 | switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | 1028 | switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { |
1029 | case 0: /* conforms, ie. bus-type dependent */ | ||
1022 | { | 1030 | { |
1023 | case 0: /* conforms, ie. bus-type dependent */ | 1031 | trigger = test_bit(bus, mp_bus_not_pci)? |
1024 | { | 1032 | default_ISA_trigger(idx): |
1025 | trigger = test_bit(bus, mp_bus_not_pci)? | 1033 | default_PCI_trigger(idx); |
1026 | default_ISA_trigger(idx): | ||
1027 | default_PCI_trigger(idx); | ||
1028 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | 1034 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) |
1029 | switch (mp_bus_id_to_type[bus]) | 1035 | switch (mp_bus_id_to_type[bus]) { |
1030 | { | 1036 | case MP_BUS_ISA: /* ISA pin */ |
1031 | case MP_BUS_ISA: /* ISA pin */ | 1037 | { |
1032 | { | 1038 | /* set before the switch */ |
1033 | /* set before the switch */ | ||
1034 | break; | ||
1035 | } | ||
1036 | case MP_BUS_EISA: /* EISA pin */ | ||
1037 | { | ||
1038 | trigger = default_EISA_trigger(idx); | ||
1039 | break; | ||
1040 | } | ||
1041 | case MP_BUS_PCI: /* PCI pin */ | ||
1042 | { | ||
1043 | /* set before the switch */ | ||
1044 | break; | ||
1045 | } | ||
1046 | case MP_BUS_MCA: /* MCA pin */ | ||
1047 | { | ||
1048 | trigger = default_MCA_trigger(idx); | ||
1049 | break; | ||
1050 | } | ||
1051 | default: | ||
1052 | { | ||
1053 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1054 | trigger = 1; | ||
1055 | break; | ||
1056 | } | ||
1057 | } | ||
1058 | #endif | ||
1059 | break; | 1039 | break; |
1060 | } | 1040 | } |
1061 | case 1: /* edge */ | 1041 | case MP_BUS_EISA: /* EISA pin */ |
1062 | { | 1042 | { |
1063 | trigger = 0; | 1043 | trigger = default_EISA_trigger(idx); |
1064 | break; | 1044 | break; |
1065 | } | 1045 | } |
1066 | case 2: /* reserved */ | 1046 | case MP_BUS_PCI: /* PCI pin */ |
1067 | { | 1047 | { |
1068 | printk(KERN_WARNING "broken BIOS!!\n"); | 1048 | /* set before the switch */ |
1069 | trigger = 1; | ||
1070 | break; | 1049 | break; |
1071 | } | 1050 | } |
1072 | case 3: /* level */ | 1051 | case MP_BUS_MCA: /* MCA pin */ |
1073 | { | 1052 | { |
1074 | trigger = 1; | 1053 | trigger = default_MCA_trigger(idx); |
1075 | break; | 1054 | break; |
1076 | } | 1055 | } |
1077 | default: /* invalid */ | 1056 | default: |
1078 | { | 1057 | { |
1079 | printk(KERN_WARNING "broken BIOS!!\n"); | 1058 | printk(KERN_WARNING "broken BIOS!!\n"); |
1080 | trigger = 0; | 1059 | trigger = 1; |
1081 | break; | 1060 | break; |
1082 | } | 1061 | } |
1083 | } | 1062 | } |
1063 | #endif | ||
1064 | break; | ||
1065 | } | ||
1066 | case 1: /* edge */ | ||
1067 | { | ||
1068 | trigger = 0; | ||
1069 | break; | ||
1070 | } | ||
1071 | case 2: /* reserved */ | ||
1072 | { | ||
1073 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1074 | trigger = 1; | ||
1075 | break; | ||
1076 | } | ||
1077 | case 3: /* level */ | ||
1078 | { | ||
1079 | trigger = 1; | ||
1080 | break; | ||
1081 | } | ||
1082 | default: /* invalid */ | ||
1083 | { | ||
1084 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1085 | trigger = 0; | ||
1086 | break; | ||
1087 | } | ||
1088 | } | ||
1084 | return trigger; | 1089 | return trigger; |
1085 | } | 1090 | } |
1086 | 1091 | ||
@@ -1097,16 +1102,16 @@ static inline int irq_trigger(int idx) | |||
1097 | static int pin_2_irq(int idx, int apic, int pin) | 1102 | static int pin_2_irq(int idx, int apic, int pin) |
1098 | { | 1103 | { |
1099 | int irq, i; | 1104 | int irq, i; |
1100 | int bus = mp_irqs[idx].mpc_srcbus; | 1105 | int bus = mp_irqs[idx].mp_srcbus; |
1101 | 1106 | ||
1102 | /* | 1107 | /* |
1103 | * Debugging check, we are in big trouble if this message pops up! | 1108 | * Debugging check, we are in big trouble if this message pops up! |
1104 | */ | 1109 | */ |
1105 | if (mp_irqs[idx].mpc_dstirq != pin) | 1110 | if (mp_irqs[idx].mp_dstirq != pin) |
1106 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | 1111 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); |
1107 | 1112 | ||
1108 | if (test_bit(bus, mp_bus_not_pci)) | 1113 | if (test_bit(bus, mp_bus_not_pci)) |
1109 | irq = mp_irqs[idx].mpc_srcbusirq; | 1114 | irq = mp_irqs[idx].mp_srcbusirq; |
1110 | else { | 1115 | else { |
1111 | /* | 1116 | /* |
1112 | * PCI IRQs are mapped in order | 1117 | * PCI IRQs are mapped in order |
@@ -1148,8 +1153,8 @@ static inline int IO_APIC_irq_trigger(int irq) | |||
1148 | 1153 | ||
1149 | for (apic = 0; apic < nr_ioapics; apic++) { | 1154 | for (apic = 0; apic < nr_ioapics; apic++) { |
1150 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | 1155 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { |
1151 | idx = find_irq_entry(apic,pin,mp_INT); | 1156 | idx = find_irq_entry(apic, pin, mp_INT); |
1152 | if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) | 1157 | if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) |
1153 | return irq_trigger(idx); | 1158 | return irq_trigger(idx); |
1154 | } | 1159 | } |
1155 | } | 1160 | } |
@@ -1164,7 +1169,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 } | |||
1164 | 1169 | ||
1165 | static int __assign_irq_vector(int irq) | 1170 | static int __assign_irq_vector(int irq) |
1166 | { | 1171 | { |
1167 | static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; | 1172 | static int current_vector = FIRST_DEVICE_VECTOR, current_offset; |
1168 | int vector, offset; | 1173 | int vector, offset; |
1169 | 1174 | ||
1170 | BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); | 1175 | BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); |
@@ -1176,7 +1181,7 @@ static int __assign_irq_vector(int irq) | |||
1176 | offset = current_offset; | 1181 | offset = current_offset; |
1177 | next: | 1182 | next: |
1178 | vector += 8; | 1183 | vector += 8; |
1179 | if (vector >= FIRST_SYSTEM_VECTOR) { | 1184 | if (vector >= first_system_vector) { |
1180 | offset = (offset + 1) % 8; | 1185 | offset = (offset + 1) % 8; |
1181 | vector = FIRST_DEVICE_VECTOR + offset; | 1186 | vector = FIRST_DEVICE_VECTOR + offset; |
1182 | } | 1187 | } |
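__assign_irq_vector() above hands out vectors in strides of 8 so that successive IRQs land in different interrupt-priority classes; when the stride crosses the (now run-time) system-vector boundary it rotates to the next of the eight offsets. A stand-alone sketch of that allocator, without the collision checks; the vector constants are the conventional i386 values, assumed rather than read from this tree:

#include <stdio.h>

#define FIRST_DEVICE_VECTOR     0x31
#define FIRST_SYSTEM_VECTOR     0xef

static int current_vector = FIRST_DEVICE_VECTOR, current_offset;

static int alloc_vector(void)
{
        int vector = current_vector, offset = current_offset;

        vector += 8;
        if (vector >= FIRST_SYSTEM_VECTOR) {
                offset = (offset + 1) % 8;      /* rotate priority offset */
                vector = FIRST_DEVICE_VECTOR + offset;
        }
        current_vector = vector;
        current_offset = offset;
        return vector;
}

int main(void)
{
        int i;

        for (i = 0; i < 4; i++)
                printf("irq %d -> vector 0x%02x\n", i, alloc_vector());
        return 0;
}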
@@ -1203,6 +1208,11 @@ static int assign_irq_vector(int irq) | |||
1203 | 1208 | ||
1204 | return vector; | 1209 | return vector; |
1205 | } | 1210 | } |
1211 | |||
1212 | void setup_vector_irq(int cpu) | ||
1213 | { | ||
1214 | } | ||
1215 | |||
1206 | static struct irq_chip ioapic_chip; | 1216 | static struct irq_chip ioapic_chip; |
1207 | 1217 | ||
1208 | #define IOAPIC_AUTO -1 | 1218 | #define IOAPIC_AUTO -1 |
@@ -1237,25 +1247,25 @@ static void __init setup_IO_APIC_irqs(void) | |||
1237 | /* | 1247 | /* |
1238 | * add it to the IO-APIC irq-routing table: | 1248 | * add it to the IO-APIC irq-routing table: |
1239 | */ | 1249 | */ |
1240 | memset(&entry,0,sizeof(entry)); | 1250 | memset(&entry, 0, sizeof(entry)); |
1241 | 1251 | ||
1242 | entry.delivery_mode = INT_DELIVERY_MODE; | 1252 | entry.delivery_mode = INT_DELIVERY_MODE; |
1243 | entry.dest_mode = INT_DEST_MODE; | 1253 | entry.dest_mode = INT_DEST_MODE; |
1244 | entry.mask = 0; /* enable IRQ */ | 1254 | entry.mask = 0; /* enable IRQ */ |
1245 | entry.dest.logical.logical_dest = | 1255 | entry.dest.logical.logical_dest = |
1246 | cpu_mask_to_apicid(TARGET_CPUS); | 1256 | cpu_mask_to_apicid(TARGET_CPUS); |
1247 | 1257 | ||
1248 | idx = find_irq_entry(apic,pin,mp_INT); | 1258 | idx = find_irq_entry(apic, pin, mp_INT); |
1249 | if (idx == -1) { | 1259 | if (idx == -1) { |
1250 | if (first_notcon) { | 1260 | if (first_notcon) { |
1251 | apic_printk(APIC_VERBOSE, KERN_DEBUG | 1261 | apic_printk(APIC_VERBOSE, KERN_DEBUG |
1252 | " IO-APIC (apicid-pin) %d-%d", | 1262 | " IO-APIC (apicid-pin) %d-%d", |
1253 | mp_ioapics[apic].mpc_apicid, | 1263 | mp_ioapics[apic].mp_apicid, |
1254 | pin); | 1264 | pin); |
1255 | first_notcon = 0; | 1265 | first_notcon = 0; |
1256 | } else | 1266 | } else |
1257 | apic_printk(APIC_VERBOSE, ", %d-%d", | 1267 | apic_printk(APIC_VERBOSE, ", %d-%d", |
1258 | mp_ioapics[apic].mpc_apicid, pin); | 1268 | mp_ioapics[apic].mp_apicid, pin); |
1259 | continue; | 1269 | continue; |
1260 | } | 1270 | } |
1261 | 1271 | ||
@@ -1289,7 +1299,7 @@ static void __init setup_IO_APIC_irqs(void) | |||
1289 | vector = assign_irq_vector(irq); | 1299 | vector = assign_irq_vector(irq); |
1290 | entry.vector = vector; | 1300 | entry.vector = vector; |
1291 | ioapic_register_intr(irq, vector, IOAPIC_AUTO); | 1301 | ioapic_register_intr(irq, vector, IOAPIC_AUTO); |
1292 | 1302 | ||
1293 | if (!apic && (irq < 16)) | 1303 | if (!apic && (irq < 16)) |
1294 | disable_8259A_irq(irq); | 1304 | disable_8259A_irq(irq); |
1295 | } | 1305 | } |
@@ -1302,25 +1312,21 @@ static void __init setup_IO_APIC_irqs(void) | |||
1302 | } | 1312 | } |
1303 | 1313 | ||
1304 | /* | 1314 | /* |
1305 | * Set up the 8259A-master output pin: | 1315 | * Set up the timer pin, possibly with the 8259A-master behind. |
1306 | */ | 1316 | */ |
1307 | static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) | 1317 | static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, |
1318 | int vector) | ||
1308 | { | 1319 | { |
1309 | struct IO_APIC_route_entry entry; | 1320 | struct IO_APIC_route_entry entry; |
1310 | 1321 | ||
1311 | memset(&entry,0,sizeof(entry)); | 1322 | memset(&entry, 0, sizeof(entry)); |
1312 | |||
1313 | disable_8259A_irq(0); | ||
1314 | |||
1315 | /* mask LVT0 */ | ||
1316 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
1317 | 1323 | ||
1318 | /* | 1324 | /* |
1319 | * We use logical delivery to get the timer IRQ | 1325 | * We use logical delivery to get the timer IRQ |
1320 | * to the first CPU. | 1326 | * to the first CPU. |
1321 | */ | 1327 | */ |
1322 | entry.dest_mode = INT_DEST_MODE; | 1328 | entry.dest_mode = INT_DEST_MODE; |
1323 | entry.mask = 0; /* unmask IRQ now */ | 1329 | entry.mask = 1; /* mask IRQ now */ |
1324 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | 1330 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); |
1325 | entry.delivery_mode = INT_DELIVERY_MODE; | 1331 | entry.delivery_mode = INT_DELIVERY_MODE; |
1326 | entry.polarity = 0; | 1332 | entry.polarity = 0; |
@@ -1329,17 +1335,14 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in | |||
1329 | 1335 | ||
1330 | /* | 1336 | /* |
1331 | * The timer IRQ doesn't have to know that behind the | 1337 | * The timer IRQ doesn't have to know that behind the |
1332 | * scene we have a 8259A-master in AEOI mode ... | 1338 | * scene we may have a 8259A-master in AEOI mode ... |
1333 | */ | 1339 | */ |
1334 | irq_desc[0].chip = &ioapic_chip; | 1340 | ioapic_register_intr(0, vector, IOAPIC_EDGE); |
1335 | set_irq_handler(0, handle_edge_irq); | ||
1336 | 1341 | ||
1337 | /* | 1342 | /* |
1338 | * Add it to the IO-APIC irq-routing table: | 1343 | * Add it to the IO-APIC irq-routing table: |
1339 | */ | 1344 | */ |
1340 | ioapic_write_entry(apic, pin, entry); | 1345 | ioapic_write_entry(apic, pin, entry); |
1341 | |||
1342 | enable_8259A_irq(0); | ||
1343 | } | 1346 | } |
1344 | 1347 | ||
1345 | void __init print_IO_APIC(void) | 1348 | void __init print_IO_APIC(void) |
@@ -1354,10 +1357,10 @@ void __init print_IO_APIC(void) | |||
1354 | if (apic_verbosity == APIC_QUIET) | 1357 | if (apic_verbosity == APIC_QUIET) |
1355 | return; | 1358 | return; |
1356 | 1359 | ||
1357 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | 1360 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); |
1358 | for (i = 0; i < nr_ioapics; i++) | 1361 | for (i = 0; i < nr_ioapics; i++) |
1359 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | 1362 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", |
1360 | mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | 1363 | mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); |
1361 | 1364 | ||
1362 | /* | 1365 | /* |
1363 | * We are a bit conservative about what we expect. We have to | 1366 | * We are a bit conservative about what we expect. We have to |
@@ -1376,7 +1379,7 @@ void __init print_IO_APIC(void) | |||
1376 | reg_03.raw = io_apic_read(apic, 3); | 1379 | reg_03.raw = io_apic_read(apic, 3); |
1377 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1380 | spin_unlock_irqrestore(&ioapic_lock, flags); |
1378 | 1381 | ||
1379 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | 1382 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); |
1380 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | 1383 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); |
1381 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | 1384 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); |
1382 | printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); | 1385 | printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); |
@@ -1459,7 +1462,7 @@ void __init print_IO_APIC(void) | |||
1459 | 1462 | ||
1460 | #if 0 | 1463 | #if 0 |
1461 | 1464 | ||
1462 | static void print_APIC_bitfield (int base) | 1465 | static void print_APIC_bitfield(int base) |
1463 | { | 1466 | { |
1464 | unsigned int v; | 1467 | unsigned int v; |
1465 | int i, j; | 1468 | int i, j; |
@@ -1480,7 +1483,7 @@ static void print_APIC_bitfield (int base) | |||
1480 | } | 1483 | } |
1481 | } | 1484 | } |
1482 | 1485 | ||
1483 | void /*__init*/ print_local_APIC(void * dummy) | 1486 | void /*__init*/ print_local_APIC(void *dummy) |
1484 | { | 1487 | { |
1485 | unsigned int v, ver, maxlvt; | 1488 | unsigned int v, ver, maxlvt; |
1486 | 1489 | ||
@@ -1489,6 +1492,7 @@ void /*__init*/ print_local_APIC(void * dummy) | |||
1489 | 1492 | ||
1490 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | 1493 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", |
1491 | smp_processor_id(), hard_smp_processor_id()); | 1494 | smp_processor_id(), hard_smp_processor_id()); |
1495 | v = apic_read(APIC_ID); | ||
1492 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, | 1496 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, |
1493 | GET_APIC_ID(read_apic_id())); | 1497 | GET_APIC_ID(read_apic_id())); |
1494 | v = apic_read(APIC_LVR); | 1498 | v = apic_read(APIC_LVR); |
@@ -1563,9 +1567,9 @@ void /*__init*/ print_local_APIC(void * dummy) | |||
1563 | printk("\n"); | 1567 | printk("\n"); |
1564 | } | 1568 | } |
1565 | 1569 | ||
1566 | void print_all_local_APICs (void) | 1570 | void print_all_local_APICs(void) |
1567 | { | 1571 | { |
1568 | on_each_cpu(print_local_APIC, NULL, 1, 1); | 1572 | on_each_cpu(print_local_APIC, NULL, 1); |
1569 | } | 1573 | } |
1570 | 1574 | ||
1571 | void /*__init*/ print_PIC(void) | 1575 | void /*__init*/ print_PIC(void) |
@@ -1586,11 +1590,11 @@ void /*__init*/ print_PIC(void) | |||
1586 | v = inb(0xa0) << 8 | inb(0x20); | 1590 | v = inb(0xa0) << 8 | inb(0x20); |
1587 | printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | 1591 | printk(KERN_DEBUG "... PIC IRR: %04x\n", v); |
1588 | 1592 | ||
1589 | outb(0x0b,0xa0); | 1593 | outb(0x0b, 0xa0); |
1590 | outb(0x0b,0x20); | 1594 | outb(0x0b, 0x20); |
1591 | v = inb(0xa0) << 8 | inb(0x20); | 1595 | v = inb(0xa0) << 8 | inb(0x20); |
1592 | outb(0x0a,0xa0); | 1596 | outb(0x0a, 0xa0); |
1593 | outb(0x0a,0x20); | 1597 | outb(0x0a, 0x20); |
1594 | 1598 | ||
1595 | spin_unlock_irqrestore(&i8259A_lock, flags); | 1599 | spin_unlock_irqrestore(&i8259A_lock, flags); |
1596 | 1600 | ||
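The 0x0a/0x0b writes above are 8259A OCW3 commands: 0x0b selects the in-service register for the following port read, and 0x0a switches the read-back to the interrupt request register, which the code restores after reading the ISR. The values come from the 8259A datasheet; a trivial reference snippet:

#include <stdio.h>

#define PIC_OCW3_READ_IRR 0x0a  /* next read of the port returns the IRR */
#define PIC_OCW3_READ_ISR 0x0b  /* next read of the port returns the ISR */

int main(void)
{
        printf("OCW3 select IRR: %#04x\n", PIC_OCW3_READ_IRR);
        printf("OCW3 select ISR: %#04x\n", PIC_OCW3_READ_ISR);
        return 0;
}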
@@ -1626,7 +1630,7 @@ static void __init enable_IO_APIC(void) | |||
1626 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1630 | spin_unlock_irqrestore(&ioapic_lock, flags); |
1627 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; | 1631 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; |
1628 | } | 1632 | } |
1629 | for(apic = 0; apic < nr_ioapics; apic++) { | 1633 | for (apic = 0; apic < nr_ioapics; apic++) { |
1630 | int pin; | 1634 | int pin; |
1631 | /* See if any of the pins is in ExtINT mode */ | 1635 | /* See if any of the pins is in ExtINT mode */ |
1632 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | 1636 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { |
@@ -1716,7 +1720,6 @@ void disable_IO_APIC(void) | |||
1716 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | 1720 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 |
1717 | */ | 1721 | */ |
1718 | 1722 | ||
1719 | #ifndef CONFIG_X86_NUMAQ | ||
1720 | static void __init setup_ioapic_ids_from_mpc(void) | 1723 | static void __init setup_ioapic_ids_from_mpc(void) |
1721 | { | 1724 | { |
1722 | union IO_APIC_reg_00 reg_00; | 1725 | union IO_APIC_reg_00 reg_00; |
@@ -1726,6 +1729,11 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
1726 | unsigned char old_id; | 1729 | unsigned char old_id; |
1727 | unsigned long flags; | 1730 | unsigned long flags; |
1728 | 1731 | ||
1732 | #ifdef CONFIG_X86_NUMAQ | ||
1733 | if (found_numaq) | ||
1734 | return; | ||
1735 | #endif | ||
1736 | |||
1729 | /* | 1737 | /* |
1730 | * Don't check I/O APIC IDs for xAPIC systems. They have | 1738 | * Don't check I/O APIC IDs for xAPIC systems. They have |
1731 | * no meaning without the serial APIC bus. | 1739 | * no meaning without the serial APIC bus. |
@@ -1748,15 +1756,15 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
1748 | spin_lock_irqsave(&ioapic_lock, flags); | 1756 | spin_lock_irqsave(&ioapic_lock, flags); |
1749 | reg_00.raw = io_apic_read(apic, 0); | 1757 | reg_00.raw = io_apic_read(apic, 0); |
1750 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1758 | spin_unlock_irqrestore(&ioapic_lock, flags); |
1751 | |||
1752 | old_id = mp_ioapics[apic].mpc_apicid; | ||
1753 | 1759 | ||
1754 | if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { | 1760 | old_id = mp_ioapics[apic].mp_apicid; |
1761 | |||
1762 | if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { | ||
1755 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", | 1763 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", |
1756 | apic, mp_ioapics[apic].mpc_apicid); | 1764 | apic, mp_ioapics[apic].mp_apicid); |
1757 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | 1765 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", |
1758 | reg_00.bits.ID); | 1766 | reg_00.bits.ID); |
1759 | mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; | 1767 | mp_ioapics[apic].mp_apicid = reg_00.bits.ID; |
1760 | } | 1768 | } |
1761 | 1769 | ||
1762 | /* | 1770 | /* |
@@ -1765,9 +1773,9 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
1765 | * 'stuck on smp_invalidate_needed IPI wait' messages. | 1773 | * 'stuck on smp_invalidate_needed IPI wait' messages. |
1766 | */ | 1774 | */ |
1767 | if (check_apicid_used(phys_id_present_map, | 1775 | if (check_apicid_used(phys_id_present_map, |
1768 | mp_ioapics[apic].mpc_apicid)) { | 1776 | mp_ioapics[apic].mp_apicid)) { |
1769 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", | 1777 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", |
1770 | apic, mp_ioapics[apic].mpc_apicid); | 1778 | apic, mp_ioapics[apic].mp_apicid); |
1771 | for (i = 0; i < get_physical_broadcast(); i++) | 1779 | for (i = 0; i < get_physical_broadcast(); i++) |
1772 | if (!physid_isset(i, phys_id_present_map)) | 1780 | if (!physid_isset(i, phys_id_present_map)) |
1773 | break; | 1781 | break; |
@@ -1776,13 +1784,13 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
1776 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | 1784 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", |
1777 | i); | 1785 | i); |
1778 | physid_set(i, phys_id_present_map); | 1786 | physid_set(i, phys_id_present_map); |
1779 | mp_ioapics[apic].mpc_apicid = i; | 1787 | mp_ioapics[apic].mp_apicid = i; |
1780 | } else { | 1788 | } else { |
1781 | physid_mask_t tmp; | 1789 | physid_mask_t tmp; |
1782 | tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); | 1790 | tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); |
1783 | apic_printk(APIC_VERBOSE, "Setting %d in the " | 1791 | apic_printk(APIC_VERBOSE, "Setting %d in the " |
1784 | "phys_id_present_map\n", | 1792 | "phys_id_present_map\n", |
1785 | mp_ioapics[apic].mpc_apicid); | 1793 | mp_ioapics[apic].mp_apicid); |
1786 | physids_or(phys_id_present_map, phys_id_present_map, tmp); | 1794 | physids_or(phys_id_present_map, phys_id_present_map, tmp); |
1787 | } | 1795 | } |
1788 | 1796 | ||
@@ -1791,21 +1799,21 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
1791 | * We need to adjust the IRQ routing table | 1799 | * We need to adjust the IRQ routing table |
1792 | * if the ID changed. | 1800 | * if the ID changed. |
1793 | */ | 1801 | */ |
1794 | if (old_id != mp_ioapics[apic].mpc_apicid) | 1802 | if (old_id != mp_ioapics[apic].mp_apicid) |
1795 | for (i = 0; i < mp_irq_entries; i++) | 1803 | for (i = 0; i < mp_irq_entries; i++) |
1796 | if (mp_irqs[i].mpc_dstapic == old_id) | 1804 | if (mp_irqs[i].mp_dstapic == old_id) |
1797 | mp_irqs[i].mpc_dstapic | 1805 | mp_irqs[i].mp_dstapic |
1798 | = mp_ioapics[apic].mpc_apicid; | 1806 | = mp_ioapics[apic].mp_apicid; |
1799 | 1807 | ||
1800 | /* | 1808 | /* |
1801 | * Read the right value from the MPC table and | 1809 | * Read the right value from the MPC table and |
1802 | * write it into the ID register. | 1810 | * write it into the ID register. |
1803 | */ | 1811 | */ |
1804 | apic_printk(APIC_VERBOSE, KERN_INFO | 1812 | apic_printk(APIC_VERBOSE, KERN_INFO |
1805 | "...changing IO-APIC physical APIC ID to %d ...", | 1813 | "...changing IO-APIC physical APIC ID to %d ...", |
1806 | mp_ioapics[apic].mpc_apicid); | 1814 | mp_ioapics[apic].mp_apicid); |
1807 | 1815 | ||
1808 | reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; | 1816 | reg_00.bits.ID = mp_ioapics[apic].mp_apicid; |
1809 | spin_lock_irqsave(&ioapic_lock, flags); | 1817 | spin_lock_irqsave(&ioapic_lock, flags); |
1810 | io_apic_write(apic, 0, reg_00.raw); | 1818 | io_apic_write(apic, 0, reg_00.raw); |
1811 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1819 | spin_unlock_irqrestore(&ioapic_lock, flags); |
@@ -1816,15 +1824,12 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
1816 | spin_lock_irqsave(&ioapic_lock, flags); | 1824 | spin_lock_irqsave(&ioapic_lock, flags); |
1817 | reg_00.raw = io_apic_read(apic, 0); | 1825 | reg_00.raw = io_apic_read(apic, 0); |
1818 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1826 | spin_unlock_irqrestore(&ioapic_lock, flags); |
1819 | if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) | 1827 | if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) |
1820 | printk("could not set ID!\n"); | 1828 | printk("could not set ID!\n"); |
1821 | else | 1829 | else |
1822 | apic_printk(APIC_VERBOSE, " ok.\n"); | 1830 | apic_printk(APIC_VERBOSE, " ok.\n"); |
1823 | } | 1831 | } |
1824 | } | 1832 | } |
1825 | #else | ||
1826 | static void __init setup_ioapic_ids_from_mpc(void) { } | ||
1827 | #endif | ||
1828 | 1833 | ||
1829 | int no_timer_check __initdata; | 1834 | int no_timer_check __initdata; |
1830 | 1835 | ||
@@ -2015,45 +2020,53 @@ static inline void init_IO_APIC_traps(void) | |||
2015 | * The local APIC irq-chip implementation: | 2020 | * The local APIC irq-chip implementation: |
2016 | */ | 2021 | */ |
2017 | 2022 | ||
2018 | static void ack_apic(unsigned int irq) | 2023 | static void ack_lapic_irq(unsigned int irq) |
2019 | { | 2024 | { |
2020 | ack_APIC_irq(); | 2025 | ack_APIC_irq(); |
2021 | } | 2026 | } |
2022 | 2027 | ||
2023 | static void mask_lapic_irq (unsigned int irq) | 2028 | static void mask_lapic_irq(unsigned int irq) |
2024 | { | 2029 | { |
2025 | unsigned long v; | 2030 | unsigned long v; |
2026 | 2031 | ||
2027 | v = apic_read(APIC_LVT0); | 2032 | v = apic_read(APIC_LVT0); |
2028 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | 2033 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); |
2029 | } | 2034 | } |
2030 | 2035 | ||
2031 | static void unmask_lapic_irq (unsigned int irq) | 2036 | static void unmask_lapic_irq(unsigned int irq) |
2032 | { | 2037 | { |
2033 | unsigned long v; | 2038 | unsigned long v; |
2034 | 2039 | ||
2035 | v = apic_read(APIC_LVT0); | 2040 | v = apic_read(APIC_LVT0); |
2036 | apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); | 2041 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); |
2037 | } | 2042 | } |
2038 | 2043 | ||
2039 | static struct irq_chip lapic_chip __read_mostly = { | 2044 | static struct irq_chip lapic_chip __read_mostly = { |
2040 | .name = "local-APIC-edge", | 2045 | .name = "local-APIC", |
2041 | .mask = mask_lapic_irq, | 2046 | .mask = mask_lapic_irq, |
2042 | .unmask = unmask_lapic_irq, | 2047 | .unmask = unmask_lapic_irq, |
2043 | .eoi = ack_apic, | 2048 | .ack = ack_lapic_irq, |
2044 | }; | 2049 | }; |
2045 | 2050 | ||
2051 | static void lapic_register_intr(int irq, int vector) | ||
2052 | { | ||
2053 | irq_desc[irq].status &= ~IRQ_LEVEL; | ||
2054 | set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, | ||
2055 | "edge"); | ||
2056 | set_intr_gate(vector, interrupt[irq]); | ||
2057 | } | ||
2058 | |||
2046 | static void __init setup_nmi(void) | 2059 | static void __init setup_nmi(void) |
2047 | { | 2060 | { |
2048 | /* | 2061 | /* |
2049 | * Dirty trick to enable the NMI watchdog ... | 2062 | * Dirty trick to enable the NMI watchdog ... |
2050 | * We put the 8259A master into AEOI mode and | 2063 | * We put the 8259A master into AEOI mode and |
2051 | * unmask on all local APICs LVT0 as NMI. | 2064 | * unmask on all local APICs LVT0 as NMI. |
2052 | * | 2065 | * |
2053 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | 2066 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') |
2054 | * is from Maciej W. Rozycki - so we do not have to EOI from | 2067 | * is from Maciej W. Rozycki - so we do not have to EOI from |
2055 | * the NMI handler or the timer interrupt. | 2068 | * the NMI handler or the timer interrupt. |
2056 | */ | 2069 | */ |
2057 | apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); | 2070 | apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); |
2058 | 2071 | ||
2059 | enable_NMI_through_LVT0(); | 2072 | enable_NMI_through_LVT0(); |
@@ -2129,11 +2142,16 @@ static inline void __init unlock_ExtINT_logic(void) | |||
2129 | static inline void __init check_timer(void) | 2142 | static inline void __init check_timer(void) |
2130 | { | 2143 | { |
2131 | int apic1, pin1, apic2, pin2; | 2144 | int apic1, pin1, apic2, pin2; |
2145 | int no_pin1 = 0; | ||
2132 | int vector; | 2146 | int vector; |
2147 | unsigned int ver; | ||
2133 | unsigned long flags; | 2148 | unsigned long flags; |
2134 | 2149 | ||
2135 | local_irq_save(flags); | 2150 | local_irq_save(flags); |
2136 | 2151 | ||
2152 | ver = apic_read(APIC_LVR); | ||
2153 | ver = GET_APIC_VERSION(ver); | ||
2154 | |||
2137 | /* | 2155 | /* |
2138 | * get/set the timer IRQ vector: | 2156 | * get/set the timer IRQ vector: |
2139 | */ | 2157 | */ |
@@ -2142,34 +2160,54 @@ static inline void __init check_timer(void) | |||
2142 | set_intr_gate(vector, interrupt[0]); | 2160 | set_intr_gate(vector, interrupt[0]); |
2143 | 2161 | ||
2144 | /* | 2162 | /* |
2145 | * Subtle, code in do_timer_interrupt() expects an AEOI | 2163 | * As IRQ0 is to be enabled in the 8259A, the virtual |
2146 | * mode for the 8259A whenever interrupts are routed | 2164 | * wire has to be disabled in the local APIC. Also |
2147 | * through I/O APICs. Also IRQ0 has to be enabled in | 2165 | * timer interrupts need to be acknowledged manually in |
2148 | * the 8259A which implies the virtual wire has to be | 2166 | * the 8259A for the i82489DX when using the NMI |
2149 | * disabled in the local APIC. | 2167 | * watchdog as that APIC treats NMIs as level-triggered. |
2168 | * The AEOI mode will finish them in the 8259A | ||
2169 | * automatically. | ||
2150 | */ | 2170 | */ |
2151 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | 2171 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); |
2152 | init_8259A(1); | 2172 | init_8259A(1); |
2153 | timer_ack = 1; | 2173 | timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); |
2154 | if (timer_over_8254 > 0) | ||
2155 | enable_8259A_irq(0); | ||
2156 | 2174 | ||
2157 | pin1 = find_isa_irq_pin(0, mp_INT); | 2175 | pin1 = find_isa_irq_pin(0, mp_INT); |
2158 | apic1 = find_isa_irq_apic(0, mp_INT); | 2176 | apic1 = find_isa_irq_apic(0, mp_INT); |
2159 | pin2 = ioapic_i8259.pin; | 2177 | pin2 = ioapic_i8259.pin; |
2160 | apic2 = ioapic_i8259.apic; | 2178 | apic2 = ioapic_i8259.apic; |
2161 | 2179 | ||
2162 | printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", | 2180 | apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " |
2163 | vector, apic1, pin1, apic2, pin2); | 2181 | "apic1=%d pin1=%d apic2=%d pin2=%d\n", |
2182 | vector, apic1, pin1, apic2, pin2); | ||
2183 | |||
2184 | /* | ||
2185 | * Some BIOS writers are clueless and report the ExtINTA | ||
2186 | * I/O APIC input from the cascaded 8259A as the timer | ||
2187 | * interrupt input. So just in case, if only one pin | ||
2188 | * was found above, try it both directly and through the | ||
2189 | * 8259A. | ||
2190 | */ | ||
2191 | if (pin1 == -1) { | ||
2192 | pin1 = pin2; | ||
2193 | apic1 = apic2; | ||
2194 | no_pin1 = 1; | ||
2195 | } else if (pin2 == -1) { | ||
2196 | pin2 = pin1; | ||
2197 | apic2 = apic1; | ||
2198 | } | ||
2164 | 2199 | ||
2165 | if (pin1 != -1) { | 2200 | if (pin1 != -1) { |
2166 | /* | 2201 | /* |
2167 | * Ok, does IRQ0 through the IOAPIC work? | 2202 | * Ok, does IRQ0 through the IOAPIC work? |
2168 | */ | 2203 | */ |
2204 | if (no_pin1) { | ||
2205 | add_pin_to_irq(0, apic1, pin1); | ||
2206 | setup_timer_IRQ0_pin(apic1, pin1, vector); | ||
2207 | } | ||
2169 | unmask_IO_APIC_irq(0); | 2208 | unmask_IO_APIC_irq(0); |
2170 | if (timer_irq_works()) { | 2209 | if (timer_irq_works()) { |
2171 | if (nmi_watchdog == NMI_IO_APIC) { | 2210 | if (nmi_watchdog == NMI_IO_APIC) { |
2172 | disable_8259A_irq(0); | ||
2173 | setup_nmi(); | 2211 | setup_nmi(); |
2174 | enable_8259A_irq(0); | 2212 | enable_8259A_irq(0); |
2175 | } | 2213 | } |
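Taken together, the check_timer() hunks now probe a fixed ladder of fallbacks. A compact user-space model of that control flow, with the probes stubbed out (the helper names here are hypothetical, purely to show the ordering the hunks below implement):

#include <stdio.h>

/* Stubs standing in for timer_irq_works() at each stage; flip the
 * return values to trace different paths through the ladder. */
static int works_through_ioapic(void)  { return 0; }  /* pin1 */
static int works_through_8259(void)    { return 0; }  /* pin2 */
static int works_as_virtual_wire(void) { return 1; }  /* LVT0 fixed mode */
static int works_as_extint(void)       { return 0; }  /* last resort */

int main(void)
{
    if (works_through_ioapic())
        puts("timer OK via IO-APIC pin1");
    else if (works_through_8259())
        puts("timer OK via 8259A (timer_through_8259 = 1)");
    else if (works_as_virtual_wire())
        puts("timer OK as Virtual Wire IRQ");
    else if (works_as_extint())
        puts("timer OK as ExtINT IRQ");
    else
        puts("panic: IO-APIC + timer doesn't work!");
    return 0;
}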
@@ -2178,81 +2216,97 @@ static inline void __init check_timer(void) | |||
2178 | goto out; | 2216 | goto out; |
2179 | } | 2217 | } |
2180 | clear_IO_APIC_pin(apic1, pin1); | 2218 | clear_IO_APIC_pin(apic1, pin1); |
2181 | printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " | 2219 | if (!no_pin1) |
2182 | "IO-APIC\n"); | 2220 | apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " |
2183 | } | 2221 | "8254 timer not connected to IO-APIC\n"); |
2184 | 2222 | ||
2185 | printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); | 2223 | apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " |
2186 | if (pin2 != -1) { | 2224 | "(IRQ0) through the 8259A ...\n"); |
2187 | printk("\n..... (found pin %d) ...", pin2); | 2225 | apic_printk(APIC_QUIET, KERN_INFO |
2226 | "..... (found apic %d pin %d) ...\n", apic2, pin2); | ||
2188 | /* | 2227 | /* |
2189 | * legacy devices should be connected to IO APIC #0 | 2228 | * legacy devices should be connected to IO APIC #0 |
2190 | */ | 2229 | */ |
2191 | setup_ExtINT_IRQ0_pin(apic2, pin2, vector); | 2230 | replace_pin_at_irq(0, apic1, pin1, apic2, pin2); |
2231 | setup_timer_IRQ0_pin(apic2, pin2, vector); | ||
2232 | unmask_IO_APIC_irq(0); | ||
2233 | enable_8259A_irq(0); | ||
2192 | if (timer_irq_works()) { | 2234 | if (timer_irq_works()) { |
2193 | printk("works.\n"); | 2235 | apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); |
2194 | if (pin1 != -1) | 2236 | timer_through_8259 = 1; |
2195 | replace_pin_at_irq(0, apic1, pin1, apic2, pin2); | ||
2196 | else | ||
2197 | add_pin_to_irq(0, apic2, pin2); | ||
2198 | if (nmi_watchdog == NMI_IO_APIC) { | 2237 | if (nmi_watchdog == NMI_IO_APIC) { |
2238 | disable_8259A_irq(0); | ||
2199 | setup_nmi(); | 2239 | setup_nmi(); |
2240 | enable_8259A_irq(0); | ||
2200 | } | 2241 | } |
2201 | goto out; | 2242 | goto out; |
2202 | } | 2243 | } |
2203 | /* | 2244 | /* |
2204 | * Cleanup, just in case ... | 2245 | * Cleanup, just in case ... |
2205 | */ | 2246 | */ |
2247 | disable_8259A_irq(0); | ||
2206 | clear_IO_APIC_pin(apic2, pin2); | 2248 | clear_IO_APIC_pin(apic2, pin2); |
2249 | apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); | ||
2207 | } | 2250 | } |
2208 | printk(" failed.\n"); | ||
2209 | 2251 | ||
2210 | if (nmi_watchdog == NMI_IO_APIC) { | 2252 | if (nmi_watchdog == NMI_IO_APIC) { |
2211 | printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | 2253 | apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " |
2212 | nmi_watchdog = 0; | 2254 | "through the IO-APIC - disabling NMI Watchdog!\n"); |
2255 | nmi_watchdog = NMI_NONE; | ||
2213 | } | 2256 | } |
2257 | timer_ack = 0; | ||
2214 | 2258 | ||
2215 | printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | 2259 | apic_printk(APIC_QUIET, KERN_INFO |
2260 | "...trying to set up timer as Virtual Wire IRQ...\n"); | ||
2216 | 2261 | ||
2217 | disable_8259A_irq(0); | 2262 | lapic_register_intr(0, vector); |
2218 | set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, | 2263 | apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ |
2219 | "fasteoi"); | ||
2220 | apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ | ||
2221 | enable_8259A_irq(0); | 2264 | enable_8259A_irq(0); |
2222 | 2265 | ||
2223 | if (timer_irq_works()) { | 2266 | if (timer_irq_works()) { |
2224 | printk(" works.\n"); | 2267 | apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); |
2225 | goto out; | 2268 | goto out; |
2226 | } | 2269 | } |
2227 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); | 2270 | disable_8259A_irq(0); |
2228 | printk(" failed.\n"); | 2271 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); |
2272 | apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); | ||
2229 | 2273 | ||
2230 | printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | 2274 | apic_printk(APIC_QUIET, KERN_INFO |
2275 | "...trying to set up timer as ExtINT IRQ...\n"); | ||
2231 | 2276 | ||
2232 | timer_ack = 0; | ||
2233 | init_8259A(0); | 2277 | init_8259A(0); |
2234 | make_8259A_irq(0); | 2278 | make_8259A_irq(0); |
2235 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | 2279 | apic_write(APIC_LVT0, APIC_DM_EXTINT); |
2236 | 2280 | ||
2237 | unlock_ExtINT_logic(); | 2281 | unlock_ExtINT_logic(); |
2238 | 2282 | ||
2239 | if (timer_irq_works()) { | 2283 | if (timer_irq_works()) { |
2240 | printk(" works.\n"); | 2284 | apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); |
2241 | goto out; | 2285 | goto out; |
2242 | } | 2286 | } |
2243 | printk(" failed :(.\n"); | 2287 | apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); |
2244 | panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " | 2288 | panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " |
2245 | "report. Then try booting with the 'noapic' option"); | 2289 | "report. Then try booting with the 'noapic' option.\n"); |
2246 | out: | 2290 | out: |
2247 | local_irq_restore(flags); | 2291 | local_irq_restore(flags); |
2248 | } | 2292 | } |
2249 | 2293 | ||
2250 | /* | 2294 | /* |
2251 | * | 2295 | * Traditionally ISA IRQ2 is the cascade IRQ, and is not available |
2252 | * IRQ's that are handled by the PIC in the MPS IOAPIC case. | 2296 | * to devices. However there may be an I/O APIC pin available for |
2253 | * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. | 2297 | * this interrupt regardless. The pin may be left unconnected, but |
2254 | * Linux doesn't really care, as it's not actually used | 2298 | * typically it will be reused as an ExtINT cascade interrupt for |
2255 | * for any interrupt handling anyway. | 2299 | * the master 8259A. In the MPS case such a pin will normally be |
2300 | * reported as an ExtINT interrupt in the MP table. With ACPI | ||
2301 | * there is no provision for ExtINT interrupts, and in the absence | ||
2302 | * of an override it would be treated as an ordinary ISA I/O APIC | ||
2303 | * interrupt, that is edge-triggered and unmasked by default. We | ||
2304 | * used to do this, but it caused problems on some systems because | ||
2305 | * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using | ||
2306 | * the same ExtINT cascade interrupt to drive the local APIC of the | ||
2307 | * bootstrap processor. Therefore we refrain from routing IRQ2 to | ||
2308 | * the I/O APIC in all cases now. No actual device should request | ||
2309 | * it anyway. --macro | ||
2256 | */ | 2310 | */ |
2257 | #define PIC_IRQS (1 << PIC_CASCADE_IR) | 2311 | #define PIC_IRQS (1 << PIC_CASCADE_IR) |
2258 | 2312 | ||
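PIC_CASCADE_IR is 2 on x86, so PIC_IRQS is just bit 2, and the now-unconditional io_apic_irqs = ~PIC_IRQS in the hunk below reads as "everything except the cascade goes through the IO-APIC". A two-printf check of those masks (assuming the standard PIC_CASCADE_IR value):

#include <stdio.h>

#define PIC_CASCADE_IR 2                    /* IRQ2: master-to-slave cascade */
#define PIC_IRQS (1 << PIC_CASCADE_IR)

int main(void)
{
    unsigned int io_apic_irqs = ~PIC_IRQS;  /* all but IRQ2 via IO-APIC */

    printf("PIC_IRQS     = %#06x\n", (unsigned int)PIC_IRQS); /* 0x0004 */
    printf("io_apic_irqs = %#010x\n", io_apic_irqs);          /* 0xfffffffb */
    return 0;
}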
@@ -2261,15 +2315,12 @@ void __init setup_IO_APIC(void) | |||
2261 | int i; | 2315 | int i; |
2262 | 2316 | ||
2263 | /* Reserve all the system vectors. */ | 2317 | /* Reserve all the system vectors. */ |
2264 | for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) | 2318 | for (i = first_system_vector; i < NR_VECTORS; i++) |
2265 | set_bit(i, used_vectors); | 2319 | set_bit(i, used_vectors); |
2266 | 2320 | ||
2267 | enable_IO_APIC(); | 2321 | enable_IO_APIC(); |
2268 | 2322 | ||
2269 | if (acpi_ioapic) | 2323 | io_apic_irqs = ~PIC_IRQS; |
2270 | io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | ||
2271 | else | ||
2272 | io_apic_irqs = ~PIC_IRQS; | ||
2273 | 2324 | ||
2274 | printk("ENABLING IO-APIC IRQs\n"); | 2325 | printk("ENABLING IO-APIC IRQs\n"); |
2275 | 2326 | ||
@@ -2286,28 +2337,14 @@ void __init setup_IO_APIC(void) | |||
2286 | print_IO_APIC(); | 2337 | print_IO_APIC(); |
2287 | } | 2338 | } |
2288 | 2339 | ||
2289 | static int __init setup_disable_8254_timer(char *s) | ||
2290 | { | ||
2291 | timer_over_8254 = -1; | ||
2292 | return 1; | ||
2293 | } | ||
2294 | static int __init setup_enable_8254_timer(char *s) | ||
2295 | { | ||
2296 | timer_over_8254 = 2; | ||
2297 | return 1; | ||
2298 | } | ||
2299 | |||
2300 | __setup("disable_8254_timer", setup_disable_8254_timer); | ||
2301 | __setup("enable_8254_timer", setup_enable_8254_timer); | ||
2302 | |||
2303 | /* | 2340 | /* |
2304 | * Called after all the initialization is done. If we didn't find any | 2341 | * Called after all the initialization is done. If we didn't find any |
2305 | * APIC bugs then we can allow the modify fast path | 2342 | * APIC bugs then we can allow the modify fast path |
2306 | */ | 2343 | */ |
2307 | 2344 | ||
2308 | static int __init io_apic_bug_finalize(void) | 2345 | static int __init io_apic_bug_finalize(void) |
2309 | { | 2346 | { |
2310 | if(sis_apic_bug == -1) | 2347 | if (sis_apic_bug == -1) |
2311 | sis_apic_bug = 0; | 2348 | sis_apic_bug = 0; |
2312 | return 0; | 2349 | return 0; |
2313 | } | 2350 | } |
@@ -2318,17 +2355,17 @@ struct sysfs_ioapic_data { | |||
2318 | struct sys_device dev; | 2355 | struct sys_device dev; |
2319 | struct IO_APIC_route_entry entry[0]; | 2356 | struct IO_APIC_route_entry entry[0]; |
2320 | }; | 2357 | }; |
2321 | static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; | 2358 | static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; |
2322 | 2359 | ||
2323 | static int ioapic_suspend(struct sys_device *dev, pm_message_t state) | 2360 | static int ioapic_suspend(struct sys_device *dev, pm_message_t state) |
2324 | { | 2361 | { |
2325 | struct IO_APIC_route_entry *entry; | 2362 | struct IO_APIC_route_entry *entry; |
2326 | struct sysfs_ioapic_data *data; | 2363 | struct sysfs_ioapic_data *data; |
2327 | int i; | 2364 | int i; |
2328 | 2365 | ||
2329 | data = container_of(dev, struct sysfs_ioapic_data, dev); | 2366 | data = container_of(dev, struct sysfs_ioapic_data, dev); |
2330 | entry = data->entry; | 2367 | entry = data->entry; |
2331 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) | 2368 | for (i = 0; i < nr_ioapic_registers[dev->id]; i++) |
2332 | entry[i] = ioapic_read_entry(dev->id, i); | 2369 | entry[i] = ioapic_read_entry(dev->id, i); |
2333 | 2370 | ||
2334 | return 0; | 2371 | return 0; |
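ioapic_suspend() recovers its per-IOAPIC buffer from the embedded sys_device via container_of() and snapshots every redirection entry into the trailing array. A self-contained model of that header-plus-flexible-array pattern (names are illustrative, not the kernel's):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct dev { int id; };
struct ioapic_data {
    struct dev dev;
    unsigned long long entry[];   /* one slot per redirection register */
};

static void save_entries(struct dev *d, int nr_registers)
{
    struct ioapic_data *data = container_of(d, struct ioapic_data, dev);
    int i;

    for (i = 0; i < nr_registers; i++)
        data->entry[i] = 0x10000ULL + i;  /* stands in for ioapic_read_entry() */
    printf("saved %d entries for IOAPIC %d\n", nr_registers, d->id);
}

int main(void)
{
    int nr = 24;
    struct ioapic_data *data = malloc(sizeof(*data) + nr * sizeof(data->entry[0]));

    if (!data)
        return 1;
    data->dev.id = 0;
    save_entries(&data->dev, nr);
    free(data);
    return 0;
}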
@@ -2341,18 +2378,18 @@ static int ioapic_resume(struct sys_device *dev) | |||
2341 | unsigned long flags; | 2378 | unsigned long flags; |
2342 | union IO_APIC_reg_00 reg_00; | 2379 | union IO_APIC_reg_00 reg_00; |
2343 | int i; | 2380 | int i; |
2344 | 2381 | ||
2345 | data = container_of(dev, struct sysfs_ioapic_data, dev); | 2382 | data = container_of(dev, struct sysfs_ioapic_data, dev); |
2346 | entry = data->entry; | 2383 | entry = data->entry; |
2347 | 2384 | ||
2348 | spin_lock_irqsave(&ioapic_lock, flags); | 2385 | spin_lock_irqsave(&ioapic_lock, flags); |
2349 | reg_00.raw = io_apic_read(dev->id, 0); | 2386 | reg_00.raw = io_apic_read(dev->id, 0); |
2350 | if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | 2387 | if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { |
2351 | reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | 2388 | reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; |
2352 | io_apic_write(dev->id, 0, reg_00.raw); | 2389 | io_apic_write(dev->id, 0, reg_00.raw); |
2353 | } | 2390 | } |
2354 | spin_unlock_irqrestore(&ioapic_lock, flags); | 2391 | spin_unlock_irqrestore(&ioapic_lock, flags); |
2355 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) | 2392 | for (i = 0; i < nr_ioapic_registers[dev->id]; i++) |
2356 | ioapic_write_entry(dev->id, i, entry[i]); | 2393 | ioapic_write_entry(dev->id, i, entry[i]); |
2357 | 2394 | ||
2358 | return 0; | 2395 | return 0; |
@@ -2366,24 +2403,23 @@ static struct sysdev_class ioapic_sysdev_class = { | |||
2366 | 2403 | ||
2367 | static int __init ioapic_init_sysfs(void) | 2404 | static int __init ioapic_init_sysfs(void) |
2368 | { | 2405 | { |
2369 | struct sys_device * dev; | 2406 | struct sys_device *dev; |
2370 | int i, size, error = 0; | 2407 | int i, size, error = 0; |
2371 | 2408 | ||
2372 | error = sysdev_class_register(&ioapic_sysdev_class); | 2409 | error = sysdev_class_register(&ioapic_sysdev_class); |
2373 | if (error) | 2410 | if (error) |
2374 | return error; | 2411 | return error; |
2375 | 2412 | ||
2376 | for (i = 0; i < nr_ioapics; i++ ) { | 2413 | for (i = 0; i < nr_ioapics; i++) { |
2377 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] | 2414 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] |
2378 | * sizeof(struct IO_APIC_route_entry); | 2415 | * sizeof(struct IO_APIC_route_entry); |
2379 | mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | 2416 | mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); |
2380 | if (!mp_ioapic_data[i]) { | 2417 | if (!mp_ioapic_data[i]) { |
2381 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | 2418 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); |
2382 | continue; | 2419 | continue; |
2383 | } | 2420 | } |
2384 | memset(mp_ioapic_data[i], 0, size); | ||
2385 | dev = &mp_ioapic_data[i]->dev; | 2421 | dev = &mp_ioapic_data[i]->dev; |
2386 | dev->id = i; | 2422 | dev->id = i; |
2387 | dev->cls = &ioapic_sysdev_class; | 2423 | dev->cls = &ioapic_sysdev_class; |
2388 | error = sysdev_register(dev); | 2424 | error = sysdev_register(dev); |
2389 | if (error) { | 2425 | if (error) { |
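The sysfs hunk above also folds kmalloc()+memset() into kzalloc(). In portable C the same refactor is malloc()+memset() versus calloc(); a minimal sketch:

#include <stdlib.h>
#include <string.h>

/* Before: allocate, then zero by hand. */
static void *alloc_before(size_t size)
{
    void *p = malloc(size);

    if (p)
        memset(p, 0, size);
    return p;
}

/* After: one call returning zeroed memory, like kzalloc(). */
static void *alloc_after(size_t size)
{
    return calloc(1, size);
}

int main(void)
{
    free(alloc_before(64));
    free(alloc_after(64));
    return 0;
}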
@@ -2458,7 +2494,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms | |||
2458 | msg->address_lo = | 2494 | msg->address_lo = |
2459 | MSI_ADDR_BASE_LO | | 2495 | MSI_ADDR_BASE_LO | |
2460 | ((INT_DEST_MODE == 0) ? | 2496 | ((INT_DEST_MODE == 0) ? |
2461 | MSI_ADDR_DEST_MODE_PHYSICAL: | 2497 | MSI_ADDR_DEST_MODE_PHYSICAL: |
2462 | MSI_ADDR_DEST_MODE_LOGICAL) | | 2498 | MSI_ADDR_DEST_MODE_LOGICAL) | |
2463 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | 2499 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? |
2464 | MSI_ADDR_REDIRECTION_CPU: | 2500 | MSI_ADDR_REDIRECTION_CPU: |
@@ -2469,7 +2505,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms | |||
2469 | MSI_DATA_TRIGGER_EDGE | | 2505 | MSI_DATA_TRIGGER_EDGE | |
2470 | MSI_DATA_LEVEL_ASSERT | | 2506 | MSI_DATA_LEVEL_ASSERT | |
2471 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | 2507 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? |
2472 | MSI_DATA_DELIVERY_FIXED: | 2508 | MSI_DATA_DELIVERY_FIXED: |
2473 | MSI_DATA_DELIVERY_LOWPRI) | | 2509 | MSI_DATA_DELIVERY_LOWPRI) | |
2474 | MSI_DATA_VECTOR(vector); | 2510 | MSI_DATA_VECTOR(vector); |
2475 | } | 2511 | } |
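msi_compose_msg() builds the MSI address and data words by OR-ing mode bits around the vector, as the reindented hunks above show. An illustrative composition with constants modelled on asm/msidef.h (check the real header before relying on the exact encodings):

#include <stdio.h>
#include <stdint.h>

#define MSI_ADDR_BASE_LO           0xfee00000u
#define MSI_ADDR_DEST_MODE_LOGICAL (1u << 2)
#define MSI_DATA_TRIGGER_EDGE      (0u << 15)
#define MSI_DATA_DELIVERY_FIXED    (0u << 8)
#define MSI_DATA_VECTOR(v)         ((v) & 0xffu)

int main(void)
{
    uint8_t vector = 0x31;
    uint32_t address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_MODE_LOGICAL;
    uint32_t data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_DELIVERY_FIXED |
                    MSI_DATA_VECTOR(vector);

    printf("address_lo = %#010x\n", address_lo);  /* 0xfee00004 */
    printf("data       = %#06x\n", data);         /* 0x0031 */
    return 0;
}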
@@ -2640,12 +2676,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | |||
2640 | #endif /* CONFIG_HT_IRQ */ | 2676 | #endif /* CONFIG_HT_IRQ */ |
2641 | 2677 | ||
2642 | /* -------------------------------------------------------------------------- | 2678 | /* -------------------------------------------------------------------------- |
2643 | ACPI-based IOAPIC Configuration | 2679 | ACPI-based IOAPIC Configuration |
2644 | -------------------------------------------------------------------------- */ | 2680 | -------------------------------------------------------------------------- */ |
2645 | 2681 | ||
2646 | #ifdef CONFIG_ACPI | 2682 | #ifdef CONFIG_ACPI |
2647 | 2683 | ||
2648 | int __init io_apic_get_unique_id (int ioapic, int apic_id) | 2684 | int __init io_apic_get_unique_id(int ioapic, int apic_id) |
2649 | { | 2685 | { |
2650 | union IO_APIC_reg_00 reg_00; | 2686 | union IO_APIC_reg_00 reg_00; |
2651 | static physid_mask_t apic_id_map = PHYSID_MASK_NONE; | 2687 | static physid_mask_t apic_id_map = PHYSID_MASK_NONE; |
@@ -2654,10 +2690,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) | |||
2654 | int i = 0; | 2690 | int i = 0; |
2655 | 2691 | ||
2656 | /* | 2692 | /* |
2657 | * The P4 platform supports up to 256 APIC IDs on two separate APIC | 2693 | * The P4 platform supports up to 256 APIC IDs on two separate APIC |
2658 | * buses (one for LAPICs, one for IOAPICs), where predecessors only | 2694 | * buses (one for LAPICs, one for IOAPICs), where predecessors only |
2659 | * support up to 16 on one shared APIC bus. | 2695 | * support up to 16 on one shared APIC bus. |
2660 | * | 2696 | * |
2661 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | 2697 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full |
2662 | * advantage of new APIC bus architecture. | 2698 | * advantage of new APIC bus architecture. |
2663 | */ | 2699 | */ |
@@ -2676,7 +2712,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) | |||
2676 | } | 2712 | } |
2677 | 2713 | ||
2678 | /* | 2714 | /* |
2679 | * Every APIC in a system must have a unique ID or we get lots of nice | 2715 | * Every APIC in a system must have a unique ID or we get lots of nice |
2680 | * 'stuck on smp_invalidate_needed IPI wait' messages. | 2716 | * 'stuck on smp_invalidate_needed IPI wait' messages. |
2681 | */ | 2717 | */ |
2682 | if (check_apicid_used(apic_id_map, apic_id)) { | 2718 | if (check_apicid_used(apic_id_map, apic_id)) { |
@@ -2693,7 +2729,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) | |||
2693 | "trying %d\n", ioapic, apic_id, i); | 2729 | "trying %d\n", ioapic, apic_id, i); |
2694 | 2730 | ||
2695 | apic_id = i; | 2731 | apic_id = i; |
2696 | } | 2732 | } |
2697 | 2733 | ||
2698 | tmp = apicid_to_cpu_present(apic_id); | 2734 | tmp = apicid_to_cpu_present(apic_id); |
2699 | physids_or(apic_id_map, apic_id_map, tmp); | 2735 | physids_or(apic_id_map, apic_id_map, tmp); |
@@ -2720,7 +2756,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) | |||
2720 | } | 2756 | } |
2721 | 2757 | ||
2722 | 2758 | ||
2723 | int __init io_apic_get_version (int ioapic) | 2759 | int __init io_apic_get_version(int ioapic) |
2724 | { | 2760 | { |
2725 | union IO_APIC_reg_01 reg_01; | 2761 | union IO_APIC_reg_01 reg_01; |
2726 | unsigned long flags; | 2762 | unsigned long flags; |
@@ -2733,7 +2769,7 @@ int __init io_apic_get_version (int ioapic) | |||
2733 | } | 2769 | } |
2734 | 2770 | ||
2735 | 2771 | ||
2736 | int __init io_apic_get_redir_entries (int ioapic) | 2772 | int __init io_apic_get_redir_entries(int ioapic) |
2737 | { | 2773 | { |
2738 | union IO_APIC_reg_01 reg_01; | 2774 | union IO_APIC_reg_01 reg_01; |
2739 | unsigned long flags; | 2775 | unsigned long flags; |
@@ -2746,7 +2782,7 @@ int __init io_apic_get_redir_entries (int ioapic) | |||
2746 | } | 2782 | } |
2747 | 2783 | ||
2748 | 2784 | ||
2749 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) | 2785 | int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) |
2750 | { | 2786 | { |
2751 | struct IO_APIC_route_entry entry; | 2787 | struct IO_APIC_route_entry entry; |
2752 | 2788 | ||
@@ -2762,7 +2798,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a | |||
2762 | * corresponding device driver registers for this IRQ. | 2798 | * corresponding device driver registers for this IRQ. |
2763 | */ | 2799 | */ |
2764 | 2800 | ||
2765 | memset(&entry,0,sizeof(entry)); | 2801 | memset(&entry, 0, sizeof(entry)); |
2766 | 2802 | ||
2767 | entry.delivery_mode = INT_DELIVERY_MODE; | 2803 | entry.delivery_mode = INT_DELIVERY_MODE; |
2768 | entry.dest_mode = INT_DEST_MODE; | 2804 | entry.dest_mode = INT_DEST_MODE; |
@@ -2781,7 +2817,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a | |||
2781 | 2817 | ||
2782 | apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " | 2818 | apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " |
2783 | "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, | 2819 | "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, |
2784 | mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, | 2820 | mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, |
2785 | edge_level, active_high_low); | 2821 | edge_level, active_high_low); |
2786 | 2822 | ||
2787 | ioapic_register_intr(irq, entry.vector, edge_level); | 2823 | ioapic_register_intr(irq, entry.vector, edge_level); |
@@ -2802,8 +2838,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | |||
2802 | return -1; | 2838 | return -1; |
2803 | 2839 | ||
2804 | for (i = 0; i < mp_irq_entries; i++) | 2840 | for (i = 0; i < mp_irq_entries; i++) |
2805 | if (mp_irqs[i].mpc_irqtype == mp_INT && | 2841 | if (mp_irqs[i].mp_irqtype == mp_INT && |
2806 | mp_irqs[i].mpc_srcbusirq == bus_irq) | 2842 | mp_irqs[i].mp_srcbusirq == bus_irq) |
2807 | break; | 2843 | break; |
2808 | if (i >= mp_irq_entries) | 2844 | if (i >= mp_irq_entries) |
2809 | return -1; | 2845 | return -1; |
@@ -2836,3 +2872,34 @@ static int __init parse_noapic(char *arg) | |||
2836 | return 0; | 2872 | return 0; |
2837 | } | 2873 | } |
2838 | early_param("noapic", parse_noapic); | 2874 | early_param("noapic", parse_noapic); |
2875 | |||
2876 | void __init ioapic_init_mappings(void) | ||
2877 | { | ||
2878 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | ||
2879 | int i; | ||
2880 | |||
2881 | for (i = 0; i < nr_ioapics; i++) { | ||
2882 | if (smp_found_config) { | ||
2883 | ioapic_phys = mp_ioapics[i].mp_apicaddr; | ||
2884 | if (!ioapic_phys) { | ||
2885 | printk(KERN_ERR | ||
2886 | "WARNING: bogus zero IO-APIC " | ||
2887 | "address found in MPTABLE, " | ||
2888 | "disabling IO/APIC support!\n"); | ||
2889 | smp_found_config = 0; | ||
2890 | skip_ioapic_setup = 1; | ||
2891 | goto fake_ioapic_page; | ||
2892 | } | ||
2893 | } else { | ||
2894 | fake_ioapic_page: | ||
2895 | ioapic_phys = (unsigned long) | ||
2896 | alloc_bootmem_pages(PAGE_SIZE); | ||
2897 | ioapic_phys = __pa(ioapic_phys); | ||
2898 | } | ||
2899 | set_fixmap_nocache(idx, ioapic_phys); | ||
2900 | printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", | ||
2901 | __fix_to_virt(idx), ioapic_phys); | ||
2902 | idx++; | ||
2903 | } | ||
2904 | } | ||
2905 | |||
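The new ioapic_init_mappings() gives each IO-APIC's MMIO page a fixmap slot, substituting a freshly allocated dummy page when the MP table reports a bogus zero address so that later register reads cannot fault. The loop's shape, with the MMU calls stubbed out:

#include <stdio.h>

/* Stands in for set_fixmap_nocache(): one fixed slot per IO-APIC. */
static void map_nocache(int idx, unsigned long phys)
{
    printf("fixmap slot %d -> phys %#lx\n", idx, phys);
}

int main(void)
{
    unsigned long addrs[] = { 0xfec00000UL, 0x0UL };  /* second one is bogus */
    int idx = 0;
    int i;

    for (i = 0; i < 2; i++) {
        unsigned long phys = addrs[i];

        if (!phys) {
            puts("WARNING: bogus zero IO-APIC address, mapping dummy page");
            phys = 0x1000UL;    /* stands in for alloc_bootmem_pages() */
        }
        map_nocache(idx++, phys);
    }
    return 0;
}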
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc529..8269434d1707 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <asm/proto.h> | 45 | #include <asm/proto.h> |
46 | #include <asm/acpi.h> | 46 | #include <asm/acpi.h> |
47 | #include <asm/dma.h> | 47 | #include <asm/dma.h> |
48 | #include <asm/i8259.h> | ||
48 | #include <asm/nmi.h> | 49 | #include <asm/nmi.h> |
49 | #include <asm/msidef.h> | 50 | #include <asm/msidef.h> |
50 | #include <asm/hypertransport.h> | 51 | #include <asm/hypertransport.h> |
@@ -61,7 +62,7 @@ struct irq_cfg { | |||
61 | }; | 62 | }; |
62 | 63 | ||
63 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ | 64 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ |
64 | struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { | 65 | static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { |
65 | [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, | 66 | [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, |
66 | [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, | 67 | [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, |
67 | [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, | 68 | [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, |
@@ -82,6 +83,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { | |||
82 | 83 | ||
83 | static int assign_irq_vector(int irq, cpumask_t mask); | 84 | static int assign_irq_vector(int irq, cpumask_t mask); |
84 | 85 | ||
86 | int first_system_vector = 0xfe; | ||
87 | |||
88 | char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; | ||
89 | |||
85 | #define __apicdebuginit __init | 90 | #define __apicdebuginit __init |
86 | 91 | ||
87 | int sis_apic_bug; /* not actually supported, dummy for compile */ | 92 | int sis_apic_bug; /* not actually supported, dummy for compile */ |
@@ -90,7 +95,7 @@ static int no_timer_check; | |||
90 | 95 | ||
91 | static int disable_timer_pin_1 __initdata; | 96 | static int disable_timer_pin_1 __initdata; |
92 | 97 | ||
93 | int timer_over_8254 __initdata = 1; | 98 | int timer_through_8259 __initdata; |
94 | 99 | ||
95 | /* Where if anywhere is the i8259 connected in external int mode */ | 100 | /* Where if anywhere is the i8259 connected in external int mode */ |
96 | static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | 101 | static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; |
@@ -104,15 +109,17 @@ DEFINE_SPINLOCK(vector_lock); | |||
104 | int nr_ioapic_registers[MAX_IO_APICS]; | 109 | int nr_ioapic_registers[MAX_IO_APICS]; |
105 | 110 | ||
106 | /* I/O APIC entries */ | 111 | /* I/O APIC entries */ |
107 | struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | 112 | struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; |
108 | int nr_ioapics; | 113 | int nr_ioapics; |
109 | 114 | ||
110 | /* MP IRQ source entries */ | 115 | /* MP IRQ source entries */ |
111 | struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | 116 | struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; |
112 | 117 | ||
113 | /* # of MP IRQ source entries */ | 118 | /* # of MP IRQ source entries */ |
114 | int mp_irq_entries; | 119 | int mp_irq_entries; |
115 | 120 | ||
121 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | ||
122 | |||
116 | /* | 123 | /* |
117 | * Rough estimation of how many shared IRQs there are, can | 124 | * Rough estimation of how many shared IRQs there are, can |
118 | * be changed anytime. | 125 | * be changed anytime. |
@@ -140,7 +147,7 @@ struct io_apic { | |||
140 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) | 147 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) |
141 | { | 148 | { |
142 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) | 149 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) |
143 | + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); | 150 | + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); |
144 | } | 151 | } |
145 | 152 | ||
146 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) | 153 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) |
@@ -183,7 +190,7 @@ static bool io_apic_level_ack_pending(unsigned int irq) | |||
183 | break; | 190 | break; |
184 | reg = io_apic_read(entry->apic, 0x10 + pin*2); | 191 | reg = io_apic_read(entry->apic, 0x10 + pin*2); |
185 | /* Is the remote IRR bit set? */ | 192 | /* Is the remote IRR bit set? */ |
186 | if ((reg >> 14) & 1) { | 193 | if (reg & IO_APIC_REDIR_REMOTE_IRR) { |
187 | spin_unlock_irqrestore(&ioapic_lock, flags); | 194 | spin_unlock_irqrestore(&ioapic_lock, flags); |
188 | return true; | 195 | return true; |
189 | } | 196 | } |
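(reg >> 14) & 1 and the new IO_APIC_REDIR_REMOTE_IRR test in the hunk above read the same bit; the named constants only document the redirection-entry layout. Assuming the usual bit positions (vector in bits 0-7, remote IRR bit 14, mask bit 16):

#include <stdio.h>

#define IO_APIC_REDIR_VECTOR_MASK 0x000ffu
#define IO_APIC_REDIR_REMOTE_IRR  (1u << 14)
#define IO_APIC_REDIR_MASKED      (1u << 16)

int main(void)
{
    unsigned int reg = (1u << 14) | 0x31;   /* remote IRR set, vector 0x31 */

    /* Old style vs. new style: identical result, clearer intent. */
    printf("old: %u  new: %u\n",
           (reg >> 14) & 1, (reg & IO_APIC_REDIR_REMOTE_IRR) ? 1u : 0u);
    printf("vector = %#x, masked = %u\n",
           reg & IO_APIC_REDIR_VECTOR_MASK,
           (reg & IO_APIC_REDIR_MASKED) ? 1u : 0u);
    return 0;
}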
@@ -298,7 +305,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) | |||
298 | break; | 305 | break; |
299 | io_apic_write(apic, 0x11 + pin*2, dest); | 306 | io_apic_write(apic, 0x11 + pin*2, dest); |
300 | reg = io_apic_read(apic, 0x10 + pin*2); | 307 | reg = io_apic_read(apic, 0x10 + pin*2); |
301 | reg &= ~0x000000ff; | 308 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; |
302 | reg |= vector; | 309 | reg |= vector; |
303 | io_apic_modify(apic, reg); | 310 | io_apic_modify(apic, reg); |
304 | if (!entry->next) | 311 | if (!entry->next) |
@@ -360,16 +367,37 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) | |||
360 | entry->pin = pin; | 367 | entry->pin = pin; |
361 | } | 368 | } |
362 | 369 | ||
370 | /* | ||
371 | * Reroute an IRQ to a different pin. | ||
372 | */ | ||
373 | static void __init replace_pin_at_irq(unsigned int irq, | ||
374 | int oldapic, int oldpin, | ||
375 | int newapic, int newpin) | ||
376 | { | ||
377 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
378 | |||
379 | while (1) { | ||
380 | if (entry->apic == oldapic && entry->pin == oldpin) { | ||
381 | entry->apic = newapic; | ||
382 | entry->pin = newpin; | ||
383 | } | ||
384 | if (!entry->next) | ||
385 | break; | ||
386 | entry = irq_2_pin + entry->next; | ||
387 | } | ||
388 | } | ||
389 | |||
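irq_2_pin is an array-backed singly linked list, and the new replace_pin_at_irq() walks one chain rewriting every matching (apic, pin) pair, which is how check_timer() moves IRQ0 from pin1 to pin2. A standalone model of that traversal, with the sentinel convention simplified:

#include <stdio.h>

/* Array-backed pin list: 'next' is an index into the array, 0 = end. */
struct pin_entry { int apic, pin, next; };

static struct pin_entry pins[8] = {
    [0] = { .apic = 0, .pin = 2, .next = 1 },
    [1] = { .apic = 1, .pin = 0, .next = 0 },   /* end of chain */
};

static void replace_pin(int head, int oldapic, int oldpin,
                        int newapic, int newpin)
{
    struct pin_entry *e = &pins[head];

    for (;;) {
        if (e->apic == oldapic && e->pin == oldpin) {
            e->apic = newapic;
            e->pin = newpin;
        }
        if (!e->next)
            break;
        e = &pins[e->next];
    }
}

int main(void)
{
    replace_pin(0, 0, 2, 1, 4);
    printf("head entry now %d-%d\n", pins[0].apic, pins[0].pin);  /* 1-4 */
    return 0;
}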
363 | 390 | ||
364 | #define DO_ACTION(name,R,ACTION, FINAL) \ | 391 | #define DO_ACTION(name,R,ACTION, FINAL) \ |
365 | \ | 392 | \ |
366 | static void name##_IO_APIC_irq (unsigned int irq) \ | 393 | static void name##_IO_APIC_irq (unsigned int irq) \ |
367 | __DO_ACTION(R, ACTION, FINAL) | 394 | __DO_ACTION(R, ACTION, FINAL) |
368 | 395 | ||
369 | DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) | 396 | /* mask = 1 */ |
370 | /* mask = 1 */ | 397 | DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) |
371 | DO_ACTION( __unmask, 0, &= 0xfffeffff, ) | 398 | |
372 | /* mask = 0 */ | 399 | /* mask = 0 */ |
400 | DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) | ||
373 | 401 | ||
374 | static void mask_IO_APIC_irq (unsigned int irq) | 402 | static void mask_IO_APIC_irq (unsigned int irq) |
375 | { | 403 | { |
@@ -430,20 +458,6 @@ static int __init disable_timer_pin_setup(char *arg) | |||
430 | } | 458 | } |
431 | __setup("disable_timer_pin_1", disable_timer_pin_setup); | 459 | __setup("disable_timer_pin_1", disable_timer_pin_setup); |
432 | 460 | ||
433 | static int __init setup_disable_8254_timer(char *s) | ||
434 | { | ||
435 | timer_over_8254 = -1; | ||
436 | return 1; | ||
437 | } | ||
438 | static int __init setup_enable_8254_timer(char *s) | ||
439 | { | ||
440 | timer_over_8254 = 2; | ||
441 | return 1; | ||
442 | } | ||
443 | |||
444 | __setup("disable_8254_timer", setup_disable_8254_timer); | ||
445 | __setup("enable_8254_timer", setup_enable_8254_timer); | ||
446 | |||
447 | 461 | ||
448 | /* | 462 | /* |
449 | * Find the IRQ entry number of a certain pin. | 463 | * Find the IRQ entry number of a certain pin. |
@@ -453,10 +467,10 @@ static int find_irq_entry(int apic, int pin, int type) | |||
453 | int i; | 467 | int i; |
454 | 468 | ||
455 | for (i = 0; i < mp_irq_entries; i++) | 469 | for (i = 0; i < mp_irq_entries; i++) |
456 | if (mp_irqs[i].mpc_irqtype == type && | 470 | if (mp_irqs[i].mp_irqtype == type && |
457 | (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | 471 | (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || |
458 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | 472 | mp_irqs[i].mp_dstapic == MP_APIC_ALL) && |
459 | mp_irqs[i].mpc_dstirq == pin) | 473 | mp_irqs[i].mp_dstirq == pin) |
460 | return i; | 474 | return i; |
461 | 475 | ||
462 | return -1; | 476 | return -1; |
@@ -470,13 +484,13 @@ static int __init find_isa_irq_pin(int irq, int type) | |||
470 | int i; | 484 | int i; |
471 | 485 | ||
472 | for (i = 0; i < mp_irq_entries; i++) { | 486 | for (i = 0; i < mp_irq_entries; i++) { |
473 | int lbus = mp_irqs[i].mpc_srcbus; | 487 | int lbus = mp_irqs[i].mp_srcbus; |
474 | 488 | ||
475 | if (test_bit(lbus, mp_bus_not_pci) && | 489 | if (test_bit(lbus, mp_bus_not_pci) && |
476 | (mp_irqs[i].mpc_irqtype == type) && | 490 | (mp_irqs[i].mp_irqtype == type) && |
477 | (mp_irqs[i].mpc_srcbusirq == irq)) | 491 | (mp_irqs[i].mp_srcbusirq == irq)) |
478 | 492 | ||
479 | return mp_irqs[i].mpc_dstirq; | 493 | return mp_irqs[i].mp_dstirq; |
480 | } | 494 | } |
481 | return -1; | 495 | return -1; |
482 | } | 496 | } |
@@ -486,17 +500,17 @@ static int __init find_isa_irq_apic(int irq, int type) | |||
486 | int i; | 500 | int i; |
487 | 501 | ||
488 | for (i = 0; i < mp_irq_entries; i++) { | 502 | for (i = 0; i < mp_irq_entries; i++) { |
489 | int lbus = mp_irqs[i].mpc_srcbus; | 503 | int lbus = mp_irqs[i].mp_srcbus; |
490 | 504 | ||
491 | if (test_bit(lbus, mp_bus_not_pci) && | 505 | if (test_bit(lbus, mp_bus_not_pci) && |
492 | (mp_irqs[i].mpc_irqtype == type) && | 506 | (mp_irqs[i].mp_irqtype == type) && |
493 | (mp_irqs[i].mpc_srcbusirq == irq)) | 507 | (mp_irqs[i].mp_srcbusirq == irq)) |
494 | break; | 508 | break; |
495 | } | 509 | } |
496 | if (i < mp_irq_entries) { | 510 | if (i < mp_irq_entries) { |
497 | int apic; | 511 | int apic; |
498 | for(apic = 0; apic < nr_ioapics; apic++) { | 512 | for(apic = 0; apic < nr_ioapics; apic++) { |
499 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) | 513 | if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) |
500 | return apic; | 514 | return apic; |
501 | } | 515 | } |
502 | } | 516 | } |
@@ -516,28 +530,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | |||
516 | 530 | ||
517 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | 531 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", |
518 | bus, slot, pin); | 532 | bus, slot, pin); |
519 | if (mp_bus_id_to_pci_bus[bus] == -1) { | 533 | if (test_bit(bus, mp_bus_not_pci)) { |
520 | apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | 534 | apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); |
521 | return -1; | 535 | return -1; |
522 | } | 536 | } |
523 | for (i = 0; i < mp_irq_entries; i++) { | 537 | for (i = 0; i < mp_irq_entries; i++) { |
524 | int lbus = mp_irqs[i].mpc_srcbus; | 538 | int lbus = mp_irqs[i].mp_srcbus; |
525 | 539 | ||
526 | for (apic = 0; apic < nr_ioapics; apic++) | 540 | for (apic = 0; apic < nr_ioapics; apic++) |
527 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | 541 | if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || |
528 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | 542 | mp_irqs[i].mp_dstapic == MP_APIC_ALL) |
529 | break; | 543 | break; |
530 | 544 | ||
531 | if (!test_bit(lbus, mp_bus_not_pci) && | 545 | if (!test_bit(lbus, mp_bus_not_pci) && |
532 | !mp_irqs[i].mpc_irqtype && | 546 | !mp_irqs[i].mp_irqtype && |
533 | (bus == lbus) && | 547 | (bus == lbus) && |
534 | (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | 548 | (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { |
535 | int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | 549 | int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); |
536 | 550 | ||
537 | if (!(apic || IO_APIC_IRQ(irq))) | 551 | if (!(apic || IO_APIC_IRQ(irq))) |
538 | continue; | 552 | continue; |
539 | 553 | ||
540 | if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | 554 | if (pin == (mp_irqs[i].mp_srcbusirq & 3)) |
541 | return irq; | 555 | return irq; |
542 | /* | 556 | /* |
543 | * Use the first all-but-pin matching entry as a | 557 | * Use the first all-but-pin matching entry as a |
@@ -565,13 +579,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | |||
565 | 579 | ||
566 | static int MPBIOS_polarity(int idx) | 580 | static int MPBIOS_polarity(int idx) |
567 | { | 581 | { |
568 | int bus = mp_irqs[idx].mpc_srcbus; | 582 | int bus = mp_irqs[idx].mp_srcbus; |
569 | int polarity; | 583 | int polarity; |
570 | 584 | ||
571 | /* | 585 | /* |
572 | * Determine IRQ line polarity (high active or low active): | 586 | * Determine IRQ line polarity (high active or low active): |
573 | */ | 587 | */ |
574 | switch (mp_irqs[idx].mpc_irqflag & 3) | 588 | switch (mp_irqs[idx].mp_irqflag & 3) |
575 | { | 589 | { |
576 | case 0: /* conforms, ie. bus-type dependent polarity */ | 590 | case 0: /* conforms, ie. bus-type dependent polarity */ |
577 | if (test_bit(bus, mp_bus_not_pci)) | 591 | if (test_bit(bus, mp_bus_not_pci)) |
@@ -607,13 +621,13 @@ static int MPBIOS_polarity(int idx) | |||
607 | 621 | ||
608 | static int MPBIOS_trigger(int idx) | 622 | static int MPBIOS_trigger(int idx) |
609 | { | 623 | { |
610 | int bus = mp_irqs[idx].mpc_srcbus; | 624 | int bus = mp_irqs[idx].mp_srcbus; |
611 | int trigger; | 625 | int trigger; |
612 | 626 | ||
613 | /* | 627 | /* |
614 | * Determine IRQ trigger mode (edge or level sensitive): | 628 | * Determine IRQ trigger mode (edge or level sensitive): |
615 | */ | 629 | */ |
616 | switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | 630 | switch ((mp_irqs[idx].mp_irqflag>>2) & 3) |
617 | { | 631 | { |
618 | case 0: /* conforms, ie. bus-type dependent */ | 632 | case 0: /* conforms, ie. bus-type dependent */ |
619 | if (test_bit(bus, mp_bus_not_pci)) | 633 | if (test_bit(bus, mp_bus_not_pci)) |
@@ -660,16 +674,16 @@ static inline int irq_trigger(int idx) | |||
660 | static int pin_2_irq(int idx, int apic, int pin) | 674 | static int pin_2_irq(int idx, int apic, int pin) |
661 | { | 675 | { |
662 | int irq, i; | 676 | int irq, i; |
663 | int bus = mp_irqs[idx].mpc_srcbus; | 677 | int bus = mp_irqs[idx].mp_srcbus; |
664 | 678 | ||
665 | /* | 679 | /* |
666 | * Debugging check, we are in big trouble if this message pops up! | 680 | * Debugging check, we are in big trouble if this message pops up! |
667 | */ | 681 | */ |
668 | if (mp_irqs[idx].mpc_dstirq != pin) | 682 | if (mp_irqs[idx].mp_dstirq != pin) |
669 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | 683 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); |
670 | 684 | ||
671 | if (test_bit(bus, mp_bus_not_pci)) { | 685 | if (test_bit(bus, mp_bus_not_pci)) { |
672 | irq = mp_irqs[idx].mpc_srcbusirq; | 686 | irq = mp_irqs[idx].mp_srcbusirq; |
673 | } else { | 687 | } else { |
674 | /* | 688 | /* |
675 | * PCI IRQs are mapped in order | 689 | * PCI IRQs are mapped in order |
@@ -718,7 +732,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) | |||
718 | return 0; | 732 | return 0; |
719 | } | 733 | } |
720 | 734 | ||
721 | for_each_cpu_mask(cpu, mask) { | 735 | for_each_cpu_mask_nr(cpu, mask) { |
722 | cpumask_t domain, new_mask; | 736 | cpumask_t domain, new_mask; |
723 | int new_cpu; | 737 | int new_cpu; |
724 | int vector, offset; | 738 | int vector, offset; |
@@ -730,7 +744,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) | |||
730 | offset = current_offset; | 744 | offset = current_offset; |
731 | next: | 745 | next: |
732 | vector += 8; | 746 | vector += 8; |
733 | if (vector >= FIRST_SYSTEM_VECTOR) { | 747 | if (vector >= first_system_vector) { |
734 | /* If we run out of vectors on large boxen, must share them. */ | 748 | /* If we run out of vectors on large boxen, must share them. */ |
735 | offset = (offset + 1) % 8; | 749 | offset = (offset + 1) % 8; |
736 | vector = FIRST_DEVICE_VECTOR + offset; | 750 | vector = FIRST_DEVICE_VECTOR + offset; |
@@ -739,7 +753,7 @@ next: | |||
739 | continue; | 753 | continue; |
740 | if (vector == IA32_SYSCALL_VECTOR) | 754 | if (vector == IA32_SYSCALL_VECTOR) |
741 | goto next; | 755 | goto next; |
742 | for_each_cpu_mask(new_cpu, new_mask) | 756 | for_each_cpu_mask_nr(new_cpu, new_mask) |
743 | if (per_cpu(vector_irq, new_cpu)[vector] != -1) | 757 | if (per_cpu(vector_irq, new_cpu)[vector] != -1) |
744 | goto next; | 758 | goto next; |
745 | /* Found one! */ | 759 | /* Found one! */ |
@@ -749,7 +763,7 @@ next: | |||
749 | cfg->move_in_progress = 1; | 763 | cfg->move_in_progress = 1; |
750 | cfg->old_domain = cfg->domain; | 764 | cfg->old_domain = cfg->domain; |
751 | } | 765 | } |
752 | for_each_cpu_mask(new_cpu, new_mask) | 766 | for_each_cpu_mask_nr(new_cpu, new_mask) |
753 | per_cpu(vector_irq, new_cpu)[vector] = irq; | 767 | per_cpu(vector_irq, new_cpu)[vector] = irq; |
754 | cfg->vector = vector; | 768 | cfg->vector = vector; |
755 | cfg->domain = domain; | 769 | cfg->domain = domain; |
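__assign_irq_vector() steps through candidate vectors 8 apart, so consecutive IRQs land in distinct priority classes, and wraps to the next offset once it passes first_system_vector. A toy allocator showing just that scan (the constants are illustrative; see the real irq_vectors definitions):

#include <stdio.h>
#include <string.h>

#define FIRST_DEVICE_VECTOR 0x41   /* illustrative starting vector */
#define NR_VECTORS          256

int main(void)
{
    int first_system_vector = 0xfe;
    int vector_irq[NR_VECTORS];
    int vector = FIRST_DEVICE_VECTOR, offset = 0;

    memset(vector_irq, -1, sizeof(vector_irq));   /* -1 = free */
    vector_irq[0x41] = 4;                         /* pretend two are taken */
    vector_irq[0x49] = 5;

    /* Step by 8; on hitting the system-vector region, bump the offset
     * and wrap, as in __assign_irq_vector(). Demo assumes a free slot
     * exists, so the scan terminates. */
    while (vector_irq[vector] != -1) {
        vector += 8;
        if (vector >= first_system_vector) {
            offset = (offset + 1) % 8;
            vector = FIRST_DEVICE_VECTOR + offset;
        }
    }
    printf("allocated vector %#x (offset %d)\n", vector, offset); /* 0x51 */
    return 0;
}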
@@ -781,14 +795,14 @@ static void __clear_irq_vector(int irq) | |||
781 | 795 | ||
782 | vector = cfg->vector; | 796 | vector = cfg->vector; |
783 | cpus_and(mask, cfg->domain, cpu_online_map); | 797 | cpus_and(mask, cfg->domain, cpu_online_map); |
784 | for_each_cpu_mask(cpu, mask) | 798 | for_each_cpu_mask_nr(cpu, mask) |
785 | per_cpu(vector_irq, cpu)[vector] = -1; | 799 | per_cpu(vector_irq, cpu)[vector] = -1; |
786 | 800 | ||
787 | cfg->vector = 0; | 801 | cfg->vector = 0; |
788 | cpus_clear(cfg->domain); | 802 | cpus_clear(cfg->domain); |
789 | } | 803 | } |
790 | 804 | ||
791 | void __setup_vector_irq(int cpu) | 805 | static void __setup_vector_irq(int cpu) |
792 | { | 806 | { |
793 | /* Initialize vector_irq on a new cpu */ | 807 | /* Initialize vector_irq on a new cpu */ |
794 | /* This function must be called with vector_lock held */ | 808 | /* This function must be called with vector_lock held */ |
@@ -811,6 +825,13 @@ void __setup_vector_irq(int cpu) | |||
811 | } | 825 | } |
812 | } | 826 | } |
813 | 827 | ||
828 | void setup_vector_irq(int cpu) | ||
829 | { | ||
830 | spin_lock(&vector_lock); | ||
831 | __setup_vector_irq(smp_processor_id()); | ||
832 | spin_unlock(&vector_lock); | ||
833 | } | ||
834 | |||
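The new setup_vector_irq() is a lock-taking wrapper around the now-static __setup_vector_irq(); note that as merged it passes smp_processor_id() rather than its cpu argument, so the parameter is effectively advisory. The wrapper pattern itself, in user-space form with a pthread mutex standing in for vector_lock:

#include <pthread.h>
#include <stdio.h>

#define NR_VECTORS 256

static pthread_mutex_t vector_lock = PTHREAD_MUTEX_INITIALIZER;
static int vector_irq[NR_VECTORS];

/* Caller must hold vector_lock, mirroring the kernel's convention. */
static void init_vectors_locked(int cpu)
{
    int v;

    for (v = 0; v < NR_VECTORS; v++)
        vector_irq[v] = -1;
    printf("vector table reset for cpu %d\n", cpu);
}

static void init_vectors(int cpu)
{
    pthread_mutex_lock(&vector_lock);
    init_vectors_locked(cpu);
    pthread_mutex_unlock(&vector_lock);
}

int main(void)
{
    init_vectors(0);
    return 0;
}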
814 | 835 | ||
815 | static struct irq_chip ioapic_chip; | 836 | static struct irq_chip ioapic_chip; |
816 | 837 | ||
@@ -846,7 +867,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, | |||
846 | apic_printk(APIC_VERBOSE,KERN_DEBUG | 867 | apic_printk(APIC_VERBOSE,KERN_DEBUG |
847 | "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " | 868 | "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " |
848 | "IRQ %d Mode:%i Active:%i)\n", | 869 | "IRQ %d Mode:%i Active:%i)\n", |
849 | apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, | 870 | apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, |
850 | irq, trigger, polarity); | 871 | irq, trigger, polarity); |
851 | 872 | ||
852 | /* | 873 | /* |
@@ -887,10 +908,10 @@ static void __init setup_IO_APIC_irqs(void) | |||
887 | idx = find_irq_entry(apic,pin,mp_INT); | 908 | idx = find_irq_entry(apic,pin,mp_INT); |
888 | if (idx == -1) { | 909 | if (idx == -1) { |
889 | if (first_notcon) { | 910 | if (first_notcon) { |
890 | apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); | 911 | apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); |
891 | first_notcon = 0; | 912 | first_notcon = 0; |
892 | } else | 913 | } else |
893 | apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); | 914 | apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); |
894 | continue; | 915 | continue; |
895 | } | 916 | } |
896 | if (!first_notcon) { | 917 | if (!first_notcon) { |
@@ -911,26 +932,21 @@ static void __init setup_IO_APIC_irqs(void) | |||
911 | } | 932 | } |
912 | 933 | ||
913 | /* | 934 | /* |
914 | * Set up the 8259A-master output pin as broadcast to all | 935 | * Set up the timer pin, possibly with the 8259A-master behind. |
915 | * CPUs. | ||
916 | */ | 936 | */ |
917 | static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) | 937 | static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, |
938 | int vector) | ||
918 | { | 939 | { |
919 | struct IO_APIC_route_entry entry; | 940 | struct IO_APIC_route_entry entry; |
920 | 941 | ||
921 | memset(&entry, 0, sizeof(entry)); | 942 | memset(&entry, 0, sizeof(entry)); |
922 | 943 | ||
923 | disable_8259A_irq(0); | ||
924 | |||
925 | /* mask LVT0 */ | ||
926 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
927 | |||
928 | /* | 944 | /* |
929 | * We use logical delivery to get the timer IRQ | 945 | * We use logical delivery to get the timer IRQ |
930 | * to the first CPU. | 946 | * to the first CPU. |
931 | */ | 947 | */ |
932 | entry.dest_mode = INT_DEST_MODE; | 948 | entry.dest_mode = INT_DEST_MODE; |
933 | entry.mask = 0; /* unmask IRQ now */ | 949 | entry.mask = 1; /* mask IRQ now */ |
934 | entry.dest = cpu_mask_to_apicid(TARGET_CPUS); | 950 | entry.dest = cpu_mask_to_apicid(TARGET_CPUS); |
935 | entry.delivery_mode = INT_DELIVERY_MODE; | 951 | entry.delivery_mode = INT_DELIVERY_MODE; |
936 | entry.polarity = 0; | 952 | entry.polarity = 0; |
@@ -939,7 +955,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in | |||
939 | 955 | ||
940 | /* | 956 | /* |
941 | * The timer IRQ doesn't have to know that behind the | 957 | * The timer IRQ doesn't have to know that behind the |
942 | * scene we have an 8259A-master in AEOI mode ... | 958 | * scene we may have an 8259A-master in AEOI mode ... |
943 | */ | 959 | */ |
944 | set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); | 960 | set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); |
945 | 961 | ||
@@ -947,8 +963,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in | |||
947 | * Add it to the IO-APIC irq-routing table: | 963 | * Add it to the IO-APIC irq-routing table: |
948 | */ | 964 | */ |
949 | ioapic_write_entry(apic, pin, entry); | 965 | ioapic_write_entry(apic, pin, entry); |
950 | |||
951 | enable_8259A_irq(0); | ||
952 | } | 966 | } |
953 | 967 | ||
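setup_timer_IRQ0_pin() above now writes the routing entry masked (entry.mask = 1) and leaves unmasking to check_timer(). A toy fill of such an entry, with a simplified bitfield layout standing in for struct IO_APIC_route_entry (the real layout differs; this only shows the fields being set):

#include <stdio.h>

struct route_entry {                 /* simplified, not the real layout */
    unsigned vector        : 8;
    unsigned delivery_mode : 3;
    unsigned dest_mode     : 1;
    unsigned polarity      : 1;
    unsigned trigger       : 1;
    unsigned mask          : 1;
};

int main(void)
{
    struct route_entry entry = {0};

    entry.dest_mode = 1;     /* logical delivery to the first CPU */
    entry.mask = 1;          /* keep masked until check_timer() decides */
    entry.polarity = 0;
    entry.trigger = 0;       /* edge */
    entry.vector = 0x31;

    printf("entry: vector=%#x mask=%u trigger=%u\n",
           entry.vector, entry.mask, entry.trigger);
    return 0;
}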
954 | void __apicdebuginit print_IO_APIC(void) | 968 | void __apicdebuginit print_IO_APIC(void) |
@@ -965,7 +979,7 @@ void __apicdebuginit print_IO_APIC(void) | |||
965 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | 979 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); |
966 | for (i = 0; i < nr_ioapics; i++) | 980 | for (i = 0; i < nr_ioapics; i++) |
967 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | 981 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", |
968 | mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | 982 | mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); |
969 | 983 | ||
970 | /* | 984 | /* |
971 | * We are a bit conservative about what we expect. We have to | 985 | * We are a bit conservative about what we expect. We have to |
@@ -983,7 +997,7 @@ void __apicdebuginit print_IO_APIC(void) | |||
983 | spin_unlock_irqrestore(&ioapic_lock, flags); | 997 | spin_unlock_irqrestore(&ioapic_lock, flags); |
984 | 998 | ||
985 | printk("\n"); | 999 | printk("\n"); |
986 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | 1000 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); |
987 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | 1001 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); |
988 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | 1002 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); |
989 | 1003 | ||
@@ -1077,6 +1091,7 @@ void __apicdebuginit print_local_APIC(void * dummy) | |||
1077 | 1091 | ||
1078 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | 1092 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", |
1079 | smp_processor_id(), hard_smp_processor_id()); | 1093 | smp_processor_id(), hard_smp_processor_id()); |
1094 | v = apic_read(APIC_ID); | ||
1080 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); | 1095 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); |
1081 | v = apic_read(APIC_LVR); | 1096 | v = apic_read(APIC_LVR); |
1082 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | 1097 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); |
@@ -1146,7 +1161,7 @@ void __apicdebuginit print_local_APIC(void * dummy) | |||
1146 | 1161 | ||
1147 | void print_all_local_APICs (void) | 1162 | void print_all_local_APICs (void) |
1148 | { | 1163 | { |
1149 | on_each_cpu(print_local_APIC, NULL, 1, 1); | 1164 | on_each_cpu(print_local_APIC, NULL, 1); |
1150 | } | 1165 | } |
1151 | 1166 | ||
1152 | void __apicdebuginit print_PIC(void) | 1167 | void __apicdebuginit print_PIC(void) |
@@ -1358,12 +1373,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) | |||
1358 | static int ioapic_retrigger_irq(unsigned int irq) | 1373 | static int ioapic_retrigger_irq(unsigned int irq) |
1359 | { | 1374 | { |
1360 | struct irq_cfg *cfg = &irq_cfg[irq]; | 1375 | struct irq_cfg *cfg = &irq_cfg[irq]; |
1361 | cpumask_t mask; | ||
1362 | unsigned long flags; | 1376 | unsigned long flags; |
1363 | 1377 | ||
1364 | spin_lock_irqsave(&vector_lock, flags); | 1378 | spin_lock_irqsave(&vector_lock, flags); |
1365 | mask = cpumask_of_cpu(first_cpu(cfg->domain)); | 1379 | send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); |
1366 | send_IPI_mask(mask, cfg->vector); | ||
1367 | spin_unlock_irqrestore(&vector_lock, flags); | 1380 | spin_unlock_irqrestore(&vector_lock, flags); |
1368 | 1381 | ||
1369 | return 1; | 1382 | return 1; |
@@ -1540,7 +1553,7 @@ static inline void init_IO_APIC_traps(void) | |||
1540 | } | 1553 | } |
1541 | } | 1554 | } |
1542 | 1555 | ||
1543 | static void enable_lapic_irq (unsigned int irq) | 1556 | static void unmask_lapic_irq(unsigned int irq) |
1544 | { | 1557 | { |
1545 | unsigned long v; | 1558 | unsigned long v; |
1546 | 1559 | ||
@@ -1548,7 +1561,7 @@ static void enable_lapic_irq (unsigned int irq) | |||
1548 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); | 1561 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); |
1549 | } | 1562 | } |
1550 | 1563 | ||
1551 | static void disable_lapic_irq (unsigned int irq) | 1564 | static void mask_lapic_irq(unsigned int irq) |
1552 | { | 1565 | { |
1553 | unsigned long v; | 1566 | unsigned long v; |
1554 | 1567 | ||
@@ -1561,19 +1574,20 @@ static void ack_lapic_irq (unsigned int irq) | |||
1561 | ack_APIC_irq(); | 1574 | ack_APIC_irq(); |
1562 | } | 1575 | } |
1563 | 1576 | ||
1564 | static void end_lapic_irq (unsigned int i) { /* nothing */ } | 1577 | static struct irq_chip lapic_chip __read_mostly = { |
1565 | 1578 | .name = "local-APIC", | |
1566 | static struct hw_interrupt_type lapic_irq_type __read_mostly = { | 1579 | .mask = mask_lapic_irq, |
1567 | .name = "local-APIC", | 1580 | .unmask = unmask_lapic_irq, |
1568 | .typename = "local-APIC-edge", | 1581 | .ack = ack_lapic_irq, |
1569 | .startup = NULL, /* startup_irq() not used for IRQ0 */ | ||
1570 | .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ | ||
1571 | .enable = enable_lapic_irq, | ||
1572 | .disable = disable_lapic_irq, | ||
1573 | .ack = ack_lapic_irq, | ||
1574 | .end = end_lapic_irq, | ||
1575 | }; | 1582 | }; |
1576 | 1583 | ||
1584 | static void lapic_register_intr(int irq) | ||
1585 | { | ||
1586 | irq_desc[irq].status &= ~IRQ_LEVEL; | ||
1587 | set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, | ||
1588 | "edge"); | ||
1589 | } | ||
1590 | |||
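The hunk above retires hw_interrupt_type, with its startup/shutdown/enable/disable/end slots, in favour of the leaner irq_chip carrying only mask/unmask/ack. The designated-initializer idiom that makes dropping obsolete slots free is plain C99; a minimal standalone illustration (hypothetical ops struct, not the kernel's):

#include <stdio.h>

struct ops {
    const char *name;
    void (*mask)(unsigned int);
    void (*unmask)(unsigned int);
    void (*ack)(unsigned int);
};

static void my_mask(unsigned int irq)   { printf("mask %u\n", irq); }
static void my_unmask(unsigned int irq) { printf("unmask %u\n", irq); }
static void my_ack(unsigned int irq)    { printf("ack %u\n", irq); }

/* Unnamed slots stay NULL, so removed callbacks need no stubs. */
static const struct ops lapic_like = {
    .name   = "local-APIC",
    .mask   = my_mask,
    .unmask = my_unmask,
    .ack    = my_ack,
};

int main(void)
{
    lapic_like.mask(0);
    lapic_like.ack(0);
    lapic_like.unmask(0);
    return 0;
}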
1577 | static void __init setup_nmi(void) | 1591 | static void __init setup_nmi(void) |
1578 | { | 1592 | { |
1579 | /* | 1593 | /* |
@@ -1659,6 +1673,7 @@ static inline void __init check_timer(void) | |||
1659 | struct irq_cfg *cfg = irq_cfg + 0; | 1673 | struct irq_cfg *cfg = irq_cfg + 0; |
1660 | int apic1, pin1, apic2, pin2; | 1674 | int apic1, pin1, apic2, pin2; |
1661 | unsigned long flags; | 1675 | unsigned long flags; |
1676 | int no_pin1 = 0; | ||
1662 | 1677 | ||
1663 | local_irq_save(flags); | 1678 | local_irq_save(flags); |
1664 | 1679 | ||
@@ -1669,34 +1684,48 @@ static inline void __init check_timer(void) | |||
1669 | assign_irq_vector(0, TARGET_CPUS); | 1684 | assign_irq_vector(0, TARGET_CPUS); |
1670 | 1685 | ||
1671 | /* | 1686 | /* |
1672 | * Subtle, code in do_timer_interrupt() expects an AEOI | 1687 | * As IRQ0 is to be enabled in the 8259A, the virtual |
1673 | * mode for the 8259A whenever interrupts are routed | 1688 | * wire has to be disabled in the local APIC. |
1674 | * through I/O APICs. Also IRQ0 has to be enabled in | ||
1675 | * the 8259A which implies the virtual wire has to be | ||
1676 | * disabled in the local APIC. | ||
1677 | */ | 1689 | */ |
1678 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | 1690 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); |
1679 | init_8259A(1); | 1691 | init_8259A(1); |
1680 | if (timer_over_8254 > 0) | ||
1681 | enable_8259A_irq(0); | ||
1682 | 1692 | ||
1683 | pin1 = find_isa_irq_pin(0, mp_INT); | 1693 | pin1 = find_isa_irq_pin(0, mp_INT); |
1684 | apic1 = find_isa_irq_apic(0, mp_INT); | 1694 | apic1 = find_isa_irq_apic(0, mp_INT); |
1685 | pin2 = ioapic_i8259.pin; | 1695 | pin2 = ioapic_i8259.pin; |
1686 | apic2 = ioapic_i8259.apic; | 1696 | apic2 = ioapic_i8259.apic; |
1687 | 1697 | ||
1688 | apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", | 1698 | apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " |
1689 | cfg->vector, apic1, pin1, apic2, pin2); | 1699 | "apic1=%d pin1=%d apic2=%d pin2=%d\n", |
1700 | cfg->vector, apic1, pin1, apic2, pin2); | ||
1701 | |||
1702 | /* | ||
1703 | * Some BIOS writers are clueless and report the ExtINTA | ||
1704 | * I/O APIC input from the cascaded 8259A as the timer | ||
1705 | * interrupt input. So just in case, if only one pin | ||
1706 | * was found above, try it both directly and through the | ||
1707 | * 8259A. | ||
1708 | */ | ||
1709 | if (pin1 == -1) { | ||
1710 | pin1 = pin2; | ||
1711 | apic1 = apic2; | ||
1712 | no_pin1 = 1; | ||
1713 | } else if (pin2 == -1) { | ||
1714 | pin2 = pin1; | ||
1715 | apic2 = apic1; | ||
1716 | } | ||
1690 | 1717 | ||
1691 | if (pin1 != -1) { | 1718 | if (pin1 != -1) { |
1692 | /* | 1719 | /* |
1693 | * Ok, does IRQ0 through the IOAPIC work? | 1720 | * Ok, does IRQ0 through the IOAPIC work? |
1694 | */ | 1721 | */ |
1722 | if (no_pin1) { | ||
1723 | add_pin_to_irq(0, apic1, pin1); | ||
1724 | setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); | ||
1725 | } | ||
1695 | unmask_IO_APIC_irq(0); | 1726 | unmask_IO_APIC_irq(0); |
1696 | if (!no_timer_check && timer_irq_works()) { | 1727 | if (!no_timer_check && timer_irq_works()) { |
1697 | nmi_watchdog_default(); | ||
1698 | if (nmi_watchdog == NMI_IO_APIC) { | 1728 | if (nmi_watchdog == NMI_IO_APIC) { |
1699 | disable_8259A_irq(0); | ||
1700 | setup_nmi(); | 1729 | setup_nmi(); |
1701 | enable_8259A_irq(0); | 1730 | enable_8259A_irq(0); |
1702 | } | 1731 | } |
@@ -1705,54 +1734,62 @@ static inline void __init check_timer(void) | |||
1705 | goto out; | 1734 | goto out; |
1706 | } | 1735 | } |
1707 | clear_IO_APIC_pin(apic1, pin1); | 1736 | clear_IO_APIC_pin(apic1, pin1); |
1708 | apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " | 1737 | if (!no_pin1) |
1709 | "connected to IO-APIC\n"); | 1738 | apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " |
1710 | } | 1739 | "8254 timer not connected to IO-APIC\n"); |
1711 | 1740 | ||
1712 | apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " | 1741 | apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " |
1713 | "through the 8259A ... "); | 1742 | "(IRQ0) through the 8259A ...\n"); |
1714 | if (pin2 != -1) { | 1743 | apic_printk(APIC_QUIET, KERN_INFO |
1715 | apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", | 1744 | "..... (found apic %d pin %d) ...\n", apic2, pin2); |
1716 | apic2, pin2); | ||
1717 | /* | 1745 | /* |
1718 | * legacy devices should be connected to IO APIC #0 | 1746 | * legacy devices should be connected to IO APIC #0 |
1719 | */ | 1747 | */ |
1720 | setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); | 1748 | replace_pin_at_irq(0, apic1, pin1, apic2, pin2); |
1749 | setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); | ||
1750 | unmask_IO_APIC_irq(0); | ||
1751 | enable_8259A_irq(0); | ||
1721 | if (timer_irq_works()) { | 1752 | if (timer_irq_works()) { |
1722 | apic_printk(APIC_VERBOSE," works.\n"); | 1753 | apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); |
1723 | nmi_watchdog_default(); | 1754 | timer_through_8259 = 1; |
1724 | if (nmi_watchdog == NMI_IO_APIC) { | 1755 | if (nmi_watchdog == NMI_IO_APIC) { |
1756 | disable_8259A_irq(0); | ||
1725 | setup_nmi(); | 1757 | setup_nmi(); |
1758 | enable_8259A_irq(0); | ||
1726 | } | 1759 | } |
1727 | goto out; | 1760 | goto out; |
1728 | } | 1761 | } |
1729 | /* | 1762 | /* |
1730 | * Cleanup, just in case ... | 1763 | * Cleanup, just in case ... |
1731 | */ | 1764 | */ |
1765 | disable_8259A_irq(0); | ||
1732 | clear_IO_APIC_pin(apic2, pin2); | 1766 | clear_IO_APIC_pin(apic2, pin2); |
1767 | apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); | ||
1733 | } | 1768 | } |
1734 | apic_printk(APIC_VERBOSE," failed.\n"); | ||
1735 | 1769 | ||
1736 | if (nmi_watchdog == NMI_IO_APIC) { | 1770 | if (nmi_watchdog == NMI_IO_APIC) { |
1737 | printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | 1771 | apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " |
1738 | nmi_watchdog = 0; | 1772 | "through the IO-APIC - disabling NMI Watchdog!\n"); |
1773 | nmi_watchdog = NMI_NONE; | ||
1739 | } | 1774 | } |
1740 | 1775 | ||
1741 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | 1776 | apic_printk(APIC_QUIET, KERN_INFO |
1777 | "...trying to set up timer as Virtual Wire IRQ...\n"); | ||
1742 | 1778 | ||
1743 | disable_8259A_irq(0); | 1779 | lapic_register_intr(0); |
1744 | irq_desc[0].chip = &lapic_irq_type; | ||
1745 | apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ | 1780 | apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ |
1746 | enable_8259A_irq(0); | 1781 | enable_8259A_irq(0); |
1747 | 1782 | ||
1748 | if (timer_irq_works()) { | 1783 | if (timer_irq_works()) { |
1749 | apic_printk(APIC_VERBOSE," works.\n"); | 1784 | apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); |
1750 | goto out; | 1785 | goto out; |
1751 | } | 1786 | } |
1787 | disable_8259A_irq(0); | ||
1752 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); | 1788 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); |
1753 | apic_printk(APIC_VERBOSE," failed.\n"); | 1789 | apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); |
1754 | 1790 | ||
1755 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | 1791 | apic_printk(APIC_QUIET, KERN_INFO |
1792 | "...trying to set up timer as ExtINT IRQ...\n"); | ||
1756 | 1793 | ||
1757 | init_8259A(0); | 1794 | init_8259A(0); |
1758 | make_8259A_irq(0); | 1795 | make_8259A_irq(0); |
@@ -1761,11 +1798,12 @@ static inline void __init check_timer(void) | |||
1761 | unlock_ExtINT_logic(); | 1798 | unlock_ExtINT_logic(); |
1762 | 1799 | ||
1763 | if (timer_irq_works()) { | 1800 | if (timer_irq_works()) { |
1764 | apic_printk(APIC_VERBOSE," works.\n"); | 1801 | apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); |
1765 | goto out; | 1802 | goto out; |
1766 | } | 1803 | } |
1767 | apic_printk(APIC_VERBOSE," failed :(.\n"); | 1804 | apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); |
1768 | panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); | 1805 | panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " |
1806 | "report. Then try booting with the 'noapic' option.\n"); | ||
1769 | out: | 1807 | out: |
1770 | local_irq_restore(flags); | 1808 | local_irq_restore(flags); |
1771 | } | 1809 | } |
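The hunk above turns check_timer() into a four-stage fallback: IRQ0 through the I/O APIC, then through the 8259A, then as a local APIC virtual-wire interrupt, then as a raw ExtINT, panicking only when all four fail. A minimal userspace model of that ladder, with probe() standing in for timer_irq_works() and made-up pin values rather than data from a real MP table:

#include <stdio.h>

static int probe(const char *route)
{
        printf("trying timer %s ... failed.\n", route);
        return 0;                       /* pretend every route fails */
}

int main(void)
{
        int pin1 = -1, pin2 = 0, no_pin1 = 0;

        /* BIOS quirk handling from the hunk: if only the cascade pin
         * was reported, try that one pin both ways. */
        if (pin1 == -1) {
                pin1 = pin2;
                no_pin1 = 1;
        } else if (pin2 == -1) {
                pin2 = pin1;
        }

        if (no_pin1)
                puts("(only the 8259A cascade pin was reported)");

        if (pin1 != -1 && probe("through the I/O APIC (pin1)"))
                return 0;
        if (pin2 != -1 && probe("through the 8259A (pin2)"))
                return 0;
        if (probe("as local APIC virtual wire"))
                return 0;
        if (probe("as ExtINT via the 8259A"))
                return 0;
        puts("panic: IO-APIC + timer doesn't work!");
        return 1;
}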
@@ -1778,11 +1816,21 @@ static int __init notimercheck(char *s) | |||
1778 | __setup("no_timer_check", notimercheck); | 1816 | __setup("no_timer_check", notimercheck); |
1779 | 1817 | ||
1780 | /* | 1818 | /* |
1781 | * | 1819 | * Traditionally ISA IRQ2 is the cascade IRQ, and is not available |
1782 | * IRQs that are handled by the PIC in the MPS IOAPIC case. | 1820 | * to devices. However there may be an I/O APIC pin available for |
1783 | * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. | 1821 | * this interrupt regardless. The pin may be left unconnected, but |
1784 | * Linux doesn't really care, as it's not actually used | 1822 | * typically it will be reused as an ExtINT cascade interrupt for |
1785 | * for any interrupt handling anyway. | 1823 | * the master 8259A. In the MPS case such a pin will normally be |
1824 | * reported as an ExtINT interrupt in the MP table. With ACPI | ||
1825 | * there is no provision for ExtINT interrupts, and in the absence | ||
1826 | * of an override it would be treated as an ordinary ISA I/O APIC | ||
1827 | * interrupt, that is edge-triggered and unmasked by default. We | ||
1828 | * used to do this, but it caused problems on some systems because | ||
1829 | * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using | ||
1830 | * the same ExtINT cascade interrupt to drive the local APIC of the | ||
1831 | * bootstrap processor. Therefore we refrain from routing IRQ2 to | ||
1832 | * the I/O APIC in all cases now. No actual device should request | ||
1833 | * it anyway. --macro | ||
1786 | */ | 1834 | */ |
1787 | #define PIC_IRQS (1<<2) | 1835 | #define PIC_IRQS (1<<2) |
1788 | 1836 | ||
@@ -1793,10 +1841,7 @@ void __init setup_IO_APIC(void) | |||
1793 | * calling enable_IO_APIC() is moved to setup_local_APIC for BP | 1841 | * calling enable_IO_APIC() is moved to setup_local_APIC for BP |
1794 | */ | 1842 | */ |
1795 | 1843 | ||
1796 | if (acpi_ioapic) | 1844 | io_apic_irqs = ~PIC_IRQS; |
1797 | io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | ||
1798 | else | ||
1799 | io_apic_irqs = ~PIC_IRQS; | ||
1800 | 1845 | ||
1801 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); | 1846 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); |
1802 | 1847 | ||
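Following the rationale in the long IRQ2 comment above, setup_IO_APIC() now masks the cascade out unconditionally, so the acpi_ioapic special case disappears. The mask arithmetic in a standalone check (the values are the ones from the hunk, not assumptions):

#include <assert.h>
#include <stdio.h>

#define PIC_IRQS (1 << 2)

int main(void)
{
        unsigned long io_apic_irqs = ~PIC_IRQS;

        assert(!(io_apic_irqs & (1UL << 2))); /* IRQ2 stays on the PIC */
        assert(io_apic_irqs & (1UL << 0));    /* IRQ0 goes through the I/O APIC */
        printf("io_apic_irqs = %#lx\n", io_apic_irqs);
        return 0;
}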
@@ -1841,8 +1886,8 @@ static int ioapic_resume(struct sys_device *dev) | |||
1841 | 1886 | ||
1842 | spin_lock_irqsave(&ioapic_lock, flags); | 1887 | spin_lock_irqsave(&ioapic_lock, flags); |
1843 | reg_00.raw = io_apic_read(dev->id, 0); | 1888 | reg_00.raw = io_apic_read(dev->id, 0); |
1844 | if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | 1889 | if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { |
1845 | reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | 1890 | reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; |
1846 | io_apic_write(dev->id, 0, reg_00.raw); | 1891 | io_apic_write(dev->id, 0, reg_00.raw); |
1847 | } | 1892 | } |
1848 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1893 | spin_unlock_irqrestore(&ioapic_lock, flags); |
@@ -2242,8 +2287,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | |||
2242 | return -1; | 2287 | return -1; |
2243 | 2288 | ||
2244 | for (i = 0; i < mp_irq_entries; i++) | 2289 | for (i = 0; i < mp_irq_entries; i++) |
2245 | if (mp_irqs[i].mpc_irqtype == mp_INT && | 2290 | if (mp_irqs[i].mp_irqtype == mp_INT && |
2246 | mp_irqs[i].mpc_srcbusirq == bus_irq) | 2291 | mp_irqs[i].mp_srcbusirq == bus_irq) |
2247 | break; | 2292 | break; |
2248 | if (i >= mp_irq_entries) | 2293 | if (i >= mp_irq_entries) |
2249 | return -1; | 2294 | return -1; |
@@ -2336,7 +2381,7 @@ void __init ioapic_init_mappings(void) | |||
2336 | ioapic_res = ioapic_setup_resources(); | 2381 | ioapic_res = ioapic_setup_resources(); |
2337 | for (i = 0; i < nr_ioapics; i++) { | 2382 | for (i = 0; i < nr_ioapics; i++) { |
2338 | if (smp_found_config) { | 2383 | if (smp_found_config) { |
2339 | ioapic_phys = mp_ioapics[i].mpc_apicaddr; | 2384 | ioapic_phys = mp_ioapics[i].mp_apicaddr; |
2340 | } else { | 2385 | } else { |
2341 | ioapic_phys = (unsigned long) | 2386 | ioapic_phys = (unsigned long) |
2342 | alloc_bootmem_pages(PAGE_SIZE); | 2387 | alloc_bootmem_pages(PAGE_SIZE); |
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 5921e5f0a640..1c3a66a67f83 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c | |||
@@ -103,6 +103,9 @@ void __init io_delay_init(void) | |||
103 | 103 | ||
104 | static int __init io_delay_param(char *s) | 104 | static int __init io_delay_param(char *s) |
105 | { | 105 | { |
106 | if (!s) | ||
107 | return -EINVAL; | ||
108 | |||
106 | if (!strcmp(s, "0x80")) | 109 | if (!strcmp(s, "0x80")) |
107 | io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; | 110 | io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; |
108 | else if (!strcmp(s, "0xed")) | 111 | else if (!strcmp(s, "0xed")) |
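The io_delay.c hunk guards against a NULL argument: "io_delay" given on the command line without an "=value" reaches the handler with s == NULL, and the old code would have fed that straight to strcmp(). A userspace sketch of the same defensive pattern (the numeric type values are placeholders for the CONFIG_IO_DELAY_TYPE_* constants, and further types are elided):

#include <errno.h>
#include <string.h>

static int io_delay_type;

static int io_delay_param(const char *s)
{
        if (!s)
                return -EINVAL;         /* option given with no value */
        if (!strcmp(s, "0x80"))
                io_delay_type = 0;      /* CONFIG_IO_DELAY_TYPE_0X80 */
        else if (!strcmp(s, "0xed"))
                io_delay_type = 1;      /* CONFIG_IO_DELAY_TYPE_0XED */
        else
                return -EINVAL;
        return 0;
}

int main(void)
{
        return io_delay_param("0xed") || io_delay_param(NULL) != -EINVAL;
}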
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index c0df7b89ca23..3f7537b669d3 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/kernel_stat.h> | 8 | #include <linux/kernel_stat.h> |
9 | #include <linux/mc146818rtc.h> | 9 | #include <linux/mc146818rtc.h> |
10 | #include <linux/cache.h> | 10 | #include <linux/cache.h> |
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/cpu.h> | 11 | #include <linux/cpu.h> |
13 | #include <linux/module.h> | 12 | #include <linux/module.h> |
14 | 13 | ||
@@ -71,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) | |||
71 | /* | 70 | /* |
72 | * Send the IPI. The write to APIC_ICR fires this off. | 71 | * Send the IPI. The write to APIC_ICR fires this off. |
73 | */ | 72 | */ |
74 | apic_write_around(APIC_ICR, cfg); | 73 | apic_write(APIC_ICR, cfg); |
75 | } | 74 | } |
76 | 75 | ||
77 | void send_IPI_self(int vector) | 76 | void send_IPI_self(int vector) |
@@ -99,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) | |||
99 | * prepare target chip field | 98 | * prepare target chip field |
100 | */ | 99 | */ |
101 | cfg = __prepare_ICR2(mask); | 100 | cfg = __prepare_ICR2(mask); |
102 | apic_write_around(APIC_ICR2, cfg); | 101 | apic_write(APIC_ICR2, cfg); |
103 | 102 | ||
104 | /* | 103 | /* |
105 | * program the ICR | 104 | * program the ICR |
@@ -109,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) | |||
109 | /* | 108 | /* |
110 | * Send the IPI. The write to APIC_ICR fires this off. | 109 | * Send the IPI. The write to APIC_ICR fires this off. |
111 | */ | 110 | */ |
112 | apic_write_around(APIC_ICR, cfg); | 111 | apic_write(APIC_ICR, cfg); |
113 | } | 112 | } |
114 | 113 | ||
115 | /* | 114 | /* |
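The ipi.c changes drop apic_write_around() in favour of plain apic_write(); the *_around() wrapper was, as far as I can tell, a 32-bit workaround for errata on early Pentium APICs and is no longer needed once the write paths are unified. What the code relies on is the ordering: ICR2 carries the destination field and must be programmed before ICR, because the ICR write is what fires the IPI. A stubbed sketch (the register offsets are the architectural ones; apic_write() here is only a printf stand-in for the kernel's MMIO write):

#include <stdio.h>

#define APIC_ICR  0x300
#define APIC_ICR2 0x310

/* stand-in for the kernel's MMIO write to the local APIC */
static void apic_write(unsigned int reg, unsigned int val)
{
        printf("apic_write(%#x, %#x)\n", reg, val);
}

static void send_ipi_dest_field(unsigned int mask, unsigned int vector)
{
        apic_write(APIC_ICR2, mask << 24);  /* destination field first */
        apic_write(APIC_ICR, vector);       /* this write fires the IPI */
}

int main(void)
{
        send_ipi_dest_field(0x01, 0xfd);    /* hypothetical mask and vector */
        return 0;
}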
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 147352df28b9..1cf8c1fcc088 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq) | |||
48 | #endif | 48 | #endif |
49 | } | 49 | } |
50 | 50 | ||
51 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
52 | /* Debugging check for stack overflow: is there less than 1KB free? */ | ||
53 | static int check_stack_overflow(void) | ||
54 | { | ||
55 | long sp; | ||
56 | |||
57 | __asm__ __volatile__("andl %%esp,%0" : | ||
58 | "=r" (sp) : "0" (THREAD_SIZE - 1)); | ||
59 | |||
60 | return sp < (sizeof(struct thread_info) + STACK_WARN); | ||
61 | } | ||
62 | |||
63 | static void print_stack_overflow(void) | ||
64 | { | ||
65 | printk(KERN_WARNING "low stack detected by irq handler\n"); | ||
66 | dump_stack(); | ||
67 | } | ||
68 | |||
69 | #else | ||
70 | static inline int check_stack_overflow(void) { return 0; } | ||
71 | static inline void print_stack_overflow(void) { } | ||
72 | #endif | ||
73 | |||
51 | #ifdef CONFIG_4KSTACKS | 74 | #ifdef CONFIG_4KSTACKS |
52 | /* | 75 | /* |
53 | * per-CPU IRQ handling contexts (thread information and stack) | 76 | * per-CPU IRQ handling contexts (thread information and stack) |
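The new check_stack_overflow() relies on the 32-bit thread stack being THREAD_SIZE-aligned with struct thread_info at its base: masking ESP with THREAD_SIZE - 1 yields the number of bytes still free below the stack pointer. A standalone model (the sizes are typical 32-bit values, assumed here rather than taken from the tree):

#include <stdio.h>

#define THREAD_SIZE      8192
#define STACK_WARN       1024   /* warn when less than 1KB remains */
#define THREAD_INFO_SIZE 56     /* assumed size of struct thread_info */

static int check_stack_overflow(unsigned long sp)
{
        return (sp & (THREAD_SIZE - 1)) < (THREAD_INFO_SIZE + STACK_WARN);
}

int main(void)
{
        printf("%d\n", check_stack_overflow(0xc0010f00)); /* plenty of room */
        printf("%d\n", check_stack_overflow(0xc0010100)); /* nearly full */
        return 0;
}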
@@ -59,48 +82,26 @@ union irq_ctx { | |||
59 | 82 | ||
60 | static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; | 83 | static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; |
61 | static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; | 84 | static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; |
62 | #endif | ||
63 | |||
64 | /* | ||
65 | * do_IRQ handles all normal device IRQ's (the special | ||
66 | * SMP cross-CPU interrupts have their own specific | ||
67 | * handlers). | ||
68 | */ | ||
69 | unsigned int do_IRQ(struct pt_regs *regs) | ||
70 | { | ||
71 | struct pt_regs *old_regs; | ||
72 | /* high bit used in ret_from_ code */ | ||
73 | int irq = ~regs->orig_ax; | ||
74 | struct irq_desc *desc = irq_desc + irq; | ||
75 | #ifdef CONFIG_4KSTACKS | ||
76 | union irq_ctx *curctx, *irqctx; | ||
77 | u32 *isp; | ||
78 | #endif | ||
79 | 85 | ||
80 | if (unlikely((unsigned)irq >= NR_IRQS)) { | 86 | static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; |
81 | printk(KERN_EMERG "%s: cannot handle IRQ %d\n", | 87 | static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; |
82 | __func__, irq); | ||
83 | BUG(); | ||
84 | } | ||
85 | 88 | ||
86 | old_regs = set_irq_regs(regs); | 89 | static void call_on_stack(void *func, void *stack) |
87 | irq_enter(); | 90 | { |
88 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | 91 | asm volatile("xchgl %%ebx,%%esp \n" |
89 | /* Debugging check for stack overflow: is there less than 1KB free? */ | 92 | "call *%%edi \n" |
90 | { | 93 | "movl %%ebx,%%esp \n" |
91 | long sp; | 94 | : "=b" (stack) |
92 | 95 | : "0" (stack), | |
93 | __asm__ __volatile__("andl %%esp,%0" : | 96 | "D"(func) |
94 | "=r" (sp) : "0" (THREAD_SIZE - 1)); | 97 | : "memory", "cc", "edx", "ecx", "eax"); |
95 | if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { | 98 | } |
96 | printk("do_IRQ: stack overflow: %ld\n", | ||
97 | sp - sizeof(struct thread_info)); | ||
98 | dump_stack(); | ||
99 | } | ||
100 | } | ||
101 | #endif | ||
102 | 99 | ||
103 | #ifdef CONFIG_4KSTACKS | 100 | static inline int |
101 | execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | ||
102 | { | ||
103 | union irq_ctx *curctx, *irqctx; | ||
104 | u32 *isp, arg1, arg2; | ||
104 | 105 | ||
105 | curctx = (union irq_ctx *) current_thread_info(); | 106 | curctx = (union irq_ctx *) current_thread_info(); |
106 | irqctx = hardirq_ctx[smp_processor_id()]; | 107 | irqctx = hardirq_ctx[smp_processor_id()]; |
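call_on_stack() above is the factored-out stack-switch helper. An annotated copy, only to spell out the register roles (this is x86-32 inline asm, not portable code):

static void call_on_stack(void *func, void *stack)
{
        asm volatile("xchgl %%ebx,%%esp \n" /* switch ESP to the new stack */
                     "call *%%edi      \n"  /* run func on that stack      */
                     "movl %%ebx,%%esp \n"  /* restore the original ESP    */
                     : "=b" (stack)         /* EBX: saved ESP on exit      */
                     : "0" (stack),         /* EBX: new stack on entry     */
                       "D" (func)           /* EDI: function to call       */
                     : "memory", "cc", "edx", "ecx", "eax");
}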
@@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs) | |||
111 | * handler) we can't do that and just have to keep using the | 112 | * handler) we can't do that and just have to keep using the |
112 | * current stack (which is the irq stack already after all) | 113 | * current stack (which is the irq stack already after all) |
113 | */ | 114 | */ |
114 | if (curctx != irqctx) { | 115 | if (unlikely(curctx == irqctx)) |
115 | int arg1, arg2, bx; | 116 | return 0; |
116 | |||
117 | /* build the stack frame on the IRQ stack */ | ||
118 | isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); | ||
119 | irqctx->tinfo.task = curctx->tinfo.task; | ||
120 | irqctx->tinfo.previous_esp = current_stack_pointer; | ||
121 | 117 | ||
122 | /* | 118 | /* build the stack frame on the IRQ stack */ |
123 | * Copy the softirq bits in preempt_count so that the | 119 | isp = (u32 *) ((char*)irqctx + sizeof(*irqctx)); |
124 | * softirq checks work in the hardirq context. | 120 | irqctx->tinfo.task = curctx->tinfo.task; |
125 | */ | 121 | irqctx->tinfo.previous_esp = current_stack_pointer; |
126 | irqctx->tinfo.preempt_count = | ||
127 | (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | | ||
128 | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); | ||
129 | |||
130 | asm volatile( | ||
131 | " xchgl %%ebx,%%esp \n" | ||
132 | " call *%%edi \n" | ||
133 | " movl %%ebx,%%esp \n" | ||
134 | : "=a" (arg1), "=d" (arg2), "=b" (bx) | ||
135 | : "0" (irq), "1" (desc), "2" (isp), | ||
136 | "D" (desc->handle_irq) | ||
137 | : "memory", "cc", "ecx" | ||
138 | ); | ||
139 | } else | ||
140 | #endif | ||
141 | desc->handle_irq(irq, desc); | ||
142 | 122 | ||
143 | irq_exit(); | 123 | /* |
144 | set_irq_regs(old_regs); | 124 | * Copy the softirq bits in preempt_count so that the |
125 | * softirq checks work in the hardirq context. | ||
126 | */ | ||
127 | irqctx->tinfo.preempt_count = | ||
128 | (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | | ||
129 | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); | ||
130 | |||
131 | if (unlikely(overflow)) | ||
132 | call_on_stack(print_stack_overflow, isp); | ||
133 | |||
134 | asm volatile("xchgl %%ebx,%%esp \n" | ||
135 | "call *%%edi \n" | ||
136 | "movl %%ebx,%%esp \n" | ||
137 | : "=a" (arg1), "=d" (arg2), "=b" (isp) | ||
138 | : "0" (irq), "1" (desc), "2" (isp), | ||
139 | "D" (desc->handle_irq) | ||
140 | : "memory", "cc", "ecx"); | ||
145 | return 1; | 141 | return 1; |
146 | } | 142 | } |
147 | 143 | ||
148 | #ifdef CONFIG_4KSTACKS | ||
149 | |||
150 | static char softirq_stack[NR_CPUS * THREAD_SIZE] | ||
151 | __attribute__((__section__(".bss.page_aligned"))); | ||
152 | |||
153 | static char hardirq_stack[NR_CPUS * THREAD_SIZE] | ||
154 | __attribute__((__section__(".bss.page_aligned"))); | ||
155 | |||
156 | /* | 144 | /* |
157 | * allocate per-cpu stacks for hardirq and for softirq processing | 145 | * allocate per-cpu stacks for hardirq and for softirq processing |
158 | */ | 146 | */ |
159 | void irq_ctx_init(int cpu) | 147 | void __cpuinit irq_ctx_init(int cpu) |
160 | { | 148 | { |
161 | union irq_ctx *irqctx; | 149 | union irq_ctx *irqctx; |
162 | 150 | ||
@@ -164,25 +152,25 @@ void irq_ctx_init(int cpu) | |||
164 | return; | 152 | return; |
165 | 153 | ||
166 | irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; | 154 | irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; |
167 | irqctx->tinfo.task = NULL; | 155 | irqctx->tinfo.task = NULL; |
168 | irqctx->tinfo.exec_domain = NULL; | 156 | irqctx->tinfo.exec_domain = NULL; |
169 | irqctx->tinfo.cpu = cpu; | 157 | irqctx->tinfo.cpu = cpu; |
170 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; | 158 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; |
171 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 159 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
172 | 160 | ||
173 | hardirq_ctx[cpu] = irqctx; | 161 | hardirq_ctx[cpu] = irqctx; |
174 | 162 | ||
175 | irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; | 163 | irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; |
176 | irqctx->tinfo.task = NULL; | 164 | irqctx->tinfo.task = NULL; |
177 | irqctx->tinfo.exec_domain = NULL; | 165 | irqctx->tinfo.exec_domain = NULL; |
178 | irqctx->tinfo.cpu = cpu; | 166 | irqctx->tinfo.cpu = cpu; |
179 | irqctx->tinfo.preempt_count = 0; | 167 | irqctx->tinfo.preempt_count = 0; |
180 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 168 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
181 | 169 | ||
182 | softirq_ctx[cpu] = irqctx; | 170 | softirq_ctx[cpu] = irqctx; |
183 | 171 | ||
184 | printk("CPU %u irqstacks, hard=%p soft=%p\n", | 172 | printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", |
185 | cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); | 173 | cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); |
186 | } | 174 | } |
187 | 175 | ||
188 | void irq_ctx_exit(int cpu) | 176 | void irq_ctx_exit(int cpu) |
@@ -211,25 +199,56 @@ asmlinkage void do_softirq(void) | |||
211 | /* build the stack frame on the softirq stack */ | 199 | /* build the stack frame on the softirq stack */ |
212 | isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); | 200 | isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); |
213 | 201 | ||
214 | asm volatile( | 202 | call_on_stack(__do_softirq, isp); |
215 | " xchgl %%ebx,%%esp \n" | ||
216 | " call __do_softirq \n" | ||
217 | " movl %%ebx,%%esp \n" | ||
218 | : "=b"(isp) | ||
219 | : "0"(isp) | ||
220 | : "memory", "cc", "edx", "ecx", "eax" | ||
221 | ); | ||
222 | /* | 203 | /* |
223 | * Shouldn't happen, we returned above if in_interrupt(): | 204 |
224 | */ | 205 | */ |
225 | WARN_ON_ONCE(softirq_count()); | 206 | WARN_ON_ONCE(softirq_count()); |
226 | } | 207 | } |
227 | 208 | ||
228 | local_irq_restore(flags); | 209 | local_irq_restore(flags); |
229 | } | 210 | } |
211 | |||
212 | #else | ||
213 | static inline int | ||
214 | execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } | ||
230 | #endif | 215 | #endif |
231 | 216 | ||
232 | /* | 217 | /* |
218 | * do_IRQ handles all normal device IRQ's (the special | ||
219 | * SMP cross-CPU interrupts have their own specific | ||
220 | * handlers). | ||
221 | */ | ||
222 | unsigned int do_IRQ(struct pt_regs *regs) | ||
223 | { | ||
224 | struct pt_regs *old_regs; | ||
225 | /* high bit used in ret_from_ code */ | ||
226 | int overflow, irq = ~regs->orig_ax; | ||
227 | struct irq_desc *desc = irq_desc + irq; | ||
228 | |||
229 | if (unlikely((unsigned)irq >= NR_IRQS)) { | ||
230 | printk(KERN_EMERG "%s: cannot handle IRQ %d\n", | ||
231 | __func__, irq); | ||
232 | BUG(); | ||
233 | } | ||
234 | |||
235 | old_regs = set_irq_regs(regs); | ||
236 | irq_enter(); | ||
237 | |||
238 | overflow = check_stack_overflow(); | ||
239 | |||
240 | if (!execute_on_irq_stack(overflow, desc, irq)) { | ||
241 | if (unlikely(overflow)) | ||
242 | print_stack_overflow(); | ||
243 | desc->handle_irq(irq, desc); | ||
244 | } | ||
245 | |||
246 | irq_exit(); | ||
247 | set_irq_regs(old_regs); | ||
248 | return 1; | ||
249 | } | ||
250 | |||
251 | /* | ||
233 | * Interrupt statistics: | 252 | * Interrupt statistics: |
234 | */ | 253 | */ |
235 | 254 | ||
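The reworked do_IRQ() reads more simply once the helpers exist: detect a low stack once, then run the handler on the per-CPU IRQ stack if possible, or in place otherwise, warning in whichever context actually runs it. A plain-C model of that control flow (all helpers are stand-ins, not kernel APIs):

#include <stdio.h>

static int check_stack_overflow(void) { return 1; } /* pretend the stack is low */
static void print_stack_overflow(void) { puts("low stack detected"); }
static void handle_irq(int irq) { printf("handling IRQ %d\n", irq); }

/* Returns 0 when we are already on the IRQ stack; the caller then
 * runs the handler on the current stack instead. */
static int execute_on_irq_stack(int overflow, int irq)
{
        static int on_irq_stack;        /* crude stand-in for curctx == irqctx */

        if (on_irq_stack)
                return 0;
        on_irq_stack = 1;
        if (overflow)
                print_stack_overflow();
        handle_irq(irq);
        on_irq_stack = 0;
        return 1;
}

static void do_irq(int irq)
{
        int overflow = check_stack_overflow();

        if (!execute_on_irq_stack(overflow, irq)) {
                if (overflow)
                        print_stack_overflow();
                handle_irq(irq);
        }
}

int main(void)
{
        do_irq(7);
        return 0;
}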
@@ -313,16 +332,20 @@ skip: | |||
313 | per_cpu(irq_stat,j).irq_tlb_count); | 332 | per_cpu(irq_stat,j).irq_tlb_count); |
314 | seq_printf(p, " TLB shootdowns\n"); | 333 | seq_printf(p, " TLB shootdowns\n"); |
315 | #endif | 334 | #endif |
335 | #ifdef CONFIG_X86_MCE | ||
316 | seq_printf(p, "TRM: "); | 336 | seq_printf(p, "TRM: "); |
317 | for_each_online_cpu(j) | 337 | for_each_online_cpu(j) |
318 | seq_printf(p, "%10u ", | 338 | seq_printf(p, "%10u ", |
319 | per_cpu(irq_stat,j).irq_thermal_count); | 339 | per_cpu(irq_stat,j).irq_thermal_count); |
320 | seq_printf(p, " Thermal event interrupts\n"); | 340 | seq_printf(p, " Thermal event interrupts\n"); |
341 | #endif | ||
342 | #ifdef CONFIG_X86_LOCAL_APIC | ||
321 | seq_printf(p, "SPU: "); | 343 | seq_printf(p, "SPU: "); |
322 | for_each_online_cpu(j) | 344 | for_each_online_cpu(j) |
323 | seq_printf(p, "%10u ", | 345 | seq_printf(p, "%10u ", |
324 | per_cpu(irq_stat,j).irq_spurious_count); | 346 | per_cpu(irq_stat,j).irq_spurious_count); |
325 | seq_printf(p, " Spurious interrupts\n"); | 347 | seq_printf(p, " Spurious interrupts\n"); |
348 | #endif | ||
326 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | 349 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); |
327 | #if defined(CONFIG_X86_IO_APIC) | 350 | #if defined(CONFIG_X86_IO_APIC) |
328 | seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | 351 | seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); |
@@ -331,6 +354,40 @@ skip: | |||
331 | return 0; | 354 | return 0; |
332 | } | 355 | } |
333 | 356 | ||
357 | /* | ||
358 | * /proc/stat helpers | ||
359 | */ | ||
360 | u64 arch_irq_stat_cpu(unsigned int cpu) | ||
361 | { | ||
362 | u64 sum = nmi_count(cpu); | ||
363 | |||
364 | #ifdef CONFIG_X86_LOCAL_APIC | ||
365 | sum += per_cpu(irq_stat, cpu).apic_timer_irqs; | ||
366 | #endif | ||
367 | #ifdef CONFIG_SMP | ||
368 | sum += per_cpu(irq_stat, cpu).irq_resched_count; | ||
369 | sum += per_cpu(irq_stat, cpu).irq_call_count; | ||
370 | sum += per_cpu(irq_stat, cpu).irq_tlb_count; | ||
371 | #endif | ||
372 | #ifdef CONFIG_X86_MCE | ||
373 | sum += per_cpu(irq_stat, cpu).irq_thermal_count; | ||
374 | #endif | ||
375 | #ifdef CONFIG_X86_LOCAL_APIC | ||
376 | sum += per_cpu(irq_stat, cpu).irq_spurious_count; | ||
377 | #endif | ||
378 | return sum; | ||
379 | } | ||
380 | |||
381 | u64 arch_irq_stat(void) | ||
382 | { | ||
383 | u64 sum = atomic_read(&irq_err_count); | ||
384 | |||
385 | #ifdef CONFIG_X86_IO_APIC | ||
386 | sum += atomic_read(&irq_mis_count); | ||
387 | #endif | ||
388 | return sum; | ||
389 | } | ||
390 | |||
334 | #ifdef CONFIG_HOTPLUG_CPU | 391 | #ifdef CONFIG_HOTPLUG_CPU |
335 | #include <mach_apic.h> | 392 | #include <mach_apic.h> |
336 | 393 | ||
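The two new helpers feed the aggregate interrupt counts that the core kernel prints in the "intr" line of /proc/stat: arch_irq_stat_cpu() contributes the per-CPU architectural counters, arch_irq_stat() the global error counts. A quick userspace check of the result:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char tag[16];
        unsigned long long total;
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
                return 1;
        while (fscanf(f, "%15s %llu%*[^\n]", tag, &total) == 2) {
                if (!strcmp(tag, "intr")) {
                        printf("total interrupts: %llu\n", total);
                        break;
                }
        }
        fclose(f);
        return 0;
}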
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 3aac15466a91..1f78b238d8d2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c | |||
@@ -135,6 +135,7 @@ skip: | |||
135 | seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); | 135 | seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); |
136 | seq_printf(p, " TLB shootdowns\n"); | 136 | seq_printf(p, " TLB shootdowns\n"); |
137 | #endif | 137 | #endif |
138 | #ifdef CONFIG_X86_MCE | ||
138 | seq_printf(p, "TRM: "); | 139 | seq_printf(p, "TRM: "); |
139 | for_each_online_cpu(j) | 140 | for_each_online_cpu(j) |
140 | seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); | 141 | seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); |
@@ -143,6 +144,7 @@ skip: | |||
143 | for_each_online_cpu(j) | 144 | for_each_online_cpu(j) |
144 | seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); | 145 | seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); |
145 | seq_printf(p, " Threshold APIC interrupts\n"); | 146 | seq_printf(p, " Threshold APIC interrupts\n"); |
147 | #endif | ||
146 | seq_printf(p, "SPU: "); | 148 | seq_printf(p, "SPU: "); |
147 | for_each_online_cpu(j) | 149 | for_each_online_cpu(j) |
148 | seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); | 150 | seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); |
@@ -153,6 +155,32 @@ skip: | |||
153 | } | 155 | } |
154 | 156 | ||
155 | /* | 157 | /* |
158 | * /proc/stat helpers | ||
159 | */ | ||
160 | u64 arch_irq_stat_cpu(unsigned int cpu) | ||
161 | { | ||
162 | u64 sum = cpu_pda(cpu)->__nmi_count; | ||
163 | |||
164 | sum += cpu_pda(cpu)->apic_timer_irqs; | ||
165 | #ifdef CONFIG_SMP | ||
166 | sum += cpu_pda(cpu)->irq_resched_count; | ||
167 | sum += cpu_pda(cpu)->irq_call_count; | ||
168 | sum += cpu_pda(cpu)->irq_tlb_count; | ||
169 | #endif | ||
170 | #ifdef CONFIG_X86_MCE | ||
171 | sum += cpu_pda(cpu)->irq_thermal_count; | ||
172 | sum += cpu_pda(cpu)->irq_threshold_count; | ||
173 | #endif | ||
174 | sum += cpu_pda(cpu)->irq_spurious_count; | ||
175 | return sum; | ||
176 | } | ||
177 | |||
178 | u64 arch_irq_stat(void) | ||
179 | { | ||
180 | return atomic_read(&irq_err_count); | ||
181 | } | ||
182 | |||
183 | /* | ||
156 | * do_IRQ handles all normal device IRQ's (the special | 184 | * do_IRQ handles all normal device IRQ's (the special |
157 | * SMP cross-CPU interrupts have their own specific | 185 | * SMP cross-CPU interrupts have their own specific |
158 | * handlers). | 186 | * handlers). |
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c new file mode 100644 index 000000000000..d66914287ee1 --- /dev/null +++ b/arch/x86/kernel/irqinit_32.c | |||
@@ -0,0 +1,114 @@ | |||
1 | #include <linux/errno.h> | ||
2 | #include <linux/signal.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/ioport.h> | ||
5 | #include <linux/interrupt.h> | ||
6 | #include <linux/slab.h> | ||
7 | #include <linux/random.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/kernel_stat.h> | ||
10 | #include <linux/sysdev.h> | ||
11 | #include <linux/bitops.h> | ||
12 | |||
13 | #include <asm/atomic.h> | ||
14 | #include <asm/system.h> | ||
15 | #include <asm/io.h> | ||
16 | #include <asm/timer.h> | ||
17 | #include <asm/pgtable.h> | ||
18 | #include <asm/delay.h> | ||
19 | #include <asm/desc.h> | ||
20 | #include <asm/apic.h> | ||
21 | #include <asm/arch_hooks.h> | ||
22 | #include <asm/i8259.h> | ||
23 | |||
24 | |||
25 | |||
26 | /* | ||
27 | * Note that on a 486, we don't want to do a SIGFPE on an irq13 | ||
28 | * as the irq is unreliable, and exception 16 works correctly | ||
29 | * (i.e. as explained in the Intel literature). On a 386, you | ||
30 | * can't use exception 16 due to bad IBM design, so we have to | ||
31 | * rely on the less exact irq13. | ||
32 | * | ||
33 | * Careful... Not only is IRQ13 unreliable, but it also | ||
34 | * leads to races. IBM designers who came up with it should | ||
35 | * be shot. | ||
36 | */ | ||
37 | |||
38 | |||
39 | static irqreturn_t math_error_irq(int cpl, void *dev_id) | ||
40 | { | ||
41 | extern void math_error(void __user *); | ||
42 | outb(0,0xF0); | ||
43 | if (ignore_fpu_irq || !boot_cpu_data.hard_math) | ||
44 | return IRQ_NONE; | ||
45 | math_error((void __user *)get_irq_regs()->ip); | ||
46 | return IRQ_HANDLED; | ||
47 | } | ||
48 | |||
49 | /* | ||
50 | * New motherboards sometimes make IRQ 13 be a PCI interrupt, | ||
51 | * so allow interrupt sharing. | ||
52 | */ | ||
53 | static struct irqaction fpu_irq = { | ||
54 | .handler = math_error_irq, | ||
55 | .mask = CPU_MASK_NONE, | ||
56 | .name = "fpu", | ||
57 | }; | ||
58 | |||
59 | void __init init_ISA_irqs (void) | ||
60 | { | ||
61 | int i; | ||
62 | |||
63 | #ifdef CONFIG_X86_LOCAL_APIC | ||
64 | init_bsp_APIC(); | ||
65 | #endif | ||
66 | init_8259A(0); | ||
67 | |||
68 | /* | ||
69 | * 16 old-style INTA-cycle interrupts: | ||
70 | */ | ||
71 | for (i = 0; i < 16; i++) { | ||
72 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
73 | handle_level_irq, "XT"); | ||
74 | } | ||
75 | } | ||
76 | |||
77 | /* Overridden in paravirt.c */ | ||
78 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
79 | |||
80 | void __init native_init_IRQ(void) | ||
81 | { | ||
82 | int i; | ||
83 | |||
84 | /* all the setup before the call gates are initialised */ | ||
85 | pre_intr_init_hook(); | ||
86 | |||
87 | /* | ||
88 | * Cover the whole vector space, no vector can escape | ||
89 | * us. (some of these will be overridden and become | ||
90 | * 'special' SMP interrupts) | ||
91 | */ | ||
92 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
93 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
94 | if (i >= NR_IRQS) | ||
95 | break; | ||
96 | /* SYSCALL_VECTOR was reserved in trap_init. */ | ||
97 | if (!test_bit(vector, used_vectors)) | ||
98 | set_intr_gate(vector, interrupt[i]); | ||
99 | } | ||
100 | |||
101 | /* set up after call gates are initialised (usually add in | ||
102 | * the architecture-specific gates) | ||
103 | */ | ||
104 | intr_init_hook(); | ||
105 | |||
106 | /* | ||
107 | * External FPU? Set up irq13 if so, for | ||
108 | * original braindamaged IBM FERR coupling. | ||
109 | */ | ||
110 | if (boot_cpu_data.hard_math && !cpu_has_fpu) | ||
111 | setup_irq(FPU_IRQ, &fpu_irq); | ||
112 | |||
113 | irq_ctx_init(smp_processor_id()); | ||
114 | } | ||
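For orientation, the vector layout the native_init_IRQ() loop above assumes (32-bit values, stated here from memory rather than taken from headers): IRQ i is installed at vector FIRST_EXTERNAL_VECTOR + i, and any vector already marked in used_vectors, such as the 0x80 syscall gate reserved in trap_init(), is skipped. In miniature:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR 0x20
#define SYSCALL_VECTOR        0x80

int main(void)
{
        int i;

        for (i = 0; i < 4; i++)
                printf("IRQ%d -> vector %#x\n", i, FIRST_EXTERNAL_VECTOR + i);
        printf("vector %#x skipped (syscall gate)\n", SYSCALL_VECTOR);
        return 0;
}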
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c new file mode 100644 index 000000000000..1f26fd9ec4f4 --- /dev/null +++ b/arch/x86/kernel/irqinit_64.c | |||
@@ -0,0 +1,222 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/signal.h> | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/interrupt.h> | ||
7 | #include <linux/timex.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/random.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/kernel_stat.h> | ||
12 | #include <linux/sysdev.h> | ||
13 | #include <linux/bitops.h> | ||
14 | |||
15 | #include <asm/acpi.h> | ||
16 | #include <asm/atomic.h> | ||
17 | #include <asm/system.h> | ||
18 | #include <asm/io.h> | ||
19 | #include <asm/hw_irq.h> | ||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/delay.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/apic.h> | ||
24 | #include <asm/i8259.h> | ||
25 | |||
26 | /* | ||
27 | * Common place to define all x86 IRQ vectors | ||
28 | * | ||
29 | * This builds up the IRQ handler stubs using some ugly macros in irq.h | ||
30 | * | ||
31 | * These macros create the low-level assembly IRQ routines that save | ||
32 | * register context and call do_IRQ(). do_IRQ() then does all the | ||
33 | * operations that are needed to keep the AT (or SMP IOAPIC) | ||
34 | * interrupt-controller happy. | ||
35 | */ | ||
36 | |||
37 | #define IRQ_NAME2(nr) nr##_interrupt(void) | ||
38 | #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) | ||
39 | |||
40 | /* | ||
41 | * SMP has a few special interrupts for IPI messages | ||
42 | */ | ||
43 | |||
44 | #define BUILD_IRQ(nr) \ | ||
45 | asmlinkage void IRQ_NAME(nr); \ | ||
46 | asm("\n.text\n.p2align\n" \ | ||
47 | "IRQ" #nr "_interrupt:\n\t" \ | ||
48 | "push $~(" #nr ") ; " \ | ||
49 | "jmp common_interrupt\n" \ | ||
50 | ".previous"); | ||
51 | |||
52 | #define BI(x,y) \ | ||
53 | BUILD_IRQ(x##y) | ||
54 | |||
55 | #define BUILD_16_IRQS(x) \ | ||
56 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
57 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
58 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
59 | BI(x,c) BI(x,d) BI(x,e) BI(x,f) | ||
60 | |||
61 | /* | ||
62 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
63 | * (these are usually mapped to vectors 0x30-0x3f) | ||
64 | */ | ||
65 | |||
66 | /* | ||
67 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
68 | * are unused but an SMP system is supposed to have enough memory ... | ||
69 | * sometimes (mostly wrt. hw bugs) we get corrupted vectors all | ||
70 | * across the spectrum, so we really want to be prepared to get all | ||
71 | * of these. Plus, more powerful systems might have more than 64 | ||
72 | * IO-APIC registers. | ||
73 | * | ||
74 | * (these are usually mapped into the 0x30-0xff vector range) | ||
75 | */ | ||
76 | BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) | ||
77 | BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) | ||
78 | BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) | ||
79 | BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) | ||
80 | |||
81 | #undef BUILD_16_IRQS | ||
82 | #undef BI | ||
83 | |||
84 | |||
85 | #define IRQ(x,y) \ | ||
86 | IRQ##x##y##_interrupt | ||
87 | |||
88 | #define IRQLIST_16(x) \ | ||
89 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
90 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
91 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
92 | IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) | ||
93 | |||
94 | /* for the irq vectors */ | ||
95 | static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { | ||
96 | IRQLIST_16(0x2), IRQLIST_16(0x3), | ||
97 | IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), | ||
98 | IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), | ||
99 | IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) | ||
100 | }; | ||
101 | |||
102 | #undef IRQ | ||
103 | #undef IRQLIST_16 | ||
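Expanded by hand for a single vector, BI(0x2,0) above becomes BUILD_IRQ(0x20), which emits the following stub (reconstructed here for illustration; it only assembles in-kernel, where asmlinkage and common_interrupt exist). The pushed value is the one's complement of the vector, which do_IRQ() undoes via ~regs->orig_ax:

asmlinkage void IRQ0x20_interrupt(void);
asm("\n.text\n.p2align\n"
    "IRQ0x20_interrupt:\n\t"
    "push $~(0x20) ; "          /* complemented vector number */
    "jmp common_interrupt\n"
    ".previous");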
104 | |||
105 | |||
106 | |||
107 | |||
108 | /* | ||
109 | * IRQ2 is cascade interrupt to second interrupt controller | ||
110 | */ | ||
111 | |||
112 | static struct irqaction irq2 = { | ||
113 | .handler = no_action, | ||
114 | .mask = CPU_MASK_NONE, | ||
115 | .name = "cascade", | ||
116 | }; | ||
117 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | ||
118 | [0 ... IRQ0_VECTOR - 1] = -1, | ||
119 | [IRQ0_VECTOR] = 0, | ||
120 | [IRQ1_VECTOR] = 1, | ||
121 | [IRQ2_VECTOR] = 2, | ||
122 | [IRQ3_VECTOR] = 3, | ||
123 | [IRQ4_VECTOR] = 4, | ||
124 | [IRQ5_VECTOR] = 5, | ||
125 | [IRQ6_VECTOR] = 6, | ||
126 | [IRQ7_VECTOR] = 7, | ||
127 | [IRQ8_VECTOR] = 8, | ||
128 | [IRQ9_VECTOR] = 9, | ||
129 | [IRQ10_VECTOR] = 10, | ||
130 | [IRQ11_VECTOR] = 11, | ||
131 | [IRQ12_VECTOR] = 12, | ||
132 | [IRQ13_VECTOR] = 13, | ||
133 | [IRQ14_VECTOR] = 14, | ||
134 | [IRQ15_VECTOR] = 15, | ||
135 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | ||
136 | }; | ||
137 | |||
138 | static void __init init_ISA_irqs (void) | ||
139 | { | ||
140 | int i; | ||
141 | |||
142 | init_bsp_APIC(); | ||
143 | init_8259A(0); | ||
144 | |||
145 | for (i = 0; i < NR_IRQS; i++) { | ||
146 | irq_desc[i].status = IRQ_DISABLED; | ||
147 | irq_desc[i].action = NULL; | ||
148 | irq_desc[i].depth = 1; | ||
149 | |||
150 | if (i < 16) { | ||
151 | /* | ||
152 | * 16 old-style INTA-cycle interrupts: | ||
153 | */ | ||
154 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
155 | handle_level_irq, "XT"); | ||
156 | } else { | ||
157 | /* | ||
158 | * 'high' PCI IRQs filled in on demand | ||
159 | */ | ||
160 | irq_desc[i].chip = &no_irq_chip; | ||
161 | } | ||
162 | } | ||
163 | } | ||
164 | |||
165 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
166 | |||
167 | void __init native_init_IRQ(void) | ||
168 | { | ||
169 | int i; | ||
170 | |||
171 | init_ISA_irqs(); | ||
172 | /* | ||
173 | * Cover the whole vector space, no vector can escape | ||
174 | * us. (some of these will be overridden and become | ||
175 | * 'special' SMP interrupts) | ||
176 | */ | ||
177 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
178 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
179 | if (vector != IA32_SYSCALL_VECTOR) | ||
180 | set_intr_gate(vector, interrupt[i]); | ||
181 | } | ||
182 | |||
183 | #ifdef CONFIG_SMP | ||
184 | /* | ||
185 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
186 | * IPI, driven by wakeup. | ||
187 | */ | ||
188 | alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
189 | |||
190 | /* IPIs for invalidation */ | ||
191 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); | ||
192 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); | ||
193 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); | ||
194 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); | ||
195 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); | ||
196 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); | ||
197 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); | ||
198 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); | ||
199 | |||
200 | /* IPI for generic function call */ | ||
201 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
202 | |||
203 | /* IPI for generic single function call */ | ||
204 | alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, | ||
205 | call_function_single_interrupt); | ||
206 | |||
207 | /* Low priority IPI to cleanup after moving an irq */ | ||
208 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | ||
209 | #endif | ||
210 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
211 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | ||
212 | |||
213 | /* self generated IPI for local APIC timer */ | ||
214 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
215 | |||
216 | /* IPI vectors for APIC spurious and error interrupts */ | ||
217 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
218 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
219 | |||
220 | if (!acpi_ioapic) | ||
221 | setup_irq(2, &irq2); | ||
222 | } | ||
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index c03205991718..f2d43bc75514 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c | |||
@@ -12,9 +12,13 @@ | |||
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/io.h> | 13 | #include <linux/io.h> |
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/module.h> | ||
15 | 16 | ||
16 | #include <asm/setup.h> | 17 | #include <asm/setup.h> |
17 | 18 | ||
19 | struct dentry *arch_debugfs_dir; | ||
20 | EXPORT_SYMBOL(arch_debugfs_dir); | ||
21 | |||
18 | #ifdef CONFIG_DEBUG_BOOT_PARAMS | 22 | #ifdef CONFIG_DEBUG_BOOT_PARAMS |
19 | struct setup_data_node { | 23 | struct setup_data_node { |
20 | u64 paddr; | 24 | u64 paddr; |
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void) | |||
209 | { | 213 | { |
210 | int error = 0; | 214 | int error = 0; |
211 | 215 | ||
216 | arch_debugfs_dir = debugfs_create_dir("x86", NULL); | ||
217 | if (!arch_debugfs_dir) | ||
218 | return -ENOMEM; | ||
219 | |||
212 | #ifdef CONFIG_DEBUG_BOOT_PARAMS | 220 | #ifdef CONFIG_DEBUG_BOOT_PARAMS |
213 | error = boot_params_kdebugfs_init(); | 221 | error = boot_params_kdebugfs_init(); |
214 | #endif | 222 | #endif |
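kdebugfs.c now creates a shared "x86" directory at debugfs init and exports the dentry, so other x86 code can hang its files under /sys/kernel/debug/x86/ instead of the debugfs root. A hypothetical in-kernel user (the file name and variable are made up; debugfs_create_u32() is the real API):

#include <linux/debugfs.h>
#include <linux/errno.h>

extern struct dentry *arch_debugfs_dir;

static u32 my_flag;                     /* made-up example value */

static int __init my_debugfs_init(void)
{
        /* creates /sys/kernel/debug/x86/my_flag */
        if (!debugfs_create_u32("my_flag", 0644, arch_debugfs_dir, &my_flag))
                return -ENOMEM;
        return 0;
}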
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b8c6743a13da..6c27679ec6aa 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | |||
431 | regs->ip = (unsigned long)p->ainsn.insn; | 431 | regs->ip = (unsigned long)p->ainsn.insn; |
432 | } | 432 | } |
433 | 433 | ||
434 | /* Called with kretprobe_lock held */ | ||
435 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | 434 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, |
436 | struct pt_regs *regs) | 435 | struct pt_regs *regs) |
437 | { | 436 | { |
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
682 | unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; | 681 | unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; |
683 | 682 | ||
684 | INIT_HLIST_HEAD(&empty_rp); | 683 | INIT_HLIST_HEAD(&empty_rp); |
685 | spin_lock_irqsave(&kretprobe_lock, flags); | 684 | kretprobe_hash_lock(current, &head, &flags); |
686 | head = kretprobe_inst_table_head(current); | ||
687 | /* fixup registers */ | 685 | /* fixup registers */ |
688 | #ifdef CONFIG_X86_64 | 686 | #ifdef CONFIG_X86_64 |
689 | regs->cs = __KERNEL_CS; | 687 | regs->cs = __KERNEL_CS; |
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
732 | 730 | ||
733 | kretprobe_assert(ri, orig_ret_address, trampoline_address); | 731 | kretprobe_assert(ri, orig_ret_address, trampoline_address); |
734 | 732 | ||
735 | spin_unlock_irqrestore(&kretprobe_lock, flags); | 733 | kretprobe_hash_unlock(current, &flags); |
736 | 734 | ||
737 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { | 735 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { |
738 | hlist_del(&ri->hlist); | 736 | hlist_del(&ri->hlist); |
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) | |||
860 | 858 | ||
861 | resume_execution(cur, regs, kcb); | 859 | resume_execution(cur, regs, kcb); |
862 | regs->flags |= kcb->kprobe_saved_flags; | 860 | regs->flags |= kcb->kprobe_saved_flags; |
863 | trace_hardirqs_fixup_flags(regs->flags); | ||
864 | 861 | ||
865 | if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { | 862 | if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { |
866 | kcb->kprobe_status = KPROBE_HIT_SSDONE; | 863 | kcb->kprobe_status = KPROBE_HIT_SSDONE; |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d472..d02def06ca91 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | #include <linux/clocksource.h> | 19 | #include <linux/clocksource.h> |
20 | #include <linux/kvm_para.h> | 20 | #include <linux/kvm_para.h> |
21 | #include <asm/pvclock.h> | ||
21 | #include <asm/arch_hooks.h> | 22 | #include <asm/arch_hooks.h> |
22 | #include <asm/msr.h> | 23 | #include <asm/msr.h> |
23 | #include <asm/apic.h> | 24 | #include <asm/apic.h> |
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg) | |||
36 | early_param("no-kvmclock", parse_no_kvmclock); | 37 | early_param("no-kvmclock", parse_no_kvmclock); |
37 | 38 | ||
38 | /* The hypervisor will put information about time periodically here */ | 39 | /* The hypervisor will put information about time periodically here */ |
39 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); | 40 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); |
40 | #define get_clock(cpu, field) per_cpu(hv_clock, cpu).field | 41 | static struct pvclock_wall_clock wall_clock; |
41 | 42 | ||
42 | static inline u64 kvm_get_delta(u64 last_tsc) | ||
43 | { | ||
44 | int cpu = smp_processor_id(); | ||
45 | u64 delta = native_read_tsc() - last_tsc; | ||
46 | return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; | ||
47 | } | ||
48 | |||
49 | static struct kvm_wall_clock wall_clock; | ||
50 | static cycle_t kvm_clock_read(void); | ||
51 | /* | 43 | /* |
52 | * The wallclock is the time of day when we booted. Since then, some time may | 44 | * The wallclock is the time of day when we booted. Since then, some time may |
53 | * have elapsed since the hypervisor wrote the data. So we try to account for | 45 | * have elapsed since the hypervisor wrote the data. So we try to account for |
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void); | |||
55 | */ | 47 | */ |
56 | static unsigned long kvm_get_wallclock(void) | 48 | static unsigned long kvm_get_wallclock(void) |
57 | { | 49 | { |
58 | u32 wc_sec, wc_nsec; | 50 | struct pvclock_vcpu_time_info *vcpu_time; |
59 | u64 delta; | ||
60 | struct timespec ts; | 51 | struct timespec ts; |
61 | int version, nsec; | ||
62 | int low, high; | 52 | int low, high; |
63 | 53 | ||
64 | low = (int)__pa(&wall_clock); | 54 | low = (int)__pa(&wall_clock); |
65 | high = ((u64)__pa(&wall_clock) >> 32); | 55 | high = ((u64)__pa(&wall_clock) >> 32); |
56 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | ||
66 | 57 | ||
67 | delta = kvm_clock_read(); | 58 | vcpu_time = &get_cpu_var(hv_clock); |
59 | pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); | ||
60 | put_cpu_var(hv_clock); | ||
68 | 61 | ||
69 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | 62 | return ts.tv_sec; |
70 | do { | ||
71 | version = wall_clock.wc_version; | ||
72 | rmb(); | ||
73 | wc_sec = wall_clock.wc_sec; | ||
74 | wc_nsec = wall_clock.wc_nsec; | ||
75 | rmb(); | ||
76 | } while ((wall_clock.wc_version != version) || (version & 1)); | ||
77 | |||
78 | delta = kvm_clock_read() - delta; | ||
79 | delta += wc_nsec; | ||
80 | nsec = do_div(delta, NSEC_PER_SEC); | ||
81 | set_normalized_timespec(&ts, wc_sec + delta, nsec); | ||
82 | /* | ||
83 | * Of all mechanisms of time adjustment I've tested, this one | ||
84 | * was the champion! | ||
85 | */ | ||
86 | return ts.tv_sec + 1; | ||
87 | } | 63 | } |
88 | 64 | ||
89 | static int kvm_set_wallclock(unsigned long now) | 65 | static int kvm_set_wallclock(unsigned long now) |
90 | { | 66 | { |
91 | return 0; | 67 | return -1; |
92 | } | 68 | } |
93 | 69 | ||
94 | /* | ||
95 | * This is our read_clock function. The host puts a tsc timestamp each time | ||
96 | * it updates a new time. Without the tsc adjustment, we can have a situation | ||
97 | * in which a vcpu starts to run earlier (smaller system_time), but probes | ||
98 | * time later (compared to another vcpu), leading to backwards time | ||
99 | */ | ||
100 | static cycle_t kvm_clock_read(void) | 70 | static cycle_t kvm_clock_read(void) |
101 | { | 71 | { |
102 | u64 last_tsc, now; | 72 | struct pvclock_vcpu_time_info *src; |
103 | int cpu; | 73 | cycle_t ret; |
104 | 74 | ||
105 | preempt_disable(); | 75 | src = &get_cpu_var(hv_clock); |
106 | cpu = smp_processor_id(); | 76 | ret = pvclock_clocksource_read(src); |
107 | 77 | put_cpu_var(hv_clock); | |
108 | last_tsc = get_clock(cpu, tsc_timestamp); | 78 | return ret; |
109 | now = get_clock(cpu, system_time); | ||
110 | |||
111 | now += kvm_get_delta(last_tsc); | ||
112 | preempt_enable(); | ||
113 | |||
114 | return now; | ||
115 | } | 79 | } |
80 | |||
116 | static struct clocksource kvm_clock = { | 81 | static struct clocksource kvm_clock = { |
117 | .name = "kvm-clock", | 82 | .name = "kvm-clock", |
118 | .read = kvm_clock_read, | 83 | .read = kvm_clock_read, |
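kvm_clock_read() now defers to pvclock_clocksource_read(), which implements the same even/odd version protocol the removed open-coded loop used: an odd version, or a version that changes across the reads, means the hypervisor was mid-update and the read must be retried. A simplified standalone model (the real pvclock ABI has more fields and adds a scaled TSC delta, both elided here):

#include <stdint.h>
#include <stdio.h>

/* Simplified pvclock area; the real ABI has a few more fields. */
struct vcpu_time_info {
        uint32_t version;
        uint64_t tsc_timestamp;
        uint64_t system_time;
};

static uint64_t pvclock_read(volatile struct vcpu_time_info *src)
{
        uint32_t version;
        uint64_t time;

        do {
                version = src->version;
                /* rmb() here in real code */
                time = src->system_time; /* + scaled TSC delta, elided */
                /* rmb() here in real code */
        } while ((version & 1) || version != src->version);

        return time;
}

int main(void)
{
        struct vcpu_time_info ti = { .version = 2, .system_time = 12345 };

        printf("%llu\n", (unsigned long long)pvclock_read(&ti));
        return 0;
}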
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = { | |||
123 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 88 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
124 | }; | 89 | }; |
125 | 90 | ||
126 | static int kvm_register_clock(void) | 91 | static int kvm_register_clock(char *txt) |
127 | { | 92 | { |
128 | int cpu = smp_processor_id(); | 93 | int cpu = smp_processor_id(); |
129 | int low, high; | 94 | int low, high; |
130 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; | 95 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; |
131 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); | 96 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); |
132 | 97 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", | |
98 | cpu, high, low, txt); | ||
133 | return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); | 99 | return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); |
134 | } | 100 | } |
135 | 101 | ||
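kvm_register_clock() hands the hypervisor the physical address of the per-CPU pvclock area, split into two 32-bit MSR halves, with bit 0 of the low half serving as the enable flag (the area is aligned, so bit 0 is otherwise always zero). The arithmetic in isolation:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t pa = 0x1234567890ULL;  /* hypothetical, suitably aligned */
        uint32_t low  = (uint32_t)pa | 1;       /* enable bit */
        uint32_t high = (uint32_t)(pa >> 32);

        /* reassembling the halves and stripping the flag recovers pa */
        assert(((((uint64_t)high << 32) | low) & ~1ULL) == pa);
        return 0;
}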
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void) | |||
140 | * Now that the first cpu already had this clocksource initialized, | 106 | * Now that the first cpu already had this clocksource initialized, |
141 | * we shouldn't fail. | 107 | * we shouldn't fail. |
142 | */ | 108 | */ |
143 | WARN_ON(kvm_register_clock()); | 109 | WARN_ON(kvm_register_clock("secondary cpu clock")); |
144 | /* ok, done with our trickery, call native */ | 110 | /* ok, done with our trickery, call native */ |
145 | setup_secondary_APIC_clock(); | 111 | setup_secondary_APIC_clock(); |
146 | } | 112 | } |
147 | #endif | 113 | #endif |
148 | 114 | ||
115 | #ifdef CONFIG_SMP | ||
116 | static void __init kvm_smp_prepare_boot_cpu(void) | ||
117 | { | ||
118 | WARN_ON(kvm_register_clock("primary cpu clock")); | ||
119 | native_smp_prepare_boot_cpu(); | ||
120 | } | ||
121 | #endif | ||
122 | |||
149 | /* | 123 | /* |
150 | * After the clock is registered, the host will keep writing to the | 124 | * After the clock is registered, the host will keep writing to the |
151 | * registered memory location. If the guest happens to shutdown, this memory | 125 | * registered memory location. If the guest happens to shutdown, this memory |
@@ -174,7 +148,7 @@ void __init kvmclock_init(void) | |||
174 | return; | 148 | return; |
175 | 149 | ||
176 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { | 150 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { |
177 | if (kvm_register_clock()) | 151 | if (kvm_register_clock("boot clock")) |
178 | return; | 152 | return; |
179 | pv_time_ops.get_wallclock = kvm_get_wallclock; | 153 | pv_time_ops.get_wallclock = kvm_get_wallclock; |
180 | pv_time_ops.set_wallclock = kvm_set_wallclock; | 154 | pv_time_ops.set_wallclock = kvm_set_wallclock; |
@@ -182,6 +156,9 @@ void __init kvmclock_init(void) | |||
182 | #ifdef CONFIG_X86_LOCAL_APIC | 156 | #ifdef CONFIG_X86_LOCAL_APIC |
183 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | 157 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; |
184 | #endif | 158 | #endif |
159 | #ifdef CONFIG_SMP | ||
160 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | ||
161 | #endif | ||
185 | machine_ops.shutdown = kvm_shutdown; | 162 | machine_ops.shutdown = kvm_shutdown; |
186 | #ifdef CONFIG_KEXEC | 163 | #ifdef CONFIG_KEXEC |
187 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 164 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 0224c3637c73..b68e21f06f4f 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c | |||
@@ -20,9 +20,9 @@ | |||
20 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
21 | 21 | ||
22 | #ifdef CONFIG_SMP | 22 | #ifdef CONFIG_SMP |
23 | static void flush_ldt(void *null) | 23 | static void flush_ldt(void *current_mm) |
24 | { | 24 | { |
25 | if (current->active_mm) | 25 | if (current->active_mm == current_mm) |
26 | load_LDT(¤t->active_mm->context); | 26 | load_LDT(¤t->active_mm->context); |
27 | } | 27 | } |
28 | #endif | 28 | #endif |
@@ -62,13 +62,11 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |||
62 | 62 | ||
63 | if (reload) { | 63 | if (reload) { |
64 | #ifdef CONFIG_SMP | 64 | #ifdef CONFIG_SMP |
65 | cpumask_t mask; | ||
66 | |||
67 | preempt_disable(); | 65 | preempt_disable(); |
68 | load_LDT(pc); | 66 | load_LDT(pc); |
69 | mask = cpumask_of_cpu(smp_processor_id()); | 67 | if (!cpus_equal(current->mm->cpu_vm_mask, |
70 | if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | 68 | cpumask_of_cpu(smp_processor_id()))) |
71 | smp_call_function(flush_ldt, NULL, 1, 1); | 69 | smp_call_function(flush_ldt, current->mm, 1); |
72 | preempt_enable(); | 70 | preempt_enable(); |
73 | #else | 71 | #else |
74 | load_LDT(pc); | 72 | load_LDT(pc); |
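The ldt.c change threads the mm being modified through to the IPI handler, so each CPU reloads its LDT only if it is actually running that mm (and the smp_call_function() call adapts to the API that lost its nonatomic argument). The shape of the callback, as a kernel-style sketch with stand-in helpers that are not real APIs:

struct mm_struct;

/* stand-ins for this CPU's active_mm and for load_LDT() */
extern struct mm_struct *active_mm_of_this_cpu(void);
extern void load_ldt_of(struct mm_struct *mm);

static void flush_ldt(void *current_mm)
{
        struct mm_struct *mm = current_mm;

        /* ignore the IPI unless this CPU is running the affected mm */
        if (active_mm_of_this_cpu() == mm)
                load_ldt_of(mm);
}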
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d0b234c9fc31..9fe478d98406 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c | |||
@@ -11,6 +11,8 @@ | |||
11 | #include <linux/delay.h> | 11 | #include <linux/delay.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/numa.h> | 13 | #include <linux/numa.h> |
14 | #include <linux/ftrace.h> | ||
15 | |||
14 | #include <asm/pgtable.h> | 16 | #include <asm/pgtable.h> |
15 | #include <asm/pgalloc.h> | 17 | #include <asm/pgalloc.h> |
16 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
@@ -20,6 +22,7 @@ | |||
20 | #include <asm/cpufeature.h> | 22 | #include <asm/cpufeature.h> |
21 | #include <asm/desc.h> | 23 | #include <asm/desc.h> |
22 | #include <asm/system.h> | 24 | #include <asm/system.h> |
25 | #include <asm/cacheflush.h> | ||
23 | 26 | ||
24 | #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) | 27 | #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) |
25 | static u32 kexec_pgd[1024] PAGE_ALIGNED; | 28 | static u32 kexec_pgd[1024] PAGE_ALIGNED; |
@@ -39,7 +42,7 @@ static void set_idt(void *newidt, __u16 limit) | |||
39 | curidt.address = (unsigned long)newidt; | 42 | curidt.address = (unsigned long)newidt; |
40 | 43 | ||
41 | load_idt(&curidt); | 44 | load_idt(&curidt); |
42 | }; | 45 | } |
43 | 46 | ||
44 | 47 | ||
45 | static void set_gdt(void *newgdt, __u16 limit) | 48 | static void set_gdt(void *newgdt, __u16 limit) |
@@ -51,7 +54,7 @@ static void set_gdt(void *newgdt, __u16 limit) | |||
51 | curgdt.address = (unsigned long)newgdt; | 54 | curgdt.address = (unsigned long)newgdt; |
52 | 55 | ||
53 | load_gdt(&curgdt); | 56 | load_gdt(&curgdt); |
54 | }; | 57 | } |
55 | 58 | ||
56 | static void load_segments(void) | 59 | static void load_segments(void) |
57 | { | 60 | { |
@@ -83,10 +86,12 @@ static void load_segments(void) | |||
83 | * reboot code buffer to allow us to avoid allocations | 86 | * reboot code buffer to allow us to avoid allocations |
84 | * later. | 87 | * later. |
85 | * | 88 | * |
86 | * Currently nothing. | 89 | * Make control page executable. |
87 | */ | 90 | */ |
88 | int machine_kexec_prepare(struct kimage *image) | 91 | int machine_kexec_prepare(struct kimage *image) |
89 | { | 92 | { |
93 | if (nx_enabled) | ||
94 | set_pages_x(image->control_code_page, 1); | ||
90 | return 0; | 95 | return 0; |
91 | } | 96 | } |
92 | 97 | ||
@@ -96,25 +101,48 @@ int machine_kexec_prepare(struct kimage *image) | |||
96 | */ | 101 | */ |
97 | void machine_kexec_cleanup(struct kimage *image) | 102 | void machine_kexec_cleanup(struct kimage *image) |
98 | { | 103 | { |
104 | if (nx_enabled) | ||
105 | set_pages_nx(image->control_code_page, 1); | ||
99 | } | 106 | } |
100 | 107 | ||
101 | /* | 108 | /* |
102 | * Do not allocate memory (or fail in any way) in machine_kexec(). | 109 | * Do not allocate memory (or fail in any way) in machine_kexec(). |
103 | * We are past the point of no return, committed to rebooting now. | 110 | * We are past the point of no return, committed to rebooting now. |
104 | */ | 111 | */ |
105 | NORET_TYPE void machine_kexec(struct kimage *image) | 112 | void machine_kexec(struct kimage *image) |
106 | { | 113 | { |
107 | unsigned long page_list[PAGES_NR]; | 114 | unsigned long page_list[PAGES_NR]; |
108 | void *control_page; | 115 | void *control_page; |
116 | asmlinkage unsigned long | ||
117 | (*relocate_kernel_ptr)(unsigned long indirection_page, | ||
118 | unsigned long control_page, | ||
119 | unsigned long start_address, | ||
120 | unsigned int has_pae, | ||
121 | unsigned int preserve_context); | ||
122 | |||
123 | tracer_disable(); | ||
109 | 124 | ||
110 | /* Interrupts aren't acceptable while we reboot */ | 125 | /* Interrupts aren't acceptable while we reboot */ |
111 | local_irq_disable(); | 126 | local_irq_disable(); |
112 | 127 | ||
128 | if (image->preserve_context) { | ||
129 | #ifdef CONFIG_X86_IO_APIC | ||
130 | /* We need to put APICs in legacy mode so that we can | ||
131 | * get timer interrupts in second kernel. kexec/kdump | ||
132 | * paths already have calls to disable_IO_APIC() in | ||
133 | * one form or other. kexec jump path also need | ||
134 | * one. | ||
135 | */ | ||
136 | disable_IO_APIC(); | ||
137 | #endif | ||
138 | } | ||
139 | |||
113 | control_page = page_address(image->control_code_page); | 140 | control_page = page_address(image->control_code_page); |
114 | memcpy(control_page, relocate_kernel, PAGE_SIZE); | 141 | memcpy(control_page, relocate_kernel, PAGE_SIZE/2); |
115 | 142 | ||
143 | relocate_kernel_ptr = control_page; | ||
116 | page_list[PA_CONTROL_PAGE] = __pa(control_page); | 144 | page_list[PA_CONTROL_PAGE] = __pa(control_page); |
117 | page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; | 145 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
118 | page_list[PA_PGD] = __pa(kexec_pgd); | 146 | page_list[PA_PGD] = __pa(kexec_pgd); |
119 | page_list[VA_PGD] = (unsigned long)kexec_pgd; | 147 | page_list[VA_PGD] = (unsigned long)kexec_pgd; |
120 | #ifdef CONFIG_X86_PAE | 148 | #ifdef CONFIG_X86_PAE |
@@ -127,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) | |||
127 | page_list[VA_PTE_0] = (unsigned long)kexec_pte0; | 155 | page_list[VA_PTE_0] = (unsigned long)kexec_pte0; |
128 | page_list[PA_PTE_1] = __pa(kexec_pte1); | 156 | page_list[PA_PTE_1] = __pa(kexec_pte1); |
129 | page_list[VA_PTE_1] = (unsigned long)kexec_pte1; | 157 | page_list[VA_PTE_1] = (unsigned long)kexec_pte1; |
158 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); | ||
130 | 159 | ||
131 | /* The segment registers are funny things, they have both a | 160 | /* The segment registers are funny things, they have both a |
132 | * visible and an invisible part. Whenever the visible part is | 161 | * visible and an invisible part. Whenever the visible part is |
@@ -145,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) | |||
145 | set_idt(phys_to_virt(0),0); | 174 | set_idt(phys_to_virt(0),0); |
146 | 175 | ||
147 | /* now call it */ | 176 | /* now call it */ |
148 | relocate_kernel((unsigned long)image->head, (unsigned long)page_list, | 177 | image->start = relocate_kernel_ptr((unsigned long)image->head, |
149 | image->start, cpu_has_pae); | 178 | (unsigned long)page_list, |
179 | image->start, cpu_has_pae, | ||
180 | image->preserve_context); | ||
150 | } | 181 | } |
151 | 182 | ||
152 | void arch_crash_save_vmcoreinfo(void) | 183 | void arch_crash_save_vmcoreinfo(void) |
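For readers following the kexec-jump change above, here is a condensed sketch (not the kernel code itself; the page-table, GDT/IDT and segment-register handling is omitted) of the call sequence machine_kexec() now performs. The wrapper name kexec_jump_sketch is illustrative.

	static void kexec_jump_sketch(struct kimage *image,
				      unsigned long *page_list)
	{
		void *control_page = page_address(image->control_code_page);
		asmlinkage unsigned long (*relocate_kernel_ptr)(
				unsigned long indirection_page,
				unsigned long control_page,
				unsigned long start_address,
				unsigned int has_pae,
				unsigned int preserve_context);

		/* Only half the control page is code now; the upper half
		 * is scratch space relocate_kernel uses to preserve state
		 * across the jump. */
		memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
		relocate_kernel_ptr = control_page;

		/* A plain kexec never returns from this call. With
		 * preserve_context set it does return, and the returned
		 * value becomes the entry point for the next jump. */
		image->start = relocate_kernel_ptr((unsigned long)image->head,
						   (unsigned long)page_list,
						   image->start, cpu_has_pae,
						   image->preserve_context);
	}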
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 576a03db4511..c43caa3a91f3 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c | |||
@@ -11,6 +11,8 @@ | |||
11 | #include <linux/string.h> | 11 | #include <linux/string.h> |
12 | #include <linux/reboot.h> | 12 | #include <linux/reboot.h> |
13 | #include <linux/numa.h> | 13 | #include <linux/numa.h> |
14 | #include <linux/ftrace.h> | ||
15 | |||
14 | #include <asm/pgtable.h> | 16 | #include <asm/pgtable.h> |
15 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
16 | #include <asm/mmu_context.h> | 18 | #include <asm/mmu_context.h> |
@@ -110,7 +112,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) | |||
110 | { | 112 | { |
111 | pgd_t *level4p; | 113 | pgd_t *level4p; |
112 | level4p = (pgd_t *)__va(start_pgtable); | 114 | level4p = (pgd_t *)__va(start_pgtable); |
113 | return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); | 115 | return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); |
114 | } | 116 | } |
115 | 117 | ||
116 | static void set_idt(void *newidt, u16 limit) | 118 | static void set_idt(void *newidt, u16 limit) |
@@ -179,11 +181,13 @@ void machine_kexec_cleanup(struct kimage *image) | |||
179 | * Do not allocate memory (or fail in any way) in machine_kexec(). | 181 | * Do not allocate memory (or fail in any way) in machine_kexec(). |
180 | * We are past the point of no return, committed to rebooting now. | 182 | * We are past the point of no return, committed to rebooting now. |
181 | */ | 183 | */ |
182 | NORET_TYPE void machine_kexec(struct kimage *image) | 184 | void machine_kexec(struct kimage *image) |
183 | { | 185 | { |
184 | unsigned long page_list[PAGES_NR]; | 186 | unsigned long page_list[PAGES_NR]; |
185 | void *control_page; | 187 | void *control_page; |
186 | 188 | ||
189 | tracer_disable(); | ||
190 | |||
187 | /* Interrupts aren't acceptable while we reboot */ | 191 | /* Interrupts aren't acceptable while we reboot */ |
188 | local_irq_disable(); | 192 | local_irq_disable(); |
189 | 193 | ||
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 69729e38b78a..652fa5c38ebe 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c | |||
@@ -5,13 +5,14 @@ | |||
5 | * 2006 Shaohua Li <shaohua.li@intel.com> | 5 | * 2006 Shaohua Li <shaohua.li@intel.com> |
6 | * | 6 | * |
7 | * This driver allows to upgrade microcode on Intel processors | 7 | * This driver allows to upgrade microcode on Intel processors |
8 | * belonging to IA-32 family - PentiumPro, Pentium II, | 8 | * belonging to IA-32 family - PentiumPro, Pentium II, |
9 | * Pentium III, Xeon, Pentium 4, etc. | 9 | * Pentium III, Xeon, Pentium 4, etc. |
10 | * | 10 | * |
11 | * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, | 11 | * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture |
12 | * Order Number 245472 or free download from: | 12 | * Software Developer's Manual |
13 | * | 13 | * Order Number 253668 or free download from: |
14 | * http://developer.intel.com/design/pentium4/manuals/245472.htm | 14 | * |
15 | * http://developer.intel.com/design/pentium4/manuals/253668.htm | ||
15 | * | 16 | * |
16 | * For more information, go to http://www.urbanmyth.org/microcode | 17 | * For more information, go to http://www.urbanmyth.org/microcode |
17 | * | 18 | * |
@@ -58,12 +59,12 @@ | |||
58 | * nature of implementation. | 59 | * nature of implementation. |
59 | * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> | 60 | * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> |
60 | * Fix the panic when writing zero-length microcode chunk. | 61 | * Fix the panic when writing zero-length microcode chunk. |
61 | * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, | 62 | * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, |
62 | * Jun Nakajima <jun.nakajima@intel.com> | 63 | * Jun Nakajima <jun.nakajima@intel.com> |
63 | * Support for the microcode updates in the new format. | 64 | * Support for the microcode updates in the new format. |
64 | * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> | 65 | * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> |
65 | * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl | 66 | * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl |
66 | * because we no longer hold a copy of applied microcode | 67 | * because we no longer hold a copy of applied microcode |
67 | * in kernel memory. | 68 | * in kernel memory. |
68 | * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> | 69 | * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> |
69 | * Fix sigmatch() macro to handle old CPUs with pf == 0. | 70 | * Fix sigmatch() macro to handle old CPUs with pf == 0. |
@@ -75,6 +76,7 @@ | |||
75 | #include <linux/kernel.h> | 76 | #include <linux/kernel.h> |
76 | #include <linux/init.h> | 77 | #include <linux/init.h> |
77 | #include <linux/sched.h> | 78 | #include <linux/sched.h> |
79 | #include <linux/smp_lock.h> | ||
78 | #include <linux/cpumask.h> | 80 | #include <linux/cpumask.h> |
79 | #include <linux/module.h> | 81 | #include <linux/module.h> |
80 | #include <linux/slab.h> | 82 | #include <linux/slab.h> |
@@ -320,11 +322,11 @@ static void apply_microcode(int cpu) | |||
320 | return; | 322 | return; |
321 | 323 | ||
322 | /* serialize access to the physical write to MSR 0x79 */ | 324 | /* serialize access to the physical write to MSR 0x79 */ |
323 | spin_lock_irqsave(&microcode_update_lock, flags); | 325 | spin_lock_irqsave(&microcode_update_lock, flags); |
324 | 326 | ||
325 | /* write microcode via MSR 0x79 */ | 327 | /* write microcode via MSR 0x79 */ |
326 | wrmsr(MSR_IA32_UCODE_WRITE, | 328 | wrmsr(MSR_IA32_UCODE_WRITE, |
327 | (unsigned long) uci->mc->bits, | 329 | (unsigned long) uci->mc->bits, |
328 | (unsigned long) uci->mc->bits >> 16 >> 16); | 330 | (unsigned long) uci->mc->bits >> 16 >> 16); |
329 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | 331 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); |
330 | 332 | ||
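The wrmsr() above splits a kernel virtual address into 32-bit halves with ">> 16 >> 16" rather than ">> 32": where unsigned long is 32 bits wide, a single shift by 32 is undefined behaviour in C, while two 16-bit shifts are well defined and simply yield 0 there. A minimal user-space illustration (plain C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		int x;
		unsigned long addr = (unsigned long)&x;

		/* Defined on both 32- and 64-bit targets; "addr >> 32"
		 * would be undefined where unsigned long is 32 bits. */
		printf("low half:  %#lx\n", addr & 0xffffffffUL);
		printf("high half: %#lx\n", addr >> 16 >> 16);
		return 0;
	}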
@@ -341,7 +343,7 @@ static void apply_microcode(int cpu) | |||
341 | return; | 343 | return; |
342 | } | 344 | } |
343 | printk(KERN_INFO "microcode: CPU%d updated from revision " | 345 | printk(KERN_INFO "microcode: CPU%d updated from revision " |
344 | "0x%x to 0x%x, date = %08x \n", | 346 | "0x%x to 0x%x, date = %08x \n", |
345 | cpu_num, uci->rev, val[1], uci->mc->hdr.date); | 347 | cpu_num, uci->rev, val[1], uci->mc->hdr.date); |
346 | uci->rev = val[1]; | 348 | uci->rev = val[1]; |
347 | } | 349 | } |
@@ -422,6 +424,7 @@ out: | |||
422 | 424 | ||
423 | static int microcode_open (struct inode *unused1, struct file *unused2) | 425 | static int microcode_open (struct inode *unused1, struct file *unused2) |
424 | { | 426 | { |
427 | cycle_kernel_lock(); | ||
425 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; | 428 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; |
426 | } | 429 | } |
427 | 430 | ||
@@ -488,7 +491,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); | |||
488 | #define microcode_dev_exit() do { } while(0) | 491 | #define microcode_dev_exit() do { } while(0) |
489 | #endif | 492 | #endif |
490 | 493 | ||
491 | static long get_next_ucode_from_buffer(void **mc, void *buf, | 494 | static long get_next_ucode_from_buffer(void **mc, const u8 *buf, |
492 | unsigned long size, long offset) | 495 | unsigned long size, long offset) |
493 | { | 496 | { |
494 | microcode_header_t *mc_header; | 497 | microcode_header_t *mc_header; |
@@ -522,7 +525,7 @@ static int cpu_request_microcode(int cpu) | |||
522 | char name[30]; | 525 | char name[30]; |
523 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 526 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
524 | const struct firmware *firmware; | 527 | const struct firmware *firmware; |
525 | void *buf; | 528 | const u8 *buf; |
526 | unsigned long size; | 529 | unsigned long size; |
527 | long offset = 0; | 530 | long offset = 0; |
528 | int error; | 531 | int error; |
@@ -534,7 +537,7 @@ static int cpu_request_microcode(int cpu) | |||
534 | c->x86, c->x86_model, c->x86_mask); | 537 | c->x86, c->x86_model, c->x86_mask); |
535 | error = request_firmware(&firmware, name, &microcode_pdev->dev); | 538 | error = request_firmware(&firmware, name, &microcode_pdev->dev); |
536 | if (error) { | 539 | if (error) { |
537 | pr_debug("microcode: ucode data file %s load failed\n", name); | 540 | pr_debug("microcode: data file %s load failed\n", name); |
538 | return error; | 541 | return error; |
539 | } | 542 | } |
540 | buf = firmware->data; | 543 | buf = firmware->data; |
@@ -641,7 +644,9 @@ static void microcode_fini_cpu(int cpu) | |||
641 | mutex_unlock(&microcode_mutex); | 644 | mutex_unlock(&microcode_mutex); |
642 | } | 645 | } |
643 | 646 | ||
644 | static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) | 647 | static ssize_t reload_store(struct sys_device *dev, |
648 | struct sysdev_attribute *attr, | ||
649 | const char *buf, size_t sz) | ||
645 | { | 650 | { |
646 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | 651 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; |
647 | char *end; | 652 | char *end; |
@@ -652,9 +657,7 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) | |||
652 | if (end == buf) | 657 | if (end == buf) |
653 | return -EINVAL; | 658 | return -EINVAL; |
654 | if (val == 1) { | 659 | if (val == 1) { |
655 | cpumask_t old; | 660 | cpumask_t old = current->cpus_allowed; |
656 | |||
657 | old = current->cpus_allowed; | ||
658 | 661 | ||
659 | get_online_cpus(); | 662 | get_online_cpus(); |
660 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | 663 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
@@ -671,14 +674,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) | |||
671 | return sz; | 674 | return sz; |
672 | } | 675 | } |
673 | 676 | ||
674 | static ssize_t version_show(struct sys_device *dev, char *buf) | 677 | static ssize_t version_show(struct sys_device *dev, |
678 | struct sysdev_attribute *attr, char *buf) | ||
675 | { | 679 | { |
676 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | 680 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; |
677 | 681 | ||
678 | return sprintf(buf, "0x%x\n", uci->rev); | 682 | return sprintf(buf, "0x%x\n", uci->rev); |
679 | } | 683 | } |
680 | 684 | ||
681 | static ssize_t pf_show(struct sys_device *dev, char *buf) | 685 | static ssize_t pf_show(struct sys_device *dev, |
686 | struct sysdev_attribute *attr, char *buf) | ||
682 | { | 687 | { |
683 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | 688 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; |
684 | 689 | ||
@@ -805,6 +810,9 @@ static int __init microcode_init (void) | |||
805 | { | 810 | { |
806 | int error; | 811 | int error; |
807 | 812 | ||
813 | printk(KERN_INFO | ||
814 | "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); | ||
815 | |||
808 | error = microcode_dev_init(); | 816 | error = microcode_dev_init(); |
809 | if (error) | 817 | if (error) |
810 | return error; | 818 | return error; |
@@ -825,9 +833,6 @@ static int __init microcode_init (void) | |||
825 | } | 833 | } |
826 | 834 | ||
827 | register_hotcpu_notifier(&mc_cpu_notifier); | 835 | register_hotcpu_notifier(&mc_cpu_notifier); |
828 | |||
829 | printk(KERN_INFO | ||
830 | "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); | ||
831 | return 0; | 836 | return 0; |
832 | } | 837 | } |
833 | 838 | ||
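The new parameters on reload_store(), version_show() and pf_show() above track the sysdev core now passing the attribute pointer into show/store callbacks, which lets one callback serve several attributes. A sketch of the updated convention, assuming the same era's <linux/sysdev.h>; the attribute and function names are illustrative:

	#include <linux/sysdev.h>

	static ssize_t example_show(struct sys_device *dev,
				    struct sysdev_attribute *attr, char *buf)
	{
		/* attr identifies which attribute is being read */
		return sprintf(buf, "%s\n", attr->attr.name);
	}

	static SYSDEV_ATTR(example, 0444, example_show, NULL);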
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index edc5fbfe85c0..fdfdc550b366 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <asm/io.h> | 12 | #include <asm/io.h> |
13 | #include <asm/msr.h> | 13 | #include <asm/msr.h> |
14 | #include <asm/acpi.h> | 14 | #include <asm/acpi.h> |
15 | #include <asm/mmconfig.h> | ||
15 | 16 | ||
16 | #include "../pci/pci.h" | 17 | #include "../pci/pci.h" |
17 | 18 | ||
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index a888e67f5874..6ba87830d4b1 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
23 | #include <linux/string.h> | 23 | #include <linux/string.h> |
24 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
25 | #include <linux/mm.h> | ||
25 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
26 | #include <linux/bug.h> | 27 | #include <linux/bug.h> |
27 | 28 | ||
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr, | |||
150 | const Elf_Shdr *sechdrs, | 151 | const Elf_Shdr *sechdrs, |
151 | struct module *me) | 152 | struct module *me) |
152 | { | 153 | { |
153 | const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; | 154 | const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, |
155 | *para = NULL; | ||
154 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | 156 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; |
155 | 157 | ||
156 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | 158 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { |
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr, | |||
160 | alt = s; | 162 | alt = s; |
161 | if (!strcmp(".smp_locks", secstrings + s->sh_name)) | 163 | if (!strcmp(".smp_locks", secstrings + s->sh_name)) |
162 | locks= s; | 164 | locks= s; |
165 | if (!strcmp(".parainstructions", secstrings + s->sh_name)) | ||
166 | para = s; | ||
163 | } | 167 | } |
164 | 168 | ||
165 | if (alt) { | 169 | if (alt) { |
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr, | |||
175 | tseg, tseg + text->sh_size); | 179 | tseg, tseg + text->sh_size); |
176 | } | 180 | } |
177 | 181 | ||
182 | if (para) { | ||
183 | void *pseg = (void *)para->sh_addr; | ||
184 | apply_paravirt(pseg, pseg + para->sh_size); | ||
185 | } | ||
186 | |||
178 | return module_bug_finalize(hdr, sechdrs, me); | 187 | return module_bug_finalize(hdr, sechdrs, me); |
179 | } | 188 | } |
180 | 189 | ||
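module_finalize() above locates the module's .parainstructions section by name and hands it to apply_paravirt(), the same scan-by-name pattern already used for .text, .altinstructions and .smp_locks. A generic sketch of that lookup (the helper name find_section is illustrative; error handling omitted):

	static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
					    const Elf_Shdr *sechdrs,
					    const char *name)
	{
		const char *secstrings =
			(const char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
		const Elf_Shdr *s;

		for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++)
			if (!strcmp(name, secstrings + s->sh_name))
				return s;
		return NULL;
	}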
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 404683b94e79..6ae005ccaed8 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -25,6 +25,9 @@ | |||
25 | #include <asm/proto.h> | 25 | #include <asm/proto.h> |
26 | #include <asm/acpi.h> | 26 | #include <asm/acpi.h> |
27 | #include <asm/bios_ebda.h> | 27 | #include <asm/bios_ebda.h> |
28 | #include <asm/e820.h> | ||
29 | #include <asm/trampoline.h> | ||
30 | #include <asm/setup.h> | ||
28 | 31 | ||
29 | #include <mach_apic.h> | 32 | #include <mach_apic.h> |
30 | #ifdef CONFIG_X86_32 | 33 | #ifdef CONFIG_X86_32 |
@@ -32,28 +35,6 @@ | |||
32 | #include <mach_mpparse.h> | 35 | #include <mach_mpparse.h> |
33 | #endif | 36 | #endif |
34 | 37 | ||
35 | /* Have we found an MP table */ | ||
36 | int smp_found_config; | ||
37 | |||
38 | /* | ||
39 | * Various Linux-internal data structures created from the | ||
40 | * MP-table. | ||
41 | */ | ||
42 | #if defined (CONFIG_MCA) || defined (CONFIG_EISA) | ||
43 | int mp_bus_id_to_type[MAX_MP_BUSSES]; | ||
44 | #endif | ||
45 | |||
46 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | ||
47 | int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 }; | ||
48 | |||
49 | static int mp_current_pci_id; | ||
50 | |||
51 | int pic_mode; | ||
52 | |||
53 | /* | ||
54 | * Intel MP BIOS table parsing routines: | ||
55 | */ | ||
56 | |||
57 | /* | 38 | /* |
58 | * Checksum an MP configuration block. | 39 | * Checksum an MP configuration block. |
59 | */ | 40 | */ |
@@ -68,18 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) | |||
68 | return sum & 0xFF; | 49 | return sum & 0xFF; |
69 | } | 50 | } |
70 | 51 | ||
71 | #ifdef CONFIG_X86_NUMAQ | ||
72 | /* | ||
73 | * Have to match translation table entries to main table entries by counter | ||
74 | * hence the mpc_record variable .... can't see a less disgusting way of | ||
75 | * doing this .... | ||
76 | */ | ||
77 | |||
78 | static int mpc_record; | ||
79 | static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] | ||
80 | __cpuinitdata; | ||
81 | #endif | ||
82 | |||
83 | static void __cpuinit MP_processor_info(struct mpc_config_processor *m) | 52 | static void __cpuinit MP_processor_info(struct mpc_config_processor *m) |
84 | { | 53 | { |
85 | int apicid; | 54 | int apicid; |
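A note on mpf_checksum(), defined just above: the MP specification requires all bytes of a table, checksum byte included, to sum to 0 mod 256, so a return value of 0 means "valid". A table edited in place can be re-sealed accordingly; this sketch mirrors the checksum update replace_intsrc_all() performs later in this file:

	static void __init mp_table_reseal(struct mp_config_table *mpc)
	{
		/* zero the old checksum, then store the value that makes
		 * the byte sum of the whole table wrap to zero */
		mpc->mpc_checksum = 0;
		mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
						  mpc->mpc_length);
	}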
@@ -89,11 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) | |||
89 | disabled_cpus++; | 58 | disabled_cpus++; |
90 | return; | 59 | return; |
91 | } | 60 | } |
92 | #ifdef CONFIG_X86_NUMAQ | 61 | |
93 | apicid = mpc_apic_id(m, translation_table[mpc_record]); | 62 | if (x86_quirks->mpc_apic_id) |
94 | #else | 63 | apicid = x86_quirks->mpc_apic_id(m); |
95 | apicid = m->mpc_apicid; | 64 | else |
96 | #endif | 65 | apicid = m->mpc_apicid; |
66 | |||
97 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | 67 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { |
98 | bootup_cpu = " (Bootup-CPU)"; | 68 | bootup_cpu = " (Bootup-CPU)"; |
99 | boot_cpu_physical_apicid = m->mpc_apicid; | 69 | boot_cpu_physical_apicid = m->mpc_apicid; |
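The x86_quirks calls above (and throughout the rest of this file) replace compile-time #ifdef CONFIG_X86_NUMAQ blocks with runtime hooks a subarchitecture can fill in. A trimmed sketch of the hook table as this file assumes it; the real struct x86_quirks in <asm/setup.h> carries further members, and only those used here are shown:

	struct x86_quirks {
		int (*mach_get_smp_config)(unsigned int early);
		int (*mach_find_smp_config)(unsigned int reserve);

		int *mpc_record;
		int (*mpc_apic_id)(struct mpc_config_processor *m);
		void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
		void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
		void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
					 unsigned short oemsize);
	};

A NUMAQ-style subarch supplies an instance, and the generic parser calls through whichever hooks are non-NULL, falling back to the default behaviour otherwise.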
@@ -103,18 +73,17 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) | |||
103 | generic_processor_info(apicid, m->mpc_apicver); | 73 | generic_processor_info(apicid, m->mpc_apicver); |
104 | } | 74 | } |
105 | 75 | ||
76 | #ifdef CONFIG_X86_IO_APIC | ||
106 | static void __init MP_bus_info(struct mpc_config_bus *m) | 77 | static void __init MP_bus_info(struct mpc_config_bus *m) |
107 | { | 78 | { |
108 | char str[7]; | 79 | char str[7]; |
109 | |||
110 | memcpy(str, m->mpc_bustype, 6); | 80 | memcpy(str, m->mpc_bustype, 6); |
111 | str[6] = 0; | 81 | str[6] = 0; |
112 | 82 | ||
113 | #ifdef CONFIG_X86_NUMAQ | 83 | if (x86_quirks->mpc_oem_bus_info) |
114 | mpc_oem_bus_info(m, str, translation_table[mpc_record]); | 84 | x86_quirks->mpc_oem_bus_info(m, str); |
115 | #else | 85 | else |
116 | Dprintk("Bus #%d is %s\n", m->mpc_busid, str); | 86 | printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); |
117 | #endif | ||
118 | 87 | ||
119 | #if MAX_MP_BUSSES < 256 | 88 | #if MAX_MP_BUSSES < 256 |
120 | if (m->mpc_busid >= MAX_MP_BUSSES) { | 89 | if (m->mpc_busid >= MAX_MP_BUSSES) { |
@@ -131,12 +100,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m) | |||
131 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; | 100 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; |
132 | #endif | 101 | #endif |
133 | } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { | 102 | } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { |
134 | #ifdef CONFIG_X86_NUMAQ | 103 | if (x86_quirks->mpc_oem_pci_bus) |
135 | mpc_oem_pci_bus(m, translation_table[mpc_record]); | 104 | x86_quirks->mpc_oem_pci_bus(m); |
136 | #endif | 105 | |
137 | clear_bit(m->mpc_busid, mp_bus_not_pci); | 106 | clear_bit(m->mpc_busid, mp_bus_not_pci); |
138 | mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | ||
139 | mp_current_pci_id++; | ||
140 | #if defined(CONFIG_EISA) || defined (CONFIG_MCA) | 107 | #if defined(CONFIG_EISA) || defined (CONFIG_MCA) |
141 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; | 108 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; |
142 | } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { | 109 | } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { |
@@ -147,6 +114,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) | |||
147 | } else | 114 | } else |
148 | printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); | 115 | printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); |
149 | } | 116 | } |
117 | #endif | ||
150 | 118 | ||
151 | #ifdef CONFIG_X86_IO_APIC | 119 | #ifdef CONFIG_X86_IO_APIC |
152 | 120 | ||
@@ -176,117 +144,111 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) | |||
176 | if (bad_ioapic(m->mpc_apicaddr)) | 144 | if (bad_ioapic(m->mpc_apicaddr)) |
177 | return; | 145 | return; |
178 | 146 | ||
179 | mp_ioapics[nr_ioapics] = *m; | 147 | mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr; |
148 | mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid; | ||
149 | mp_ioapics[nr_ioapics].mp_type = m->mpc_type; | ||
150 | mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver; | ||
151 | mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags; | ||
180 | nr_ioapics++; | 152 | nr_ioapics++; |
181 | } | 153 | } |
182 | 154 | ||
183 | static void __init MP_intsrc_info(struct mpc_config_intsrc *m) | 155 | static void print_MP_intsrc_info(struct mpc_config_intsrc *m) |
184 | { | 156 | { |
185 | mp_irqs[mp_irq_entries] = *m; | 157 | printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," |
186 | Dprintk("Int: type %d, pol %d, trig %d, bus %d," | ||
187 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | 158 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", |
188 | m->mpc_irqtype, m->mpc_irqflag & 3, | 159 | m->mpc_irqtype, m->mpc_irqflag & 3, |
189 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | 160 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, |
190 | m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | 161 | m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); |
191 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
192 | panic("Max # of irq sources exceeded!!\n"); | ||
193 | } | 162 | } |
194 | 163 | ||
195 | #endif | 164 | static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) |
196 | |||
197 | static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) | ||
198 | { | 165 | { |
199 | Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | 166 | printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," |
200 | " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | 167 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", |
201 | m->mpc_irqtype, m->mpc_irqflag & 3, | 168 | mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, |
202 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, | 169 | (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, |
203 | m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | 170 | mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); |
204 | } | 171 | } |
205 | 172 | ||
206 | #ifdef CONFIG_X86_NUMAQ | 173 | static void __init assign_to_mp_irq(struct mpc_config_intsrc *m, |
207 | static void __init MP_translation_info(struct mpc_config_translation *m) | 174 | struct mp_config_intsrc *mp_irq) |
208 | { | 175 | { |
209 | printk(KERN_INFO | 176 | mp_irq->mp_dstapic = m->mpc_dstapic; |
210 | "Translation: record %d, type %d, quad %d, global %d, local %d\n", | 177 | mp_irq->mp_type = m->mpc_type; |
211 | mpc_record, m->trans_type, m->trans_quad, m->trans_global, | 178 | mp_irq->mp_irqtype = m->mpc_irqtype; |
212 | m->trans_local); | 179 | mp_irq->mp_irqflag = m->mpc_irqflag; |
180 | mp_irq->mp_srcbus = m->mpc_srcbus; | ||
181 | mp_irq->mp_srcbusirq = m->mpc_srcbusirq; | ||
182 | mp_irq->mp_dstirq = m->mpc_dstirq; | ||
183 | } | ||
213 | 184 | ||
214 | if (mpc_record >= MAX_MPC_ENTRY) | 185 | static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, |
215 | printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); | 186 | struct mpc_config_intsrc *m) |
216 | else | 187 | { |
217 | translation_table[mpc_record] = m; /* stash this for later */ | 188 | m->mpc_dstapic = mp_irq->mp_dstapic; |
218 | if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) | 189 | m->mpc_type = mp_irq->mp_type; |
219 | node_set_online(m->trans_quad); | 190 | m->mpc_irqtype = mp_irq->mp_irqtype; |
191 | m->mpc_irqflag = mp_irq->mp_irqflag; | ||
192 | m->mpc_srcbus = mp_irq->mp_srcbus; | ||
193 | m->mpc_srcbusirq = mp_irq->mp_srcbusirq; | ||
194 | m->mpc_dstirq = mp_irq->mp_dstirq; | ||
220 | } | 195 | } |
221 | 196 | ||
222 | /* | 197 | static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, |
223 | * Read/parse the MPC oem tables | 198 | struct mpc_config_intsrc *m) |
224 | */ | 199 | { |
200 | if (mp_irq->mp_dstapic != m->mpc_dstapic) | ||
201 | return 1; | ||
202 | if (mp_irq->mp_type != m->mpc_type) | ||
203 | return 2; | ||
204 | if (mp_irq->mp_irqtype != m->mpc_irqtype) | ||
205 | return 3; | ||
206 | if (mp_irq->mp_irqflag != m->mpc_irqflag) | ||
207 | return 4; | ||
208 | if (mp_irq->mp_srcbus != m->mpc_srcbus) | ||
209 | return 5; | ||
210 | if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq) | ||
211 | return 6; | ||
212 | if (mp_irq->mp_dstirq != m->mpc_dstirq) | ||
213 | return 7; | ||
214 | |||
215 | return 0; | ||
216 | } | ||
225 | 217 | ||
226 | static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, | 218 | static void __init MP_intsrc_info(struct mpc_config_intsrc *m) |
227 | unsigned short oemsize) | ||
228 | { | 219 | { |
229 | int count = sizeof(*oemtable); /* the header size */ | 220 | int i; |
230 | unsigned char *oemptr = ((unsigned char *)oemtable) + count; | 221 | |
231 | 222 | print_MP_intsrc_info(m); | |
232 | mpc_record = 0; | 223 | |
233 | printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", | 224 | for (i = 0; i < mp_irq_entries; i++) { |
234 | oemtable); | 225 | if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) |
235 | if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { | 226 | return; |
236 | printk(KERN_WARNING | ||
237 | "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", | ||
238 | oemtable->oem_signature[0], oemtable->oem_signature[1], | ||
239 | oemtable->oem_signature[2], oemtable->oem_signature[3]); | ||
240 | return; | ||
241 | } | ||
242 | if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { | ||
243 | printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); | ||
244 | return; | ||
245 | } | ||
246 | while (count < oemtable->oem_length) { | ||
247 | switch (*oemptr) { | ||
248 | case MP_TRANSLATION: | ||
249 | { | ||
250 | struct mpc_config_translation *m = | ||
251 | (struct mpc_config_translation *)oemptr; | ||
252 | MP_translation_info(m); | ||
253 | oemptr += sizeof(*m); | ||
254 | count += sizeof(*m); | ||
255 | ++mpc_record; | ||
256 | break; | ||
257 | } | ||
258 | default: | ||
259 | { | ||
260 | printk(KERN_WARNING | ||
261 | "Unrecognised OEM table entry type! - %d\n", | ||
262 | (int)*oemptr); | ||
263 | return; | ||
264 | } | ||
265 | } | ||
266 | } | 227 | } |
228 | |||
229 | assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); | ||
230 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
231 | panic("Max # of irq sources exceeded!!\n"); | ||
267 | } | 232 | } |
268 | 233 | ||
269 | static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, | 234 | #endif |
270 | char *productid) | 235 | |
236 | static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) | ||
271 | { | 237 | { |
272 | if (strncmp(oem, "IBM NUMA", 8)) | 238 | printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," |
273 | printk("Warning! May not be a NUMA-Q system!\n"); | 239 | " IRQ %02x, APIC ID %x, APIC LINT %02x\n", |
274 | if (mpc->mpc_oemptr) | 240 | m->mpc_irqtype, m->mpc_irqflag & 3, |
275 | smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, | 241 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, |
276 | mpc->mpc_oemsize); | 242 | m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); |
277 | } | 243 | } |
278 | #endif /* CONFIG_X86_NUMAQ */ | ||
279 | 244 | ||
280 | /* | 245 | /* |
281 | * Read/parse the MPC | 246 | * Read/parse the MPC |
282 | */ | 247 | */ |
283 | 248 | ||
284 | static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) | 249 | static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem, |
250 | char *str) | ||
285 | { | 251 | { |
286 | char str[16]; | ||
287 | char oem[10]; | ||
288 | int count = sizeof(*mpc); | ||
289 | unsigned char *mpt = ((unsigned char *)mpc) + count; | ||
290 | 252 | ||
291 | if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { | 253 | if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { |
292 | printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", | 254 | printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", |
@@ -309,19 +271,41 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) | |||
309 | } | 271 | } |
310 | memcpy(oem, mpc->mpc_oem, 8); | 272 | memcpy(oem, mpc->mpc_oem, 8); |
311 | oem[8] = 0; | 273 | oem[8] = 0; |
312 | printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); | 274 | printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); |
313 | 275 | ||
314 | memcpy(str, mpc->mpc_productid, 12); | 276 | memcpy(str, mpc->mpc_productid, 12); |
315 | str[12] = 0; | 277 | str[12] = 0; |
316 | printk("Product ID: %s ", str); | ||
317 | 278 | ||
318 | #ifdef CONFIG_X86_32 | 279 | printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); |
319 | mps_oem_check(mpc, oem, str); | ||
320 | #endif | ||
321 | printk(KERN_INFO "MPTABLE: Product ID: %s ", str); | ||
322 | 280 | ||
323 | printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); | 281 | printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); |
324 | 282 | ||
283 | return 1; | ||
284 | } | ||
285 | |||
286 | static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) | ||
287 | { | ||
288 | char str[16]; | ||
289 | char oem[10]; | ||
290 | |||
291 | int count = sizeof(*mpc); | ||
292 | unsigned char *mpt = ((unsigned char *)mpc) + count; | ||
293 | |||
294 | if (!smp_check_mpc(mpc, oem, str)) | ||
295 | return 0; | ||
296 | |||
297 | #ifdef CONFIG_X86_32 | ||
298 | /* | ||
299 | * need to make sure summit and es7000's mps_oem_check is safe to be | ||
300 | * called early via genericarch 's mps_oem_check | ||
301 | */ | ||
302 | if (early) { | ||
303 | #ifdef CONFIG_X86_NUMAQ | ||
304 | numaq_mps_oem_check(mpc, oem, str); | ||
305 | #endif | ||
306 | } else | ||
307 | mps_oem_check(mpc, oem, str); | ||
308 | #endif | ||
325 | /* save the local APIC address, it might be non-default */ | 309 | /* save the local APIC address, it might be non-default */ |
326 | if (!acpi_lapic) | 310 | if (!acpi_lapic) |
327 | mp_lapic_addr = mpc->mpc_lapic; | 311 | mp_lapic_addr = mpc->mpc_lapic; |
@@ -329,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) | |||
329 | if (early) | 313 | if (early) |
330 | return 1; | 314 | return 1; |
331 | 315 | ||
316 | if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { | ||
317 | struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; | ||
318 | x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); | ||
319 | } | ||
320 | |||
332 | /* | 321 | /* |
333 | * Now process the configuration blocks. | 322 | * Now process the configuration blocks. |
334 | */ | 323 | */ |
335 | #ifdef CONFIG_X86_NUMAQ | 324 | if (x86_quirks->mpc_record) |
336 | mpc_record = 0; | 325 | *x86_quirks->mpc_record = 0; |
337 | #endif | 326 | |
338 | while (count < mpc->mpc_length) { | 327 | while (count < mpc->mpc_length) { |
339 | switch (*mpt) { | 328 | switch (*mpt) { |
340 | case MP_PROCESSOR: | 329 | case MP_PROCESSOR: |
@@ -352,7 +341,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) | |||
352 | { | 341 | { |
353 | struct mpc_config_bus *m = | 342 | struct mpc_config_bus *m = |
354 | (struct mpc_config_bus *)mpt; | 343 | (struct mpc_config_bus *)mpt; |
344 | #ifdef CONFIG_X86_IO_APIC | ||
355 | MP_bus_info(m); | 345 | MP_bus_info(m); |
346 | #endif | ||
356 | mpt += sizeof(*m); | 347 | mpt += sizeof(*m); |
357 | count += sizeof(*m); | 348 | count += sizeof(*m); |
358 | break; | 349 | break; |
@@ -398,10 +389,14 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) | |||
398 | count = mpc->mpc_length; | 389 | count = mpc->mpc_length; |
399 | break; | 390 | break; |
400 | } | 391 | } |
401 | #ifdef CONFIG_X86_NUMAQ | 392 | if (x86_quirks->mpc_record) |
402 | ++mpc_record; | 393 | (*x86_quirks->mpc_record)++; |
403 | #endif | ||
404 | } | 394 | } |
395 | |||
396 | #ifdef CONFIG_X86_GENERICARCH | ||
397 | generic_bigsmp_probe(); | ||
398 | #endif | ||
399 | |||
405 | setup_apic_routing(); | 400 | setup_apic_routing(); |
406 | if (!num_processors) | 401 | if (!num_processors) |
407 | printk(KERN_ERR "MPTABLE: no processors registered!\n"); | 402 | printk(KERN_ERR "MPTABLE: no processors registered!\n"); |
@@ -427,7 +422,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) | |||
427 | intsrc.mpc_type = MP_INTSRC; | 422 | intsrc.mpc_type = MP_INTSRC; |
428 | intsrc.mpc_irqflag = 0; /* conforming */ | 423 | intsrc.mpc_irqflag = 0; /* conforming */ |
429 | intsrc.mpc_srcbus = 0; | 424 | intsrc.mpc_srcbus = 0; |
430 | intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | 425 | intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid; |
431 | 426 | ||
432 | intsrc.mpc_irqtype = mp_INT; | 427 | intsrc.mpc_irqtype = mp_INT; |
433 | 428 | ||
@@ -488,40 +483,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) | |||
488 | MP_intsrc_info(&intsrc); | 483 | MP_intsrc_info(&intsrc); |
489 | } | 484 | } |
490 | 485 | ||
491 | #endif | ||
492 | 486 | ||
493 | static inline void __init construct_default_ISA_mptable(int mpc_default_type) | 487 | static void construct_ioapic_table(int mpc_default_type) |
494 | { | 488 | { |
495 | struct mpc_config_processor processor; | ||
496 | struct mpc_config_bus bus; | ||
497 | #ifdef CONFIG_X86_IO_APIC | ||
498 | struct mpc_config_ioapic ioapic; | 489 | struct mpc_config_ioapic ioapic; |
499 | #endif | 490 | struct mpc_config_bus bus; |
500 | struct mpc_config_lintsrc lintsrc; | ||
501 | int linttypes[2] = { mp_ExtINT, mp_NMI }; | ||
502 | int i; | ||
503 | |||
504 | /* | ||
505 | * local APIC has default address | ||
506 | */ | ||
507 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
508 | |||
509 | /* | ||
510 | * 2 CPUs, numbered 0 & 1. | ||
511 | */ | ||
512 | processor.mpc_type = MP_PROCESSOR; | ||
513 | /* Either an integrated APIC or a discrete 82489DX. */ | ||
514 | processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
515 | processor.mpc_cpuflag = CPU_ENABLED; | ||
516 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
517 | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; | ||
518 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
519 | processor.mpc_reserved[0] = 0; | ||
520 | processor.mpc_reserved[1] = 0; | ||
521 | for (i = 0; i < 2; i++) { | ||
522 | processor.mpc_apicid = i; | ||
523 | MP_processor_info(&processor); | ||
524 | } | ||
525 | 491 | ||
526 | bus.mpc_type = MP_BUS; | 492 | bus.mpc_type = MP_BUS; |
527 | bus.mpc_busid = 0; | 493 | bus.mpc_busid = 0; |
@@ -550,7 +516,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) | |||
550 | MP_bus_info(&bus); | 516 | MP_bus_info(&bus); |
551 | } | 517 | } |
552 | 518 | ||
553 | #ifdef CONFIG_X86_IO_APIC | ||
554 | ioapic.mpc_type = MP_IOAPIC; | 519 | ioapic.mpc_type = MP_IOAPIC; |
555 | ioapic.mpc_apicid = 2; | 520 | ioapic.mpc_apicid = 2; |
556 | ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | 521 | ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; |
@@ -562,7 +527,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) | |||
562 | * We set up most of the low 16 IO-APIC pins according to MPS rules. | 527 | * We set up most of the low 16 IO-APIC pins according to MPS rules. |
563 | */ | 528 | */ |
564 | construct_default_ioirq_mptable(mpc_default_type); | 529 | construct_default_ioirq_mptable(mpc_default_type); |
530 | } | ||
531 | #else | ||
532 | static inline void construct_ioapic_table(int mpc_default_type) { } | ||
565 | #endif | 533 | #endif |
534 | |||
535 | static inline void __init construct_default_ISA_mptable(int mpc_default_type) | ||
536 | { | ||
537 | struct mpc_config_processor processor; | ||
538 | struct mpc_config_lintsrc lintsrc; | ||
539 | int linttypes[2] = { mp_ExtINT, mp_NMI }; | ||
540 | int i; | ||
541 | |||
542 | /* | ||
543 | * local APIC has default address | ||
544 | */ | ||
545 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
546 | |||
547 | /* | ||
548 | * 2 CPUs, numbered 0 & 1. | ||
549 | */ | ||
550 | processor.mpc_type = MP_PROCESSOR; | ||
551 | /* Either an integrated APIC or a discrete 82489DX. */ | ||
552 | processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
553 | processor.mpc_cpuflag = CPU_ENABLED; | ||
554 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
555 | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; | ||
556 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
557 | processor.mpc_reserved[0] = 0; | ||
558 | processor.mpc_reserved[1] = 0; | ||
559 | for (i = 0; i < 2; i++) { | ||
560 | processor.mpc_apicid = i; | ||
561 | MP_processor_info(&processor); | ||
562 | } | ||
563 | |||
564 | construct_ioapic_table(mpc_default_type); | ||
565 | |||
566 | lintsrc.mpc_type = MP_LINTSRC; | 566 | lintsrc.mpc_type = MP_LINTSRC; |
567 | lintsrc.mpc_irqflag = 0; /* conforming */ | 567 | lintsrc.mpc_irqflag = 0; /* conforming */ |
568 | lintsrc.mpc_srcbusid = 0; | 568 | lintsrc.mpc_srcbusid = 0; |
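The construct_ioapic_table() split above uses a standard kernel idiom: the real implementation lives under the config option and an empty inline stub takes its place otherwise, so call sites such as construct_default_ISA_mptable() stay free of #ifdefs. The general shape, with a hypothetical CONFIG_FOO:

	#ifdef CONFIG_FOO
	void foo_setup(int arg);			/* real version, in foo.c */
	#else
	static inline void foo_setup(int arg) { }	/* compiles away */
	#endif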
@@ -580,10 +580,14 @@ static struct intel_mp_floating *mpf_found; | |||
580 | /* | 580 | /* |
581 | * Scan the memory blocks for an SMP configuration block. | 581 | * Scan the memory blocks for an SMP configuration block. |
582 | */ | 582 | */ |
583 | static void __init __get_smp_config(unsigned early) | 583 | static void __init __get_smp_config(unsigned int early) |
584 | { | 584 | { |
585 | struct intel_mp_floating *mpf = mpf_found; | 585 | struct intel_mp_floating *mpf = mpf_found; |
586 | 586 | ||
587 | if (x86_quirks->mach_get_smp_config) { | ||
588 | if (x86_quirks->mach_get_smp_config(early)) | ||
589 | return; | ||
590 | } | ||
587 | if (acpi_lapic && early) | 591 | if (acpi_lapic && early) |
588 | return; | 592 | return; |
589 | /* | 593 | /* |
@@ -600,7 +604,7 @@ static void __init __get_smp_config(unsigned early) | |||
600 | 604 | ||
601 | printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", | 605 | printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", |
602 | mpf->mpf_specification); | 606 | mpf->mpf_specification); |
603 | #ifdef CONFIG_X86_32 | 607 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) |
604 | if (mpf->mpf_feature2 & (1 << 7)) { | 608 | if (mpf->mpf_feature2 & (1 << 7)) { |
605 | printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); | 609 | printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); |
606 | pic_mode = 1; | 610 | pic_mode = 1; |
@@ -632,7 +636,9 @@ static void __init __get_smp_config(unsigned early) | |||
632 | * override the defaults. | 636 | * override the defaults. |
633 | */ | 637 | */ |
634 | if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { | 638 | if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { |
639 | #ifdef CONFIG_X86_LOCAL_APIC | ||
635 | smp_found_config = 0; | 640 | smp_found_config = 0; |
641 | #endif | ||
636 | printk(KERN_ERR | 642 | printk(KERN_ERR |
637 | "BIOS bug, MP table errors detected!...\n"); | 643 | "BIOS bug, MP table errors detected!...\n"); |
638 | printk(KERN_ERR "... disabling SMP support. " | 644 | printk(KERN_ERR "... disabling SMP support. " |
@@ -689,7 +695,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, | |||
689 | unsigned int *bp = phys_to_virt(base); | 695 | unsigned int *bp = phys_to_virt(base); |
690 | struct intel_mp_floating *mpf; | 696 | struct intel_mp_floating *mpf; |
691 | 697 | ||
692 | Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); | 698 | printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); |
693 | BUILD_BUG_ON(sizeof(*mpf) != 16); | 699 | BUILD_BUG_ON(sizeof(*mpf) != 16); |
694 | 700 | ||
695 | while (length > 0) { | 701 | while (length > 0) { |
@@ -699,15 +705,21 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, | |||
699 | !mpf_checksum((unsigned char *)bp, 16) && | 705 | !mpf_checksum((unsigned char *)bp, 16) && |
700 | ((mpf->mpf_specification == 1) | 706 | ((mpf->mpf_specification == 1) |
701 | || (mpf->mpf_specification == 4))) { | 707 | || (mpf->mpf_specification == 4))) { |
702 | 708 | #ifdef CONFIG_X86_LOCAL_APIC | |
703 | smp_found_config = 1; | 709 | smp_found_config = 1; |
710 | #endif | ||
704 | mpf_found = mpf; | 711 | mpf_found = mpf; |
705 | #ifdef CONFIG_X86_32 | 712 | |
706 | printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", | 713 | printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", |
707 | mpf, virt_to_phys(mpf)); | 714 | mpf, virt_to_phys(mpf)); |
708 | reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, | 715 | |
716 | if (!reserve) | ||
717 | return 1; | ||
718 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, | ||
709 | BOOTMEM_DEFAULT); | 719 | BOOTMEM_DEFAULT); |
710 | if (mpf->mpf_physptr) { | 720 | if (mpf->mpf_physptr) { |
721 | unsigned long size = PAGE_SIZE; | ||
722 | #ifdef CONFIG_X86_32 | ||
711 | /* | 723 | /* |
712 | * We cannot access to MPC table to compute | 724 | * We cannot access to MPC table to compute |
713 | * table size yet, as only few megabytes from | 725 | * table size yet, as only few megabytes from |
@@ -717,24 +729,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, | |||
717 | * PAGE_SIZE from mpg->mpf_physptr yields BUG() | 729 | * PAGE_SIZE from mpg->mpf_physptr yields BUG() |
718 | * in reserve_bootmem. | 730 | * in reserve_bootmem. |
719 | */ | 731 | */ |
720 | unsigned long size = PAGE_SIZE; | ||
721 | unsigned long end = max_low_pfn * PAGE_SIZE; | 732 | unsigned long end = max_low_pfn * PAGE_SIZE; |
722 | if (mpf->mpf_physptr + size > end) | 733 | if (mpf->mpf_physptr + size > end) |
723 | size = end - mpf->mpf_physptr; | 734 | size = end - mpf->mpf_physptr; |
724 | reserve_bootmem(mpf->mpf_physptr, size, | 735 | #endif |
736 | reserve_bootmem_generic(mpf->mpf_physptr, size, | ||
725 | BOOTMEM_DEFAULT); | 737 | BOOTMEM_DEFAULT); |
726 | } | 738 | } |
727 | 739 | ||
728 | #else | 740 | return 1; |
729 | if (!reserve) | ||
730 | return 1; | ||
731 | |||
732 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); | ||
733 | if (mpf->mpf_physptr) | ||
734 | reserve_bootmem_generic(mpf->mpf_physptr, | ||
735 | PAGE_SIZE); | ||
736 | #endif | ||
737 | return 1; | ||
738 | } | 741 | } |
739 | bp += 4; | 742 | bp += 4; |
740 | length -= 16; | 743 | length -= 16; |
@@ -742,10 +745,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, | |||
742 | return 0; | 745 | return 0; |
743 | } | 746 | } |
744 | 747 | ||
745 | static void __init __find_smp_config(unsigned reserve) | 748 | static void __init __find_smp_config(unsigned int reserve) |
746 | { | 749 | { |
747 | unsigned int address; | 750 | unsigned int address; |
748 | 751 | ||
752 | if (x86_quirks->mach_find_smp_config) { | ||
753 | if (x86_quirks->mach_find_smp_config(reserve)) | ||
754 | return; | ||
755 | } | ||
749 | /* | 756 | /* |
750 | * FIXME: Linux assumes you have 640K of base ram.. | 757 | * FIXME: Linux assumes you have 640K of base ram.. |
751 | * this continues the error... | 758 | * this continues the error... |
@@ -790,298 +797,294 @@ void __init find_smp_config(void) | |||
790 | __find_smp_config(1); | 797 | __find_smp_config(1); |
791 | } | 798 | } |
792 | 799 | ||
793 | /* -------------------------------------------------------------------------- | 800 | #ifdef CONFIG_X86_IO_APIC |
794 | ACPI-based MP Configuration | 801 | static u8 __initdata irq_used[MAX_IRQ_SOURCES]; |
795 | -------------------------------------------------------------------------- */ | ||
796 | 802 | ||
797 | /* | 803 | static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m) |
798 | * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: | 804 | { |
799 | */ | 805 | int i; |
800 | int es7000_plat; | ||
801 | 806 | ||
802 | #ifdef CONFIG_ACPI | 807 | if (m->mpc_irqtype != mp_INT) |
808 | return 0; | ||
803 | 809 | ||
804 | #ifdef CONFIG_X86_IO_APIC | 810 | if (m->mpc_irqflag != 0x0f) |
811 | return 0; | ||
805 | 812 | ||
806 | #define MP_ISA_BUS 0 | 813 | /* not legacy */ |
807 | 814 | ||
808 | extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; | 815 | for (i = 0; i < mp_irq_entries; i++) { |
816 | if (mp_irqs[i].mp_irqtype != mp_INT) | ||
817 | continue; | ||
809 | 818 | ||
810 | static int mp_find_ioapic(int gsi) | 819 | if (mp_irqs[i].mp_irqflag != 0x0f) |
811 | { | 820 | continue; |
812 | int i = 0; | ||
813 | 821 | ||
814 | /* Find the IOAPIC that manages this GSI. */ | 822 | if (mp_irqs[i].mp_srcbus != m->mpc_srcbus) |
815 | for (i = 0; i < nr_ioapics; i++) { | 823 | continue; |
816 | if ((gsi >= mp_ioapic_routing[i].gsi_base) | 824 | if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq) |
817 | && (gsi <= mp_ioapic_routing[i].gsi_end)) | 825 | continue; |
818 | return i; | 826 | if (irq_used[i]) { |
827 | /* already claimed */ | ||
828 | return -2; | ||
829 | } | ||
830 | irq_used[i] = 1; | ||
831 | return i; | ||
819 | } | 832 | } |
820 | 833 | ||
821 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | 834 | /* not found */ |
822 | return -1; | 835 | return -1; |
823 | } | 836 | } |
824 | 837 | ||
825 | static u8 __init uniq_ioapic_id(u8 id) | 838 | #define SPARE_SLOT_NUM 20 |
826 | { | 839 | |
827 | #ifdef CONFIG_X86_32 | 840 | static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; |
828 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && | ||
829 | !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
830 | return io_apic_get_unique_id(nr_ioapics, id); | ||
831 | else | ||
832 | return id; | ||
833 | #else | ||
834 | int i; | ||
835 | DECLARE_BITMAP(used, 256); | ||
836 | bitmap_zero(used, 256); | ||
837 | for (i = 0; i < nr_ioapics; i++) { | ||
838 | struct mpc_config_ioapic *ia = &mp_ioapics[i]; | ||
839 | __set_bit(ia->mpc_apicid, used); | ||
840 | } | ||
841 | if (!test_bit(id, used)) | ||
842 | return id; | ||
843 | return find_first_zero_bit(used, 256); | ||
844 | #endif | 841 | #endif |
845 | } | ||
846 | 842 | ||
847 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | 843 | static int __init replace_intsrc_all(struct mp_config_table *mpc, |
844 | unsigned long mpc_new_phys, | ||
845 | unsigned long mpc_new_length) | ||
848 | { | 846 | { |
849 | int idx = 0; | 847 | #ifdef CONFIG_X86_IO_APIC |
850 | 848 | int i; | |
851 | if (bad_ioapic(address)) | 849 | int nr_m_spare = 0; |
852 | return; | 850 | #endif |
853 | 851 | ||
854 | idx = nr_ioapics; | 852 | int count = sizeof(*mpc); |
853 | unsigned char *mpt = ((unsigned char *)mpc) + count; | ||
855 | 854 | ||
856 | mp_ioapics[idx].mpc_type = MP_IOAPIC; | 855 | printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length); |
857 | mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | 856 | while (count < mpc->mpc_length) { |
858 | mp_ioapics[idx].mpc_apicaddr = address; | 857 | switch (*mpt) { |
858 | case MP_PROCESSOR: | ||
859 | { | ||
860 | struct mpc_config_processor *m = | ||
861 | (struct mpc_config_processor *)mpt; | ||
862 | mpt += sizeof(*m); | ||
863 | count += sizeof(*m); | ||
864 | break; | ||
865 | } | ||
866 | case MP_BUS: | ||
867 | { | ||
868 | struct mpc_config_bus *m = | ||
869 | (struct mpc_config_bus *)mpt; | ||
870 | mpt += sizeof(*m); | ||
871 | count += sizeof(*m); | ||
872 | break; | ||
873 | } | ||
874 | case MP_IOAPIC: | ||
875 | { | ||
876 | mpt += sizeof(struct mpc_config_ioapic); | ||
877 | count += sizeof(struct mpc_config_ioapic); | ||
878 | break; | ||
879 | } | ||
880 | case MP_INTSRC: | ||
881 | { | ||
882 | #ifdef CONFIG_X86_IO_APIC | ||
883 | struct mpc_config_intsrc *m = | ||
884 | (struct mpc_config_intsrc *)mpt; | ||
859 | 885 | ||
860 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | 886 | printk(KERN_INFO "OLD "); |
861 | mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); | 887 | print_MP_intsrc_info(m); |
862 | #ifdef CONFIG_X86_32 | 888 | i = get_MP_intsrc_index(m); |
863 | mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); | 889 | if (i > 0) { |
864 | #else | 890 | assign_to_mpc_intsrc(&mp_irqs[i], m); |
865 | mp_ioapics[idx].mpc_apicver = 0; | 891 | printk(KERN_INFO "NEW "); |
892 | print_mp_irq_info(&mp_irqs[i]); | ||
893 | } else if (!i) { | ||
894 | /* legacy, do nothing */ | ||
895 | } else if (nr_m_spare < SPARE_SLOT_NUM) { | ||
896 | /* | ||
897 | * not found (-1), or duplicated (-2) | ||
898 | * are invalid entries, | ||
899 | * we need to use the slot later | ||
900 | */ | ||
901 | m_spare[nr_m_spare] = m; | ||
902 | nr_m_spare++; | ||
903 | } | ||
866 | #endif | 904 | #endif |
867 | /* | 905 | mpt += sizeof(struct mpc_config_intsrc); |
868 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups | 906 | count += sizeof(struct mpc_config_intsrc); |
869 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). | 907 | break; |
870 | */ | 908 | } |
871 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | 909 | case MP_LINTSRC: |
872 | mp_ioapic_routing[idx].gsi_base = gsi_base; | 910 | { |
873 | mp_ioapic_routing[idx].gsi_end = gsi_base + | 911 | struct mpc_config_lintsrc *m = |
874 | io_apic_get_redir_entries(idx); | 912 | (struct mpc_config_lintsrc *)mpt; |
875 | 913 | mpt += sizeof(*m); | |
876 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | 914 | count += sizeof(*m); |
877 | "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | 915 | break; |
878 | mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | 916 | } |
879 | mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); | 917 | default: |
880 | 918 | /* wrong mptable */ | |
881 | nr_ioapics++; | 919 | printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); |
882 | } | 920 | printk(KERN_ERR "type %x\n", *mpt); |
921 | print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, | ||
922 | 1, mpc, mpc->mpc_length, 1); | ||
923 | goto out; | ||
924 | } | ||
925 | } | ||
883 | 926 | ||
884 | void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) | 927 | #ifdef CONFIG_X86_IO_APIC |
885 | { | 928 | for (i = 0; i < mp_irq_entries; i++) { |
886 | struct mpc_config_intsrc intsrc; | 929 | if (irq_used[i]) |
887 | int ioapic = -1; | 930 | continue; |
888 | int pin = -1; | ||
889 | 931 | ||
890 | /* | 932 | if (mp_irqs[i].mp_irqtype != mp_INT) |
891 | * Convert 'gsi' to 'ioapic.pin'. | 933 | continue; |
892 | */ | ||
893 | ioapic = mp_find_ioapic(gsi); | ||
894 | if (ioapic < 0) | ||
895 | return; | ||
896 | pin = gsi - mp_ioapic_routing[ioapic].gsi_base; | ||
897 | 934 | ||
898 | /* | 935 | if (mp_irqs[i].mp_irqflag != 0x0f) |
899 | * TBD: This check is for faulty timer entries, where the override | 936 | continue; |
900 | * erroneously sets the trigger to level, resulting in a HUGE | ||
901 | * increase of timer interrupts! | ||
902 | */ | ||
903 | if ((bus_irq == 0) && (trigger == 3)) | ||
904 | trigger = 1; | ||
905 | 937 | ||
906 | intsrc.mpc_type = MP_INTSRC; | 938 | if (nr_m_spare > 0) { |
907 | intsrc.mpc_irqtype = mp_INT; | 939 | printk(KERN_INFO "*NEW* found "); |
908 | intsrc.mpc_irqflag = (trigger << 2) | polarity; | 940 | nr_m_spare--; |
909 | intsrc.mpc_srcbus = MP_ISA_BUS; | 941 | assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); |
910 | intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | 942 | m_spare[nr_m_spare] = NULL; |
911 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | 943 | } else { |
912 | intsrc.mpc_dstirq = pin; /* INTIN# */ | 944 | struct mpc_config_intsrc *m = |
945 | (struct mpc_config_intsrc *)mpt; | ||
946 | count += sizeof(struct mpc_config_intsrc); | ||
947 | if (!mpc_new_phys) { | ||
948 | printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); | ||
949 | } else { | ||
950 | if (count <= mpc_new_length) | ||
951 | printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count); | ||
952 | else { | ||
953 | printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length); | ||
954 | goto out; | ||
955 | } | ||
956 | } | ||
957 | assign_to_mpc_intsrc(&mp_irqs[i], m); | ||
958 | mpc->mpc_length = count; | ||
959 | mpt += sizeof(struct mpc_config_intsrc); | ||
960 | } | ||
961 | print_mp_irq_info(&mp_irqs[i]); | ||
962 | } | ||
963 | #endif | ||
964 | out: | ||
965 | /* update checksum */ | ||
966 | mpc->mpc_checksum = 0; | ||
967 | mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc, | ||
968 | mpc->mpc_length); | ||
913 | 969 | ||
914 | MP_intsrc_info(&intsrc); | 970 | return 0; |
915 | } | 971 | } |
916 | 972 | ||
917 | void __init mp_config_acpi_legacy_irqs(void) | 973 | static int __initdata enable_update_mptable; |
918 | { | ||
919 | struct mpc_config_intsrc intsrc; | ||
920 | int i = 0; | ||
921 | int ioapic = -1; | ||
922 | 974 | ||
923 | #if defined (CONFIG_MCA) || defined (CONFIG_EISA) | 975 | static int __init update_mptable_setup(char *str) |
924 | /* | 976 | { |
925 | * Fabricate the legacy ISA bus (bus #31). | 977 | enable_update_mptable = 1; |
926 | */ | 978 | return 0; |
927 | mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; | 979 | } |
928 | #endif | 980 | early_param("update_mptable", update_mptable_setup); |
929 | set_bit(MP_ISA_BUS, mp_bus_not_pci); | ||
930 | Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); | ||
931 | 981 | ||
932 | /* | 982 | static unsigned long __initdata mpc_new_phys; |
933 | * Older generations of ES7000 have no legacy identity mappings | 983 | static unsigned long mpc_new_length __initdata = 4096; |
934 | */ | ||
935 | if (es7000_plat == 1) | ||
936 | return; | ||
937 | 984 | ||
938 | /* | 985 | /* alloc_mptable or alloc_mptable=4k */ |
939 | * Locate the IOAPIC that manages the ISA IRQs (0-15). | 986 | static int __initdata alloc_mptable; |
940 | */ | 987 | static int __init parse_alloc_mptable_opt(char *p) |
941 | ioapic = mp_find_ioapic(0); | 988 | { |
942 | if (ioapic < 0) | 989 | enable_update_mptable = 1; |
943 | return; | 990 | alloc_mptable = 1; |
991 | if (!p) | ||
992 | return 0; | ||
993 | mpc_new_length = memparse(p, &p); | ||
994 | return 0; | ||
995 | } | ||
996 | early_param("alloc_mptable", parse_alloc_mptable_opt); | ||
944 | 997 | ||
945 | intsrc.mpc_type = MP_INTSRC; | 998 | void __init early_reserve_e820_mpc_new(void) |
946 | intsrc.mpc_irqflag = 0; /* Conforming */ | 999 | { |
947 | intsrc.mpc_srcbus = MP_ISA_BUS; | 1000 | if (enable_update_mptable && alloc_mptable) { |
948 | #ifdef CONFIG_X86_IO_APIC | 1001 | u64 startt = 0; |
949 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | 1002 | #ifdef CONFIG_X86_TRAMPOLINE |
1003 | startt = TRAMPOLINE_BASE; | ||
950 | #endif | 1004 | #endif |
951 | /* | 1005 | mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); |
952 | * Use the default configuration for the IRQs 0-15, unless | ||
953 | * overridden by (MADT) interrupt source override entries. | ||
954 | */ | ||
955 | for (i = 0; i < 16; i++) { | ||
956 | int idx; | ||
957 | |||
958 | for (idx = 0; idx < mp_irq_entries; idx++) { | ||
959 | struct mpc_config_intsrc *irq = mp_irqs + idx; | ||
960 | |||
961 | /* Do we already have a mapping for this ISA IRQ? */ | ||
962 | if (irq->mpc_srcbus == MP_ISA_BUS | ||
963 | && irq->mpc_srcbusirq == i) | ||
964 | break; | ||
965 | |||
966 | /* Do we already have a mapping for this IOAPIC pin */ | ||
967 | if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | ||
968 | (irq->mpc_dstirq == i)) | ||
969 | break; | ||
970 | } | ||
971 | |||
972 | if (idx != mp_irq_entries) { | ||
973 | printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | ||
974 | continue; /* IRQ already used */ | ||
975 | } | ||
976 | |||
977 | intsrc.mpc_irqtype = mp_INT; | ||
978 | intsrc.mpc_srcbusirq = i; /* Identity mapped */ | ||
979 | intsrc.mpc_dstirq = i; | ||
980 | |||
981 | MP_intsrc_info(&intsrc); | ||
982 | } | 1006 | } |
983 | } | 1007 | } |
984 | 1008 | ||
985 | int mp_register_gsi(u32 gsi, int triggering, int polarity) | 1009 | static int __init update_mp_table(void) |
986 | { | 1010 | { |
987 | int ioapic; | 1011 | char str[16]; |
988 | int ioapic_pin; | 1012 | char oem[10]; |
989 | #ifdef CONFIG_X86_32 | 1013 | struct intel_mp_floating *mpf; |
990 | #define MAX_GSI_NUM 4096 | 1014 | struct mp_config_table *mpc; |
991 | #define IRQ_COMPRESSION_START 64 | 1015 | struct mp_config_table *mpc_new; |
1016 | |||
1017 | if (!enable_update_mptable) | ||
1018 | return 0; | ||
1019 | |||
1020 | mpf = mpf_found; | ||
1021 | if (!mpf) | ||
1022 | return 0; | ||
992 | 1023 | ||
993 | static int pci_irq = IRQ_COMPRESSION_START; | ||
994 | /* | 1024 | /* |
995 | * Mapping between Global System Interrupts, which | 1025 | * Now see if we need to go further. |
996 | * represent all possible interrupts, and IRQs | ||
997 | * assigned to actual devices. | ||
998 | */ | 1026 | */ |
999 | static int gsi_to_irq[MAX_GSI_NUM]; | 1027 | if (mpf->mpf_feature1 != 0) |
1000 | #else | 1028 | return 0; |
1001 | 1029 | ||
1002 | if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) | 1030 | if (!mpf->mpf_physptr) |
1003 | return gsi; | 1031 | return 0; |
1004 | #endif | ||
1005 | 1032 | ||
1006 | /* Don't set up the ACPI SCI because it's already set up */ | 1033 | mpc = phys_to_virt(mpf->mpf_physptr); |
1007 | if (acpi_gbl_FADT.sci_interrupt == gsi) | ||
1008 | return gsi; | ||
1009 | 1034 | ||
1010 | ioapic = mp_find_ioapic(gsi); | 1035 | if (!smp_check_mpc(mpc, oem, str)) |
1011 | if (ioapic < 0) { | 1036 | return 0; |
1012 | printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | ||
1013 | return gsi; | ||
1014 | } | ||
1015 | 1037 | ||
1016 | ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; | 1038 | printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); |
1039 | printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); | ||
1017 | 1040 | ||
1018 | #ifdef CONFIG_X86_32 | 1041 | if (mpc_new_phys && mpc->mpc_length > mpc_new_length) { |
1019 | if (ioapic_renumber_irq) | 1042 | mpc_new_phys = 0; |
1020 | gsi = ioapic_renumber_irq(ioapic, gsi); | 1043 | printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", |
1021 | #endif | 1044 | mpc_new_length); |
1022 | |||
1023 | /* | ||
1024 | * Avoid pin reprogramming. PRTs typically include entries | ||
1025 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
1026 | * we only program the IOAPIC on the first. | ||
1027 | */ | ||
1028 | if (ioapic_pin > MP_MAX_IOAPIC_PIN) { | ||
1029 | printk(KERN_ERR "Invalid reference to IOAPIC pin " | ||
1030 | "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | ||
1031 | ioapic_pin); | ||
1032 | return gsi; | ||
1033 | } | 1045 | } |
1034 | if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { | 1046 | |
1035 | Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | 1047 | if (!mpc_new_phys) { |
1036 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | 1048 | unsigned char old, new; |
1037 | #ifdef CONFIG_X86_32 | 1049 | /* check if we can change the position */ |
1038 | return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); | 1050 | mpc->mpc_checksum = 0; |
1039 | #else | 1051 | old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); |
1040 | return gsi; | 1052 | mpc->mpc_checksum = 0xff; |
1041 | #endif | 1053 | new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); |
1054 | if (old == new) { | ||
1055 | printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); | ||
1056 | return 0; | ||
1057 | } | ||
1058 | printk(KERN_INFO "use in-position replacing\n"); | ||
1059 | } else { | ||
1060 | mpf->mpf_physptr = mpc_new_phys; | ||
1061 | mpc_new = phys_to_virt(mpc_new_phys); | ||
1062 | memcpy(mpc_new, mpc, mpc->mpc_length); | ||
1063 | mpc = mpc_new; | ||
1064 | /* check if we can modify that */ | ||
1065 | if (mpc_new_phys - mpf->mpf_physptr) { | ||
1066 | struct intel_mp_floating *mpf_new; | ||
1067 | /* steal 16 bytes from [0, 1k) */ | ||
1068 | printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); | ||
1069 | mpf_new = phys_to_virt(0x400 - 16); | ||
1070 | memcpy(mpf_new, mpf, 16); | ||
1071 | mpf = mpf_new; | ||
1072 | mpf->mpf_physptr = mpc_new_phys; | ||
1073 | } | ||
1074 | mpf->mpf_checksum = 0; | ||
1075 | mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); | ||
1076 | printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); | ||
1042 | } | 1077 | } |
1043 | 1078 | ||
1044 | set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); | ||
1045 | #ifdef CONFIG_X86_32 | ||
1046 | /* | 1079 | /* |
1047 | * For GSI >= 64, use IRQ compression | 1080 | * only replace the one with mp_INT and |
1081 | * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, | ||
1082 | * already in mp_irqs, stored by ... and mp_config_acpi_gsi, | ||
1083 | * may need pci=routeirq for all coverage | ||
1048 | */ | 1084 | */ |
1049 | if ((gsi >= IRQ_COMPRESSION_START) | 1085 | replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); |
1050 | && (triggering == ACPI_LEVEL_SENSITIVE)) { | 1086 | |
1051 | /* | 1087 | return 0; |
1052 | * For PCI devices assign IRQs in order, avoiding gaps | ||
1053 | * due to unused I/O APIC pins. | ||
1054 | */ | ||
1055 | int irq = gsi; | ||
1056 | if (gsi < MAX_GSI_NUM) { | ||
1057 | /* | ||
1058 | * Retain the VIA chipset work-around (gsi > 15), but | ||
1059 | * avoid a problem where the 8254 timer (IRQ0) is setup | ||
1060 | * via an override (so it's not on pin 0 of the ioapic), | ||
1061 | * and at the same time, the pin 0 interrupt is a PCI | ||
1062 | * type. The gsi > 15 test could cause these two pins | ||
1063 | * to be shared as IRQ0, and they are not shareable. | ||
1064 | * So test for this condition, and if necessary, avoid | ||
1065 | * the pin collision. | ||
1066 | */ | ||
1067 | gsi = pci_irq++; | ||
1068 | /* | ||
1069 | * Don't assign IRQ used by ACPI SCI | ||
1070 | */ | ||
1071 | if (gsi == acpi_gbl_FADT.sci_interrupt) | ||
1072 | gsi = pci_irq++; | ||
1073 | gsi_to_irq[irq] = gsi; | ||
1074 | } else { | ||
1075 | printk(KERN_ERR "GSI %u is too high\n", gsi); | ||
1076 | return gsi; | ||
1077 | } | ||
1078 | } | ||
1079 | #endif | ||
1080 | io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | ||
1081 | triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, | ||
1082 | polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
1083 | return gsi; | ||
1084 | } | 1088 | } |
1085 | 1089 | ||
1086 | #endif /* CONFIG_X86_IO_APIC */ | 1090 | late_initcall(update_mp_table); |
1087 | #endif /* CONFIG_ACPI */ | ||
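
The checksum updates scattered through this hunk all follow the same MP-table convention: zero the checksum field, then store the negated byte sum, so every byte of the structure sums to 0 mod 256 and verifiers such as smp_check_mpc() accept the table. A stand-alone sketch of that invariant, assuming a toy table and a hypothetical checksum offset rather than the kernel's real layout:

#include <stdio.h>

/* Byte-sum checksum as used by the MP floating pointer and config table:
 * the stored checksum makes the sum of every byte, itself included,
 * come out to 0 mod 256. */
static unsigned char byte_sum(const unsigned char *p, int len)
{
        unsigned int sum = 0;

        while (len--)
                sum += *p++;
        return sum & 0xff;
}

int main(void)
{
        unsigned char table[16] = { 'P', 'C', 'M', 'P', 16 }; /* toy header */
        int csum_off = 7;                  /* hypothetical checksum offset */

        table[csum_off] = 0;
        table[csum_off] -= byte_sum(table, sizeof(table));

        /* Prints 0, which is what a table verifier would check for. */
        printf("byte sum mod 256 = %d\n", byte_sum(table, sizeof(table)));
        return 0;
}

In the patch itself, booting with update_mptable enables the rewrite pass, and alloc_mptable[=size] additionally reserves room for a fresh copy of the table via early_reserve_e820().
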
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1f3abe048e93..9fd809552447 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file) | |||
117 | { | 117 | { |
118 | unsigned int cpu = iminor(file->f_path.dentry->d_inode); | 118 | unsigned int cpu = iminor(file->f_path.dentry->d_inode); |
119 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 119 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
120 | int ret = 0; | ||
120 | 121 | ||
121 | if (cpu >= NR_CPUS || !cpu_online(cpu)) | 122 | lock_kernel(); |
122 | return -ENXIO; /* No such CPU */ | 123 | cpu = iminor(file->f_path.dentry->d_inode); |
123 | if (!cpu_has(c, X86_FEATURE_MSR)) | ||
124 | return -EIO; /* MSR not supported */ | ||
125 | 124 | ||
125 | if (cpu >= NR_CPUS || !cpu_online(cpu)) { | ||
126 | ret = -ENXIO; /* No such CPU */ | ||
127 | goto out; | ||
128 | } | ||
129 | c = &cpu_data(cpu); | ||
130 | if (!cpu_has(c, X86_FEATURE_MSR)) | ||
131 | ret = -EIO; /* MSR not supported */ | ||
132 | out: | ||
133 | unlock_kernel(); | ||
126 | return 0; | 134 | return ret; |
127 | } | 135 | } |
128 | 136 | ||
@@ -141,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu) | |||
141 | { | 149 | { |
142 | struct device *dev; | 150 | struct device *dev; |
143 | 151 | ||
144 | dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), | 152 | dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), |
145 | "msr%d", cpu); | 153 | NULL, "msr%d", cpu); |
146 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 154 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
147 | } | 155 | } |
148 | 156 | ||
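
For context, the msr%d character devices registered above conventionally appear as /dev/cpu/<n>/msr, where a read at file offset R returns the 64-bit contents of MSR R on that CPU. A minimal user-space sketch; root privileges and the architectural TSC MSR at index 0x10 are assumptions here, not part of this patch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/cpu/0/msr");
                return 1;
        }
        /* The msr driver maps the file offset to the MSR index. */
        if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val)) {
                perror("pread");
                close(fd);
                return 1;
        }
        printf("cpu0 TSC: %llu\n", (unsigned long long)val);
        close(fd);
        return 0;
}
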
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi.c index 5a29ded994fa..ac6d51222e7d 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi.c | |||
@@ -6,10 +6,13 @@ | |||
6 | * Fixes: | 6 | * Fixes: |
7 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | 7 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. |
8 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | 8 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. |
9 | * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. | ||
9 | * Pavel Machek and | 10 | * Pavel Machek and |
10 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | 11 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. |
11 | */ | 12 | */ |
12 | 13 | ||
14 | #include <asm/apic.h> | ||
15 | |||
13 | #include <linux/nmi.h> | 16 | #include <linux/nmi.h> |
14 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
15 | #include <linux/delay.h> | 18 | #include <linux/delay.h> |
@@ -17,20 +20,26 @@ | |||
17 | #include <linux/module.h> | 20 | #include <linux/module.h> |
18 | #include <linux/sysdev.h> | 21 | #include <linux/sysdev.h> |
19 | #include <linux/sysctl.h> | 22 | #include <linux/sysctl.h> |
23 | #include <linux/percpu.h> | ||
20 | #include <linux/kprobes.h> | 24 | #include <linux/kprobes.h> |
21 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
26 | #include <linux/kernel_stat.h> | ||
22 | #include <linux/kdebug.h> | 27 | #include <linux/kdebug.h> |
28 | #include <linux/smp.h> | ||
23 | 29 | ||
30 | #include <asm/i8259.h> | ||
31 | #include <asm/io_apic.h> | ||
24 | #include <asm/smp.h> | 32 | #include <asm/smp.h> |
25 | #include <asm/nmi.h> | 33 | #include <asm/nmi.h> |
26 | #include <asm/proto.h> | 34 | #include <asm/proto.h> |
35 | #include <asm/timer.h> | ||
36 | |||
27 | #include <asm/mce.h> | 37 | #include <asm/mce.h> |
28 | 38 | ||
29 | #include <mach_traps.h> | 39 | #include <mach_traps.h> |
30 | 40 | ||
31 | int unknown_nmi_panic; | 41 | int unknown_nmi_panic; |
32 | int nmi_watchdog_enabled; | 42 | int nmi_watchdog_enabled; |
33 | int panic_on_unrecovered_nmi; | ||
34 | 43 | ||
35 | static cpumask_t backtrace_mask = CPU_MASK_NONE; | 44 | static cpumask_t backtrace_mask = CPU_MASK_NONE; |
36 | 45 | ||
@@ -41,37 +50,65 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; | |||
41 | * 0: the lapic NMI watchdog is disabled, but can be enabled | 50 | * 0: the lapic NMI watchdog is disabled, but can be enabled |
42 | */ | 51 | */ |
43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | 52 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ |
53 | EXPORT_SYMBOL(nmi_active); | ||
54 | |||
55 | unsigned int nmi_watchdog = NMI_NONE; | ||
56 | EXPORT_SYMBOL(nmi_watchdog); | ||
57 | |||
44 | static int panic_on_timeout; | 58 | static int panic_on_timeout; |
45 | 59 | ||
46 | unsigned int nmi_watchdog = NMI_DEFAULT; | ||
47 | static unsigned int nmi_hz = HZ; | 60 | static unsigned int nmi_hz = HZ; |
48 | |||
49 | static DEFINE_PER_CPU(short, wd_enabled); | 61 | static DEFINE_PER_CPU(short, wd_enabled); |
62 | static int endflag __initdata; | ||
50 | 63 | ||
51 | /* Run after command line and cpu_init init, but before all other checks */ | 64 | static inline unsigned int get_nmi_count(int cpu) |
52 | void nmi_watchdog_default(void) | ||
53 | { | 65 | { |
54 | if (nmi_watchdog != NMI_DEFAULT) | 66 | #ifdef CONFIG_X86_64 |
55 | return; | 67 | return cpu_pda(cpu)->__nmi_count; |
56 | nmi_watchdog = NMI_NONE; | 68 | #else |
69 | return nmi_count(cpu); | ||
70 | #endif | ||
71 | } | ||
72 | |||
73 | static inline int mce_in_progress(void) | ||
74 | { | ||
75 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) | ||
76 | return atomic_read(&mce_entry) > 0; | ||
77 | #endif | ||
78 | return 0; | ||
57 | } | 79 | } |
58 | 80 | ||
59 | static int endflag __initdata = 0; | 81 | /* |
82 | * Take the local apic timer and PIT/HPET into account. We don't | ||
83 | * know which one is active, when we have highres/dyntick on | ||
84 | */ | ||
85 | static inline unsigned int get_timer_irqs(int cpu) | ||
86 | { | ||
87 | #ifdef CONFIG_X86_64 | ||
88 | return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); | ||
89 | #else | ||
90 | return per_cpu(irq_stat, cpu).apic_timer_irqs + | ||
91 | per_cpu(irq_stat, cpu).irq0_irqs; | ||
92 | #endif | ||
93 | } | ||
60 | 94 | ||
61 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP |
62 | /* The performance counters used by NMI_LOCAL_APIC don't trigger when | 96 | /* |
97 | * The performance counters used by NMI_LOCAL_APIC don't trigger when | ||
63 | * the CPU is idle. To make sure the NMI watchdog really ticks on all | 98 | * the CPU is idle. To make sure the NMI watchdog really ticks on all |
64 | * CPUs during the test make them busy. | 99 | * CPUs during the test make them busy. |
65 | */ | 100 | */ |
66 | static __init void nmi_cpu_busy(void *data) | 101 | static __init void nmi_cpu_busy(void *data) |
67 | { | 102 | { |
68 | local_irq_enable_in_hardirq(); | 103 | local_irq_enable_in_hardirq(); |
69 | /* Intentionally don't use cpu_relax here. This is | 104 | /* |
70 | to make sure that the performance counter really ticks, | 105 | * Intentionally don't use cpu_relax here. This is |
71 | even if there is a simulator or similar that catches the | 106 | * to make sure that the performance counter really ticks, |
72 | pause instruction. On a real HT machine this is fine because | 107 | * even if there is a simulator or similar that catches the |
73 | all other CPUs are busy with "useless" delay loops and don't | 108 | * pause instruction. On a real HT machine this is fine because |
74 | care if they get somewhat less cycles. */ | 109 | * all other CPUs are busy with "useless" delay loops and don't |
110 | * care if they get somewhat less cycles. | ||
111 | */ | ||
75 | while (endflag == 0) | 112 | while (endflag == 0) |
76 | mb(); | 113 | mb(); |
77 | } | 114 | } |
@@ -79,40 +116,37 @@ static __init void nmi_cpu_busy(void *data) | |||
79 | 116 | ||
80 | int __init check_nmi_watchdog(void) | 117 | int __init check_nmi_watchdog(void) |
81 | { | 118 | { |
82 | int *prev_nmi_count; | 119 | unsigned int *prev_nmi_count; |
83 | int cpu; | 120 | int cpu; |
84 | 121 | ||
85 | if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) | 122 | if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) |
86 | return 0; | ||
87 | |||
88 | if (!atomic_read(&nmi_active)) | ||
89 | return 0; | 123 | return 0; |
90 | 124 | ||
91 | prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); | 125 | prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); |
92 | if (!prev_nmi_count) | 126 | if (!prev_nmi_count) |
93 | return -1; | 127 | goto error; |
94 | 128 | ||
95 | printk(KERN_INFO "Testing NMI watchdog ... "); | 129 | printk(KERN_INFO "Testing NMI watchdog ... "); |
96 | 130 | ||
97 | #ifdef CONFIG_SMP | 131 | #ifdef CONFIG_SMP |
98 | if (nmi_watchdog == NMI_LOCAL_APIC) | 132 | if (nmi_watchdog == NMI_LOCAL_APIC) |
99 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); | 133 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); |
100 | #endif | 134 | #endif |
101 | 135 | ||
102 | for (cpu = 0; cpu < NR_CPUS; cpu++) | 136 | for_each_possible_cpu(cpu) |
103 | prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; | 137 | prev_nmi_count[cpu] = get_nmi_count(cpu); |
104 | local_irq_enable(); | 138 | local_irq_enable(); |
105 | mdelay((20*1000)/nmi_hz); // wait 20 ticks | 139 | mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ |
106 | 140 | ||
107 | for_each_online_cpu(cpu) { | 141 | for_each_online_cpu(cpu) { |
108 | if (!per_cpu(wd_enabled, cpu)) | 142 | if (!per_cpu(wd_enabled, cpu)) |
109 | continue; | 143 | continue; |
110 | if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { | 144 | if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { |
111 | printk(KERN_WARNING "WARNING: CPU#%d: NMI " | 145 | printk(KERN_WARNING "WARNING: CPU#%d: NMI " |
112 | "appears to be stuck (%d->%d)!\n", | 146 | "appears to be stuck (%d->%d)!\n", |
113 | cpu, | 147 | cpu, |
114 | prev_nmi_count[cpu], | 148 | prev_nmi_count[cpu], |
115 | cpu_pda(cpu)->__nmi_count); | 149 | get_nmi_count(cpu)); |
116 | per_cpu(wd_enabled, cpu) = 0; | 150 | per_cpu(wd_enabled, cpu) = 0; |
117 | atomic_dec(&nmi_active); | 151 | atomic_dec(&nmi_active); |
118 | } | 152 | } |
@@ -121,24 +155,33 @@ int __init check_nmi_watchdog(void) | |||
121 | if (!atomic_read(&nmi_active)) { | 155 | if (!atomic_read(&nmi_active)) { |
122 | kfree(prev_nmi_count); | 156 | kfree(prev_nmi_count); |
123 | atomic_set(&nmi_active, -1); | 157 | atomic_set(&nmi_active, -1); |
124 | return -1; | 158 | goto error; |
125 | } | 159 | } |
126 | printk("OK.\n"); | 160 | printk("OK.\n"); |
127 | 161 | ||
128 | /* now that we know it works we can reduce NMI frequency to | 162 | /* |
129 | something more reasonable; makes a difference in some configs */ | 163 | * now that we know it works we can reduce NMI frequency to |
164 | * something more reasonable; makes a difference in some configs | ||
165 | */ | ||
130 | if (nmi_watchdog == NMI_LOCAL_APIC) | 166 | if (nmi_watchdog == NMI_LOCAL_APIC) |
131 | nmi_hz = lapic_adjust_nmi_hz(1); | 167 | nmi_hz = lapic_adjust_nmi_hz(1); |
132 | 168 | ||
133 | kfree(prev_nmi_count); | 169 | kfree(prev_nmi_count); |
134 | return 0; | 170 | return 0; |
171 | error: | ||
172 | if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) | ||
173 | disable_8259A_irq(0); | ||
174 | #ifdef CONFIG_X86_32 | ||
175 | timer_ack = 0; | ||
176 | #endif | ||
177 | return -1; | ||
135 | } | 178 | } |
136 | 179 | ||
137 | static int __init setup_nmi_watchdog(char *str) | 180 | static int __init setup_nmi_watchdog(char *str) |
138 | { | 181 | { |
139 | int nmi; | 182 | unsigned int nmi; |
140 | 183 | ||
141 | if (!strncmp(str,"panic",5)) { | 184 | if (!strncmp(str, "panic", 5)) { |
142 | panic_on_timeout = 1; | 185 | panic_on_timeout = 1; |
143 | str = strchr(str, ','); | 186 | str = strchr(str, ','); |
144 | if (!str) | 187 | if (!str) |
@@ -148,15 +191,17 @@ static int __init setup_nmi_watchdog(char *str) | |||
148 | 191 | ||
149 | get_option(&str, &nmi); | 192 | get_option(&str, &nmi); |
150 | 193 | ||
151 | if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) | 194 | if (nmi >= NMI_INVALID) |
152 | return 0; | 195 | return 0; |
153 | 196 | ||
154 | nmi_watchdog = nmi; | 197 | nmi_watchdog = nmi; |
155 | return 1; | 198 | return 1; |
156 | } | 199 | } |
157 | |||
158 | __setup("nmi_watchdog=", setup_nmi_watchdog); | 200 | __setup("nmi_watchdog=", setup_nmi_watchdog); |
159 | 201 | ||
202 | /* | ||
203 | * Suspend/resume support | ||
204 | */ | ||
160 | #ifdef CONFIG_PM | 205 | #ifdef CONFIG_PM |
161 | 206 | ||
162 | static int nmi_pm_active; /* nmi_active before suspend */ | 207 | static int nmi_pm_active; /* nmi_active before suspend */ |
@@ -195,7 +240,8 @@ static int __init init_lapic_nmi_sysfs(void) | |||
195 | { | 240 | { |
196 | int error; | 241 | int error; |
197 | 242 | ||
198 | /* should really be a BUG_ON but b/c this is an | 243 | /* |
244 | * should really be a BUG_ON but b/c this is an | ||
199 | * init call, it just doesn't work. -dcz | 245 | * init call, it just doesn't work. -dcz |
200 | */ | 246 | */ |
201 | if (nmi_watchdog != NMI_LOCAL_APIC) | 247 | if (nmi_watchdog != NMI_LOCAL_APIC) |
@@ -209,6 +255,7 @@ static int __init init_lapic_nmi_sysfs(void) | |||
209 | error = sysdev_register(&device_lapic_nmi); | 255 | error = sysdev_register(&device_lapic_nmi); |
210 | return error; | 256 | return error; |
211 | } | 257 | } |
258 | |||
212 | /* must come after the local APIC's device_initcall() */ | 259 | /* must come after the local APIC's device_initcall() */ |
213 | late_initcall(init_lapic_nmi_sysfs); | 260 | late_initcall(init_lapic_nmi_sysfs); |
214 | 261 | ||
@@ -225,7 +272,7 @@ static void __acpi_nmi_enable(void *__unused) | |||
225 | void acpi_nmi_enable(void) | 272 | void acpi_nmi_enable(void) |
226 | { | 273 | { |
227 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | 274 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) |
228 | on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); | 275 | on_each_cpu(__acpi_nmi_enable, NULL, 1); |
229 | } | 276 | } |
230 | 277 | ||
231 | static void __acpi_nmi_disable(void *__unused) | 278 | static void __acpi_nmi_disable(void *__unused) |
@@ -239,7 +286,7 @@ static void __acpi_nmi_disable(void *__unused) | |||
239 | void acpi_nmi_disable(void) | 286 | void acpi_nmi_disable(void) |
240 | { | 287 | { |
241 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | 288 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) |
242 | on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); | 289 | on_each_cpu(__acpi_nmi_disable, NULL, 1); |
243 | } | 290 | } |
244 | 291 | ||
245 | void setup_apic_nmi_watchdog(void *unused) | 292 | void setup_apic_nmi_watchdog(void *unused) |
@@ -249,11 +296,12 @@ void setup_apic_nmi_watchdog(void *unused) | |||
249 | 296 | ||
250 | /* cheap hack to support suspend/resume */ | 297 | /* cheap hack to support suspend/resume */ |
251 | /* if cpu0 is not active neither should the other cpus */ | 298 | /* if cpu0 is not active neither should the other cpus */ |
252 | if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) | 299 | if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) |
253 | return; | 300 | return; |
254 | 301 | ||
255 | switch (nmi_watchdog) { | 302 | switch (nmi_watchdog) { |
256 | case NMI_LOCAL_APIC: | 303 | case NMI_LOCAL_APIC: |
304 | /* enable it first to avoid a race with the handler */ | ||
257 | __get_cpu_var(wd_enabled) = 1; | 305 | __get_cpu_var(wd_enabled) = 1; |
258 | if (lapic_watchdog_init(nmi_hz) < 0) { | 306 | if (lapic_watchdog_init(nmi_hz) < 0) { |
259 | __get_cpu_var(wd_enabled) = 0; | 307 | __get_cpu_var(wd_enabled) = 0; |
@@ -269,9 +317,8 @@ void setup_apic_nmi_watchdog(void *unused) | |||
269 | void stop_apic_nmi_watchdog(void *unused) | 317 | void stop_apic_nmi_watchdog(void *unused) |
270 | { | 318 | { |
271 | /* only support LOCAL and IO APICs for now */ | 319 | /* only support LOCAL and IO APICs for now */ |
272 | if ((nmi_watchdog != NMI_LOCAL_APIC) && | 320 | if (!nmi_watchdog_active()) |
273 | (nmi_watchdog != NMI_IO_APIC)) | 321 | return; |
274 | return; | ||
275 | if (__get_cpu_var(wd_enabled) == 0) | 322 | if (__get_cpu_var(wd_enabled) == 0) |
276 | return; | 323 | return; |
277 | if (nmi_watchdog == NMI_LOCAL_APIC) | 324 | if (nmi_watchdog == NMI_LOCAL_APIC) |
@@ -287,6 +334,11 @@ void stop_apic_nmi_watchdog(void *unused) | |||
287 | * | 334 | * |
288 | * as these watchdog NMI IRQs are generated on every CPU, we only | 335 | * as these watchdog NMI IRQs are generated on every CPU, we only |
289 | * have to check the current processor. | 336 | * have to check the current processor. |
337 | * | ||
338 | * since NMIs don't listen to _any_ locks, we have to be extremely | ||
339 | * careful not to rely on unsafe variables. The printk might lock | ||
340 | * up though, so we have to break up any console locks first ... | ||
341 | * [when there will be more tty-related locks, break them up here too!] | ||
290 | */ | 342 | */ |
291 | 343 | ||
292 | static DEFINE_PER_CPU(unsigned, last_irq_sum); | 344 | static DEFINE_PER_CPU(unsigned, last_irq_sum); |
@@ -295,11 +347,11 @@ static DEFINE_PER_CPU(int, nmi_touch); | |||
295 | 347 | ||
296 | void touch_nmi_watchdog(void) | 348 | void touch_nmi_watchdog(void) |
297 | { | 349 | { |
298 | if (nmi_watchdog > 0) { | 350 | if (nmi_watchdog_active()) { |
299 | unsigned cpu; | 351 | unsigned cpu; |
300 | 352 | ||
301 | /* | 353 | /* |
302 | * Tell other CPUs to reset their alert counters. We cannot | 354 | * Tell other CPUs to reset their alert counters. We cannot |
303 | * do it ourselves because the alert count increase is not | 355 | * do it ourselves because the alert count increase is not |
304 | * atomic. | 356 | * atomic. |
305 | */ | 357 | */ |
@@ -309,6 +361,9 @@ void touch_nmi_watchdog(void) | |||
309 | } | 361 | } |
310 | } | 362 | } |
311 | 363 | ||
364 | /* | ||
365 | * Tickle the softlockup detector too: | ||
366 | */ | ||
312 | touch_softlockup_watchdog(); | 367 | touch_softlockup_watchdog(); |
313 | } | 368 | } |
314 | EXPORT_SYMBOL(touch_nmi_watchdog); | 369 | EXPORT_SYMBOL(touch_nmi_watchdog); |
@@ -316,7 +371,12 @@ EXPORT_SYMBOL(touch_nmi_watchdog); | |||
316 | notrace __kprobes int | 371 | notrace __kprobes int |
317 | nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | 372 | nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) |
318 | { | 373 | { |
319 | int sum; | 374 | /* |
375 | * Since current_thread_info()-> is always on the stack, and we | ||
376 | * always switch the stack NMI-atomically, it's safe to use | ||
377 | * smp_processor_id(). | ||
378 | */ | ||
379 | unsigned int sum; | ||
320 | int touched = 0; | 380 | int touched = 0; |
321 | int cpu = smp_processor_id(); | 381 | int cpu = smp_processor_id(); |
322 | int rc = 0; | 382 | int rc = 0; |
@@ -328,7 +388,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | |||
328 | touched = 1; | 388 | touched = 1; |
329 | } | 389 | } |
330 | 390 | ||
331 | sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); | 391 | sum = get_timer_irqs(cpu); |
392 | |||
332 | if (__get_cpu_var(nmi_touch)) { | 393 | if (__get_cpu_var(nmi_touch)) { |
333 | __get_cpu_var(nmi_touch) = 0; | 394 | __get_cpu_var(nmi_touch) = 0; |
334 | touched = 1; | 395 | touched = 1; |
@@ -338,28 +399,29 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | |||
338 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ | 399 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ |
339 | 400 | ||
340 | spin_lock(&lock); | 401 | spin_lock(&lock); |
341 | printk("NMI backtrace for cpu %d\n", cpu); | 402 | printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); |
342 | dump_stack(); | 403 | dump_stack(); |
343 | spin_unlock(&lock); | 404 | spin_unlock(&lock); |
344 | cpu_clear(cpu, backtrace_mask); | 405 | cpu_clear(cpu, backtrace_mask); |
345 | } | 406 | } |
346 | 407 | ||
347 | #ifdef CONFIG_X86_MCE | 408 | /* Could check oops_in_progress here too, but it's safer not to */ |
348 | /* Could check oops_in_progress here too, but it's safer | 409 | if (mce_in_progress()) |
349 | not too */ | ||
350 | if (atomic_read(&mce_entry) > 0) | ||
351 | touched = 1; | 410 | touched = 1; |
352 | #endif | 411 | |
353 | /* if the apic timer isn't firing, this cpu isn't doing much */ | 412 | /* if none of the timers is firing, this cpu isn't doing much */ |
354 | if (!touched && __get_cpu_var(last_irq_sum) == sum) { | 413 | if (!touched && __get_cpu_var(last_irq_sum) == sum) { |
355 | /* | 414 | /* |
356 | * Ayiee, looks like this CPU is stuck ... | 415 | * Ayiee, looks like this CPU is stuck ... |
357 | * wait a few IRQs (5 seconds) before doing the oops ... | 416 | * wait a few IRQs (5 seconds) before doing the oops ... |
358 | */ | 417 | */ |
359 | local_inc(&__get_cpu_var(alert_counter)); | 418 | local_inc(&__get_cpu_var(alert_counter)); |
360 | if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) | 419 | if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) |
361 | die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, | 420 | /* |
362 | panic_on_timeout); | 421 | * die_nmi will return ONLY if NOTIFY_STOP happens.. |
422 | */ | ||
423 | die_nmi("BUG: NMI Watchdog detected LOCKUP", | ||
424 | regs, panic_on_timeout); | ||
363 | } else { | 425 | } else { |
364 | __get_cpu_var(last_irq_sum) = sum; | 426 | __get_cpu_var(last_irq_sum) = sum; |
365 | local_set(&__get_cpu_var(alert_counter), 0); | 427 | local_set(&__get_cpu_var(alert_counter), 0); |
@@ -373,7 +435,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | |||
373 | rc |= lapic_wd_event(nmi_hz); | 435 | rc |= lapic_wd_event(nmi_hz); |
374 | break; | 436 | break; |
375 | case NMI_IO_APIC: | 437 | case NMI_IO_APIC: |
376 | /* don't know how to accurately check for this. | 438 | /* |
439 | * don't know how to accurately check for this. | ||
377 | * just assume it was a watchdog timer interrupt | 440 | * just assume it was a watchdog timer interrupt |
378 | * This matches the old behaviour. | 441 | * This matches the old behaviour. |
379 | */ | 442 | */ |
@@ -383,31 +446,14 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | |||
383 | return rc; | 446 | return rc; |
384 | } | 447 | } |
385 | 448 | ||
386 | static unsigned ignore_nmis; | 449 | #ifdef CONFIG_SYSCTL |
387 | |||
388 | asmlinkage notrace __kprobes void | ||
389 | do_nmi(struct pt_regs *regs, long error_code) | ||
390 | { | ||
391 | nmi_enter(); | ||
392 | add_pda(__nmi_count,1); | ||
393 | if (!ignore_nmis) | ||
394 | default_do_nmi(regs); | ||
395 | nmi_exit(); | ||
396 | } | ||
397 | |||
398 | void stop_nmi(void) | ||
399 | { | ||
400 | acpi_nmi_disable(); | ||
401 | ignore_nmis++; | ||
402 | } | ||
403 | 450 | ||
404 | void restart_nmi(void) | 451 | static int __init setup_unknown_nmi_panic(char *str) |
405 | { | 452 | { |
406 | ignore_nmis--; | 453 | unknown_nmi_panic = 1; |
407 | acpi_nmi_enable(); | 454 | return 1; |
408 | } | 455 | } |
409 | 456 | __setup("unknown_nmi_panic", setup_unknown_nmi_panic); | |
410 | #ifdef CONFIG_SYSCTL | ||
411 | 457 | ||
412 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | 458 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) |
413 | { | 459 | { |
@@ -415,7 +461,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | |||
415 | char buf[64]; | 461 | char buf[64]; |
416 | 462 | ||
417 | sprintf(buf, "NMI received for unknown reason %02x\n", reason); | 463 | sprintf(buf, "NMI received for unknown reason %02x\n", reason); |
418 | die_nmi(buf, regs, 1); /* Always panic here */ | 464 | die_nmi(buf, regs, 1); /* Always panic here */ |
419 | return 0; | 465 | return 0; |
420 | } | 466 | } |
421 | 467 | ||
@@ -433,28 +479,26 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, | |||
433 | if (!!old_state == !!nmi_watchdog_enabled) | 479 | if (!!old_state == !!nmi_watchdog_enabled) |
434 | return 0; | 480 | return 0; |
435 | 481 | ||
436 | if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { | 482 | if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { |
437 | printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); | 483 | printk(KERN_WARNING |
484 | "NMI watchdog is permanently disabled\n"); | ||
438 | return -EIO; | 485 | return -EIO; |
439 | } | 486 | } |
440 | 487 | ||
441 | /* if nmi_watchdog is not set yet, then set it */ | ||
442 | nmi_watchdog_default(); | ||
443 | |||
444 | if (nmi_watchdog == NMI_LOCAL_APIC) { | 488 | if (nmi_watchdog == NMI_LOCAL_APIC) { |
445 | if (nmi_watchdog_enabled) | 489 | if (nmi_watchdog_enabled) |
446 | enable_lapic_nmi_watchdog(); | 490 | enable_lapic_nmi_watchdog(); |
447 | else | 491 | else |
448 | disable_lapic_nmi_watchdog(); | 492 | disable_lapic_nmi_watchdog(); |
449 | } else { | 493 | } else { |
450 | printk( KERN_WARNING | 494 | printk(KERN_WARNING |
451 | "NMI watchdog doesn't know what hardware to touch\n"); | 495 | "NMI watchdog doesn't know what hardware to touch\n"); |
452 | return -EIO; | 496 | return -EIO; |
453 | } | 497 | } |
454 | return 0; | 498 | return 0; |
455 | } | 499 | } |
456 | 500 | ||
457 | #endif | 501 | #endif /* CONFIG_SYSCTL */ |
458 | 502 | ||
459 | int do_nmi_callback(struct pt_regs *regs, int cpu) | 503 | int do_nmi_callback(struct pt_regs *regs, int cpu) |
460 | { | 504 | { |
@@ -477,6 +521,3 @@ void __trigger_all_cpu_backtrace(void) | |||
477 | mdelay(1); | 521 | mdelay(1); |
478 | } | 522 | } |
479 | } | 523 | } |
480 | |||
481 | EXPORT_SYMBOL(nmi_active); | ||
482 | EXPORT_SYMBOL(nmi_watchdog); | ||
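
The detection logic carried into the unified nmi.c is unchanged: each watchdog NMI samples the CPU's timer-interrupt total via get_timer_irqs(), and if that total stalls for 5 * nmi_hz consecutive NMIs with nothing touching the watchdog, die_nmi() fires. A stripped-down model of that state machine, with plain variables standing in for the kernel's per-CPU data:

#include <stdio.h>

/* Toy model of nmi_watchdog_tick()'s stuck-CPU detection (not kernel code). */
struct wd_state {
        unsigned int last_irq_sum;  /* timer IRQ total at the last NMI */
        unsigned int alert_counter; /* consecutive NMIs without progress */
};

/* Returns 1 when the CPU should be declared locked up. */
static int wd_tick(struct wd_state *wd, unsigned int timer_irqs,
                   unsigned int nmi_hz, int touched)
{
        if (!touched && wd->last_irq_sum == timer_irqs) {
                /* No timer interrupts since the last NMI: allow five
                 * seconds' worth of NMIs before declaring a lockup. */
                if (++wd->alert_counter == 5 * nmi_hz)
                        return 1;
        } else {
                wd->last_irq_sum = timer_irqs;
                wd->alert_counter = 0;
        }
        return 0;
}

int main(void)
{
        struct wd_state wd = { 100, 0 };

        /* Simulate five NMIs with a frozen timer count (nmi_hz = 1). */
        for (int i = 0; i < 5; i++)
                if (wd_tick(&wd, 100, 1, 0))
                        printf("lockup detected on NMI %d\n", i + 1);
        return 0;
}
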
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c deleted file mode 100644 index 84160f74eeb0..000000000000 --- a/arch/x86/kernel/nmi_32.c +++ /dev/null | |||
@@ -1,467 +0,0 @@ | |||
1 | /* | ||
2 | * NMI watchdog support on APIC systems | ||
3 | * | ||
4 | * Started by Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes: | ||
7 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | ||
8 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | ||
9 | * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. | ||
10 | * Pavel Machek and | ||
11 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | ||
12 | */ | ||
13 | |||
14 | #include <linux/delay.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/nmi.h> | ||
18 | #include <linux/sysdev.h> | ||
19 | #include <linux/sysctl.h> | ||
20 | #include <linux/percpu.h> | ||
21 | #include <linux/kprobes.h> | ||
22 | #include <linux/cpumask.h> | ||
23 | #include <linux/kernel_stat.h> | ||
24 | #include <linux/kdebug.h> | ||
25 | #include <linux/slab.h> | ||
26 | |||
27 | #include <asm/smp.h> | ||
28 | #include <asm/nmi.h> | ||
29 | |||
30 | #include "mach_traps.h" | ||
31 | |||
32 | int unknown_nmi_panic; | ||
33 | int nmi_watchdog_enabled; | ||
34 | |||
35 | static cpumask_t backtrace_mask = CPU_MASK_NONE; | ||
36 | |||
37 | /* nmi_active: | ||
38 | * >0: the lapic NMI watchdog is active, but can be disabled | ||
39 | * <0: the lapic NMI watchdog has not been set up, and cannot | ||
40 | * be enabled | ||
41 | * 0: the lapic NMI watchdog is disabled, but can be enabled | ||
42 | */ | ||
43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
44 | |||
45 | unsigned int nmi_watchdog = NMI_DEFAULT; | ||
46 | static unsigned int nmi_hz = HZ; | ||
47 | |||
48 | static DEFINE_PER_CPU(short, wd_enabled); | ||
49 | |||
50 | static int endflag __initdata = 0; | ||
51 | |||
52 | #ifdef CONFIG_SMP | ||
53 | /* The performance counters used by NMI_LOCAL_APIC don't trigger when | ||
54 | * the CPU is idle. To make sure the NMI watchdog really ticks on all | ||
55 | * CPUs during the test make them busy. | ||
56 | */ | ||
57 | static __init void nmi_cpu_busy(void *data) | ||
58 | { | ||
59 | local_irq_enable_in_hardirq(); | ||
60 | /* Intentionally don't use cpu_relax here. This is | ||
61 | to make sure that the performance counter really ticks, | ||
62 | even if there is a simulator or similar that catches the | ||
63 | pause instruction. On a real HT machine this is fine because | ||
64 | all other CPUs are busy with "useless" delay loops and don't | ||
65 | care if they get somewhat less cycles. */ | ||
66 | while (endflag == 0) | ||
67 | mb(); | ||
68 | } | ||
69 | #endif | ||
70 | |||
71 | int __init check_nmi_watchdog(void) | ||
72 | { | ||
73 | unsigned int *prev_nmi_count; | ||
74 | int cpu; | ||
75 | |||
76 | if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) | ||
77 | return 0; | ||
78 | |||
79 | if (!atomic_read(&nmi_active)) | ||
80 | return 0; | ||
81 | |||
82 | prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); | ||
83 | if (!prev_nmi_count) | ||
84 | return -1; | ||
85 | |||
86 | printk(KERN_INFO "Testing NMI watchdog ... "); | ||
87 | |||
88 | #ifdef CONFIG_SMP | ||
89 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
90 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); | ||
91 | #endif | ||
92 | |||
93 | for_each_possible_cpu(cpu) | ||
94 | prev_nmi_count[cpu] = nmi_count(cpu); | ||
95 | local_irq_enable(); | ||
96 | mdelay((20*1000)/nmi_hz); // wait 20 ticks | ||
97 | |||
98 | for_each_possible_cpu(cpu) { | ||
99 | #ifdef CONFIG_SMP | ||
100 | /* Check cpu_callin_map here because that is set | ||
101 | after the timer is started. */ | ||
102 | if (!cpu_isset(cpu, cpu_callin_map)) | ||
103 | continue; | ||
104 | #endif | ||
105 | if (!per_cpu(wd_enabled, cpu)) | ||
106 | continue; | ||
107 | if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { | ||
108 | printk(KERN_WARNING "WARNING: CPU#%d: NMI " | ||
109 | "appears to be stuck (%d->%d)!\n", | ||
110 | cpu, | ||
111 | prev_nmi_count[cpu], | ||
112 | nmi_count(cpu)); | ||
113 | per_cpu(wd_enabled, cpu) = 0; | ||
114 | atomic_dec(&nmi_active); | ||
115 | } | ||
116 | } | ||
117 | endflag = 1; | ||
118 | if (!atomic_read(&nmi_active)) { | ||
119 | kfree(prev_nmi_count); | ||
120 | atomic_set(&nmi_active, -1); | ||
121 | return -1; | ||
122 | } | ||
123 | printk("OK.\n"); | ||
124 | |||
125 | /* now that we know it works we can reduce NMI frequency to | ||
126 | something more reasonable; makes a difference in some configs */ | ||
127 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
128 | nmi_hz = lapic_adjust_nmi_hz(1); | ||
129 | |||
130 | kfree(prev_nmi_count); | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | static int __init setup_nmi_watchdog(char *str) | ||
135 | { | ||
136 | int nmi; | ||
137 | |||
138 | get_option(&str, &nmi); | ||
139 | |||
140 | if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) | ||
141 | return 0; | ||
142 | |||
143 | nmi_watchdog = nmi; | ||
144 | return 1; | ||
145 | } | ||
146 | |||
147 | __setup("nmi_watchdog=", setup_nmi_watchdog); | ||
148 | |||
149 | |||
150 | /* Suspend/resume support */ | ||
151 | |||
152 | #ifdef CONFIG_PM | ||
153 | |||
154 | static int nmi_pm_active; /* nmi_active before suspend */ | ||
155 | |||
156 | static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) | ||
157 | { | ||
158 | /* only CPU0 goes here, other CPUs should be offline */ | ||
159 | nmi_pm_active = atomic_read(&nmi_active); | ||
160 | stop_apic_nmi_watchdog(NULL); | ||
161 | BUG_ON(atomic_read(&nmi_active) != 0); | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | static int lapic_nmi_resume(struct sys_device *dev) | ||
166 | { | ||
167 | /* only CPU0 goes here, other CPUs should be offline */ | ||
168 | if (nmi_pm_active > 0) { | ||
169 | setup_apic_nmi_watchdog(NULL); | ||
170 | touch_nmi_watchdog(); | ||
171 | } | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | |||
176 | static struct sysdev_class nmi_sysclass = { | ||
177 | .name = "lapic_nmi", | ||
178 | .resume = lapic_nmi_resume, | ||
179 | .suspend = lapic_nmi_suspend, | ||
180 | }; | ||
181 | |||
182 | static struct sys_device device_lapic_nmi = { | ||
183 | .id = 0, | ||
184 | .cls = &nmi_sysclass, | ||
185 | }; | ||
186 | |||
187 | static int __init init_lapic_nmi_sysfs(void) | ||
188 | { | ||
189 | int error; | ||
190 | |||
191 | /* should really be a BUG_ON but b/c this is an | ||
192 | * init call, it just doesn't work. -dcz | ||
193 | */ | ||
194 | if (nmi_watchdog != NMI_LOCAL_APIC) | ||
195 | return 0; | ||
196 | |||
197 | if (atomic_read(&nmi_active) < 0) | ||
198 | return 0; | ||
199 | |||
200 | error = sysdev_class_register(&nmi_sysclass); | ||
201 | if (!error) | ||
202 | error = sysdev_register(&device_lapic_nmi); | ||
203 | return error; | ||
204 | } | ||
205 | /* must come after the local APIC's device_initcall() */ | ||
206 | late_initcall(init_lapic_nmi_sysfs); | ||
207 | |||
208 | #endif /* CONFIG_PM */ | ||
209 | |||
210 | static void __acpi_nmi_enable(void *__unused) | ||
211 | { | ||
212 | apic_write_around(APIC_LVT0, APIC_DM_NMI); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Enable timer based NMIs on all CPUs: | ||
217 | */ | ||
218 | void acpi_nmi_enable(void) | ||
219 | { | ||
220 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
221 | on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); | ||
222 | } | ||
223 | |||
224 | static void __acpi_nmi_disable(void *__unused) | ||
225 | { | ||
226 | apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * Disable timer based NMIs on all CPUs: | ||
231 | */ | ||
232 | void acpi_nmi_disable(void) | ||
233 | { | ||
234 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
235 | on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); | ||
236 | } | ||
237 | |||
238 | void setup_apic_nmi_watchdog(void *unused) | ||
239 | { | ||
240 | if (__get_cpu_var(wd_enabled)) | ||
241 | return; | ||
242 | |||
243 | /* cheap hack to support suspend/resume */ | ||
244 | /* if cpu0 is not active neither should the other cpus */ | ||
245 | if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) | ||
246 | return; | ||
247 | |||
248 | switch (nmi_watchdog) { | ||
249 | case NMI_LOCAL_APIC: | ||
250 | __get_cpu_var(wd_enabled) = 1; /* enable it first to avoid a race with the handler */ | ||
251 | if (lapic_watchdog_init(nmi_hz) < 0) { | ||
252 | __get_cpu_var(wd_enabled) = 0; | ||
253 | return; | ||
254 | } | ||
255 | /* FALL THROUGH */ | ||
256 | case NMI_IO_APIC: | ||
257 | __get_cpu_var(wd_enabled) = 1; | ||
258 | atomic_inc(&nmi_active); | ||
259 | } | ||
260 | } | ||
261 | |||
262 | void stop_apic_nmi_watchdog(void *unused) | ||
263 | { | ||
264 | /* only support LOCAL and IO APICs for now */ | ||
265 | if ((nmi_watchdog != NMI_LOCAL_APIC) && | ||
266 | (nmi_watchdog != NMI_IO_APIC)) | ||
267 | return; | ||
268 | if (__get_cpu_var(wd_enabled) == 0) | ||
269 | return; | ||
270 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
271 | lapic_watchdog_stop(); | ||
272 | __get_cpu_var(wd_enabled) = 0; | ||
273 | atomic_dec(&nmi_active); | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * the best way to detect whether a CPU has a 'hard lockup' problem | ||
278 | * is to check it's local APIC timer IRQ counts. If they are not | ||
279 | * changing then that CPU has some problem. | ||
280 | * | ||
281 | * as these watchdog NMI IRQs are generated on every CPU, we only | ||
282 | * have to check the current processor. | ||
283 | * | ||
284 | * since NMIs don't listen to _any_ locks, we have to be extremely | ||
285 | * careful not to rely on unsafe variables. The printk might lock | ||
286 | * up though, so we have to break up any console locks first ... | ||
287 | * [when there will be more tty-related locks, break them up | ||
288 | * here too!] | ||
289 | */ | ||
290 | |||
291 | static unsigned int | ||
292 | last_irq_sums [NR_CPUS], | ||
293 | alert_counter [NR_CPUS]; | ||
294 | |||
295 | void touch_nmi_watchdog(void) | ||
296 | { | ||
297 | if (nmi_watchdog > 0) { | ||
298 | unsigned cpu; | ||
299 | |||
300 | /* | ||
301 | * Just reset the alert counters, (other CPUs might be | ||
302 | * spinning on locks we hold): | ||
303 | */ | ||
304 | for_each_present_cpu(cpu) { | ||
305 | if (alert_counter[cpu]) | ||
306 | alert_counter[cpu] = 0; | ||
307 | } | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * Tickle the softlockup detector too: | ||
312 | */ | ||
313 | touch_softlockup_watchdog(); | ||
314 | } | ||
315 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
316 | |||
317 | extern void die_nmi(struct pt_regs *, const char *msg); | ||
318 | |||
319 | notrace __kprobes int | ||
320 | nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | ||
321 | { | ||
322 | |||
323 | /* | ||
324 | * Since current_thread_info()-> is always on the stack, and we | ||
325 | * always switch the stack NMI-atomically, it's safe to use | ||
326 | * smp_processor_id(). | ||
327 | */ | ||
328 | unsigned int sum; | ||
329 | int touched = 0; | ||
330 | int cpu = smp_processor_id(); | ||
331 | int rc = 0; | ||
332 | |||
333 | /* check for other users first */ | ||
334 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
335 | == NOTIFY_STOP) { | ||
336 | rc = 1; | ||
337 | touched = 1; | ||
338 | } | ||
339 | |||
340 | if (cpu_isset(cpu, backtrace_mask)) { | ||
341 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ | ||
342 | |||
343 | spin_lock(&lock); | ||
344 | printk("NMI backtrace for cpu %d\n", cpu); | ||
345 | dump_stack(); | ||
346 | spin_unlock(&lock); | ||
347 | cpu_clear(cpu, backtrace_mask); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * Take the local apic timer and PIT/HPET into account. We don't | ||
352 | * know which one is active, when we have highres/dyntick on | ||
353 | */ | ||
354 | sum = per_cpu(irq_stat, cpu).apic_timer_irqs + | ||
355 | per_cpu(irq_stat, cpu).irq0_irqs; | ||
356 | |||
357 | /* if none of the timers is firing, this cpu isn't doing much */ | ||
358 | if (!touched && last_irq_sums[cpu] == sum) { | ||
359 | /* | ||
360 | * Ayiee, looks like this CPU is stuck ... | ||
361 | * wait a few IRQs (5 seconds) before doing the oops ... | ||
362 | */ | ||
363 | alert_counter[cpu]++; | ||
364 | if (alert_counter[cpu] == 5*nmi_hz) | ||
365 | /* | ||
366 | * die_nmi will return ONLY if NOTIFY_STOP happens.. | ||
367 | */ | ||
368 | die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); | ||
369 | } else { | ||
370 | last_irq_sums[cpu] = sum; | ||
371 | alert_counter[cpu] = 0; | ||
372 | } | ||
373 | /* see if the nmi watchdog went off */ | ||
374 | if (!__get_cpu_var(wd_enabled)) | ||
375 | return rc; | ||
376 | switch (nmi_watchdog) { | ||
377 | case NMI_LOCAL_APIC: | ||
378 | rc |= lapic_wd_event(nmi_hz); | ||
379 | break; | ||
380 | case NMI_IO_APIC: | ||
381 | /* don't know how to accurately check for this. | ||
382 | * just assume it was a watchdog timer interrupt | ||
383 | * This matches the old behaviour. | ||
384 | */ | ||
385 | rc = 1; | ||
386 | break; | ||
387 | } | ||
388 | return rc; | ||
389 | } | ||
390 | |||
391 | #ifdef CONFIG_SYSCTL | ||
392 | |||
393 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | ||
394 | { | ||
395 | unsigned char reason = get_nmi_reason(); | ||
396 | char buf[64]; | ||
397 | |||
398 | sprintf(buf, "NMI received for unknown reason %02x\n", reason); | ||
399 | die_nmi(regs, buf); | ||
400 | return 0; | ||
401 | } | ||
402 | |||
403 | /* | ||
404 | * proc handler for /proc/sys/kernel/nmi | ||
405 | */ | ||
406 | int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, | ||
407 | void __user *buffer, size_t *length, loff_t *ppos) | ||
408 | { | ||
409 | int old_state; | ||
410 | |||
411 | nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; | ||
412 | old_state = nmi_watchdog_enabled; | ||
413 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
414 | if (!!old_state == !!nmi_watchdog_enabled) | ||
415 | return 0; | ||
416 | |||
417 | if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { | ||
418 | printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); | ||
419 | return -EIO; | ||
420 | } | ||
421 | |||
422 | if (nmi_watchdog == NMI_DEFAULT) { | ||
423 | if (lapic_watchdog_ok()) | ||
424 | nmi_watchdog = NMI_LOCAL_APIC; | ||
425 | else | ||
426 | nmi_watchdog = NMI_IO_APIC; | ||
427 | } | ||
428 | |||
429 | if (nmi_watchdog == NMI_LOCAL_APIC) { | ||
430 | if (nmi_watchdog_enabled) | ||
431 | enable_lapic_nmi_watchdog(); | ||
432 | else | ||
433 | disable_lapic_nmi_watchdog(); | ||
434 | } else { | ||
435 | printk( KERN_WARNING | ||
436 | "NMI watchdog doesn't know what hardware to touch\n"); | ||
437 | return -EIO; | ||
438 | } | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | #endif | ||
443 | |||
444 | int do_nmi_callback(struct pt_regs *regs, int cpu) | ||
445 | { | ||
446 | #ifdef CONFIG_SYSCTL | ||
447 | if (unknown_nmi_panic) | ||
448 | return unknown_nmi_panic_callback(regs, cpu); | ||
449 | #endif | ||
450 | return 0; | ||
451 | } | ||
452 | |||
453 | void __trigger_all_cpu_backtrace(void) | ||
454 | { | ||
455 | int i; | ||
456 | |||
457 | backtrace_mask = cpu_online_map; | ||
458 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | ||
459 | for (i = 0; i < 10 * 1000; i++) { | ||
460 | if (cpus_empty(backtrace_mask)) | ||
461 | break; | ||
462 | mdelay(1); | ||
463 | } | ||
464 | } | ||
465 | |||
466 | EXPORT_SYMBOL(nmi_active); | ||
467 | EXPORT_SYMBOL(nmi_watchdog); | ||
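
Both the unified nmi.c and the deleted 32-bit copy implement __trigger_all_cpu_backtrace() with the same fan-out/acknowledge pattern: publish a cpumask, let every CPU clear its own bit from NMI context, and poll for up to 10 seconds. A user-space sketch of the idea with C11 atomics, purely illustrative:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long pending_mask; /* one bit per hypothetical CPU */

/* Each "CPU" clears its own bit once it has dumped its backtrace. */
static void ack(int cpu)
{
        atomic_fetch_and(&pending_mask, ~(1UL << cpu));
}

/* The initiator polls until all bits clear or the timeout expires,
 * mirroring the kernel's bounded mdelay(1) loop. */
static int wait_for_acks(int timeout_ms)
{
        for (int ms = 0; ms < timeout_ms; ms++) {
                if (atomic_load(&pending_mask) == 0)
                        return 1;
                /* a 1 ms sleep would go here */
        }
        return 0;
}

int main(void)
{
        atomic_store(&pending_mask, 0xfUL); /* pretend 4 CPUs are pending */
        for (int cpu = 0; cpu < 4; cpu++)
                ack(cpu);
        printf("all acknowledged: %d\n", wait_for_acks(10 * 1000));
        return 0;
}
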
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index e65281b1634b..b8c45610b20a 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c | |||
@@ -31,6 +31,9 @@ | |||
31 | #include <asm/numaq.h> | 31 | #include <asm/numaq.h> |
32 | #include <asm/topology.h> | 32 | #include <asm/topology.h> |
33 | #include <asm/processor.h> | 33 | #include <asm/processor.h> |
34 | #include <asm/mpspec.h> | ||
35 | #include <asm/e820.h> | ||
36 | #include <asm/setup.h> | ||
34 | 37 | ||
35 | #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) | 38 | #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) |
36 | 39 | ||
@@ -58,6 +61,8 @@ static void __init smp_dump_qct(void) | |||
58 | node_end_pfn[node] = MB_TO_PAGES( | 61 | node_end_pfn[node] = MB_TO_PAGES( |
59 | eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); | 62 | eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); |
60 | 63 | ||
64 | e820_register_active_regions(node, node_start_pfn[node], | ||
65 | node_end_pfn[node]); | ||
61 | memory_present(node, | 66 | memory_present(node, |
62 | node_start_pfn[node], node_end_pfn[node]); | 67 | node_start_pfn[node], node_end_pfn[node]); |
63 | node_remap_size[node] = node_memmap_size_bytes(node, | 68 | node_remap_size[node] = node_memmap_size_bytes(node, |
@@ -67,23 +72,209 @@ static void __init smp_dump_qct(void) | |||
67 | } | 72 | } |
68 | } | 73 | } |
69 | 74 | ||
70 | /* | ||
71 | * Unlike Summit, we don't really care to let the NUMA-Q | ||
72 | * fall back to flat mode. Don't compile for NUMA-Q | ||
73 | * unless you really need it! | ||
74 | */ | ||
75 | int __init get_memcfg_numaq(void) | ||
76 | { | ||
77 | smp_dump_qct(); | ||
78 | return 1; | ||
79 | } | ||
80 | 75 | ||
81 | static int __init numaq_tsc_disable(void) | 76 | void __init numaq_tsc_disable(void) |
82 | { | 77 | { |
78 | if (!found_numaq) | ||
79 | return; | ||
80 | |||
83 | if (num_online_nodes() > 1) { | 81 | if (num_online_nodes() > 1) { |
84 | printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); | 82 | printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); |
85 | setup_clear_cpu_cap(X86_FEATURE_TSC); | 83 | setup_clear_cpu_cap(X86_FEATURE_TSC); |
86 | } | 84 | } |
85 | } | ||
86 | |||
87 | static int __init numaq_pre_time_init(void) | ||
88 | { | ||
89 | numaq_tsc_disable(); | ||
87 | return 0; | 90 | return 0; |
88 | } | 91 | } |
89 | arch_initcall(numaq_tsc_disable); | 92 | |
93 | int found_numaq; | ||
94 | /* | ||
95 | * Have to match translation table entries to main table entries by counter | ||
96 | * hence the mpc_record variable .... can't see a less disgusting way of | ||
97 | * doing this .... | ||
98 | */ | ||
99 | struct mpc_config_translation { | ||
100 | unsigned char mpc_type; | ||
101 | unsigned char trans_len; | ||
102 | unsigned char trans_type; | ||
103 | unsigned char trans_quad; | ||
104 | unsigned char trans_global; | ||
105 | unsigned char trans_local; | ||
106 | unsigned short trans_reserved; | ||
107 | }; | ||
108 | |||
109 | /* x86_quirks member */ | ||
110 | static int mpc_record; | ||
111 | static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] | ||
112 | __cpuinitdata; | ||
113 | |||
114 | static inline int generate_logical_apicid(int quad, int phys_apicid) | ||
115 | { | ||
116 | return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); | ||
117 | } | ||
118 | |||
119 | /* x86_quirks member */ | ||
120 | static int mpc_apic_id(struct mpc_config_processor *m) | ||
121 | { | ||
122 | int quad = translation_table[mpc_record]->trans_quad; | ||
123 | int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); | ||
124 | |||
125 | printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", | ||
126 | m->mpc_apicid, | ||
127 | (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, | ||
128 | (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, | ||
129 | m->mpc_apicver, quad, logical_apicid); | ||
130 | return logical_apicid; | ||
131 | } | ||
132 | |||
133 | int mp_bus_id_to_node[MAX_MP_BUSSES]; | ||
134 | |||
135 | int mp_bus_id_to_local[MAX_MP_BUSSES]; | ||
136 | |||
137 | /* x86_quirks member */ | ||
138 | static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name) | ||
139 | { | ||
140 | int quad = translation_table[mpc_record]->trans_quad; | ||
141 | int local = translation_table[mpc_record]->trans_local; | ||
142 | |||
143 | mp_bus_id_to_node[m->mpc_busid] = quad; | ||
144 | mp_bus_id_to_local[m->mpc_busid] = local; | ||
145 | printk(KERN_INFO "Bus #%d is %s (node %d)\n", | ||
146 | m->mpc_busid, name, quad); | ||
147 | } | ||
148 | |||
149 | int quad_local_to_mp_bus_id [NR_CPUS/4][4]; | ||
150 | |||
151 | /* x86_quirks member */ | ||
152 | static void mpc_oem_pci_bus(struct mpc_config_bus *m) | ||
153 | { | ||
154 | int quad = translation_table[mpc_record]->trans_quad; | ||
155 | int local = translation_table[mpc_record]->trans_local; | ||
156 | |||
157 | quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; | ||
158 | } | ||
159 | |||
160 | static void __init MP_translation_info(struct mpc_config_translation *m) | ||
161 | { | ||
162 | printk(KERN_INFO | ||
163 | "Translation: record %d, type %d, quad %d, global %d, local %d\n", | ||
164 | mpc_record, m->trans_type, m->trans_quad, m->trans_global, | ||
165 | m->trans_local); | ||
166 | |||
167 | if (mpc_record >= MAX_MPC_ENTRY) | ||
168 | printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); | ||
169 | else | ||
170 | translation_table[mpc_record] = m; /* stash this for later */ | ||
171 | if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) | ||
172 | node_set_online(m->trans_quad); | ||
173 | } | ||
174 | |||
175 | static int __init mpf_checksum(unsigned char *mp, int len) | ||
176 | { | ||
177 | int sum = 0; | ||
178 | |||
179 | while (len--) | ||
180 | sum += *mp++; | ||
181 | |||
182 | return sum & 0xFF; | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * Read/parse the MPC oem tables | ||
187 | */ | ||
188 | |||
189 | static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, | ||
190 | unsigned short oemsize) | ||
191 | { | ||
192 | int count = sizeof(*oemtable); /* the header size */ | ||
193 | unsigned char *oemptr = ((unsigned char *)oemtable) + count; | ||
194 | |||
195 | mpc_record = 0; | ||
196 | printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", | ||
197 | oemtable); | ||
198 | if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { | ||
199 | printk(KERN_WARNING | ||
200 | "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", | ||
201 | oemtable->oem_signature[0], oemtable->oem_signature[1], | ||
202 | oemtable->oem_signature[2], oemtable->oem_signature[3]); | ||
203 | return; | ||
204 | } | ||
205 | if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { | ||
206 | printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); | ||
207 | return; | ||
208 | } | ||
209 | while (count < oemtable->oem_length) { | ||
210 | switch (*oemptr) { | ||
211 | case MP_TRANSLATION: | ||
212 | { | ||
213 | struct mpc_config_translation *m = | ||
214 | (struct mpc_config_translation *)oemptr; | ||
215 | MP_translation_info(m); | ||
216 | oemptr += sizeof(*m); | ||
217 | count += sizeof(*m); | ||
218 | ++mpc_record; | ||
219 | break; | ||
220 | } | ||
221 | default: | ||
222 | { | ||
223 | printk(KERN_WARNING | ||
224 | "Unrecognised OEM table entry type! - %d\n", | ||
225 | (int)*oemptr); | ||
226 | return; | ||
227 | } | ||
228 | } | ||
229 | } | ||
230 | } | ||
231 | |||
232 | static struct x86_quirks numaq_x86_quirks __initdata = { | ||
233 | .arch_pre_time_init = numaq_pre_time_init, | ||
234 | .arch_time_init = NULL, | ||
235 | .arch_pre_intr_init = NULL, | ||
236 | .arch_memory_setup = NULL, | ||
237 | .arch_intr_init = NULL, | ||
238 | .arch_trap_init = NULL, | ||
239 | .mach_get_smp_config = NULL, | ||
240 | .mach_find_smp_config = NULL, | ||
241 | .mpc_record = &mpc_record, | ||
242 | .mpc_apic_id = mpc_apic_id, | ||
243 | .mpc_oem_bus_info = mpc_oem_bus_info, | ||
244 | .mpc_oem_pci_bus = mpc_oem_pci_bus, | ||
245 | .smp_read_mpc_oem = smp_read_mpc_oem, | ||
246 | }; | ||
247 | |||
248 | void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, | ||
249 | char *productid) | ||
250 | { | ||
251 | if (strncmp(oem, "IBM NUMA", 8)) | ||
252 | printk(KERN_WARNING "Warning! Not a NUMA-Q system!\n"); | ||
253 | else | ||
254 | found_numaq = 1; | ||
255 | } | ||
256 | |||
257 | static __init void early_check_numaq(void) | ||
258 | { | ||
259 | /* | ||
260 | * Find possible boot-time SMP configuration: | ||
261 | */ | ||
262 | early_find_smp_config(); | ||
263 | /* | ||
264 | * get boot-time SMP configuration: | ||
265 | */ | ||
266 | if (smp_found_config) | ||
267 | early_get_smp_config(); | ||
268 | |||
269 | if (found_numaq) | ||
270 | x86_quirks = &numaq_x86_quirks; | ||
271 | } | ||
272 | |||
273 | int __init get_memcfg_numaq(void) | ||
274 | { | ||
275 | early_check_numaq(); | ||
276 | if (!found_numaq) | ||
277 | return 0; | ||
278 | smp_dump_qct(); | ||
279 | return 1; | ||
280 | } | ||
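
The x86_quirks table above replaces the old per-subarchitecture mach_* hooks: the generic MP-table parser tests each member for NULL before calling it, so NUMA-Q only fills in the hooks it needs. A minimal sketch of the dispatch pattern, assuming the parser looks roughly like this (the caller's name and the generic_processor_info() call are assumptions; the hook names come from the struct above):

	static void __init mpc_parse_processor(struct mpc_config_processor *m)
	{
		int apicid = m->mpc_apicid;

		/* let the platform quirk remap the APIC ID, if one is registered */
		if (x86_quirks->mpc_apic_id)
			apicid = x86_quirks->mpc_apic_id(m);

		generic_processor_info(apicid, m->mpc_apicver);
	}
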
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 74f0c5ea2a03..94da4d52d798 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -29,7 +29,9 @@ | |||
29 | #include <asm/desc.h> | 29 | #include <asm/desc.h> |
30 | #include <asm/setup.h> | 30 | #include <asm/setup.h> |
31 | #include <asm/arch_hooks.h> | 31 | #include <asm/arch_hooks.h> |
32 | #include <asm/pgtable.h> | ||
32 | #include <asm/time.h> | 33 | #include <asm/time.h> |
34 | #include <asm/pgalloc.h> | ||
33 | #include <asm/irq.h> | 35 | #include <asm/irq.h> |
34 | #include <asm/delay.h> | 36 | #include <asm/delay.h> |
35 | #include <asm/fixmap.h> | 37 | #include <asm/fixmap.h> |
@@ -122,6 +124,7 @@ static void *get_call_destination(u8 type) | |||
122 | .pv_irq_ops = pv_irq_ops, | 124 | .pv_irq_ops = pv_irq_ops, |
123 | .pv_apic_ops = pv_apic_ops, | 125 | .pv_apic_ops = pv_apic_ops, |
124 | .pv_mmu_ops = pv_mmu_ops, | 126 | .pv_mmu_ops = pv_mmu_ops, |
127 | .pv_lock_ops = pv_lock_ops, | ||
125 | }; | 128 | }; |
126 | return *((void **)&tmpl + type); | 129 | return *((void **)&tmpl + type); |
127 | } | 130 | } |
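
get_call_destination() above works because struct paravirt_patch_template lays the ops tables out back to back, so a patch-site type number indexes straight into it as a flat array of function pointers; appending pv_lock_ops keeps all existing type numbers stable. The type values come from a macro of roughly this shape (matching the PARAVIRT_PATCH() uses later in this file):

	#define PARAVIRT_PATCH(x)					\
		(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
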
@@ -139,7 +142,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, | |||
139 | /* If the operation is a nop, then nop the callsite */ | 142 | /* If the operation is a nop, then nop the callsite */ |
140 | ret = paravirt_patch_nop(); | 143 | ret = paravirt_patch_nop(); |
141 | else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || | 144 | else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || |
142 | type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret)) | 145 | type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || |
146 | type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || | ||
147 | type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) | ||
143 | /* If operation requires a jmp, then jmp */ | 148 | /* If operation requires a jmp, then jmp */ |
144 | ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); | 149 | ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); |
145 | else | 150 | else |
@@ -190,7 +195,9 @@ static void native_flush_tlb_single(unsigned long addr) | |||
190 | 195 | ||
191 | /* These are in entry.S */ | 196 | /* These are in entry.S */ |
192 | extern void native_iret(void); | 197 | extern void native_iret(void); |
193 | extern void native_irq_enable_syscall_ret(void); | 198 | extern void native_irq_enable_sysexit(void); |
199 | extern void native_usergs_sysret32(void); | ||
200 | extern void native_usergs_sysret64(void); | ||
194 | 201 | ||
195 | static int __init print_banner(void) | 202 | static int __init print_banner(void) |
196 | { | 203 | { |
@@ -261,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) | |||
261 | return __get_cpu_var(paravirt_lazy_mode); | 268 | return __get_cpu_var(paravirt_lazy_mode); |
262 | } | 269 | } |
263 | 270 | ||
271 | void __init paravirt_use_bytelocks(void) | ||
272 | { | ||
273 | #ifdef CONFIG_SMP | ||
274 | pv_lock_ops.spin_is_locked = __byte_spin_is_locked; | ||
275 | pv_lock_ops.spin_is_contended = __byte_spin_is_contended; | ||
276 | pv_lock_ops.spin_lock = __byte_spin_lock; | ||
277 | pv_lock_ops.spin_trylock = __byte_spin_trylock; | ||
278 | pv_lock_ops.spin_unlock = __byte_spin_unlock; | ||
279 | #endif | ||
280 | } | ||
281 | |||
264 | struct pv_info pv_info = { | 282 | struct pv_info pv_info = { |
265 | .name = "bare hardware", | 283 | .name = "bare hardware", |
266 | .paravirt_enabled = 0, | 284 | .paravirt_enabled = 0, |
@@ -280,7 +298,7 @@ struct pv_time_ops pv_time_ops = { | |||
280 | .get_wallclock = native_get_wallclock, | 298 | .get_wallclock = native_get_wallclock, |
281 | .set_wallclock = native_set_wallclock, | 299 | .set_wallclock = native_set_wallclock, |
282 | .sched_clock = native_sched_clock, | 300 | .sched_clock = native_sched_clock, |
283 | .get_cpu_khz = native_calculate_cpu_khz, | 301 | .get_tsc_khz = native_calibrate_tsc, |
284 | }; | 302 | }; |
285 | 303 | ||
286 | struct pv_irq_ops pv_irq_ops = { | 304 | struct pv_irq_ops pv_irq_ops = { |
@@ -291,6 +309,9 @@ struct pv_irq_ops pv_irq_ops = { | |||
291 | .irq_enable = native_irq_enable, | 309 | .irq_enable = native_irq_enable, |
292 | .safe_halt = native_safe_halt, | 310 | .safe_halt = native_safe_halt, |
293 | .halt = native_halt, | 311 | .halt = native_halt, |
312 | #ifdef CONFIG_X86_64 | ||
313 | .adjust_exception_frame = paravirt_nop, | ||
314 | #endif | ||
294 | }; | 315 | }; |
295 | 316 | ||
296 | struct pv_cpu_ops pv_cpu_ops = { | 317 | struct pv_cpu_ops pv_cpu_ops = { |
@@ -321,12 +342,23 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
321 | .store_idt = native_store_idt, | 342 | .store_idt = native_store_idt, |
322 | .store_tr = native_store_tr, | 343 | .store_tr = native_store_tr, |
323 | .load_tls = native_load_tls, | 344 | .load_tls = native_load_tls, |
345 | #ifdef CONFIG_X86_64 | ||
346 | .load_gs_index = native_load_gs_index, | ||
347 | #endif | ||
324 | .write_ldt_entry = native_write_ldt_entry, | 348 | .write_ldt_entry = native_write_ldt_entry, |
325 | .write_gdt_entry = native_write_gdt_entry, | 349 | .write_gdt_entry = native_write_gdt_entry, |
326 | .write_idt_entry = native_write_idt_entry, | 350 | .write_idt_entry = native_write_idt_entry, |
327 | .load_sp0 = native_load_sp0, | 351 | .load_sp0 = native_load_sp0, |
328 | 352 | ||
329 | .irq_enable_syscall_ret = native_irq_enable_syscall_ret, | 353 | #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) |
354 | .irq_enable_sysexit = native_irq_enable_sysexit, | ||
355 | #endif | ||
356 | #ifdef CONFIG_X86_64 | ||
357 | #ifdef CONFIG_IA32_EMULATION | ||
358 | .usergs_sysret32 = native_usergs_sysret32, | ||
359 | #endif | ||
360 | .usergs_sysret64 = native_usergs_sysret64, | ||
361 | #endif | ||
330 | .iret = native_iret, | 362 | .iret = native_iret, |
331 | .swapgs = native_swapgs, | 363 | .swapgs = native_swapgs, |
332 | 364 | ||
@@ -342,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
342 | struct pv_apic_ops pv_apic_ops = { | 374 | struct pv_apic_ops pv_apic_ops = { |
343 | #ifdef CONFIG_X86_LOCAL_APIC | 375 | #ifdef CONFIG_X86_LOCAL_APIC |
344 | .apic_write = native_apic_write, | 376 | .apic_write = native_apic_write, |
345 | .apic_write_atomic = native_apic_write_atomic, | ||
346 | .apic_read = native_apic_read, | 377 | .apic_read = native_apic_read, |
347 | .setup_boot_clock = setup_boot_APIC_clock, | 378 | .setup_boot_clock = setup_boot_APIC_clock, |
348 | .setup_secondary_clock = setup_secondary_APIC_clock, | 379 | .setup_secondary_clock = setup_secondary_APIC_clock, |
@@ -354,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
354 | #ifndef CONFIG_X86_64 | 385 | #ifndef CONFIG_X86_64 |
355 | .pagetable_setup_start = native_pagetable_setup_start, | 386 | .pagetable_setup_start = native_pagetable_setup_start, |
356 | .pagetable_setup_done = native_pagetable_setup_done, | 387 | .pagetable_setup_done = native_pagetable_setup_done, |
388 | #else | ||
389 | .pagetable_setup_start = paravirt_nop, | ||
390 | .pagetable_setup_done = paravirt_nop, | ||
357 | #endif | 391 | #endif |
358 | 392 | ||
359 | .read_cr2 = native_read_cr2, | 393 | .read_cr2 = native_read_cr2, |
@@ -366,6 +400,9 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
366 | .flush_tlb_single = native_flush_tlb_single, | 400 | .flush_tlb_single = native_flush_tlb_single, |
367 | .flush_tlb_others = native_flush_tlb_others, | 401 | .flush_tlb_others = native_flush_tlb_others, |
368 | 402 | ||
403 | .pgd_alloc = __paravirt_pgd_alloc, | ||
404 | .pgd_free = paravirt_nop, | ||
405 | |||
369 | .alloc_pte = paravirt_nop, | 406 | .alloc_pte = paravirt_nop, |
370 | .alloc_pmd = paravirt_nop, | 407 | .alloc_pmd = paravirt_nop, |
371 | .alloc_pmd_clone = paravirt_nop, | 408 | .alloc_pmd_clone = paravirt_nop, |
@@ -380,6 +417,9 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
380 | .pte_update = paravirt_nop, | 417 | .pte_update = paravirt_nop, |
381 | .pte_update_defer = paravirt_nop, | 418 | .pte_update_defer = paravirt_nop, |
382 | 419 | ||
420 | .ptep_modify_prot_start = __ptep_modify_prot_start, | ||
421 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, | ||
422 | |||
383 | #ifdef CONFIG_HIGHPTE | 423 | #ifdef CONFIG_HIGHPTE |
384 | .kmap_atomic_pte = kmap_atomic, | 424 | .kmap_atomic_pte = kmap_atomic, |
385 | #endif | 425 | #endif |
@@ -403,6 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
403 | #endif /* PAGETABLE_LEVELS >= 3 */ | 443 | #endif /* PAGETABLE_LEVELS >= 3 */ |
404 | 444 | ||
405 | .pte_val = native_pte_val, | 445 | .pte_val = native_pte_val, |
446 | .pte_flags = native_pte_flags, | ||
406 | .pgd_val = native_pgd_val, | 447 | .pgd_val = native_pgd_val, |
407 | 448 | ||
408 | .make_pte = native_make_pte, | 449 | .make_pte = native_make_pte, |
@@ -416,7 +457,21 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
416 | .enter = paravirt_nop, | 457 | .enter = paravirt_nop, |
417 | .leave = paravirt_nop, | 458 | .leave = paravirt_nop, |
418 | }, | 459 | }, |
460 | |||
461 | .set_fixmap = native_set_fixmap, | ||
462 | }; | ||
463 | |||
464 | struct pv_lock_ops pv_lock_ops = { | ||
465 | #ifdef CONFIG_SMP | ||
466 | .spin_is_locked = __ticket_spin_is_locked, | ||
467 | .spin_is_contended = __ticket_spin_is_contended, | ||
468 | |||
469 | .spin_lock = __ticket_spin_lock, | ||
470 | .spin_trylock = __ticket_spin_trylock, | ||
471 | .spin_unlock = __ticket_spin_unlock, | ||
472 | #endif | ||
419 | }; | 473 | }; |
474 | EXPORT_SYMBOL_GPL(pv_lock_ops); | ||
420 | 475 | ||
421 | EXPORT_SYMBOL_GPL(pv_time_ops); | 476 | EXPORT_SYMBOL_GPL(pv_time_ops); |
422 | EXPORT_SYMBOL (pv_cpu_ops); | 477 | EXPORT_SYMBOL (pv_cpu_ops); |
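
The new pv_lock_ops table turns the arch spinlock fast path into an indirect call that a hypervisor can retarget at boot: ticket locks by default, byte locks once paravirt_use_bytelocks() has run. A hedged sketch of the wrapper layer, assuming the asm/spinlock.h wrappers simply forward to the ops table:

	static inline void __raw_spin_lock(raw_spinlock_t *lock)
	{
		pv_lock_ops.spin_lock(lock);	/* ticket or byte lock, per setup */
	}

	static inline int __raw_spin_trylock(raw_spinlock_t *lock)
	{
		return pv_lock_ops.spin_trylock(lock);
	}
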
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 82fc5fcab4f4..58262218781b 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c | |||
@@ -5,7 +5,7 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); | |||
5 | DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); | 5 | DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); |
6 | DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); | 6 | DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); |
7 | DEF_NATIVE(pv_cpu_ops, iret, "iret"); | 7 | DEF_NATIVE(pv_cpu_ops, iret, "iret"); |
8 | DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit"); | 8 | DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); |
9 | DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); | 9 | DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); |
10 | DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); | 10 | DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); |
11 | DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); | 11 | DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); |
@@ -29,7 +29,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | |||
29 | PATCH_SITE(pv_irq_ops, restore_fl); | 29 | PATCH_SITE(pv_irq_ops, restore_fl); |
30 | PATCH_SITE(pv_irq_ops, save_fl); | 30 | PATCH_SITE(pv_irq_ops, save_fl); |
31 | PATCH_SITE(pv_cpu_ops, iret); | 31 | PATCH_SITE(pv_cpu_ops, iret); |
32 | PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); | 32 | PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); |
33 | PATCH_SITE(pv_mmu_ops, read_cr2); | 33 | PATCH_SITE(pv_mmu_ops, read_cr2); |
34 | PATCH_SITE(pv_mmu_ops, read_cr3); | 34 | PATCH_SITE(pv_mmu_ops, read_cr3); |
35 | PATCH_SITE(pv_mmu_ops, write_cr3); | 35 | PATCH_SITE(pv_mmu_ops, write_cr3); |
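
DEF_NATIVE exists so that native_patch() can splice the literal instruction bytes over a paravirt call site instead of leaving an indirect call through the ops tables. Roughly how the macro captures those bytes (a sketch of the mechanism, not necessarily the exact kernel definition):

	#define DEF_NATIVE(ops, name, code)					\
		extern const char start_##ops##_##name[], end_##ops##_##name[];\
		asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")

PATCH_SITE() then copies the start..end byte range into the call site whenever the current op still points at the native implementation.
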
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 7d904e138d7e..061d01df9ae6 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c | |||
@@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); | |||
14 | DEF_NATIVE(pv_cpu_ops, clts, "clts"); | 14 | DEF_NATIVE(pv_cpu_ops, clts, "clts"); |
15 | DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); | 15 | DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); |
16 | 16 | ||
17 | /* the three commands give us more control to how to return from a syscall */ | 17 | DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit"); |
18 | DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); | 18 | DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); |
19 | DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); | ||
19 | DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); | 20 | DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); |
20 | 21 | ||
21 | unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | 22 | unsigned native_patch(u8 type, u16 clobbers, void *ibuf, |
@@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | |||
35 | PATCH_SITE(pv_irq_ops, irq_enable); | 36 | PATCH_SITE(pv_irq_ops, irq_enable); |
36 | PATCH_SITE(pv_irq_ops, irq_disable); | 37 | PATCH_SITE(pv_irq_ops, irq_disable); |
37 | PATCH_SITE(pv_cpu_ops, iret); | 38 | PATCH_SITE(pv_cpu_ops, iret); |
38 | PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); | 39 | PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); |
40 | PATCH_SITE(pv_cpu_ops, usergs_sysret32); | ||
41 | PATCH_SITE(pv_cpu_ops, usergs_sysret64); | ||
39 | PATCH_SITE(pv_cpu_ops, swapgs); | 42 | PATCH_SITE(pv_cpu_ops, swapgs); |
40 | PATCH_SITE(pv_mmu_ops, read_cr2); | 43 | PATCH_SITE(pv_mmu_ops, read_cr2); |
41 | PATCH_SITE(pv_mmu_ops, read_cr3); | 44 | PATCH_SITE(pv_mmu_ops, read_cr3); |
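
For operations that never return to the patch site -- iret, sysexit, and the new usergs_sysret32/64 pair -- paravirt_patch_default() plants a jmp rather than a call, as the paravirt.c hunk earlier in this series shows. The decision logic, condensed (op_is_return_to_user() is illustrative shorthand, not a real kernel helper):

	if (opfunc == NULL)
		/* no implementation: trap if the site is ever reached */
		ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a + sizeof(ud2a));
	else if (opfunc == paravirt_nop)
		ret = paravirt_patch_nop();
	else if (op_is_return_to_user(type))	/* iret, sysexit, sysret32/64 */
		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
	else
		ret = paravirt_patch_call(insnbuf, opfunc, addr, len);
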
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index e28ec497e142..b67a4b1d4eae 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/spinlock.h> | 30 | #include <linux/spinlock.h> |
31 | #include <linux/string.h> | 31 | #include <linux/string.h> |
32 | #include <linux/crash_dump.h> | ||
32 | #include <linux/dma-mapping.h> | 33 | #include <linux/dma-mapping.h> |
33 | #include <linux/bitops.h> | 34 | #include <linux/bitops.h> |
34 | #include <linux/pci_ids.h> | 35 | #include <linux/pci_ids.h> |
@@ -36,7 +37,8 @@ | |||
36 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
37 | #include <linux/scatterlist.h> | 38 | #include <linux/scatterlist.h> |
38 | #include <linux/iommu-helper.h> | 39 | #include <linux/iommu-helper.h> |
39 | #include <asm/gart.h> | 40 | |
41 | #include <asm/iommu.h> | ||
40 | #include <asm/calgary.h> | 42 | #include <asm/calgary.h> |
41 | #include <asm/tce.h> | 43 | #include <asm/tce.h> |
42 | #include <asm/pci-direct.h> | 44 | #include <asm/pci-direct.h> |
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl); | |||
167 | static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); | 169 | static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); |
168 | static void calioc2_tce_cache_blast(struct iommu_table *tbl); | 170 | static void calioc2_tce_cache_blast(struct iommu_table *tbl); |
169 | static void calioc2_dump_error_regs(struct iommu_table *tbl); | 171 | static void calioc2_dump_error_regs(struct iommu_table *tbl); |
172 | static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); | ||
173 | static void get_tce_space_from_tar(void); | ||
170 | 174 | ||
171 | static struct cal_chipset_ops calgary_chip_ops = { | 175 | static struct cal_chipset_ops calgary_chip_ops = { |
172 | .handle_quirks = calgary_handle_quirks, | 176 | .handle_quirks = calgary_handle_quirks, |
@@ -410,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev, | |||
410 | } | 414 | } |
411 | } | 415 | } |
412 | 416 | ||
413 | static int calgary_nontranslate_map_sg(struct device* dev, | ||
414 | struct scatterlist *sg, int nelems, int direction) | ||
415 | { | ||
416 | struct scatterlist *s; | ||
417 | int i; | ||
418 | |||
419 | for_each_sg(sg, s, nelems, i) { | ||
420 | struct page *p = sg_page(s); | ||
421 | |||
422 | BUG_ON(!p); | ||
423 | s->dma_address = virt_to_bus(sg_virt(s)); | ||
424 | s->dma_length = s->length; | ||
425 | } | ||
426 | return nelems; | ||
427 | } | ||
428 | |||
429 | static int calgary_map_sg(struct device *dev, struct scatterlist *sg, | 417 | static int calgary_map_sg(struct device *dev, struct scatterlist *sg, |
430 | int nelems, int direction) | 418 | int nelems, int direction) |
431 | { | 419 | { |
@@ -436,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, | |||
436 | unsigned long entry; | 424 | unsigned long entry; |
437 | int i; | 425 | int i; |
438 | 426 | ||
439 | if (!translation_enabled(tbl)) | ||
440 | return calgary_nontranslate_map_sg(dev, sg, nelems, direction); | ||
441 | |||
442 | for_each_sg(sg, s, nelems, i) { | 427 | for_each_sg(sg, s, nelems, i) { |
443 | BUG_ON(!sg_page(s)); | 428 | BUG_ON(!sg_page(s)); |
444 | 429 | ||
@@ -474,7 +459,6 @@ error: | |||
474 | static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, | 459 | static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, |
475 | size_t size, int direction) | 460 | size_t size, int direction) |
476 | { | 461 | { |
477 | dma_addr_t dma_handle = bad_dma_address; | ||
478 | void *vaddr = phys_to_virt(paddr); | 462 | void *vaddr = phys_to_virt(paddr); |
479 | unsigned long uaddr; | 463 | unsigned long uaddr; |
480 | unsigned int npages; | 464 | unsigned int npages; |
@@ -483,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, | |||
483 | uaddr = (unsigned long)vaddr; | 467 | uaddr = (unsigned long)vaddr; |
484 | npages = num_dma_pages(uaddr, size); | 468 | npages = num_dma_pages(uaddr, size); |
485 | 469 | ||
486 | if (translation_enabled(tbl)) | 470 | return iommu_alloc(dev, tbl, vaddr, npages, direction); |
487 | dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); | ||
488 | else | ||
489 | dma_handle = virt_to_bus(vaddr); | ||
490 | |||
491 | return dma_handle; | ||
492 | } | 471 | } |
493 | 472 | ||
494 | static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, | 473 | static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, |
@@ -497,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, | |||
497 | struct iommu_table *tbl = find_iommu_table(dev); | 476 | struct iommu_table *tbl = find_iommu_table(dev); |
498 | unsigned int npages; | 477 | unsigned int npages; |
499 | 478 | ||
500 | if (!translation_enabled(tbl)) | ||
501 | return; | ||
502 | |||
503 | npages = num_dma_pages(dma_handle, size); | 479 | npages = num_dma_pages(dma_handle, size); |
504 | iommu_free(tbl, dma_handle, npages); | 480 | iommu_free(tbl, dma_handle, npages); |
505 | } | 481 | } |
@@ -522,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, | |||
522 | goto error; | 498 | goto error; |
523 | memset(ret, 0, size); | 499 | memset(ret, 0, size); |
524 | 500 | ||
525 | if (translation_enabled(tbl)) { | 501 | /* set up tces to cover the allocated range */ |
526 | /* set up tces to cover the allocated range */ | 502 | mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); |
527 | mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); | 503 | if (mapping == bad_dma_address) |
528 | if (mapping == bad_dma_address) | 504 | goto free; |
529 | goto free; | 505 | *dma_handle = mapping; |
530 | |||
531 | *dma_handle = mapping; | ||
532 | } else /* non translated slot */ | ||
533 | *dma_handle = virt_to_bus(ret); | ||
534 | |||
535 | return ret; | 506 | return ret; |
536 | |||
537 | free: | 507 | free: |
538 | free_pages((unsigned long)ret, get_order(size)); | 508 | free_pages((unsigned long)ret, get_order(size)); |
539 | ret = NULL; | 509 | ret = NULL; |
@@ -541,7 +511,7 @@ error: | |||
541 | return ret; | 511 | return ret; |
542 | } | 512 | } |
543 | 513 | ||
544 | static const struct dma_mapping_ops calgary_dma_ops = { | 514 | static struct dma_mapping_ops calgary_dma_ops = { |
545 | .alloc_coherent = calgary_alloc_coherent, | 515 | .alloc_coherent = calgary_alloc_coherent, |
546 | .map_single = calgary_map_single, | 516 | .map_single = calgary_map_single, |
547 | .unmap_single = calgary_unmap_single, | 517 | .unmap_single = calgary_unmap_single, |
@@ -830,7 +800,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) | |||
830 | 800 | ||
831 | tbl = pci_iommu(dev->bus); | 801 | tbl = pci_iommu(dev->bus); |
832 | tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; | 802 | tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; |
833 | tce_free(tbl, 0, tbl->it_size); | 803 | |
804 | if (is_kdump_kernel()) | ||
805 | calgary_init_bitmap_from_tce_table(tbl); | ||
806 | else | ||
807 | tce_free(tbl, 0, tbl->it_size); | ||
834 | 808 | ||
835 | if (is_calgary(dev->device)) | 809 | if (is_calgary(dev->device)) |
836 | tbl->chip_ops = &calgary_chip_ops; | 810 | tbl->chip_ops = &calgary_chip_ops; |
@@ -1209,6 +1183,10 @@ static int __init calgary_init(void) | |||
1209 | if (ret) | 1183 | if (ret) |
1210 | return ret; | 1184 | return ret; |
1211 | 1185 | ||
1186 | /* Purely for the kdump kernel case */ | ||
1187 | if (is_kdump_kernel()) | ||
1188 | get_tce_space_from_tar(); | ||
1189 | |||
1212 | do { | 1190 | do { |
1213 | dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); | 1191 | dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); |
1214 | if (!dev) | 1192 | if (!dev) |
@@ -1230,6 +1208,16 @@ static int __init calgary_init(void) | |||
1230 | goto error; | 1208 | goto error; |
1231 | } while (1); | 1209 | } while (1); |
1232 | 1210 | ||
1211 | dev = NULL; | ||
1212 | for_each_pci_dev(dev) { | ||
1213 | struct iommu_table *tbl; | ||
1214 | |||
1215 | tbl = find_iommu_table(&dev->dev); | ||
1216 | |||
1217 | if (translation_enabled(tbl)) | ||
1218 | dev->dev.archdata.dma_ops = &calgary_dma_ops; | ||
1219 | } | ||
1220 | |||
1233 | return ret; | 1221 | return ret; |
1234 | 1222 | ||
1235 | error: | 1223 | error: |
@@ -1251,6 +1239,7 @@ error: | |||
1251 | calgary_disable_translation(dev); | 1239 | calgary_disable_translation(dev); |
1252 | calgary_free_bus(dev); | 1240 | calgary_free_bus(dev); |
1253 | pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ | 1241 | pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ |
1242 | dev->dev.archdata.dma_ops = NULL; | ||
1254 | } while (1); | 1243 | } while (1); |
1255 | 1244 | ||
1256 | return ret; | 1245 | return ret; |
@@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) | |||
1339 | return (val != 0xffffffff); | 1328 | return (val != 0xffffffff); |
1340 | } | 1329 | } |
1341 | 1330 | ||
1331 | /* | ||
1332 | * calgary_init_bitmap_from_tce_table(): | ||
1333 | * Function for the kdump case: in the second (kdump) kernel, initialize | ||
1334 | * the bitmap from the TCE table entries obtained from the first kernel | ||
1335 | */ | ||
1336 | static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) | ||
1337 | { | ||
1338 | u64 *tp; | ||
1339 | unsigned int index; | ||
1340 | tp = (u64 *)tbl->it_base; | ||
1341 | for (index = 0; index < tbl->it_size; index++) { | ||
1342 | if (*tp != 0x0) | ||
1343 | set_bit(index, tbl->it_map); | ||
1344 | tp++; | ||
1345 | } | ||
1346 | } | ||
1347 | |||
1348 | /* | ||
1349 | * get_tce_space_from_tar(): | ||
1350 | * Function for the kdump case: get the TCE tables from the first kernel | ||
1351 | * by reading the contents of the base address register of the Calgary IOMMU | ||
1352 | */ | ||
1353 | static void get_tce_space_from_tar(void) | ||
1354 | { | ||
1355 | int bus; | ||
1356 | void __iomem *target; | ||
1357 | unsigned long tce_space; | ||
1358 | |||
1359 | for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { | ||
1360 | struct calgary_bus_info *info = &bus_info[bus]; | ||
1361 | unsigned short pci_device; | ||
1362 | u32 val; | ||
1363 | |||
1364 | val = read_pci_config(bus, 0, 0, 0); | ||
1365 | pci_device = (val & 0xFFFF0000) >> 16; | ||
1366 | |||
1367 | if (!is_cal_pci_dev(pci_device)) | ||
1368 | continue; | ||
1369 | if (info->translation_disabled) | ||
1370 | continue; | ||
1371 | |||
1372 | if (calgary_bus_has_devices(bus, pci_device) || | ||
1373 | translate_empty_slots) { | ||
1374 | target = calgary_reg(bus_info[bus].bbar, | ||
1375 | tar_offset(bus)); | ||
1376 | tce_space = be64_to_cpu(readq(target)); | ||
1377 | tce_space = tce_space & TAR_SW_BITS; | ||
1378 | |||
1379 | tce_space = tce_space & (~specified_table_size); | ||
1380 | info->tce_space = (u64 *)__va(tce_space); | ||
1381 | } | ||
1382 | } | ||
1383 | return; | ||
1384 | } | ||
1385 | |||
1342 | void __init detect_calgary(void) | 1386 | void __init detect_calgary(void) |
1343 | { | 1387 | { |
1344 | int bus; | 1388 | int bus; |
@@ -1394,7 +1438,8 @@ void __init detect_calgary(void) | |||
1394 | return; | 1438 | return; |
1395 | } | 1439 | } |
1396 | 1440 | ||
1397 | specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); | 1441 | specified_table_size = determine_tce_table_size((is_kdump_kernel() ? |
1442 | saved_max_pfn : max_pfn) * PAGE_SIZE); | ||
1398 | 1443 | ||
1399 | for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { | 1444 | for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { |
1400 | struct calgary_bus_info *info = &bus_info[bus]; | 1445 | struct calgary_bus_info *info = &bus_info[bus]; |
@@ -1412,10 +1457,16 @@ void __init detect_calgary(void) | |||
1412 | 1457 | ||
1413 | if (calgary_bus_has_devices(bus, pci_device) || | 1458 | if (calgary_bus_has_devices(bus, pci_device) || |
1414 | translate_empty_slots) { | 1459 | translate_empty_slots) { |
1415 | tbl = alloc_tce_table(); | 1460 | /* |
1416 | if (!tbl) | 1461 | * If this is a kdump kernel, find and use the TCE tables
1417 | goto cleanup; | 1462 | * from the first kernel; otherwise allocate TCE tables here
1418 | info->tce_space = tbl; | 1463 | */ |
1464 | if (!is_kdump_kernel()) { | ||
1465 | tbl = alloc_tce_table(); | ||
1466 | if (!tbl) | ||
1467 | goto cleanup; | ||
1468 | info->tce_space = tbl; | ||
1469 | } | ||
1419 | calgary_found = 1; | 1470 | calgary_found = 1; |
1420 | } | 1471 | } |
1421 | } | 1472 | } |
@@ -1430,6 +1481,10 @@ void __init detect_calgary(void) | |||
1430 | printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " | 1481 | printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " |
1431 | "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, | 1482 | "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, |
1432 | debugging ? "enabled" : "disabled"); | 1483 | debugging ? "enabled" : "disabled"); |
1484 | |||
1485 | /* swiotlb for devices that aren't behind the Calgary. */ | ||
1486 | if (max_pfn > MAX_DMA32_PFN) | ||
1487 | swiotlb = 1; | ||
1433 | } | 1488 | } |
1434 | return; | 1489 | return; |
1435 | 1490 | ||
@@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void) | |||
1446 | { | 1501 | { |
1447 | int ret; | 1502 | int ret; |
1448 | 1503 | ||
1449 | if (no_iommu || swiotlb) | 1504 | if (no_iommu || (swiotlb && !calgary_detected)) |
1450 | return -ENODEV; | 1505 | return -ENODEV; |
1451 | 1506 | ||
1452 | if (!calgary_detected) | 1507 | if (!calgary_detected) |
@@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void) | |||
1459 | if (ret) { | 1514 | if (ret) { |
1460 | printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " | 1515 | printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " |
1461 | "falling back to no_iommu\n", ret); | 1516 | "falling back to no_iommu\n", ret); |
1462 | if (end_pfn > MAX_DMA32_PFN) | ||
1463 | printk(KERN_ERR "WARNING more than 4GB of memory, " | ||
1464 | "32bit PCI may malfunction.\n"); | ||
1465 | return ret; | 1517 | return ret; |
1466 | } | 1518 | } |
1467 | 1519 | ||
1468 | force_iommu = 1; | 1520 | force_iommu = 1; |
1469 | bad_dma_address = 0x0; | 1521 | bad_dma_address = 0x0; |
1470 | dma_ops = &calgary_dma_ops; | 1522 | /* dma_ops is set to swiotlb or nommu */ |
1523 | if (!dma_ops) | ||
1524 | dma_ops = &nommu_dma_ops; | ||
1471 | 1525 | ||
1472 | return 0; | 1526 | return 0; |
1473 | } | 1527 | } |
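
The Calgary changes drop the translation_enabled() checks from every fast path by moving from one global dma_ops to per-device ops: calgary_init() installs calgary_dma_ops only on devices behind translated slots, and everything else falls through to swiotlb or nommu. A sketch of how the lookup is expected to resolve (assuming a get_dma_ops() helper of this shape in the dma-mapping header):

	static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
	{
		/* per-device ops, set by calgary_init() above, win over the global */
		if (dev && dev->archdata.dma_ops)
			return dev->archdata.dma_ops;
		return dma_ops;
	}
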
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index dc00a1331ace..8dbffb846de9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -5,13 +5,13 @@ | |||
5 | 5 | ||
6 | #include <asm/proto.h> | 6 | #include <asm/proto.h> |
7 | #include <asm/dma.h> | 7 | #include <asm/dma.h> |
8 | #include <asm/gart.h> | 8 | #include <asm/iommu.h> |
9 | #include <asm/calgary.h> | 9 | #include <asm/calgary.h> |
10 | #include <asm/amd_iommu.h> | ||
10 | 11 | ||
11 | int forbid_dac __read_mostly; | 12 | static int forbid_dac __read_mostly; |
12 | EXPORT_SYMBOL(forbid_dac); | ||
13 | 13 | ||
14 | const struct dma_mapping_ops *dma_ops; | 14 | struct dma_mapping_ops *dma_ops; |
15 | EXPORT_SYMBOL(dma_ops); | 15 | EXPORT_SYMBOL(dma_ops); |
16 | 16 | ||
17 | static int iommu_sac_force __read_mostly; | 17 | static int iommu_sac_force __read_mostly; |
@@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_size_opt); | |||
74 | void __init dma32_reserve_bootmem(void) | 74 | void __init dma32_reserve_bootmem(void) |
75 | { | 75 | { |
76 | unsigned long size, align; | 76 | unsigned long size, align; |
77 | if (end_pfn <= MAX_DMA32_PFN) | 77 | if (max_pfn <= MAX_DMA32_PFN) |
78 | return; | 78 | return; |
79 | 79 | ||
80 | /* | ||
81 | * check aperture_64.c allocate_aperture() for reason about | ||
82 | * using 512M as goal | ||
83 | */ | ||
80 | align = 64ULL<<20; | 84 | align = 64ULL<<20; |
81 | size = round_up(dma32_bootmem_size, align); | 85 | size = round_up(dma32_bootmem_size, align); |
82 | dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, | 86 | dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, |
83 | __pa(MAX_DMA_ADDRESS)); | 87 | 512ULL<<20); |
84 | if (dma32_bootmem_ptr) | 88 | if (dma32_bootmem_ptr) |
85 | dma32_bootmem_size = size; | 89 | dma32_bootmem_size = size; |
86 | else | 90 | else |
@@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void) | |||
88 | } | 92 | } |
89 | static void __init dma32_free_bootmem(void) | 93 | static void __init dma32_free_bootmem(void) |
90 | { | 94 | { |
91 | int node; | ||
92 | 95 | ||
93 | if (end_pfn <= MAX_DMA32_PFN) | 96 | if (max_pfn <= MAX_DMA32_PFN) |
94 | return; | 97 | return; |
95 | 98 | ||
96 | if (!dma32_bootmem_ptr) | 99 | if (!dma32_bootmem_ptr) |
97 | return; | 100 | return; |
98 | 101 | ||
99 | for_each_online_node(node) | 102 | free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size); |
100 | free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr), | ||
101 | dma32_bootmem_size); | ||
102 | 103 | ||
103 | dma32_bootmem_ptr = NULL; | 104 | dma32_bootmem_ptr = NULL; |
104 | dma32_bootmem_size = 0; | 105 | dma32_bootmem_size = 0; |
@@ -112,19 +113,15 @@ void __init pci_iommu_alloc(void) | |||
112 | * The order of these functions is important for | 113 | * The order of these functions is important for |
113 | * fall-back/fail-over reasons | 114 | * fall-back/fail-over reasons |
114 | */ | 115 | */ |
115 | #ifdef CONFIG_GART_IOMMU | ||
116 | gart_iommu_hole_init(); | 116 | gart_iommu_hole_init(); |
117 | #endif | ||
118 | 117 | ||
119 | #ifdef CONFIG_CALGARY_IOMMU | ||
120 | detect_calgary(); | 118 | detect_calgary(); |
121 | #endif | ||
122 | 119 | ||
123 | detect_intel_iommu(); | 120 | detect_intel_iommu(); |
124 | 121 | ||
125 | #ifdef CONFIG_SWIOTLB | 122 | amd_iommu_detect(); |
123 | |||
126 | pci_swiotlb_init(); | 124 | pci_swiotlb_init(); |
127 | #endif | ||
128 | } | 125 | } |
129 | #endif | 126 | #endif |
130 | 127 | ||
@@ -180,9 +177,7 @@ static __init int iommu_setup(char *p) | |||
180 | swiotlb = 1; | 177 | swiotlb = 1; |
181 | #endif | 178 | #endif |
182 | 179 | ||
183 | #ifdef CONFIG_GART_IOMMU | ||
184 | gart_parse_options(p); | 180 | gart_parse_options(p); |
185 | #endif | ||
186 | 181 | ||
187 | #ifdef CONFIG_CALGARY_IOMMU | 182 | #ifdef CONFIG_CALGARY_IOMMU |
188 | if (!strncmp(p, "calgary", 7)) | 183 | if (!strncmp(p, "calgary", 7)) |
@@ -197,136 +192,19 @@ static __init int iommu_setup(char *p) | |||
197 | } | 192 | } |
198 | early_param("iommu", iommu_setup); | 193 | early_param("iommu", iommu_setup); |
199 | 194 | ||
200 | #ifdef CONFIG_X86_32 | ||
201 | int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, | ||
202 | dma_addr_t device_addr, size_t size, int flags) | ||
203 | { | ||
204 | void __iomem *mem_base = NULL; | ||
205 | int pages = size >> PAGE_SHIFT; | ||
206 | int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); | ||
207 | |||
208 | if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) | ||
209 | goto out; | ||
210 | if (!size) | ||
211 | goto out; | ||
212 | if (dev->dma_mem) | ||
213 | goto out; | ||
214 | |||
215 | /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ | ||
216 | |||
217 | mem_base = ioremap(bus_addr, size); | ||
218 | if (!mem_base) | ||
219 | goto out; | ||
220 | |||
221 | dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); | ||
222 | if (!dev->dma_mem) | ||
223 | goto out; | ||
224 | dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); | ||
225 | if (!dev->dma_mem->bitmap) | ||
226 | goto free1_out; | ||
227 | |||
228 | dev->dma_mem->virt_base = mem_base; | ||
229 | dev->dma_mem->device_base = device_addr; | ||
230 | dev->dma_mem->size = pages; | ||
231 | dev->dma_mem->flags = flags; | ||
232 | |||
233 | if (flags & DMA_MEMORY_MAP) | ||
234 | return DMA_MEMORY_MAP; | ||
235 | |||
236 | return DMA_MEMORY_IO; | ||
237 | |||
238 | free1_out: | ||
239 | kfree(dev->dma_mem); | ||
240 | out: | ||
241 | if (mem_base) | ||
242 | iounmap(mem_base); | ||
243 | return 0; | ||
244 | } | ||
245 | EXPORT_SYMBOL(dma_declare_coherent_memory); | ||
246 | |||
247 | void dma_release_declared_memory(struct device *dev) | ||
248 | { | ||
249 | struct dma_coherent_mem *mem = dev->dma_mem; | ||
250 | |||
251 | if (!mem) | ||
252 | return; | ||
253 | dev->dma_mem = NULL; | ||
254 | iounmap(mem->virt_base); | ||
255 | kfree(mem->bitmap); | ||
256 | kfree(mem); | ||
257 | } | ||
258 | EXPORT_SYMBOL(dma_release_declared_memory); | ||
259 | |||
260 | void *dma_mark_declared_memory_occupied(struct device *dev, | ||
261 | dma_addr_t device_addr, size_t size) | ||
262 | { | ||
263 | struct dma_coherent_mem *mem = dev->dma_mem; | ||
264 | int pos, err; | ||
265 | int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); | ||
266 | |||
267 | pages >>= PAGE_SHIFT; | ||
268 | |||
269 | if (!mem) | ||
270 | return ERR_PTR(-EINVAL); | ||
271 | |||
272 | pos = (device_addr - mem->device_base) >> PAGE_SHIFT; | ||
273 | err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); | ||
274 | if (err != 0) | ||
275 | return ERR_PTR(err); | ||
276 | return mem->virt_base + (pos << PAGE_SHIFT); | ||
277 | } | ||
278 | EXPORT_SYMBOL(dma_mark_declared_memory_occupied); | ||
279 | |||
280 | static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, | ||
281 | dma_addr_t *dma_handle, void **ret) | ||
282 | { | ||
283 | struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | ||
284 | int order = get_order(size); | ||
285 | |||
286 | if (mem) { | ||
287 | int page = bitmap_find_free_region(mem->bitmap, mem->size, | ||
288 | order); | ||
289 | if (page >= 0) { | ||
290 | *dma_handle = mem->device_base + (page << PAGE_SHIFT); | ||
291 | *ret = mem->virt_base + (page << PAGE_SHIFT); | ||
292 | memset(*ret, 0, size); | ||
293 | } | ||
294 | if (mem->flags & DMA_MEMORY_EXCLUSIVE) | ||
295 | *ret = NULL; | ||
296 | } | ||
297 | return (mem != NULL); | ||
298 | } | ||
299 | |||
300 | static int dma_release_coherent(struct device *dev, int order, void *vaddr) | ||
301 | { | ||
302 | struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | ||
303 | |||
304 | if (mem && vaddr >= mem->virt_base && vaddr < | ||
305 | (mem->virt_base + (mem->size << PAGE_SHIFT))) { | ||
306 | int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; | ||
307 | |||
308 | bitmap_release_region(mem->bitmap, page, order); | ||
309 | return 1; | ||
310 | } | ||
311 | return 0; | ||
312 | } | ||
313 | #else | ||
314 | #define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) | ||
315 | #define dma_release_coherent(dev, order, vaddr) (0) | ||
316 | #endif /* CONFIG_X86_32 */ | ||
317 | |||
318 | int dma_supported(struct device *dev, u64 mask) | 195 | int dma_supported(struct device *dev, u64 mask) |
319 | { | 196 | { |
197 | struct dma_mapping_ops *ops = get_dma_ops(dev); | ||
198 | |||
320 | #ifdef CONFIG_PCI | 199 | #ifdef CONFIG_PCI |
321 | if (mask > 0xffffffff && forbid_dac > 0) { | 200 | if (mask > 0xffffffff && forbid_dac > 0) { |
322 | printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", | 201 | dev_info(dev, "PCI: Disallowing DAC for device\n"); |
323 | dev->bus_id); | ||
324 | return 0; | 202 | return 0; |
325 | } | 203 | } |
326 | #endif | 204 | #endif |
327 | 205 | ||
328 | if (dma_ops->dma_supported) | 206 | if (ops->dma_supported) |
329 | return dma_ops->dma_supported(dev, mask); | 207 | return ops->dma_supported(dev, mask); |
330 | 208 | ||
331 | /* Copied from i386. Doesn't make much sense, because it will | 209 | /* Copied from i386. Doesn't make much sense, because it will |
332 | only work for pci_alloc_coherent. | 210 | only work for pci_alloc_coherent. |
@@ -347,8 +225,7 @@ int dma_supported(struct device *dev, u64 mask) | |||
347 | type. Normally this doesn't make any difference, but gives | 225 | type. Normally this doesn't make any difference, but gives |
348 | more gentle handling of IOMMU overflow. */ | 226 | more gentle handling of IOMMU overflow. */ |
349 | if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { | 227 | if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { |
350 | printk(KERN_INFO "%s: Force SAC with mask %Lx\n", | 228 | dev_info(dev, "Force SAC with mask %Lx\n", mask); |
351 | dev->bus_id, mask); | ||
352 | return 0; | 229 | return 0; |
353 | } | 230 | } |
354 | 231 | ||
@@ -357,7 +234,7 @@ int dma_supported(struct device *dev, u64 mask) | |||
357 | EXPORT_SYMBOL(dma_supported); | 234 | EXPORT_SYMBOL(dma_supported); |
358 | 235 | ||
359 | /* Allocate DMA memory on node near device */ | 236 | /* Allocate DMA memory on node near device */ |
360 | noinline struct page * | 237 | static noinline struct page * |
361 | dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) | 238 | dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) |
362 | { | 239 | { |
363 | int node; | 240 | int node; |
@@ -374,6 +251,7 @@ void * | |||
374 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | 251 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, |
375 | gfp_t gfp) | 252 | gfp_t gfp) |
376 | { | 253 | { |
254 | struct dma_mapping_ops *ops = get_dma_ops(dev); | ||
377 | void *memory = NULL; | 255 | void *memory = NULL; |
378 | struct page *page; | 256 | struct page *page; |
379 | unsigned long dma_mask = 0; | 257 | unsigned long dma_mask = 0; |
@@ -383,7 +261,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | |||
383 | /* ignore region specifiers */ | 261 | /* ignore region specifiers */ |
384 | gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); | 262 | gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); |
385 | 263 | ||
386 | if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) | 264 | if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) |
387 | return memory; | 265 | return memory; |
388 | 266 | ||
389 | if (!dev) { | 267 | if (!dev) { |
@@ -442,8 +320,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | |||
442 | /* Let low level make its own zone decisions */ | 320 | /* Let low level make its own zone decisions */ |
443 | gfp &= ~(GFP_DMA32|GFP_DMA); | 321 | gfp &= ~(GFP_DMA32|GFP_DMA); |
444 | 322 | ||
445 | if (dma_ops->alloc_coherent) | 323 | if (ops->alloc_coherent) |
446 | return dma_ops->alloc_coherent(dev, size, | 324 | return ops->alloc_coherent(dev, size, |
447 | dma_handle, gfp); | 325 | dma_handle, gfp); |
448 | return NULL; | 326 | return NULL; |
449 | } | 327 | } |
@@ -455,14 +333,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | |||
455 | } | 333 | } |
456 | } | 334 | } |
457 | 335 | ||
458 | if (dma_ops->alloc_coherent) { | 336 | if (ops->alloc_coherent) { |
459 | free_pages((unsigned long)memory, get_order(size)); | 337 | free_pages((unsigned long)memory, get_order(size)); |
460 | gfp &= ~(GFP_DMA|GFP_DMA32); | 338 | gfp &= ~(GFP_DMA|GFP_DMA32); |
461 | return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); | 339 | return ops->alloc_coherent(dev, size, dma_handle, gfp); |
462 | } | 340 | } |
463 | 341 | ||
464 | if (dma_ops->map_simple) { | 342 | if (ops->map_simple) { |
465 | *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), | 343 | *dma_handle = ops->map_simple(dev, virt_to_phys(memory), |
466 | size, | 344 | size, |
467 | PCI_DMA_BIDIRECTIONAL); | 345 | PCI_DMA_BIDIRECTIONAL); |
468 | if (*dma_handle != bad_dma_address) | 346 | if (*dma_handle != bad_dma_address) |
@@ -484,27 +362,27 @@ EXPORT_SYMBOL(dma_alloc_coherent); | |||
484 | void dma_free_coherent(struct device *dev, size_t size, | 362 | void dma_free_coherent(struct device *dev, size_t size, |
485 | void *vaddr, dma_addr_t bus) | 363 | void *vaddr, dma_addr_t bus) |
486 | { | 364 | { |
365 | struct dma_mapping_ops *ops = get_dma_ops(dev); | ||
366 | |||
487 | int order = get_order(size); | 367 | int order = get_order(size); |
488 | WARN_ON(irqs_disabled()); /* for portability */ | 368 | WARN_ON(irqs_disabled()); /* for portability */ |
489 | if (dma_release_coherent(dev, order, vaddr)) | 369 | if (dma_release_from_coherent(dev, order, vaddr)) |
490 | return; | 370 | return; |
491 | if (dma_ops->unmap_single) | 371 | if (ops->unmap_single) |
492 | dma_ops->unmap_single(dev, bus, size, 0); | 372 | ops->unmap_single(dev, bus, size, 0); |
493 | free_pages((unsigned long)vaddr, order); | 373 | free_pages((unsigned long)vaddr, order); |
494 | } | 374 | } |
495 | EXPORT_SYMBOL(dma_free_coherent); | 375 | EXPORT_SYMBOL(dma_free_coherent); |
496 | 376 | ||
497 | static int __init pci_iommu_init(void) | 377 | static int __init pci_iommu_init(void) |
498 | { | 378 | { |
499 | #ifdef CONFIG_CALGARY_IOMMU | ||
500 | calgary_iommu_init(); | 379 | calgary_iommu_init(); |
501 | #endif | ||
502 | 380 | ||
503 | intel_iommu_init(); | 381 | intel_iommu_init(); |
504 | 382 | ||
505 | #ifdef CONFIG_GART_IOMMU | 383 | amd_iommu_init(); |
384 | |||
506 | gart_iommu_init(); | 385 | gart_iommu_init(); |
507 | #endif | ||
508 | 386 | ||
509 | no_iommu_init(); | 387 | no_iommu_init(); |
510 | return 0; | 388 | return 0; |
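
Dropping the #ifdef blocks from pci_iommu_alloc() and pci_iommu_init() works because each detect/init routine now has an empty inline stub when its IOMMU is configured out, so the call order alone encodes the fall-back priority. The stub pattern, sketched for detect_calgary() (the exact stub bodies are an assumption):

	/* in asm/calgary.h, roughly */
	#ifdef CONFIG_CALGARY_IOMMU
	extern void detect_calgary(void);
	#else
	static inline void detect_calgary(void) { }
	#endif
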
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index aa8ec928caa8..49285f8fd4d5 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <asm/mtrr.h> | 32 | #include <asm/mtrr.h> |
33 | #include <asm/pgtable.h> | 33 | #include <asm/pgtable.h> |
34 | #include <asm/proto.h> | 34 | #include <asm/proto.h> |
35 | #include <asm/iommu.h> | ||
35 | #include <asm/gart.h> | 36 | #include <asm/gart.h> |
36 | #include <asm/cacheflush.h> | 37 | #include <asm/cacheflush.h> |
37 | #include <asm/swiotlb.h> | 38 | #include <asm/swiotlb.h> |
@@ -66,9 +67,6 @@ static u32 gart_unmapped_entry; | |||
66 | (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) | 67 | (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) |
67 | #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) | 68 | #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) |
68 | 69 | ||
69 | #define to_pages(addr, size) \ | ||
70 | (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) | ||
71 | |||
72 | #define EMERGENCY_PAGES 32 /* = 128KB */ | 70 | #define EMERGENCY_PAGES 32 /* = 128KB */ |
73 | 71 | ||
74 | #ifdef CONFIG_AGP | 72 | #ifdef CONFIG_AGP |
@@ -104,7 +102,6 @@ static unsigned long alloc_iommu(struct device *dev, int size) | |||
104 | size, base_index, boundary_size, 0); | 102 | size, base_index, boundary_size, 0); |
105 | } | 103 | } |
106 | if (offset != -1) { | 104 | if (offset != -1) { |
107 | set_bit_string(iommu_gart_bitmap, offset, size); | ||
108 | next_bit = offset+size; | 105 | next_bit = offset+size; |
109 | if (next_bit >= iommu_pages) { | 106 | if (next_bit >= iommu_pages) { |
110 | next_bit = 0; | 107 | next_bit = 0; |
@@ -198,9 +195,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) | |||
198 | * out. Hopefully no network devices use single mappings that big. | 195 | * out. Hopefully no network devices use single mappings that big. |
199 | */ | 196 | */ |
200 | 197 | ||
201 | printk(KERN_ERR | 198 | dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); |
202 | "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", | ||
203 | size, dev->bus_id); | ||
204 | 199 | ||
205 | if (size > PAGE_SIZE*EMERGENCY_PAGES) { | 200 | if (size > PAGE_SIZE*EMERGENCY_PAGES) { |
206 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | 201 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) |
@@ -243,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | |||
243 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | 238 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, |
244 | size_t size, int dir) | 239 | size_t size, int dir) |
245 | { | 240 | { |
246 | unsigned long npages = to_pages(phys_mem, size); | 241 | unsigned long npages = iommu_num_pages(phys_mem, size); |
247 | unsigned long iommu_page = alloc_iommu(dev, npages); | 242 | unsigned long iommu_page = alloc_iommu(dev, npages); |
248 | int i; | 243 | int i; |
249 | 244 | ||
@@ -306,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, | |||
306 | return; | 301 | return; |
307 | 302 | ||
308 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | 303 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; |
309 | npages = to_pages(dma_addr, size); | 304 | npages = iommu_num_pages(dma_addr, size); |
310 | for (i = 0; i < npages; i++) { | 305 | for (i = 0; i < npages; i++) { |
311 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | 306 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; |
312 | CLEAR_LEAK(iommu_page + i); | 307 | CLEAR_LEAK(iommu_page + i); |
@@ -389,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, | |||
389 | } | 384 | } |
390 | 385 | ||
391 | addr = phys_addr; | 386 | addr = phys_addr; |
392 | pages = to_pages(s->offset, s->length); | 387 | pages = iommu_num_pages(s->offset, s->length); |
393 | while (pages--) { | 388 | while (pages--) { |
394 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | 389 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); |
395 | SET_LEAK(iommu_page); | 390 | SET_LEAK(iommu_page); |
@@ -472,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | |||
472 | 467 | ||
473 | seg_size += s->length; | 468 | seg_size += s->length; |
474 | need = nextneed; | 469 | need = nextneed; |
475 | pages += to_pages(s->offset, s->length); | 470 | pages += iommu_num_pages(s->offset, s->length); |
476 | ps = s; | 471 | ps = s; |
477 | } | 472 | } |
478 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) | 473 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) |
@@ -534,8 +529,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) | |||
534 | unsigned aper_size = 0, aper_base_32, aper_order; | 529 | unsigned aper_size = 0, aper_base_32, aper_order; |
535 | u64 aper_base; | 530 | u64 aper_base; |
536 | 531 | ||
537 | pci_read_config_dword(dev, 0x94, &aper_base_32); | 532 | pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); |
538 | pci_read_config_dword(dev, 0x90, &aper_order); | 533 | pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); |
539 | aper_order = (aper_order >> 1) & 7; | 534 | aper_order = (aper_order >> 1) & 7; |
540 | 535 | ||
541 | aper_base = aper_base_32 & 0x7fff; | 536 | aper_base = aper_base_32 & 0x7fff; |
@@ -549,14 +544,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) | |||
549 | return aper_base; | 544 | return aper_base; |
550 | } | 545 | } |
551 | 546 | ||
547 | static void enable_gart_translations(void) | ||
548 | { | ||
549 | int i; | ||
550 | |||
551 | for (i = 0; i < num_k8_northbridges; i++) { | ||
552 | struct pci_dev *dev = k8_northbridges[i]; | ||
553 | |||
554 | enable_gart_translation(dev, __pa(agp_gatt_table)); | ||
555 | } | ||
556 | } | ||
557 | |||
558 | /* | ||
559 | * If fix_up_north_bridges is set, the north bridges have to be fixed up on | ||
560 | * resume in the same way as they are handled in gart_iommu_hole_init(). | ||
561 | */ | ||
562 | static bool fix_up_north_bridges; | ||
563 | static u32 aperture_order; | ||
564 | static u32 aperture_alloc; | ||
565 | |||
566 | void set_up_gart_resume(u32 aper_order, u32 aper_alloc) | ||
567 | { | ||
568 | fix_up_north_bridges = true; | ||
569 | aperture_order = aper_order; | ||
570 | aperture_alloc = aper_alloc; | ||
571 | } | ||
572 | |||
552 | static int gart_resume(struct sys_device *dev) | 573 | static int gart_resume(struct sys_device *dev) |
553 | { | 574 | { |
575 | printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); | ||
576 | |||
577 | if (fix_up_north_bridges) { | ||
578 | int i; | ||
579 | |||
580 | printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); | ||
581 | |||
582 | for (i = 0; i < num_k8_northbridges; i++) { | ||
583 | struct pci_dev *dev = k8_northbridges[i]; | ||
584 | |||
585 | /* | ||
586 | * Don't enable translations just yet. That is the next | ||
587 | * step. Restore the pre-suspend aperture settings. | ||
588 | */ | ||
589 | pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, | ||
590 | aperture_order << 1); | ||
591 | pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, | ||
592 | aperture_alloc >> 25); | ||
593 | } | ||
594 | } | ||
595 | |||
596 | enable_gart_translations(); | ||
597 | |||
554 | return 0; | 598 | return 0; |
555 | } | 599 | } |
556 | 600 | ||
557 | static int gart_suspend(struct sys_device *dev, pm_message_t state) | 601 | static int gart_suspend(struct sys_device *dev, pm_message_t state) |
558 | { | 602 | { |
559 | return -EINVAL; | 603 | return 0; |
560 | } | 604 | } |
561 | 605 | ||
562 | static struct sysdev_class gart_sysdev_class = { | 606 | static struct sysdev_class gart_sysdev_class = { |
@@ -582,6 +626,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
582 | struct pci_dev *dev; | 626 | struct pci_dev *dev; |
583 | void *gatt; | 627 | void *gatt; |
584 | int i, error; | 628 | int i, error; |
629 | unsigned long start_pfn, end_pfn; | ||
585 | 630 | ||
586 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); | 631 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); |
587 | aper_size = aper_base = info->aper_size = 0; | 632 | aper_size = aper_base = info->aper_size = 0; |
@@ -614,31 +659,25 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
614 | memset(gatt, 0, gatt_size); | 659 | memset(gatt, 0, gatt_size); |
615 | agp_gatt_table = gatt; | 660 | agp_gatt_table = gatt; |
616 | 661 | ||
617 | for (i = 0; i < num_k8_northbridges; i++) { | 662 | enable_gart_translations(); |
618 | u32 gatt_reg; | ||
619 | u32 ctl; | ||
620 | |||
621 | dev = k8_northbridges[i]; | ||
622 | gatt_reg = __pa(gatt) >> 12; | ||
623 | gatt_reg <<= 4; | ||
624 | pci_write_config_dword(dev, 0x98, gatt_reg); | ||
625 | pci_read_config_dword(dev, 0x90, &ctl); | ||
626 | |||
627 | ctl |= 1; | ||
628 | ctl &= ~((1<<4) | (1<<5)); | ||
629 | |||
630 | pci_write_config_dword(dev, 0x90, ctl); | ||
631 | } | ||
632 | 663 | ||
633 | error = sysdev_class_register(&gart_sysdev_class); | 664 | error = sysdev_class_register(&gart_sysdev_class); |
634 | if (!error) | 665 | if (!error) |
635 | error = sysdev_register(&device_gart); | 666 | error = sysdev_register(&device_gart); |
636 | if (error) | 667 | if (error) |
637 | panic("Could not register gart_sysdev -- would corrupt data on next suspend"); | 668 | panic("Could not register gart_sysdev -- would corrupt data on next suspend"); |
669 | |||
638 | flush_gart(); | 670 | flush_gart(); |
639 | 671 | ||
640 | printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", | 672 | printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", |
641 | aper_base, aper_size>>10); | 673 | aper_base, aper_size>>10); |
674 | |||
675 | /* need to map that range */ | ||
676 | end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); | ||
677 | if (end_pfn > max_low_pfn_mapped) { | ||
678 | start_pfn = (aper_base>>PAGE_SHIFT); | ||
679 | init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
680 | } | ||
642 | return 0; | 681 | return 0; |
643 | 682 | ||
644 | nommu: | 683 | nommu: |
@@ -650,8 +689,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
650 | 689 | ||
651 | extern int agp_amd64_init(void); | 690 | extern int agp_amd64_init(void); |
652 | 691 | ||
653 | static const struct dma_mapping_ops gart_dma_ops = { | 692 | static struct dma_mapping_ops gart_dma_ops = { |
654 | .mapping_error = NULL, | ||
655 | .map_single = gart_map_single, | 693 | .map_single = gart_map_single, |
656 | .map_simple = gart_map_simple, | 694 | .map_simple = gart_map_simple, |
657 | .unmap_single = gart_unmap_single, | 695 | .unmap_single = gart_unmap_single, |
@@ -677,11 +715,11 @@ void gart_iommu_shutdown(void) | |||
677 | u32 ctl; | 715 | u32 ctl; |
678 | 716 | ||
679 | dev = k8_northbridges[i]; | 717 | dev = k8_northbridges[i]; |
680 | pci_read_config_dword(dev, 0x90, &ctl); | 718 | pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); |
681 | 719 | ||
682 | ctl &= ~1; | 720 | ctl &= ~GARTEN; |
683 | 721 | ||
684 | pci_write_config_dword(dev, 0x90, ctl); | 722 | pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); |
685 | } | 723 | } |
686 | } | 724 | } |
687 | 725 | ||
@@ -716,10 +754,10 @@ void __init gart_iommu_init(void) | |||
716 | return; | 754 | return; |
717 | 755 | ||
718 | if (no_iommu || | 756 | if (no_iommu || |
719 | (!force_iommu && end_pfn <= MAX_DMA32_PFN) || | 757 | (!force_iommu && max_pfn <= MAX_DMA32_PFN) || |
720 | !gart_iommu_aperture || | 758 | !gart_iommu_aperture || |
721 | (no_agp && init_k8_gatt(&info) < 0)) { | 759 | (no_agp && init_k8_gatt(&info) < 0)) { |
722 | if (end_pfn > MAX_DMA32_PFN) { | 760 | if (max_pfn > MAX_DMA32_PFN) { |
723 | printk(KERN_WARNING "More than 4GB of memory " | 761 | printk(KERN_WARNING "More than 4GB of memory " |
724 | "but GART IOMMU not available.\n" | 762 | "but GART IOMMU not available.\n" |
725 | KERN_WARNING "falling back to iommu=soft.\n"); | 763 | KERN_WARNING "falling back to iommu=soft.\n"); |
@@ -788,10 +826,10 @@ void __init gart_iommu_init(void) | |||
788 | wbinvd(); | 826 | wbinvd(); |
789 | 827 | ||
790 | /* | 828 | /* |
791 | * Try to work around a bug (thanks to BenH) | 829 | * Try to work around a bug (thanks to BenH): |
792 | * Set unmapped entries to a scratch page instead of 0. | 830 | * Set unmapped entries to a scratch page instead of 0. |
793 | * Any prefetches that hit unmapped entries won't get a bus abort | 831 | * Any prefetches that hit unmapped entries won't get a bus abort |
794 | * then. | 832 | * then. (P2P bridge may be prefetching on DMA reads). |
795 | */ | 833 | */ |
796 | scratch = get_zeroed_page(GFP_KERNEL); | 834 | scratch = get_zeroed_page(GFP_KERNEL); |
797 | if (!scratch) | 835 | if (!scratch) |
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index aec43d56f49c..3f91f71cdc3e 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c | |||
@@ -7,7 +7,7 @@ | |||
7 | #include <linux/dma-mapping.h> | 7 | #include <linux/dma-mapping.h> |
8 | #include <linux/scatterlist.h> | 8 | #include <linux/scatterlist.h> |
9 | 9 | ||
10 | #include <asm/gart.h> | 10 | #include <asm/iommu.h> |
11 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
12 | #include <asm/dma.h> | 12 | #include <asm/dma.h> |
13 | 13 | ||
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, | |||
72 | return nents; | 72 | return nents; |
73 | } | 73 | } |
74 | 74 | ||
75 | /* Make sure we keep the same behaviour */ | 75 | struct dma_mapping_ops nommu_dma_ops = { |
76 | static int nommu_mapping_error(dma_addr_t dma_addr) | ||
77 | { | ||
78 | #ifdef CONFIG_X86_32 | ||
79 | return 0; | ||
80 | #else | ||
81 | return (dma_addr == bad_dma_address); | ||
82 | #endif | ||
83 | } | ||
84 | |||
85 | |||
86 | const struct dma_mapping_ops nommu_dma_ops = { | ||
87 | .map_single = nommu_map_single, | 76 | .map_single = nommu_map_single, |
88 | .map_sg = nommu_map_sg, | 77 | .map_sg = nommu_map_sg, |
89 | .mapping_error = nommu_mapping_error, | ||
90 | .is_phys = 1, | 78 | .is_phys = 1, |
91 | }; | 79 | }; |
92 | 80 | ||
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 490da7f4b8d0..c4ce0332759e 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c | |||
@@ -5,7 +5,7 @@ | |||
5 | #include <linux/module.h> | 5 | #include <linux/module.h> |
6 | #include <linux/dma-mapping.h> | 6 | #include <linux/dma-mapping.h> |
7 | 7 | ||
8 | #include <asm/gart.h> | 8 | #include <asm/iommu.h> |
9 | #include <asm/swiotlb.h> | 9 | #include <asm/swiotlb.h> |
10 | #include <asm/dma.h> | 10 | #include <asm/dma.h> |
11 | 11 | ||
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, | |||
18 | return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); | 18 | return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); |
19 | } | 19 | } |
20 | 20 | ||
21 | const struct dma_mapping_ops swiotlb_dma_ops = { | 21 | struct dma_mapping_ops swiotlb_dma_ops = { |
22 | .mapping_error = swiotlb_dma_mapping_error, | 22 | .mapping_error = swiotlb_dma_mapping_error, |
23 | .alloc_coherent = swiotlb_alloc_coherent, | 23 | .alloc_coherent = swiotlb_alloc_coherent, |
24 | .free_coherent = swiotlb_free_coherent, | 24 | .free_coherent = swiotlb_free_coherent, |
@@ -38,7 +38,7 @@ const struct dma_mapping_ops swiotlb_dma_ops = { | |||
38 | void __init pci_swiotlb_init(void) | 38 | void __init pci_swiotlb_init(void) |
39 | { | 39 | { |
40 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ | 40 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ |
41 | if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) | 41 | if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) |
42 | swiotlb = 1; | 42 | swiotlb = 1; |
43 | if (swiotlb_force) | 43 | if (swiotlb_force) |
44 | swiotlb = 1; | 44 | swiotlb = 1; |
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c new file mode 100644 index 000000000000..675a48c404a5 --- /dev/null +++ b/arch/x86/kernel/probe_roms_32.c | |||
@@ -0,0 +1,166 @@ | |||
1 | #include <linux/sched.h> | ||
2 | #include <linux/mm.h> | ||
3 | #include <linux/uaccess.h> | ||
4 | #include <linux/mmzone.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/seq_file.h> | ||
7 | #include <linux/console.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/edd.h> | ||
10 | #include <linux/dmi.h> | ||
11 | #include <linux/pfn.h> | ||
12 | #include <linux/pci.h> | ||
13 | #include <asm/pci-direct.h> | ||
14 | |||
15 | |||
16 | #include <asm/e820.h> | ||
17 | #include <asm/mmzone.h> | ||
18 | #include <asm/setup.h> | ||
19 | #include <asm/sections.h> | ||
20 | #include <asm/io.h> | ||
21 | #include <setup_arch.h> | ||
22 | |||
23 | static struct resource system_rom_resource = { | ||
24 | .name = "System ROM", | ||
25 | .start = 0xf0000, | ||
26 | .end = 0xfffff, | ||
27 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
28 | }; | ||
29 | |||
30 | static struct resource extension_rom_resource = { | ||
31 | .name = "Extension ROM", | ||
32 | .start = 0xe0000, | ||
33 | .end = 0xeffff, | ||
34 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
35 | }; | ||
36 | |||
37 | static struct resource adapter_rom_resources[] = { { | ||
38 | .name = "Adapter ROM", | ||
39 | .start = 0xc8000, | ||
40 | .end = 0, | ||
41 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
42 | }, { | ||
43 | .name = "Adapter ROM", | ||
44 | .start = 0, | ||
45 | .end = 0, | ||
46 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
47 | }, { | ||
48 | .name = "Adapter ROM", | ||
49 | .start = 0, | ||
50 | .end = 0, | ||
51 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
52 | }, { | ||
53 | .name = "Adapter ROM", | ||
54 | .start = 0, | ||
55 | .end = 0, | ||
56 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
57 | }, { | ||
58 | .name = "Adapter ROM", | ||
59 | .start = 0, | ||
60 | .end = 0, | ||
61 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
62 | }, { | ||
63 | .name = "Adapter ROM", | ||
64 | .start = 0, | ||
65 | .end = 0, | ||
66 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
67 | } }; | ||
68 | |||
69 | static struct resource video_rom_resource = { | ||
70 | .name = "Video ROM", | ||
71 | .start = 0xc0000, | ||
72 | .end = 0xc7fff, | ||
73 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
74 | }; | ||
75 | |||
76 | #define ROMSIGNATURE 0xaa55 | ||
77 | |||
78 | static int __init romsignature(const unsigned char *rom) | ||
79 | { | ||
80 | const unsigned short * const ptr = (const unsigned short *)rom; | ||
81 | unsigned short sig; | ||
82 | |||
83 | return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; | ||
84 | } | ||
85 | |||
86 | static int __init romchecksum(const unsigned char *rom, unsigned long length) | ||
87 | { | ||
88 | unsigned char sum, c; | ||
89 | |||
90 | for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) | ||
91 | sum += c; | ||
92 | return !length && !sum; | ||
93 | } | ||
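romsignature() and romchecksum() together encode the option-ROM convention: a little-endian 0xAA55 signature word, a length byte at offset 2 (in 512-byte units), and an image whose bytes sum to zero modulo 256. A self-contained illustration of the checksum rule (plain C, without probe_kernel_address()):

    #include <stdio.h>

    /* A ROM image is valid when all 'length' bytes sum to 0 mod 256. */
    static int rom_sum_ok(const unsigned char *rom, unsigned long length)
    {
            unsigned char sum = 0;
            while (length--)
                    sum += *rom++;
            return sum == 0;
    }

    int main(void)
    {
            /* 512-byte image: signature 0x55 0xAA, length byte 1 (1 * 512),
             * zero padding, final byte chosen so the total sums to zero. */
            unsigned char rom[512] = { 0x55, 0xaa, 0x01 };
            unsigned char sum = 0;
            for (int i = 0; i < 511; i++)
                    sum += rom[i];
            rom[511] = (unsigned char)(0x100 - sum);
            printf("checksum ok: %d\n", rom_sum_ok(rom, sizeof(rom)));
            return 0;
    }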
94 | |||
95 | void __init probe_roms(void) | ||
96 | { | ||
97 | const unsigned char *rom; | ||
98 | unsigned long start, length, upper; | ||
99 | unsigned char c; | ||
100 | int i; | ||
101 | |||
102 | /* video rom */ | ||
103 | upper = adapter_rom_resources[0].start; | ||
104 | for (start = video_rom_resource.start; start < upper; start += 2048) { | ||
105 | rom = isa_bus_to_virt(start); | ||
106 | if (!romsignature(rom)) | ||
107 | continue; | ||
108 | |||
109 | video_rom_resource.start = start; | ||
110 | |||
111 | if (probe_kernel_address(rom + 2, c) != 0) | ||
112 | continue; | ||
113 | |||
114 | /* 0 < length <= 0x7f * 512, historically */ | ||
115 | length = c * 512; | ||
116 | |||
117 | /* if checksum okay, trust length byte */ | ||
118 | if (length && romchecksum(rom, length)) | ||
119 | video_rom_resource.end = start + length - 1; | ||
120 | |||
121 | request_resource(&iomem_resource, &video_rom_resource); | ||
122 | break; | ||
123 | } | ||
124 | |||
125 | start = (video_rom_resource.end + 1 + 2047) & ~2047UL; | ||
126 | if (start < upper) | ||
127 | start = upper; | ||
128 | |||
129 | /* system rom */ | ||
130 | request_resource(&iomem_resource, &system_rom_resource); | ||
131 | upper = system_rom_resource.start; | ||
132 | |||
133 | /* check for extension rom (ignore length byte!) */ | ||
134 | rom = isa_bus_to_virt(extension_rom_resource.start); | ||
135 | if (romsignature(rom)) { | ||
136 | length = extension_rom_resource.end - extension_rom_resource.start + 1; | ||
137 | if (romchecksum(rom, length)) { | ||
138 | request_resource(&iomem_resource, &extension_rom_resource); | ||
139 | upper = extension_rom_resource.start; | ||
140 | } | ||
141 | } | ||
142 | |||
143 | /* check for adapter roms on 2k boundaries */ | ||
144 | for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { | ||
145 | rom = isa_bus_to_virt(start); | ||
146 | if (!romsignature(rom)) | ||
147 | continue; | ||
148 | |||
149 | if (probe_kernel_address(rom + 2, c) != 0) | ||
150 | continue; | ||
151 | |||
152 | /* 0 < length <= 0x7f * 512, historically */ | ||
153 | length = c * 512; | ||
154 | |||
155 | /* but accept any length that fits if checksum okay */ | ||
156 | if (!length || start + length > upper || !romchecksum(rom, length)) | ||
157 | continue; | ||
158 | |||
159 | adapter_rom_resources[i].start = start; | ||
160 | adapter_rom_resources[i].end = start + length - 1; | ||
161 | request_resource(&iomem_resource, &adapter_rom_resources[i]); | ||
162 | |||
163 | start = adapter_rom_resources[i++].end & ~2047UL; | ||
164 | } | ||
165 | } | ||
166 | |||
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba370dc8685b..7fc4d5b0a6a0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -6,8 +6,16 @@ | |||
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
8 | #include <linux/pm.h> | 8 | #include <linux/pm.h> |
9 | #include <linux/clockchips.h> | ||
10 | #include <asm/system.h> | ||
11 | |||
12 | unsigned long idle_halt; | ||
13 | EXPORT_SYMBOL(idle_halt); | ||
14 | unsigned long idle_nomwait; | ||
15 | EXPORT_SYMBOL(idle_nomwait); | ||
9 | 16 | ||
10 | struct kmem_cache *task_xstate_cachep; | 17 | struct kmem_cache *task_xstate_cachep; |
18 | static int force_mwait __cpuinitdata; | ||
11 | 19 | ||
12 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 20 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
13 | { | 21 | { |
@@ -45,6 +53,76 @@ void arch_task_cache_init(void) | |||
45 | SLAB_PANIC, NULL); | 53 | SLAB_PANIC, NULL); |
46 | } | 54 | } |
47 | 55 | ||
56 | /* | ||
57 | * Idle related variables and functions | ||
58 | */ | ||
59 | unsigned long boot_option_idle_override = 0; | ||
60 | EXPORT_SYMBOL(boot_option_idle_override); | ||
61 | |||
62 | /* | ||
63 | * Power management idle function, if any.. | ||
64 | */ | ||
65 | void (*pm_idle)(void); | ||
66 | EXPORT_SYMBOL(pm_idle); | ||
67 | |||
68 | #ifdef CONFIG_X86_32 | ||
69 | /* | ||
70 | * This halt magic was a workaround for ancient floppy DMA | ||
71 | * wreckage. It should be safe to remove. | ||
72 | */ | ||
73 | static int hlt_counter; | ||
74 | void disable_hlt(void) | ||
75 | { | ||
76 | hlt_counter++; | ||
77 | } | ||
78 | EXPORT_SYMBOL(disable_hlt); | ||
79 | |||
80 | void enable_hlt(void) | ||
81 | { | ||
82 | hlt_counter--; | ||
83 | } | ||
84 | EXPORT_SYMBOL(enable_hlt); | ||
85 | |||
86 | static inline int hlt_use_halt(void) | ||
87 | { | ||
88 | return (!hlt_counter && boot_cpu_data.hlt_works_ok); | ||
89 | } | ||
90 | #else | ||
91 | static inline int hlt_use_halt(void) | ||
92 | { | ||
93 | return 1; | ||
94 | } | ||
95 | #endif | ||
96 | |||
97 | /* | ||
98 | * We use this if we don't have any better | ||
99 | * idle routine.. | ||
100 | */ | ||
101 | void default_idle(void) | ||
102 | { | ||
103 | if (hlt_use_halt()) { | ||
104 | current_thread_info()->status &= ~TS_POLLING; | ||
105 | /* | ||
106 | * TS_POLLING-cleared state must be visible before we | ||
107 | * test NEED_RESCHED: | ||
108 | */ | ||
109 | smp_mb(); | ||
110 | |||
111 | if (!need_resched()) | ||
112 | safe_halt(); /* enables interrupts racelessly */ | ||
113 | else | ||
114 | local_irq_enable(); | ||
115 | current_thread_info()->status |= TS_POLLING; | ||
116 | } else { | ||
117 | local_irq_enable(); | ||
118 | /* loop is done by the caller */ | ||
119 | cpu_relax(); | ||
120 | } | ||
121 | } | ||
122 | #ifdef CONFIG_APM_MODULE | ||
123 | EXPORT_SYMBOL(default_idle); | ||
124 | #endif | ||
125 | |||
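The smp_mb() in default_idle() pairs with the scheduler's wakeup path: the waker inspects TS_POLLING to decide whether an IPI is needed, so the idle CPU must publish the cleared flag before it reads need_resched(). The same store/fence/load shape, modeled with C11 atomics (a sketch of the ordering protocol, not kernel code):

    #include <stdatomic.h>
    #include <stdbool.h>

    atomic_bool polling;     /* stands in for TS_POLLING */
    atomic_bool need_sched;  /* stands in for TIF_NEED_RESCHED */

    static void idle_side(void)
    {
            atomic_store(&polling, false);
            atomic_thread_fence(memory_order_seq_cst);   /* the smp_mb() */
            if (!atomic_load(&need_sched))
                    ;  /* safe to halt: any later waker sees polling == false
                        * and sends the wakeup interrupt */
    }

    static void waker_side(void)
    {
            atomic_store(&need_sched, true);
            atomic_thread_fence(memory_order_seq_cst);
            if (!atomic_load(&polling))
                    ;  /* target may already be halted: send an IPI */
    }

Without the fences, each side could read the other's stale flag and the wakeup would be lost.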
48 | static void do_nothing(void *unused) | 126 | static void do_nothing(void *unused) |
49 | { | 127 | { |
50 | } | 128 | } |
@@ -61,7 +139,7 @@ void cpu_idle_wait(void) | |||
61 | { | 139 | { |
62 | smp_mb(); | 140 | smp_mb(); |
63 | /* kick all the CPUs so that they exit out of pm_idle */ | 141 | /* kick all the CPUs so that they exit out of pm_idle */ |
64 | smp_call_function(do_nothing, NULL, 0, 1); | 142 | smp_call_function(do_nothing, NULL, 1); |
65 | } | 143 | } |
66 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | 144 | EXPORT_SYMBOL_GPL(cpu_idle_wait); |
67 | 145 | ||
@@ -122,54 +200,163 @@ static void poll_idle(void) | |||
122 | * | 200 | * |
123 | * idle=mwait overrides this decision and forces the usage of mwait. | 201 | * idle=mwait overrides this decision and forces the usage of mwait. |
124 | */ | 202 | */ |
203 | static int __cpuinitdata force_mwait; | ||
204 | |||
205 | #define MWAIT_INFO 0x05 | ||
206 | #define MWAIT_ECX_EXTENDED_INFO 0x01 | ||
207 | #define MWAIT_EDX_C1 0xf0 | ||
208 | |||
125 | static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) | 209 | static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) |
126 | { | 210 | { |
211 | u32 eax, ebx, ecx, edx; | ||
212 | |||
127 | if (force_mwait) | 213 | if (force_mwait) |
128 | return 1; | 214 | return 1; |
129 | 215 | ||
130 | if (c->x86_vendor == X86_VENDOR_AMD) { | 216 | if (c->cpuid_level < MWAIT_INFO) |
131 | switch(c->x86) { | 217 | return 0; |
132 | case 0x10: | 218 | |
133 | case 0x11: | 219 | cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx); |
134 | return 0; | 220 | /* Check whether EDX has extended info about MWAIT */ |
135 | } | 221 | if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) |
136 | } | 222 | return 1; |
223 | |||
224 | /* | ||
225 | * edx enumerates MONITOR/MWAIT extensions. Check whether | ||
226 | * C1 supports MWAIT | ||
227 | */ | ||
228 | return (edx & MWAIT_EDX_C1); | ||
229 | } | ||
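The rewritten mwait_usable() swaps the AMD family blacklist for a CPUID feature query: leaf 5 describes MONITOR/MWAIT, ECX bit 0 indicates the EDX sub-state counts are valid, and EDX bits 7:4 give the number of MWAIT sub-states usable for C1. A hedged userspace equivalent via GCC's <cpuid.h> (the masks mirror the patch's MWAIT_* defines):

    #include <cpuid.h>
    #include <stdio.h>

    #define MWAIT_INFO              0x05
    #define MWAIT_ECX_EXTENDED_INFO 0x01
    #define MWAIT_EDX_C1            0xf0

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx))
                    return 1;   /* CPUID leaf 5 not available */
            if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) {
                    puts("no extended info; MWAIT assumed usable");
                    return 0;
            }
            printf("C1 MWAIT sub-states: %u\n", (edx & MWAIT_EDX_C1) >> 4);
            return 0;
    }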
230 | |||
231 | /* | ||
232 | * Check for AMD CPUs, which potentially have C1E support | ||
233 | */ | ||
234 | static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) | ||
235 | { | ||
236 | if (c->x86_vendor != X86_VENDOR_AMD) | ||
237 | return 0; | ||
238 | |||
239 | if (c->x86 < 0x0F) | ||
240 | return 0; | ||
241 | |||
242 | /* Family 0x0f models < rev F do not have C1E */ | ||
243 | if (c->x86 == 0x0f && c->x86_model < 0x40) | ||
244 | return 0; | ||
245 | |||
137 | return 1; | 246 | return 1; |
138 | } | 247 | } |
139 | 248 | ||
140 | void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | 249 | /* |
250 | * C1E aware idle routine. We check for C1E active in the interrupt | ||
251 | * pending message MSR. If we detect C1E, then we handle it the same | ||
252 | * way as C3 power states (local apic timer and TSC stop) | ||
253 | */ | ||
254 | static void c1e_idle(void) | ||
141 | { | 255 | { |
142 | static int selected; | 256 | static cpumask_t c1e_mask = CPU_MASK_NONE; |
257 | static int c1e_detected; | ||
143 | 258 | ||
144 | if (selected) | 259 | if (need_resched()) |
145 | return; | 260 | return; |
261 | |||
262 | if (!c1e_detected) { | ||
263 | u32 lo, hi; | ||
264 | |||
265 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); | ||
266 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { | ||
267 | c1e_detected = 1; | ||
268 | mark_tsc_unstable("TSC halt in C1E"); | ||
269 | printk(KERN_INFO "System has C1E enabled\n"); | ||
270 | } | ||
271 | } | ||
272 | |||
273 | if (c1e_detected) { | ||
274 | int cpu = smp_processor_id(); | ||
275 | |||
276 | if (!cpu_isset(cpu, c1e_mask)) { | ||
277 | cpu_set(cpu, c1e_mask); | ||
278 | /* | ||
279 | * Force broadcast so ACPI cannot interfere. Needs | ||
280 | * to run with interrupts enabled as it uses | ||
281 | * smp_call_function(). | ||
282 | */ | ||
283 | local_irq_enable(); | ||
284 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, | ||
285 | &cpu); | ||
286 | printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", | ||
287 | cpu); | ||
288 | local_irq_disable(); | ||
289 | } | ||
290 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | ||
291 | |||
292 | default_idle(); | ||
293 | |||
294 | /* | ||
295 | * The switch back from broadcast mode needs to be | ||
296 | * called with interrupts disabled. | ||
297 | */ | ||
298 | local_irq_disable(); | ||
299 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); | ||
300 | local_irq_enable(); | ||
301 | } else | ||
302 | default_idle(); | ||
303 | } | ||
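For experimentation, the C1E-active test in c1e_idle() can be reproduced from userspace through the msr driver (root, with the msr module loaded). The register number and mask mirror MSR_K8_INT_PENDING_MSG and K8_INTP_C1E_ACTIVE_MASK and are stated here as assumptions; verify against msr-index.h:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define MSR_K8_INT_PENDING_MSG  0xc0010055
    #define K8_INTP_C1E_ACTIVE_MASK ((1u << 27) | (1u << 28))  /* assumed */

    int main(void)
    {
            uint64_t val;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0)
                    return 1;   /* not root, or msr module missing */
            if (pread(fd, &val, sizeof(val), MSR_K8_INT_PENDING_MSG)
                != sizeof(val)) {
                    close(fd);
                    return 1;   /* MSR not implemented on this CPU */
            }
            printf("C1E active: %s\n",
                   (val & K8_INTP_C1E_ACTIVE_MASK) ? "yes" : "no");
            close(fd);
            return 0;
    }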
304 | |||
305 | void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | ||
306 | { | ||
146 | #ifdef CONFIG_X86_SMP | 307 | #ifdef CONFIG_X86_SMP |
147 | if (pm_idle == poll_idle && smp_num_siblings > 1) { | 308 | if (pm_idle == poll_idle && smp_num_siblings > 1) { |
148 | printk(KERN_WARNING "WARNING: polling idle and HT enabled," | 309 | printk(KERN_WARNING "WARNING: polling idle and HT enabled," |
149 | " performance may degrade.\n"); | 310 | " performance may degrade.\n"); |
150 | } | 311 | } |
151 | #endif | 312 | #endif |
313 | if (pm_idle) | ||
314 | return; | ||
315 | |||
152 | if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { | 316 | if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { |
153 | /* | 317 | /* |
154 | * Skip, if setup has overridden idle. | ||
155 | * One CPU supports mwait => All CPUs supports mwait | 318 | * One CPU supports mwait => All CPUs supports mwait |
156 | */ | 319 | */ |
157 | if (!pm_idle) { | 320 | printk(KERN_INFO "using mwait in idle threads.\n"); |
158 | printk(KERN_INFO "using mwait in idle threads.\n"); | 321 | pm_idle = mwait_idle; |
159 | pm_idle = mwait_idle; | 322 | } else if (check_c1e_idle(c)) { |
160 | } | 323 | printk(KERN_INFO "using C1E aware idle routine\n"); |
161 | } | 324 | pm_idle = c1e_idle; |
162 | selected = 1; | 325 | } else |
326 | pm_idle = default_idle; | ||
163 | } | 327 | } |
164 | 328 | ||
165 | static int __init idle_setup(char *str) | 329 | static int __init idle_setup(char *str) |
166 | { | 330 | { |
331 | if (!str) | ||
332 | return -EINVAL; | ||
333 | |||
167 | if (!strcmp(str, "poll")) { | 334 | if (!strcmp(str, "poll")) { |
168 | printk("using polling idle threads.\n"); | 335 | printk("using polling idle threads.\n"); |
169 | pm_idle = poll_idle; | 336 | pm_idle = poll_idle; |
170 | } else if (!strcmp(str, "mwait")) | 337 | } else if (!strcmp(str, "mwait")) |
171 | force_mwait = 1; | 338 | force_mwait = 1; |
172 | else | 339 | else if (!strcmp(str, "halt")) { |
340 | /* | ||
341 | * When the boot option idle=halt is passed, halt is | ||
342 | * forced to be used for CPU idle. In that case CPU C2/C3 | ||
343 | * states won't be entered. | ||
344 | * To keep the CPU idle driver loadable, don't touch | ||
345 | * boot_option_idle_override. | ||
346 | */ | ||
347 | pm_idle = default_idle; | ||
348 | idle_halt = 1; | ||
349 | return 0; | ||
350 | } else if (!strcmp(str, "nomwait")) { | ||
351 | /* | ||
352 | * If the boot option "idle=nomwait" is passed, mwait | ||
353 | * is disabled for the CPU C2/C3 states. In that case | ||
354 | * the variable boot_option_idle_override is left | ||
355 | * untouched. | ||
356 | */ | ||
357 | idle_nomwait = 1; | ||
358 | return 0; | ||
359 | } else | ||
173 | return -1; | 360 | return -1; |
174 | 361 | ||
175 | boot_option_idle_override = 1; | 362 | boot_option_idle_override = 1; |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6d5483356e74..53bc653ed5ca 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -58,11 +58,6 @@ | |||
58 | 58 | ||
59 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 59 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
60 | 60 | ||
61 | static int hlt_counter; | ||
62 | |||
63 | unsigned long boot_option_idle_override = 0; | ||
64 | EXPORT_SYMBOL(boot_option_idle_override); | ||
65 | |||
66 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | 61 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; |
67 | EXPORT_PER_CPU_SYMBOL(current_task); | 62 | EXPORT_PER_CPU_SYMBOL(current_task); |
68 | 63 | ||
@@ -77,57 +72,24 @@ unsigned long thread_saved_pc(struct task_struct *tsk) | |||
77 | return ((unsigned long *)tsk->thread.sp)[3]; | 72 | return ((unsigned long *)tsk->thread.sp)[3]; |
78 | } | 73 | } |
79 | 74 | ||
80 | /* | 75 | #ifdef CONFIG_HOTPLUG_CPU |
81 | * Power management idle function, if any.. | 76 | #include <asm/nmi.h> |
82 | */ | ||
83 | void (*pm_idle)(void); | ||
84 | EXPORT_SYMBOL(pm_idle); | ||
85 | 77 | ||
86 | void disable_hlt(void) | 78 | static void cpu_exit_clear(void) |
87 | { | 79 | { |
88 | hlt_counter++; | 80 | int cpu = raw_smp_processor_id(); |
89 | } | ||
90 | 81 | ||
91 | EXPORT_SYMBOL(disable_hlt); | 82 | idle_task_exit(); |
92 | 83 | ||
93 | void enable_hlt(void) | 84 | cpu_uninit(); |
94 | { | 85 | irq_ctx_exit(cpu); |
95 | hlt_counter--; | ||
96 | } | ||
97 | 86 | ||
98 | EXPORT_SYMBOL(enable_hlt); | 87 | cpu_clear(cpu, cpu_callout_map); |
99 | 88 | cpu_clear(cpu, cpu_callin_map); | |
100 | /* | ||
101 | * We use this if we don't have any better | ||
102 | * idle routine.. | ||
103 | */ | ||
104 | void default_idle(void) | ||
105 | { | ||
106 | if (!hlt_counter && boot_cpu_data.hlt_works_ok) { | ||
107 | current_thread_info()->status &= ~TS_POLLING; | ||
108 | /* | ||
109 | * TS_POLLING-cleared state must be visible before we | ||
110 | * test NEED_RESCHED: | ||
111 | */ | ||
112 | smp_mb(); | ||
113 | 89 | ||
114 | if (!need_resched()) | 90 | numa_remove_cpu(cpu); |
115 | safe_halt(); /* enables interrupts racelessly */ | ||
116 | else | ||
117 | local_irq_enable(); | ||
118 | current_thread_info()->status |= TS_POLLING; | ||
119 | } else { | ||
120 | local_irq_enable(); | ||
121 | /* loop is done by the caller */ | ||
122 | cpu_relax(); | ||
123 | } | ||
124 | } | 91 | } |
125 | #ifdef CONFIG_APM_MODULE | ||
126 | EXPORT_SYMBOL(default_idle); | ||
127 | #endif | ||
128 | 92 | ||
129 | #ifdef CONFIG_HOTPLUG_CPU | ||
130 | #include <asm/nmi.h> | ||
131 | /* We don't actually take CPU down, just spin without interrupts. */ | 93 | /* We don't actually take CPU down, just spin without interrupts. */ |
132 | static inline void play_dead(void) | 94 | static inline void play_dead(void) |
133 | { | 95 | { |
@@ -166,26 +128,24 @@ void cpu_idle(void) | |||
166 | 128 | ||
167 | /* endless idle loop with no priority at all */ | 129 | /* endless idle loop with no priority at all */ |
168 | while (1) { | 130 | while (1) { |
169 | tick_nohz_stop_sched_tick(); | 131 | tick_nohz_stop_sched_tick(1); |
170 | while (!need_resched()) { | 132 | while (!need_resched()) { |
171 | void (*idle)(void); | ||
172 | 133 | ||
173 | check_pgt_cache(); | 134 | check_pgt_cache(); |
174 | rmb(); | 135 | rmb(); |
175 | idle = pm_idle; | ||
176 | 136 | ||
177 | if (rcu_pending(cpu)) | 137 | if (rcu_pending(cpu)) |
178 | rcu_check_callbacks(cpu, 0); | 138 | rcu_check_callbacks(cpu, 0); |
179 | 139 | ||
180 | if (!idle) | ||
181 | idle = default_idle; | ||
182 | |||
183 | if (cpu_is_offline(cpu)) | 140 | if (cpu_is_offline(cpu)) |
184 | play_dead(); | 141 | play_dead(); |
185 | 142 | ||
186 | local_irq_disable(); | 143 | local_irq_disable(); |
187 | __get_cpu_var(irq_stat).idle_timestamp = jiffies; | 144 | __get_cpu_var(irq_stat).idle_timestamp = jiffies; |
188 | idle(); | 145 | /* Don't trace irqs off for idle */ |
146 | stop_critical_timings(); | ||
147 | pm_idle(); | ||
148 | start_critical_timings(); | ||
189 | } | 149 | } |
190 | tick_nohz_restart_sched_tick(); | 150 | tick_nohz_restart_sched_tick(); |
191 | preempt_enable_no_resched(); | 151 | preempt_enable_no_resched(); |
@@ -333,6 +293,7 @@ void flush_thread(void) | |||
333 | /* | 293 | /* |
334 | * Forget coprocessor state.. | 294 | * Forget coprocessor state.. |
335 | */ | 295 | */ |
296 | tsk->fpu_counter = 0; | ||
336 | clear_fpu(tsk); | 297 | clear_fpu(tsk); |
337 | clear_used_math(); | 298 | clear_used_math(); |
338 | } | 299 | } |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ac54ff56df80..3fb62a7d9a16 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -56,15 +56,6 @@ asmlinkage extern void ret_from_fork(void); | |||
56 | 56 | ||
57 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; | 57 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; |
58 | 58 | ||
59 | unsigned long boot_option_idle_override = 0; | ||
60 | EXPORT_SYMBOL(boot_option_idle_override); | ||
61 | |||
62 | /* | ||
63 | * Power management idle function, if any.. | ||
64 | */ | ||
65 | void (*pm_idle)(void); | ||
66 | EXPORT_SYMBOL(pm_idle); | ||
67 | |||
68 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); | 59 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); |
69 | 60 | ||
70 | void idle_notifier_register(struct notifier_block *n) | 61 | void idle_notifier_register(struct notifier_block *n) |
@@ -94,25 +85,6 @@ void exit_idle(void) | |||
94 | __exit_idle(); | 85 | __exit_idle(); |
95 | } | 86 | } |
96 | 87 | ||
97 | /* | ||
98 | * We use this if we don't have any better | ||
99 | * idle routine.. | ||
100 | */ | ||
101 | void default_idle(void) | ||
102 | { | ||
103 | current_thread_info()->status &= ~TS_POLLING; | ||
104 | /* | ||
105 | * TS_POLLING-cleared state must be visible before we | ||
106 | * test NEED_RESCHED: | ||
107 | */ | ||
108 | smp_mb(); | ||
109 | if (!need_resched()) | ||
110 | safe_halt(); /* enables interrupts racelessly */ | ||
111 | else | ||
112 | local_irq_enable(); | ||
113 | current_thread_info()->status |= TS_POLLING; | ||
114 | } | ||
115 | |||
116 | #ifdef CONFIG_HOTPLUG_CPU | 88 | #ifdef CONFIG_HOTPLUG_CPU |
117 | DECLARE_PER_CPU(int, cpu_state); | 89 | DECLARE_PER_CPU(int, cpu_state); |
118 | 90 | ||
@@ -148,14 +120,11 @@ void cpu_idle(void) | |||
148 | current_thread_info()->status |= TS_POLLING; | 120 | current_thread_info()->status |= TS_POLLING; |
149 | /* endless idle loop with no priority at all */ | 121 | /* endless idle loop with no priority at all */ |
150 | while (1) { | 122 | while (1) { |
151 | tick_nohz_stop_sched_tick(); | 123 | tick_nohz_stop_sched_tick(1); |
152 | while (!need_resched()) { | 124 | while (!need_resched()) { |
153 | void (*idle)(void); | ||
154 | 125 | ||
155 | rmb(); | 126 | rmb(); |
156 | idle = pm_idle; | 127 | |
157 | if (!idle) | ||
158 | idle = default_idle; | ||
159 | if (cpu_is_offline(smp_processor_id())) | 128 | if (cpu_is_offline(smp_processor_id())) |
160 | play_dead(); | 129 | play_dead(); |
161 | /* | 130 | /* |
@@ -165,7 +134,10 @@ void cpu_idle(void) | |||
165 | */ | 134 | */ |
166 | local_irq_disable(); | 135 | local_irq_disable(); |
167 | enter_idle(); | 136 | enter_idle(); |
168 | idle(); | 137 | /* Don't trace irqs off for idle */ |
138 | stop_critical_timings(); | ||
139 | pm_idle(); | ||
140 | start_critical_timings(); | ||
169 | /* In many cases the interrupt that ended idle | 141 | /* In many cases the interrupt that ended idle |
170 | has already called exit_idle. But some idle | 142 | has already called exit_idle. But some idle |
171 | loops can be woken up without interrupt. */ | 143 | loops can be woken up without interrupt. */ |
@@ -294,6 +266,7 @@ void flush_thread(void) | |||
294 | /* | 266 | /* |
295 | * Forget coprocessor state.. | 267 | * Forget coprocessor state.. |
296 | */ | 268 | */ |
269 | tsk->fpu_counter = 0; | ||
297 | clear_fpu(tsk); | 270 | clear_fpu(tsk); |
298 | clear_used_math(); | 271 | clear_used_math(); |
299 | } | 272 | } |
@@ -365,10 +338,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, | |||
365 | p->thread.fs = me->thread.fs; | 338 | p->thread.fs = me->thread.fs; |
366 | p->thread.gs = me->thread.gs; | 339 | p->thread.gs = me->thread.gs; |
367 | 340 | ||
368 | asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); | 341 | savesegment(gs, p->thread.gsindex); |
369 | asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); | 342 | savesegment(fs, p->thread.fsindex); |
370 | asm("mov %%es,%0" : "=m" (p->thread.es)); | 343 | savesegment(es, p->thread.es); |
371 | asm("mov %%ds,%0" : "=m" (p->thread.ds)); | 344 | savesegment(ds, p->thread.ds); |
372 | 345 | ||
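savesegment()/loadsegment() replace the open-coded asm here and keep the memory clobber in one audited place. Roughly what they expand to (a simplified sketch; the real loadsegment() additionally carries an exception-table fixup so a bad selector loads 0 instead of faulting):

    /* Simplified sketch of the macros used above: */
    #define savesegment(seg, value) \
            asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")

    #define loadsegment(seg, value) \
            asm volatile("mov %0,%%" #seg : : "r" (value) : "memory")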
373 | if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { | 346 | if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { |
374 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | 347 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); |
@@ -407,7 +380,9 @@ out: | |||
407 | void | 380 | void |
408 | start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) | 381 | start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) |
409 | { | 382 | { |
410 | asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0)); | 383 | loadsegment(fs, 0); |
384 | loadsegment(es, 0); | ||
385 | loadsegment(ds, 0); | ||
411 | load_gs_index(0); | 386 | load_gs_index(0); |
412 | regs->ip = new_ip; | 387 | regs->ip = new_ip; |
413 | regs->sp = new_sp; | 388 | regs->sp = new_sp; |
@@ -562,10 +537,11 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, | |||
562 | struct task_struct * | 537 | struct task_struct * |
563 | __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | 538 | __switch_to(struct task_struct *prev_p, struct task_struct *next_p) |
564 | { | 539 | { |
565 | struct thread_struct *prev = &prev_p->thread, | 540 | struct thread_struct *prev = &prev_p->thread; |
566 | *next = &next_p->thread; | 541 | struct thread_struct *next = &next_p->thread; |
567 | int cpu = smp_processor_id(); | 542 | int cpu = smp_processor_id(); |
568 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 543 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
544 | unsigned fsindex, gsindex; | ||
569 | 545 | ||
570 | /* we're going to use this soon, after a few expensive things */ | 546 | /* we're going to use this soon, after a few expensive things */ |
571 | if (next_p->fpu_counter>5) | 547 | if (next_p->fpu_counter>5) |
@@ -580,52 +556,64 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
580 | * Switch DS and ES. | 556 | * Switch DS and ES. |
581 | * This won't pick up thread selector changes, but I guess that is ok. | 557 | * This won't pick up thread selector changes, but I guess that is ok. |
582 | */ | 558 | */ |
583 | asm volatile("mov %%es,%0" : "=m" (prev->es)); | 559 | savesegment(es, prev->es); |
584 | if (unlikely(next->es | prev->es)) | 560 | if (unlikely(next->es | prev->es)) |
585 | loadsegment(es, next->es); | 561 | loadsegment(es, next->es); |
586 | 562 | ||
587 | asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); | 563 | savesegment(ds, prev->ds); |
588 | if (unlikely(next->ds | prev->ds)) | 564 | if (unlikely(next->ds | prev->ds)) |
589 | loadsegment(ds, next->ds); | 565 | loadsegment(ds, next->ds); |
590 | 566 | ||
567 | |||
568 | /* We must save %fs and %gs before load_TLS() because | ||
569 | * %fs and %gs may be cleared by load_TLS(). | ||
570 | * | ||
571 | * (e.g. xen_load_tls()) | ||
572 | */ | ||
573 | savesegment(fs, fsindex); | ||
574 | savesegment(gs, gsindex); | ||
575 | |||
591 | load_TLS(next, cpu); | 576 | load_TLS(next, cpu); |
592 | 577 | ||
578 | /* | ||
579 | * Leave lazy mode, flushing any hypercalls made here. | ||
580 | * This must be done before restoring TLS segments so | ||
581 | * the GDT and LDT are properly updated, and must be | ||
582 | * done before math_state_restore, so the TS bit is up | ||
583 | * to date. | ||
584 | */ | ||
585 | arch_leave_lazy_cpu_mode(); | ||
586 | |||
593 | /* | 587 | /* |
594 | * Switch FS and GS. | 588 | * Switch FS and GS. |
589 | * | ||
590 | * Segment register != 0 always requires a reload. Also | ||
591 | * reload when it has changed. When the prev process used a 64bit | ||
592 | * base, always reload to avoid an information leak. | ||
595 | */ | 593 | */ |
596 | { | 594 | if (unlikely(fsindex | next->fsindex | prev->fs)) { |
597 | unsigned fsindex; | 595 | loadsegment(fs, next->fsindex); |
598 | asm volatile("movl %%fs,%0" : "=r" (fsindex)); | 596 | /* |
599 | /* segment register != 0 always requires a reload. | 597 | * Check if the user used a selector != 0; if yes |
600 | also reload when it has changed. | 598 | * clear 64bit base, since overloaded base is always |
601 | when prev process used 64bit base always reload | 599 | * mapped to the Null selector |
602 | to avoid an information leak. */ | 600 | */ |
603 | if (unlikely(fsindex | next->fsindex | prev->fs)) { | 601 | if (fsindex) |
604 | loadsegment(fs, next->fsindex); | ||
605 | /* check if the user used a selector != 0 | ||
606 | * if yes clear 64bit base, since overloaded base | ||
607 | * is always mapped to the Null selector | ||
608 | */ | ||
609 | if (fsindex) | ||
610 | prev->fs = 0; | 602 | prev->fs = 0; |
611 | } | ||
612 | /* when next process has a 64bit base use it */ | ||
613 | if (next->fs) | ||
614 | wrmsrl(MSR_FS_BASE, next->fs); | ||
615 | prev->fsindex = fsindex; | ||
616 | } | 603 | } |
617 | { | 604 | /* when next process has a 64bit base use it */ |
618 | unsigned gsindex; | 605 | if (next->fs) |
619 | asm volatile("movl %%gs,%0" : "=r" (gsindex)); | 606 | wrmsrl(MSR_FS_BASE, next->fs); |
620 | if (unlikely(gsindex | next->gsindex | prev->gs)) { | 607 | prev->fsindex = fsindex; |
621 | load_gs_index(next->gsindex); | 608 | |
622 | if (gsindex) | 609 | if (unlikely(gsindex | next->gsindex | prev->gs)) { |
610 | load_gs_index(next->gsindex); | ||
611 | if (gsindex) | ||
623 | prev->gs = 0; | 612 | prev->gs = 0; |
624 | } | ||
625 | if (next->gs) | ||
626 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | ||
627 | prev->gsindex = gsindex; | ||
628 | } | 613 | } |
614 | if (next->gs) | ||
615 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | ||
616 | prev->gsindex = gsindex; | ||
629 | 617 | ||
630 | /* Must be after DS reload */ | 618 | /* Must be after DS reload */ |
631 | unlazy_fpu(prev_p); | 619 | unlazy_fpu(prev_p); |
@@ -638,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
638 | write_pda(pcurrent, next_p); | 626 | write_pda(pcurrent, next_p); |
639 | 627 | ||
640 | write_pda(kernelstack, | 628 | write_pda(kernelstack, |
641 | (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); | 629 | (unsigned long)task_stack_page(next_p) + |
630 | THREAD_SIZE - PDA_STACKOFFSET); | ||
642 | #ifdef CONFIG_CC_STACKPROTECTOR | 631 | #ifdef CONFIG_CC_STACKPROTECTOR |
643 | write_pda(stack_canary, next_p->stack_canary); | 632 | write_pda(stack_canary, next_p->stack_canary); |
644 | /* | 633 | /* |
@@ -797,7 +786,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | |||
797 | set_32bit_tls(task, FS_TLS, addr); | 786 | set_32bit_tls(task, FS_TLS, addr); |
798 | if (doit) { | 787 | if (doit) { |
799 | load_TLS(&task->thread, cpu); | 788 | load_TLS(&task->thread, cpu); |
800 | asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); | 789 | loadsegment(fs, FS_TLS_SEL); |
801 | } | 790 | } |
802 | task->thread.fsindex = FS_TLS_SEL; | 791 | task->thread.fsindex = FS_TLS_SEL; |
803 | task->thread.fs = 0; | 792 | task->thread.fs = 0; |
@@ -807,7 +796,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | |||
807 | if (doit) { | 796 | if (doit) { |
808 | /* set the selector to 0 to not confuse | 797 | /* set the selector to 0 to not confuse |
809 | __switch_to */ | 798 | __switch_to */ |
810 | asm volatile("movl %0,%%fs" :: "r" (0)); | 799 | loadsegment(fs, 0); |
811 | ret = checking_wrmsrl(MSR_FS_BASE, addr); | 800 | ret = checking_wrmsrl(MSR_FS_BASE, addr); |
812 | } | 801 | } |
813 | } | 802 | } |
@@ -830,7 +819,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | |||
830 | if (task->thread.gsindex == GS_TLS_SEL) | 819 | if (task->thread.gsindex == GS_TLS_SEL) |
831 | base = read_32bit_tls(task, GS_TLS); | 820 | base = read_32bit_tls(task, GS_TLS); |
832 | else if (doit) { | 821 | else if (doit) { |
833 | asm("movl %%gs,%0" : "=r" (gsindex)); | 822 | savesegment(gs, gsindex); |
834 | if (gsindex) | 823 | if (gsindex) |
835 | rdmsrl(MSR_KERNEL_GS_BASE, base); | 824 | rdmsrl(MSR_KERNEL_GS_BASE, base); |
836 | else | 825 | else |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a7835f282936..e37dccce85db 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -943,13 +943,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
943 | return copy_regset_to_user(child, &user_x86_32_view, | 943 | return copy_regset_to_user(child, &user_x86_32_view, |
944 | REGSET_XFP, | 944 | REGSET_XFP, |
945 | 0, sizeof(struct user_fxsr_struct), | 945 | 0, sizeof(struct user_fxsr_struct), |
946 | datap); | 946 | datap) ? -EIO : 0; |
947 | 947 | ||
948 | case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ | 948 | case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ |
949 | return copy_regset_from_user(child, &user_x86_32_view, | 949 | return copy_regset_from_user(child, &user_x86_32_view, |
950 | REGSET_XFP, | 950 | REGSET_XFP, |
951 | 0, sizeof(struct user_fxsr_struct), | 951 | 0, sizeof(struct user_fxsr_struct), |
952 | datap); | 952 | datap) ? -EIO : 0; |
953 | #endif | 953 | #endif |
954 | 954 | ||
955 | #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION | 955 | #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION |
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) | |||
1357 | #endif | 1357 | #endif |
1358 | } | 1358 | } |
1359 | 1359 | ||
1360 | #ifdef CONFIG_X86_32 | ||
1361 | |||
1362 | void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) | 1360 | void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) |
1363 | { | 1361 | { |
1364 | struct siginfo info; | 1362 | struct siginfo info; |
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) | |||
1377 | force_sig_info(SIGTRAP, &info, tsk); | 1375 | force_sig_info(SIGTRAP, &info, tsk); |
1378 | } | 1376 | } |
1379 | 1377 | ||
1380 | /* notification of system call entry/exit | ||
1381 | * - triggered by current->work.syscall_trace | ||
1382 | */ | ||
1383 | int do_syscall_trace(struct pt_regs *regs, int entryexit) | ||
1384 | { | ||
1385 | int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); | ||
1386 | /* | ||
1387 | * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall | ||
1388 | * interception | ||
1389 | */ | ||
1390 | int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); | ||
1391 | int ret = 0; | ||
1392 | |||
1393 | /* do the secure computing check first */ | ||
1394 | if (!entryexit) | ||
1395 | secure_computing(regs->orig_ax); | ||
1396 | |||
1397 | if (unlikely(current->audit_context)) { | ||
1398 | if (entryexit) | ||
1399 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), | ||
1400 | regs->ax); | ||
1401 | /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only | ||
1402 | * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is | ||
1403 | * not used, entry.S will call us only on syscall exit, not | ||
1404 | * entry; so when TIF_SYSCALL_AUDIT is used we must avoid | ||
1405 | * calling send_sigtrap() on syscall entry. | ||
1406 | * | ||
1407 | * Note that when PTRACE_SYSEMU_SINGLESTEP is used, | ||
1408 | * is_singlestep is false, despite his name, so we will still do | ||
1409 | * the correct thing. | ||
1410 | */ | ||
1411 | else if (is_singlestep) | ||
1412 | goto out; | ||
1413 | } | ||
1414 | |||
1415 | if (!(current->ptrace & PT_PTRACED)) | ||
1416 | goto out; | ||
1417 | |||
1418 | /* If a process stops on the 1st tracepoint with SYSCALL_TRACE | ||
1419 | * and then is resumed with SYSEMU_SINGLESTEP, it will come in | ||
1420 | * here. We have to check this and return */ | ||
1421 | if (is_sysemu && entryexit) | ||
1422 | return 0; | ||
1423 | |||
1424 | /* Fake a debug trap */ | ||
1425 | if (is_singlestep) | ||
1426 | send_sigtrap(current, regs, 0); | ||
1427 | |||
1428 | if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) | ||
1429 | goto out; | ||
1430 | |||
1431 | /* the 0x80 provides a way for the tracing parent to distinguish | ||
1432 | between a syscall stop and SIGTRAP delivery */ | ||
1433 | /* Note that the debugger could change the result of test_thread_flag!*/ | ||
1434 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); | ||
1435 | |||
1436 | /* | ||
1437 | * this isn't the same as continuing with a signal, but it will do | ||
1438 | * for normal use. strace only continues with a signal if the | ||
1439 | * stopping signal is not SIGTRAP. -brl | ||
1440 | */ | ||
1441 | if (current->exit_code) { | ||
1442 | send_sig(current->exit_code, current, 1); | ||
1443 | current->exit_code = 0; | ||
1444 | } | ||
1445 | ret = is_sysemu; | ||
1446 | out: | ||
1447 | if (unlikely(current->audit_context) && !entryexit) | ||
1448 | audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, | ||
1449 | regs->bx, regs->cx, regs->dx, regs->si); | ||
1450 | if (ret == 0) | ||
1451 | return 0; | ||
1452 | |||
1453 | regs->orig_ax = -1; /* force skip of syscall restarting */ | ||
1454 | if (unlikely(current->audit_context)) | ||
1455 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); | ||
1456 | return 1; | ||
1457 | } | ||
1458 | |||
1459 | #else /* CONFIG_X86_64 */ | ||
1460 | |||
1461 | static void syscall_trace(struct pt_regs *regs) | 1378 | static void syscall_trace(struct pt_regs *regs) |
1462 | { | 1379 | { |
1380 | if (!(current->ptrace & PT_PTRACED)) | ||
1381 | return; | ||
1463 | 1382 | ||
1464 | #if 0 | 1383 | #if 0 |
1465 | printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", | 1384 | printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", |
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs) | |||
1481 | } | 1400 | } |
1482 | } | 1401 | } |
1483 | 1402 | ||
1484 | asmlinkage void syscall_trace_enter(struct pt_regs *regs) | 1403 | #ifdef CONFIG_X86_32 |
1404 | # define IS_IA32 1 | ||
1405 | #elif defined CONFIG_IA32_EMULATION | ||
1406 | # define IS_IA32 test_thread_flag(TIF_IA32) | ||
1407 | #else | ||
1408 | # define IS_IA32 0 | ||
1409 | #endif | ||
1410 | |||
1411 | /* | ||
1412 | * We must return the syscall number to actually look up in the table. | ||
1413 | * This can be -1L to skip running any syscall at all. | ||
1414 | */ | ||
1415 | asmregparm long syscall_trace_enter(struct pt_regs *regs) | ||
1485 | { | 1416 | { |
1417 | long ret = 0; | ||
1418 | |||
1419 | /* | ||
1420 | * If we stepped into a sysenter/syscall insn, it trapped in | ||
1421 | * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. | ||
1422 | * If user-mode had set TF itself, then it's still clear from | ||
1423 | * do_debug() and we need to set it again to restore the user | ||
1424 | * state. If we entered on the slow path, TF was already set. | ||
1425 | */ | ||
1426 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
1427 | regs->flags |= X86_EFLAGS_TF; | ||
1428 | |||
1486 | /* do the secure computing check first */ | 1429 | /* do the secure computing check first */ |
1487 | secure_computing(regs->orig_ax); | 1430 | secure_computing(regs->orig_ax); |
1488 | 1431 | ||
1489 | if (test_thread_flag(TIF_SYSCALL_TRACE) | 1432 | if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) |
1490 | && (current->ptrace & PT_PTRACED)) | 1433 | ret = -1L; |
1434 | |||
1435 | if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) | ||
1491 | syscall_trace(regs); | 1436 | syscall_trace(regs); |
1492 | 1437 | ||
1493 | if (unlikely(current->audit_context)) { | 1438 | if (unlikely(current->audit_context)) { |
1494 | if (test_thread_flag(TIF_IA32)) { | 1439 | if (IS_IA32) |
1495 | audit_syscall_entry(AUDIT_ARCH_I386, | 1440 | audit_syscall_entry(AUDIT_ARCH_I386, |
1496 | regs->orig_ax, | 1441 | regs->orig_ax, |
1497 | regs->bx, regs->cx, | 1442 | regs->bx, regs->cx, |
1498 | regs->dx, regs->si); | 1443 | regs->dx, regs->si); |
1499 | } else { | 1444 | #ifdef CONFIG_X86_64 |
1445 | else | ||
1500 | audit_syscall_entry(AUDIT_ARCH_X86_64, | 1446 | audit_syscall_entry(AUDIT_ARCH_X86_64, |
1501 | regs->orig_ax, | 1447 | regs->orig_ax, |
1502 | regs->di, regs->si, | 1448 | regs->di, regs->si, |
1503 | regs->dx, regs->r10); | 1449 | regs->dx, regs->r10); |
1504 | } | 1450 | #endif |
1505 | } | 1451 | } |
1452 | |||
1453 | return ret ?: regs->orig_ax; | ||
1506 | } | 1454 | } |
1507 | 1455 | ||
1508 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) | 1456 | asmregparm void syscall_trace_leave(struct pt_regs *regs) |
1509 | { | 1457 | { |
1510 | if (unlikely(current->audit_context)) | 1458 | if (unlikely(current->audit_context)) |
1511 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); | 1459 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); |
1512 | 1460 | ||
1513 | if ((test_thread_flag(TIF_SYSCALL_TRACE) | 1461 | if (test_thread_flag(TIF_SYSCALL_TRACE)) |
1514 | || test_thread_flag(TIF_SINGLESTEP)) | ||
1515 | && (current->ptrace & PT_PTRACED)) | ||
1516 | syscall_trace(regs); | 1462 | syscall_trace(regs); |
1517 | } | ||
1518 | 1463 | ||
1519 | #endif /* CONFIG_X86_32 */ | 1464 | /* |
1465 | * If TIF_SYSCALL_EMU is set, we only get here because of | ||
1466 | * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). | ||
1467 | * We already reported this syscall instruction in | ||
1468 | * syscall_trace_enter(), so don't do any more now. | ||
1469 | */ | ||
1470 | if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) | ||
1471 | return; | ||
1472 | |||
1473 | /* | ||
1474 | * If we are single-stepping, synthesize a trap to follow the | ||
1475 | * system call instruction. | ||
1476 | */ | ||
1477 | if (test_thread_flag(TIF_SINGLESTEP) && | ||
1478 | (current->ptrace & PT_PTRACED)) | ||
1479 | send_sigtrap(current, regs, 0); | ||
1480 | } | ||
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000000..05fbe9a0325a --- /dev/null +++ b/arch/x86/kernel/pvclock.c | |||
@@ -0,0 +1,141 @@ | |||
1 | /* paravirtual clock -- common code used by kvm/xen | ||
2 | |||
3 | This program is free software; you can redistribute it and/or modify | ||
4 | it under the terms of the GNU General Public License as published by | ||
5 | the Free Software Foundation; either version 2 of the License, or | ||
6 | (at your option) any later version. | ||
7 | |||
8 | This program is distributed in the hope that it will be useful, | ||
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | GNU General Public License for more details. | ||
12 | |||
13 | You should have received a copy of the GNU General Public License | ||
14 | along with this program; if not, write to the Free Software | ||
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
16 | */ | ||
17 | |||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/percpu.h> | ||
20 | #include <asm/pvclock.h> | ||
21 | |||
22 | /* | ||
23 | * These are periodically updated | ||
24 | * xen: magic shared_info page | ||
25 | * kvm: gpa registered via msr | ||
26 | * and then copied here. | ||
27 | */ | ||
28 | struct pvclock_shadow_time { | ||
29 | u64 tsc_timestamp; /* TSC at last update of time vals. */ | ||
30 | u64 system_timestamp; /* Time, in nanosecs, since boot. */ | ||
31 | u32 tsc_to_nsec_mul; | ||
32 | int tsc_shift; | ||
33 | u32 version; | ||
34 | }; | ||
35 | |||
36 | /* | ||
37 | * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction, | ||
38 | * yielding a 64-bit result. | ||
39 | */ | ||
40 | static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | ||
41 | { | ||
42 | u64 product; | ||
43 | #ifdef __i386__ | ||
44 | u32 tmp1, tmp2; | ||
45 | #endif | ||
46 | |||
47 | if (shift < 0) | ||
48 | delta >>= -shift; | ||
49 | else | ||
50 | delta <<= shift; | ||
51 | |||
52 | #ifdef __i386__ | ||
53 | __asm__ ( | ||
54 | "mul %5 ; " | ||
55 | "mov %4,%%eax ; " | ||
56 | "mov %%edx,%4 ; " | ||
57 | "mul %5 ; " | ||
58 | "xor %5,%5 ; " | ||
59 | "add %4,%%eax ; " | ||
60 | "adc %5,%%edx ; " | ||
61 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | ||
62 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | ||
63 | #elif __x86_64__ | ||
64 | __asm__ ( | ||
65 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | ||
66 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | ||
67 | #else | ||
68 | #error implement me! | ||
69 | #endif | ||
70 | |||
71 | return product; | ||
72 | } | ||
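scale_delta() computes ((delta << shift) * mul_frac) >> 32 with a full 64x32 to 96-bit product, which is what each inline-assembly branch implements. With a compiler providing unsigned __int128 (GCC/Clang on x86-64), the same math can be sketched portably:

    #include <stdint.h>

    /* Portable sketch of scale_delta(): scale a TSC delta to
     * nanoseconds via a 32.32 fixed-point multiplier. */
    static uint64_t scale_delta_sketch(uint64_t delta, uint32_t mul_frac,
                                       int shift)
    {
            if (shift < 0)
                    delta >>= -shift;
            else
                    delta <<= shift;

            /* 64x32 multiply, keep bits 95:32 of the product */
            return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
    }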
73 | |||
74 | static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) | ||
75 | { | ||
76 | u64 delta = native_read_tsc() - shadow->tsc_timestamp; | ||
77 | return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Reads a consistent set of time-base values from hypervisor, | ||
82 | * into a shadow data area. | ||
83 | */ | ||
84 | static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, | ||
85 | struct pvclock_vcpu_time_info *src) | ||
86 | { | ||
87 | do { | ||
88 | dst->version = src->version; | ||
89 | rmb(); /* fetch version before data */ | ||
90 | dst->tsc_timestamp = src->tsc_timestamp; | ||
91 | dst->system_timestamp = src->system_time; | ||
92 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; | ||
93 | dst->tsc_shift = src->tsc_shift; | ||
94 | rmb(); /* test version after fetching data */ | ||
95 | } while ((src->version & 1) || (dst->version != src->version)); | ||
96 | |||
97 | return dst->version; | ||
98 | } | ||
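The version field behaves like a seqcount: the hypervisor makes it odd before updating the record and bumps it to the next even value afterwards, so a reader retries while the version is odd or changed under it. The reader side reduced to its essentials (a sketch of the protocol, not of any hypervisor ABI):

    #include <stdint.h>

    struct timerec {
            volatile uint32_t version;
            volatile uint64_t payload;
    };

    static uint64_t read_consistent(const struct timerec *src)
    {
            uint32_t v;
            uint64_t val;

            do {
                    v = src->version;
                    __atomic_thread_fence(__ATOMIC_ACQUIRE); /* rmb(): version first */
                    val = src->payload;
                    __atomic_thread_fence(__ATOMIC_ACQUIRE); /* rmb(): data before recheck */
            } while ((v & 1) || v != src->version);

            return val;
    }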
99 | |||
100 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | ||
101 | { | ||
102 | struct pvclock_shadow_time shadow; | ||
103 | unsigned version; | ||
104 | cycle_t ret, offset; | ||
105 | |||
106 | do { | ||
107 | version = pvclock_get_time_values(&shadow, src); | ||
108 | barrier(); | ||
109 | offset = pvclock_get_nsec_offset(&shadow); | ||
110 | ret = shadow.system_timestamp + offset; | ||
111 | barrier(); | ||
112 | } while (version != src->version); | ||
113 | |||
114 | return ret; | ||
115 | } | ||
116 | |||
117 | void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, | ||
118 | struct pvclock_vcpu_time_info *vcpu_time, | ||
119 | struct timespec *ts) | ||
120 | { | ||
121 | u32 version; | ||
122 | u64 delta; | ||
123 | struct timespec now; | ||
124 | |||
125 | /* get wallclock at system boot */ | ||
126 | do { | ||
127 | version = wall_clock->version; | ||
128 | rmb(); /* fetch version before time */ | ||
129 | now.tv_sec = wall_clock->sec; | ||
130 | now.tv_nsec = wall_clock->nsec; | ||
131 | rmb(); /* fetch time before checking version */ | ||
132 | } while ((wall_clock->version & 1) || (version != wall_clock->version)); | ||
133 | |||
134 | delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ | ||
135 | delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; | ||
136 | |||
137 | now.tv_nsec = do_div(delta, NSEC_PER_SEC); | ||
138 | now.tv_sec = delta; | ||
139 | |||
140 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | ||
141 | } | ||
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index f327abafe3e6..d13858818100 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -65,6 +65,7 @@ static enum { | |||
65 | ICH_FORCE_HPET_RESUME, | 65 | ICH_FORCE_HPET_RESUME, |
66 | VT8237_FORCE_HPET_RESUME, | 66 | VT8237_FORCE_HPET_RESUME, |
67 | NVIDIA_FORCE_HPET_RESUME, | 67 | NVIDIA_FORCE_HPET_RESUME, |
68 | ATI_FORCE_HPET_RESUME, | ||
68 | } force_hpet_resume_type; | 69 | } force_hpet_resume_type; |
69 | 70 | ||
70 | static void __iomem *rcba_base; | 71 | static void __iomem *rcba_base; |
@@ -176,6 +177,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, | |||
176 | 177 | ||
177 | static struct pci_dev *cached_dev; | 178 | static struct pci_dev *cached_dev; |
178 | 179 | ||
180 | static void hpet_print_force_info(void) | ||
181 | { | ||
182 | printk(KERN_INFO "HPET not enabled in BIOS. " | ||
183 | "You might try hpet=force boot option\n"); | ||
184 | } | ||
185 | |||
179 | static void old_ich_force_hpet_resume(void) | 186 | static void old_ich_force_hpet_resume(void) |
180 | { | 187 | { |
181 | u32 val; | 188 | u32 val; |
@@ -255,6 +262,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev) | |||
255 | { | 262 | { |
256 | if (hpet_force_user) | 263 | if (hpet_force_user) |
257 | old_ich_force_enable_hpet(dev); | 264 | old_ich_force_enable_hpet(dev); |
265 | else | ||
266 | hpet_print_force_info(); | ||
258 | } | 267 | } |
259 | 268 | ||
260 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, | 269 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, |
@@ -294,8 +303,13 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev) | |||
294 | { | 303 | { |
295 | u32 uninitialized_var(val); | 304 | u32 uninitialized_var(val); |
296 | 305 | ||
297 | if (!hpet_force_user || hpet_address || force_hpet_address) | 306 | if (hpet_address || force_hpet_address) |
307 | return; | ||
308 | |||
309 | if (!hpet_force_user) { | ||
310 | hpet_print_force_info(); | ||
298 | return; | 311 | return; |
312 | } | ||
299 | 313 | ||
300 | pci_read_config_dword(dev, 0x68, &val); | 314 | pci_read_config_dword(dev, 0x68, &val); |
301 | /* | 315 | /* |
@@ -334,6 +348,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, | |||
334 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, | 348 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, |
335 | vt8237_force_enable_hpet); | 349 | vt8237_force_enable_hpet); |
336 | 350 | ||
351 | static void ati_force_hpet_resume(void) | ||
352 | { | ||
353 | pci_write_config_dword(cached_dev, 0x14, 0xfed00000); | ||
354 | printk(KERN_DEBUG "Force enabled HPET at resume\n"); | ||
355 | } | ||
356 | |||
357 | static void ati_force_enable_hpet(struct pci_dev *dev) | ||
358 | { | ||
359 | u32 uninitialized_var(val); | ||
360 | |||
361 | if (hpet_address || force_hpet_address) | ||
362 | return; | ||
363 | |||
364 | if (!hpet_force_user) { | ||
365 | hpet_print_force_info(); | ||
366 | return; | ||
367 | } | ||
368 | |||
369 | pci_write_config_dword(dev, 0x14, 0xfed00000); | ||
370 | pci_read_config_dword(dev, 0x14, &val); | ||
371 | force_hpet_address = val; | ||
372 | force_hpet_resume_type = ATI_FORCE_HPET_RESUME; | ||
373 | dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", | ||
374 | force_hpet_address); | ||
375 | cached_dev = dev; | ||
376 | return; | ||
377 | } | ||
378 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, | ||
379 | ati_force_enable_hpet); | ||
380 | |||
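The ATI quirk follows the same shape as the Intel, VIA and NVIDIA ones above: write the desired HPET base into a chipset register, read it back, and stash it for the resume path. The shared pattern, sketched (0xfed00000 is the conventional x86 HPET base; the helper is illustrative, not from this patch):

    #include <linux/pci.h>

    #define HPET_FORCE_BASE 0xfed00000u  /* conventional HPET MMIO base */

    /* Sketch of the recurring quirk pattern: program 'reg' with the
     * HPET base and return what the chipset actually latched, which
     * becomes force_hpet_address. */
    static u32 quirk_force_hpet(struct pci_dev *dev, int reg)
    {
            u32 val;

            pci_write_config_dword(dev, reg, HPET_FORCE_BASE);
            pci_read_config_dword(dev, reg, &val);
            return val;
    }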
337 | /* | 381 | /* |
338 | * Undocumented chipset feature taken from LinuxBIOS. | 382 | * Undocumented chipset feature taken from LinuxBIOS. |
339 | */ | 383 | */ |
@@ -347,9 +391,14 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev) | |||
347 | { | 391 | { |
348 | u32 uninitialized_var(val); | 392 | u32 uninitialized_var(val); |
349 | 393 | ||
350 | if (!hpet_force_user || hpet_address || force_hpet_address) | 394 | if (hpet_address || force_hpet_address) |
351 | return; | 395 | return; |
352 | 396 | ||
397 | if (!hpet_force_user) { | ||
398 | hpet_print_force_info(); | ||
399 | return; | ||
400 | } | ||
401 | |||
353 | pci_write_config_dword(dev, 0x44, 0xfed00001); | 402 | pci_write_config_dword(dev, 0x44, 0xfed00001); |
354 | pci_read_config_dword(dev, 0x44, &val); | 403 | pci_read_config_dword(dev, 0x44, &val); |
355 | force_hpet_address = val & 0xfffffffe; | 404 | force_hpet_address = val & 0xfffffffe; |
@@ -401,6 +450,9 @@ void force_hpet_resume(void) | |||
401 | case NVIDIA_FORCE_HPET_RESUME: | 450 | case NVIDIA_FORCE_HPET_RESUME: |
402 | nvidia_force_hpet_resume(); | 451 | nvidia_force_hpet_resume(); |
403 | return; | 452 | return; |
453 | case ATI_FORCE_HPET_RESUME: | ||
454 | ati_force_hpet_resume(); | ||
455 | return; | ||
404 | default: | 456 | default: |
405 | break; | 457 | break; |
406 | } | 458 | } |
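The new ATI quirk above follows the same three-step shape this patch retrofits into the VT8237 and NVIDIA paths: bail out if an HPET address is already known, print a hint (hpet_print_force_info()) unless the user explicitly booted with hpet=force, and only then touch the chipset. A minimal sketch of the shared pattern in C, using the symbols visible in the hunks; only the config-space offset differs per chipset (0x14 for the ATI IXP400 SMBus device, 0x68 for the VT8237, 0x44 for the NVIDIA part):

    static void example_force_enable_hpet(struct pci_dev *dev)
    {
    	u32 val;

    	if (hpet_address || force_hpet_address)
    		return;				/* HPET already discovered */

    	if (!hpet_force_user) {
    		hpet_print_force_info();	/* advertise hpet=force */
    		return;
    	}

    	/* chipset-specific enable: write the fixed MMIO base, read it back */
    	pci_write_config_dword(dev, 0x14, 0xfed00000);
    	pci_read_config_dword(dev, 0x14, &val);
    	force_hpet_address = val;
    	force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
    	cached_dev = dev;	/* so force_hpet_resume() can redo this */
    }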
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f6be7d5f82f8..724adfc63cb9 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -27,7 +27,7 @@ | |||
27 | void (*pm_power_off)(void); | 27 | void (*pm_power_off)(void); |
28 | EXPORT_SYMBOL(pm_power_off); | 28 | EXPORT_SYMBOL(pm_power_off); |
29 | 29 | ||
30 | static long no_idt[3]; | 30 | static const struct desc_ptr no_idt = {}; |
31 | static int reboot_mode; | 31 | static int reboot_mode; |
32 | enum reboot_type reboot_type = BOOT_KBD; | 32 | enum reboot_type reboot_type = BOOT_KBD; |
33 | int reboot_force; | 33 | int reboot_force; |
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
177 | DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), | 177 | DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), |
178 | }, | 178 | }, |
179 | }, | 179 | }, |
180 | { /* Handle problems with rebooting on Dell T5400's */ | ||
181 | .callback = set_bios_reboot, | ||
182 | .ident = "Dell Precision T5400", | ||
183 | .matches = { | ||
184 | DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), | ||
185 | DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), | ||
186 | }, | ||
187 | }, | ||
180 | { /* Handle problems with rebooting on HP laptops */ | 188 | { /* Handle problems with rebooting on HP laptops */ |
181 | .callback = set_bios_reboot, | 189 | .callback = set_bios_reboot, |
182 | .ident = "HP Compaq Laptop", | 190 | .ident = "HP Compaq Laptop", |
@@ -201,15 +209,15 @@ core_initcall(reboot_init); | |||
201 | controller to pulse the CPU reset line, which is more thorough, but | 209 | controller to pulse the CPU reset line, which is more thorough, but |
202 | doesn't work with at least one type of 486 motherboard. It is easy | 210 | doesn't work with at least one type of 486 motherboard. It is easy |
203 | to stop this code working; hence the copious comments. */ | 211 | to stop this code working; hence the copious comments. */ |
204 | static unsigned long long | 212 | static const unsigned long long |
205 | real_mode_gdt_entries [3] = | 213 | real_mode_gdt_entries [3] = |
206 | { | 214 | { |
207 | 0x0000000000000000ULL, /* Null descriptor */ | 215 | 0x0000000000000000ULL, /* Null descriptor */ |
208 | 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ | 216 | 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ |
209 | 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ | 217 | 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ |
210 | }; | 218 | }; |
211 | 219 | ||
212 | static struct desc_ptr | 220 | static const struct desc_ptr |
213 | real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, | 221 | real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, |
214 | real_mode_idt = { 0x3ff, 0 }; | 222 | real_mode_idt = { 0x3ff, 0 }; |
215 | 223 | ||
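Besides adding const, the hunk above flips the access bytes from 0x9a/0x92 to 0x9b/0x93. Bit 0 of a segment descriptor's access byte is the "accessed" flag, which the CPU otherwise sets by writing into the GDT the first time the segment is loaded; pre-setting it means the now-const table never needs to be written. A short decoding sketch (standard x86 descriptor layout, not code from the patch):

    #define SEG_ACCESSED	(1 << 0)	/* CPU writes this on first load */
    #define SEG_RW		(1 << 1)	/* code: readable, data: writable */
    #define SEG_CODE	(1 << 3)	/* executable segment */
    #define SEG_NONSYS	(1 << 4)	/* S=1: ordinary code/data segment */
    #define SEG_PRESENT	(1 << 7)

    /* 0x9b = PRESENT | NONSYS | CODE | RW | ACCESSED   (real-mode code) */
    /* 0x93 = PRESENT | NONSYS | RW | ACCESSED          (real-mode data) */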
@@ -231,7 +239,7 @@ real_mode_idt = { 0x3ff, 0 }; | |||
231 | 239 | ||
232 | More could be done here to set up the registers as if a CPU reset had | 240 | More could be done here to set up the registers as if a CPU reset had |
233 | occurred; hopefully real BIOSs don't assume much. */ | 241 | occurred; hopefully real BIOSs don't assume much. */ |
234 | static unsigned char real_mode_switch [] = | 242 | static const unsigned char real_mode_switch [] = |
235 | { | 243 | { |
236 | 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ | 244 | 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ |
237 | 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ | 245 | 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ |
@@ -245,7 +253,7 @@ static unsigned char real_mode_switch [] = | |||
245 | 0x24, 0x10, /* f: andb $0x10,al */ | 253 | 0x24, 0x10, /* f: andb $0x10,al */ |
246 | 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ | 254 | 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ |
247 | }; | 255 | }; |
248 | static unsigned char jump_to_bios [] = | 256 | static const unsigned char jump_to_bios [] = |
249 | { | 257 | { |
250 | 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ | 258 | 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ |
251 | }; | 259 | }; |
@@ -255,7 +263,7 @@ static unsigned char jump_to_bios [] = | |||
255 | * specified by the code and length parameters. | 263 | * specified by the code and length parameters. |
256 | * We assume that length will always be less than 100! | 264 | * We assume that length will always be less than 100!
257 | */ | 265 | */ |
258 | void machine_real_restart(unsigned char *code, int length) | 266 | void machine_real_restart(const unsigned char *code, int length) |
259 | { | 267 | { |
260 | local_irq_disable(); | 268 | local_irq_disable(); |
261 | 269 | ||
@@ -368,7 +376,7 @@ static void native_machine_emergency_restart(void) | |||
368 | } | 376 | } |
369 | 377 | ||
370 | case BOOT_TRIPLE: | 378 | case BOOT_TRIPLE: |
371 | load_idt((const struct desc_ptr *)&no_idt); | 379 | load_idt(&no_idt); |
372 | __asm__ __volatile__("int3"); | 380 | __asm__ __volatile__("int3"); |
373 | 381 | ||
374 | reboot_type = BOOT_KBD; | 382 | reboot_type = BOOT_KBD; |
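The BOOT_TRIPLE path works by provoking a deliberate triple fault: with a zero-limit IDT loaded, the int3 cannot be delivered, the resulting fault cannot be delivered either, and the processor resets. Turning no_idt into an empty struct desc_ptr gives the same zero limit and base as the old three-long array, with the right type, so the cast at the load_idt() call can go away. A free-standing sketch of the trick:

    static const struct desc_ptr empty_idt = { 0, 0 };	/* limit 0, base 0 */

    static void force_triple_fault(void)
    {
    	load_idt(&empty_idt);	/* no vector can be delivered from here on */
    	asm volatile("int3");	/* #BP -> #DF -> triple fault -> CPU reset */
    }

If the CPU somehow survives, the code above falls back to reboot_type = BOOT_KBD.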
@@ -403,10 +411,9 @@ void native_machine_shutdown(void) | |||
403 | { | 411 | { |
404 | /* Stop the cpus and apics */ | 412 | /* Stop the cpus and apics */ |
405 | #ifdef CONFIG_SMP | 413 | #ifdef CONFIG_SMP |
406 | int reboot_cpu_id; | ||
407 | 414 | ||
408 | /* The boot cpu is always logical cpu 0 */ | 415 | /* The boot cpu is always logical cpu 0 */ |
409 | reboot_cpu_id = 0; | 416 | int reboot_cpu_id = 0; |
410 | 417 | ||
411 | #ifdef CONFIG_X86_32 | 418 | #ifdef CONFIG_X86_32 |
412 | /* See if there has been given a command line override */ | 419 | /* See if there has been given a command line override */ |
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index dec0b5ec25c2..61a837743fe5 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c | |||
@@ -49,7 +49,7 @@ struct device_fixup { | |||
49 | void (*reboot_fixup)(struct pci_dev *); | 49 | void (*reboot_fixup)(struct pci_dev *); |
50 | }; | 50 | }; |
51 | 51 | ||
52 | static struct device_fixup fixups_table[] = { | 52 | static const struct device_fixup fixups_table[] = { |
53 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, | 53 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, |
54 | { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, | 54 | { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, |
55 | { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, | 55 | { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, |
@@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = { | |||
64 | */ | 64 | */ |
65 | void mach_reboot_fixups(void) | 65 | void mach_reboot_fixups(void) |
66 | { | 66 | { |
67 | struct device_fixup *cur; | 67 | const struct device_fixup *cur; |
68 | struct pci_dev *dev; | 68 | struct pci_dev *dev; |
69 | int i; | 69 | int i; |
70 | 70 | ||
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c30fe25d470d..703310a99023 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S | |||
@@ -20,11 +20,44 @@ | |||
20 | #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | 20 | #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) |
21 | #define PAE_PGD_ATTR (_PAGE_PRESENT) | 21 | #define PAE_PGD_ATTR (_PAGE_PRESENT) |
22 | 22 | ||
23 | /* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are | ||
24 | * used to save some data for jumping back | ||
25 | */ | ||
26 | #define DATA(offset) (PAGE_SIZE/2+(offset)) | ||
27 | |||
28 | /* Minimal CPU state */ | ||
29 | #define ESP DATA(0x0) | ||
30 | #define CR0 DATA(0x4) | ||
31 | #define CR3 DATA(0x8) | ||
32 | #define CR4 DATA(0xc) | ||
33 | |||
34 | /* other data */ | ||
35 | #define CP_VA_CONTROL_PAGE DATA(0x10) | ||
36 | #define CP_PA_PGD DATA(0x14) | ||
37 | #define CP_PA_SWAP_PAGE DATA(0x18) | ||
38 | #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) | ||
39 | |||
23 | .text | 40 | .text |
24 | .align PAGE_SIZE | 41 | .align PAGE_SIZE |
25 | .globl relocate_kernel | 42 | .globl relocate_kernel |
26 | relocate_kernel: | 43 | relocate_kernel: |
27 | movl 8(%esp), %ebp /* list of pages */ | 44 | /* Save the CPU context, used for jumping back */ |
45 | |||
46 | pushl %ebx | ||
47 | pushl %esi | ||
48 | pushl %edi | ||
49 | pushl %ebp | ||
50 | pushf | ||
51 | |||
52 | movl 20+8(%esp), %ebp /* list of pages */ | ||
53 | movl PTR(VA_CONTROL_PAGE)(%ebp), %edi | ||
54 | movl %esp, ESP(%edi) | ||
55 | movl %cr0, %eax | ||
56 | movl %eax, CR0(%edi) | ||
57 | movl %cr3, %eax | ||
58 | movl %eax, CR3(%edi) | ||
59 | movl %cr4, %eax | ||
60 | movl %eax, CR4(%edi) | ||
28 | 61 | ||
29 | #ifdef CONFIG_X86_PAE | 62 | #ifdef CONFIG_X86_PAE |
30 | /* map the control page at its virtual address */ | 63 | /* map the control page at its virtual address */ |
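The DATA() offsets carve a save area out of the control page, starting at PAGE_SIZE/2 as the comment says. Seen from C, the layout the assembly maintains would look roughly like the struct below (hypothetical, for orientation only; the real code addresses the fields through the #define'd offsets):

    struct kexec_jump_back_data {	/* at control_page + PAGE_SIZE/2 */
    	u32 esp;			/* ESP  = DATA(0x0) */
    	u32 cr0;			/* CR0  = DATA(0x4) */
    	u32 cr3;			/* CR3  = DATA(0x8) */
    	u32 cr4;			/* CR4  = DATA(0xc) */
    	u32 va_control_page;		/* CP_VA_CONTROL_PAGE     = DATA(0x10) */
    	u32 pa_pgd;			/* CP_PA_PGD              = DATA(0x14) */
    	u32 pa_swap_page;		/* CP_PA_SWAP_PAGE        = DATA(0x18) */
    	u32 pa_backup_pages_map;	/* CP_PA_BACKUP_PAGES_MAP = DATA(0x1c) */
    };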
@@ -138,15 +171,25 @@ relocate_kernel: | |||
138 | 171 | ||
139 | relocate_new_kernel: | 172 | relocate_new_kernel: |
140 | /* read the arguments and say goodbye to the stack */ | 173 | /* read the arguments and say goodbye to the stack */ |
141 | movl 4(%esp), %ebx /* page_list */ | 174 | movl 20+4(%esp), %ebx /* page_list */ |
142 | movl 8(%esp), %ebp /* list of pages */ | 175 | movl 20+8(%esp), %ebp /* list of pages */ |
143 | movl 12(%esp), %edx /* start address */ | 176 | movl 20+12(%esp), %edx /* start address */ |
144 | movl 16(%esp), %ecx /* cpu_has_pae */ | 177 | movl 20+16(%esp), %ecx /* cpu_has_pae */ |
178 | movl 20+20(%esp), %esi /* preserve_context */ | ||
145 | 179 | ||
146 | /* zero out flags, and disable interrupts */ | 180 | /* zero out flags, and disable interrupts */ |
147 | pushl $0 | 181 | pushl $0 |
148 | popfl | 182 | popfl |
149 | 183 | ||
184 | /* save some information for jumping back */ | ||
185 | movl PTR(VA_CONTROL_PAGE)(%ebp), %edi | ||
186 | movl %edi, CP_VA_CONTROL_PAGE(%edi) | ||
187 | movl PTR(PA_PGD)(%ebp), %eax | ||
188 | movl %eax, CP_PA_PGD(%edi) | ||
189 | movl PTR(PA_SWAP_PAGE)(%ebp), %eax | ||
190 | movl %eax, CP_PA_SWAP_PAGE(%edi) | ||
191 | movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) | ||
192 | |||
150 | /* get physical address of control page now */ | 193 | /* get physical address of control page now */ |
151 | /* this is impossible after page table switch */ | 194 | /* this is impossible after page table switch */ |
152 | movl PTR(PA_CONTROL_PAGE)(%ebp), %edi | 195 | movl PTR(PA_CONTROL_PAGE)(%ebp), %edi |
@@ -197,8 +240,90 @@ identity_mapped: | |||
197 | xorl %eax, %eax | 240 | xorl %eax, %eax |
198 | movl %eax, %cr3 | 241 | movl %eax, %cr3 |
199 | 242 | ||
243 | movl CP_PA_SWAP_PAGE(%edi), %eax | ||
244 | pushl %eax | ||
245 | pushl %ebx | ||
246 | call swap_pages | ||
247 | addl $8, %esp | ||
248 | |||
249 | /* To be certain of avoiding problems with self-modifying code | ||
250 | * I need to execute a serializing instruction here. | ||
251 | * So I flush the TLB, it's handy, and not processor dependent. | ||
252 | */ | ||
253 | xorl %eax, %eax | ||
254 | movl %eax, %cr3 | ||
255 | |||
256 | /* set all of the registers to known values */ | ||
257 | /* leave %esp alone */ | ||
258 | |||
259 | testl %esi, %esi | ||
260 | jnz 1f | ||
261 | xorl %edi, %edi | ||
262 | xorl %eax, %eax | ||
263 | xorl %ebx, %ebx | ||
264 | xorl %ecx, %ecx | ||
265 | xorl %edx, %edx | ||
266 | xorl %esi, %esi | ||
267 | xorl %ebp, %ebp | ||
268 | ret | ||
269 | 1: | ||
270 | popl %edx | ||
271 | movl CP_PA_SWAP_PAGE(%edi), %esp | ||
272 | addl $PAGE_SIZE, %esp | ||
273 | 2: | ||
274 | call *%edx | ||
275 | |||
276 | /* get the re-entry point of the peer system */ | ||
277 | movl 0(%esp), %ebp | ||
278 | call 1f | ||
279 | 1: | ||
280 | popl %ebx | ||
281 | subl $(1b - relocate_kernel), %ebx | ||
282 | movl CP_VA_CONTROL_PAGE(%ebx), %edi | ||
283 | lea PAGE_SIZE(%ebx), %esp | ||
284 | movl CP_PA_SWAP_PAGE(%ebx), %eax | ||
285 | movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx | ||
286 | pushl %eax | ||
287 | pushl %edx | ||
288 | call swap_pages | ||
289 | addl $8, %esp | ||
290 | movl CP_PA_PGD(%ebx), %eax | ||
291 | movl %eax, %cr3 | ||
292 | movl %cr0, %eax | ||
293 | orl $(1<<31), %eax | ||
294 | movl %eax, %cr0 | ||
295 | lea PAGE_SIZE(%edi), %esp | ||
296 | movl %edi, %eax | ||
297 | addl $(virtual_mapped - relocate_kernel), %eax | ||
298 | pushl %eax | ||
299 | ret | ||
300 | |||
301 | virtual_mapped: | ||
302 | movl CR4(%edi), %eax | ||
303 | movl %eax, %cr4 | ||
304 | movl CR3(%edi), %eax | ||
305 | movl %eax, %cr3 | ||
306 | movl CR0(%edi), %eax | ||
307 | movl %eax, %cr0 | ||
308 | movl ESP(%edi), %esp | ||
309 | movl %ebp, %eax | ||
310 | |||
311 | popf | ||
312 | popl %ebp | ||
313 | popl %edi | ||
314 | popl %esi | ||
315 | popl %ebx | ||
316 | ret | ||
317 | |||
200 | /* Do the copies */ | 318 | /* Do the copies */ |
201 | movl %ebx, %ecx | 319 | swap_pages: |
320 | movl 8(%esp), %edx | ||
321 | movl 4(%esp), %ecx | ||
322 | pushl %ebp | ||
323 | pushl %ebx | ||
324 | pushl %edi | ||
325 | pushl %esi | ||
326 | movl %ecx, %ebx | ||
202 | jmp 1f | 327 | jmp 1f |
203 | 328 | ||
204 | 0: /* top, read another word from the indirection page */ | 329 | 0: /* top, read another word from the indirection page */ |
@@ -226,27 +351,28 @@ identity_mapped: | |||
226 | movl %ecx, %esi /* For every source page do a copy */ | 351 | movl %ecx, %esi /* For every source page do a copy */ |
227 | andl $0xfffff000, %esi | 352 | andl $0xfffff000, %esi |
228 | 353 | ||
354 | movl %edi, %eax | ||
355 | movl %esi, %ebp | ||
356 | |||
357 | movl %edx, %edi | ||
229 | movl $1024, %ecx | 358 | movl $1024, %ecx |
230 | rep ; movsl | 359 | rep ; movsl |
231 | jmp 0b | ||
232 | 360 | ||
233 | 3: | 361 | movl %ebp, %edi |
234 | 362 | movl %eax, %esi | |
235 | /* To be certain of avoiding problems with self-modifying code | 363 | movl $1024, %ecx |
236 | * I need to execute a serializing instruction here. | 364 | rep ; movsl |
237 | * So I flush the TLB, it's handy, and not processor dependent. | ||
238 | */ | ||
239 | xorl %eax, %eax | ||
240 | movl %eax, %cr3 | ||
241 | 365 | ||
242 | /* set all of the registers to known values */ | 366 | movl %eax, %edi |
243 | /* leave %esp alone */ | 367 | movl %edx, %esi |
368 | movl $1024, %ecx | ||
369 | rep ; movsl | ||
244 | 370 | ||
245 | xorl %eax, %eax | 371 | lea PAGE_SIZE(%ebp), %esi |
246 | xorl %ebx, %ebx | 372 | jmp 0b |
247 | xorl %ecx, %ecx | 373 | 3: |
248 | xorl %edx, %edx | 374 | popl %esi |
249 | xorl %esi, %esi | 375 | popl %edi |
250 | xorl %edi, %edi | 376 | popl %ebx |
251 | xorl %ebp, %ebp | 377 | popl %ebp |
252 | ret | 378 | ret |
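The new swap_pages routine replaces the old one-way rep movsl with a genuine exchange: each source page is staged into the swap page, the destination is copied over the source, and the staged copy lands in the destination. That is what makes a preserve_context kexec reversible, since neither side's pages are clobbered. The same dance expressed in C (a sketch; the assembly walks the kimage indirection list rather than a flat array, and 1024 dwords equal one 4096-byte page, matching the movl $1024, %ecx above):

    static void swap_one_page(void *dst, void *src, void *swap)
    {
    	memcpy(swap, src, 4096);	/* source page -> swap page    */
    	memcpy(src,  dst, 4096);	/* destination -> source       */
    	memcpy(dst, swap, 4096);	/* old source  -> destination  */
    }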
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6f80b852a196..b520dae02bf4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -1,139 +1,885 @@ | |||
1 | #include <linux/kernel.h> | 1 | /* |
2 | * Copyright (C) 1995 Linus Torvalds | ||
3 | * | ||
4 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | ||
5 | * | ||
6 | * Memory region support | ||
7 | * David Parsons <orc@pell.chi.il.us>, July-August 1999 | ||
8 | * | ||
9 | * Added E820 sanitization routine (removes overlapping memory regions); | ||
10 | * Brian Moyle <bmoyle@mvista.com>, February 2001 | ||
11 | * | ||
12 | * Moved CPU detection code to cpu/${cpu}.c | ||
13 | * Patrick Mochel <mochel@osdl.org>, March 2002 | ||
14 | * | ||
15 | * Provisions for empty E820 memory regions (reported by certain BIOSes). | ||
16 | * Alex Achenbach <xela@slit.de>, December 2002. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * This file handles the architecture-dependent parts of initialization | ||
22 | */ | ||
23 | |||
24 | #include <linux/sched.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/mmzone.h> | ||
27 | #include <linux/screen_info.h> | ||
28 | #include <linux/ioport.h> | ||
29 | #include <linux/acpi.h> | ||
30 | #include <linux/apm_bios.h> | ||
31 | #include <linux/initrd.h> | ||
32 | #include <linux/bootmem.h> | ||
33 | #include <linux/seq_file.h> | ||
34 | #include <linux/console.h> | ||
35 | #include <linux/mca.h> | ||
36 | #include <linux/root_dev.h> | ||
37 | #include <linux/highmem.h> | ||
2 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/efi.h> | ||
3 | #include <linux/init.h> | 40 | #include <linux/init.h> |
4 | #include <linux/bootmem.h> | 41 | #include <linux/edd.h> |
42 | #include <linux/iscsi_ibft.h> | ||
43 | #include <linux/nodemask.h> | ||
44 | #include <linux/kexec.h> | ||
45 | #include <linux/dmi.h> | ||
46 | #include <linux/pfn.h> | ||
47 | #include <linux/pci.h> | ||
48 | #include <asm/pci-direct.h> | ||
49 | #include <linux/init_ohci1394_dma.h> | ||
50 | #include <linux/kvm_para.h> | ||
51 | |||
52 | #include <linux/errno.h> | ||
53 | #include <linux/kernel.h> | ||
54 | #include <linux/stddef.h> | ||
55 | #include <linux/unistd.h> | ||
56 | #include <linux/ptrace.h> | ||
57 | #include <linux/slab.h> | ||
58 | #include <linux/user.h> | ||
59 | #include <linux/delay.h> | ||
60 | |||
61 | #include <linux/kallsyms.h> | ||
62 | #include <linux/cpufreq.h> | ||
63 | #include <linux/dma-mapping.h> | ||
64 | #include <linux/ctype.h> | ||
65 | #include <linux/uaccess.h> | ||
66 | |||
5 | #include <linux/percpu.h> | 67 | #include <linux/percpu.h> |
6 | #include <asm/smp.h> | 68 | #include <linux/crash_dump.h> |
7 | #include <asm/percpu.h> | 69 | |
70 | #include <video/edid.h> | ||
71 | |||
72 | #include <asm/mtrr.h> | ||
73 | #include <asm/apic.h> | ||
74 | #include <asm/e820.h> | ||
75 | #include <asm/mpspec.h> | ||
76 | #include <asm/setup.h> | ||
77 | #include <asm/arch_hooks.h> | ||
78 | #include <asm/efi.h> | ||
8 | #include <asm/sections.h> | 79 | #include <asm/sections.h> |
80 | #include <asm/dmi.h> | ||
81 | #include <asm/io_apic.h> | ||
82 | #include <asm/ist.h> | ||
83 | #include <asm/vmi.h> | ||
84 | #include <setup_arch.h> | ||
85 | #include <asm/bios_ebda.h> | ||
86 | #include <asm/cacheflush.h> | ||
9 | #include <asm/processor.h> | 87 | #include <asm/processor.h> |
10 | #include <asm/setup.h> | 88 | #include <asm/bugs.h> |
89 | |||
90 | #include <asm/system.h> | ||
91 | #include <asm/vsyscall.h> | ||
92 | #include <asm/smp.h> | ||
93 | #include <asm/desc.h> | ||
94 | #include <asm/dma.h> | ||
95 | #include <asm/iommu.h> | ||
96 | #include <asm/mmu_context.h> | ||
97 | #include <asm/proto.h> | ||
98 | |||
99 | #include <mach_apic.h> | ||
100 | #include <asm/paravirt.h> | ||
101 | |||
102 | #include <asm/percpu.h> | ||
11 | #include <asm/topology.h> | 103 | #include <asm/topology.h> |
12 | #include <asm/mpspec.h> | ||
13 | #include <asm/apicdef.h> | 104 | #include <asm/apicdef.h> |
105 | #ifdef CONFIG_X86_64 | ||
106 | #include <asm/numa_64.h> | ||
107 | #endif | ||
14 | 108 | ||
15 | #ifdef CONFIG_X86_LOCAL_APIC | 109 | #ifndef ARCH_SETUP |
16 | unsigned int num_processors; | 110 | #define ARCH_SETUP |
17 | unsigned disabled_cpus __cpuinitdata; | 111 | #endif |
18 | /* Processor that is doing the boot up */ | ||
19 | unsigned int boot_cpu_physical_apicid = -1U; | ||
20 | EXPORT_SYMBOL(boot_cpu_physical_apicid); | ||
21 | 112 | ||
22 | DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; | 113 | #ifndef CONFIG_DEBUG_BOOT_PARAMS |
23 | EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); | 114 | struct boot_params __initdata boot_params; |
115 | #else | ||
116 | struct boot_params boot_params; | ||
117 | #endif | ||
24 | 118 | ||
25 | /* Bitmask of physically existing CPUs */ | 119 | /* |
26 | physid_mask_t phys_cpu_present_map; | 120 | * Machine setup.. |
121 | */ | ||
122 | static struct resource data_resource = { | ||
123 | .name = "Kernel data", | ||
124 | .start = 0, | ||
125 | .end = 0, | ||
126 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
127 | }; | ||
128 | |||
129 | static struct resource code_resource = { | ||
130 | .name = "Kernel code", | ||
131 | .start = 0, | ||
132 | .end = 0, | ||
133 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
134 | }; | ||
135 | |||
136 | static struct resource bss_resource = { | ||
137 | .name = "Kernel bss", | ||
138 | .start = 0, | ||
139 | .end = 0, | ||
140 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
141 | }; | ||
142 | |||
143 | |||
144 | #ifdef CONFIG_X86_32 | ||
145 | /* This value is set up by the early boot code to point to the value | ||
146 | immediately after the boot time page tables. It contains a *physical* | ||
147 | address, and must not be in the .bss segment! */ | ||
148 | unsigned long init_pg_tables_start __initdata = ~0UL; | ||
149 | unsigned long init_pg_tables_end __initdata = ~0UL; | ||
150 | |||
151 | static struct resource video_ram_resource = { | ||
152 | .name = "Video RAM area", | ||
153 | .start = 0xa0000, | ||
154 | .end = 0xbffff, | ||
155 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
156 | }; | ||
157 | |||
158 | /* cpu data as detected by the assembly code in head.S */ | ||
159 | struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; | ||
160 | /* common cpu data for all cpus */ | ||
161 | struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; | ||
162 | EXPORT_SYMBOL(boot_cpu_data); | ||
163 | static void set_mca_bus(int x) | ||
164 | { | ||
165 | #ifdef CONFIG_MCA | ||
166 | MCA_bus = x; | ||
167 | #endif | ||
168 | } | ||
169 | |||
170 | unsigned int def_to_bigsmp; | ||
171 | |||
172 | /* for MCA, but anyone else can use it if they want */ | ||
173 | unsigned int machine_id; | ||
174 | unsigned int machine_submodel_id; | ||
175 | unsigned int BIOS_revision; | ||
176 | |||
177 | struct apm_info apm_info; | ||
178 | EXPORT_SYMBOL(apm_info); | ||
179 | |||
180 | #if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ | ||
181 | defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) | ||
182 | struct ist_info ist_info; | ||
183 | EXPORT_SYMBOL(ist_info); | ||
184 | #else | ||
185 | struct ist_info ist_info; | ||
27 | #endif | 186 | #endif |
28 | 187 | ||
29 | #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) | 188 | #else |
189 | struct cpuinfo_x86 boot_cpu_data __read_mostly; | ||
190 | EXPORT_SYMBOL(boot_cpu_data); | ||
191 | #endif | ||
192 | |||
193 | |||
194 | #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) | ||
195 | unsigned long mmu_cr4_features; | ||
196 | #else | ||
197 | unsigned long mmu_cr4_features = X86_CR4_PAE; | ||
198 | #endif | ||
199 | |||
200 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | ||
201 | int bootloader_type; | ||
202 | |||
30 | /* | 203 | /* |
31 | * Copy data used in early init routines from the initial arrays to the | 204 | * Early DMI memory |
32 | * per cpu data areas. These arrays then become expendable and the | ||
33 | * *_early_ptr's are zeroed indicating that the static arrays are gone. | ||
34 | */ | 205 | */ |
35 | static void __init setup_per_cpu_maps(void) | 206 | int dmi_alloc_index; |
207 | char dmi_alloc_data[DMI_MAX_DATA]; | ||
208 | |||
209 | /* | ||
210 | * Setup options | ||
211 | */ | ||
212 | struct screen_info screen_info; | ||
213 | EXPORT_SYMBOL(screen_info); | ||
214 | struct edid_info edid_info; | ||
215 | EXPORT_SYMBOL_GPL(edid_info); | ||
216 | |||
217 | extern int root_mountflags; | ||
218 | |||
219 | unsigned long saved_video_mode; | ||
220 | |||
221 | #define RAMDISK_IMAGE_START_MASK 0x07FF | ||
222 | #define RAMDISK_PROMPT_FLAG 0x8000 | ||
223 | #define RAMDISK_LOAD_FLAG 0x4000 | ||
224 | |||
225 | static char __initdata command_line[COMMAND_LINE_SIZE]; | ||
226 | |||
227 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | ||
228 | struct edd edd; | ||
229 | #ifdef CONFIG_EDD_MODULE | ||
230 | EXPORT_SYMBOL(edd); | ||
231 | #endif | ||
232 | /** | ||
233 | * copy_edd() - Copy the BIOS EDD information | ||
234 | * from boot_params into a safe place. | ||
235 | * | ||
236 | */ | ||
237 | static inline void copy_edd(void) | ||
238 | { | ||
239 | memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, | ||
240 | sizeof(edd.mbr_signature)); | ||
241 | memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); | ||
242 | edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; | ||
243 | edd.edd_info_nr = boot_params.eddbuf_entries; | ||
244 | } | ||
245 | #else | ||
246 | static inline void copy_edd(void) | ||
247 | { | ||
248 | } | ||
249 | #endif | ||
250 | |||
251 | #ifdef CONFIG_BLK_DEV_INITRD | ||
252 | |||
253 | #ifdef CONFIG_X86_32 | ||
254 | |||
255 | #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) | ||
256 | static void __init relocate_initrd(void) | ||
36 | { | 257 | { |
37 | int cpu; | ||
38 | 258 | ||
39 | for_each_possible_cpu(cpu) { | 259 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; |
40 | per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; | 260 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; |
41 | per_cpu(x86_bios_cpu_apicid, cpu) = | 261 | u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; |
42 | x86_bios_cpu_apicid_init[cpu]; | 262 | u64 ramdisk_here; |
43 | #ifdef CONFIG_NUMA | 263 | unsigned long slop, clen, mapaddr; |
44 | per_cpu(x86_cpu_to_node_map, cpu) = | 264 | char *p, *q; |
45 | x86_cpu_to_node_map_init[cpu]; | 265 | |
266 | /* We need to move the initrd down into lowmem */ | ||
267 | ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, | ||
268 | PAGE_SIZE); | ||
269 | |||
270 | if (ramdisk_here == -1ULL) | ||
271 | panic("Cannot find place for new RAMDISK of size %lld\n", | ||
272 | ramdisk_size); | ||
273 | |||
274 | /* Note: this includes all the lowmem currently occupied by | ||
275 | the initrd; we rely on that fact to keep the data intact. */ | ||
276 | reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, | ||
277 | "NEW RAMDISK"); | ||
278 | initrd_start = ramdisk_here + PAGE_OFFSET; | ||
279 | initrd_end = initrd_start + ramdisk_size; | ||
280 | printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", | ||
281 | ramdisk_here, ramdisk_here + ramdisk_size); | ||
282 | |||
283 | q = (char *)initrd_start; | ||
284 | |||
285 | /* Copy any lowmem portion of the initrd */ | ||
286 | if (ramdisk_image < end_of_lowmem) { | ||
287 | clen = end_of_lowmem - ramdisk_image; | ||
288 | p = (char *)__va(ramdisk_image); | ||
289 | memcpy(q, p, clen); | ||
290 | q += clen; | ||
291 | ramdisk_image += clen; | ||
292 | ramdisk_size -= clen; | ||
293 | } | ||
294 | |||
295 | /* Copy the highmem portion of the initrd */ | ||
296 | while (ramdisk_size) { | ||
297 | slop = ramdisk_image & ~PAGE_MASK; | ||
298 | clen = ramdisk_size; | ||
299 | if (clen > MAX_MAP_CHUNK-slop) | ||
300 | clen = MAX_MAP_CHUNK-slop; | ||
301 | mapaddr = ramdisk_image & PAGE_MASK; | ||
302 | p = early_ioremap(mapaddr, clen+slop); | ||
303 | memcpy(q, p+slop, clen); | ||
304 | early_iounmap(p, clen+slop); | ||
305 | q += clen; | ||
306 | ramdisk_image += clen; | ||
307 | ramdisk_size -= clen; | ||
308 | } | ||
309 | /* high pages are not converted by early_res_to_bootmem */ | ||
310 | ramdisk_image = boot_params.hdr.ramdisk_image; | ||
311 | ramdisk_size = boot_params.hdr.ramdisk_size; | ||
312 | printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" | ||
313 | " %08llx - %08llx\n", | ||
314 | ramdisk_image, ramdisk_image + ramdisk_size - 1, | ||
315 | ramdisk_here, ramdisk_here + ramdisk_size - 1); | ||
316 | } | ||
46 | #endif | 317 | #endif |
318 | |||
319 | static void __init reserve_initrd(void) | ||
320 | { | ||
321 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; | ||
322 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; | ||
323 | u64 ramdisk_end = ramdisk_image + ramdisk_size; | ||
324 | u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; | ||
325 | |||
326 | if (!boot_params.hdr.type_of_loader || | ||
327 | !ramdisk_image || !ramdisk_size) | ||
328 | return; /* No initrd provided by bootloader */ | ||
329 | |||
330 | initrd_start = 0; | ||
331 | |||
332 | if (ramdisk_size >= (end_of_lowmem>>1)) { | ||
333 | free_early(ramdisk_image, ramdisk_end); | ||
334 | printk(KERN_ERR "initrd too large to handle, " | ||
335 | "disabling initrd\n"); | ||
336 | return; | ||
337 | } | ||
338 | |||
339 | printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, | ||
340 | ramdisk_end); | ||
341 | |||
342 | |||
343 | if (ramdisk_end <= end_of_lowmem) { | ||
344 | /* All in lowmem, easy case */ | ||
345 | /* | ||
346 | * don't need to reserve again, already reserved early | ||
347 | * in i386_start_kernel | ||
348 | */ | ||
349 | initrd_start = ramdisk_image + PAGE_OFFSET; | ||
350 | initrd_end = initrd_start + ramdisk_size; | ||
351 | return; | ||
47 | } | 352 | } |
48 | 353 | ||
49 | /* indicate the early static arrays will soon be gone */ | 354 | #ifdef CONFIG_X86_32 |
50 | x86_cpu_to_apicid_early_ptr = NULL; | 355 | relocate_initrd(); |
51 | x86_bios_cpu_apicid_early_ptr = NULL; | 356 | #else |
52 | #ifdef CONFIG_NUMA | 357 | printk(KERN_ERR "initrd extends beyond end of memory " |
53 | x86_cpu_to_node_map_early_ptr = NULL; | 358 | "(0x%08llx > 0x%08llx)\ndisabling initrd\n", |
359 | ramdisk_end, end_of_lowmem); | ||
360 | initrd_start = 0; | ||
54 | #endif | 361 | #endif |
362 | free_early(ramdisk_image, ramdisk_end); | ||
55 | } | 363 | } |
364 | #else | ||
365 | static void __init reserve_initrd(void) | ||
366 | { | ||
367 | } | ||
368 | #endif /* CONFIG_BLK_DEV_INITRD */ | ||
369 | |||
370 | static void __init parse_setup_data(void) | ||
371 | { | ||
372 | struct setup_data *data; | ||
373 | u64 pa_data; | ||
374 | |||
375 | if (boot_params.hdr.version < 0x0209) | ||
376 | return; | ||
377 | pa_data = boot_params.hdr.setup_data; | ||
378 | while (pa_data) { | ||
379 | data = early_ioremap(pa_data, PAGE_SIZE); | ||
380 | switch (data->type) { | ||
381 | case SETUP_E820_EXT: | ||
382 | parse_e820_ext(data, pa_data); | ||
383 | break; | ||
384 | default: | ||
385 | break; | ||
386 | } | ||
387 | pa_data = data->next; | ||
388 | early_iounmap(data, PAGE_SIZE); | ||
389 | } | ||
390 | } | ||
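parse_setup_data() walks a singly linked list of blobs handed over by the boot loader (boot protocol 2.09 and later, hence the 0x0209 version check): each node is temporarily mapped with early_ioremap(), dispatched on its type, and chased through its physical next pointer. For orientation, the node layout this walk assumes, per the x86 boot protocol's struct setup_data:

    struct setup_data {
    	__u64 next;	/* physical address of the next node; 0 ends the list */
    	__u32 type;	/* e.g. SETUP_E820_EXT, handled in the switch above */
    	__u32 len;	/* payload length in bytes */
    	__u8  data[0];	/* payload */
    };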
391 | |||
392 | static void __init e820_reserve_setup_data(void) | ||
393 | { | ||
394 | struct setup_data *data; | ||
395 | u64 pa_data; | ||
396 | int found = 0; | ||
397 | |||
398 | if (boot_params.hdr.version < 0x0209) | ||
399 | return; | ||
400 | pa_data = boot_params.hdr.setup_data; | ||
401 | while (pa_data) { | ||
402 | data = early_ioremap(pa_data, sizeof(*data)); | ||
403 | e820_update_range(pa_data, sizeof(*data)+data->len, | ||
404 | E820_RAM, E820_RESERVED_KERN); | ||
405 | found = 1; | ||
406 | pa_data = data->next; | ||
407 | early_iounmap(data, sizeof(*data)); | ||
408 | } | ||
409 | if (!found) | ||
410 | return; | ||
56 | 411 | ||
57 | #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP | 412 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
58 | cpumask_t *cpumask_of_cpu_map __read_mostly; | 413 | memcpy(&e820_saved, &e820, sizeof(struct e820map)); |
59 | EXPORT_SYMBOL(cpumask_of_cpu_map); | 414 | printk(KERN_INFO "extended physical RAM map:\n"); |
415 | e820_print_map("reserve setup_data"); | ||
416 | } | ||
60 | 417 | ||
61 | /* requires nr_cpu_ids to be initialized */ | 418 | static void __init reserve_early_setup_data(void) |
62 | static void __init setup_cpumask_of_cpu(void) | ||
63 | { | 419 | { |
64 | int i; | 420 | struct setup_data *data; |
421 | u64 pa_data; | ||
422 | char buf[32]; | ||
423 | |||
424 | if (boot_params.hdr.version < 0x0209) | ||
425 | return; | ||
426 | pa_data = boot_params.hdr.setup_data; | ||
427 | while (pa_data) { | ||
428 | data = early_ioremap(pa_data, sizeof(*data)); | ||
429 | sprintf(buf, "setup data %x", data->type); | ||
430 | reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); | ||
431 | pa_data = data->next; | ||
432 | early_iounmap(data, sizeof(*data)); | ||
433 | } | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * --------- Crashkernel reservation ------------------------------ | ||
438 | */ | ||
439 | |||
440 | #ifdef CONFIG_KEXEC | ||
441 | |||
442 | /** | ||
443 | * Reserve @size bytes of crashkernel memory at any suitable offset. | ||
444 | * | ||
445 | * @size: Size of the crashkernel memory to reserve. | ||
446 | * Returns the base address on success, and -1ULL on failure. | ||
447 | */ | ||
448 | unsigned long long find_and_reserve_crashkernel(unsigned long long size) | ||
449 | { | ||
450 | const unsigned long long alignment = 16<<20; /* 16M */ | ||
451 | unsigned long long start = 0LL; | ||
452 | |||
453 | while (1) { | ||
454 | int ret; | ||
455 | |||
456 | start = find_e820_area(start, ULONG_MAX, size, alignment); | ||
457 | if (start == -1ULL) | ||
458 | return start; | ||
459 | |||
460 | /* try to reserve it */ | ||
461 | ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE); | ||
462 | if (ret >= 0) | ||
463 | return start; | ||
65 | 464 | ||
66 | /* alloc_bootmem zeroes memory */ | 465 | start += alignment; |
67 | cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); | 466 | } |
68 | for (i = 0; i < nr_cpu_ids; i++) | 467 | } |
69 | cpu_set(i, cpumask_of_cpu_map[i]); | 468 | |
469 | static inline unsigned long long get_total_mem(void) | ||
470 | { | ||
471 | unsigned long long total; | ||
472 | |||
473 | total = max_low_pfn - min_low_pfn; | ||
474 | #ifdef CONFIG_HIGHMEM | ||
475 | total += highend_pfn - highstart_pfn; | ||
476 | #endif | ||
477 | |||
478 | return total << PAGE_SHIFT; | ||
479 | } | ||
480 | |||
481 | static void __init reserve_crashkernel(void) | ||
482 | { | ||
483 | unsigned long long total_mem; | ||
484 | unsigned long long crash_size, crash_base; | ||
485 | int ret; | ||
486 | |||
487 | total_mem = get_total_mem(); | ||
488 | |||
489 | ret = parse_crashkernel(boot_command_line, total_mem, | ||
490 | &crash_size, &crash_base); | ||
491 | if (ret != 0 || crash_size <= 0) | ||
492 | return; | ||
493 | |||
494 | /* 0 means: find the address automatically */ | ||
495 | if (crash_base <= 0) { | ||
496 | crash_base = find_and_reserve_crashkernel(crash_size); | ||
497 | if (crash_base == -1ULL) { | ||
498 | pr_info("crashkernel reservation failed. " | ||
499 | "No suitable area found.\n"); | ||
500 | return; | ||
501 | } | ||
502 | } else { | ||
503 | ret = reserve_bootmem_generic(crash_base, crash_size, | ||
504 | BOOTMEM_EXCLUSIVE); | ||
505 | if (ret < 0) { | ||
506 | pr_info("crashkernel reservation failed - " | ||
507 | "memory is in use\n"); | ||
508 | return; | ||
509 | } | ||
510 | } | ||
511 | |||
512 | printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " | ||
513 | "for crashkernel (System RAM: %ldMB)\n", | ||
514 | (unsigned long)(crash_size >> 20), | ||
515 | (unsigned long)(crash_base >> 20), | ||
516 | (unsigned long)(total_mem >> 20)); | ||
517 | |||
518 | crashk_res.start = crash_base; | ||
519 | crashk_res.end = crash_base + crash_size - 1; | ||
520 | insert_resource(&iomem_resource, &crashk_res); | ||
70 | } | 521 | } |
71 | #else | 522 | #else |
72 | static inline void setup_cpumask_of_cpu(void) { } | 523 | static void __init reserve_crashkernel(void) |
524 | { | ||
525 | } | ||
73 | #endif | 526 | #endif |
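reserve_crashkernel() only acts when the administrator asked for a reservation on the command line; parse_crashkernel() accepts the usual size[@offset] form, and a base of 0 (or no @offset at all) means "pick one", which find_and_reserve_crashkernel() then does by probing the e820 map in 16 MB steps. Illustrative boot parameters (the values are examples, not recommendations):

    crashkernel=128M        reserve 128M, kernel chooses the base address
    crashkernel=128M@16M    reserve 128M at physical address 16M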
74 | 527 | ||
75 | #ifdef CONFIG_X86_32 | 528 | static struct resource standard_io_resources[] = { |
76 | /* | 529 | { .name = "dma1", .start = 0x00, .end = 0x1f, |
77 | * Great future not-so-futuristic plan: make i386 and x86_64 do it | 530 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, |
78 | * the same way | 531 | { .name = "pic1", .start = 0x20, .end = 0x21, |
532 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
533 | { .name = "timer0", .start = 0x40, .end = 0x43, | ||
534 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
535 | { .name = "timer1", .start = 0x50, .end = 0x53, | ||
536 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
537 | { .name = "keyboard", .start = 0x60, .end = 0x60, | ||
538 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
539 | { .name = "keyboard", .start = 0x64, .end = 0x64, | ||
540 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
541 | { .name = "dma page reg", .start = 0x80, .end = 0x8f, | ||
542 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
543 | { .name = "pic2", .start = 0xa0, .end = 0xa1, | ||
544 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
545 | { .name = "dma2", .start = 0xc0, .end = 0xdf, | ||
546 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
547 | { .name = "fpu", .start = 0xf0, .end = 0xff, | ||
548 | .flags = IORESOURCE_BUSY | IORESOURCE_IO } | ||
549 | }; | ||
550 | |||
551 | static void __init reserve_standard_io_resources(void) | ||
552 | { | ||
553 | int i; | ||
554 | |||
555 | /* request I/O space for devices used on all i[345]86 PCs */ | ||
556 | for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | ||
557 | request_resource(&ioport_resource, &standard_io_resources[i]); | ||
558 | |||
559 | } | ||
560 | |||
561 | #ifdef CONFIG_PROC_VMCORE | ||
562 | /* elfcorehdr= specifies the location of elf core header | ||
563 | * stored by the crashed kernel. This option will be passed | ||
564 | * by kexec loader to the capture kernel. | ||
79 | */ | 565 | */ |
80 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | 566 | static int __init setup_elfcorehdr(char *arg) |
81 | EXPORT_SYMBOL(__per_cpu_offset); | 567 | { |
568 | char *end; | ||
569 | if (!arg) | ||
570 | return -EINVAL; | ||
571 | elfcorehdr_addr = memparse(arg, &end); | ||
572 | return end > arg ? 0 : -EINVAL; | ||
573 | } | ||
574 | early_param("elfcorehdr", setup_elfcorehdr); | ||
82 | #endif | 575 | #endif |
83 | 576 | ||
577 | static struct x86_quirks default_x86_quirks __initdata; | ||
578 | |||
579 | struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; | ||
580 | |||
581 | /* | ||
582 | * Determine if we were loaded by an EFI loader. If so, then we have also been | ||
583 | * passed the efi memmap, systab, etc., so we should use these data structures | ||
584 | * for initialization. Note, the efi init code path is determined by the | ||
585 | * global efi_enabled. This allows the same kernel image to be used on existing | ||
586 | * systems (with a traditional BIOS) as well as on EFI systems. | ||
587 | */ | ||
84 | /* | 588 | /* |
85 | * Great future plan: | 589 | * setup_arch - architecture-specific boot-time initializations |
86 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | 590 | * |
87 | * Always point %gs to its beginning | 591 | * Note: On x86_64, fixmaps are ready for use even before this is called. |
88 | */ | 592 | */ |
89 | void __init setup_per_cpu_areas(void) | 593 | |
594 | void __init setup_arch(char **cmdline_p) | ||
90 | { | 595 | { |
91 | int i, highest_cpu = 0; | 596 | #ifdef CONFIG_X86_32 |
92 | unsigned long size; | 597 | memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); |
598 | visws_early_detect(); | ||
599 | pre_setup_arch_hook(); | ||
600 | #else | ||
601 | printk(KERN_INFO "Command line: %s\n", boot_command_line); | ||
602 | #endif | ||
93 | 603 | ||
94 | #ifdef CONFIG_HOTPLUG_CPU | 604 | early_cpu_init(); |
95 | prefill_possible_map(); | 605 | early_ioremap_init(); |
606 | |||
607 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); | ||
608 | screen_info = boot_params.screen_info; | ||
609 | edid_info = boot_params.edid_info; | ||
610 | #ifdef CONFIG_X86_32 | ||
611 | apm_info.bios = boot_params.apm_bios_info; | ||
612 | ist_info = boot_params.ist_info; | ||
613 | if (boot_params.sys_desc_table.length != 0) { | ||
614 | set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); | ||
615 | machine_id = boot_params.sys_desc_table.table[0]; | ||
616 | machine_submodel_id = boot_params.sys_desc_table.table[1]; | ||
617 | BIOS_revision = boot_params.sys_desc_table.table[2]; | ||
618 | } | ||
619 | #endif | ||
620 | saved_video_mode = boot_params.hdr.vid_mode; | ||
621 | bootloader_type = boot_params.hdr.type_of_loader; | ||
622 | |||
623 | #ifdef CONFIG_BLK_DEV_RAM | ||
624 | rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; | ||
625 | rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); | ||
626 | rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); | ||
627 | #endif | ||
628 | #ifdef CONFIG_EFI | ||
629 | if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, | ||
630 | #ifdef CONFIG_X86_32 | ||
631 | "EL32", | ||
632 | #else | ||
633 | "EL64", | ||
96 | #endif | 634 | #endif |
635 | 4)) { | ||
636 | efi_enabled = 1; | ||
637 | efi_reserve_early(); | ||
638 | } | ||
639 | #endif | ||
640 | |||
641 | ARCH_SETUP | ||
642 | |||
643 | setup_memory_map(); | ||
644 | parse_setup_data(); | ||
645 | /* update the e820_saved too */ | ||
646 | e820_reserve_setup_data(); | ||
97 | 647 | ||
98 | /* Copy section for each CPU (we discard the original) */ | 648 | copy_edd(); |
99 | size = PERCPU_ENOUGH_ROOM; | ||
100 | printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", | ||
101 | size); | ||
102 | 649 | ||
103 | for_each_possible_cpu(i) { | 650 | if (!boot_params.hdr.root_flags) |
104 | char *ptr; | 651 | root_mountflags &= ~MS_RDONLY; |
105 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 652 | init_mm.start_code = (unsigned long) _text; |
106 | ptr = alloc_bootmem_pages(size); | 653 | init_mm.end_code = (unsigned long) _etext; |
654 | init_mm.end_data = (unsigned long) _edata; | ||
655 | #ifdef CONFIG_X86_32 | ||
656 | init_mm.brk = init_pg_tables_end + PAGE_OFFSET; | ||
107 | #else | 657 | #else |
108 | int node = early_cpu_to_node(i); | 658 | init_mm.brk = (unsigned long) &_end; |
109 | if (!node_online(node) || !NODE_DATA(node)) { | ||
110 | ptr = alloc_bootmem_pages(size); | ||
111 | printk(KERN_INFO | ||
112 | "cpu %d has no node or node-local memory\n", i); | ||
113 | } | ||
114 | else | ||
115 | ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); | ||
116 | #endif | 659 | #endif |
117 | if (!ptr) | 660 | |
118 | panic("Cannot allocate cpu data for CPU %d\n", i); | 661 | code_resource.start = virt_to_phys(_text); |
119 | #ifdef CONFIG_X86_64 | 662 | code_resource.end = virt_to_phys(_etext)-1; |
120 | cpu_pda(i)->data_offset = ptr - __per_cpu_start; | 663 | data_resource.start = virt_to_phys(_etext); |
664 | data_resource.end = virt_to_phys(_edata)-1; | ||
665 | bss_resource.start = virt_to_phys(&__bss_start); | ||
666 | bss_resource.end = virt_to_phys(&__bss_stop)-1; | ||
667 | |||
668 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | ||
669 | *cmdline_p = command_line; | ||
670 | |||
671 | parse_early_param(); | ||
672 | |||
673 | /* after early param, so could get panic from serial */ | ||
674 | reserve_early_setup_data(); | ||
675 | |||
676 | if (acpi_mps_check()) { | ||
677 | #ifdef CONFIG_X86_LOCAL_APIC | ||
678 | disable_apic = 1; | ||
679 | #endif | ||
680 | setup_clear_cpu_cap(X86_FEATURE_APIC); | ||
681 | } | ||
682 | |||
683 | #ifdef CONFIG_PCI | ||
684 | if (pci_early_dump_regs) | ||
685 | early_dump_pci_devices(); | ||
686 | #endif | ||
687 | |||
688 | finish_e820_parsing(); | ||
689 | |||
690 | #ifdef CONFIG_X86_32 | ||
691 | probe_roms(); | ||
692 | #endif | ||
693 | |||
694 | /* after parse_early_param, so could debug it */ | ||
695 | insert_resource(&iomem_resource, &code_resource); | ||
696 | insert_resource(&iomem_resource, &data_resource); | ||
697 | insert_resource(&iomem_resource, &bss_resource); | ||
698 | |||
699 | if (efi_enabled) | ||
700 | efi_init(); | ||
701 | |||
702 | #ifdef CONFIG_X86_32 | ||
703 | if (ppro_with_ram_bug()) { | ||
704 | e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, | ||
705 | E820_RESERVED); | ||
706 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | ||
707 | printk(KERN_INFO "fixed physical RAM map:\n"); | ||
708 | e820_print_map("bad_ppro"); | ||
709 | } | ||
710 | #else | ||
711 | early_gart_iommu_check(); | ||
712 | #endif | ||
713 | |||
714 | /* | ||
715 | * partially used pages are not usable - thus | ||
716 | * we are rounding upwards: | ||
717 | */ | ||
718 | max_pfn = e820_end_of_ram_pfn(); | ||
719 | |||
720 | /* preallocate 4k for mptable mpc */ | ||
721 | early_reserve_e820_mpc_new(); | ||
722 | /* update e820 for memory not covered by WB MTRRs */ | ||
723 | mtrr_bp_init(); | ||
724 | if (mtrr_trim_uncached_memory(max_pfn)) | ||
725 | max_pfn = e820_end_of_ram_pfn(); | ||
726 | |||
727 | #ifdef CONFIG_X86_32 | ||
728 | /* max_low_pfn get updated here */ | ||
729 | find_low_pfn_range(); | ||
121 | #else | 730 | #else |
122 | __per_cpu_offset[i] = ptr - __per_cpu_start; | 731 | num_physpages = max_pfn; |
732 | |||
733 | check_efer(); | ||
734 | |||
735 | /* How many end-of-memory variables you have, grandma! */ | ||
736 | /* need this before calling reserve_initrd */ | ||
737 | if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) | ||
738 | max_low_pfn = e820_end_of_low_ram_pfn(); | ||
739 | else | ||
740 | max_low_pfn = max_pfn; | ||
741 | |||
742 | high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; | ||
123 | #endif | 743 | #endif |
124 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
125 | 744 | ||
126 | highest_cpu = i; | 745 | /* max_pfn_mapped is updated here */ |
746 | max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); | ||
747 | max_pfn_mapped = max_low_pfn_mapped; | ||
748 | |||
749 | #ifdef CONFIG_X86_64 | ||
750 | if (max_pfn > max_low_pfn) { | ||
751 | max_pfn_mapped = init_memory_mapping(1UL<<32, | ||
752 | max_pfn<<PAGE_SHIFT); | ||
753 | /* can we preserve max_low_pfn? */ | ||
754 | max_low_pfn = max_pfn; | ||
127 | } | 755 | } |
756 | #endif | ||
128 | 757 | ||
129 | nr_cpu_ids = highest_cpu + 1; | 758 | /* |
130 | printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); | 759 | * NOTE: On x86-32, only from this point on, fixmaps are ready for use. |
760 | */ | ||
131 | 761 | ||
132 | /* Setup percpu data maps */ | 762 | #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT |
133 | setup_per_cpu_maps(); | 763 | if (init_ohci1394_dma_early) |
764 | init_ohci1394_dma_on_all_controllers(); | ||
765 | #endif | ||
134 | 766 | ||
135 | /* Setup cpumask_of_cpu map */ | 767 | reserve_initrd(); |
136 | setup_cpumask_of_cpu(); | 768 | |
137 | } | 769 | #ifdef CONFIG_X86_64 |
770 | vsmp_init(); | ||
771 | #endif | ||
772 | |||
773 | dmi_scan_machine(); | ||
774 | |||
775 | io_delay_init(); | ||
776 | |||
777 | /* | ||
778 | * Parse the ACPI tables for possible boot-time SMP configuration. | ||
779 | */ | ||
780 | acpi_boot_table_init(); | ||
781 | |||
782 | #ifdef CONFIG_ACPI_NUMA | ||
783 | /* | ||
784 | * Parse SRAT to discover nodes. | ||
785 | */ | ||
786 | acpi_numa_init(); | ||
787 | #endif | ||
788 | |||
789 | initmem_init(0, max_pfn); | ||
790 | |||
791 | #ifdef CONFIG_X86_64 | ||
792 | dma32_reserve_bootmem(); | ||
793 | #endif | ||
138 | 794 | ||
795 | #ifdef CONFIG_ACPI_SLEEP | ||
796 | /* | ||
797 | * Reserve low memory region for sleep support. | ||
798 | */ | ||
799 | acpi_reserve_bootmem(); | ||
139 | #endif | 800 | #endif |
801 | #ifdef CONFIG_X86_FIND_SMP_CONFIG | ||
802 | /* | ||
803 | * Find and reserve possible boot-time SMP configuration: | ||
804 | */ | ||
805 | find_smp_config(); | ||
806 | #endif | ||
807 | reserve_crashkernel(); | ||
808 | |||
809 | reserve_ibft_region(); | ||
810 | |||
811 | #ifdef CONFIG_KVM_CLOCK | ||
812 | kvmclock_init(); | ||
813 | #endif | ||
814 | |||
815 | #if defined(CONFIG_VMI) && defined(CONFIG_X86_32) | ||
816 | /* | ||
817 | * Must be after max_low_pfn is determined, and before kernel | ||
818 | * pagetables are setup. | ||
819 | */ | ||
820 | vmi_init(); | ||
821 | #endif | ||
822 | |||
823 | paravirt_pagetable_setup_start(swapper_pg_dir); | ||
824 | paging_init(); | ||
825 | paravirt_pagetable_setup_done(swapper_pg_dir); | ||
826 | paravirt_post_allocator_init(); | ||
827 | |||
828 | #ifdef CONFIG_X86_64 | ||
829 | map_vsyscall(); | ||
830 | #endif | ||
831 | |||
832 | #ifdef CONFIG_X86_GENERICARCH | ||
833 | generic_apic_probe(); | ||
834 | #endif | ||
835 | |||
836 | early_quirks(); | ||
837 | |||
838 | /* | ||
839 | * Read APIC and some other early information from ACPI tables. | ||
840 | */ | ||
841 | acpi_boot_init(); | ||
842 | |||
843 | #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) | ||
844 | /* | ||
845 | * get boot-time SMP configuration: | ||
846 | */ | ||
847 | if (smp_found_config) | ||
848 | get_smp_config(); | ||
849 | #endif | ||
850 | |||
851 | prefill_possible_map(); | ||
852 | #ifdef CONFIG_X86_64 | ||
853 | init_cpu_to_node(); | ||
854 | #endif | ||
855 | |||
856 | init_apic_mappings(); | ||
857 | ioapic_init_mappings(); | ||
858 | |||
859 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) | ||
860 | if (def_to_bigsmp) | ||
861 | printk(KERN_WARNING "More than 8 CPUs detected and " | ||
862 | "CONFIG_X86_PC cannot handle it.\nUse " | ||
863 | "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); | ||
864 | #endif | ||
865 | kvm_guest_init(); | ||
866 | |||
867 | e820_reserve_resources(); | ||
868 | e820_mark_nosave_regions(max_low_pfn); | ||
869 | |||
870 | #ifdef CONFIG_X86_32 | ||
871 | request_resource(&iomem_resource, &video_ram_resource); | ||
872 | #endif | ||
873 | reserve_standard_io_resources(); | ||
874 | |||
875 | e820_setup_gap(); | ||
876 | |||
877 | #ifdef CONFIG_VT | ||
878 | #if defined(CONFIG_VGA_CONSOLE) | ||
879 | if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) | ||
880 | conswitchp = &vga_con; | ||
881 | #elif defined(CONFIG_DUMMY_CONSOLE) | ||
882 | conswitchp = &dummy_con; | ||
883 | #endif | ||
884 | #endif | ||
885 | } | ||
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c deleted file mode 100644 index aee0e8200777..000000000000 --- a/arch/x86/kernel/setup64.c +++ /dev/null | |||
@@ -1,287 +0,0 @@ | |||
1 | /* | ||
2 | * X86-64 specific CPU setup. | ||
3 | * Copyright (C) 1995 Linus Torvalds | ||
4 | * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. | ||
5 | * See setup.c for older changelog. | ||
6 | */ | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/bootmem.h> | ||
12 | #include <linux/bitops.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/kgdb.h> | ||
15 | #include <asm/pda.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/processor.h> | ||
18 | #include <asm/desc.h> | ||
19 | #include <asm/atomic.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/smp.h> | ||
22 | #include <asm/i387.h> | ||
23 | #include <asm/percpu.h> | ||
24 | #include <asm/proto.h> | ||
25 | #include <asm/sections.h> | ||
26 | #include <asm/setup.h> | ||
27 | #include <asm/genapic.h> | ||
28 | |||
29 | #ifndef CONFIG_DEBUG_BOOT_PARAMS | ||
30 | struct boot_params __initdata boot_params; | ||
31 | #else | ||
32 | struct boot_params boot_params; | ||
33 | #endif | ||
34 | |||
35 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | ||
36 | |||
37 | struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; | ||
38 | EXPORT_SYMBOL(_cpu_pda); | ||
39 | struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; | ||
40 | |||
41 | struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | ||
42 | |||
43 | char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); | ||
44 | |||
45 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | ||
46 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | ||
47 | |||
48 | static int do_not_nx __cpuinitdata = 0; | ||
49 | |||
50 | /* noexec=on|off | ||
51 | Control non executable mappings for 64bit processes. | ||
52 | |||
53 | on Enable(default) | ||
54 | off Disable | ||
55 | */ | ||
56 | static int __init nonx_setup(char *str) | ||
57 | { | ||
58 | if (!str) | ||
59 | return -EINVAL; | ||
60 | if (!strncmp(str, "on", 2)) { | ||
61 | __supported_pte_mask |= _PAGE_NX; | ||
62 | do_not_nx = 0; | ||
63 | } else if (!strncmp(str, "off", 3)) { | ||
64 | do_not_nx = 1; | ||
65 | __supported_pte_mask &= ~_PAGE_NX; | ||
66 | } | ||
67 | return 0; | ||
68 | } | ||
69 | early_param("noexec", nonx_setup); | ||
70 | |||
71 | int force_personality32 = 0; | ||
72 | |||
73 | /* noexec32=on|off | ||
74 | Control non executable heap for 32bit processes. | ||
75 | To control the stack too use noexec=off | ||
76 | |||
77 | on PROT_READ does not imply PROT_EXEC for 32bit processes (default) | ||
78 | off PROT_READ implies PROT_EXEC | ||
79 | */ | ||
80 | static int __init nonx32_setup(char *str) | ||
81 | { | ||
82 | if (!strcmp(str, "on")) | ||
83 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
84 | else if (!strcmp(str, "off")) | ||
85 | force_personality32 |= READ_IMPLIES_EXEC; | ||
86 | return 1; | ||
87 | } | ||
88 | __setup("noexec32=", nonx32_setup); | ||
89 | |||
90 | void pda_init(int cpu) | ||
91 | { | ||
92 | struct x8664_pda *pda = cpu_pda(cpu); | ||
93 | |||
94 | /* Set up data that may be needed in __get_free_pages early */ | ||
95 | asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); | ||
96 | /* Memory clobbers used to order PDA accesses */ | ||
97 | mb(); | ||
98 | wrmsrl(MSR_GS_BASE, pda); | ||
99 | mb(); | ||
100 | |||
101 | pda->cpunumber = cpu; | ||
102 | pda->irqcount = -1; | ||
103 | pda->kernelstack = | ||
104 | (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; | ||
105 | pda->active_mm = &init_mm; | ||
106 | pda->mmu_state = 0; | ||
107 | |||
108 | if (cpu == 0) { | ||
109 | /* others are initialized in smpboot.c */ | ||
110 | pda->pcurrent = &init_task; | ||
111 | pda->irqstackptr = boot_cpu_stack; | ||
112 | } else { | ||
113 | pda->irqstackptr = (char *) | ||
114 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
115 | if (!pda->irqstackptr) | ||
116 | panic("cannot allocate irqstack for cpu %d", cpu); | ||
117 | } | ||
118 | |||
119 | |||
120 | pda->irqstackptr += IRQSTACKSIZE-64; | ||
121 | } | ||
122 | |||
123 | char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] | ||
124 | __attribute__((section(".bss.page_aligned"))); | ||
125 | |||
126 | extern asmlinkage void ignore_sysret(void); | ||
127 | |||
128 | /* May not be marked __init: used by software suspend */ | ||
129 | void syscall_init(void) | ||
130 | { | ||
131 | /* | ||
132 | * LSTAR and STAR live in a bit strange symbiosis. | ||
133 | * They both write to the same internal register. STAR allows setting CS/DS | ||
134 | * but only a 32bit target. LSTAR sets the 64bit rip. | ||
135 | */ | ||
136 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
137 | wrmsrl(MSR_LSTAR, system_call); | ||
138 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
139 | |||
140 | #ifdef CONFIG_IA32_EMULATION | ||
141 | syscall32_cpu_init (); | ||
142 | #endif | ||
143 | |||
144 | /* Flags to clear on syscall */ | ||
145 | wrmsrl(MSR_SYSCALL_MASK, | ||
146 | X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); | ||
147 | } | ||
148 | |||
149 | void __cpuinit check_efer(void) | ||
150 | { | ||
151 | unsigned long efer; | ||
152 | |||
153 | rdmsrl(MSR_EFER, efer); | ||
154 | if (!(efer & EFER_NX) || do_not_nx) { | ||
155 | __supported_pte_mask &= ~_PAGE_NX; | ||
156 | } | ||
157 | } | ||
158 | |||
159 | unsigned long kernel_eflags; | ||
160 | |||
161 | /* | ||
162 | * Copies of the original ist values from the tss are only accessed during | ||
163 | * debugging, no special alignment required. | ||
164 | */ | ||
165 | DEFINE_PER_CPU(struct orig_ist, orig_ist); | ||
166 | |||
167 | /* | ||
168 | * cpu_init() initializes state that is per-CPU. Some data is already | ||
169 | * initialized (naturally) in the bootstrap process, such as the GDT | ||
170 | * and IDT. We reload them nevertheless, this function acts as a | ||
171 | * 'CPU state barrier', nothing should get across. | ||
172 | * A lot of state is already set up in PDA init. | ||
173 | */ | ||
174 | void __cpuinit cpu_init (void) | ||
175 | { | ||
176 | int cpu = stack_smp_processor_id(); | ||
177 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
178 | struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); | ||
179 | unsigned long v; | ||
180 | char *estacks = NULL; | ||
181 | struct task_struct *me; | ||
182 | int i; | ||
183 | |||
184 | /* CPU 0 is initialised in head64.c */ | ||
185 | if (cpu != 0) { | ||
186 | pda_init(cpu); | ||
187 | } else | ||
188 | estacks = boot_exception_stacks; | ||
189 | |||
190 | me = current; | ||
191 | |||
192 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
193 | panic("CPU#%d already initialized!\n", cpu); | ||
194 | |||
195 | printk("Initializing CPU#%d\n", cpu); | ||
196 | |||
197 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
198 | |||
199 | /* | ||
200 | * Initialize the per-CPU GDT with the boot GDT, | ||
201 | * and set up the GDT descriptor: | ||
202 | */ | ||
203 | if (cpu) | ||
204 | memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); | ||
205 | |||
206 | cpu_gdt_descr[cpu].size = GDT_SIZE; | ||
207 | load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); | ||
208 | load_idt((const struct desc_ptr *)&idt_descr); | ||
209 | |||
210 | memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); | ||
211 | syscall_init(); | ||
212 | |||
213 | wrmsrl(MSR_FS_BASE, 0); | ||
214 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
215 | barrier(); | ||
216 | |||
217 | check_efer(); | ||
218 | |||
219 | /* | ||
220 | * set up and load the per-CPU TSS | ||
221 | */ | ||
222 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
223 | static const unsigned int order[N_EXCEPTION_STACKS] = { | ||
224 | [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, | ||
225 | [DEBUG_STACK - 1] = DEBUG_STACK_ORDER | ||
226 | }; | ||
227 | if (cpu) { | ||
228 | estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); | ||
229 | if (!estacks) | ||
230 | panic("Cannot allocate exception stack %ld %d\n", | ||
231 | v, cpu); | ||
232 | } | ||
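/* x86 stacks grow downward, so each IST slot must hold the top of its
 * stack, not the base; that is why estacks is advanced past the
 * allocation just below. */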
233 | estacks += PAGE_SIZE << order[v]; | ||
234 | orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; | ||
235 | } | ||
236 | |||
237 | t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
238 | /* | ||
239 | * <= is required because the CPU will access up to | ||
240 | * 8 bits beyond the end of the IO permission bitmap. | ||
241 | */ | ||
242 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
243 | t->io_bitmap[i] = ~0UL; | ||
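/* This deliberately also writes io_bitmap[IO_BITMAP_LONGS]: tss_struct
 * reserves one extra long past the nominal bitmap as padding for the
 * overread described above. */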
244 | |||
245 | atomic_inc(&init_mm.mm_count); | ||
246 | me->active_mm = &init_mm; | ||
247 | if (me->mm) | ||
248 | BUG(); | ||
249 | enter_lazy_tlb(&init_mm, me); | ||
250 | |||
251 | set_tss_desc(cpu, t); | ||
252 | load_TR_desc(); | ||
253 | load_LDT(&init_mm.context); | ||
254 | |||
255 | #ifdef CONFIG_KGDB | ||
256 | /* | ||
257 | * If the kgdb is connected no debug regs should be altered. This | ||
258 | * is only applicable when KGDB and a KGDB I/O module are built | ||
259 | * into the kernel and you are using early debugging with | ||
260 | * kgdbwait. KGDB will control the kernel HW breakpoint registers. | ||
261 | */ | ||
262 | if (kgdb_connected && arch_kgdb_ops.correct_hw_break) | ||
263 | arch_kgdb_ops.correct_hw_break(); | ||
264 | else { | ||
265 | #endif | ||
266 | /* | ||
267 | * Clear all 6 debug registers: | ||
268 | */ | ||
269 | |||
270 | set_debugreg(0UL, 0); | ||
271 | set_debugreg(0UL, 1); | ||
272 | set_debugreg(0UL, 2); | ||
273 | set_debugreg(0UL, 3); | ||
274 | set_debugreg(0UL, 6); | ||
275 | set_debugreg(0UL, 7); | ||
276 | #ifdef CONFIG_KGDB | ||
277 | /* If the kgdb is connected no debug regs should be altered. */ | ||
278 | } | ||
279 | #endif | ||
280 | |||
281 | fpu_init(); | ||
282 | |||
283 | raw_local_save_flags(kernel_eflags); | ||
284 | |||
285 | if (is_uv_system()) | ||
286 | uv_cpu_init(); | ||
287 | } | ||
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c deleted file mode 100644 index 2c5f8b213e86..000000000000 --- a/arch/x86/kernel/setup_32.c +++ /dev/null | |||
@@ -1,958 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1995 Linus Torvalds | ||
3 | * | ||
4 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | ||
5 | * | ||
6 | * Memory region support | ||
7 | * David Parsons <orc@pell.chi.il.us>, July-August 1999 | ||
8 | * | ||
9 | * Added E820 sanitization routine (removes overlapping memory regions); | ||
10 | * Brian Moyle <bmoyle@mvista.com>, February 2001 | ||
11 | * | ||
12 | * Moved CPU detection code to cpu/${cpu}.c | ||
13 | * Patrick Mochel <mochel@osdl.org>, March 2002 | ||
14 | * | ||
15 | * Provisions for empty E820 memory regions (reported by certain BIOSes). | ||
16 | * Alex Achenbach <xela@slit.de>, December 2002. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * This file handles the architecture-dependent parts of initialization | ||
22 | */ | ||
23 | |||
24 | #include <linux/sched.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/mmzone.h> | ||
27 | #include <linux/screen_info.h> | ||
28 | #include <linux/ioport.h> | ||
29 | #include <linux/acpi.h> | ||
30 | #include <linux/apm_bios.h> | ||
31 | #include <linux/initrd.h> | ||
32 | #include <linux/bootmem.h> | ||
33 | #include <linux/seq_file.h> | ||
34 | #include <linux/console.h> | ||
35 | #include <linux/mca.h> | ||
36 | #include <linux/root_dev.h> | ||
37 | #include <linux/highmem.h> | ||
38 | #include <linux/module.h> | ||
39 | #include <linux/efi.h> | ||
40 | #include <linux/init.h> | ||
41 | #include <linux/edd.h> | ||
42 | #include <linux/iscsi_ibft.h> | ||
43 | #include <linux/nodemask.h> | ||
44 | #include <linux/kexec.h> | ||
45 | #include <linux/crash_dump.h> | ||
46 | #include <linux/dmi.h> | ||
47 | #include <linux/pfn.h> | ||
48 | #include <linux/pci.h> | ||
49 | #include <linux/init_ohci1394_dma.h> | ||
50 | #include <linux/kvm_para.h> | ||
51 | |||
52 | #include <video/edid.h> | ||
53 | |||
54 | #include <asm/mtrr.h> | ||
55 | #include <asm/apic.h> | ||
56 | #include <asm/e820.h> | ||
57 | #include <asm/mpspec.h> | ||
58 | #include <asm/mmzone.h> | ||
59 | #include <asm/setup.h> | ||
60 | #include <asm/arch_hooks.h> | ||
61 | #include <asm/sections.h> | ||
62 | #include <asm/io_apic.h> | ||
63 | #include <asm/ist.h> | ||
64 | #include <asm/io.h> | ||
65 | #include <asm/vmi.h> | ||
66 | #include <setup_arch.h> | ||
67 | #include <asm/bios_ebda.h> | ||
68 | #include <asm/cacheflush.h> | ||
69 | #include <asm/processor.h> | ||
70 | |||
71 | /* This value is set up by the early boot code to point to the value | ||
72 | immediately after the boot time page tables. It contains a *physical* | ||
73 | address, and must not be in the .bss segment! */ | ||
74 | unsigned long init_pg_tables_end __initdata = ~0UL; | ||
75 | |||
76 | /* | ||
77 | * Machine setup.. | ||
78 | */ | ||
79 | static struct resource data_resource = { | ||
80 | .name = "Kernel data", | ||
81 | .start = 0, | ||
82 | .end = 0, | ||
83 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
84 | }; | ||
85 | |||
86 | static struct resource code_resource = { | ||
87 | .name = "Kernel code", | ||
88 | .start = 0, | ||
89 | .end = 0, | ||
90 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
91 | }; | ||
92 | |||
93 | static struct resource bss_resource = { | ||
94 | .name = "Kernel bss", | ||
95 | .start = 0, | ||
96 | .end = 0, | ||
97 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
98 | }; | ||
99 | |||
100 | static struct resource video_ram_resource = { | ||
101 | .name = "Video RAM area", | ||
102 | .start = 0xa0000, | ||
103 | .end = 0xbffff, | ||
104 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
105 | }; | ||
106 | |||
107 | static struct resource standard_io_resources[] = { { | ||
108 | .name = "dma1", | ||
109 | .start = 0x0000, | ||
110 | .end = 0x001f, | ||
111 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
112 | }, { | ||
113 | .name = "pic1", | ||
114 | .start = 0x0020, | ||
115 | .end = 0x0021, | ||
116 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
117 | }, { | ||
118 | .name = "timer0", | ||
119 | .start = 0x0040, | ||
120 | .end = 0x0043, | ||
121 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
122 | }, { | ||
123 | .name = "timer1", | ||
124 | .start = 0x0050, | ||
125 | .end = 0x0053, | ||
126 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
127 | }, { | ||
128 | .name = "keyboard", | ||
129 | .start = 0x0060, | ||
130 | .end = 0x0060, | ||
131 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
132 | }, { | ||
133 | .name = "keyboard", | ||
134 | .start = 0x0064, | ||
135 | .end = 0x0064, | ||
136 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
137 | }, { | ||
138 | .name = "dma page reg", | ||
139 | .start = 0x0080, | ||
140 | .end = 0x008f, | ||
141 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
142 | }, { | ||
143 | .name = "pic2", | ||
144 | .start = 0x00a0, | ||
145 | .end = 0x00a1, | ||
146 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
147 | }, { | ||
148 | .name = "dma2", | ||
149 | .start = 0x00c0, | ||
150 | .end = 0x00df, | ||
151 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
152 | }, { | ||
153 | .name = "fpu", | ||
154 | .start = 0x00f0, | ||
155 | .end = 0x00ff, | ||
156 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
157 | } }; | ||
158 | |||
159 | /* cpu data as detected by the assembly code in head.S */ | ||
160 | struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | ||
161 | /* common cpu data for all cpus */ | ||
162 | struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | ||
163 | EXPORT_SYMBOL(boot_cpu_data); | ||
164 | |||
165 | unsigned int def_to_bigsmp; | ||
166 | |||
167 | #ifndef CONFIG_X86_PAE | ||
168 | unsigned long mmu_cr4_features; | ||
169 | #else | ||
170 | unsigned long mmu_cr4_features = X86_CR4_PAE; | ||
171 | #endif | ||
172 | |||
173 | /* for MCA, but anyone else can use it if they want */ | ||
174 | unsigned int machine_id; | ||
175 | unsigned int machine_submodel_id; | ||
176 | unsigned int BIOS_revision; | ||
177 | |||
178 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | ||
179 | int bootloader_type; | ||
180 | |||
181 | /* user-defined highmem size */ | ||
182 | static unsigned int highmem_pages = -1; | ||
183 | |||
184 | /* | ||
185 | * Setup options | ||
186 | */ | ||
187 | struct screen_info screen_info; | ||
188 | EXPORT_SYMBOL(screen_info); | ||
189 | struct apm_info apm_info; | ||
190 | EXPORT_SYMBOL(apm_info); | ||
191 | struct edid_info edid_info; | ||
192 | EXPORT_SYMBOL_GPL(edid_info); | ||
193 | struct ist_info ist_info; | ||
194 | #if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ | ||
195 | defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) | ||
196 | EXPORT_SYMBOL(ist_info); | ||
197 | #endif | ||
198 | |||
199 | extern void early_cpu_init(void); | ||
200 | extern int root_mountflags; | ||
201 | |||
202 | unsigned long saved_video_mode; | ||
203 | |||
204 | #define RAMDISK_IMAGE_START_MASK 0x07FF | ||
205 | #define RAMDISK_PROMPT_FLAG 0x8000 | ||
206 | #define RAMDISK_LOAD_FLAG 0x4000 | ||
207 | |||
208 | static char __initdata command_line[COMMAND_LINE_SIZE]; | ||
209 | |||
210 | #ifndef CONFIG_DEBUG_BOOT_PARAMS | ||
211 | struct boot_params __initdata boot_params; | ||
212 | #else | ||
213 | struct boot_params boot_params; | ||
214 | #endif | ||
215 | |||
216 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | ||
217 | struct edd edd; | ||
218 | #ifdef CONFIG_EDD_MODULE | ||
219 | EXPORT_SYMBOL(edd); | ||
220 | #endif | ||
221 | /** | ||
222 | * copy_edd() - Copy the BIOS EDD information | ||
223 | * from boot_params into a safe place. | ||
224 | * | ||
225 | */ | ||
226 | static inline void copy_edd(void) | ||
227 | { | ||
228 | memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, | ||
229 | sizeof(edd.mbr_signature)); | ||
230 | memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); | ||
231 | edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; | ||
232 | edd.edd_info_nr = boot_params.eddbuf_entries; | ||
233 | } | ||
234 | #else | ||
235 | static inline void copy_edd(void) | ||
236 | { | ||
237 | } | ||
238 | #endif | ||
239 | |||
240 | int __initdata user_defined_memmap; | ||
241 | |||
242 | /* | ||
243 | * "mem=nopentium" disables the 4MB page tables. | ||
244 | * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM | ||
245 | * to <mem>, overriding the BIOS size. | ||
246 | * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from | ||
247 | * <start> to <start>+<mem>, overriding the BIOS size. | ||
248 | * | ||
249 | * HPA tells me bootloaders need to parse mem=, so no new | ||
250 | * options should reuse the mem= prefix [also see Documentation/i386/boot.txt] | ||
251 | */ | ||
252 | static int __init parse_mem(char *arg) | ||
253 | { | ||
254 | if (!arg) | ||
255 | return -EINVAL; | ||
256 | |||
257 | if (strcmp(arg, "nopentium") == 0) { | ||
258 | setup_clear_cpu_cap(X86_FEATURE_PSE); | ||
259 | } else { | ||
260 | /* If the user specifies a memory size, we | ||
261 | * limit the BIOS-provided memory map to | ||
262 | * that size. memmap=exactmap can be used to | ||
263 | * specify the exact map; mem=number trims | ||
264 | * the existing memory map. | ||
265 | */ | ||
266 | unsigned long long mem_size; | ||
267 | |||
268 | mem_size = memparse(arg, &arg); | ||
269 | limit_regions(mem_size); | ||
270 | user_defined_memmap = 1; | ||
271 | } | ||
272 | return 0; | ||
273 | } | ||
274 | early_param("mem", parse_mem); | ||
275 | |||
276 | #ifdef CONFIG_PROC_VMCORE | ||
277 | /* elfcorehdr= specifies the location of the elf core header | ||
278 | * stored by the crashed kernel. | ||
279 | */ | ||
280 | static int __init parse_elfcorehdr(char *arg) | ||
281 | { | ||
282 | if (!arg) | ||
283 | return -EINVAL; | ||
284 | |||
285 | elfcorehdr_addr = memparse(arg, &arg); | ||
286 | return 0; | ||
287 | } | ||
288 | early_param("elfcorehdr", parse_elfcorehdr); | ||
289 | #endif /* CONFIG_PROC_VMCORE */ | ||
290 | |||
291 | /* | ||
292 | * highmem=size forces highmem to be exactly 'size' bytes. | ||
293 | * This works even on boxes that have no highmem otherwise. | ||
294 | * This also works to reduce highmem size on bigger boxes. | ||
295 | */ | ||
296 | static int __init parse_highmem(char *arg) | ||
297 | { | ||
298 | if (!arg) | ||
299 | return -EINVAL; | ||
300 | |||
301 | highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; | ||
302 | return 0; | ||
303 | } | ||
304 | early_param("highmem", parse_highmem); | ||
305 | |||
306 | /* | ||
307 | * vmalloc=size forces the vmalloc area to be exactly 'size' | ||
308 | * bytes. This can be used to increase (or decrease) the | ||
309 | * vmalloc area - the default is 128MB. | ||
310 | */ | ||
311 | static int __init parse_vmalloc(char *arg) | ||
312 | { | ||
313 | if (!arg) | ||
314 | return -EINVAL; | ||
315 | |||
316 | __VMALLOC_RESERVE = memparse(arg, &arg); | ||
317 | return 0; | ||
318 | } | ||
319 | early_param("vmalloc", parse_vmalloc); | ||
320 | |||
321 | /* | ||
322 | * reservetop=size reserves a hole at the top of the kernel address space which | ||
323 | * a hypervisor can load into later. Needed for dynamically loaded hypervisors, | ||
324 | * so relocating the fixmap can be done before paging initialization. | ||
325 | */ | ||
326 | static int __init parse_reservetop(char *arg) | ||
327 | { | ||
328 | unsigned long address; | ||
329 | |||
330 | if (!arg) | ||
331 | return -EINVAL; | ||
332 | |||
333 | address = memparse(arg, &arg); | ||
334 | reserve_top_address(address); | ||
335 | return 0; | ||
336 | } | ||
337 | early_param("reservetop", parse_reservetop); | ||
338 | |||
339 | /* | ||
340 | * Determine low and high memory ranges: | ||
341 | */ | ||
342 | unsigned long __init find_max_low_pfn(void) | ||
343 | { | ||
344 | unsigned long max_low_pfn; | ||
345 | |||
346 | max_low_pfn = max_pfn; | ||
347 | if (max_low_pfn > MAXMEM_PFN) { | ||
348 | if (highmem_pages == -1) | ||
349 | highmem_pages = max_pfn - MAXMEM_PFN; | ||
350 | if (highmem_pages + MAXMEM_PFN < max_pfn) | ||
351 | max_pfn = MAXMEM_PFN + highmem_pages; | ||
352 | if (highmem_pages + MAXMEM_PFN > max_pfn) { | ||
353 | printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); | ||
354 | highmem_pages = 0; | ||
355 | } | ||
356 | max_low_pfn = MAXMEM_PFN; | ||
357 | #ifndef CONFIG_HIGHMEM | ||
358 | /* Maximum memory usable is what is directly addressable */ | ||
359 | printk(KERN_WARNING "Warning only %ldMB will be used.\n", | ||
360 | MAXMEM>>20); | ||
361 | if (max_pfn > MAX_NONPAE_PFN) | ||
362 | printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); | ||
363 | else | ||
364 | printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); | ||
365 | max_pfn = MAXMEM_PFN; | ||
366 | #else /* !CONFIG_HIGHMEM */ | ||
367 | #ifndef CONFIG_HIGHMEM64G | ||
368 | if (max_pfn > MAX_NONPAE_PFN) { | ||
369 | max_pfn = MAX_NONPAE_PFN; | ||
370 | printk(KERN_WARNING "Warning only 4GB will be used.\n"); | ||
371 | printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); | ||
372 | } | ||
373 | #endif /* !CONFIG_HIGHMEM64G */ | ||
374 | #endif /* !CONFIG_HIGHMEM */ | ||
375 | } else { | ||
376 | if (highmem_pages == -1) | ||
377 | highmem_pages = 0; | ||
378 | #ifdef CONFIG_HIGHMEM | ||
379 | if (highmem_pages >= max_pfn) { | ||
380 | printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); | ||
381 | highmem_pages = 0; | ||
382 | } | ||
383 | if (highmem_pages) { | ||
384 | if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { | ||
385 | printk(KERN_ERR "highmem size %uMB results in less than 64MB of lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); | ||
386 | highmem_pages = 0; | ||
387 | } | ||
388 | max_low_pfn -= highmem_pages; | ||
389 | } | ||
390 | #else | ||
391 | if (highmem_pages) | ||
392 | printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); | ||
393 | #endif | ||
394 | } | ||
395 | return max_low_pfn; | ||
396 | } | ||
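To make the arithmetic concrete: with the usual 3G/1G split, MAXMEM is about 896MB, so MAXMEM_PFN is 229376 4K pages. On a hypothetical 2GB box (max_pfn = 524288), highmem_pages therefore defaults to 524288 - 229376 = 294912 pages, i.e. 1152MB of highmem on top of 896MB of lowmem.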
397 | |||
398 | #define BIOS_LOWMEM_KILOBYTES 0x413 | ||
399 | |||
400 | /* | ||
401 | * The BIOS places the EBDA/XBDA at the top of conventional | ||
402 | * memory, and usually decreases the reported amount of | ||
403 | * conventional memory (int 0x12) too. This also contains a | ||
404 | * workaround for Dell systems that neglect to reserve EBDA. | ||
405 | * The same workaround also avoids a problem with the AMD768MPX | ||
406 | * chipset: reserve a page before VGA to prevent PCI prefetch | ||
407 | * into it (errata #56). Usually the page is reserved anyway, | ||
408 | * unless you have no PS/2 mouse plugged in. | ||
409 | */ | ||
410 | static void __init reserve_ebda_region(void) | ||
411 | { | ||
412 | unsigned int lowmem, ebda_addr; | ||
413 | |||
414 | /* To determine the position of the EBDA and the */ | ||
415 | /* end of conventional memory, we need to look at */ | ||
416 | /* the BIOS data area. In a paravirtual environment */ | ||
417 | /* that area is absent. We'll just have to assume */ | ||
418 | /* that the paravirt case can handle memory setup */ | ||
419 | /* correctly, without our help. */ | ||
420 | if (paravirt_enabled()) | ||
421 | return; | ||
422 | |||
423 | /* end of low (conventional) memory */ | ||
424 | lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); | ||
425 | lowmem <<= 10; | ||
426 | |||
427 | /* start of EBDA area */ | ||
428 | ebda_addr = get_bios_ebda(); | ||
429 | |||
430 | /* Fixup: bios puts an EBDA in the top 64K segment */ | ||
431 | /* of conventional memory, but does not adjust lowmem. */ | ||
432 | if ((lowmem - ebda_addr) <= 0x10000) | ||
433 | lowmem = ebda_addr; | ||
434 | |||
435 | /* Fixup: bios does not report an EBDA at all. */ | ||
436 | /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ | ||
437 | if ((ebda_addr == 0) && (lowmem >= 0x9f000)) | ||
438 | lowmem = 0x9f000; | ||
439 | |||
440 | /* Paranoia: should never happen, but... */ | ||
441 | if ((lowmem == 0) || (lowmem >= 0x100000)) | ||
442 | lowmem = 0x9f000; | ||
443 | |||
444 | /* reserve all memory between lowmem and the 1MB mark */ | ||
445 | reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT); | ||
446 | } | ||
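A toy illustration of the lowmem computation above; the KiB value is an assumed example rather than something read from a real BIOS data area:

#include <stdio.h>

int main(void)
{
	unsigned int lowmem_kb = 639;		/* assumed BIOS word at 0x413 */
	unsigned int lowmem = lowmem_kb << 10;	/* KiB -> bytes: 0x9fc00 */

	/* reserve_ebda_region() would reserve from here up to the 1MB mark */
	printf("lowmem ends at %#x, reserving %u bytes below 1MB\n",
	       lowmem, 0x100000 - lowmem);
	return 0;
}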
447 | |||
448 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
449 | static void __init setup_bootmem_allocator(void); | ||
450 | static unsigned long __init setup_memory(void) | ||
451 | { | ||
452 | /* | ||
453 | * partially used pages are not usable - thus | ||
454 | * we are rounding upwards: | ||
455 | */ | ||
456 | min_low_pfn = PFN_UP(init_pg_tables_end); | ||
457 | |||
458 | max_low_pfn = find_max_low_pfn(); | ||
459 | |||
460 | #ifdef CONFIG_HIGHMEM | ||
461 | highstart_pfn = highend_pfn = max_pfn; | ||
462 | if (max_pfn > max_low_pfn) { | ||
463 | highstart_pfn = max_low_pfn; | ||
464 | } | ||
465 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | ||
466 | pages_to_mb(highend_pfn - highstart_pfn)); | ||
467 | num_physpages = highend_pfn; | ||
468 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | ||
469 | #else | ||
470 | num_physpages = max_low_pfn; | ||
471 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | ||
472 | #endif | ||
473 | #ifdef CONFIG_FLATMEM | ||
474 | max_mapnr = num_physpages; | ||
475 | #endif | ||
476 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | ||
477 | pages_to_mb(max_low_pfn)); | ||
478 | |||
479 | setup_bootmem_allocator(); | ||
480 | |||
481 | return max_low_pfn; | ||
482 | } | ||
483 | |||
484 | static void __init zone_sizes_init(void) | ||
485 | { | ||
486 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
487 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
488 | max_zone_pfns[ZONE_DMA] = | ||
489 | virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
490 | max_zone_pfns[ZONE_NORMAL] = max_low_pfn; | ||
491 | #ifdef CONFIG_HIGHMEM | ||
492 | max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; | ||
493 | add_active_range(0, 0, highend_pfn); | ||
494 | #else | ||
495 | add_active_range(0, 0, max_low_pfn); | ||
496 | #endif | ||
497 | |||
498 | free_area_init_nodes(max_zone_pfns); | ||
499 | } | ||
500 | #else | ||
501 | extern unsigned long __init setup_memory(void); | ||
502 | extern void zone_sizes_init(void); | ||
503 | #endif /* !CONFIG_NEED_MULTIPLE_NODES */ | ||
504 | |||
505 | static inline unsigned long long get_total_mem(void) | ||
506 | { | ||
507 | unsigned long long total; | ||
508 | |||
509 | total = max_low_pfn - min_low_pfn; | ||
510 | #ifdef CONFIG_HIGHMEM | ||
511 | total += highend_pfn - highstart_pfn; | ||
512 | #endif | ||
513 | |||
514 | return total << PAGE_SHIFT; | ||
515 | } | ||
516 | |||
517 | #ifdef CONFIG_KEXEC | ||
518 | static void __init reserve_crashkernel(void) | ||
519 | { | ||
520 | unsigned long long total_mem; | ||
521 | unsigned long long crash_size, crash_base; | ||
522 | int ret; | ||
523 | |||
524 | total_mem = get_total_mem(); | ||
525 | |||
526 | ret = parse_crashkernel(boot_command_line, total_mem, | ||
527 | &crash_size, &crash_base); | ||
528 | if (ret == 0 && crash_size > 0) { | ||
529 | if (crash_base > 0) { | ||
530 | printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " | ||
531 | "for crashkernel (System RAM: %ldMB)\n", | ||
532 | (unsigned long)(crash_size >> 20), | ||
533 | (unsigned long)(crash_base >> 20), | ||
534 | (unsigned long)(total_mem >> 20)); | ||
535 | crashk_res.start = crash_base; | ||
536 | crashk_res.end = crash_base + crash_size - 1; | ||
537 | reserve_bootmem(crash_base, crash_size, | ||
538 | BOOTMEM_DEFAULT); | ||
539 | } else | ||
540 | printk(KERN_INFO "crashkernel reservation failed - " | ||
541 | "you have to specify a base address\n"); | ||
542 | } | ||
543 | } | ||
544 | #else | ||
545 | static inline void __init reserve_crashkernel(void) | ||
546 | {} | ||
547 | #endif | ||
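For example, booting with crashkernel=64M@16M makes parse_crashkernel() return crash_size = 64 << 20 and crash_base = 16 << 20, so crashk_res ends up spanning 0x1000000-0x4ffffff.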
548 | |||
549 | #ifdef CONFIG_BLK_DEV_INITRD | ||
550 | |||
551 | static bool do_relocate_initrd = false; | ||
552 | |||
553 | static void __init reserve_initrd(void) | ||
554 | { | ||
555 | unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; | ||
556 | unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; | ||
557 | unsigned long ramdisk_end = ramdisk_image + ramdisk_size; | ||
558 | unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; | ||
559 | unsigned long ramdisk_here; | ||
560 | |||
561 | initrd_start = 0; | ||
562 | |||
563 | if (!boot_params.hdr.type_of_loader || | ||
564 | !ramdisk_image || !ramdisk_size) | ||
565 | return; /* No initrd provided by bootloader */ | ||
566 | |||
567 | if (ramdisk_end < ramdisk_image) { | ||
568 | printk(KERN_ERR "initrd wraps around end of memory, " | ||
569 | "disabling initrd\n"); | ||
570 | return; | ||
571 | } | ||
572 | if (ramdisk_size >= end_of_lowmem/2) { | ||
573 | printk(KERN_ERR "initrd too large to handle, " | ||
574 | "disabling initrd\n"); | ||
575 | return; | ||
576 | } | ||
577 | if (ramdisk_end <= end_of_lowmem) { | ||
578 | /* All in lowmem, easy case */ | ||
579 | reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); | ||
580 | initrd_start = ramdisk_image + PAGE_OFFSET; | ||
581 | initrd_end = initrd_start+ramdisk_size; | ||
582 | return; | ||
583 | } | ||
584 | |||
585 | /* We need to move the initrd down into lowmem */ | ||
586 | ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; | ||
587 | |||
588 | /* Note: this includes all the lowmem currently occupied by | ||
589 | the initrd; we rely on that fact to keep the data intact. */ | ||
590 | reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT); | ||
591 | initrd_start = ramdisk_here + PAGE_OFFSET; | ||
592 | initrd_end = initrd_start + ramdisk_size; | ||
593 | |||
594 | do_relocate_initrd = true; | ||
595 | } | ||
596 | |||
597 | #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) | ||
598 | |||
599 | static void __init relocate_initrd(void) | ||
600 | { | ||
601 | unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; | ||
602 | unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; | ||
603 | unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; | ||
604 | unsigned long ramdisk_here; | ||
605 | unsigned long slop, clen, mapaddr; | ||
606 | char *p, *q; | ||
607 | |||
608 | if (!do_relocate_initrd) | ||
609 | return; | ||
610 | |||
611 | ramdisk_here = initrd_start - PAGE_OFFSET; | ||
612 | |||
613 | q = (char *)initrd_start; | ||
614 | |||
615 | /* Copy any lowmem portion of the initrd */ | ||
616 | if (ramdisk_image < end_of_lowmem) { | ||
617 | clen = end_of_lowmem - ramdisk_image; | ||
618 | p = (char *)__va(ramdisk_image); | ||
619 | memcpy(q, p, clen); | ||
620 | q += clen; | ||
621 | ramdisk_image += clen; | ||
622 | ramdisk_size -= clen; | ||
623 | } | ||
624 | |||
625 | /* Copy the highmem portion of the initrd */ | ||
626 | while (ramdisk_size) { | ||
627 | slop = ramdisk_image & ~PAGE_MASK; | ||
628 | clen = ramdisk_size; | ||
629 | if (clen > MAX_MAP_CHUNK-slop) | ||
630 | clen = MAX_MAP_CHUNK-slop; | ||
631 | mapaddr = ramdisk_image & PAGE_MASK; | ||
632 | p = early_ioremap(mapaddr, clen+slop); | ||
633 | memcpy(q, p+slop, clen); | ||
634 | early_iounmap(p, clen+slop); | ||
635 | q += clen; | ||
636 | ramdisk_image += clen; | ||
637 | ramdisk_size -= clen; | ||
638 | } | ||
639 | } | ||
640 | |||
641 | #endif /* CONFIG_BLK_DEV_INITRD */ | ||
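The highmem copy loop above is a windowed-copy pattern: only MAX_MAP_CHUNK bytes can be mapped at once, so the source is walked chunk by chunk. A self-contained sketch of the same pattern, with plain buffers standing in for early_ioremap()/early_iounmap() and the sub-page 'slop' alignment omitted:

#include <stdio.h>
#include <string.h>

#define MAP_CHUNK 16	/* stand-in for MAX_MAP_CHUNK */

static char phys[64] = "pretend this is the highmem part of an initrd";

/* stand-ins for early_ioremap()/early_iounmap() */
static char *map_window(unsigned long off) { return phys + off; }
static void unmap_window(char *p) { (void)p; }

int main(void)
{
	char dst[sizeof(phys)];
	unsigned long image = 0, size = sizeof(phys);
	char *q = dst;

	while (size) {
		unsigned long clen = size > MAP_CHUNK ? MAP_CHUNK : size;
		char *p = map_window(image);

		memcpy(q, p, clen);	/* copy one mapped window */
		unmap_window(p);
		q += clen;
		image += clen;
		size -= clen;
	}
	printf("%s\n", dst);
	return 0;
}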
642 | |||
643 | void __init setup_bootmem_allocator(void) | ||
644 | { | ||
645 | unsigned long bootmap_size; | ||
646 | /* | ||
647 | * Initialize the boot-time allocator (with low memory only): | ||
648 | */ | ||
649 | bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); | ||
650 | |||
651 | register_bootmem_low_pages(max_low_pfn); | ||
652 | |||
653 | /* | ||
654 | * Reserve the bootmem bitmap itself as well. We do this in two | ||
655 | * steps (first step was init_bootmem()) because this catches | ||
656 | * the (very unlikely) case of us accidentally initializing the | ||
657 | * bootmem allocator with an invalid RAM area. | ||
658 | */ | ||
659 | reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + | ||
660 | bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text), | ||
661 | BOOTMEM_DEFAULT); | ||
662 | |||
663 | /* | ||
664 | * reserve physical page 0 - it's a special BIOS page on many boxes, | ||
665 | * enabling clean reboots, SMP operation, laptop functions. | ||
666 | */ | ||
667 | reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT); | ||
668 | |||
669 | /* reserve EBDA region */ | ||
670 | reserve_ebda_region(); | ||
671 | |||
672 | #ifdef CONFIG_SMP | ||
673 | /* | ||
674 | * But first pinch a few for the stack/trampoline stuff | ||
675 | * FIXME: Don't need the extra page at 4K, but need to fix | ||
676 | * trampoline before removing it. (see the GDT stuff) | ||
677 | */ | ||
678 | reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT); | ||
679 | #endif | ||
680 | #ifdef CONFIG_ACPI_SLEEP | ||
681 | /* | ||
682 | * Reserve low memory region for sleep support. | ||
683 | */ | ||
684 | acpi_reserve_bootmem(); | ||
685 | #endif | ||
686 | #ifdef CONFIG_X86_FIND_SMP_CONFIG | ||
687 | /* | ||
688 | * Find and reserve possible boot-time SMP configuration: | ||
689 | */ | ||
690 | find_smp_config(); | ||
691 | #endif | ||
692 | #ifdef CONFIG_BLK_DEV_INITRD | ||
693 | reserve_initrd(); | ||
694 | #endif | ||
695 | numa_kva_reserve(); | ||
696 | reserve_crashkernel(); | ||
697 | |||
698 | reserve_ibft_region(); | ||
699 | } | ||
700 | |||
701 | /* | ||
702 | * The node 0 pgdat is initialized before all of these because | ||
703 | * it's needed for bootmem. node>0 pgdats have their virtual | ||
704 | * space allocated before the pagetables are in place to access | ||
705 | * them, so they can't be cleared then. | ||
706 | * | ||
707 | * This should all compile down to nothing when NUMA is off. | ||
708 | */ | ||
709 | static void __init remapped_pgdat_init(void) | ||
710 | { | ||
711 | int nid; | ||
712 | |||
713 | for_each_online_node(nid) { | ||
714 | if (nid != 0) | ||
715 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | ||
716 | } | ||
717 | } | ||
718 | |||
719 | #ifdef CONFIG_MCA | ||
720 | static void set_mca_bus(int x) | ||
721 | { | ||
722 | MCA_bus = x; | ||
723 | } | ||
724 | #else | ||
725 | static void set_mca_bus(int x) { } | ||
726 | #endif | ||
727 | |||
728 | /* Overridden in paravirt.c if CONFIG_PARAVIRT */ | ||
729 | char * __init __attribute__((weak)) memory_setup(void) | ||
730 | { | ||
731 | return machine_specific_memory_setup(); | ||
732 | } | ||
733 | |||
734 | #ifdef CONFIG_NUMA | ||
735 | /* | ||
736 | * One day, when i386 and x86_64 are fully integrated, | ||
737 | * this will not live here. | ||
738 | */ | ||
739 | void *x86_cpu_to_node_map_early_ptr; | ||
740 | int x86_cpu_to_node_map_init[NR_CPUS] = { | ||
741 | [0 ... NR_CPUS-1] = NUMA_NO_NODE | ||
742 | }; | ||
743 | DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; | ||
744 | #endif | ||
745 | |||
746 | /* | ||
747 | * Determine if we were loaded by an EFI loader. If so, then we have also been | ||
748 | * passed the efi memmap, systab, etc., so we should use these data structures | ||
749 | * for initialization. Note, the efi init code path is determined by the | ||
750 | * global efi_enabled. This allows the same kernel image to be used on existing | ||
751 | * systems (with a traditional BIOS) as well as on EFI systems. | ||
752 | */ | ||
753 | void __init setup_arch(char **cmdline_p) | ||
754 | { | ||
755 | unsigned long max_low_pfn; | ||
756 | |||
757 | memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); | ||
758 | pre_setup_arch_hook(); | ||
759 | early_cpu_init(); | ||
760 | early_ioremap_init(); | ||
761 | |||
762 | #ifdef CONFIG_EFI | ||
763 | if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, | ||
764 | "EL32", 4)) | ||
765 | efi_enabled = 1; | ||
766 | #endif | ||
767 | |||
768 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); | ||
769 | screen_info = boot_params.screen_info; | ||
770 | edid_info = boot_params.edid_info; | ||
771 | apm_info.bios = boot_params.apm_bios_info; | ||
772 | ist_info = boot_params.ist_info; | ||
773 | saved_video_mode = boot_params.hdr.vid_mode; | ||
774 | if (boot_params.sys_desc_table.length != 0) { | ||
775 | set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); | ||
776 | machine_id = boot_params.sys_desc_table.table[0]; | ||
777 | machine_submodel_id = boot_params.sys_desc_table.table[1]; | ||
778 | BIOS_revision = boot_params.sys_desc_table.table[2]; | ||
779 | } | ||
780 | bootloader_type = boot_params.hdr.type_of_loader; | ||
781 | |||
782 | #ifdef CONFIG_BLK_DEV_RAM | ||
783 | rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; | ||
784 | rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); | ||
785 | rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); | ||
786 | #endif | ||
787 | ARCH_SETUP | ||
788 | |||
789 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
790 | print_memory_map(memory_setup()); | ||
791 | |||
792 | copy_edd(); | ||
793 | |||
794 | if (!boot_params.hdr.root_flags) | ||
795 | root_mountflags &= ~MS_RDONLY; | ||
796 | init_mm.start_code = (unsigned long) _text; | ||
797 | init_mm.end_code = (unsigned long) _etext; | ||
798 | init_mm.end_data = (unsigned long) _edata; | ||
799 | init_mm.brk = init_pg_tables_end + PAGE_OFFSET; | ||
800 | |||
801 | code_resource.start = virt_to_phys(_text); | ||
802 | code_resource.end = virt_to_phys(_etext)-1; | ||
803 | data_resource.start = virt_to_phys(_etext); | ||
804 | data_resource.end = virt_to_phys(_edata)-1; | ||
805 | bss_resource.start = virt_to_phys(&__bss_start); | ||
806 | bss_resource.end = virt_to_phys(&__bss_stop)-1; | ||
807 | |||
808 | parse_early_param(); | ||
809 | |||
810 | if (user_defined_memmap) { | ||
811 | printk(KERN_INFO "user-defined physical RAM map:\n"); | ||
812 | print_memory_map("user"); | ||
813 | } | ||
814 | |||
815 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | ||
816 | *cmdline_p = command_line; | ||
817 | |||
818 | if (efi_enabled) | ||
819 | efi_init(); | ||
820 | |||
821 | /* update e820 for memory not covered by WB MTRRs */ | ||
822 | propagate_e820_map(); | ||
823 | mtrr_bp_init(); | ||
824 | if (mtrr_trim_uncached_memory(max_pfn)) | ||
825 | propagate_e820_map(); | ||
826 | |||
827 | max_low_pfn = setup_memory(); | ||
828 | |||
829 | #ifdef CONFIG_KVM_CLOCK | ||
830 | kvmclock_init(); | ||
831 | #endif | ||
832 | |||
833 | #ifdef CONFIG_VMI | ||
834 | /* | ||
835 | * Must be after max_low_pfn is determined, and before kernel | ||
836 | * pagetables are set up. | ||
837 | */ | ||
838 | vmi_init(); | ||
839 | #endif | ||
840 | kvm_guest_init(); | ||
841 | |||
842 | /* | ||
843 | * NOTE: before this point _nobody_ is allowed to allocate | ||
844 | * any memory using the bootmem allocator. Although the | ||
845 | * allocator is now initialised, only the first 8MB of the kernel | ||
846 | * virtual address space has been mapped. All allocations before | ||
847 | * paging_init() has completed must use the alloc_bootmem_low_pages() | ||
848 | * variant (which allocates DMA-able memory), and care must be taken | ||
849 | * not to exceed the 8MB limit. | ||
850 | */ | ||
851 | |||
852 | #ifdef CONFIG_SMP | ||
853 | smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ | ||
854 | #endif | ||
855 | paging_init(); | ||
856 | |||
857 | /* | ||
858 | * NOTE: On x86-32, fixmaps are ready for use only from this point on. | ||
859 | */ | ||
860 | |||
861 | #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | ||
862 | if (init_ohci1394_dma_early) | ||
863 | init_ohci1394_dma_on_all_controllers(); | ||
864 | #endif | ||
865 | |||
866 | remapped_pgdat_init(); | ||
867 | sparse_init(); | ||
868 | zone_sizes_init(); | ||
869 | |||
870 | /* | ||
871 | * NOTE: at this point the bootmem allocator is fully available. | ||
872 | */ | ||
873 | |||
874 | #ifdef CONFIG_BLK_DEV_INITRD | ||
875 | relocate_initrd(); | ||
876 | #endif | ||
877 | |||
878 | paravirt_post_allocator_init(); | ||
879 | |||
880 | dmi_scan_machine(); | ||
881 | |||
882 | io_delay_init(); | ||
883 | |||
884 | #ifdef CONFIG_X86_SMP | ||
885 | /* | ||
886 | * Set up to use the early static init tables during kernel startup. | ||
887 | * X86_SMP will exclude sub-arches that don't deal well with it. | ||
888 | */ | ||
889 | x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; | ||
890 | x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; | ||
891 | #ifdef CONFIG_NUMA | ||
892 | x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; | ||
893 | #endif | ||
894 | #endif | ||
895 | |||
896 | #ifdef CONFIG_X86_GENERICARCH | ||
897 | generic_apic_probe(); | ||
898 | #endif | ||
899 | |||
900 | #ifdef CONFIG_ACPI | ||
901 | /* | ||
902 | * Parse the ACPI tables for possible boot-time SMP configuration. | ||
903 | */ | ||
904 | acpi_boot_table_init(); | ||
905 | #endif | ||
906 | |||
907 | early_quirks(); | ||
908 | |||
909 | #ifdef CONFIG_ACPI | ||
910 | acpi_boot_init(); | ||
911 | |||
912 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) | ||
913 | if (def_to_bigsmp) | ||
914 | printk(KERN_WARNING "More than 8 CPUs detected and " | ||
915 | "CONFIG_X86_PC cannot handle it.\nUse " | ||
916 | "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); | ||
917 | #endif | ||
918 | #endif | ||
919 | #ifdef CONFIG_X86_LOCAL_APIC | ||
920 | if (smp_found_config) | ||
921 | get_smp_config(); | ||
922 | #endif | ||
923 | |||
924 | e820_register_memory(); | ||
925 | e820_mark_nosave_regions(); | ||
926 | |||
927 | #ifdef CONFIG_VT | ||
928 | #if defined(CONFIG_VGA_CONSOLE) | ||
929 | if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) | ||
930 | conswitchp = &vga_con; | ||
931 | #elif defined(CONFIG_DUMMY_CONSOLE) | ||
932 | conswitchp = &dummy_con; | ||
933 | #endif | ||
934 | #endif | ||
935 | } | ||
936 | |||
937 | /* | ||
938 | * Request address space for all standard resources | ||
939 | * | ||
940 | * This is called just before pcibios_init(), which is also a | ||
941 | * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). | ||
942 | */ | ||
943 | static int __init request_standard_resources(void) | ||
944 | { | ||
945 | int i; | ||
946 | |||
947 | printk(KERN_INFO "Setting up standard PCI resources\n"); | ||
948 | init_iomem_resources(&code_resource, &data_resource, &bss_resource); | ||
949 | |||
950 | request_resource(&iomem_resource, &video_ram_resource); | ||
951 | |||
952 | /* request I/O space for devices used on all i[345]86 PCs */ | ||
953 | for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | ||
954 | request_resource(&ioport_resource, &standard_io_resources[i]); | ||
955 | return 0; | ||
956 | } | ||
957 | |||
958 | subsys_initcall(request_standard_resources); | ||
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c deleted file mode 100644 index 6dff1286ad8a..000000000000 --- a/arch/x86/kernel/setup_64.c +++ /dev/null | |||
@@ -1,1194 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1995 Linus Torvalds | ||
3 | */ | ||
4 | |||
5 | /* | ||
6 | * This file handles the architecture-dependent parts of initialization | ||
7 | */ | ||
8 | |||
9 | #include <linux/errno.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/stddef.h> | ||
14 | #include <linux/unistd.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/user.h> | ||
18 | #include <linux/screen_info.h> | ||
19 | #include <linux/ioport.h> | ||
20 | #include <linux/delay.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/initrd.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/bootmem.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <asm/processor.h> | ||
27 | #include <linux/console.h> | ||
28 | #include <linux/seq_file.h> | ||
29 | #include <linux/crash_dump.h> | ||
30 | #include <linux/root_dev.h> | ||
31 | #include <linux/pci.h> | ||
32 | #include <asm/pci-direct.h> | ||
33 | #include <linux/efi.h> | ||
34 | #include <linux/acpi.h> | ||
35 | #include <linux/kallsyms.h> | ||
36 | #include <linux/edd.h> | ||
37 | #include <linux/iscsi_ibft.h> | ||
38 | #include <linux/mmzone.h> | ||
39 | #include <linux/kexec.h> | ||
40 | #include <linux/cpufreq.h> | ||
41 | #include <linux/dmi.h> | ||
42 | #include <linux/dma-mapping.h> | ||
43 | #include <linux/ctype.h> | ||
44 | #include <linux/sort.h> | ||
45 | #include <linux/uaccess.h> | ||
46 | #include <linux/init_ohci1394_dma.h> | ||
47 | #include <linux/kvm_para.h> | ||
48 | |||
49 | #include <asm/mtrr.h> | ||
50 | #include <asm/uaccess.h> | ||
51 | #include <asm/system.h> | ||
52 | #include <asm/vsyscall.h> | ||
53 | #include <asm/io.h> | ||
54 | #include <asm/smp.h> | ||
55 | #include <asm/msr.h> | ||
56 | #include <asm/desc.h> | ||
57 | #include <video/edid.h> | ||
58 | #include <asm/e820.h> | ||
59 | #include <asm/dma.h> | ||
60 | #include <asm/gart.h> | ||
61 | #include <asm/mpspec.h> | ||
62 | #include <asm/mmu_context.h> | ||
63 | #include <asm/proto.h> | ||
64 | #include <asm/setup.h> | ||
65 | #include <asm/numa.h> | ||
66 | #include <asm/sections.h> | ||
67 | #include <asm/dmi.h> | ||
68 | #include <asm/cacheflush.h> | ||
69 | #include <asm/mce.h> | ||
70 | #include <asm/ds.h> | ||
71 | #include <asm/topology.h> | ||
72 | #include <asm/trampoline.h> | ||
73 | #include <asm/pat.h> | ||
74 | |||
75 | #include <mach_apic.h> | ||
76 | #ifdef CONFIG_PARAVIRT | ||
77 | #include <asm/paravirt.h> | ||
78 | #else | ||
79 | #define ARCH_SETUP | ||
80 | #endif | ||
81 | |||
82 | /* | ||
83 | * Machine setup.. | ||
84 | */ | ||
85 | |||
86 | struct cpuinfo_x86 boot_cpu_data __read_mostly; | ||
87 | EXPORT_SYMBOL(boot_cpu_data); | ||
88 | |||
89 | __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; | ||
90 | |||
91 | unsigned long mmu_cr4_features; | ||
92 | |||
93 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | ||
94 | int bootloader_type; | ||
95 | |||
96 | unsigned long saved_video_mode; | ||
97 | |||
98 | int force_mwait __cpuinitdata; | ||
99 | |||
100 | /* | ||
101 | * Early DMI memory | ||
102 | */ | ||
103 | int dmi_alloc_index; | ||
104 | char dmi_alloc_data[DMI_MAX_DATA]; | ||
105 | |||
106 | /* | ||
107 | * Setup options | ||
108 | */ | ||
109 | struct screen_info screen_info; | ||
110 | EXPORT_SYMBOL(screen_info); | ||
111 | struct sys_desc_table_struct { | ||
112 | unsigned short length; | ||
113 | unsigned char table[0]; | ||
114 | }; | ||
115 | |||
116 | struct edid_info edid_info; | ||
117 | EXPORT_SYMBOL_GPL(edid_info); | ||
118 | |||
119 | extern int root_mountflags; | ||
120 | |||
121 | char __initdata command_line[COMMAND_LINE_SIZE]; | ||
122 | |||
123 | static struct resource standard_io_resources[] = { | ||
124 | { .name = "dma1", .start = 0x00, .end = 0x1f, | ||
125 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
126 | { .name = "pic1", .start = 0x20, .end = 0x21, | ||
127 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
128 | { .name = "timer0", .start = 0x40, .end = 0x43, | ||
129 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
130 | { .name = "timer1", .start = 0x50, .end = 0x53, | ||
131 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
132 | { .name = "keyboard", .start = 0x60, .end = 0x60, | ||
133 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
134 | { .name = "keyboard", .start = 0x64, .end = 0x64, | ||
135 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
136 | { .name = "dma page reg", .start = 0x80, .end = 0x8f, | ||
137 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
138 | { .name = "pic2", .start = 0xa0, .end = 0xa1, | ||
139 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
140 | { .name = "dma2", .start = 0xc0, .end = 0xdf, | ||
141 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
142 | { .name = "fpu", .start = 0xf0, .end = 0xff, | ||
143 | .flags = IORESOURCE_BUSY | IORESOURCE_IO } | ||
144 | }; | ||
145 | |||
146 | #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) | ||
147 | |||
148 | static struct resource data_resource = { | ||
149 | .name = "Kernel data", | ||
150 | .start = 0, | ||
151 | .end = 0, | ||
152 | .flags = IORESOURCE_RAM, | ||
153 | }; | ||
154 | static struct resource code_resource = { | ||
155 | .name = "Kernel code", | ||
156 | .start = 0, | ||
157 | .end = 0, | ||
158 | .flags = IORESOURCE_RAM, | ||
159 | }; | ||
160 | static struct resource bss_resource = { | ||
161 | .name = "Kernel bss", | ||
162 | .start = 0, | ||
163 | .end = 0, | ||
164 | .flags = IORESOURCE_RAM, | ||
165 | }; | ||
166 | |||
167 | static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); | ||
168 | |||
169 | #ifdef CONFIG_PROC_VMCORE | ||
170 | /* elfcorehdr= specifies the location of the elf core header | ||
171 | * stored by the crashed kernel. This option will be passed | ||
172 | * by the kexec loader to the capture kernel. | ||
173 | */ | ||
174 | static int __init setup_elfcorehdr(char *arg) | ||
175 | { | ||
176 | char *end; | ||
177 | if (!arg) | ||
178 | return -EINVAL; | ||
179 | elfcorehdr_addr = memparse(arg, &end); | ||
180 | return end > arg ? 0 : -EINVAL; | ||
181 | } | ||
182 | early_param("elfcorehdr", setup_elfcorehdr); | ||
183 | #endif | ||
184 | |||
185 | #ifndef CONFIG_NUMA | ||
186 | static void __init | ||
187 | contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
188 | { | ||
189 | unsigned long bootmap_size, bootmap; | ||
190 | |||
191 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | ||
192 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, | ||
193 | PAGE_SIZE); | ||
194 | if (bootmap == -1L) | ||
195 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | ||
196 | bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); | ||
197 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
198 | free_bootmem_with_active_regions(0, end_pfn); | ||
199 | early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); | ||
200 | reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); | ||
201 | } | ||
202 | #endif | ||
203 | |||
204 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | ||
205 | struct edd edd; | ||
206 | #ifdef CONFIG_EDD_MODULE | ||
207 | EXPORT_SYMBOL(edd); | ||
208 | #endif | ||
209 | /** | ||
210 | * copy_edd() - Copy the BIOS EDD information | ||
211 | * from boot_params into a safe place. | ||
212 | * | ||
213 | */ | ||
214 | static inline void copy_edd(void) | ||
215 | { | ||
216 | memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, | ||
217 | sizeof(edd.mbr_signature)); | ||
218 | memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); | ||
219 | edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; | ||
220 | edd.edd_info_nr = boot_params.eddbuf_entries; | ||
221 | } | ||
222 | #else | ||
223 | static inline void copy_edd(void) | ||
224 | { | ||
225 | } | ||
226 | #endif | ||
227 | |||
228 | #ifdef CONFIG_KEXEC | ||
229 | static void __init reserve_crashkernel(void) | ||
230 | { | ||
231 | unsigned long long total_mem; | ||
232 | unsigned long long crash_size, crash_base; | ||
233 | int ret; | ||
234 | |||
235 | total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; | ||
236 | |||
237 | ret = parse_crashkernel(boot_command_line, total_mem, | ||
238 | &crash_size, &crash_base); | ||
239 | if (ret == 0 && crash_size) { | ||
240 | if (crash_base <= 0) { | ||
241 | printk(KERN_INFO "crashkernel reservation failed - " | ||
242 | "you have to specify a base address\n"); | ||
243 | return; | ||
244 | } | ||
245 | |||
246 | if (reserve_bootmem(crash_base, crash_size, | ||
247 | BOOTMEM_EXCLUSIVE) < 0) { | ||
248 | printk(KERN_INFO "crashkernel reservation failed - " | ||
249 | "memory is in use\n"); | ||
250 | return; | ||
251 | } | ||
252 | |||
253 | printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " | ||
254 | "for crashkernel (System RAM: %ldMB)\n", | ||
255 | (unsigned long)(crash_size >> 20), | ||
256 | (unsigned long)(crash_base >> 20), | ||
257 | (unsigned long)(total_mem >> 20)); | ||
258 | crashk_res.start = crash_base; | ||
259 | crashk_res.end = crash_base + crash_size - 1; | ||
260 | insert_resource(&iomem_resource, &crashk_res); | ||
261 | } | ||
262 | } | ||
263 | #else | ||
264 | static inline void __init reserve_crashkernel(void) | ||
265 | {} | ||
266 | #endif | ||
267 | |||
268 | /* Overridden in paravirt.c if CONFIG_PARAVIRT */ | ||
269 | void __attribute__((weak)) __init memory_setup(void) | ||
270 | { | ||
271 | machine_specific_memory_setup(); | ||
272 | } | ||
273 | |||
274 | static void __init parse_setup_data(void) | ||
275 | { | ||
276 | struct setup_data *data; | ||
277 | unsigned long pa_data; | ||
278 | |||
279 | if (boot_params.hdr.version < 0x0209) | ||
280 | return; | ||
281 | pa_data = boot_params.hdr.setup_data; | ||
282 | while (pa_data) { | ||
283 | data = early_ioremap(pa_data, PAGE_SIZE); | ||
284 | switch (data->type) { | ||
285 | default: | ||
286 | break; | ||
287 | } | ||
288 | #ifndef CONFIG_DEBUG_BOOT_PARAMS | ||
289 | free_early(pa_data, pa_data+sizeof(*data)+data->len); | ||
290 | #endif | ||
291 | pa_data = data->next; | ||
292 | early_iounmap(data, PAGE_SIZE); | ||
293 | } | ||
294 | } | ||
295 | |||
296 | #ifdef CONFIG_PCI_MMCONFIG | ||
297 | extern void __cpuinit fam10h_check_enable_mmcfg(void); | ||
298 | extern void __init check_enable_amd_mmconf_dmi(void); | ||
299 | #else | ||
300 | void __cpuinit fam10h_check_enable_mmcfg(void) | ||
301 | { | ||
302 | } | ||
303 | void __init check_enable_amd_mmconf_dmi(void) | ||
304 | { | ||
305 | } | ||
306 | #endif | ||
307 | |||
308 | /* | ||
309 | * setup_arch - architecture-specific boot-time initializations | ||
310 | * | ||
311 | * Note: On x86_64, fixmaps are ready for use even before this is called. | ||
312 | */ | ||
313 | void __init setup_arch(char **cmdline_p) | ||
314 | { | ||
315 | unsigned i; | ||
316 | |||
317 | printk(KERN_INFO "Command line: %s\n", boot_command_line); | ||
318 | |||
319 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); | ||
320 | screen_info = boot_params.screen_info; | ||
321 | edid_info = boot_params.edid_info; | ||
322 | saved_video_mode = boot_params.hdr.vid_mode; | ||
323 | bootloader_type = boot_params.hdr.type_of_loader; | ||
324 | |||
325 | #ifdef CONFIG_BLK_DEV_RAM | ||
326 | rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; | ||
327 | rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); | ||
328 | rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); | ||
329 | #endif | ||
330 | #ifdef CONFIG_EFI | ||
331 | if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, | ||
332 | "EL64", 4)) | ||
333 | efi_enabled = 1; | ||
334 | #endif | ||
335 | |||
336 | ARCH_SETUP | ||
337 | |||
338 | memory_setup(); | ||
339 | copy_edd(); | ||
340 | |||
341 | if (!boot_params.hdr.root_flags) | ||
342 | root_mountflags &= ~MS_RDONLY; | ||
343 | init_mm.start_code = (unsigned long) &_text; | ||
344 | init_mm.end_code = (unsigned long) &_etext; | ||
345 | init_mm.end_data = (unsigned long) &_edata; | ||
346 | init_mm.brk = (unsigned long) &_end; | ||
347 | |||
348 | code_resource.start = virt_to_phys(&_text); | ||
349 | code_resource.end = virt_to_phys(&_etext)-1; | ||
350 | data_resource.start = virt_to_phys(&_etext); | ||
351 | data_resource.end = virt_to_phys(&_edata)-1; | ||
352 | bss_resource.start = virt_to_phys(&__bss_start); | ||
353 | bss_resource.end = virt_to_phys(&__bss_stop)-1; | ||
354 | |||
355 | early_identify_cpu(&boot_cpu_data); | ||
356 | |||
357 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | ||
358 | *cmdline_p = command_line; | ||
359 | |||
360 | parse_setup_data(); | ||
361 | |||
362 | parse_early_param(); | ||
363 | |||
364 | #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | ||
365 | if (init_ohci1394_dma_early) | ||
366 | init_ohci1394_dma_on_all_controllers(); | ||
367 | #endif | ||
368 | |||
369 | finish_e820_parsing(); | ||
370 | |||
371 | /* after parse_early_param, so we can debug it */ | ||
372 | insert_resource(&iomem_resource, &code_resource); | ||
373 | insert_resource(&iomem_resource, &data_resource); | ||
374 | insert_resource(&iomem_resource, &bss_resource); | ||
375 | |||
376 | early_gart_iommu_check(); | ||
377 | |||
378 | e820_register_active_regions(0, 0, -1UL); | ||
379 | /* | ||
380 | * partially used pages are not usable - thus | ||
381 | * we are rounding upwards: | ||
382 | */ | ||
383 | end_pfn = e820_end_of_ram(); | ||
384 | /* update e820 for memory not covered by WB MTRRs */ | ||
385 | mtrr_bp_init(); | ||
386 | if (mtrr_trim_uncached_memory(end_pfn)) { | ||
387 | e820_register_active_regions(0, 0, -1UL); | ||
388 | end_pfn = e820_end_of_ram(); | ||
389 | } | ||
390 | |||
391 | num_physpages = end_pfn; | ||
392 | |||
393 | check_efer(); | ||
394 | |||
395 | max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT)); | ||
396 | if (efi_enabled) | ||
397 | efi_init(); | ||
398 | |||
399 | vsmp_init(); | ||
400 | |||
401 | dmi_scan_machine(); | ||
402 | |||
403 | io_delay_init(); | ||
404 | |||
405 | #ifdef CONFIG_KVM_CLOCK | ||
406 | kvmclock_init(); | ||
407 | #endif | ||
408 | |||
409 | #ifdef CONFIG_SMP | ||
410 | /* set up to use the early static init tables during kernel startup */ | ||
411 | x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; | ||
412 | x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; | ||
413 | #ifdef CONFIG_NUMA | ||
414 | x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; | ||
415 | #endif | ||
416 | #endif | ||
417 | |||
418 | #ifdef CONFIG_ACPI | ||
419 | /* | ||
420 | * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). | ||
421 | * Call this early for SRAT node setup. | ||
422 | */ | ||
423 | acpi_boot_table_init(); | ||
424 | #endif | ||
425 | |||
426 | /* How many end-of-memory variables you have, grandma! */ | ||
427 | max_low_pfn = end_pfn; | ||
428 | max_pfn = end_pfn; | ||
429 | high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; | ||
430 | |||
431 | /* Remove active ranges so rediscovery with NUMA-awareness happens */ | ||
432 | remove_all_active_ranges(); | ||
433 | |||
434 | #ifdef CONFIG_ACPI_NUMA | ||
435 | /* | ||
436 | * Parse SRAT to discover nodes. | ||
437 | */ | ||
438 | acpi_numa_init(); | ||
439 | #endif | ||
440 | |||
441 | #ifdef CONFIG_NUMA | ||
442 | numa_initmem_init(0, end_pfn); | ||
443 | #else | ||
444 | contig_initmem_init(0, end_pfn); | ||
445 | #endif | ||
446 | |||
447 | dma32_reserve_bootmem(); | ||
448 | |||
449 | #ifdef CONFIG_ACPI_SLEEP | ||
450 | /* | ||
451 | * Reserve low memory region for sleep support. | ||
452 | */ | ||
453 | acpi_reserve_bootmem(); | ||
454 | #endif | ||
455 | |||
456 | if (efi_enabled) | ||
457 | efi_reserve_bootmem(); | ||
458 | |||
459 | /* | ||
460 | * Find and reserve possible boot-time SMP configuration: | ||
461 | */ | ||
462 | find_smp_config(); | ||
463 | #ifdef CONFIG_BLK_DEV_INITRD | ||
464 | if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { | ||
465 | unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; | ||
466 | unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; | ||
467 | unsigned long ramdisk_end = ramdisk_image + ramdisk_size; | ||
468 | unsigned long end_of_mem = end_pfn << PAGE_SHIFT; | ||
469 | |||
470 | if (ramdisk_end <= end_of_mem) { | ||
471 | /* | ||
472 | * no need to reserve again: this was already reserved early | ||
473 | * in x86_64_start_kernel, and early_res_to_bootmem | ||
474 | * converts that to reserved in bootmem | ||
475 | */ | ||
476 | initrd_start = ramdisk_image + PAGE_OFFSET; | ||
477 | initrd_end = initrd_start+ramdisk_size; | ||
478 | } else { | ||
479 | free_bootmem(ramdisk_image, ramdisk_size); | ||
480 | printk(KERN_ERR "initrd extends beyond end of memory " | ||
481 | "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | ||
482 | ramdisk_end, end_of_mem); | ||
483 | initrd_start = 0; | ||
484 | } | ||
485 | } | ||
486 | #endif | ||
487 | reserve_crashkernel(); | ||
488 | |||
489 | reserve_ibft_region(); | ||
490 | |||
491 | paging_init(); | ||
492 | map_vsyscall(); | ||
493 | |||
494 | early_quirks(); | ||
495 | |||
496 | #ifdef CONFIG_ACPI | ||
497 | /* | ||
498 | * Read APIC and some other early information from ACPI tables. | ||
499 | */ | ||
500 | acpi_boot_init(); | ||
501 | #endif | ||
502 | |||
503 | init_cpu_to_node(); | ||
504 | |||
505 | /* | ||
506 | * get boot-time SMP configuration: | ||
507 | */ | ||
508 | if (smp_found_config) | ||
509 | get_smp_config(); | ||
510 | init_apic_mappings(); | ||
511 | ioapic_init_mappings(); | ||
512 | |||
513 | kvm_guest_init(); | ||
514 | |||
515 | /* | ||
516 | * We trust e820 completely. No explicit ROM probing in memory. | ||
517 | */ | ||
518 | e820_reserve_resources(); | ||
519 | e820_mark_nosave_regions(); | ||
520 | |||
521 | /* request I/O space for devices used on all i[345]86 PCs */ | ||
522 | for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | ||
523 | request_resource(&ioport_resource, &standard_io_resources[i]); | ||
524 | |||
525 | e820_setup_gap(); | ||
526 | |||
527 | #ifdef CONFIG_VT | ||
528 | #if defined(CONFIG_VGA_CONSOLE) | ||
529 | if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) | ||
530 | conswitchp = &vga_con; | ||
531 | #elif defined(CONFIG_DUMMY_CONSOLE) | ||
532 | conswitchp = &dummy_con; | ||
533 | #endif | ||
534 | #endif | ||
535 | |||
536 | /* do this before identify_cpu for boot cpu */ | ||
537 | check_enable_amd_mmconf_dmi(); | ||
538 | } | ||
539 | |||
540 | static int __cpuinit get_model_name(struct cpuinfo_x86 *c) | ||
541 | { | ||
542 | unsigned int *v; | ||
543 | |||
544 | if (c->extended_cpuid_level < 0x80000004) | ||
545 | return 0; | ||
546 | |||
547 | v = (unsigned int *) c->x86_model_id; | ||
548 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | ||
549 | cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | ||
550 | cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | ||
551 | c->x86_model_id[48] = 0; | ||
552 | return 1; | ||
553 | } | ||
554 | |||
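As an aside on what get_model_name() above is doing: CPUID leaves 0x80000002..4 each return 16 bytes of the brand string across EAX..EDX, and the twelve registers are simply laid down in order as little-endian ASCII, then NUL-terminated at byte 48. A minimal user-space sketch of that packing (not kernel code; the register contents are fabricated, not read from a real CPU):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned int regs[12];  /* stands in for EAX..EDX of leaves 0x80000002..4 */
        char model[49];

        /* Fabricate the register contents a CPU might return: 48 bytes of ASCII. */
        memcpy(regs, "AMD Athlon(tm) 64 Processor 3000+ (hypothetical)", 48);

        /* The kernel does exactly this: reinterpret the buffer and append a NUL. */
        memcpy(model, regs, 48);
        model[48] = 0;
        printf("%s\n", model);
        return 0;
    }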
555 | |||
556 | static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | ||
557 | { | ||
558 | unsigned int n, dummy, eax, ebx, ecx, edx; | ||
559 | |||
560 | n = c->extended_cpuid_level; | ||
561 | |||
562 | if (n >= 0x80000005) { | ||
563 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | ||
564 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " | ||
565 | "D cache %dK (%d bytes/line)\n", | ||
566 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | ||
567 | c->x86_cache_size = (ecx>>24) + (edx>>24); | ||
568 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
569 | c->x86_tlbsize = 0; | ||
570 | } | ||
571 | |||
572 | if (n >= 0x80000006) { | ||
573 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | ||
574 | ecx = cpuid_ecx(0x80000006); | ||
575 | c->x86_cache_size = ecx >> 16; | ||
576 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
577 | |||
578 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | ||
579 | c->x86_cache_size, ecx & 0xFF); | ||
580 | } | ||
581 | if (n >= 0x80000008) { | ||
582 | cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | ||
583 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
584 | c->x86_phys_bits = eax & 0xff; | ||
585 | } | ||
586 | } | ||
587 | |||
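The bit fields display_cacheinfo() decodes are fixed-width: for leaf 0x80000005, ECX[31:24] is the L1 D-cache size in KB and ECX[7:0] the line size; for leaf 0x80000006, ECX[31:16] is the L2 size in KB with the line size again in the low byte. A sketch with made-up register values (not read from hardware):

    #include <stdio.h>

    int main(void)
    {
        unsigned int ecx5 = 0x40020140; /* hypothetical leaf 0x80000005 ECX */
        unsigned int ecx6 = 0x02004140; /* hypothetical leaf 0x80000006 ECX */

        printf("L1 D: %uK (%u bytes/line)\n", ecx5 >> 24, ecx5 & 0xff);
        printf("L2:   %uK (%u bytes/line)\n", ecx6 >> 16, ecx6 & 0xff);
        return 0;
    }

With these values it prints a 64K/64-byte L1 D cache and a 512K/64-byte L2.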
588 | #ifdef CONFIG_NUMA | ||
589 | static int __cpuinit nearby_node(int apicid) | ||
590 | { | ||
591 | int i, node; | ||
592 | |||
593 | for (i = apicid - 1; i >= 0; i--) { | ||
594 | node = apicid_to_node[i]; | ||
595 | if (node != NUMA_NO_NODE && node_online(node)) | ||
596 | return node; | ||
597 | } | ||
598 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | ||
599 | node = apicid_to_node[i]; | ||
600 | if (node != NUMA_NO_NODE && node_online(node)) | ||
601 | return node; | ||
602 | } | ||
603 | return first_node(node_online_map); /* Shouldn't happen */ | ||
604 | } | ||
605 | #endif | ||
606 | |||
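nearby_node() is a plain linear search outward from the given APIC ID: first downward, then upward, returning the first mapped-and-online node it meets. A toy standalone version over a hypothetical eight-entry apicid_to_node table (node_online() is stubbed out here):

    #include <stdio.h>

    #define NUMA_NO_NODE -1
    #define MAX_APIC     8

    /* Hypothetical table: apicids 0-1 on node 0, 4-5 on node 1, rest unmapped. */
    static int apicid_to_node[MAX_APIC] = { 0, 0, -1, -1, 1, 1, -1, -1 };

    static int node_online(int node) { return node == 0 || node == 1; }

    static int nearby_node(int apicid)
    {
        int i, node;

        for (i = apicid - 1; i >= 0; i--) {
            node = apicid_to_node[i];
            if (node != NUMA_NO_NODE && node_online(node))
                return node;
        }
        for (i = apicid + 1; i < MAX_APIC; i++) {
            node = apicid_to_node[i];
            if (node != NUMA_NO_NODE && node_online(node))
                return node;
        }
        return 0; /* stand-in for first_node(node_online_map) */
    }

    int main(void)
    {
        printf("apicid 3 -> node %d\n", nearby_node(3)); /* finds node 0 below it */
        printf("apicid 6 -> node %d\n", nearby_node(6)); /* finds node 1 below it */
        return 0;
    }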
607 | /* | ||
608 | * On an AMD dual core setup the lower bits of the APIC id distinguish the cores. | ||
609 | * Assumes number of cores is a power of two. | ||
610 | */ | ||
611 | static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | ||
612 | { | ||
613 | #ifdef CONFIG_SMP | ||
614 | unsigned bits; | ||
615 | #ifdef CONFIG_NUMA | ||
616 | int cpu = smp_processor_id(); | ||
617 | int node = 0; | ||
618 | unsigned apicid = hard_smp_processor_id(); | ||
619 | #endif | ||
620 | bits = c->x86_coreid_bits; | ||
621 | |||
622 | /* Low order bits define the core id (index of core in socket) */ | ||
623 | c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); | ||
624 | /* Convert the initial APIC ID into the socket ID */ | ||
625 | c->phys_proc_id = c->initial_apicid >> bits; | ||
626 | |||
627 | #ifdef CONFIG_NUMA | ||
628 | node = c->phys_proc_id; | ||
629 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | ||
630 | node = apicid_to_node[apicid]; | ||
631 | if (!node_online(node)) { | ||
632 | /* Two possibilities here: | ||
633 | - The CPU is missing memory and no node was created. | ||
634 | In that case, try picking one from a nearby CPU. | ||
635 | - The APIC IDs differ from the HyperTransport node IDs | ||
636 | which the K8 northbridge parsing fills in. | ||
637 | Assume they are all increased by a constant offset, | ||
638 | but in the same order as the HT nodeids. | ||
639 | If that doesn't result in a usable node, fall back to the | ||
640 | path for the previous case. */ | ||
641 | |||
642 | int ht_nodeid = c->initial_apicid; | ||
643 | |||
644 | if (ht_nodeid >= 0 && | ||
645 | apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | ||
646 | node = apicid_to_node[ht_nodeid]; | ||
647 | /* Pick a nearby node */ | ||
648 | if (!node_online(node)) | ||
649 | node = nearby_node(apicid); | ||
650 | } | ||
651 | numa_set_node(cpu, node); | ||
652 | |||
653 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
654 | #endif | ||
655 | #endif | ||
656 | } | ||
657 | |||
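To make the split in amd_detect_cmp() concrete: with x86_coreid_bits == 2, an initial APIC ID of 13 (binary 1101) decomposes into core 1 and socket 3. A standalone sketch of just the two mask/shift lines, with hypothetical values:

    #include <stdio.h>

    int main(void)
    {
        unsigned bits = 2;        /* hypothetical x86_coreid_bits */
        unsigned apicid = 13;     /* hypothetical initial APIC ID, binary 1101 */

        unsigned core_id = apicid & ((1u << bits) - 1); /* low bits: core in socket */
        unsigned socket  = apicid >> bits;              /* high bits: socket id */

        printf("apicid %u -> core %u, socket %u\n", apicid, core_id, socket);
        return 0;
    }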
658 | static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) | ||
659 | { | ||
660 | #ifdef CONFIG_SMP | ||
661 | unsigned bits, ecx; | ||
662 | |||
663 | /* Multi core CPU? */ | ||
664 | if (c->extended_cpuid_level < 0x80000008) | ||
665 | return; | ||
666 | |||
667 | ecx = cpuid_ecx(0x80000008); | ||
668 | |||
669 | c->x86_max_cores = (ecx & 0xff) + 1; | ||
670 | |||
671 | /* CPU telling us the core id bits shift? */ | ||
672 | bits = (ecx >> 12) & 0xF; | ||
673 | |||
674 | /* Otherwise recompute */ | ||
675 | if (bits == 0) { | ||
676 | while ((1 << bits) < c->x86_max_cores) | ||
677 | bits++; | ||
678 | } | ||
679 | |||
680 | c->x86_coreid_bits = bits; | ||
681 | |||
682 | #endif | ||
683 | } | ||
684 | |||
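When the CPU doesn't report the core-id-bits field in early_init_amd_mc() (it comes back 0), the fallback loop computes the smallest power of two that covers the core count, i.e. ceil(log2(max_cores)). A sketch using a fabricated 0x80000008 ECX describing a 4-core part that leaves the field zero:

    #include <stdio.h>

    int main(void)
    {
        unsigned ecx = 0x00000003;              /* hypothetical CPUID 0x80000008 ECX */
        unsigned max_cores = (ecx & 0xff) + 1;  /* low byte: 3 -> 4 cores */
        unsigned bits = (ecx >> 12) & 0xf;      /* core id bits field: 0 here */

        if (bits == 0)                          /* recompute: 2^bits must cover cores */
            while ((1u << bits) < max_cores)
                bits++;

        printf("%u cores -> %u core id bits\n", max_cores, bits);
        return 0;
    }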
685 | #define ENABLE_C1E_MASK 0x18000000 | ||
686 | #define CPUID_PROCESSOR_SIGNATURE 1 | ||
687 | #define CPUID_XFAM 0x0ff00000 | ||
688 | #define CPUID_XFAM_K8 0x00000000 | ||
689 | #define CPUID_XFAM_10H 0x00100000 | ||
690 | #define CPUID_XFAM_11H 0x00200000 | ||
691 | #define CPUID_XMOD 0x000f0000 | ||
692 | #define CPUID_XMOD_REV_F 0x00040000 | ||
693 | |||
694 | /* AMD systems with C1E don't have a working local APIC timer. Check for that. */ | ||
695 | static __cpuinit int amd_apic_timer_broken(void) | ||
696 | { | ||
697 | u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); | ||
698 | |||
699 | switch (eax & CPUID_XFAM) { | ||
700 | case CPUID_XFAM_K8: | ||
701 | if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) | ||
702 | break; | ||
703 | case CPUID_XFAM_10H: | ||
704 | case CPUID_XFAM_11H: | ||
705 | rdmsr(MSR_K8_ENABLE_C1E, lo, hi); | ||
706 | if (lo & ENABLE_C1E_MASK) | ||
707 | return 1; | ||
708 | break; | ||
709 | default: | ||
710 | /* err on the side of caution */ | ||
711 | return 1; | ||
712 | } | ||
713 | return 0; | ||
714 | } | ||
715 | |||
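The switch in amd_apic_timer_broken() works on the extended-family and extended-model fields of the CPUID signature: pre-rev-F K8 parts are treated as unaffected, rev F and later plus families 10h/11h get the MSR_K8_ENABLE_C1E check, and anything unrecognized errs toward "broken". A sketch decoding a fabricated rev-F K8 signature with the same masks:

    #include <stdio.h>

    #define CPUID_XFAM       0x0ff00000
    #define CPUID_XFAM_K8    0x00000000
    #define CPUID_XMOD       0x000f0000
    #define CPUID_XMOD_REV_F 0x00040000

    int main(void)
    {
        unsigned eax = 0x00040f33;  /* hypothetical family-0Fh, revision-F signature */

        if ((eax & CPUID_XFAM) == CPUID_XFAM_K8)
            printf("K8, %s rev F -> %s\n",
                   (eax & CPUID_XMOD) < CPUID_XMOD_REV_F ? "below" : "at/above",
                   (eax & CPUID_XMOD) < CPUID_XMOD_REV_F ? "assume timer ok"
                                                         : "go check the C1E MSR");
        return 0;
    }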
716 | static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | ||
717 | { | ||
718 | early_init_amd_mc(c); | ||
719 | |||
720 | /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | ||
721 | if (c->x86_power & (1<<8)) | ||
722 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
723 | } | ||
724 | |||
725 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | ||
726 | { | ||
727 | unsigned level; | ||
728 | |||
729 | #ifdef CONFIG_SMP | ||
730 | unsigned long value; | ||
731 | |||
732 | /* | ||
733 | * Disable TLB flush filter by setting HWCR.FFDIS on K8 | ||
734 | * bit 6 of msr C001_0015 | ||
735 | * | ||
736 | * Errata 63 for SH-B3 steppings | ||
737 | * Errata 122 for all steppings (F+ have it disabled by default) | ||
738 | */ | ||
739 | if (c->x86 == 15) { | ||
740 | rdmsrl(MSR_K8_HWCR, value); | ||
741 | value |= 1 << 6; | ||
742 | wrmsrl(MSR_K8_HWCR, value); | ||
743 | } | ||
744 | #endif | ||
745 | |||
746 | /* Bit 31 in normal CPUID is used for a nonstandard 3DNow ID; | ||
747 | 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */ | ||
748 | clear_cpu_cap(c, 0*32+31); | ||
749 | |||
750 | /* On C+ stepping K8 rep microcode works well for copy/memset */ | ||
751 | level = cpuid_eax(1); | ||
752 | if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || | ||
753 | level >= 0x0f58)) | ||
754 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
755 | if (c->x86 == 0x10 || c->x86 == 0x11) | ||
756 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
757 | |||
758 | /* Enable workaround for FXSAVE leak */ | ||
759 | if (c->x86 >= 6) | ||
760 | set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); | ||
761 | |||
762 | level = get_model_name(c); | ||
763 | if (!level) { | ||
764 | switch (c->x86) { | ||
765 | case 15: | ||
766 | /* Should distinguish models here, but this is only | ||
767 | a fallback anyway. */ | ||
768 | strcpy(c->x86_model_id, "Hammer"); | ||
769 | break; | ||
770 | } | ||
771 | } | ||
772 | display_cacheinfo(c); | ||
773 | |||
774 | /* Multi core CPU? */ | ||
775 | if (c->extended_cpuid_level >= 0x80000008) | ||
776 | amd_detect_cmp(c); | ||
777 | |||
778 | if (c->extended_cpuid_level >= 0x80000006 && | ||
779 | (cpuid_edx(0x80000006) & 0xf000)) | ||
780 | num_cache_leaves = 4; | ||
781 | else | ||
782 | num_cache_leaves = 3; | ||
783 | |||
784 | if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) | ||
785 | set_cpu_cap(c, X86_FEATURE_K8); | ||
786 | |||
787 | /* MFENCE stops RDTSC speculation */ | ||
788 | set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); | ||
789 | |||
790 | if (c->x86 == 0x10) | ||
791 | fam10h_check_enable_mmcfg(); | ||
792 | |||
793 | if (amd_apic_timer_broken()) | ||
794 | disable_apic_timer = 1; | ||
795 | |||
796 | if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { | ||
797 | unsigned long long tseg; | ||
798 | |||
799 | /* | ||
800 | * Split up direct mapping around the TSEG SMM area. | ||
801 | * Don't do it for gbpages because there seems to be very | ||
802 | * little benefit in doing so. | ||
803 | */ | ||
804 | if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && | ||
805 | (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) | ||
806 | set_memory_4k((unsigned long)__va(tseg), 1); | ||
807 | } | ||
808 | } | ||
809 | |||
810 | void __cpuinit detect_ht(struct cpuinfo_x86 *c) | ||
811 | { | ||
812 | #ifdef CONFIG_SMP | ||
813 | u32 eax, ebx, ecx, edx; | ||
814 | int index_msb, core_bits; | ||
815 | |||
816 | cpuid(1, &eax, &ebx, &ecx, &edx); | ||
817 | |||
818 | |||
819 | if (!cpu_has(c, X86_FEATURE_HT)) | ||
820 | return; | ||
821 | if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) | ||
822 | goto out; | ||
823 | |||
824 | smp_num_siblings = (ebx & 0xff0000) >> 16; | ||
825 | |||
826 | if (smp_num_siblings == 1) { | ||
827 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | ||
828 | } else if (smp_num_siblings > 1) { | ||
829 | |||
830 | if (smp_num_siblings > NR_CPUS) { | ||
831 | printk(KERN_WARNING "CPU: Unsupported number of " | ||
832 | "siblings %d", smp_num_siblings); | ||
833 | smp_num_siblings = 1; | ||
834 | return; | ||
835 | } | ||
836 | |||
837 | index_msb = get_count_order(smp_num_siblings); | ||
838 | c->phys_proc_id = phys_pkg_id(index_msb); | ||
839 | |||
840 | smp_num_siblings = smp_num_siblings / c->x86_max_cores; | ||
841 | |||
842 | index_msb = get_count_order(smp_num_siblings); | ||
843 | |||
844 | core_bits = get_count_order(c->x86_max_cores); | ||
845 | |||
846 | c->cpu_core_id = phys_pkg_id(index_msb) & | ||
847 | ((1 << core_bits) - 1); | ||
848 | } | ||
849 | out: | ||
850 | if ((c->x86_max_cores * smp_num_siblings) > 1) { | ||
851 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | ||
852 | c->phys_proc_id); | ||
853 | printk(KERN_INFO "CPU: Processor Core ID: %d\n", | ||
854 | c->cpu_core_id); | ||
855 | } | ||
856 | |||
857 | #endif | ||
858 | } | ||
859 | |||
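The arithmetic in detect_ht() is get_count_order() (the smallest n with 2^n >= x) applied to sibling counts: shift the APIC ID right by the order of the total sibling count for the package id, then by the order of threads-per-core and mask with the core-bit count for the core id. A worked sketch with a hypothetical 2-core, 2-thread-per-core part:

    #include <stdio.h>

    static int count_order(unsigned n)      /* like get_count_order(): ceil(log2(n)) */
    {
        int order = 0;
        while ((1u << order) < n)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned apicid = 6;                /* hypothetical: binary 0110 */
        unsigned siblings = 4;              /* hypothetical CPUID(1) EBX[23:16] */
        unsigned max_cores = 2;

        int pkg_shift = count_order(siblings);              /* 2 */
        int thr_shift = count_order(siblings / max_cores);  /* 1 */
        int core_bits = count_order(max_cores);             /* 1 */

        printf("package %u, core %u\n",
               apicid >> pkg_shift,
               (apicid >> thr_shift) & ((1u << core_bits) - 1));
        return 0;
    }

For apicid 6 this yields package 1, core 1.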
860 | /* | ||
861 | * find out the number of processor cores on the die | ||
862 | */ | ||
863 | static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) | ||
864 | { | ||
865 | unsigned int eax, t; | ||
866 | |||
867 | if (c->cpuid_level < 4) | ||
868 | return 1; | ||
869 | |||
870 | cpuid_count(4, 0, &eax, &t, &t, &t); | ||
871 | |||
872 | if (eax & 0x1f) | ||
873 | return ((eax >> 26) + 1); | ||
874 | else | ||
875 | return 1; | ||
876 | } | ||
877 | |||
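Leaf 4, subleaf 0 packs "cores per package minus one" into EAX[31:26]; the low five bits are the cache-type field that intel_num_cpu_cores() uses to check the leaf is populated at all. A sketch with a fabricated EAX:

    #include <stdio.h>

    int main(void)
    {
        unsigned eax = (3u << 26) | 0x21;   /* hypothetical CPUID(4,0) EAX: 4 cores */

        printf("cores: %u\n", (eax & 0x1f) ? (eax >> 26) + 1 : 1);
        return 0;
    }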
878 | static void __cpuinit srat_detect_node(void) | ||
879 | { | ||
880 | #ifdef CONFIG_NUMA | ||
881 | unsigned node; | ||
882 | int cpu = smp_processor_id(); | ||
883 | int apicid = hard_smp_processor_id(); | ||
884 | |||
885 | /* For now, don't do the funky fallback heuristics | ||
886 | the AMD version employs. */ | ||
887 | node = apicid_to_node[apicid]; | ||
888 | if (node == NUMA_NO_NODE || !node_online(node)) | ||
889 | node = first_node(node_online_map); | ||
890 | numa_set_node(cpu, node); | ||
891 | |||
892 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
893 | #endif | ||
894 | } | ||
895 | |||
896 | static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) | ||
897 | { | ||
898 | if ((c->x86 == 0xf && c->x86_model >= 0x03) || | ||
899 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) | ||
900 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
901 | } | ||
902 | |||
903 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) | ||
904 | { | ||
905 | /* Cache sizes */ | ||
906 | unsigned n; | ||
907 | |||
908 | init_intel_cacheinfo(c); | ||
909 | if (c->cpuid_level > 9) { | ||
910 | unsigned eax = cpuid_eax(10); | ||
911 | /* Check for version and the number of counters */ | ||
912 | if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) | ||
913 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | ||
914 | } | ||
915 | |||
916 | if (cpu_has_ds) { | ||
917 | unsigned int l1, l2; | ||
918 | rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); | ||
919 | if (!(l1 & (1<<11))) | ||
920 | set_cpu_cap(c, X86_FEATURE_BTS); | ||
921 | if (!(l1 & (1<<12))) | ||
922 | set_cpu_cap(c, X86_FEATURE_PEBS); | ||
923 | } | ||
924 | |||
925 | |||
926 | if (cpu_has_bts) | ||
927 | ds_init_intel(c); | ||
928 | |||
929 | n = c->extended_cpuid_level; | ||
930 | if (n >= 0x80000008) { | ||
931 | unsigned eax = cpuid_eax(0x80000008); | ||
932 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
933 | c->x86_phys_bits = eax & 0xff; | ||
934 | /* CPUID workaround for Intel 0F34 CPU */ | ||
935 | if (c->x86_vendor == X86_VENDOR_INTEL && | ||
936 | c->x86 == 0xF && c->x86_model == 0x3 && | ||
937 | c->x86_mask == 0x4) | ||
938 | c->x86_phys_bits = 36; | ||
939 | } | ||
940 | |||
941 | if (c->x86 == 15) | ||
942 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
943 | if (c->x86 == 6) | ||
944 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
945 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | ||
946 | c->x86_max_cores = intel_num_cpu_cores(c); | ||
947 | |||
948 | srat_detect_node(); | ||
949 | } | ||
950 | |||
951 | static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) | ||
952 | { | ||
953 | if (c->x86 == 0x6 && c->x86_model >= 0xf) | ||
954 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
955 | } | ||
956 | |||
957 | static void __cpuinit init_centaur(struct cpuinfo_x86 *c) | ||
958 | { | ||
959 | /* Cache sizes */ | ||
960 | unsigned n; | ||
961 | |||
962 | n = c->extended_cpuid_level; | ||
963 | if (n >= 0x80000008) { | ||
964 | unsigned eax = cpuid_eax(0x80000008); | ||
965 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
966 | c->x86_phys_bits = eax & 0xff; | ||
967 | } | ||
968 | |||
969 | if (c->x86 == 0x6 && c->x86_model >= 0xf) { | ||
970 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
971 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
972 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
973 | } | ||
974 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | ||
975 | } | ||
976 | |||
977 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | ||
978 | { | ||
979 | char *v = c->x86_vendor_id; | ||
980 | |||
981 | if (!strcmp(v, "AuthenticAMD")) | ||
982 | c->x86_vendor = X86_VENDOR_AMD; | ||
983 | else if (!strcmp(v, "GenuineIntel")) | ||
984 | c->x86_vendor = X86_VENDOR_INTEL; | ||
985 | else if (!strcmp(v, "CentaurHauls")) | ||
986 | c->x86_vendor = X86_VENDOR_CENTAUR; | ||
987 | else | ||
988 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
989 | } | ||
990 | |||
991 | /* Do some early cpuid on the boot CPU to get the parameters that are | ||
992 | needed before check_bugs. Everything advanced is in identify_cpu | ||
993 | below. */ | ||
994 | static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) | ||
995 | { | ||
996 | u32 tfms, xlvl; | ||
997 | |||
998 | c->loops_per_jiffy = loops_per_jiffy; | ||
999 | c->x86_cache_size = -1; | ||
1000 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
1001 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ | ||
1002 | c->x86_vendor_id[0] = '\0'; /* Unset */ | ||
1003 | c->x86_model_id[0] = '\0'; /* Unset */ | ||
1004 | c->x86_clflush_size = 64; | ||
1005 | c->x86_cache_alignment = c->x86_clflush_size; | ||
1006 | c->x86_max_cores = 1; | ||
1007 | c->x86_coreid_bits = 0; | ||
1008 | c->extended_cpuid_level = 0; | ||
1009 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | ||
1010 | |||
1011 | /* Get vendor name */ | ||
1012 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | ||
1013 | (unsigned int *)&c->x86_vendor_id[0], | ||
1014 | (unsigned int *)&c->x86_vendor_id[8], | ||
1015 | (unsigned int *)&c->x86_vendor_id[4]); | ||
1016 | |||
1017 | get_cpu_vendor(c); | ||
1018 | |||
1019 | /* Initialize the standard set of capabilities */ | ||
1020 | /* Note that the vendor-specific code below might override */ | ||
1021 | |||
1022 | /* Intel-defined flags: level 0x00000001 */ | ||
1023 | if (c->cpuid_level >= 0x00000001) { | ||
1024 | __u32 misc; | ||
1025 | cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], | ||
1026 | &c->x86_capability[0]); | ||
1027 | c->x86 = (tfms >> 8) & 0xf; | ||
1028 | c->x86_model = (tfms >> 4) & 0xf; | ||
1029 | c->x86_mask = tfms & 0xf; | ||
1030 | if (c->x86 == 0xf) | ||
1031 | c->x86 += (tfms >> 20) & 0xff; | ||
1032 | if (c->x86 >= 0x6) | ||
1033 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | ||
1034 | if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) | ||
1035 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | ||
1036 | } else { | ||
1037 | /* Have CPUID level 0 only - unheard of */ | ||
1038 | c->x86 = 4; | ||
1039 | } | ||
1040 | |||
1041 | c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; | ||
1042 | #ifdef CONFIG_SMP | ||
1043 | c->phys_proc_id = c->initial_apicid; | ||
1044 | #endif | ||
1045 | /* AMD-defined flags: level 0x80000001 */ | ||
1046 | xlvl = cpuid_eax(0x80000000); | ||
1047 | c->extended_cpuid_level = xlvl; | ||
1048 | if ((xlvl & 0xffff0000) == 0x80000000) { | ||
1049 | if (xlvl >= 0x80000001) { | ||
1050 | c->x86_capability[1] = cpuid_edx(0x80000001); | ||
1051 | c->x86_capability[6] = cpuid_ecx(0x80000001); | ||
1052 | } | ||
1053 | if (xlvl >= 0x80000004) | ||
1054 | get_model_name(c); /* Default name */ | ||
1055 | } | ||
1056 | |||
1057 | /* Transmeta-defined flags: level 0x80860001 */ | ||
1058 | xlvl = cpuid_eax(0x80860000); | ||
1059 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
1060 | /* Don't set x86_cpuid_level here for now, to avoid confusion. */ | ||
1061 | if (xlvl >= 0x80860001) | ||
1062 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
1063 | } | ||
1064 | |||
1065 | c->extended_cpuid_level = cpuid_eax(0x80000000); | ||
1066 | if (c->extended_cpuid_level >= 0x80000007) | ||
1067 | c->x86_power = cpuid_edx(0x80000007); | ||
1068 | |||
1069 | switch (c->x86_vendor) { | ||
1070 | case X86_VENDOR_AMD: | ||
1071 | early_init_amd(c); | ||
1072 | break; | ||
1073 | case X86_VENDOR_INTEL: | ||
1074 | early_init_intel(c); | ||
1075 | break; | ||
1076 | case X86_VENDOR_CENTAUR: | ||
1077 | early_init_centaur(c); | ||
1078 | break; | ||
1079 | } | ||
1080 | |||
1081 | validate_pat_support(c); | ||
1082 | } | ||
1083 | |||
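The family/model decode in early_identify_cpu() above is the standard CPUID(1) EAX layout: base family in bits 11:8, base model in 7:4, stepping in 3:0, with the extended family added only when the base family is 0xf and the extended model glued on as a high nibble from family 6 up. Worked on a fabricated Core-2-style signature:

    #include <stdio.h>

    int main(void)
    {
        unsigned tfms = 0x000006f6;         /* hypothetical CPUID(1) EAX */
        unsigned family = (tfms >> 8) & 0xf;
        unsigned model = (tfms >> 4) & 0xf;
        unsigned stepping = tfms & 0xf;

        if (family == 0xf)
            family += (tfms >> 20) & 0xff;          /* extended family */
        if (family >= 0x6)
            model += ((tfms >> 16) & 0xf) << 4;     /* extended model */

        printf("family 0x%x, model 0x%x, stepping 0x%x\n",
               family, model, stepping);
        return 0;
    }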
1084 | /* | ||
1085 | * This does the hard work of actually picking apart the CPU stuff... | ||
1086 | */ | ||
1087 | void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | ||
1088 | { | ||
1089 | int i; | ||
1090 | |||
1091 | early_identify_cpu(c); | ||
1092 | |||
1093 | init_scattered_cpuid_features(c); | ||
1094 | |||
1095 | c->apicid = phys_pkg_id(0); | ||
1096 | |||
1097 | /* | ||
1098 | * Vendor-specific initialization. In this section we | ||
1099 | * canonicalize the feature flags, meaning if there are | ||
1100 | * features a certain CPU supports which CPUID doesn't | ||
1101 | * tell us, CPUID claiming incorrect flags, or other bugs, | ||
1102 | * we handle them here. | ||
1103 | * | ||
1104 | * At the end of this section, c->x86_capability better | ||
1105 | * indicate the features this CPU genuinely supports! | ||
1106 | */ | ||
1107 | switch (c->x86_vendor) { | ||
1108 | case X86_VENDOR_AMD: | ||
1109 | init_amd(c); | ||
1110 | break; | ||
1111 | |||
1112 | case X86_VENDOR_INTEL: | ||
1113 | init_intel(c); | ||
1114 | break; | ||
1115 | |||
1116 | case X86_VENDOR_CENTAUR: | ||
1117 | init_centaur(c); | ||
1118 | break; | ||
1119 | |||
1120 | case X86_VENDOR_UNKNOWN: | ||
1121 | default: | ||
1122 | display_cacheinfo(c); | ||
1123 | break; | ||
1124 | } | ||
1125 | |||
1126 | detect_ht(c); | ||
1127 | |||
1128 | /* | ||
1129 | * On SMP, boot_cpu_data holds the common feature set between | ||
1130 | * all CPUs; so make sure that we indicate which features are | ||
1131 | * common between the CPUs. The first time this routine gets | ||
1132 | * executed, c == &boot_cpu_data. | ||
1133 | */ | ||
1134 | if (c != &boot_cpu_data) { | ||
1135 | /* AND the already accumulated flags with these */ | ||
1136 | for (i = 0; i < NCAPINTS; i++) | ||
1137 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | ||
1138 | } | ||
1139 | |||
1140 | /* Clear all flags overridden by options */ | ||
1141 | for (i = 0; i < NCAPINTS; i++) | ||
1142 | c->x86_capability[i] &= ~cleared_cpu_caps[i]; | ||
1143 | |||
1144 | #ifdef CONFIG_X86_MCE | ||
1145 | mcheck_init(c); | ||
1146 | #endif | ||
1147 | select_idle_routine(c); | ||
1148 | |||
1149 | #ifdef CONFIG_NUMA | ||
1150 | numa_add_cpu(smp_processor_id()); | ||
1151 | #endif | ||
1152 | |||
1153 | } | ||
1154 | |||
1155 | void __cpuinit identify_boot_cpu(void) | ||
1156 | { | ||
1157 | identify_cpu(&boot_cpu_data); | ||
1158 | } | ||
1159 | |||
1160 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | ||
1161 | { | ||
1162 | BUG_ON(c == &boot_cpu_data); | ||
1163 | identify_cpu(c); | ||
1164 | mtrr_ap_init(); | ||
1165 | } | ||
1166 | |||
1167 | static __init int setup_noclflush(char *arg) | ||
1168 | { | ||
1169 | setup_clear_cpu_cap(X86_FEATURE_CLFLSH); | ||
1170 | return 1; | ||
1171 | } | ||
1172 | __setup("noclflush", setup_noclflush); | ||
1173 | |||
1174 | void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | ||
1175 | { | ||
1176 | if (c->x86_model_id[0]) | ||
1177 | printk(KERN_CONT "%s", c->x86_model_id); | ||
1178 | |||
1179 | if (c->x86_mask || c->cpuid_level >= 0) | ||
1180 | printk(KERN_CONT " stepping %02x\n", c->x86_mask); | ||
1181 | else | ||
1182 | printk(KERN_CONT "\n"); | ||
1183 | } | ||
1184 | |||
1185 | static __init int setup_disablecpuid(char *arg) | ||
1186 | { | ||
1187 | int bit; | ||
1188 | if (get_option(&arg, &bit) && bit < NCAPINTS*32) | ||
1189 | setup_clear_cpu_cap(bit); | ||
1190 | else | ||
1191 | return 0; | ||
1192 | return 1; | ||
1193 | } | ||
1194 | __setup("clearcpuid=", setup_disablecpuid); | ||
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c new file mode 100644 index 000000000000..76e305e064f9 --- /dev/null +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -0,0 +1,378 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/bootmem.h> | ||
5 | #include <linux/percpu.h> | ||
6 | #include <linux/kexec.h> | ||
7 | #include <linux/crash_dump.h> | ||
8 | #include <asm/smp.h> | ||
9 | #include <asm/percpu.h> | ||
10 | #include <asm/sections.h> | ||
11 | #include <asm/processor.h> | ||
12 | #include <asm/setup.h> | ||
13 | #include <asm/topology.h> | ||
14 | #include <asm/mpspec.h> | ||
15 | #include <asm/apicdef.h> | ||
16 | #include <asm/highmem.h> | ||
17 | |||
18 | #ifdef CONFIG_X86_LOCAL_APIC | ||
19 | unsigned int num_processors; | ||
20 | unsigned disabled_cpus __cpuinitdata; | ||
21 | /* Processor that is doing the boot up */ | ||
22 | unsigned int boot_cpu_physical_apicid = -1U; | ||
23 | unsigned int max_physical_apicid; | ||
24 | EXPORT_SYMBOL(boot_cpu_physical_apicid); | ||
25 | |||
26 | /* Bitmask of physically existing CPUs */ | ||
27 | physid_mask_t phys_cpu_present_map; | ||
28 | #endif | ||
29 | |||
30 | /* map cpu index to physical APIC ID */ | ||
31 | DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); | ||
32 | DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); | ||
33 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); | ||
34 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); | ||
35 | |||
36 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | ||
37 | #define X86_64_NUMA 1 | ||
38 | |||
39 | /* map cpu index to node index */ | ||
40 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | ||
41 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
42 | |||
43 | /* which logical CPUs are on which nodes */ | ||
44 | cpumask_t *node_to_cpumask_map; | ||
45 | EXPORT_SYMBOL(node_to_cpumask_map); | ||
46 | |||
47 | /* setup node_to_cpumask_map */ | ||
48 | static void __init setup_node_to_cpumask_map(void); | ||
49 | |||
50 | #else | ||
51 | static inline void setup_node_to_cpumask_map(void) { } | ||
52 | #endif | ||
53 | |||
54 | #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) | ||
55 | /* | ||
56 | * Copy data used in early init routines from the initial arrays to the | ||
57 | * per cpu data areas. These arrays then become expendable and the | ||
58 | * *_early_ptr's are zeroed indicating that the static arrays are gone. | ||
59 | */ | ||
60 | static void __init setup_per_cpu_maps(void) | ||
61 | { | ||
62 | int cpu; | ||
63 | |||
64 | for_each_possible_cpu(cpu) { | ||
65 | per_cpu(x86_cpu_to_apicid, cpu) = | ||
66 | early_per_cpu_map(x86_cpu_to_apicid, cpu); | ||
67 | per_cpu(x86_bios_cpu_apicid, cpu) = | ||
68 | early_per_cpu_map(x86_bios_cpu_apicid, cpu); | ||
69 | #ifdef X86_64_NUMA | ||
70 | per_cpu(x86_cpu_to_node_map, cpu) = | ||
71 | early_per_cpu_map(x86_cpu_to_node_map, cpu); | ||
72 | #endif | ||
73 | } | ||
74 | |||
75 | /* indicate the early static arrays will soon be gone */ | ||
76 | early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; | ||
77 | early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; | ||
78 | #ifdef X86_64_NUMA | ||
79 | early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; | ||
80 | #endif | ||
81 | } | ||
82 | |||
83 | #ifdef CONFIG_X86_32 | ||
84 | /* | ||
85 | * Great future not-so-futuristic plan: make i386 and x86_64 do it | ||
86 | * the same way | ||
87 | */ | ||
88 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | ||
89 | EXPORT_SYMBOL(__per_cpu_offset); | ||
90 | static inline void setup_cpu_pda_map(void) { } | ||
91 | |||
92 | #elif !defined(CONFIG_SMP) | ||
93 | static inline void setup_cpu_pda_map(void) { } | ||
94 | |||
95 | #else /* CONFIG_SMP && CONFIG_X86_64 */ | ||
96 | |||
97 | /* | ||
98 | * Allocate cpu_pda pointer table and array via alloc_bootmem. | ||
99 | */ | ||
100 | static void __init setup_cpu_pda_map(void) | ||
101 | { | ||
102 | char *pda; | ||
103 | struct x8664_pda **new_cpu_pda; | ||
104 | unsigned long size; | ||
105 | int cpu; | ||
106 | |||
107 | size = roundup(sizeof(struct x8664_pda), cache_line_size()); | ||
108 | |||
109 | /* allocate cpu_pda array and pointer table */ | ||
110 | { | ||
111 | unsigned long tsize = nr_cpu_ids * sizeof(void *); | ||
112 | unsigned long asize = size * (nr_cpu_ids - 1); | ||
113 | |||
114 | tsize = roundup(tsize, cache_line_size()); | ||
115 | new_cpu_pda = alloc_bootmem(tsize + asize); | ||
116 | pda = (char *)new_cpu_pda + tsize; | ||
117 | } | ||
118 | |||
119 | /* initialize pointer table to static pda's */ | ||
120 | for_each_possible_cpu(cpu) { | ||
121 | if (cpu == 0) { | ||
122 | /* leave boot cpu pda in place */ | ||
123 | new_cpu_pda[0] = cpu_pda(0); | ||
124 | continue; | ||
125 | } | ||
126 | new_cpu_pda[cpu] = (struct x8664_pda *)pda; | ||
127 | new_cpu_pda[cpu]->in_bootmem = 1; | ||
128 | pda += size; | ||
129 | } | ||
130 | |||
131 | /* point to new pointer table */ | ||
132 | _cpu_pda = new_cpu_pda; | ||
133 | } | ||
134 | #endif | ||
135 | |||
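setup_cpu_pda_map() makes a single alloc_bootmem() call that carves out one block holding both the pointer table and the pda array; the boot CPU keeps its static pda, so only nr_cpu_ids - 1 array slots are needed. A sketch of the size arithmetic with hypothetical sizes (not the real struct x8664_pda size):

    #include <stdio.h>

    static unsigned long roundup_to(unsigned long x, unsigned long to)
    {
        return (x + to - 1) / to * to;
    }

    int main(void)
    {
        unsigned long nr_cpu_ids = 4;       /* hypothetical */
        unsigned long cacheline = 64;       /* hypothetical cache_line_size() */
        unsigned long pda = roundup_to(128, cacheline); /* hypothetical pda size */

        unsigned long tsize = roundup_to(nr_cpu_ids * sizeof(void *), cacheline);
        unsigned long asize = pda * (nr_cpu_ids - 1);   /* boot cpu pda stays static */

        printf("one block: %lu bytes of table + %lu bytes of pdas\n", tsize, asize);
        return 0;
    }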
136 | /* | ||
137 | * Great future plan: | ||
138 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | ||
139 | * Always point %gs to its beginning | ||
140 | */ | ||
141 | void __init setup_per_cpu_areas(void) | ||
142 | { | ||
143 | ssize_t size = PERCPU_ENOUGH_ROOM; | ||
144 | char *ptr; | ||
145 | int cpu; | ||
146 | |||
147 | /* Setup cpu_pda map */ | ||
148 | setup_cpu_pda_map(); | ||
149 | |||
150 | /* Copy section for each CPU (we discard the original) */ | ||
151 | size = PERCPU_ENOUGH_ROOM; | ||
152 | printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", | ||
153 | size); | ||
154 | |||
155 | for_each_possible_cpu(cpu) { | ||
156 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
157 | ptr = alloc_bootmem_pages(size); | ||
158 | #else | ||
159 | int node = early_cpu_to_node(cpu); | ||
160 | if (!node_online(node) || !NODE_DATA(node)) { | ||
161 | ptr = alloc_bootmem_pages(size); | ||
162 | printk(KERN_INFO | ||
163 | "cpu %d has no node %d or node-local memory\n", | ||
164 | cpu, node); | ||
165 | } | ||
166 | else | ||
167 | ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); | ||
168 | #endif | ||
169 | per_cpu_offset(cpu) = ptr - __per_cpu_start; | ||
170 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
171 | |||
172 | } | ||
173 | |||
174 | printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", | ||
175 | NR_CPUS, nr_cpu_ids, nr_node_ids); | ||
176 | |||
177 | /* Setup percpu data maps */ | ||
178 | setup_per_cpu_maps(); | ||
179 | |||
180 | /* Setup node to cpumask map */ | ||
181 | setup_node_to_cpumask_map(); | ||
182 | } | ||
183 | |||
184 | #endif | ||
185 | |||
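The line per_cpu_offset(cpu) = ptr - __per_cpu_start is the heart of the per-cpu scheme: every per-cpu variable keeps its link-time address, and a given CPU's copy is reached by adding that CPU's offset. A user-space sketch of the same trick under that assumption, with one toy variable and two simulated CPUs:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static int counter = 42;    /* toy "per-cpu" variable's master copy */

    int main(void)
    {
        uintptr_t offset[2];
        int cpu;

        for (cpu = 0; cpu < 2; cpu++) {
            char *copy = malloc(sizeof(counter));    /* per-cpu area for this cpu */
            memcpy(copy, &counter, sizeof(counter)); /* copy the initial image */
            offset[cpu] = (uintptr_t)copy - (uintptr_t)&counter;

            /* per_cpu(counter, cpu): master address plus this cpu's offset */
            int *p = (int *)((uintptr_t)&counter + offset[cpu]);
            *p += cpu;
            printf("cpu %d sees %d\n", cpu, *p);
        }
        return 0;
    }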
186 | #ifdef X86_64_NUMA | ||
187 | |||
188 | /* | ||
189 | * Allocate node_to_cpumask_map based on number of available nodes | ||
190 | * Requires node_possible_map to be valid. | ||
191 | * | ||
192 | * Note: node_to_cpumask() is not valid until after this is done. | ||
193 | */ | ||
194 | static void __init setup_node_to_cpumask_map(void) | ||
195 | { | ||
196 | unsigned int node, num = 0; | ||
197 | cpumask_t *map; | ||
198 | |||
199 | /* setup nr_node_ids if not done yet */ | ||
200 | if (nr_node_ids == MAX_NUMNODES) { | ||
201 | for_each_node_mask(node, node_possible_map) | ||
202 | num = node; | ||
203 | nr_node_ids = num + 1; | ||
204 | } | ||
205 | |||
206 | /* allocate the map */ | ||
207 | map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); | ||
208 | |||
209 | pr_debug("Node to cpumask map at %p for %d nodes\n", | ||
210 | map, nr_node_ids); | ||
211 | |||
212 | /* node_to_cpumask() will now work */ | ||
213 | node_to_cpumask_map = map; | ||
214 | } | ||
215 | |||
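Note the sizing rule in setup_node_to_cpumask_map(): nr_node_ids becomes highest-possible-node + 1, not the number of set bits, because node ids index the map directly even when the possible set is sparse. A sketch with a fabricated sparse node_possible_map:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical node_possible_map: nodes 0, 2 and 5 possible */
        unsigned long possible = (1ul << 0) | (1ul << 2) | (1ul << 5);
        unsigned node, num = 0;

        for (node = 0; node < 64; node++)
            if (possible & (1ul << node))
                num = node;             /* track the highest possible node */

        printf("nr_node_ids = %u\n", num + 1);  /* 6 slots, some never used */
        return 0;
    }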
216 | void __cpuinit numa_set_node(int cpu, int node) | ||
217 | { | ||
218 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | ||
219 | |||
220 | if (cpu_pda(cpu) && node != NUMA_NO_NODE) | ||
221 | cpu_pda(cpu)->nodenumber = node; | ||
222 | |||
223 | if (cpu_to_node_map) | ||
224 | cpu_to_node_map[cpu] = node; | ||
225 | |||
226 | else if (per_cpu_offset(cpu)) | ||
227 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
228 | |||
229 | else | ||
230 | pr_debug("Setting node for non-present cpu %d\n", cpu); | ||
231 | } | ||
232 | |||
233 | void __cpuinit numa_clear_node(int cpu) | ||
234 | { | ||
235 | numa_set_node(cpu, NUMA_NO_NODE); | ||
236 | } | ||
237 | |||
238 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
239 | |||
240 | void __cpuinit numa_add_cpu(int cpu) | ||
241 | { | ||
242 | cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
243 | } | ||
244 | |||
245 | void __cpuinit numa_remove_cpu(int cpu) | ||
246 | { | ||
247 | cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); | ||
248 | } | ||
249 | |||
250 | #else /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
251 | |||
252 | /* | ||
253 | * --------- debug versions of the numa functions --------- | ||
254 | */ | ||
255 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
256 | { | ||
257 | int node = cpu_to_node(cpu); | ||
258 | cpumask_t *mask; | ||
259 | char buf[64]; | ||
260 | |||
261 | if (node_to_cpumask_map == NULL) { | ||
262 | printk(KERN_ERR "node_to_cpumask_map NULL\n"); | ||
263 | dump_stack(); | ||
264 | return; | ||
265 | } | ||
266 | |||
267 | mask = &node_to_cpumask_map[node]; | ||
268 | if (enable) | ||
269 | cpu_set(cpu, *mask); | ||
270 | else | ||
271 | cpu_clear(cpu, *mask); | ||
272 | |||
273 | cpulist_scnprintf(buf, sizeof(buf), *mask); | ||
274 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
275 | enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); | ||
276 | } | ||
277 | |||
278 | void __cpuinit numa_add_cpu(int cpu) | ||
279 | { | ||
280 | numa_set_cpumask(cpu, 1); | ||
281 | } | ||
282 | |||
283 | void __cpuinit numa_remove_cpu(int cpu) | ||
284 | { | ||
285 | numa_set_cpumask(cpu, 0); | ||
286 | } | ||
287 | |||
288 | int cpu_to_node(int cpu) | ||
289 | { | ||
290 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | ||
291 | printk(KERN_WARNING | ||
292 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
293 | dump_stack(); | ||
294 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
295 | } | ||
296 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
297 | } | ||
298 | EXPORT_SYMBOL(cpu_to_node); | ||
299 | |||
300 | /* | ||
301 | * Same function as cpu_to_node() but used if called before the | ||
302 | * per_cpu areas are set up. | ||
303 | */ | ||
304 | int early_cpu_to_node(int cpu) | ||
305 | { | ||
306 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
307 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
308 | |||
309 | if (!per_cpu_offset(cpu)) { | ||
310 | printk(KERN_WARNING | ||
311 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
312 | dump_stack(); | ||
313 | return NUMA_NO_NODE; | ||
314 | } | ||
315 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
316 | } | ||
317 | |||
318 | |||
319 | /* empty cpumask */ | ||
320 | static const cpumask_t cpu_mask_none; | ||
321 | |||
322 | /* | ||
323 | * Returns a pointer to the bitmask of CPUs on Node 'node'. | ||
324 | */ | ||
325 | const cpumask_t *_node_to_cpumask_ptr(int node) | ||
326 | { | ||
327 | if (node_to_cpumask_map == NULL) { | ||
328 | printk(KERN_WARNING | ||
329 | "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", | ||
330 | node); | ||
331 | dump_stack(); | ||
332 | return (const cpumask_t *)&cpu_online_map; | ||
333 | } | ||
334 | if (node >= nr_node_ids) { | ||
335 | printk(KERN_WARNING | ||
336 | "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", | ||
337 | node, nr_node_ids); | ||
338 | dump_stack(); | ||
339 | return &cpu_mask_none; | ||
340 | } | ||
341 | return &node_to_cpumask_map[node]; | ||
342 | } | ||
343 | EXPORT_SYMBOL(_node_to_cpumask_ptr); | ||
344 | |||
345 | /* | ||
346 | * Returns a bitmask of CPUs on Node 'node'. | ||
347 | * | ||
348 | * Side note: this function creates the returned cpumask on the stack, | ||
349 | * so with a high NR_CPUS count excessive stack space is used. The | ||
350 | * node_to_cpumask_ptr function should be used whenever possible. | ||
351 | */ | ||
352 | cpumask_t node_to_cpumask(int node) | ||
353 | { | ||
354 | if (node_to_cpumask_map == NULL) { | ||
355 | printk(KERN_WARNING | ||
356 | "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); | ||
357 | dump_stack(); | ||
358 | return cpu_online_map; | ||
359 | } | ||
360 | if (node >= nr_node_ids) { | ||
361 | printk(KERN_WARNING | ||
362 | "node_to_cpumask(%d): node > nr_node_ids(%d)\n", | ||
363 | node, nr_node_ids); | ||
364 | dump_stack(); | ||
365 | return cpu_mask_none; | ||
366 | } | ||
367 | return node_to_cpumask_map[node]; | ||
368 | } | ||
369 | EXPORT_SYMBOL(node_to_cpumask); | ||
370 | |||
371 | /* | ||
372 | * --------- end of debug versions of the numa functions --------- | ||
373 | */ | ||
374 | |||
375 | #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
376 | |||
377 | #endif /* X86_64_NUMA */ | ||
378 | |||
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d92373630963..6fb5bcdd8933 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c | |||
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) | |||
212 | 212 | ||
213 | badframe: | 213 | badframe: |
214 | if (show_unhandled_signals && printk_ratelimit()) { | 214 | if (show_unhandled_signals && printk_ratelimit()) { |
215 | printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" | 215 | printk("%s%s[%d] bad frame in sigreturn frame:" |
216 | "%p ip:%lx sp:%lx oeax:%lx", | 216 | "%p ip:%lx sp:%lx oeax:%lx", |
217 | task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, | 217 | task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, |
218 | current->comm, task_pid_nr(current), frame, regs->ip, | 218 | current->comm, task_pid_nr(current), frame, regs->ip, |
@@ -657,18 +657,9 @@ static void do_signal(struct pt_regs *regs) | |||
657 | void | 657 | void |
658 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | 658 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) |
659 | { | 659 | { |
660 | /* Pending single-step? */ | ||
661 | if (thread_info_flags & _TIF_SINGLESTEP) { | ||
662 | regs->flags |= X86_EFLAGS_TF; | ||
663 | clear_thread_flag(TIF_SINGLESTEP); | ||
664 | } | ||
665 | |||
666 | /* deal with pending signal delivery */ | 660 | /* deal with pending signal delivery */ |
667 | if (thread_info_flags & _TIF_SIGPENDING) | 661 | if (thread_info_flags & _TIF_SIGPENDING) |
668 | do_signal(regs); | 662 | do_signal(regs); |
669 | 663 | ||
670 | if (thread_info_flags & _TIF_HRTICK_RESCHED) | ||
671 | hrtick_resched(); | ||
672 | |||
673 | clear_thread_flag(TIF_IRET); | 664 | clear_thread_flag(TIF_IRET); |
674 | } | 665 | } |
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e7..b45ef8ddd651 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c | |||
@@ -53,6 +53,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | |||
53 | return do_sigaltstack(uss, uoss, regs->sp); | 53 | return do_sigaltstack(uss, uoss, regs->sp); |
54 | } | 54 | } |
55 | 55 | ||
56 | /* | ||
57 | * Signal frame handlers. | ||
58 | */ | ||
59 | |||
60 | static inline int save_i387(struct _fpstate __user *buf) | ||
61 | { | ||
62 | struct task_struct *tsk = current; | ||
63 | int err = 0; | ||
64 | |||
65 | BUILD_BUG_ON(sizeof(struct user_i387_struct) != | ||
66 | sizeof(tsk->thread.xstate->fxsave)); | ||
67 | |||
68 | if ((unsigned long)buf % 16) | ||
69 | printk("save_i387: bad fpstate %p\n", buf); | ||
70 | |||
71 | if (!used_math()) | ||
72 | return 0; | ||
73 | clear_used_math(); /* trigger finit */ | ||
74 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | ||
75 | err = save_i387_checking((struct i387_fxsave_struct __user *) | ||
76 | buf); | ||
77 | if (err) | ||
78 | return err; | ||
79 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | ||
80 | stts(); | ||
81 | } else { | ||
82 | if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, | ||
83 | sizeof(struct i387_fxsave_struct))) | ||
84 | return -1; | ||
85 | } | ||
86 | return 1; | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * This restores directly out of user space. Exceptions are handled. | ||
91 | */ | ||
92 | static inline int restore_i387(struct _fpstate __user *buf) | ||
93 | { | ||
94 | struct task_struct *tsk = current; | ||
95 | int err; | ||
96 | |||
97 | if (!used_math()) { | ||
98 | err = init_fpu(tsk); | ||
99 | if (err) | ||
100 | return err; | ||
101 | } | ||
102 | |||
103 | if (!(task_thread_info(current)->status & TS_USEDFPU)) { | ||
104 | clts(); | ||
105 | task_thread_info(current)->status |= TS_USEDFPU; | ||
106 | } | ||
107 | return restore_fpu_checking((__force struct i387_fxsave_struct *)buf); | ||
108 | } | ||
56 | 109 | ||
57 | /* | 110 | /* |
58 | * Do a signal return; undo the signal stack. | 111 | * Do a signal return; undo the signal stack. |
@@ -487,12 +540,6 @@ static void do_signal(struct pt_regs *regs) | |||
487 | void do_notify_resume(struct pt_regs *regs, void *unused, | 540 | void do_notify_resume(struct pt_regs *regs, void *unused, |
488 | __u32 thread_info_flags) | 541 | __u32 thread_info_flags) |
489 | { | 542 | { |
490 | /* Pending single-step? */ | ||
491 | if (thread_info_flags & _TIF_SINGLESTEP) { | ||
492 | regs->flags |= X86_EFLAGS_TF; | ||
493 | clear_thread_flag(TIF_SINGLESTEP); | ||
494 | } | ||
495 | |||
496 | #ifdef CONFIG_X86_MCE | 543 | #ifdef CONFIG_X86_MCE |
497 | /* notify userspace of pending MCEs */ | 544 | /* notify userspace of pending MCEs */ |
498 | if (thread_info_flags & _TIF_MCE_NOTIFY) | 545 | if (thread_info_flags & _TIF_MCE_NOTIFY) |
@@ -502,9 +549,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused, | |||
502 | /* deal with pending signal delivery */ | 549 | /* deal with pending signal delivery */ |
503 | if (thread_info_flags & _TIF_SIGPENDING) | 550 | if (thread_info_flags & _TIF_SIGPENDING) |
504 | do_signal(regs); | 551 | do_signal(regs); |
505 | |||
506 | if (thread_info_flags & _TIF_HRTICK_RESCHED) | ||
507 | hrtick_resched(); | ||
508 | } | 552 | } |
509 | 553 | ||
510 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | 554 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) |
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 0cb7aadc87cd..361b7a4c640c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c | |||
@@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu) | |||
121 | send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | 121 | send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); |
122 | } | 122 | } |
123 | 123 | ||
124 | /* | 124 | void native_send_call_func_single_ipi(int cpu) |
125 | * Structure and data for smp_call_function(). This is designed to minimise | ||
126 | * static memory requirements. It also looks cleaner. | ||
127 | */ | ||
128 | static DEFINE_SPINLOCK(call_lock); | ||
129 | |||
130 | struct call_data_struct { | ||
131 | void (*func) (void *info); | ||
132 | void *info; | ||
133 | atomic_t started; | ||
134 | atomic_t finished; | ||
135 | int wait; | ||
136 | }; | ||
137 | |||
138 | void lock_ipi_call_lock(void) | ||
139 | { | 125 | { |
140 | spin_lock_irq(&call_lock); | 126 | send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); |
141 | } | ||
142 | |||
143 | void unlock_ipi_call_lock(void) | ||
144 | { | ||
145 | spin_unlock_irq(&call_lock); | ||
146 | } | ||
147 | |||
148 | static struct call_data_struct *call_data; | ||
149 | |||
150 | static void __smp_call_function(void (*func) (void *info), void *info, | ||
151 | int nonatomic, int wait) | ||
152 | { | ||
153 | struct call_data_struct data; | ||
154 | int cpus = num_online_cpus() - 1; | ||
155 | |||
156 | if (!cpus) | ||
157 | return; | ||
158 | |||
159 | data.func = func; | ||
160 | data.info = info; | ||
161 | atomic_set(&data.started, 0); | ||
162 | data.wait = wait; | ||
163 | if (wait) | ||
164 | atomic_set(&data.finished, 0); | ||
165 | |||
166 | call_data = &data; | ||
167 | mb(); | ||
168 | |||
169 | /* Send a message to all other CPUs and wait for them to respond */ | ||
170 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | ||
171 | |||
172 | /* Wait for response */ | ||
173 | while (atomic_read(&data.started) != cpus) | ||
174 | cpu_relax(); | ||
175 | |||
176 | if (wait) | ||
177 | while (atomic_read(&data.finished) != cpus) | ||
178 | cpu_relax(); | ||
179 | } | 127 | } |
180 | 128 | ||
181 | 129 | void native_send_call_func_ipi(cpumask_t mask) | |
182 | /** | ||
183 | * smp_call_function_mask(): Run a function on a set of other CPUs. | ||
184 | * @mask: The set of cpus to run on. Must not include the current cpu. | ||
185 | * @func: The function to run. This must be fast and non-blocking. | ||
186 | * @info: An arbitrary pointer to pass to the function. | ||
187 | * @wait: If true, wait (atomically) until function has completed on other CPUs. | ||
188 | * | ||
189 | * Returns 0 on success, else a negative status code. | ||
190 | * | ||
191 | * If @wait is true, then returns once @func has returned; otherwise | ||
192 | * it returns just before the target cpu calls @func. | ||
193 | * | ||
194 | * You must not call this function with disabled interrupts or from a | ||
195 | * hardware interrupt handler or from a bottom half handler. | ||
196 | */ | ||
197 | static int | ||
198 | native_smp_call_function_mask(cpumask_t mask, | ||
199 | void (*func)(void *), void *info, | ||
200 | int wait) | ||
201 | { | 130 | { |
202 | struct call_data_struct data; | ||
203 | cpumask_t allbutself; | 131 | cpumask_t allbutself; |
204 | int cpus; | ||
205 | |||
206 | /* Can deadlock when called with interrupts disabled */ | ||
207 | WARN_ON(irqs_disabled()); | ||
208 | |||
209 | /* Holding any lock stops cpus from going down. */ | ||
210 | spin_lock(&call_lock); | ||
211 | 132 | ||
212 | allbutself = cpu_online_map; | 133 | allbutself = cpu_online_map; |
213 | cpu_clear(smp_processor_id(), allbutself); | 134 | cpu_clear(smp_processor_id(), allbutself); |
214 | 135 | ||
215 | cpus_and(mask, mask, allbutself); | ||
216 | cpus = cpus_weight(mask); | ||
217 | |||
218 | if (!cpus) { | ||
219 | spin_unlock(&call_lock); | ||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | data.func = func; | ||
224 | data.info = info; | ||
225 | atomic_set(&data.started, 0); | ||
226 | data.wait = wait; | ||
227 | if (wait) | ||
228 | atomic_set(&data.finished, 0); | ||
229 | |||
230 | call_data = &data; | ||
231 | wmb(); | ||
232 | |||
233 | /* Send a message to other CPUs */ | ||
234 | if (cpus_equal(mask, allbutself) && | 136 | if (cpus_equal(mask, allbutself) && |
235 | cpus_equal(cpu_online_map, cpu_callout_map)) | 137 | cpus_equal(cpu_online_map, cpu_callout_map)) |
236 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | 138 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); |
237 | else | 139 | else |
238 | send_IPI_mask(mask, CALL_FUNCTION_VECTOR); | 140 | send_IPI_mask(mask, CALL_FUNCTION_VECTOR); |
239 | |||
240 | /* Wait for response */ | ||
241 | while (atomic_read(&data.started) != cpus) | ||
242 | cpu_relax(); | ||
243 | |||
244 | if (wait) | ||
245 | while (atomic_read(&data.finished) != cpus) | ||
246 | cpu_relax(); | ||
247 | spin_unlock(&call_lock); | ||
248 | |||
249 | return 0; | ||
250 | } | 141 | } |
251 | 142 | ||
252 | static void stop_this_cpu(void *dummy) | 143 | static void stop_this_cpu(void *dummy) |
@@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy) | |||
268 | 159 | ||
269 | static void native_smp_send_stop(void) | 160 | static void native_smp_send_stop(void) |
270 | { | 161 | { |
271 | int nolock; | ||
272 | unsigned long flags; | 162 | unsigned long flags; |
273 | 163 | ||
274 | if (reboot_force) | 164 | if (reboot_force) |
275 | return; | 165 | return; |
276 | 166 | ||
277 | /* Don't deadlock on the call lock in panic */ | 167 | smp_call_function(stop_this_cpu, NULL, 0); |
278 | nolock = !spin_trylock(&call_lock); | ||
279 | local_irq_save(flags); | 168 | local_irq_save(flags); |
280 | __smp_call_function(stop_this_cpu, NULL, 0, 0); | ||
281 | if (!nolock) | ||
282 | spin_unlock(&call_lock); | ||
283 | disable_local_APIC(); | 169 | disable_local_APIC(); |
284 | local_irq_restore(flags); | 170 | local_irq_restore(flags); |
285 | } | 171 | } |
@@ -301,33 +187,28 @@ void smp_reschedule_interrupt(struct pt_regs *regs) | |||
301 | 187 | ||
302 | void smp_call_function_interrupt(struct pt_regs *regs) | 188 | void smp_call_function_interrupt(struct pt_regs *regs) |
303 | { | 189 | { |
304 | void (*func) (void *info) = call_data->func; | ||
305 | void *info = call_data->info; | ||
306 | int wait = call_data->wait; | ||
307 | |||
308 | ack_APIC_irq(); | 190 | ack_APIC_irq(); |
309 | /* | ||
310 | * Notify initiating CPU that I've grabbed the data and am | ||
311 | * about to execute the function | ||
312 | */ | ||
313 | mb(); | ||
314 | atomic_inc(&call_data->started); | ||
315 | /* | ||
316 | * At this point the info structure may be out of scope unless wait==1 | ||
317 | */ | ||
318 | irq_enter(); | 191 | irq_enter(); |
319 | (*func)(info); | 192 | generic_smp_call_function_interrupt(); |
320 | #ifdef CONFIG_X86_32 | 193 | #ifdef CONFIG_X86_32 |
321 | __get_cpu_var(irq_stat).irq_call_count++; | 194 | __get_cpu_var(irq_stat).irq_call_count++; |
322 | #else | 195 | #else |
323 | add_pda(irq_call_count, 1); | 196 | add_pda(irq_call_count, 1); |
324 | #endif | 197 | #endif |
325 | irq_exit(); | 198 | irq_exit(); |
199 | } | ||
326 | 200 | ||
327 | if (wait) { | 201 | void smp_call_function_single_interrupt(struct pt_regs *regs) |
328 | mb(); | 202 | { |
329 | atomic_inc(&call_data->finished); | 203 | ack_APIC_irq(); |
330 | } | 204 | irq_enter(); |
205 | generic_smp_call_function_single_interrupt(); | ||
206 | #ifdef CONFIG_X86_32 | ||
207 | __get_cpu_var(irq_stat).irq_call_count++; | ||
208 | #else | ||
209 | add_pda(irq_call_count, 1); | ||
210 | #endif | ||
211 | irq_exit(); | ||
331 | } | 212 | } |
332 | 213 | ||
333 | struct smp_ops smp_ops = { | 214 | struct smp_ops smp_ops = { |
@@ -338,7 +219,8 @@ struct smp_ops smp_ops = { | |||
338 | 219 | ||
339 | .smp_send_stop = native_smp_send_stop, | 220 | .smp_send_stop = native_smp_send_stop, |
340 | .smp_send_reschedule = native_smp_send_reschedule, | 221 | .smp_send_reschedule = native_smp_send_reschedule, |
341 | .smp_call_function_mask = native_smp_call_function_mask, | 222 | |
223 | .send_call_func_ipi = native_send_call_func_ipi, | ||
224 | .send_call_func_single_ipi = native_send_call_func_single_ipi, | ||
342 | }; | 225 | }; |
343 | EXPORT_SYMBOL_GPL(smp_ops); | 226 | EXPORT_SYMBOL_GPL(smp_ops); |
344 | |||
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 56078d61c793..332512767f4f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -59,7 +59,6 @@ | |||
59 | #include <asm/pgtable.h> | 59 | #include <asm/pgtable.h> |
60 | #include <asm/tlbflush.h> | 60 | #include <asm/tlbflush.h> |
61 | #include <asm/mtrr.h> | 61 | #include <asm/mtrr.h> |
62 | #include <asm/nmi.h> | ||
63 | #include <asm/vmi.h> | 62 | #include <asm/vmi.h> |
64 | #include <asm/genapic.h> | 63 | #include <asm/genapic.h> |
65 | #include <linux/mc146818rtc.h> | 64 | #include <linux/mc146818rtc.h> |
@@ -68,22 +67,6 @@ | |||
68 | #include <mach_wakecpu.h> | 67 | #include <mach_wakecpu.h> |
69 | #include <smpboot_hooks.h> | 68 | #include <smpboot_hooks.h> |
70 | 69 | ||
71 | /* | ||
72 | * FIXME: For x86_64, those are defined in other files. But moving them here, | ||
73 | * would make the setup areas dependent on smp, which is a loss. When we | ||
74 | * integrate apic between arches, we can probably do a better job, but | ||
75 | * right now, they'll stay here -- glommer | ||
76 | */ | ||
77 | |||
78 | /* which logical CPU number maps to which CPU (physical APIC ID) */ | ||
79 | u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = | ||
80 | { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
81 | void *x86_cpu_to_apicid_early_ptr; | ||
82 | |||
83 | u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata | ||
84 | = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
85 | void *x86_bios_cpu_apicid_early_ptr; | ||
86 | |||
87 | #ifdef CONFIG_X86_32 | 70 | #ifdef CONFIG_X86_32 |
88 | u8 apicid_2_node[MAX_APICID]; | 71 | u8 apicid_2_node[MAX_APICID]; |
89 | static int low_mappings; | 72 | static int low_mappings; |
@@ -198,13 +181,12 @@ static void map_cpu_to_logical_apicid(void) | |||
198 | map_cpu_to_node(cpu, node); | 181 | map_cpu_to_node(cpu, node); |
199 | } | 182 | } |
200 | 183 | ||
201 | static void unmap_cpu_to_logical_apicid(int cpu) | 184 | void numa_remove_cpu(int cpu) |
202 | { | 185 | { |
203 | cpu_2_logical_apicid[cpu] = BAD_APICID; | 186 | cpu_2_logical_apicid[cpu] = BAD_APICID; |
204 | unmap_cpu_to_node(cpu); | 187 | unmap_cpu_to_node(cpu); |
205 | } | 188 | } |
206 | #else | 189 | #else |
207 | #define unmap_cpu_to_logical_apicid(cpu) do {} while (0) | ||
208 | #define map_cpu_to_logical_apicid() do {} while (0) | 190 | #define map_cpu_to_logical_apicid() do {} while (0) |
209 | #endif | 191 | #endif |
210 | 192 | ||
@@ -234,7 +216,7 @@ static void __cpuinit smp_callin(void) | |||
234 | panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, | 216 | panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, |
235 | phys_id, cpuid); | 217 | phys_id, cpuid); |
236 | } | 218 | } |
237 | Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); | 219 | pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); |
238 | 220 | ||
239 | /* | 221 | /* |
240 | * STARTUP IPIs are fragile beasts as they might sometimes | 222 | * STARTUP IPIs are fragile beasts as they might sometimes |
@@ -269,7 +251,7 @@ static void __cpuinit smp_callin(void) | |||
269 | * boards) | 251 | * boards) |
270 | */ | 252 | */ |
271 | 253 | ||
272 | Dprintk("CALLIN, before setup_local_APIC().\n"); | 254 | pr_debug("CALLIN, before setup_local_APIC().\n"); |
273 | smp_callin_clear_local_apic(); | 255 | smp_callin_clear_local_apic(); |
274 | setup_local_APIC(); | 256 | setup_local_APIC(); |
275 | end_local_APIC_setup(); | 257 | end_local_APIC_setup(); |
@@ -284,7 +266,7 @@ static void __cpuinit smp_callin(void) | |||
284 | local_irq_enable(); | 266 | local_irq_enable(); |
285 | calibrate_delay(); | 267 | calibrate_delay(); |
286 | local_irq_disable(); | 268 | local_irq_disable(); |
287 | Dprintk("Stack at about %p\n", &cpuid); | 269 | pr_debug("Stack at about %p\n", &cpuid); |
288 | 270 | ||
289 | /* | 271 | /* |
290 | * Save our processor parameters | 272 | * Save our processor parameters |
@@ -345,19 +327,12 @@ static void __cpuinit start_secondary(void *unused) | |||
345 | * lock helps us to not include this cpu in a currently in progress | 327 | * lock helps us to not include this cpu in a currently in progress |
346 | * smp_call_function(). | 328 | * smp_call_function(). |
347 | */ | 329 | */ |
348 | lock_ipi_call_lock(); | 330 | ipi_call_lock_irq(); |
349 | #ifdef CONFIG_X86_64 | 331 | #ifdef CONFIG_X86_IO_APIC |
350 | spin_lock(&vector_lock); | 332 | setup_vector_irq(smp_processor_id()); |
351 | |||
352 | /* Setup the per cpu irq handling data structures */ | ||
353 | __setup_vector_irq(smp_processor_id()); | ||
354 | /* | ||
355 | * Allow the master to continue. | ||
356 | */ | ||
357 | spin_unlock(&vector_lock); | ||
358 | #endif | 333 | #endif |
359 | cpu_set(smp_processor_id(), cpu_online_map); | 334 | cpu_set(smp_processor_id(), cpu_online_map); |
360 | unlock_ipi_call_lock(); | 335 | ipi_call_unlock_irq(); |
361 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | 336 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; |
362 | 337 | ||
363 | setup_secondary_clock(); | 338 | setup_secondary_clock(); |
@@ -366,31 +341,8 @@ static void __cpuinit start_secondary(void *unused) | |||
366 | cpu_idle(); | 341 | cpu_idle(); |
367 | } | 342 | } |
368 | 343 | ||
369 | #ifdef CONFIG_X86_32 | ||
370 | /* | ||
371 | * Everything has been set up for the secondary | ||
372 | * CPUs - they just need to reload everything | ||
373 | * from the task structure | ||
374 | * This function must not return. | ||
375 | */ | ||
376 | void __devinit initialize_secondary(void) | ||
377 | { | ||
378 | /* | ||
379 | * We don't actually need to load the full TSS, | ||
380 | * basically just the stack pointer and the ip. | ||
381 | */ | ||
382 | |||
383 | asm volatile( | ||
384 | "movl %0,%%esp\n\t" | ||
385 | "jmp *%1" | ||
386 | : | ||
387 | :"m" (current->thread.sp), "m" (current->thread.ip)); | ||
388 | } | ||
389 | #endif | ||
390 | |||
391 | static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) | 344 | static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) |
392 | { | 345 | { |
393 | #ifdef CONFIG_X86_32 | ||
394 | /* | 346 | /* |
395 | * Mask B, Pentium, but not Pentium MMX | 347 | * Mask B, Pentium, but not Pentium MMX |
396 | */ | 348 | */ |
@@ -440,7 +392,6 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) | |||
440 | 392 | ||
441 | valid_k7: | 393 | valid_k7: |
442 | ; | 394 | ; |
443 | #endif | ||
444 | } | 395 | } |
445 | 396 | ||
446 | static void __cpuinit smp_checks(void) | 397 | static void __cpuinit smp_checks(void) |
@@ -487,7 +438,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) | |||
487 | cpu_set(cpu, cpu_sibling_setup_map); | 438 | cpu_set(cpu, cpu_sibling_setup_map); |
488 | 439 | ||
489 | if (smp_num_siblings > 1) { | 440 | if (smp_num_siblings > 1) { |
490 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | 441 | for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { |
491 | if (c->phys_proc_id == cpu_data(i).phys_proc_id && | 442 | if (c->phys_proc_id == cpu_data(i).phys_proc_id && |
492 | c->cpu_core_id == cpu_data(i).cpu_core_id) { | 443 | c->cpu_core_id == cpu_data(i).cpu_core_id) { |
493 | cpu_set(i, per_cpu(cpu_sibling_map, cpu)); | 444 | cpu_set(i, per_cpu(cpu_sibling_map, cpu)); |
@@ -510,7 +461,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) | |||
510 | return; | 461 | return; |
511 | } | 462 | } |
512 | 463 | ||
513 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | 464 | for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { |
514 | if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && | 465 | if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && |
515 | per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { | 466 | per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { |
516 | cpu_set(i, c->llc_shared_map); | 467 | cpu_set(i, c->llc_shared_map); |
@@ -555,23 +506,6 @@ cpumask_t cpu_coregroup_map(int cpu) | |||
555 | return c->llc_shared_map; | 506 | return c->llc_shared_map; |
556 | } | 507 | } |
557 | 508 | ||
558 | #ifdef CONFIG_X86_32 | ||
559 | /* | ||
560 | * We are called very early to get the low memory for the | ||
561 | * SMP bootup trampoline page. | ||
562 | */ | ||
563 | void __init smp_alloc_memory(void) | ||
564 | { | ||
565 | trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); | ||
566 | /* | ||
567 | * Has to be in very low memory so we can execute | ||
568 | * real-mode AP code. | ||
569 | */ | ||
570 | if (__pa(trampoline_base) >= 0x9F000) | ||
571 | BUG(); | ||
572 | } | ||
573 | #endif | ||
574 | |||
575 | static void impress_friends(void) | 509 | static void impress_friends(void) |
576 | { | 510 | { |
577 | int cpu; | 511 | int cpu; |
@@ -579,7 +513,7 @@ static void impress_friends(void) | |||
579 | /* | 513 | /* |
580 | * Allow the user to impress friends. | 514 | * Allow the user to impress friends. |
581 | */ | 515 | */ |
582 | Dprintk("Before bogomips.\n"); | 516 | pr_debug("Before bogomips.\n"); |
583 | for_each_possible_cpu(cpu) | 517 | for_each_possible_cpu(cpu) |
584 | if (cpu_isset(cpu, cpu_callout_map)) | 518 | if (cpu_isset(cpu, cpu_callout_map)) |
585 | bogosum += cpu_data(cpu).loops_per_jiffy; | 519 | bogosum += cpu_data(cpu).loops_per_jiffy; |
@@ -589,7 +523,7 @@ static void impress_friends(void) | |||
589 | bogosum/(500000/HZ), | 523 | bogosum/(500000/HZ), |
590 | (bogosum/(5000/HZ))%100); | 524 | (bogosum/(5000/HZ))%100); |
591 | 525 | ||
592 | Dprintk("Before bogocount - setting activated=1.\n"); | 526 | pr_debug("Before bogocount - setting activated=1.\n"); |
593 | } | 527 | } |
594 | 528 | ||
595 | static inline void __inquire_remote_apic(int apicid) | 529 | static inline void __inquire_remote_apic(int apicid) |
@@ -612,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid) | |||
612 | printk(KERN_CONT | 546 | printk(KERN_CONT |
613 | "a previous APIC delivery may have failed\n"); | 547 | "a previous APIC delivery may have failed\n"); |
614 | 548 | ||
615 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); | 549 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); |
616 | apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); | 550 | apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); |
617 | 551 | ||
618 | timeout = 0; | 552 | timeout = 0; |
619 | do { | 553 | do { |
@@ -645,29 +579,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) | |||
645 | int maxlvt; | 579 | int maxlvt; |
646 | 580 | ||
647 | /* Target chip */ | 581 | /* Target chip */ |
648 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); | 582 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); |
649 | 583 | ||
650 | /* Boot on the stack */ | 584 | /* Boot on the stack */ |
651 | /* Kick the second */ | 585 | /* Kick the second */ |
652 | apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); | 586 | apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); |
653 | 587 | ||
654 | Dprintk("Waiting for send to finish...\n"); | 588 | pr_debug("Waiting for send to finish...\n"); |
655 | send_status = safe_apic_wait_icr_idle(); | 589 | send_status = safe_apic_wait_icr_idle(); |
656 | 590 | ||
657 | /* | 591 | /* |
658 | * Give the other CPU some time to accept the IPI. | 592 | * Give the other CPU some time to accept the IPI. |
659 | */ | 593 | */ |
660 | udelay(200); | 594 | udelay(200); |
661 | /* | ||
662 | * Due to the Pentium erratum 3AP. | ||
663 | */ | ||
664 | maxlvt = lapic_get_maxlvt(); | 595 | maxlvt = lapic_get_maxlvt(); |
665 | if (maxlvt > 3) { | 596 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
666 | apic_read_around(APIC_SPIV); | ||
667 | apic_write(APIC_ESR, 0); | 597 | apic_write(APIC_ESR, 0); |
668 | } | ||
669 | accept_status = (apic_read(APIC_ESR) & 0xEF); | 598 | accept_status = (apic_read(APIC_ESR) & 0xEF); |
670 | Dprintk("NMI sent.\n"); | 599 | pr_debug("NMI sent.\n"); |
671 | 600 | ||
672 | if (send_status) | 601 | if (send_status) |
673 | printk(KERN_ERR "APIC never delivered???\n"); | 602 | printk(KERN_ERR "APIC never delivered???\n"); |
@@ -691,42 +620,44 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | |||
691 | return send_status; | 620 | return send_status; |
692 | } | 621 | } |
693 | 622 | ||
623 | maxlvt = lapic_get_maxlvt(); | ||
624 | |||
694 | /* | 625 | /* |
695 | * Be paranoid about clearing APIC errors. | 626 | * Be paranoid about clearing APIC errors. |
696 | */ | 627 | */ |
697 | if (APIC_INTEGRATED(apic_version[phys_apicid])) { | 628 | if (APIC_INTEGRATED(apic_version[phys_apicid])) { |
698 | apic_read_around(APIC_SPIV); | 629 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
699 | apic_write(APIC_ESR, 0); | 630 | apic_write(APIC_ESR, 0); |
700 | apic_read(APIC_ESR); | 631 | apic_read(APIC_ESR); |
701 | } | 632 | } |
702 | 633 | ||
703 | Dprintk("Asserting INIT.\n"); | 634 | pr_debug("Asserting INIT.\n"); |
704 | 635 | ||
705 | /* | 636 | /* |
706 | * Turn INIT on target chip | 637 | * Turn INIT on target chip |
707 | */ | 638 | */ |
708 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | 639 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); |
709 | 640 | ||
710 | /* | 641 | /* |
711 | * Send IPI | 642 | * Send IPI |
712 | */ | 643 | */ |
713 | apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | 644 | apic_write(APIC_ICR, |
714 | | APIC_DM_INIT); | 645 | APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); |
715 | 646 | ||
716 | Dprintk("Waiting for send to finish...\n"); | 647 | pr_debug("Waiting for send to finish...\n"); |
717 | send_status = safe_apic_wait_icr_idle(); | 648 | send_status = safe_apic_wait_icr_idle(); |
718 | 649 | ||
719 | mdelay(10); | 650 | mdelay(10); |
720 | 651 | ||
721 | Dprintk("Deasserting INIT.\n"); | 652 | pr_debug("Deasserting INIT.\n"); |
722 | 653 | ||
723 | /* Target chip */ | 654 | /* Target chip */ |
724 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | 655 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); |
725 | 656 | ||
726 | /* Send IPI */ | 657 | /* Send IPI */ |
727 | apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); | 658 | apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); |
728 | 659 | ||
729 | Dprintk("Waiting for send to finish...\n"); | 660 | pr_debug("Waiting for send to finish...\n"); |
730 | send_status = safe_apic_wait_icr_idle(); | 661 | send_status = safe_apic_wait_icr_idle(); |
731 | 662 | ||
732 | mb(); | 663 | mb(); |
@@ -748,64 +679,52 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | |||
748 | * target processor state. | 679 | * target processor state. |
749 | */ | 680 | */ |
750 | startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, | 681 | startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, |
751 | #ifdef CONFIG_X86_64 | ||
752 | (unsigned long)init_rsp); | ||
753 | #else | ||
754 | (unsigned long)stack_start.sp); | 682 | (unsigned long)stack_start.sp); |
755 | #endif | ||
756 | 683 | ||
757 | /* | 684 | /* |
758 | * Run STARTUP IPI loop. | 685 | * Run STARTUP IPI loop. |
759 | */ | 686 | */ |
760 | Dprintk("#startup loops: %d.\n", num_starts); | 687 | pr_debug("#startup loops: %d.\n", num_starts); |
761 | |||
762 | maxlvt = lapic_get_maxlvt(); | ||
763 | 688 | ||
764 | for (j = 1; j <= num_starts; j++) { | 689 | for (j = 1; j <= num_starts; j++) { |
765 | Dprintk("Sending STARTUP #%d.\n", j); | 690 | pr_debug("Sending STARTUP #%d.\n", j); |
766 | apic_read_around(APIC_SPIV); | 691 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
767 | apic_write(APIC_ESR, 0); | 692 | apic_write(APIC_ESR, 0); |
768 | apic_read(APIC_ESR); | 693 | apic_read(APIC_ESR); |
769 | Dprintk("After apic_write.\n"); | 694 | pr_debug("After apic_write.\n"); |
770 | 695 | ||
771 | /* | 696 | /* |
772 | * STARTUP IPI | 697 | * STARTUP IPI |
773 | */ | 698 | */ |
774 | 699 | ||
775 | /* Target chip */ | 700 | /* Target chip */ |
776 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | 701 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); |
777 | 702 | ||
778 | /* Boot on the stack */ | 703 | /* Boot on the stack */ |
779 | /* Kick the second */ | 704 | /* Kick the second */ |
780 | apic_write_around(APIC_ICR, APIC_DM_STARTUP | 705 | apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); |
781 | | (start_eip >> 12)); | ||
782 | 706 | ||
783 | /* | 707 | /* |
784 | * Give the other CPU some time to accept the IPI. | 708 | * Give the other CPU some time to accept the IPI. |
785 | */ | 709 | */ |
786 | udelay(300); | 710 | udelay(300); |
787 | 711 | ||
788 | Dprintk("Startup point 1.\n"); | 712 | pr_debug("Startup point 1.\n"); |
789 | 713 | ||
790 | Dprintk("Waiting for send to finish...\n"); | 714 | pr_debug("Waiting for send to finish...\n"); |
791 | send_status = safe_apic_wait_icr_idle(); | 715 | send_status = safe_apic_wait_icr_idle(); |
792 | 716 | ||
793 | /* | 717 | /* |
794 | * Give the other CPU some time to accept the IPI. | 718 | * Give the other CPU some time to accept the IPI. |
795 | */ | 719 | */ |
796 | udelay(200); | 720 | udelay(200); |
797 | /* | 721 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
798 | * Due to the Pentium erratum 3AP. | ||
799 | */ | ||
800 | if (maxlvt > 3) { | ||
801 | apic_read_around(APIC_SPIV); | ||
802 | apic_write(APIC_ESR, 0); | 722 | apic_write(APIC_ESR, 0); |
803 | } | ||
804 | accept_status = (apic_read(APIC_ESR) & 0xEF); | 723 | accept_status = (apic_read(APIC_ESR) & 0xEF); |
805 | if (send_status || accept_status) | 724 | if (send_status || accept_status) |
806 | break; | 725 | break; |
807 | } | 726 | } |
808 | Dprintk("After Startup.\n"); | 727 | pr_debug("After Startup.\n"); |
809 | 728 | ||
810 | if (send_status) | 729 | if (send_status) |
811 | printk(KERN_ERR "APIC never delivered???\n"); | 730 | printk(KERN_ERR "APIC never delivered???\n"); |
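For reference, the hunk above is the classic INIT/STARTUP wake-up protocol, with apic_write() replacing the old apic_write_around() wrapper. A minimal sketch of the sequence, assuming the same apic_write(), safe_apic_wait_icr_idle(), udelay() and mdelay() helpers that appear in the diff (status checks and the Pentium 3AP erratum handling omitted):

	/* Sketch: wake an AP at start_eip via INIT assert/deassert + STARTUP IPIs. */
	static int wake_ap_sketch(int phys_apicid, unsigned long start_eip)
	{
		int j;

		/* Assert INIT on the target chip, wait, then deassert it. */
		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
		apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
		safe_apic_wait_icr_idle();
		mdelay(10);
		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
		apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
		safe_apic_wait_icr_idle();

		/* STARTUP IPIs carry the page number of the real-mode trampoline. */
		for (j = 0; j < 2; j++) {
			apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
			apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
			udelay(300);
			safe_apic_wait_icr_idle();
			udelay(200);
		}
		return 0;
	}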
@@ -832,6 +751,45 @@ static void __cpuinit do_fork_idle(struct work_struct *work) | |||
832 | complete(&c_idle->done); | 751 | complete(&c_idle->done); |
833 | } | 752 | } |
834 | 753 | ||
754 | #ifdef CONFIG_X86_64 | ||
755 | /* | ||
756 | * Allocate node local memory for the AP pda. | ||
757 | * | ||
758 | * Must be called after the _cpu_pda pointer table is initialized. | ||
759 | */ | ||
760 | int __cpuinit get_local_pda(int cpu) | ||
761 | { | ||
762 | struct x8664_pda *oldpda, *newpda; | ||
763 | unsigned long size = sizeof(struct x8664_pda); | ||
764 | int node = cpu_to_node(cpu); | ||
765 | |||
766 | if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) | ||
767 | return 0; | ||
768 | |||
769 | oldpda = cpu_pda(cpu); | ||
770 | newpda = kmalloc_node(size, GFP_ATOMIC, node); | ||
771 | if (!newpda) { | ||
772 | printk(KERN_ERR "Could not allocate node local PDA " | ||
773 | "for CPU %d on node %d\n", cpu, node); | ||
774 | |||
775 | if (oldpda) | ||
776 | return 0; /* have a usable pda */ | ||
777 | else | ||
778 | return -1; | ||
779 | } | ||
780 | |||
781 | if (oldpda) { | ||
782 | memcpy(newpda, oldpda, size); | ||
783 | if (!after_bootmem) | ||
784 | free_bootmem((unsigned long)oldpda, size); | ||
785 | } | ||
786 | |||
787 | newpda->in_bootmem = 0; | ||
788 | cpu_pda(cpu) = newpda; | ||
789 | return 0; | ||
790 | } | ||
791 | #endif /* CONFIG_X86_64 */ | ||
792 | |||
835 | static int __cpuinit do_boot_cpu(int apicid, int cpu) | 793 | static int __cpuinit do_boot_cpu(int apicid, int cpu) |
836 | /* | 794 | /* |
837 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad | 795 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad |
@@ -848,28 +806,14 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) | |||
848 | .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), | 806 | .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), |
849 | }; | 807 | }; |
850 | INIT_WORK(&c_idle.work, do_fork_idle); | 808 | INIT_WORK(&c_idle.work, do_fork_idle); |
851 | #ifdef CONFIG_X86_64 | ||
852 | /* allocate memory for gdts of secondary cpus. Hotplug is considered */ | ||
853 | if (!cpu_gdt_descr[cpu].address && | ||
854 | !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { | ||
855 | printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); | ||
856 | return -1; | ||
857 | } | ||
858 | 809 | ||
810 | #ifdef CONFIG_X86_64 | ||
859 | /* Allocate node local memory for AP pdas */ | 811 | /* Allocate node local memory for AP pdas */ |
860 | if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { | 812 | if (cpu > 0) { |
861 | struct x8664_pda *newpda, *pda; | 813 | boot_error = get_local_pda(cpu); |
862 | int node = cpu_to_node(cpu); | 814 | if (boot_error) |
863 | pda = cpu_pda(cpu); | 815 | goto restore_state; |
864 | newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, | 816 | /* if can't get pda memory, can't start cpu */ |
865 | node); | ||
866 | if (newpda) { | ||
867 | memcpy(newpda, pda, sizeof(struct x8664_pda)); | ||
868 | cpu_pda(cpu) = newpda; | ||
869 | } else | ||
870 | printk(KERN_ERR | ||
871 | "Could not allocate node local PDA for CPU %d on node %d\n", | ||
872 | cpu, node); | ||
873 | } | 817 | } |
874 | #endif | 818 | #endif |
875 | 819 | ||
@@ -905,18 +849,15 @@ do_rest: | |||
905 | #ifdef CONFIG_X86_32 | 849 | #ifdef CONFIG_X86_32 |
906 | per_cpu(current_task, cpu) = c_idle.idle; | 850 | per_cpu(current_task, cpu) = c_idle.idle; |
907 | init_gdt(cpu); | 851 | init_gdt(cpu); |
908 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | ||
909 | c_idle.idle->thread.ip = (unsigned long) start_secondary; | ||
910 | /* Stack for startup_32 can be just as for start_secondary onwards */ | 852 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
911 | stack_start.sp = (void *) c_idle.idle->thread.sp; | ||
912 | irq_ctx_init(cpu); | 853 | irq_ctx_init(cpu); |
913 | #else | 854 | #else |
914 | cpu_pda(cpu)->pcurrent = c_idle.idle; | 855 | cpu_pda(cpu)->pcurrent = c_idle.idle; |
915 | init_rsp = c_idle.idle->thread.sp; | ||
916 | load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); | ||
917 | initial_code = (unsigned long)start_secondary; | ||
918 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); | 856 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); |
919 | #endif | 857 | #endif |
858 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | ||
859 | initial_code = (unsigned long)start_secondary; | ||
860 | stack_start.sp = (void *) c_idle.idle->thread.sp; | ||
920 | 861 | ||
921 | /* start_ip had better be page-aligned! */ | 862 | /* start_ip had better be page-aligned! */ |
922 | start_ip = setup_trampoline(); | 863 | start_ip = setup_trampoline(); |
@@ -934,7 +875,7 @@ do_rest: | |||
934 | 875 | ||
935 | if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { | 876 | if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { |
936 | 877 | ||
937 | Dprintk("Setting warm reset code and vector.\n"); | 878 | pr_debug("Setting warm reset code and vector.\n"); |
938 | 879 | ||
939 | store_NMI_vector(&nmi_high, &nmi_low); | 880 | store_NMI_vector(&nmi_high, &nmi_low); |
940 | 881 | ||
@@ -955,9 +896,9 @@ do_rest: | |||
955 | /* | 896 | /* |
956 | * allow APs to start initializing. | 897 | * allow APs to start initializing. |
957 | */ | 898 | */ |
958 | Dprintk("Before Callout %d.\n", cpu); | 899 | pr_debug("Before Callout %d.\n", cpu); |
959 | cpu_set(cpu, cpu_callout_map); | 900 | cpu_set(cpu, cpu_callout_map); |
960 | Dprintk("After Callout %d.\n", cpu); | 901 | pr_debug("After Callout %d.\n", cpu); |
961 | 902 | ||
962 | /* | 903 | /* |
963 | * Wait 5s total for a response | 904 | * Wait 5s total for a response |
@@ -970,10 +911,10 @@ do_rest: | |||
970 | 911 | ||
971 | if (cpu_isset(cpu, cpu_callin_map)) { | 912 | if (cpu_isset(cpu, cpu_callin_map)) { |
972 | /* number CPUs logically, starting from 1 (BSP is 0) */ | 913 | /* number CPUs logically, starting from 1 (BSP is 0) */ |
973 | Dprintk("OK.\n"); | 914 | pr_debug("OK.\n"); |
974 | printk(KERN_INFO "CPU%d: ", cpu); | 915 | printk(KERN_INFO "CPU%d: ", cpu); |
975 | print_cpu_info(&cpu_data(cpu)); | 916 | print_cpu_info(&cpu_data(cpu)); |
976 | Dprintk("CPU has booted.\n"); | 917 | pr_debug("CPU has booted.\n"); |
977 | } else { | 918 | } else { |
978 | boot_error = 1; | 919 | boot_error = 1; |
979 | if (*((volatile unsigned char *)trampoline_base) | 920 | if (*((volatile unsigned char *)trampoline_base) |
@@ -987,16 +928,14 @@ do_rest: | |||
987 | inquire_remote_apic(apicid); | 928 | inquire_remote_apic(apicid); |
988 | } | 929 | } |
989 | } | 930 | } |
990 | |||
991 | if (boot_error) { | ||
992 | /* Try to put things back the way they were before ... */ | ||
993 | unmap_cpu_to_logical_apicid(cpu); | ||
994 | #ifdef CONFIG_X86_64 | 931 | #ifdef CONFIG_X86_64 |
995 | clear_node_cpumask(cpu); /* was set by numa_add_cpu */ | 932 | restore_state: |
996 | #endif | 933 | #endif |
934 | if (boot_error) { | ||
935 | /* Try to put things back the way they were before ... */ | ||
936 | numa_remove_cpu(cpu); /* was set by numa_add_cpu */ | ||
997 | cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ | 937 | cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ |
998 | cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ | 938 | cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ |
999 | cpu_clear(cpu, cpu_possible_map); | ||
1000 | cpu_clear(cpu, cpu_present_map); | 939 | cpu_clear(cpu, cpu_present_map); |
1001 | per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; | 940 | per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; |
1002 | } | 941 | } |
@@ -1020,7 +959,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
1020 | 959 | ||
1021 | WARN_ON(irqs_disabled()); | 960 | WARN_ON(irqs_disabled()); |
1022 | 961 | ||
1023 | Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); | 962 | pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); |
1024 | 963 | ||
1025 | if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || | 964 | if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || |
1026 | !physid_isset(apicid, phys_cpu_present_map)) { | 965 | !physid_isset(apicid, phys_cpu_present_map)) { |
@@ -1032,7 +971,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
1032 | * Already booted CPU? | 971 | * Already booted CPU? |
1033 | */ | 972 | */ |
1034 | if (cpu_isset(cpu, cpu_callin_map)) { | 973 | if (cpu_isset(cpu, cpu_callin_map)) { |
1035 | Dprintk("do_boot_cpu %d Already started\n", cpu); | 974 | pr_debug("do_boot_cpu %d Already started\n", cpu); |
1036 | return -ENOSYS; | 975 | return -ENOSYS; |
1037 | } | 976 | } |
1038 | 977 | ||
@@ -1059,7 +998,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
1059 | err = do_boot_cpu(apicid, cpu); | 998 | err = do_boot_cpu(apicid, cpu); |
1060 | #endif | 999 | #endif |
1061 | if (err) { | 1000 | if (err) { |
1062 | Dprintk("do_boot_cpu failed %d\n", err); | 1001 | pr_debug("do_boot_cpu failed %d\n", err); |
1063 | return -EIO; | 1002 | return -EIO; |
1064 | } | 1003 | } |
1065 | 1004 | ||
@@ -1088,14 +1027,12 @@ static __init void disable_smp(void) | |||
1088 | { | 1027 | { |
1089 | cpu_present_map = cpumask_of_cpu(0); | 1028 | cpu_present_map = cpumask_of_cpu(0); |
1090 | cpu_possible_map = cpumask_of_cpu(0); | 1029 | cpu_possible_map = cpumask_of_cpu(0); |
1091 | #ifdef CONFIG_X86_32 | ||
1092 | smpboot_clear_io_apic_irqs(); | 1030 | smpboot_clear_io_apic_irqs(); |
1093 | #endif | 1031 | |
1094 | if (smp_found_config) | 1032 | if (smp_found_config) |
1095 | phys_cpu_present_map = | 1033 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); |
1096 | physid_mask_of_physid(boot_cpu_physical_apicid); | ||
1097 | else | 1034 | else |
1098 | phys_cpu_present_map = physid_mask_of_physid(0); | 1035 | physid_set_mask_of_physid(0, &phys_cpu_present_map); |
1099 | map_cpu_to_logical_apicid(); | 1036 | map_cpu_to_logical_apicid(); |
1100 | cpu_set(0, per_cpu(cpu_sibling_map, 0)); | 1037 | cpu_set(0, per_cpu(cpu_sibling_map, 0)); |
1101 | cpu_set(0, per_cpu(cpu_core_map, 0)); | 1038 | cpu_set(0, per_cpu(cpu_core_map, 0)); |
@@ -1158,12 +1095,12 @@ static int __init smp_sanity_check(unsigned max_cpus) | |||
1158 | * If SMP should be disabled, then really disable it! | 1095 | * If SMP should be disabled, then really disable it! |
1159 | */ | 1096 | */ |
1160 | if (!max_cpus) { | 1097 | if (!max_cpus) { |
1161 | printk(KERN_INFO "SMP mode deactivated," | 1098 | printk(KERN_INFO "SMP mode deactivated.\n"); |
1162 | "forcing use of dummy APIC emulation.\n"); | ||
1163 | smpboot_clear_io_apic(); | 1099 | smpboot_clear_io_apic(); |
1164 | #ifdef CONFIG_X86_32 | 1100 | |
1101 | localise_nmi_watchdog(); | ||
1102 | |||
1165 | connect_bsp_APIC(); | 1103 | connect_bsp_APIC(); |
1166 | #endif | ||
1167 | setup_local_APIC(); | 1104 | setup_local_APIC(); |
1168 | end_local_APIC_setup(); | 1105 | end_local_APIC_setup(); |
1169 | return -1; | 1106 | return -1; |
@@ -1191,7 +1128,6 @@ static void __init smp_cpu_index_default(void) | |||
1191 | void __init native_smp_prepare_cpus(unsigned int max_cpus) | 1128 | void __init native_smp_prepare_cpus(unsigned int max_cpus) |
1192 | { | 1129 | { |
1193 | preempt_disable(); | 1130 | preempt_disable(); |
1194 | nmi_watchdog_default(); | ||
1195 | smp_cpu_index_default(); | 1131 | smp_cpu_index_default(); |
1196 | current_cpu_data = boot_cpu_data; | 1132 | current_cpu_data = boot_cpu_data; |
1197 | cpu_callin_map = cpumask_of_cpu(0); | 1133 | cpu_callin_map = cpumask_of_cpu(0); |
@@ -1218,9 +1154,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1218 | } | 1154 | } |
1219 | preempt_enable(); | 1155 | preempt_enable(); |
1220 | 1156 | ||
1221 | #ifdef CONFIG_X86_32 | ||
1222 | connect_bsp_APIC(); | 1157 | connect_bsp_APIC(); |
1223 | #endif | 1158 | |
1224 | /* | 1159 | /* |
1225 | * Switch from PIC to APIC mode. | 1160 | * Switch from PIC to APIC mode. |
1226 | */ | 1161 | */ |
@@ -1258,8 +1193,8 @@ void __init native_smp_prepare_boot_cpu(void) | |||
1258 | int me = smp_processor_id(); | 1193 | int me = smp_processor_id(); |
1259 | #ifdef CONFIG_X86_32 | 1194 | #ifdef CONFIG_X86_32 |
1260 | init_gdt(me); | 1195 | init_gdt(me); |
1261 | switch_to_new_gdt(); | ||
1262 | #endif | 1196 | #endif |
1197 | switch_to_new_gdt(); | ||
1263 | /* already set me in cpu_online_map in boot_cpu_init() */ | 1198 | /* already set me in cpu_online_map in boot_cpu_init() */ |
1264 | cpu_set(me, cpu_callout_map); | 1199 | cpu_set(me, cpu_callout_map); |
1265 | per_cpu(cpu_state, me) = CPU_ONLINE; | 1200 | per_cpu(cpu_state, me) = CPU_ONLINE; |
@@ -1267,7 +1202,7 @@ void __init native_smp_prepare_boot_cpu(void) | |||
1267 | 1202 | ||
1268 | void __init native_smp_cpus_done(unsigned int max_cpus) | 1203 | void __init native_smp_cpus_done(unsigned int max_cpus) |
1269 | { | 1204 | { |
1270 | Dprintk("Boot done.\n"); | 1205 | pr_debug("Boot done.\n"); |
1271 | 1206 | ||
1272 | impress_friends(); | 1207 | impress_friends(); |
1273 | smp_checks(); | 1208 | smp_checks(); |
@@ -1279,29 +1214,12 @@ void __init native_smp_cpus_done(unsigned int max_cpus) | |||
1279 | 1214 | ||
1280 | #ifdef CONFIG_HOTPLUG_CPU | 1215 | #ifdef CONFIG_HOTPLUG_CPU |
1281 | 1216 | ||
1282 | # ifdef CONFIG_X86_32 | ||
1283 | void cpu_exit_clear(void) | ||
1284 | { | ||
1285 | int cpu = raw_smp_processor_id(); | ||
1286 | |||
1287 | idle_task_exit(); | ||
1288 | |||
1289 | cpu_uninit(); | ||
1290 | irq_ctx_exit(cpu); | ||
1291 | |||
1292 | cpu_clear(cpu, cpu_callout_map); | ||
1293 | cpu_clear(cpu, cpu_callin_map); | ||
1294 | |||
1295 | unmap_cpu_to_logical_apicid(cpu); | ||
1296 | } | ||
1297 | # endif /* CONFIG_X86_32 */ | ||
1298 | |||
1299 | static void remove_siblinginfo(int cpu) | 1217 | static void remove_siblinginfo(int cpu) |
1300 | { | 1218 | { |
1301 | int sibling; | 1219 | int sibling; |
1302 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 1220 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
1303 | 1221 | ||
1304 | for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { | 1222 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { |
1305 | cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); | 1223 | cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); |
1306 | /* | 1224 | /* |
1307 | * last thread sibling in this cpu core going down | 1225 | * last thread sibling in this cpu core going down |
@@ -1310,7 +1228,7 @@ static void remove_siblinginfo(int cpu) | |||
1310 | cpu_data(sibling).booted_cores--; | 1228 | cpu_data(sibling).booted_cores--; |
1311 | } | 1229 | } |
1312 | 1230 | ||
1313 | for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) | 1231 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) |
1314 | cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); | 1232 | cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); |
1315 | cpus_clear(per_cpu(cpu_sibling_map, cpu)); | 1233 | cpus_clear(per_cpu(cpu_sibling_map, cpu)); |
1316 | cpus_clear(per_cpu(cpu_core_map, cpu)); | 1234 | cpus_clear(per_cpu(cpu_core_map, cpu)); |
@@ -1349,12 +1267,20 @@ __init void prefill_possible_map(void) | |||
1349 | int i; | 1267 | int i; |
1350 | int possible; | 1268 | int possible; |
1351 | 1269 | ||
1270 | /* no processor from mptable or madt */ | ||
1271 | if (!num_processors) | ||
1272 | num_processors = 1; | ||
1273 | |||
1274 | #ifdef CONFIG_HOTPLUG_CPU | ||
1352 | if (additional_cpus == -1) { | 1275 | if (additional_cpus == -1) { |
1353 | if (disabled_cpus > 0) | 1276 | if (disabled_cpus > 0) |
1354 | additional_cpus = disabled_cpus; | 1277 | additional_cpus = disabled_cpus; |
1355 | else | 1278 | else |
1356 | additional_cpus = 0; | 1279 | additional_cpus = 0; |
1357 | } | 1280 | } |
1281 | #else | ||
1282 | additional_cpus = 0; | ||
1283 | #endif | ||
1358 | possible = num_processors + additional_cpus; | 1284 | possible = num_processors + additional_cpus; |
1359 | if (possible > NR_CPUS) | 1285 | if (possible > NR_CPUS) |
1360 | possible = NR_CPUS; | 1286 | possible = NR_CPUS; |
@@ -1364,18 +1290,18 @@ __init void prefill_possible_map(void) | |||
1364 | 1290 | ||
1365 | for (i = 0; i < possible; i++) | 1291 | for (i = 0; i < possible; i++) |
1366 | cpu_set(i, cpu_possible_map); | 1292 | cpu_set(i, cpu_possible_map); |
1293 | |||
1294 | nr_cpu_ids = possible; | ||
1367 | } | 1295 | } |
1368 | 1296 | ||
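A worked instance of the sizing logic above, with hypothetical firmware values, makes the hotplug arithmetic concrete:

	/* Illustrative values only: 2 enabled + 2 disabled MADT entries. */
	num_processors  = 2;
	disabled_cpus   = 2;
	additional_cpus = -1;			/* additional_cpus= not given on cmdline */

	/* CONFIG_HOTPLUG_CPU path: reserve room for the disabled entries. */
	additional_cpus = disabled_cpus;		/* -> 2 */
	possible = num_processors + additional_cpus;	/* -> 4, clamped to NR_CPUS */
	/* cpu_possible_map gets CPUs 0..3, and nr_cpu_ids becomes 4. */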
1369 | static void __ref remove_cpu_from_maps(int cpu) | 1297 | static void __ref remove_cpu_from_maps(int cpu) |
1370 | { | 1298 | { |
1371 | cpu_clear(cpu, cpu_online_map); | 1299 | cpu_clear(cpu, cpu_online_map); |
1372 | #ifdef CONFIG_X86_64 | ||
1373 | cpu_clear(cpu, cpu_callout_map); | 1300 | cpu_clear(cpu, cpu_callout_map); |
1374 | cpu_clear(cpu, cpu_callin_map); | 1301 | cpu_clear(cpu, cpu_callin_map); |
1375 | /* was set by cpu_init() */ | 1302 | /* was set by cpu_init() */ |
1376 | clear_bit(cpu, (unsigned long *)&cpu_initialized); | 1303 | cpu_clear(cpu, cpu_initialized); |
1377 | clear_node_cpumask(cpu); | 1304 | numa_remove_cpu(cpu); |
1378 | #endif | ||
1379 | } | 1305 | } |
1380 | 1306 | ||
1381 | int __cpu_disable(void) | 1307 | int __cpu_disable(void) |
@@ -1453,7 +1379,8 @@ static int __init parse_maxcpus(char *arg) | |||
1453 | { | 1379 | { |
1454 | extern unsigned int maxcpus; | 1380 | extern unsigned int maxcpus; |
1455 | 1381 | ||
1456 | maxcpus = simple_strtoul(arg, NULL, 0); | 1382 | if (arg) |
1383 | maxcpus = simple_strtoul(arg, NULL, 0); | ||
1457 | return 0; | 1384 | return 0; |
1458 | } | 1385 | } |
1459 | early_param("maxcpus", parse_maxcpus); | 1386 | early_param("maxcpus", parse_maxcpus); |
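The NULL check added above matters because early_param handlers can be invoked with a NULL argument when the option is given bare on the command line; two illustrative kernel command lines (not code):

	maxcpus=4	# arg == "4": maxcpus set to 4
	maxcpus		# arg == NULL: previously fed to simple_strtoul(), now ignored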
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 3449064d141a..99941b37eca0 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c | |||
@@ -25,59 +25,3 @@ __cpuinit void init_gdt(int cpu) | |||
25 | per_cpu(cpu_number, cpu) = cpu; | 25 | per_cpu(cpu_number, cpu) = cpu; |
26 | } | 26 | } |
27 | #endif | 27 | #endif |
28 | |||
29 | /** | ||
30 | * smp_call_function(): Run a function on all other CPUs. | ||
31 | * @func: The function to run. This must be fast and non-blocking. | ||
32 | * @info: An arbitrary pointer to pass to the function. | ||
33 | * @nonatomic: Unused. | ||
34 | * @wait: If true, wait (atomically) until function has completed on other CPUs. | ||
35 | * | ||
36 | * Returns 0 on success, else a negative status code. | ||
37 | * | ||
38 | * If @wait is true, then returns once @func has returned; otherwise | ||
39 | * it returns just before the target cpu calls @func. | ||
40 | * | ||
41 | * You must not call this function with disabled interrupts or from a | ||
42 | * hardware interrupt handler or from a bottom half handler. | ||
43 | */ | ||
44 | int smp_call_function(void (*func) (void *info), void *info, int nonatomic, | ||
45 | int wait) | ||
46 | { | ||
47 | return smp_call_function_mask(cpu_online_map, func, info, wait); | ||
48 | } | ||
49 | EXPORT_SYMBOL(smp_call_function); | ||
50 | |||
51 | /** | ||
52 | * smp_call_function_single - Run a function on a specific CPU | ||
53 | * @cpu: The target CPU. Cannot be the calling CPU. | ||
54 | * @func: The function to run. This must be fast and non-blocking. | ||
55 | * @info: An arbitrary pointer to pass to the function. | ||
56 | * @nonatomic: Unused. | ||
57 | * @wait: If true, wait until function has completed on other CPUs. | ||
58 | * | ||
59 | * Returns 0 on success, else a negative status code. | ||
60 | * | ||
61 | * If @wait is true, then returns once @func has returned; otherwise | ||
62 | * it returns just before the target cpu calls @func. | ||
63 | */ | ||
64 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | ||
65 | int nonatomic, int wait) | ||
66 | { | ||
67 | /* prevent preemption and reschedule on another processor */ | ||
68 | int ret; | ||
69 | int me = get_cpu(); | ||
70 | if (cpu == me) { | ||
71 | local_irq_disable(); | ||
72 | func(info); | ||
73 | local_irq_enable(); | ||
74 | put_cpu(); | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); | ||
79 | |||
80 | put_cpu(); | ||
81 | return ret; | ||
82 | } | ||
83 | EXPORT_SYMBOL(smp_call_function_single); | ||
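The wrappers removed above were superseded by the generic cross-CPU helpers in kernel/smp.c; a hedged usage sketch, assuming the three-argument signature the generic code carried in this merge window (the function names and the work they do are hypothetical):

	#include <linux/smp.h>

	/* Runs on each targeted CPU from IPI context: must be fast,
	 * non-blocking, and must not sleep. */
	static void drain_local_state(void *info)
	{
		/* per-cpu work here */
	}

	static void drain_all_cpus(void)
	{
		/* all other online CPUs, waiting for completion */
		smp_call_function(drain_local_state, NULL, 1);

		/* or just CPU 2 */
		smp_call_function_single(2, drain_local_state, NULL, 1);
	}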
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c deleted file mode 100644 index 8b137891791f..000000000000 --- a/arch/x86/kernel/smpcommon_32.c +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | |||
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c deleted file mode 100644 index 70e4a374b4e8..000000000000 --- a/arch/x86/kernel/srat_32.c +++ /dev/null | |||
@@ -1,358 +0,0 @@ | |||
1 | /* | ||
2 | * Some of the code in this file has been gleaned from the 64 bit | ||
3 | * discontigmem support code base. | ||
4 | * | ||
5 | * Copyright (C) 2002, IBM Corp. | ||
6 | * | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | * | ||
24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> | ||
25 | */ | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mmzone.h> | ||
29 | #include <linux/acpi.h> | ||
30 | #include <linux/nodemask.h> | ||
31 | #include <asm/srat.h> | ||
32 | #include <asm/topology.h> | ||
33 | #include <asm/smp.h> | ||
34 | |||
35 | /* | ||
36 | * proximity macros and definitions | ||
37 | */ | ||
38 | #define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ | ||
39 | #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ | ||
40 | #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) | ||
41 | #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) | ||
42 | /* bitmap length; _PXM is at most 255 */ | ||
43 | #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) | ||
44 | static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ | ||
45 | |||
46 | #define MAX_CHUNKS_PER_NODE 3 | ||
47 | #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) | ||
48 | struct node_memory_chunk_s { | ||
49 | unsigned long start_pfn; | ||
50 | unsigned long end_pfn; | ||
51 | u8 pxm; // proximity domain of node | ||
52 | u8 nid; // which cnode contains this chunk? | ||
53 | u8 bank; // which mem bank on this node | ||
54 | }; | ||
55 | static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; | ||
56 | |||
57 | static int num_memory_chunks; /* total number of memory chunks */ | ||
58 | static u8 __initdata apicid_to_pxm[MAX_APICID]; | ||
59 | |||
60 | /* Identify CPU proximity domains */ | ||
61 | static void __init parse_cpu_affinity_structure(char *p) | ||
62 | { | ||
63 | struct acpi_srat_cpu_affinity *cpu_affinity = | ||
64 | (struct acpi_srat_cpu_affinity *) p; | ||
65 | |||
66 | if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
67 | return; /* empty entry */ | ||
68 | |||
69 | /* mark this node as "seen" in node bitmap */ | ||
70 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); | ||
71 | |||
72 | apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; | ||
73 | |||
74 | printk("CPU 0x%02X in proximity domain 0x%02X\n", | ||
75 | cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * Identify memory proximity domains and hot-remove capabilities. | ||
80 | * Fill node memory chunk list structure. | ||
81 | */ | ||
82 | static void __init parse_memory_affinity_structure (char *sratp) | ||
83 | { | ||
84 | unsigned long long paddr, size; | ||
85 | unsigned long start_pfn, end_pfn; | ||
86 | u8 pxm; | ||
87 | struct node_memory_chunk_s *p, *q, *pend; | ||
88 | struct acpi_srat_mem_affinity *memory_affinity = | ||
89 | (struct acpi_srat_mem_affinity *) sratp; | ||
90 | |||
91 | if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
92 | return; /* empty entry */ | ||
93 | |||
94 | pxm = memory_affinity->proximity_domain & 0xff; | ||
95 | |||
96 | /* mark this node as "seen" in node bitmap */ | ||
97 | BMAP_SET(pxm_bitmap, pxm); | ||
98 | |||
99 | /* calculate info for memory chunk structure */ | ||
100 | paddr = memory_affinity->base_address; | ||
101 | size = memory_affinity->length; | ||
102 | |||
103 | start_pfn = paddr >> PAGE_SHIFT; | ||
104 | end_pfn = (paddr + size) >> PAGE_SHIFT; | ||
105 | |||
106 | |||
107 | if (num_memory_chunks >= MAXCHUNKS) { | ||
108 | printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", | ||
109 | size/(1024*1024), paddr); | ||
110 | return; | ||
111 | } | ||
112 | |||
113 | /* Insertion sort based on base address */ | ||
114 | pend = &node_memory_chunk[num_memory_chunks]; | ||
115 | for (p = &node_memory_chunk[0]; p < pend; p++) { | ||
116 | if (start_pfn < p->start_pfn) | ||
117 | break; | ||
118 | } | ||
119 | if (p < pend) { | ||
120 | for (q = pend; q >= p; q--) | ||
121 | *(q + 1) = *q; | ||
122 | } | ||
123 | p->start_pfn = start_pfn; | ||
124 | p->end_pfn = end_pfn; | ||
125 | p->pxm = pxm; | ||
126 | |||
127 | num_memory_chunks++; | ||
128 | |||
129 | printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", | ||
130 | start_pfn, end_pfn, | ||
131 | memory_affinity->memory_type, | ||
132 | pxm, | ||
133 | ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? | ||
134 | "enabled and removable" : "enabled" ) ); | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * The SRAT table always lists ascending addresses, so can always | ||
139 | * assume that the first "start" address that you see is the real | ||
140 | * start of the node, and that the current "end" address is after | ||
141 | * the previous one. | ||
142 | */ | ||
143 | static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) | ||
144 | { | ||
145 | /* | ||
146 | * Only add present memory as told by the e820. | ||
147 | * There is no guarantee from the SRAT that the memory it | ||
148 | * enumerates is present at boot time because it represents | ||
149 | * *possible* memory hotplug areas the same as normal RAM. | ||
150 | */ | ||
151 | if (memory_chunk->start_pfn >= max_pfn) { | ||
152 | printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", | ||
153 | memory_chunk->start_pfn, memory_chunk->end_pfn); | ||
154 | return; | ||
155 | } | ||
156 | if (memory_chunk->nid != nid) | ||
157 | return; | ||
158 | |||
159 | if (!node_has_online_mem(nid)) | ||
160 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
161 | |||
162 | if (node_start_pfn[nid] > memory_chunk->start_pfn) | ||
163 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
164 | |||
165 | if (node_end_pfn[nid] < memory_chunk->end_pfn) | ||
166 | node_end_pfn[nid] = memory_chunk->end_pfn; | ||
167 | } | ||
168 | |||
169 | /* Parse the ACPI Static Resource Affinity Table */ | ||
170 | static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) | ||
171 | { | ||
172 | u8 *start, *end, *p; | ||
173 | int i, j, nid; | ||
174 | |||
175 | start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ | ||
176 | p = start; | ||
177 | end = (u8 *)sratp + sratp->header.length; | ||
178 | |||
179 | memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ | ||
180 | memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); | ||
181 | |||
182 | num_memory_chunks = 0; | ||
183 | while (p < end) { | ||
184 | switch (*p) { | ||
185 | case ACPI_SRAT_TYPE_CPU_AFFINITY: | ||
186 | parse_cpu_affinity_structure(p); | ||
187 | break; | ||
188 | case ACPI_SRAT_TYPE_MEMORY_AFFINITY: | ||
189 | parse_memory_affinity_structure(p); | ||
190 | break; | ||
191 | default: | ||
192 | printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]); | ||
193 | break; | ||
194 | } | ||
195 | p += p[1]; | ||
196 | if (p[1] == 0) { | ||
197 | printk("acpi20_parse_srat: Entry length value is zero;" | ||
198 | " can't parse any further!\n"); | ||
199 | break; | ||
200 | } | ||
201 | } | ||
202 | |||
203 | if (num_memory_chunks == 0) { | ||
204 | printk("could not finy any ACPI SRAT memory areas.\n"); | ||
205 | goto out_fail; | ||
206 | } | ||
207 | |||
208 | /* Calculate total number of nodes in system from PXM bitmap and create | ||
209 | * a set of sequential node IDs starting at zero. (ACPI doesn't seem | ||
210 | * to specify the range of _PXM values.) | ||
211 | */ | ||
212 | /* | ||
213 | * MCD - we no longer HAVE to number nodes sequentially. PXM domain | ||
214 | * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically | ||
215 | * 32, so we will continue numbering them in this manner until MAX_NUMNODES | ||
216 | * approaches MAX_PXM_DOMAINS for i386. | ||
217 | */ | ||
218 | nodes_clear(node_online_map); | ||
219 | for (i = 0; i < MAX_PXM_DOMAINS; i++) { | ||
220 | if (BMAP_TEST(pxm_bitmap, i)) { | ||
221 | int nid = acpi_map_pxm_to_node(i); | ||
222 | node_set_online(nid); | ||
223 | } | ||
224 | } | ||
225 | BUG_ON(num_online_nodes() == 0); | ||
226 | |||
227 | /* set cnode id in memory chunk structure */ | ||
228 | for (i = 0; i < num_memory_chunks; i++) | ||
229 | node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); | ||
230 | |||
231 | printk("pxm bitmap: "); | ||
232 | for (i = 0; i < sizeof(pxm_bitmap); i++) { | ||
233 | printk("%02X ", pxm_bitmap[i]); | ||
234 | } | ||
235 | printk("\n"); | ||
236 | printk("Number of logical nodes in system = %d\n", num_online_nodes()); | ||
237 | printk("Number of memory chunks in system = %d\n", num_memory_chunks); | ||
238 | |||
239 | for (i = 0; i < MAX_APICID; i++) | ||
240 | apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); | ||
241 | |||
242 | for (j = 0; j < num_memory_chunks; j++){ | ||
243 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; | ||
244 | printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", | ||
245 | j, chunk->nid, chunk->start_pfn, chunk->end_pfn); | ||
246 | node_read_chunk(chunk->nid, chunk); | ||
247 | add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); | ||
248 | } | ||
249 | |||
250 | for_each_online_node(nid) { | ||
251 | unsigned long start = node_start_pfn[nid]; | ||
252 | unsigned long end = node_end_pfn[nid]; | ||
253 | |||
254 | memory_present(nid, start, end); | ||
255 | node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); | ||
256 | } | ||
257 | return 1; | ||
258 | out_fail: | ||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | struct acpi_static_rsdt { | ||
263 | struct acpi_table_rsdt table; | ||
264 | u32 padding[7]; /* Allow for 7 more table entries */ | ||
265 | }; | ||
266 | |||
267 | int __init get_memcfg_from_srat(void) | ||
268 | { | ||
269 | struct acpi_table_header *header = NULL; | ||
270 | struct acpi_table_rsdp *rsdp = NULL; | ||
271 | struct acpi_table_rsdt *rsdt = NULL; | ||
272 | acpi_native_uint rsdp_address = 0; | ||
273 | struct acpi_static_rsdt saved_rsdt; | ||
274 | int tables = 0; | ||
275 | int i = 0; | ||
276 | |||
277 | rsdp_address = acpi_os_get_root_pointer(); | ||
278 | if (!rsdp_address) { | ||
279 | printk("%s: System description tables not found\n", | ||
280 | __func__); | ||
281 | goto out_err; | ||
282 | } | ||
283 | |||
284 | printk("%s: assigning address to rsdp\n", __func__); | ||
285 | rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address; | ||
286 | if (!rsdp) { | ||
287 | printk("%s: Didn't find ACPI root!\n", __func__); | ||
288 | goto out_err; | ||
289 | } | ||
290 | |||
291 | printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision, | ||
292 | rsdp->oem_id); | ||
293 | |||
294 | if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) { | ||
295 | printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__); | ||
296 | goto out_err; | ||
297 | } | ||
298 | |||
299 | rsdt = (struct acpi_table_rsdt *) | ||
300 | early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); | ||
301 | |||
302 | if (!rsdt) { | ||
303 | printk(KERN_WARNING | ||
304 | "%s: ACPI: Invalid root system description tables (RSDT)\n", | ||
305 | __func__); | ||
306 | goto out_err; | ||
307 | } | ||
308 | |||
309 | header = &rsdt->header; | ||
310 | |||
311 | if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) { | ||
312 | printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); | ||
313 | goto out_err; | ||
314 | } | ||
315 | |||
316 | /* | ||
317 | * The number of tables is computed by taking the | ||
318 | * size of all entries (header size minus total | ||
319 | * size of RSDT) divided by the size of each entry | ||
320 | * (4-byte table pointers). | ||
321 | */ | ||
322 | tables = (header->length - sizeof(struct acpi_table_header)) / 4; | ||
323 | |||
324 | if (!tables) | ||
325 | goto out_err; | ||
326 | |||
327 | memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); | ||
328 | |||
329 | if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) { | ||
330 | printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", | ||
331 | saved_rsdt.table.header.length); | ||
332 | goto out_err; | ||
333 | } | ||
334 | |||
335 | printk("Begin SRAT table scan....\n"); | ||
336 | |||
337 | for (i = 0; i < tables; i++) { | ||
338 | /* Map in header, then map in full table length. */ | ||
339 | header = (struct acpi_table_header *) | ||
340 | early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); | ||
341 | if (!header) | ||
342 | break; | ||
343 | header = (struct acpi_table_header *) | ||
344 | early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); | ||
345 | if (!header) | ||
346 | break; | ||
347 | |||
348 | if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) | ||
349 | continue; | ||
350 | |||
351 | /* we've found the srat table. don't need to look at any more tables */ | ||
352 | return acpi20_parse_srat((struct acpi_table_srat *)header); | ||
353 | } | ||
354 | out_err: | ||
355 | remove_all_active_ranges(); | ||
356 | printk("failed to get NUMA memory information from SRAT table\n"); | ||
357 | return 0; | ||
358 | } | ||
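Since the deleted parser above still documents how 32-bit NUMA consumed the SRAT, a small worked example of its pfn arithmetic may help; the values are hypothetical and PAGE_SHIFT is assumed to be 12 (4 KB pages):

	/* One memory-affinity entry: 1 GB of RAM starting at 4 GB. */
	unsigned long long paddr = 0x100000000ULL;	/* base_address */
	unsigned long long size  = 0x40000000ULL;	/* length */

	unsigned long start_pfn = paddr >> PAGE_SHIFT;		/* 0x100000 */
	unsigned long end_pfn   = (paddr + size) >> PAGE_SHIFT;	/* 0x140000 */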
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index c28c342c162f..a03e7f6d90c3 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace) | |||
74 | if (trace->nr_entries < trace->max_entries) | 74 | if (trace->nr_entries < trace->max_entries) |
75 | trace->entries[trace->nr_entries++] = ULONG_MAX; | 75 | trace->entries[trace->nr_entries++] = ULONG_MAX; |
76 | } | 76 | } |
77 | EXPORT_SYMBOL_GPL(save_stack_trace); | ||
77 | 78 | ||
78 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | 79 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) |
79 | { | 80 | { |
@@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | |||
81 | if (trace->nr_entries < trace->max_entries) | 82 | if (trace->nr_entries < trace->max_entries) |
82 | trace->entries[trace->nr_entries++] = ULONG_MAX; | 83 | trace->entries[trace->nr_entries++] = ULONG_MAX; |
83 | } | 84 | } |
85 | EXPORT_SYMBOL_GPL(save_stack_trace_tsk); | ||
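With these exports, GPL modules can now capture stack traces themselves; a hedged sketch (the function name is hypothetical, the fields match struct stack_trace as used above, and the ULONG_MAX terminator is the one visible in the hunk):

	#include <linux/kernel.h>
	#include <linux/stacktrace.h>

	static void dump_my_stack(void)
	{
		unsigned long entries[16];
		struct stack_trace trace = {
			.max_entries	= 16,
			.entries	= entries,
			.skip		= 1,	/* skip dump_my_stack() itself */
		};
		unsigned int i;

		save_stack_trace(&trace);
		for (i = 0; i < trace.nr_entries; i++) {
			if (trace.entries[i] == ULONG_MAX)
				break;	/* terminator added by save_stack_trace() */
			printk(KERN_DEBUG "  [<%08lx>]\n", trace.entries[i]);
		}
	}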
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 92c20fee6781..e8b9863ef8c4 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c | |||
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) | |||
105 | static int enable_single_step(struct task_struct *child) | 105 | static int enable_single_step(struct task_struct *child) |
106 | { | 106 | { |
107 | struct pt_regs *regs = task_pt_regs(child); | 107 | struct pt_regs *regs = task_pt_regs(child); |
108 | unsigned long oflags; | ||
109 | |||
110 | /* | ||
111 | * If we stepped into a sysenter/syscall insn, it trapped in | ||
112 | * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. | ||
113 | * If user-mode had set TF itself, then it's still clear from | ||
114 | * do_debug() and we need to set it again to restore the user | ||
115 | * state so we don't wrongly set TIF_FORCED_TF below. | ||
116 | * If enable_single_step() was used last and that is what | ||
117 | * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are | ||
118 | * already set and our bookkeeping is fine. | ||
119 | */ | ||
120 | if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) | ||
121 | regs->flags |= X86_EFLAGS_TF; | ||
108 | 122 | ||
109 | /* | 123 | /* |
110 | * Always set TIF_SINGLESTEP - this guarantees that | 124 | * Always set TIF_SINGLESTEP - this guarantees that |
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child) | |||
113 | */ | 127 | */ |
114 | set_tsk_thread_flag(child, TIF_SINGLESTEP); | 128 | set_tsk_thread_flag(child, TIF_SINGLESTEP); |
115 | 129 | ||
116 | /* | 130 | oflags = regs->flags; |
117 | * If TF was already set, don't do anything else | ||
118 | */ | ||
119 | if (regs->flags & X86_EFLAGS_TF) | ||
120 | return 0; | ||
121 | 131 | ||
122 | /* Set TF on the kernel stack.. */ | 132 | /* Set TF on the kernel stack.. */ |
123 | regs->flags |= X86_EFLAGS_TF; | 133 | regs->flags |= X86_EFLAGS_TF; |
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child) | |||
126 | * ..but if TF is changed by the instruction we will trace, | 136 | * ..but if TF is changed by the instruction we will trace, |
127 | * don't mark it as being "us" that set it, so that we | 137 | * don't mark it as being "us" that set it, so that we |
128 | * won't clear it by hand later. | 138 | * won't clear it by hand later. |
139 | * | ||
140 | * Note that if we don't actually execute the popf because | ||
141 | * of a signal arriving right now or suchlike, we will lose | ||
142 | * track of the fact that it really was "us" that set it. | ||
129 | */ | 143 | */ |
130 | if (is_setting_trap_flag(child, regs)) | 144 | if (is_setting_trap_flag(child, regs)) { |
145 | clear_tsk_thread_flag(child, TIF_FORCED_TF); | ||
131 | return 0; | 146 | return 0; |
147 | } | ||
148 | |||
149 | /* | ||
150 | * If TF was already set, check whether it was us who set it. | ||
151 | * If not, we should never attempt a block step. | ||
152 | */ | ||
153 | if (oflags & X86_EFLAGS_TF) | ||
154 | return test_tsk_thread_flag(child, TIF_FORCED_TF); | ||
132 | 155 | ||
133 | set_tsk_thread_flag(child, TIF_FORCED_TF); | 156 | set_tsk_thread_flag(child, TIF_FORCED_TF); |
134 | 157 | ||
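The TF/TIF_FORCED_TF bookkeeping above exists to serve, among other users, the classic debugger stepping loop; a hedged userspace sketch of the sequence it has to account for:

	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>

	/* Step a stopped tracee one instruction; the kernel side sets
	 * TIF_SINGLESTEP and (if needed) TF, exactly the state
	 * enable_single_step() tracks above. */
	static void step_once(pid_t child)
	{
		ptrace(PTRACE_SINGLESTEP, child, NULL, NULL);
		waitpid(child, NULL, 0);	/* stops again on the debug trap */
	}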
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index ae751094eba9..d67ce5f044ba 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c | |||
@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata; | |||
36 | static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; | 36 | static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; |
37 | static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; | 37 | static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; |
38 | 38 | ||
39 | #ifndef CONFIG_X86_NUMAQ | ||
39 | static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; | 40 | static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; |
41 | #endif | ||
40 | 42 | ||
41 | static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) | 43 | static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) |
42 | { | 44 | { |
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index d2ab52cc1d6b..7066cb855a60 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c | |||
@@ -19,8 +19,8 @@ | |||
19 | #include <linux/utsname.h> | 19 | #include <linux/utsname.h> |
20 | #include <linux/ipc.h> | 20 | #include <linux/ipc.h> |
21 | 21 | ||
22 | #include <asm/uaccess.h> | 22 | #include <linux/uaccess.h> |
23 | #include <asm/unistd.h> | 23 | #include <linux/unistd.h> |
24 | 24 | ||
25 | asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, | 25 | asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, |
26 | unsigned long prot, unsigned long flags, | 26 | unsigned long prot, unsigned long flags, |
@@ -103,7 +103,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg) | |||
103 | * | 103 | * |
104 | * This is really horribly ugly. | 104 | * This is really horribly ugly. |
105 | */ | 105 | */ |
106 | asmlinkage int sys_ipc (uint call, int first, int second, | 106 | asmlinkage int sys_ipc(uint call, int first, int second, |
107 | int third, void __user *ptr, long fifth) | 107 | int third, void __user *ptr, long fifth) |
108 | { | 108 | { |
109 | int version, ret; | 109 | int version, ret; |
@@ -113,24 +113,24 @@ asmlinkage int sys_ipc (uint call, int first, int second, | |||
113 | 113 | ||
114 | switch (call) { | 114 | switch (call) { |
115 | case SEMOP: | 115 | case SEMOP: |
116 | return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); | 116 | return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); |
117 | case SEMTIMEDOP: | 117 | case SEMTIMEDOP: |
118 | return sys_semtimedop(first, (struct sembuf __user *)ptr, second, | 118 | return sys_semtimedop(first, (struct sembuf __user *)ptr, second, |
119 | (const struct timespec __user *)fifth); | 119 | (const struct timespec __user *)fifth); |
120 | 120 | ||
121 | case SEMGET: | 121 | case SEMGET: |
122 | return sys_semget (first, second, third); | 122 | return sys_semget(first, second, third); |
123 | case SEMCTL: { | 123 | case SEMCTL: { |
124 | union semun fourth; | 124 | union semun fourth; |
125 | if (!ptr) | 125 | if (!ptr) |
126 | return -EINVAL; | 126 | return -EINVAL; |
127 | if (get_user(fourth.__pad, (void __user * __user *) ptr)) | 127 | if (get_user(fourth.__pad, (void __user * __user *) ptr)) |
128 | return -EFAULT; | 128 | return -EFAULT; |
129 | return sys_semctl (first, second, third, fourth); | 129 | return sys_semctl(first, second, third, fourth); |
130 | } | 130 | } |
131 | 131 | ||
132 | case MSGSND: | 132 | case MSGSND: |
133 | return sys_msgsnd (first, (struct msgbuf __user *) ptr, | 133 | return sys_msgsnd(first, (struct msgbuf __user *) ptr, |
134 | second, third); | 134 | second, third); |
135 | case MSGRCV: | 135 | case MSGRCV: |
136 | switch (version) { | 136 | switch (version) { |
@@ -138,45 +138,45 @@ asmlinkage int sys_ipc (uint call, int first, int second, | |||
138 | struct ipc_kludge tmp; | 138 | struct ipc_kludge tmp; |
139 | if (!ptr) | 139 | if (!ptr) |
140 | return -EINVAL; | 140 | return -EINVAL; |
141 | 141 | ||
142 | if (copy_from_user(&tmp, | 142 | if (copy_from_user(&tmp, |
143 | (struct ipc_kludge __user *) ptr, | 143 | (struct ipc_kludge __user *) ptr, |
144 | sizeof (tmp))) | 144 | sizeof(tmp))) |
145 | return -EFAULT; | 145 | return -EFAULT; |
146 | return sys_msgrcv (first, tmp.msgp, second, | 146 | return sys_msgrcv(first, tmp.msgp, second, |
147 | tmp.msgtyp, third); | 147 | tmp.msgtyp, third); |
148 | } | 148 | } |
149 | default: | 149 | default: |
150 | return sys_msgrcv (first, | 150 | return sys_msgrcv(first, |
151 | (struct msgbuf __user *) ptr, | 151 | (struct msgbuf __user *) ptr, |
152 | second, fifth, third); | 152 | second, fifth, third); |
153 | } | 153 | } |
154 | case MSGGET: | 154 | case MSGGET: |
155 | return sys_msgget ((key_t) first, second); | 155 | return sys_msgget((key_t) first, second); |
156 | case MSGCTL: | 156 | case MSGCTL: |
157 | return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); | 157 | return sys_msgctl(first, second, (struct msqid_ds __user *) ptr); |
158 | 158 | ||
159 | case SHMAT: | 159 | case SHMAT: |
160 | switch (version) { | 160 | switch (version) { |
161 | default: { | 161 | default: { |
162 | ulong raddr; | 162 | ulong raddr; |
163 | ret = do_shmat (first, (char __user *) ptr, second, &raddr); | 163 | ret = do_shmat(first, (char __user *) ptr, second, &raddr); |
164 | if (ret) | 164 | if (ret) |
165 | return ret; | 165 | return ret; |
166 | return put_user (raddr, (ulong __user *) third); | 166 | return put_user(raddr, (ulong __user *) third); |
167 | } | 167 | } |
168 | case 1: /* iBCS2 emulator entry point */ | 168 | case 1: /* iBCS2 emulator entry point */ |
169 | if (!segment_eq(get_fs(), get_ds())) | 169 | if (!segment_eq(get_fs(), get_ds())) |
170 | return -EINVAL; | 170 | return -EINVAL; |
171 | /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ | 171 | /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ |
172 | return do_shmat (first, (char __user *) ptr, second, (ulong *) third); | 172 | return do_shmat(first, (char __user *) ptr, second, (ulong *) third); |
173 | } | 173 | } |
174 | case SHMDT: | 174 | case SHMDT: |
175 | return sys_shmdt ((char __user *)ptr); | 175 | return sys_shmdt((char __user *)ptr); |
176 | case SHMGET: | 176 | case SHMGET: |
177 | return sys_shmget (first, second, third); | 177 | return sys_shmget(first, second, third); |
178 | case SHMCTL: | 178 | case SHMCTL: |
179 | return sys_shmctl (first, second, | 179 | return sys_shmctl(first, second, |
180 | (struct shmid_ds __user *) ptr); | 180 | (struct shmid_ds __user *) ptr); |
181 | default: | 181 | default: |
182 | return -ENOSYS; | 182 | return -ENOSYS; |
@@ -186,28 +186,28 @@ asmlinkage int sys_ipc (uint call, int first, int second, | |||
186 | /* | 186 | /* |
187 | * Old cruft | 187 | * Old cruft |
188 | */ | 188 | */ |
189 | asmlinkage int sys_uname(struct old_utsname __user * name) | 189 | asmlinkage int sys_uname(struct old_utsname __user *name) |
190 | { | 190 | { |
191 | int err; | 191 | int err; |
192 | if (!name) | 192 | if (!name) |
193 | return -EFAULT; | 193 | return -EFAULT; |
194 | down_read(&uts_sem); | 194 | down_read(&uts_sem); |
195 | err = copy_to_user(name, utsname(), sizeof (*name)); | 195 | err = copy_to_user(name, utsname(), sizeof(*name)); |
196 | up_read(&uts_sem); | 196 | up_read(&uts_sem); |
197 | return err?-EFAULT:0; | 197 | return err ? -EFAULT : 0; |
198 | } | 198 | } |
199 | 199 | ||
200 | asmlinkage int sys_olduname(struct oldold_utsname __user * name) | 200 | asmlinkage int sys_olduname(struct oldold_utsname __user *name) |
201 | { | 201 | { |
202 | int error; | 202 | int error; |
203 | 203 | ||
204 | if (!name) | 204 | if (!name) |
205 | return -EFAULT; | 205 | return -EFAULT; |
206 | if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) | 206 | if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) |
207 | return -EFAULT; | 207 | return -EFAULT; |
208 | 208 | ||
209 | down_read(&uts_sem); | 209 | down_read(&uts_sem); |
210 | 210 | ||
211 | error = __copy_to_user(&name->sysname, &utsname()->sysname, | 211 | error = __copy_to_user(&name->sysname, &utsname()->sysname, |
212 | __OLD_UTS_LEN); | 212 | __OLD_UTS_LEN); |
213 | error |= __put_user(0, name->sysname + __OLD_UTS_LEN); | 213 | error |= __put_user(0, name->sysname + __OLD_UTS_LEN); |
@@ -223,9 +223,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name) | |||
223 | error |= __copy_to_user(&name->machine, &utsname()->machine, | 223 | error |= __copy_to_user(&name->machine, &utsname()->machine, |
224 | __OLD_UTS_LEN); | 224 | __OLD_UTS_LEN); |
225 | error |= __put_user(0, name->machine + __OLD_UTS_LEN); | 225 | error |= __put_user(0, name->machine + __OLD_UTS_LEN); |
226 | 226 | ||
227 | up_read(&uts_sem); | 227 | up_read(&uts_sem); |
228 | 228 | ||
229 | error = error ? -EFAULT : 0; | 229 | error = error ? -EFAULT : 0; |
230 | 230 | ||
231 | return error; | 231 | return error; |
@@ -241,6 +241,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]) | |||
241 | long __res; | 241 | long __res; |
242 | asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" | 242 | asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" |
243 | : "=a" (__res) | 243 | : "=a" (__res) |
244 | : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory"); | 244 | : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); |
245 | return __res; | 245 | return __res; |
246 | } | 246 | } |
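The kernel_execve() stub above is a compact illustration of the i386 syscall ABI: the syscall number goes in %eax, the first three arguments in %ebx, %ecx and %edx, the trap is int $0x80, and the return value comes back in %eax (%ebx is saved and restored because gcc may be using it as the PIC base register). A minimal userspace sketch of the same convention, not part of the patch, assuming a 32-bit x86 Linux build (gcc -m32) and using write(2), syscall 4, purely for illustration:

    /* Hedged sketch of the int $0x80 convention shown above. */
    #include <unistd.h>

    static long raw_syscall3(long nr, long a, long b, long c)
    {
        long ret;

        /* Save/restore %ebx exactly as kernel_execve() does. */
        asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
                      : "=a" (ret)
                      : "0" (nr), "ri" (a), "c" (b), "d" (c)
                      : "memory");
        return ret;
    }

    int main(void)
    {
        return raw_syscall3(4 /* __NR_write */, STDOUT_FILENO,
                            (long)"hello\n", 6) == 6 ? 0 : 1;
    }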
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff5562f5fd..d44395ff34c3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -326,3 +326,9 @@ ENTRY(sys_call_table) | |||
326 | .long sys_fallocate | 326 | .long sys_fallocate |
327 | .long sys_timerfd_settime /* 325 */ | 327 | .long sys_timerfd_settime /* 325 */ |
328 | .long sys_timerfd_gettime | 328 | .long sys_timerfd_gettime |
329 | .long sys_signalfd4 | ||
330 | .long sys_eventfd2 | ||
331 | .long sys_epoll_create1 | ||
332 | .long sys_dup3 /* 330 */ | ||
333 | .long sys_pipe2 | ||
334 | .long sys_inotify_init1 | ||
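The six entries appended above (syscalls 327-332) are the flag-taking variants added in this cycle: each keeps the behaviour of an existing call but adds a flags argument so properties like close-on-exec can be set atomically at creation time. A hedged userspace sketch, assuming a glibc new enough to expose pipe2():

    /* Hedged sketch: what sys_pipe2 (entry 331 above) buys over pipe() --
     * FD_CLOEXEC is set atomically, closing the race against a
     * concurrent fork()/exec() in another thread. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fds[2];

        if (pipe2(fds, O_CLOEXEC) < 0) {
            perror("pipe2");
            return 1;
        }
        printf("fds %d,%d created with close-on-exec already set\n",
               fds[0], fds[1]);
        close(fds[0]);
        close(fds[1]);
        return 0;
    }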
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 2ff21f398934..ffe3c664afc0 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c | |||
@@ -39,9 +39,6 @@ | |||
39 | 39 | ||
40 | #include "do_timer.h" | 40 | #include "do_timer.h" |
41 | 41 | ||
42 | unsigned int cpu_khz; /* Detected as we calibrate the TSC */ | ||
43 | EXPORT_SYMBOL(cpu_khz); | ||
44 | |||
45 | int timer_ack; | 42 | int timer_ack; |
46 | 43 | ||
47 | unsigned long profile_pc(struct pt_regs *regs) | 44 | unsigned long profile_pc(struct pt_regs *regs) |
@@ -84,8 +81,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) | |||
84 | if (timer_ack) { | 81 | if (timer_ack) { |
85 | /* | 82 | /* |
86 | * Subtle, when I/O APICs are used we have to ack timer IRQ | 83 | * Subtle, when I/O APICs are used we have to ack timer IRQ |
87 | * manually to reset the IRR bit for do_slow_gettimeoffset(). | 84 | * manually to deassert NMI lines for the watchdog if run |
88 | * This will also deassert NMI lines for the watchdog if run | ||
89 | * on an 82489DX-based system. | 85 | * on an 82489DX-based system. |
90 | */ | 86 | */ |
91 | spin_lock(&i8259A_lock); | 87 | spin_lock(&i8259A_lock); |
@@ -133,6 +129,7 @@ void __init hpet_time_init(void) | |||
133 | */ | 129 | */ |
134 | void __init time_init(void) | 130 | void __init time_init(void) |
135 | { | 131 | { |
132 | pre_time_init_hook(); | ||
136 | tsc_init(); | 133 | tsc_init(); |
137 | late_time_init = choose_time_init(); | 134 | late_time_init = choose_time_init(); |
138 | } | 135 | } |
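The new pre_time_init_hook() call gives subarchitecture code a place to run board-specific setup before TSC calibration, without #ifdefs in the common path. A hedged plain-C sketch of that hook-indirection pattern; the names here are illustrative, not the kernel's:

    /* Hedged sketch: a default no-op hook that a platform file can
     * override at build time, keeping the generic init path clean. */
    #include <stdio.h>

    static void default_pre_time_init(void) { }

    /* a platform quirks file would repoint this */
    static void (*pre_time_init)(void) = default_pre_time_init;

    static void time_init_sketch(void)
    {
        pre_time_init();                /* platform runs first */
        puts("calibrating TSC...");     /* stands in for tsc_init() */
    }

    int main(void)
    {
        time_init_sketch();
        return 0;
    }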
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index c737849e2ef7..e3d49c553af2 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c | |||
@@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) | |||
56 | /* calibrate_cpu is used on systems with fixed rate TSCs to determine | 56 | /* calibrate_cpu is used on systems with fixed rate TSCs to determine |
57 | * processor frequency */ | 57 | * processor frequency */ |
58 | #define TICK_COUNT 100000000 | 58 | #define TICK_COUNT 100000000 |
59 | unsigned long __init native_calculate_cpu_khz(void) | 59 | unsigned long __init calibrate_cpu(void) |
60 | { | 60 | { |
61 | int tsc_start, tsc_now; | 61 | int tsc_start, tsc_now; |
62 | int i, no_ctr_free; | 62 | int i, no_ctr_free; |
@@ -116,23 +116,11 @@ void __init hpet_time_init(void) | |||
116 | 116 | ||
117 | void __init time_init(void) | 117 | void __init time_init(void) |
118 | { | 118 | { |
119 | tsc_calibrate(); | 119 | tsc_init(); |
120 | |||
121 | cpu_khz = tsc_khz; | ||
122 | if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && | ||
123 | (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) | ||
124 | cpu_khz = calculate_cpu_khz(); | ||
125 | |||
126 | if (unsynchronized_tsc()) | ||
127 | mark_tsc_unstable("TSCs unsynchronized"); | ||
128 | |||
129 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | 120 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) |
130 | vgetcpu_mode = VGETCPU_RDTSCP; | 121 | vgetcpu_mode = VGETCPU_RDTSCP; |
131 | else | 122 | else |
132 | vgetcpu_mode = VGETCPU_LSL; | 123 | vgetcpu_mode = VGETCPU_LSL; |
133 | 124 | ||
134 | printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", | ||
135 | cpu_khz / 1000, cpu_khz % 1000); | ||
136 | init_tsc_clocksource(); | ||
137 | late_time_init = choose_time_init(); | 125 | late_time_init = choose_time_init(); |
138 | } | 126 | } |
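time_init() on 64-bit now defers calibration to the unified tsc_init(); the renamed calibrate_cpu() remains for CPUs whose TSC ticks at a fixed rate independent of core frequency, where it runs a performance counter alongside the TSC and scales by the ratio of the two deltas. A hedged stand-in for that arithmetic only, with no MSR programming and illustrative numbers:

    /* Hedged sketch of the calibrate_cpu() idea: sample core-cycle and
     * TSC deltas over the same interval, then scale the known TSC rate
     * by their ratio to recover the real core frequency. */
    #include <stdio.h>

    static unsigned long core_khz(unsigned long tsc_khz,
                                  unsigned long long cycles_delta,
                                  unsigned long long tsc_delta)
    {
        return (unsigned long)(tsc_khz * cycles_delta / tsc_delta);
    }

    int main(void)
    {
        /* TSC fixed at 2200 MHz while the core ran at ~2600 MHz */
        printf("cpu_khz ~= %lu\n",
               core_khz(2200000, 118181818ULL, 100000000ULL));
        return 0;
    }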
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index 9bb2363851af..fec1ecedc9b7 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c | |||
@@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info) | |||
238 | 238 | ||
239 | void flush_tlb_all(void) | 239 | void flush_tlb_all(void) |
240 | { | 240 | { |
241 | on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | 241 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
242 | } | 242 | } |
243 | 243 | ||
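The only change to the 32-bit flusher is the on_each_cpu() call dropping its old fourth "retry" argument, part of a tree-wide switch to the three-parameter form. A hedged kernel-code sketch of the updated signature (func, info, wait), as a caller would use it after this series:

    /* Hedged sketch, assuming the post-change three-argument
     * on_each_cpu(func, info, wait); kernel context, not userspace. */
    #include <linux/smp.h>

    static void poke_cpu(void *info)
    {
        /* per-CPU work, executed via IPI on every online CPU */
    }

    static void poke_all_cpus(void)
    {
        on_each_cpu(poke_cpu, NULL, 1);	/* wait=1: block until done */
    }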
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index a1f07d793202..dcbf7a1159ea 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c | |||
@@ -15,6 +15,8 @@ | |||
15 | #include <asm/proto.h> | 15 | #include <asm/proto.h> |
16 | #include <asm/apicdef.h> | 16 | #include <asm/apicdef.h> |
17 | #include <asm/idle.h> | 17 | #include <asm/idle.h> |
18 | #include <asm/uv/uv_hub.h> | ||
19 | #include <asm/uv/uv_bau.h> | ||
18 | 20 | ||
19 | #include <mach_ipi.h> | 21 | #include <mach_ipi.h> |
20 | /* | 22 | /* |
@@ -162,6 +164,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, | |||
162 | union smp_flush_state *f; | 164 | union smp_flush_state *f; |
163 | cpumask_t cpumask = *cpumaskp; | 165 | cpumask_t cpumask = *cpumaskp; |
164 | 166 | ||
167 | if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va)) | ||
168 | return; | ||
169 | |||
165 | /* Caller has disabled preemption */ | 170 | /* Caller has disabled preemption */ |
166 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | 171 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; |
167 | f = &per_cpu(flush_state, sender); | 172 | f = &per_cpu(flush_state, sender); |
@@ -270,5 +275,5 @@ static void do_flush_tlb_all(void *info) | |||
270 | 275 | ||
271 | void flush_tlb_all(void) | 276 | void flush_tlb_all(void) |
272 | { | 277 | { |
273 | on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | 278 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
274 | } | 279 | } |
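The two lines added to native_flush_tlb_others() implement a try-the-fast-path-first dispatch: on a UV system, if uv_flush_tlb_others() reports that the Broadcast Assist Unit covered every remote CPU, the function returns before touching the IPI machinery; otherwise it falls through with the (possibly trimmed) mask. A hedged stand-in for that shape, with illustrative names:

    /* Hedged sketch of the dispatch above: hardware broadcast first,
     * generic IPI shootdown only for what the fast path left over. */
    static int hw_broadcast_flush(unsigned long *maskp)
    {
        /* stands in for uv_flush_tlb_others(): returns 1 when all
         * remote CPUs were flushed, else 0 with *maskp trimmed to
         * the CPUs still needing the slow path */
        return 0;
    }

    static void ipi_flush(unsigned long mask)
    {
        /* stands in for the send_IPI_mask()-based path */
    }

    static void flush_others(unsigned long mask)
    {
        if (hw_broadcast_flush(&mask))
            return;         /* fast path did it all */
        ipi_flush(mask);    /* finish the stragglers */
    }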
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c new file mode 100644 index 000000000000..d0fbb7712ab0 --- /dev/null +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -0,0 +1,792 @@ | |||
1 | /* | ||
2 | * SGI UltraViolet TLB flush routines. | ||
3 | * | ||
4 | * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. | ||
5 | * | ||
6 | * This code is released under the GNU General Public License version 2 or | ||
7 | * later. | ||
8 | */ | ||
9 | #include <linux/mc146818rtc.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/kernel.h> | ||
12 | |||
13 | #include <asm/mmu_context.h> | ||
14 | #include <asm/uv/uv_mmrs.h> | ||
15 | #include <asm/uv/uv_hub.h> | ||
16 | #include <asm/uv/uv_bau.h> | ||
17 | #include <asm/genapic.h> | ||
18 | #include <asm/idle.h> | ||
19 | #include <asm/tsc.h> | ||
20 | |||
21 | #include <mach_apic.h> | ||
22 | |||
23 | static struct bau_control **uv_bau_table_bases __read_mostly; | ||
24 | static int uv_bau_retry_limit __read_mostly; | ||
25 | |||
26 | /* position of pnode (which is nasid>>1): */ | ||
27 | static int uv_nshift __read_mostly; | ||
28 | |||
29 | static unsigned long uv_mmask __read_mostly; | ||
30 | |||
31 | static DEFINE_PER_CPU(struct ptc_stats, ptcstats); | ||
32 | static DEFINE_PER_CPU(struct bau_control, bau_control); | ||
33 | |||
34 | /* | ||
35 | * Free a software acknowledge hardware resource by clearing its Pending | ||
36 | * bit. This will return a reply to the sender. | ||
37 | * If the message has timed out, a reply has already been sent by the | ||
38 | * hardware but the resource has not been released. In that case our | ||
39 | * clear of the Timeout bit (as well) will free the resource. No reply will | ||
40 | * be sent (the hardware will only do one reply per message). | ||
41 | */ | ||
42 | static void uv_reply_to_message(int resource, | ||
43 | struct bau_payload_queue_entry *msg, | ||
44 | struct bau_msg_status *msp) | ||
45 | { | ||
46 | unsigned long dw; | ||
47 | |||
48 | dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); | ||
49 | msg->replied_to = 1; | ||
50 | msg->sw_ack_vector = 0; | ||
51 | if (msp) | ||
52 | msp->seen_by.bits = 0; | ||
53 | uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * Do all the things a cpu should do for a TLB shootdown message. | ||
58 | * Other cpus may come here at the same time for this message. | ||
59 | */ | ||
60 | static void uv_bau_process_message(struct bau_payload_queue_entry *msg, | ||
61 | int msg_slot, int sw_ack_slot) | ||
62 | { | ||
63 | unsigned long this_cpu_mask; | ||
64 | struct bau_msg_status *msp; | ||
65 | int cpu; | ||
66 | |||
67 | msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; | ||
68 | cpu = uv_blade_processor_id(); | ||
69 | msg->number_of_cpus = | ||
70 | uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); | ||
71 | this_cpu_mask = 1UL << cpu; | ||
72 | if (msp->seen_by.bits & this_cpu_mask) | ||
73 | return; | ||
74 | atomic_or_long(&msp->seen_by.bits, this_cpu_mask); | ||
75 | |||
76 | if (msg->replied_to == 1) | ||
77 | return; | ||
78 | |||
79 | if (msg->address == TLB_FLUSH_ALL) { | ||
80 | local_flush_tlb(); | ||
81 | __get_cpu_var(ptcstats).alltlb++; | ||
82 | } else { | ||
83 | __flush_tlb_one(msg->address); | ||
84 | __get_cpu_var(ptcstats).onetlb++; | ||
85 | } | ||
86 | |||
87 | __get_cpu_var(ptcstats).requestee++; | ||
88 | |||
89 | atomic_inc_short(&msg->acknowledge_count); | ||
90 | if (msg->number_of_cpus == msg->acknowledge_count) | ||
91 | uv_reply_to_message(sw_ack_slot, msg, msp); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Examine the payload queue on one distribution node to see | ||
96 | * which messages have not been seen, and which cpu(s) have not seen them. | ||
97 | * | ||
98 | * Returns the number of cpus that have not responded. | ||
99 | */ | ||
100 | static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) | ||
101 | { | ||
102 | struct bau_payload_queue_entry *msg; | ||
103 | struct bau_msg_status *msp; | ||
104 | int count = 0; | ||
105 | int i; | ||
106 | int j; | ||
107 | |||
108 | for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; | ||
109 | msg++, i++) { | ||
110 | if ((msg->sending_cpu == sender) && (!msg->replied_to)) { | ||
111 | msp = bau_tablesp->msg_statuses + i; | ||
112 | printk(KERN_DEBUG | ||
113 | "blade %d: address:%#lx %d of %d, not cpu(s): ", | ||
114 | i, msg->address, msg->acknowledge_count, | ||
115 | msg->number_of_cpus); | ||
116 | for (j = 0; j < msg->number_of_cpus; j++) { | ||
117 | if (!((1L << j) & msp->seen_by.bits)) { | ||
118 | count++; | ||
119 | printk("%d ", j); | ||
120 | } | ||
121 | } | ||
122 | printk("\n"); | ||
123 | } | ||
124 | } | ||
125 | return count; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * Examine the payload queue on all the distribution nodes to see | ||
130 | * which messages have not been seen, and which cpu(s) have not seen them. | ||
131 | * | ||
132 | * Returns the number of cpus that have not responded. | ||
133 | */ | ||
134 | static int uv_examine_destinations(struct bau_target_nodemask *distribution) | ||
135 | { | ||
136 | int sender; | ||
137 | int i; | ||
138 | int count = 0; | ||
139 | |||
140 | sender = smp_processor_id(); | ||
141 | for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { | ||
142 | if (!bau_node_isset(i, distribution)) | ||
143 | continue; | ||
144 | count += uv_examine_destination(uv_bau_table_bases[i], sender); | ||
145 | } | ||
146 | return count; | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * wait for completion of a broadcast message | ||
151 | * | ||
152 | * return COMPLETE, RETRY or GIVEUP | ||
153 | */ | ||
154 | static int uv_wait_completion(struct bau_desc *bau_desc, | ||
155 | unsigned long mmr_offset, int right_shift) | ||
156 | { | ||
157 | int exams = 0; | ||
158 | long destination_timeouts = 0; | ||
159 | long source_timeouts = 0; | ||
160 | unsigned long descriptor_status; | ||
161 | |||
162 | while ((descriptor_status = (((unsigned long) | ||
163 | uv_read_local_mmr(mmr_offset) >> | ||
164 | right_shift) & UV_ACT_STATUS_MASK)) != | ||
165 | DESC_STATUS_IDLE) { | ||
166 | if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { | ||
167 | source_timeouts++; | ||
168 | if (source_timeouts > SOURCE_TIMEOUT_LIMIT) | ||
169 | source_timeouts = 0; | ||
170 | __get_cpu_var(ptcstats).s_retry++; | ||
171 | return FLUSH_RETRY; | ||
172 | } | ||
173 | /* | ||
174 | * spin here looking for progress at the destinations | ||
175 | */ | ||
176 | if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { | ||
177 | destination_timeouts++; | ||
178 | if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { | ||
179 | /* | ||
180 | * returns number of cpus not responding | ||
181 | */ | ||
182 | if (uv_examine_destinations | ||
183 | (&bau_desc->distribution) == 0) { | ||
184 | __get_cpu_var(ptcstats).d_retry++; | ||
185 | return FLUSH_RETRY; | ||
186 | } | ||
187 | exams++; | ||
188 | if (exams >= uv_bau_retry_limit) { | ||
189 | printk(KERN_DEBUG | ||
190 | "uv_flush_tlb_others"); | ||
191 | printk("giving up on cpu %d\n", | ||
192 | smp_processor_id()); | ||
193 | return FLUSH_GIVEUP; | ||
194 | } | ||
195 | /* | ||
196 | * delays can hang the simulator | ||
197 | udelay(1000); | ||
198 | */ | ||
199 | destination_timeouts = 0; | ||
200 | } | ||
201 | } | ||
202 | } | ||
203 | return FLUSH_COMPLETE; | ||
204 | } | ||
205 | |||
206 | /** | ||
207 | * uv_flush_send_and_wait | ||
208 | * | ||
209 | * Send a broadcast and wait for a broadcast message to complete. | ||
210 | * | ||
211 | * The cpumaskp mask contains the cpus the broadcast was sent to. | ||
212 | * | ||
213 | * Returns 1 if all remote flushing was done. The mask is zeroed. | ||
214 | * Returns 0 if some remote flushing remains to be done. The mask is left | ||
215 | * unchanged. | ||
216 | */ | ||
217 | int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, | ||
218 | cpumask_t *cpumaskp) | ||
219 | { | ||
220 | int completion_status = 0; | ||
221 | int right_shift; | ||
222 | int tries = 0; | ||
223 | int blade; | ||
224 | int bit; | ||
225 | unsigned long mmr_offset; | ||
226 | unsigned long index; | ||
227 | cycles_t time1; | ||
228 | cycles_t time2; | ||
229 | |||
230 | if (cpu < UV_CPUS_PER_ACT_STATUS) { | ||
231 | mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; | ||
232 | right_shift = cpu * UV_ACT_STATUS_SIZE; | ||
233 | } else { | ||
234 | mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; | ||
235 | right_shift = | ||
236 | ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); | ||
237 | } | ||
238 | time1 = get_cycles(); | ||
239 | do { | ||
240 | tries++; | ||
241 | index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | | ||
242 | cpu; | ||
243 | uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); | ||
244 | completion_status = uv_wait_completion(bau_desc, mmr_offset, | ||
245 | right_shift); | ||
246 | } while (completion_status == FLUSH_RETRY); | ||
247 | time2 = get_cycles(); | ||
248 | __get_cpu_var(ptcstats).sflush += (time2 - time1); | ||
249 | if (tries > 1) | ||
250 | __get_cpu_var(ptcstats).retriesok++; | ||
251 | |||
252 | if (completion_status == FLUSH_GIVEUP) { | ||
253 | /* | ||
254 | * Cause the caller to do an IPI-style TLB shootdown on | ||
255 | * the cpus, all of which are still in the mask. | ||
256 | */ | ||
257 | __get_cpu_var(ptcstats).ptc_i++; | ||
258 | return 0; | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * Success, so clear the remote cpus from the mask so we don't | ||
263 | * use the IPI method of shootdown on them. | ||
264 | */ | ||
265 | for_each_cpu_mask(bit, *cpumaskp) { | ||
266 | blade = uv_cpu_to_blade_id(bit); | ||
267 | if (blade == this_blade) | ||
268 | continue; | ||
269 | cpu_clear(bit, *cpumaskp); | ||
270 | } | ||
271 | if (!cpus_empty(*cpumaskp)) | ||
272 | return 0; | ||
273 | return 1; | ||
274 | } | ||
275 | |||
276 | /** | ||
277 | * uv_flush_tlb_others - globally purge translation cache of a virtual | ||
278 | * address or all TLBs | ||
279 | * @cpumaskp: mask of all cpus in which the address is to be removed | ||
280 | * @mm: mm_struct containing virtual address range | ||
281 | * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLBs on cpu) | ||
282 | * | ||
283 | * This is the entry point for initiating any UV global TLB shootdown. | ||
284 | * | ||
285 | * Purges the translation caches of all specified processors of the given | ||
286 | * virtual address, or purges all TLBs on specified processors. | ||
287 | * | ||
288 | * The caller has derived the cpumaskp from the mm_struct and has subtracted | ||
289 | * the local cpu from the mask. This function is called only if there | ||
290 | * are bits set in the mask. (e.g. flush_tlb_page()) | ||
291 | * | ||
292 | * The cpumaskp is converted into a nodemask of the nodes containing | ||
293 | * the cpus. | ||
294 | * | ||
295 | * Returns 1 if all remote flushing was done. | ||
296 | * Returns 0 if some remote flushing remains to be done. | ||
297 | */ | ||
298 | int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, | ||
299 | unsigned long va) | ||
300 | { | ||
301 | int i; | ||
302 | int bit; | ||
303 | int blade; | ||
304 | int cpu; | ||
305 | int this_blade; | ||
306 | int locals = 0; | ||
307 | struct bau_desc *bau_desc; | ||
308 | |||
309 | cpu = uv_blade_processor_id(); | ||
310 | this_blade = uv_numa_blade_id(); | ||
311 | bau_desc = __get_cpu_var(bau_control).descriptor_base; | ||
312 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; | ||
313 | |||
314 | bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); | ||
315 | |||
316 | i = 0; | ||
317 | for_each_cpu_mask(bit, *cpumaskp) { | ||
318 | blade = uv_cpu_to_blade_id(bit); | ||
319 | BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); | ||
320 | if (blade == this_blade) { | ||
321 | locals++; | ||
322 | continue; | ||
323 | } | ||
324 | bau_node_set(blade, &bau_desc->distribution); | ||
325 | i++; | ||
326 | } | ||
327 | if (i == 0) { | ||
328 | /* | ||
329 | * no off_node flushing; return status for local node | ||
330 | */ | ||
331 | if (locals) | ||
332 | return 0; | ||
333 | else | ||
334 | return 1; | ||
335 | } | ||
336 | __get_cpu_var(ptcstats).requestor++; | ||
337 | __get_cpu_var(ptcstats).ntargeted += i; | ||
338 | |||
339 | bau_desc->payload.address = va; | ||
340 | bau_desc->payload.sending_cpu = smp_processor_id(); | ||
341 | |||
342 | return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * The BAU message interrupt comes here. (registered by set_intr_gate) | ||
347 | * See entry_64.S | ||
348 | * | ||
349 | * We received a broadcast assist message. | ||
350 | * | ||
351 | * Interrupts may have been disabled; this interrupt could represent | ||
352 | * the receipt of several messages. | ||
353 | * | ||
354 | * All cores/threads on this node get this interrupt. | ||
355 | * The last one to see it does the s/w ack. | ||
356 | * (the resource will not be freed until noninterruptible cpus see this | ||
357 | * interrupt; hardware will time out the s/w ack and reply ERROR) | ||
358 | */ | ||
359 | void uv_bau_message_interrupt(struct pt_regs *regs) | ||
360 | { | ||
361 | struct bau_payload_queue_entry *va_queue_first; | ||
362 | struct bau_payload_queue_entry *va_queue_last; | ||
363 | struct bau_payload_queue_entry *msg; | ||
364 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
365 | cycles_t time1; | ||
366 | cycles_t time2; | ||
367 | int msg_slot; | ||
368 | int sw_ack_slot; | ||
369 | int fw; | ||
370 | int count = 0; | ||
371 | unsigned long local_pnode; | ||
372 | |||
373 | ack_APIC_irq(); | ||
374 | exit_idle(); | ||
375 | irq_enter(); | ||
376 | |||
377 | time1 = get_cycles(); | ||
378 | |||
379 | local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); | ||
380 | |||
381 | va_queue_first = __get_cpu_var(bau_control).va_queue_first; | ||
382 | va_queue_last = __get_cpu_var(bau_control).va_queue_last; | ||
383 | |||
384 | msg = __get_cpu_var(bau_control).bau_msg_head; | ||
385 | while (msg->sw_ack_vector) { | ||
386 | count++; | ||
387 | fw = msg->sw_ack_vector; | ||
388 | msg_slot = msg - va_queue_first; | ||
389 | sw_ack_slot = ffs(fw) - 1; | ||
390 | |||
391 | uv_bau_process_message(msg, msg_slot, sw_ack_slot); | ||
392 | |||
393 | msg++; | ||
394 | if (msg > va_queue_last) | ||
395 | msg = va_queue_first; | ||
396 | __get_cpu_var(bau_control).bau_msg_head = msg; | ||
397 | } | ||
398 | if (!count) | ||
399 | __get_cpu_var(ptcstats).nomsg++; | ||
400 | else if (count > 1) | ||
401 | __get_cpu_var(ptcstats).multmsg++; | ||
402 | |||
403 | time2 = get_cycles(); | ||
404 | __get_cpu_var(ptcstats).dflush += (time2 - time1); | ||
405 | |||
406 | irq_exit(); | ||
407 | set_irq_regs(old_regs); | ||
408 | } | ||
409 | |||
410 | static void uv_enable_timeouts(void) | ||
411 | { | ||
412 | int i; | ||
413 | int blade; | ||
414 | int last_blade; | ||
415 | int pnode; | ||
416 | int cur_cpu = 0; | ||
417 | unsigned long apicid; | ||
418 | |||
419 | last_blade = -1; | ||
420 | for_each_online_node(i) { | ||
421 | blade = uv_node_to_blade_id(i); | ||
422 | if (blade == last_blade) | ||
423 | continue; | ||
424 | last_blade = blade; | ||
425 | apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); | ||
426 | pnode = uv_blade_to_pnode(blade); | ||
427 | cur_cpu += uv_blade_nr_possible_cpus(i); | ||
428 | } | ||
429 | } | ||
430 | |||
431 | static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) | ||
432 | { | ||
433 | if (*offset < num_possible_cpus()) | ||
434 | return offset; | ||
435 | return NULL; | ||
436 | } | ||
437 | |||
438 | static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) | ||
439 | { | ||
440 | (*offset)++; | ||
441 | if (*offset < num_possible_cpus()) | ||
442 | return offset; | ||
443 | return NULL; | ||
444 | } | ||
445 | |||
446 | static void uv_ptc_seq_stop(struct seq_file *file, void *data) | ||
447 | { | ||
448 | } | ||
449 | |||
450 | /* | ||
451 | * Display the statistics through /proc | ||
452 | * data points to the cpu number | ||
453 | */ | ||
454 | static int uv_ptc_seq_show(struct seq_file *file, void *data) | ||
455 | { | ||
456 | struct ptc_stats *stat; | ||
457 | int cpu; | ||
458 | |||
459 | cpu = *(loff_t *)data; | ||
460 | |||
461 | if (!cpu) { | ||
462 | seq_printf(file, | ||
463 | "# cpu requestor requestee one all sretry dretry ptc_i "); | ||
464 | seq_printf(file, | ||
465 | "sw_ack sflush dflush sok dnomsg dmult starget\n"); | ||
466 | } | ||
467 | if (cpu < num_possible_cpus() && cpu_online(cpu)) { | ||
468 | stat = &per_cpu(ptcstats, cpu); | ||
469 | seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", | ||
470 | cpu, stat->requestor, | ||
471 | stat->requestee, stat->onetlb, stat->alltlb, | ||
472 | stat->s_retry, stat->d_retry, stat->ptc_i); | ||
473 | seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", | ||
474 | uv_read_global_mmr64(uv_blade_to_pnode | ||
475 | (uv_cpu_to_blade_id(cpu)), | ||
476 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), | ||
477 | stat->sflush, stat->dflush, | ||
478 | stat->retriesok, stat->nomsg, | ||
479 | stat->multmsg, stat->ntargeted); | ||
480 | } | ||
481 | |||
482 | return 0; | ||
483 | } | ||
484 | |||
485 | /* | ||
486 | * 0: display meaning of the statistics | ||
487 | * >0: retry limit | ||
488 | */ | ||
489 | static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | ||
490 | size_t count, loff_t *data) | ||
491 | { | ||
492 | long newmode; | ||
493 | char optstr[64]; | ||
494 | |||
495 | if (count == 0 || count > sizeof(optstr)) | ||
496 | return -EINVAL; | ||
497 | if (copy_from_user(optstr, user, count)) | ||
498 | return -EFAULT; | ||
499 | optstr[count - 1] = '\0'; | ||
500 | if (strict_strtoul(optstr, 10, &newmode) < 0) { | ||
501 | printk(KERN_DEBUG "%s is invalid\n", optstr); | ||
502 | return -EINVAL; | ||
503 | } | ||
504 | |||
505 | if (newmode == 0) { | ||
506 | printk(KERN_DEBUG "# cpu: cpu number\n"); | ||
507 | printk(KERN_DEBUG | ||
508 | "requestor: times this cpu was the flush requestor\n"); | ||
509 | printk(KERN_DEBUG | ||
510 | "requestee: times this cpu was requested to flush its TLBs\n"); | ||
511 | printk(KERN_DEBUG | ||
512 | "one: times requested to flush a single address\n"); | ||
513 | printk(KERN_DEBUG | ||
514 | "all: times requested to flush all TLB's\n"); | ||
515 | printk(KERN_DEBUG | ||
516 | "sretry: number of retries of source-side timeouts\n"); | ||
517 | printk(KERN_DEBUG | ||
518 | "dretry: number of retries of destination-side timeouts\n"); | ||
519 | printk(KERN_DEBUG | ||
520 | "ptc_i: times UV fell through to IPI-style flushes\n"); | ||
521 | printk(KERN_DEBUG | ||
522 | "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); | ||
523 | printk(KERN_DEBUG | ||
524 | "sflush_us: cycles spent in uv_flush_tlb_others()\n"); | ||
525 | printk(KERN_DEBUG | ||
526 | "dflush_us: cycles spent in handling flush requests\n"); | ||
527 | printk(KERN_DEBUG "sok: successes on retry\n"); | ||
528 | printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); | ||
529 | printk(KERN_DEBUG | ||
530 | "dmult: interrupts with multiple messages\n"); | ||
531 | printk(KERN_DEBUG "starget: nodes targeted\n"); | ||
532 | } else { | ||
533 | uv_bau_retry_limit = newmode; | ||
534 | printk(KERN_DEBUG "timeout retry limit:%d\n", | ||
535 | uv_bau_retry_limit); | ||
536 | } | ||
537 | |||
538 | return count; | ||
539 | } | ||
540 | |||
541 | static const struct seq_operations uv_ptc_seq_ops = { | ||
542 | .start = uv_ptc_seq_start, | ||
543 | .next = uv_ptc_seq_next, | ||
544 | .stop = uv_ptc_seq_stop, | ||
545 | .show = uv_ptc_seq_show | ||
546 | }; | ||
547 | |||
548 | static int uv_ptc_proc_open(struct inode *inode, struct file *file) | ||
549 | { | ||
550 | return seq_open(file, &uv_ptc_seq_ops); | ||
551 | } | ||
552 | |||
553 | static const struct file_operations proc_uv_ptc_operations = { | ||
554 | .open = uv_ptc_proc_open, | ||
555 | .read = seq_read, | ||
556 | .write = uv_ptc_proc_write, | ||
557 | .llseek = seq_lseek, | ||
558 | .release = seq_release, | ||
559 | }; | ||
560 | |||
561 | static int __init uv_ptc_init(void) | ||
562 | { | ||
563 | struct proc_dir_entry *proc_uv_ptc; | ||
564 | |||
565 | if (!is_uv_system()) | ||
566 | return 0; | ||
567 | |||
568 | if (!proc_mkdir("sgi_uv", NULL)) | ||
569 | return -EINVAL; | ||
570 | |||
571 | proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); | ||
572 | if (!proc_uv_ptc) { | ||
573 | printk(KERN_ERR "unable to create %s proc entry\n", | ||
574 | UV_PTC_BASENAME); | ||
575 | remove_proc_entry("sgi_uv", NULL); | ||
576 | return -EINVAL; | ||
577 | } | ||
578 | proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; | ||
579 | return 0; | ||
580 | } | ||
581 | |||
582 | /* | ||
583 | * begin the initialization of the per-blade control structures | ||
584 | */ | ||
585 | static struct bau_control * __init uv_table_bases_init(int blade, int node) | ||
586 | { | ||
587 | int i; | ||
588 | int *ip; | ||
589 | struct bau_msg_status *msp; | ||
590 | struct bau_control *bau_tabp; | ||
591 | |||
592 | bau_tabp = | ||
593 | kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); | ||
594 | BUG_ON(!bau_tabp); | ||
595 | |||
596 | bau_tabp->msg_statuses = | ||
597 | kmalloc_node(sizeof(struct bau_msg_status) * | ||
598 | DEST_Q_SIZE, GFP_KERNEL, node); | ||
599 | BUG_ON(!bau_tabp->msg_statuses); | ||
600 | |||
601 | for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) | ||
602 | bau_cpubits_clear(&msp->seen_by, (int) | ||
603 | uv_blade_nr_possible_cpus(blade)); | ||
604 | |||
605 | bau_tabp->watching = | ||
606 | kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node); | ||
607 | BUG_ON(!bau_tabp->watching); | ||
608 | |||
609 | for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) | ||
610 | *ip = 0; | ||
611 | |||
612 | uv_bau_table_bases[blade] = bau_tabp; | ||
613 | |||
614 | return bau_tabp; | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * finish the initialization of the per-blade control structures | ||
619 | */ | ||
620 | static void __init | ||
621 | uv_table_bases_finish(int blade, int node, int cur_cpu, | ||
622 | struct bau_control *bau_tablesp, | ||
623 | struct bau_desc *adp) | ||
624 | { | ||
625 | struct bau_control *bcp; | ||
626 | int i; | ||
627 | |||
628 | for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) { | ||
629 | bcp = (struct bau_control *)&per_cpu(bau_control, i); | ||
630 | |||
631 | bcp->bau_msg_head = bau_tablesp->va_queue_first; | ||
632 | bcp->va_queue_first = bau_tablesp->va_queue_first; | ||
633 | bcp->va_queue_last = bau_tablesp->va_queue_last; | ||
634 | bcp->watching = bau_tablesp->watching; | ||
635 | bcp->msg_statuses = bau_tablesp->msg_statuses; | ||
636 | bcp->descriptor_base = adp; | ||
637 | } | ||
638 | } | ||
639 | |||
640 | /* | ||
641 | * initialize the sending side's sending buffers | ||
642 | */ | ||
643 | static struct bau_desc * __init | ||
644 | uv_activation_descriptor_init(int node, int pnode) | ||
645 | { | ||
646 | int i; | ||
647 | unsigned long pa; | ||
648 | unsigned long m; | ||
649 | unsigned long n; | ||
650 | unsigned long mmr_image; | ||
651 | struct bau_desc *adp; | ||
652 | struct bau_desc *ad2; | ||
653 | |||
654 | adp = (struct bau_desc *) | ||
655 | kmalloc_node(16384, GFP_KERNEL, node); | ||
656 | BUG_ON(!adp); | ||
657 | |||
658 | pa = __pa((unsigned long)adp); | ||
659 | n = pa >> uv_nshift; | ||
660 | m = pa & uv_mmask; | ||
661 | |||
662 | mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); | ||
663 | if (mmr_image) { | ||
664 | uv_write_global_mmr64(pnode, (unsigned long) | ||
665 | UVH_LB_BAU_SB_DESCRIPTOR_BASE, | ||
666 | (n << UV_DESC_BASE_PNODE_SHIFT | m)); | ||
667 | } | ||
668 | |||
669 | for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { | ||
670 | memset(ad2, 0, sizeof(struct bau_desc)); | ||
671 | ad2->header.sw_ack_flag = 1; | ||
672 | ad2->header.base_dest_nodeid = | ||
673 | uv_blade_to_pnode(uv_cpu_to_blade_id(0)); | ||
674 | ad2->header.command = UV_NET_ENDPOINT_INTD; | ||
675 | ad2->header.int_both = 1; | ||
676 | /* | ||
677 | * all others need to be set to zero: | ||
678 | * fairness chaining multilevel count replied_to | ||
679 | */ | ||
680 | } | ||
681 | return adp; | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * initialize the destination side's receiving buffers | ||
686 | */ | ||
687 | static struct bau_payload_queue_entry * __init | ||
688 | uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) | ||
689 | { | ||
690 | struct bau_payload_queue_entry *pqp; | ||
691 | char *cp; | ||
692 | |||
693 | pqp = (struct bau_payload_queue_entry *) kmalloc_node( | ||
694 | (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), | ||
695 | GFP_KERNEL, node); | ||
696 | BUG_ON(!pqp); | ||
697 | |||
698 | cp = (char *)pqp + 31; | ||
699 | pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); | ||
700 | bau_tablesp->va_queue_first = pqp; | ||
701 | uv_write_global_mmr64(pnode, | ||
702 | UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, | ||
703 | ((unsigned long)pnode << | ||
704 | UV_PAYLOADQ_PNODE_SHIFT) | | ||
705 | uv_physnodeaddr(pqp)); | ||
706 | uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, | ||
707 | uv_physnodeaddr(pqp)); | ||
708 | bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1); | ||
709 | uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, | ||
710 | (unsigned long) | ||
711 | uv_physnodeaddr(bau_tablesp->va_queue_last)); | ||
712 | memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); | ||
713 | |||
714 | return pqp; | ||
715 | } | ||
716 | |||
717 | /* | ||
718 | * Initialization of each UV blade's structures | ||
719 | */ | ||
720 | static int __init uv_init_blade(int blade, int node, int cur_cpu) | ||
721 | { | ||
722 | int pnode; | ||
723 | unsigned long pa; | ||
724 | unsigned long apicid; | ||
725 | struct bau_desc *adp; | ||
726 | struct bau_payload_queue_entry *pqp; | ||
727 | struct bau_control *bau_tablesp; | ||
728 | |||
729 | bau_tablesp = uv_table_bases_init(blade, node); | ||
730 | pnode = uv_blade_to_pnode(blade); | ||
731 | adp = uv_activation_descriptor_init(node, pnode); | ||
732 | pqp = uv_payload_queue_init(node, pnode, bau_tablesp); | ||
733 | uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp); | ||
734 | /* | ||
735 | * the below initialization can't be in firmware because the | ||
736 | * messaging IRQ will be determined by the OS | ||
737 | */ | ||
738 | apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); | ||
739 | pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); | ||
740 | if ((pa & 0xff) != UV_BAU_MESSAGE) { | ||
741 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, | ||
742 | ((apicid << 32) | UV_BAU_MESSAGE)); | ||
743 | } | ||
744 | return 0; | ||
745 | } | ||
746 | |||
747 | /* | ||
748 | * Initialization of BAU-related structures | ||
749 | */ | ||
750 | static int __init uv_bau_init(void) | ||
751 | { | ||
752 | int blade; | ||
753 | int node; | ||
754 | int nblades; | ||
755 | int last_blade; | ||
756 | int cur_cpu = 0; | ||
757 | |||
758 | if (!is_uv_system()) | ||
759 | return 0; | ||
760 | |||
761 | uv_bau_retry_limit = 1; | ||
762 | uv_nshift = uv_hub_info->n_val; | ||
763 | uv_mmask = (1UL << uv_hub_info->n_val) - 1; | ||
764 | nblades = 0; | ||
765 | last_blade = -1; | ||
766 | for_each_online_node(node) { | ||
767 | blade = uv_node_to_blade_id(node); | ||
768 | if (blade == last_blade) | ||
769 | continue; | ||
770 | last_blade = blade; | ||
771 | nblades++; | ||
772 | } | ||
773 | uv_bau_table_bases = (struct bau_control **) | ||
774 | kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); | ||
775 | BUG_ON(!uv_bau_table_bases); | ||
776 | |||
777 | last_blade = -1; | ||
778 | for_each_online_node(node) { | ||
779 | blade = uv_node_to_blade_id(node); | ||
780 | if (blade == last_blade) | ||
781 | continue; | ||
782 | last_blade = blade; | ||
783 | uv_init_blade(blade, node, cur_cpu); | ||
784 | cur_cpu += uv_blade_nr_possible_cpus(blade); | ||
785 | } | ||
786 | set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); | ||
787 | uv_enable_timeouts(); | ||
788 | |||
789 | return 0; | ||
790 | } | ||
791 | __initcall(uv_bau_init); | ||
792 | __initcall(uv_ptc_init); | ||
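Structurally, the sender side of this new file boils down to uv_flush_send_and_wait(): write a descriptor index into the activation-control MMR, poll the activation-status MMR, resend on a source or destination timeout, and hand back FLUSH_GIVEUP (so the caller reverts to IPIs) once the retry budget is spent. A hedged plain-C stand-in for that control flow, with fake status values in place of the MMR reads:

    /* Hedged sketch of the send/poll/retry loop in
     * uv_flush_send_and_wait() + uv_wait_completion(); the "hardware"
     * here is simulated and all names are illustrative. */
    #include <stdio.h>

    enum status { ST_IDLE, ST_SRC_TIMEOUT, ST_DST_TIMEOUT };
    enum result { FLUSH_DONE, FLUSH_FAILED };

    static enum status poll_status(int attempt)
    {
        /* pretend the first two sends time out at the source */
        return attempt < 2 ? ST_SRC_TIMEOUT : ST_IDLE;
    }

    static enum result send_and_wait(int retry_limit)
    {
        int attempt;

        for (attempt = 0; attempt < retry_limit; attempt++) {
            /* write_mmr(ACTIVATION_CONTROL, index) goes here */
            if (poll_status(attempt) == ST_IDLE)
                return FLUSH_DONE;
            /* timeout: count the retry, resend the descriptor */
        }
        return FLUSH_FAILED;    /* caller falls back to IPIs */
    }

    int main(void)
    {
        printf("%s\n", send_and_wait(5) == FLUSH_DONE
                       ? "broadcast complete" : "giving up, use IPIs");
        return 0;
    }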
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index abbf199adebb..1106fac6024d 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c | |||
@@ -2,7 +2,7 @@ | |||
2 | 2 | ||
3 | #include <asm/trampoline.h> | 3 | #include <asm/trampoline.h> |
4 | 4 | ||
5 | /* ready for x86_64, no harm for x86, since it will overwrite after alloc */ | 5 | /* ready for x86_64 and x86 */ |
6 | unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); | 6 | unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); |
7 | 7 | ||
8 | /* | 8 | /* |
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 08d752de4eee..03df8e45e5a1 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c | |||
@@ -1,5 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 1991, 1992 Linus Torvalds | 2 | * Copyright (C) 1991, 1992 Linus Torvalds |
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
3 | * | 4 | * |
4 | * Pentium III FXSR, SSE support | 5 | * Pentium III FXSR, SSE support |
5 | * Gareth Hughes <gareth@valinux.com>, May 2000 | 6 | * Gareth Hughes <gareth@valinux.com>, May 2000 |
@@ -57,11 +58,10 @@ | |||
57 | #include <asm/nmi.h> | 58 | #include <asm/nmi.h> |
58 | #include <asm/smp.h> | 59 | #include <asm/smp.h> |
59 | #include <asm/io.h> | 60 | #include <asm/io.h> |
61 | #include <asm/traps.h> | ||
60 | 62 | ||
61 | #include "mach_traps.h" | 63 | #include "mach_traps.h" |
62 | 64 | ||
63 | int panic_on_unrecovered_nmi; | ||
64 | |||
65 | DECLARE_BITMAP(used_vectors, NR_VECTORS); | 65 | DECLARE_BITMAP(used_vectors, NR_VECTORS); |
66 | EXPORT_SYMBOL_GPL(used_vectors); | 66 | EXPORT_SYMBOL_GPL(used_vectors); |
67 | 67 | ||
@@ -78,39 +78,22 @@ char ignore_fpu_irq; | |||
78 | gate_desc idt_table[256] | 78 | gate_desc idt_table[256] |
79 | __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; | 79 | __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; |
80 | 80 | ||
81 | asmlinkage void divide_error(void); | 81 | int panic_on_unrecovered_nmi; |
82 | asmlinkage void debug(void); | ||
83 | asmlinkage void nmi(void); | ||
84 | asmlinkage void int3(void); | ||
85 | asmlinkage void overflow(void); | ||
86 | asmlinkage void bounds(void); | ||
87 | asmlinkage void invalid_op(void); | ||
88 | asmlinkage void device_not_available(void); | ||
89 | asmlinkage void coprocessor_segment_overrun(void); | ||
90 | asmlinkage void invalid_TSS(void); | ||
91 | asmlinkage void segment_not_present(void); | ||
92 | asmlinkage void stack_segment(void); | ||
93 | asmlinkage void general_protection(void); | ||
94 | asmlinkage void page_fault(void); | ||
95 | asmlinkage void coprocessor_error(void); | ||
96 | asmlinkage void simd_coprocessor_error(void); | ||
97 | asmlinkage void alignment_check(void); | ||
98 | asmlinkage void spurious_interrupt_bug(void); | ||
99 | asmlinkage void machine_check(void); | ||
100 | |||
101 | int kstack_depth_to_print = 24; | 82 | int kstack_depth_to_print = 24; |
102 | static unsigned int code_bytes = 64; | 83 | static unsigned int code_bytes = 64; |
84 | static int ignore_nmis; | ||
85 | static int die_counter; | ||
103 | 86 | ||
104 | void printk_address(unsigned long address, int reliable) | 87 | void printk_address(unsigned long address, int reliable) |
105 | { | 88 | { |
106 | #ifdef CONFIG_KALLSYMS | 89 | #ifdef CONFIG_KALLSYMS |
107 | char namebuf[KSYM_NAME_LEN]; | ||
108 | unsigned long offset = 0; | 90 | unsigned long offset = 0; |
109 | unsigned long symsize; | 91 | unsigned long symsize; |
110 | const char *symname; | 92 | const char *symname; |
111 | char reliab[4] = ""; | ||
112 | char *delim = ":"; | ||
113 | char *modname; | 93 | char *modname; |
94 | char *delim = ":"; | ||
95 | char namebuf[KSYM_NAME_LEN]; | ||
96 | char reliab[4] = ""; | ||
114 | 97 | ||
115 | symname = kallsyms_lookup(address, &symsize, &offset, | 98 | symname = kallsyms_lookup(address, &symsize, &offset, |
116 | &modname, namebuf); | 99 | &modname, namebuf); |
@@ -130,22 +113,23 @@ void printk_address(unsigned long address, int reliable) | |||
130 | #endif | 113 | #endif |
131 | } | 114 | } |
132 | 115 | ||
133 | static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) | 116 | static inline int valid_stack_ptr(struct thread_info *tinfo, |
117 | void *p, unsigned int size) | ||
134 | { | 118 | { |
135 | return p > (void *)tinfo && | 119 | void *t = tinfo; |
136 | p <= (void *)tinfo + THREAD_SIZE - size; | 120 | return p > t && p <= t + THREAD_SIZE - size; |
137 | } | 121 | } |
138 | 122 | ||
139 | /* The form of the top of the frame on the stack */ | 123 | /* The form of the top of the frame on the stack */ |
140 | struct stack_frame { | 124 | struct stack_frame { |
141 | struct stack_frame *next_frame; | 125 | struct stack_frame *next_frame; |
142 | unsigned long return_address; | 126 | unsigned long return_address; |
143 | }; | 127 | }; |
144 | 128 | ||
145 | static inline unsigned long | 129 | static inline unsigned long |
146 | print_context_stack(struct thread_info *tinfo, | 130 | print_context_stack(struct thread_info *tinfo, |
147 | unsigned long *stack, unsigned long bp, | 131 | unsigned long *stack, unsigned long bp, |
148 | const struct stacktrace_ops *ops, void *data) | 132 | const struct stacktrace_ops *ops, void *data) |
149 | { | 133 | { |
150 | struct stack_frame *frame = (struct stack_frame *)bp; | 134 | struct stack_frame *frame = (struct stack_frame *)bp; |
151 | 135 | ||
@@ -167,8 +151,6 @@ print_context_stack(struct thread_info *tinfo, | |||
167 | return bp; | 151 | return bp; |
168 | } | 152 | } |
169 | 153 | ||
170 | #define MSG(msg) ops->warning(data, msg) | ||
171 | |||
172 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | 154 | void dump_trace(struct task_struct *task, struct pt_regs *regs, |
173 | unsigned long *stack, unsigned long bp, | 155 | unsigned long *stack, unsigned long bp, |
174 | const struct stacktrace_ops *ops, void *data) | 156 | const struct stacktrace_ops *ops, void *data) |
@@ -178,7 +160,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, | |||
178 | 160 | ||
179 | if (!stack) { | 161 | if (!stack) { |
180 | unsigned long dummy; | 162 | unsigned long dummy; |
181 | |||
182 | stack = &dummy; | 163 | stack = &dummy; |
183 | if (task != current) | 164 | if (task != current) |
184 | stack = (unsigned long *)task->thread.sp; | 165 | stack = (unsigned long *)task->thread.sp; |
@@ -196,7 +177,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, | |||
196 | } | 177 | } |
197 | #endif | 178 | #endif |
198 | 179 | ||
199 | while (1) { | 180 | for (;;) { |
200 | struct thread_info *context; | 181 | struct thread_info *context; |
201 | 182 | ||
202 | context = (struct thread_info *) | 183 | context = (struct thread_info *) |
@@ -248,15 +229,15 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) | |||
248 | } | 229 | } |
249 | 230 | ||
250 | static const struct stacktrace_ops print_trace_ops = { | 231 | static const struct stacktrace_ops print_trace_ops = { |
251 | .warning = print_trace_warning, | 232 | .warning = print_trace_warning, |
252 | .warning_symbol = print_trace_warning_symbol, | 233 | .warning_symbol = print_trace_warning_symbol, |
253 | .stack = print_trace_stack, | 234 | .stack = print_trace_stack, |
254 | .address = print_trace_address, | 235 | .address = print_trace_address, |
255 | }; | 236 | }; |
256 | 237 | ||
257 | static void | 238 | static void |
258 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | 239 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, |
259 | unsigned long *stack, unsigned long bp, char *log_lvl) | 240 | unsigned long *stack, unsigned long bp, char *log_lvl) |
260 | { | 241 | { |
261 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | 242 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); |
262 | printk("%s =======================\n", log_lvl); | 243 | printk("%s =======================\n", log_lvl); |
@@ -351,15 +332,14 @@ void show_registers(struct pt_regs *regs) | |||
351 | printk(KERN_EMERG "Code: "); | 332 | printk(KERN_EMERG "Code: "); |
352 | 333 | ||
353 | ip = (u8 *)regs->ip - code_prologue; | 334 | ip = (u8 *)regs->ip - code_prologue; |
354 | if (ip < (u8 *)PAGE_OFFSET || | 335 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { |
355 | probe_kernel_address(ip, c)) { | ||
356 | /* try starting at EIP */ | 336 | /* try starting at EIP */ |
357 | ip = (u8 *)regs->ip; | 337 | ip = (u8 *)regs->ip; |
358 | code_len = code_len - code_prologue + 1; | 338 | code_len = code_len - code_prologue + 1; |
359 | } | 339 | } |
360 | for (i = 0; i < code_len; i++, ip++) { | 340 | for (i = 0; i < code_len; i++, ip++) { |
361 | if (ip < (u8 *)PAGE_OFFSET || | 341 | if (ip < (u8 *)PAGE_OFFSET || |
362 | probe_kernel_address(ip, c)) { | 342 | probe_kernel_address(ip, c)) { |
363 | printk(" Bad EIP value."); | 343 | printk(" Bad EIP value."); |
364 | break; | 344 | break; |
365 | } | 345 | } |
@@ -384,7 +364,53 @@ int is_valid_bugaddr(unsigned long ip) | |||
384 | return ud2 == 0x0b0f; | 364 | return ud2 == 0x0b0f; |
385 | } | 365 | } |
386 | 366 | ||
387 | static int die_counter; | 367 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; |
368 | static int die_owner = -1; | ||
369 | static unsigned int die_nest_count; | ||
370 | |||
371 | unsigned __kprobes long oops_begin(void) | ||
372 | { | ||
373 | unsigned long flags; | ||
374 | |||
375 | oops_enter(); | ||
376 | |||
377 | if (die_owner != raw_smp_processor_id()) { | ||
378 | console_verbose(); | ||
379 | raw_local_irq_save(flags); | ||
380 | __raw_spin_lock(&die_lock); | ||
381 | die_owner = smp_processor_id(); | ||
382 | die_nest_count = 0; | ||
383 | bust_spinlocks(1); | ||
384 | } else { | ||
385 | raw_local_irq_save(flags); | ||
386 | } | ||
387 | die_nest_count++; | ||
388 | return flags; | ||
389 | } | ||
390 | |||
391 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
392 | { | ||
393 | bust_spinlocks(0); | ||
394 | die_owner = -1; | ||
395 | add_taint(TAINT_DIE); | ||
396 | __raw_spin_unlock(&die_lock); | ||
397 | raw_local_irq_restore(flags); | ||
398 | |||
399 | if (!regs) | ||
400 | return; | ||
401 | |||
402 | if (kexec_should_crash(current)) | ||
403 | crash_kexec(regs); | ||
404 | |||
405 | if (in_interrupt()) | ||
406 | panic("Fatal exception in interrupt"); | ||
407 | |||
408 | if (panic_on_oops) | ||
409 | panic("Fatal exception"); | ||
410 | |||
411 | oops_exit(); | ||
412 | do_exit(signr); | ||
413 | } | ||
388 | 414 | ||
389 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | 415 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) |
390 | { | 416 | { |
@@ -402,26 +428,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) | |||
402 | printk("DEBUG_PAGEALLOC"); | 428 | printk("DEBUG_PAGEALLOC"); |
403 | #endif | 429 | #endif |
404 | printk("\n"); | 430 | printk("\n"); |
405 | |||
406 | if (notify_die(DIE_OOPS, str, regs, err, | 431 | if (notify_die(DIE_OOPS, str, regs, err, |
407 | current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { | 432 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) |
408 | 433 | return 1; | |
409 | show_registers(regs); | ||
410 | /* Executive summary in case the oops scrolled away */ | ||
411 | sp = (unsigned long) (®s->sp); | ||
412 | savesegment(ss, ss); | ||
413 | if (user_mode(regs)) { | ||
414 | sp = regs->sp; | ||
415 | ss = regs->ss & 0xffff; | ||
416 | } | ||
417 | printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); | ||
418 | print_symbol("%s", regs->ip); | ||
419 | printk(" SS:ESP %04x:%08lx\n", ss, sp); | ||
420 | 434 | ||
421 | return 0; | 435 | show_registers(regs); |
436 | /* Executive summary in case the oops scrolled away */ | ||
437 | sp = (unsigned long) (®s->sp); | ||
438 | savesegment(ss, ss); | ||
439 | if (user_mode(regs)) { | ||
440 | sp = regs->sp; | ||
441 | ss = regs->ss & 0xffff; | ||
422 | } | 442 | } |
423 | 443 | printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); | |
424 | return 1; | 444 | print_symbol("%s", regs->ip); |
445 | printk(" SS:ESP %04x:%08lx\n", ss, sp); | ||
446 | return 0; | ||
425 | } | 447 | } |
426 | 448 | ||
427 | /* | 449 | /* |
@@ -430,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) | |||
430 | */ | 452 | */ |
431 | void die(const char *str, struct pt_regs *regs, long err) | 453 | void die(const char *str, struct pt_regs *regs, long err) |
432 | { | 454 | { |
433 | static struct { | 455 | unsigned long flags = oops_begin(); |
434 | raw_spinlock_t lock; | ||
435 | u32 lock_owner; | ||
436 | int lock_owner_depth; | ||
437 | } die = { | ||
438 | .lock = __RAW_SPIN_LOCK_UNLOCKED, | ||
439 | .lock_owner = -1, | ||
440 | .lock_owner_depth = 0 | ||
441 | }; | ||
442 | unsigned long flags; | ||
443 | |||
444 | oops_enter(); | ||
445 | |||
446 | if (die.lock_owner != raw_smp_processor_id()) { | ||
447 | console_verbose(); | ||
448 | raw_local_irq_save(flags); | ||
449 | __raw_spin_lock(&die.lock); | ||
450 | die.lock_owner = smp_processor_id(); | ||
451 | die.lock_owner_depth = 0; | ||
452 | bust_spinlocks(1); | ||
453 | } else { | ||
454 | raw_local_irq_save(flags); | ||
455 | } | ||
456 | 456 | ||
457 | if (++die.lock_owner_depth < 3) { | 457 | if (die_nest_count < 3) { |
458 | report_bug(regs->ip, regs); | 458 | report_bug(regs->ip, regs); |
459 | 459 | ||
460 | if (__die(str, regs, err)) | 460 | if (__die(str, regs, err)) |
@@ -463,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err) | |||
463 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | 463 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); |
464 | } | 464 | } |
465 | 465 | ||
466 | bust_spinlocks(0); | 466 | oops_end(flags, regs, SIGSEGV); |
467 | die.lock_owner = -1; | ||
468 | add_taint(TAINT_DIE); | ||
469 | __raw_spin_unlock(&die.lock); | ||
470 | raw_local_irq_restore(flags); | ||
471 | |||
472 | if (!regs) | ||
473 | return; | ||
474 | |||
475 | if (kexec_should_crash(current)) | ||
476 | crash_kexec(regs); | ||
477 | |||
478 | if (in_interrupt()) | ||
479 | panic("Fatal exception in interrupt"); | ||
480 | |||
481 | if (panic_on_oops) | ||
482 | panic("Fatal exception"); | ||
483 | |||
484 | oops_exit(); | ||
485 | do_exit(SIGSEGV); | ||
486 | } | 467 | } |
487 | 468 | ||
488 | static inline void | 469 | static inline void |
@@ -546,7 +527,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ | |||
546 | { \ | 527 | { \ |
547 | trace_hardirqs_fixup(); \ | 528 | trace_hardirqs_fixup(); \ |
548 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 529 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
549 | == NOTIFY_STOP) \ | 530 | == NOTIFY_STOP) \ |
550 | return; \ | 531 | return; \ |
551 | do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ | 532 | do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ |
552 | } | 533 | } |
@@ -562,7 +543,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ | |||
562 | info.si_code = sicode; \ | 543 | info.si_code = sicode; \ |
563 | info.si_addr = (void __user *)siaddr; \ | 544 | info.si_addr = (void __user *)siaddr; \ |
564 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 545 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
565 | == NOTIFY_STOP) \ | 546 | == NOTIFY_STOP) \ |
566 | return; \ | 547 | return; \ |
567 | do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ | 548 | do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ |
568 | } | 549 | } |
@@ -571,7 +552,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ | |||
571 | void do_##name(struct pt_regs *regs, long error_code) \ | 552 | void do_##name(struct pt_regs *regs, long error_code) \ |
572 | { \ | 553 | { \ |
573 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 554 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
574 | == NOTIFY_STOP) \ | 555 | == NOTIFY_STOP) \ |
575 | return; \ | 556 | return; \ |
576 | do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ | 557 | do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ |
577 | } | 558 | } |
@@ -586,27 +567,29 @@ void do_##name(struct pt_regs *regs, long error_code) \ | |||
586 | info.si_addr = (void __user *)siaddr; \ | 567 | info.si_addr = (void __user *)siaddr; \ |
587 | trace_hardirqs_fixup(); \ | 568 | trace_hardirqs_fixup(); \ |
588 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 569 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
589 | == NOTIFY_STOP) \ | 570 | == NOTIFY_STOP) \ |
590 | return; \ | 571 | return; \ |
591 | do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ | 572 | do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ |
592 | } | 573 | } |
593 | 574 | ||
594 | DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | 575 | DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) |
595 | #ifndef CONFIG_KPROBES | 576 | #ifndef CONFIG_KPROBES |
596 | DO_VM86_ERROR(3, SIGTRAP, "int3", int3) | 577 | DO_VM86_ERROR(3, SIGTRAP, "int3", int3) |
597 | #endif | 578 | #endif |
598 | DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) | 579 | DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) |
599 | DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) | 580 | DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) |
600 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) | 581 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) |
601 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | 582 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) |
602 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | 583 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) |
603 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | 584 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) |
604 | DO_ERROR(12, SIGBUS, "stack segment", stack_segment) | 585 | DO_ERROR(12, SIGBUS, "stack segment", stack_segment) |
605 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) | 586 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) |
606 | DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) | 587 | DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) |
607 | 588 | ||
608 | void __kprobes do_general_protection(struct pt_regs *regs, long error_code) | 589 | void __kprobes |
590 | do_general_protection(struct pt_regs *regs, long error_code) | ||
609 | { | 591 | { |
592 | struct task_struct *tsk; | ||
610 | struct thread_struct *thread; | 593 | struct thread_struct *thread; |
611 | struct tss_struct *tss; | 594 | struct tss_struct *tss; |
612 | int cpu; | 595 | int cpu; |
@@ -647,23 +630,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code) | |||
647 | if (regs->flags & X86_VM_MASK) | 630 | if (regs->flags & X86_VM_MASK) |
648 | goto gp_in_vm86; | 631 | goto gp_in_vm86; |
649 | 632 | ||
633 | tsk = current; | ||
650 | if (!user_mode(regs)) | 634 | if (!user_mode(regs)) |
651 | goto gp_in_kernel; | 635 | goto gp_in_kernel; |
652 | 636 | ||
653 | current->thread.error_code = error_code; | 637 | tsk->thread.error_code = error_code; |
654 | current->thread.trap_no = 13; | 638 | tsk->thread.trap_no = 13; |
655 | 639 | ||
656 | if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && | 640 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && |
657 | printk_ratelimit()) { | 641 | printk_ratelimit()) { |
658 | printk(KERN_INFO | 642 | printk(KERN_INFO |
659 | "%s[%d] general protection ip:%lx sp:%lx error:%lx", | 643 | "%s[%d] general protection ip:%lx sp:%lx error:%lx", |
660 | current->comm, task_pid_nr(current), | 644 | tsk->comm, task_pid_nr(tsk), |
661 | regs->ip, regs->sp, error_code); | 645 | regs->ip, regs->sp, error_code); |
662 | print_vma_addr(" in ", regs->ip); | 646 | print_vma_addr(" in ", regs->ip); |
663 | printk("\n"); | 647 | printk("\n"); |
664 | } | 648 | } |
665 | 649 | ||
666 | force_sig(SIGSEGV, current); | 650 | force_sig(SIGSEGV, tsk); |
667 | return; | 651 | return; |
668 | 652 | ||
669 | gp_in_vm86: | 653 | gp_in_vm86: |
@@ -672,14 +656,15 @@ gp_in_vm86: | |||
672 | return; | 656 | return; |
673 | 657 | ||
674 | gp_in_kernel: | 658 | gp_in_kernel: |
675 | if (!fixup_exception(regs)) { | 659 | if (fixup_exception(regs)) |
676 | current->thread.error_code = error_code; | 660 | return; |
677 | current->thread.trap_no = 13; | 661 | |
678 | if (notify_die(DIE_GPF, "general protection fault", regs, | 662 | tsk->thread.error_code = error_code; |
663 | tsk->thread.trap_no = 13; | ||
664 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
679 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | 665 | error_code, 13, SIGSEGV) == NOTIFY_STOP) |
680 | return; | 666 | return; |
681 | die("general protection fault", regs, error_code); | 667 | die("general protection fault", regs, error_code); |
682 | } | ||
683 | } | 668 | } |
684 | 669 | ||
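The hunk above is mostly mechanical: it hoists the repeated `current` lookups into a local `tsk` and flattens the fixup path into an early return. On x86, `current` is a macro that reads per-CPU state rather than a plain variable, so caching it once is both shorter and marginally cheaper. A minimal sketch of the resulting shape (simplified, not the full handler):

```c
void gp_handler_sketch(struct pt_regs *regs, long error_code)
{
	struct task_struct *tsk = current;	/* read the per-CPU pointer once */

	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 13;		/* #GP vector */
	force_sig(SIGSEGV, tsk);
}
```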
685 | static notrace __kprobes void | 670 | static notrace __kprobes void |
@@ -756,9 +741,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | |||
756 | 741 | ||
757 | static DEFINE_SPINLOCK(nmi_print_lock); | 742 | static DEFINE_SPINLOCK(nmi_print_lock); |
758 | 743 | ||
759 | void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) | 744 | void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) |
760 | { | 745 | { |
761 | if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) | 746 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) |
762 | return; | 747 | return; |
763 | 748 | ||
764 | spin_lock(&nmi_print_lock); | 749 | spin_lock(&nmi_print_lock); |
@@ -767,10 +752,12 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) | |||
767 | * to get a message out: | 752 | * to get a message out: |
768 | */ | 753 | */ |
769 | bust_spinlocks(1); | 754 | bust_spinlocks(1); |
770 | printk(KERN_EMERG "%s", msg); | 755 | printk(KERN_EMERG "%s", str); |
771 | printk(" on CPU%d, ip %08lx, registers:\n", | 756 | printk(" on CPU%d, ip %08lx, registers:\n", |
772 | smp_processor_id(), regs->ip); | 757 | smp_processor_id(), regs->ip); |
773 | show_registers(regs); | 758 | show_registers(regs); |
759 | if (do_panic) | ||
760 | panic("Non maskable interrupt"); | ||
774 | console_silent(); | 761 | console_silent(); |
775 | spin_unlock(&nmi_print_lock); | 762 | spin_unlock(&nmi_print_lock); |
776 | bust_spinlocks(0); | 763 | bust_spinlocks(0); |
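With this change the 32-bit die_nmi() takes the same arguments as its 64-bit counterpart: the message first, then the register state, then an explicit do_panic flag so the caller decides whether an NMI-detected lockup is fatal. A hedged sketch of a call site; the helper name and panic flag here are assumed, not part of this patch:

```c
static void watchdog_check_sketch(struct pt_regs *regs, int panic_on_timeout)
{
	/* hypothetical caller shape; the real watchdog logic lives elsewhere */
	if (lockup_detected())
		die_nmi("NMI Watchdog detected LOCKUP", regs, panic_on_timeout);
}
```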
@@ -790,14 +777,17 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) | |||
790 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | 777 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) |
791 | { | 778 | { |
792 | unsigned char reason = 0; | 779 | unsigned char reason = 0; |
780 | int cpu; | ||
793 | 781 | ||
794 | /* Only the BSP gets external NMIs from the system: */ | 782 | cpu = smp_processor_id(); |
795 | if (!smp_processor_id()) | 783 | |
784 | /* Only the BSP gets external NMIs from the system. */ | ||
785 | if (!cpu) | ||
796 | reason = get_nmi_reason(); | 786 | reason = get_nmi_reason(); |
797 | 787 | ||
798 | if (!(reason & 0xc0)) { | 788 | if (!(reason & 0xc0)) { |
799 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | 789 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) |
800 | == NOTIFY_STOP) | 790 | == NOTIFY_STOP) |
801 | return; | 791 | return; |
802 | #ifdef CONFIG_X86_LOCAL_APIC | 792 | #ifdef CONFIG_X86_LOCAL_APIC |
803 | /* | 793 | /* |
@@ -806,7 +796,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
806 | */ | 796 | */ |
807 | if (nmi_watchdog_tick(regs, reason)) | 797 | if (nmi_watchdog_tick(regs, reason)) |
808 | return; | 798 | return; |
809 | if (!do_nmi_callback(regs, smp_processor_id())) | 799 | if (!do_nmi_callback(regs, cpu)) |
810 | unknown_nmi_error(reason, regs); | 800 | unknown_nmi_error(reason, regs); |
811 | #else | 801 | #else |
812 | unknown_nmi_error(reason, regs); | 802 | unknown_nmi_error(reason, regs); |
@@ -816,6 +806,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
816 | } | 806 | } |
817 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | 807 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) |
818 | return; | 808 | return; |
809 | |||
810 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | ||
819 | if (reason & 0x80) | 811 | if (reason & 0x80) |
820 | mem_parity_error(reason, regs); | 812 | mem_parity_error(reason, regs); |
821 | if (reason & 0x40) | 813 | if (reason & 0x40) |
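The reason byte tested here comes from get_nmi_reason(), which on PC-compatible hardware is essentially a read of system control port B (0x61): bit 7 reports a memory parity/SERR# error and bit 6 an I/O channel check, matching the 0x80 and 0x40 tests above. A hedged sketch of that decode (constants per the legacy PC spec; the helper name is assumed):

```c
#include <asm/io.h>

#define NMI_REASON_PORT		0x61	/* system control port B */
#define NMI_REASON_SERR		0x80	/* memory parity / SERR# */
#define NMI_REASON_IOCHK	0x40	/* I/O channel check */

static unsigned char read_nmi_reason_sketch(void)
{
	return inb(NMI_REASON_PORT);
}
```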
@@ -827,8 +819,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
827 | reassert_nmi(); | 819 | reassert_nmi(); |
828 | } | 820 | } |
829 | 821 | ||
830 | static int ignore_nmis; | ||
831 | |||
832 | notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) | 822 | notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) |
833 | { | 823 | { |
834 | int cpu; | 824 | int cpu; |
@@ -913,7 +903,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
913 | tsk->thread.debugctlmsr = 0; | 903 | tsk->thread.debugctlmsr = 0; |
914 | 904 | ||
915 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | 905 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, |
916 | SIGTRAP) == NOTIFY_STOP) | 906 | SIGTRAP) == NOTIFY_STOP) |
917 | return; | 907 | return; |
918 | /* It's safe to allow irq's after DR6 has been saved */ | 908 | /* It's safe to allow irq's after DR6 has been saved */ |
919 | if (regs->flags & X86_EFLAGS_IF) | 909 | if (regs->flags & X86_EFLAGS_IF) |
@@ -974,9 +964,8 @@ clear_TF_reenable: | |||
974 | void math_error(void __user *ip) | 964 | void math_error(void __user *ip) |
975 | { | 965 | { |
976 | struct task_struct *task; | 966 | struct task_struct *task; |
977 | unsigned short cwd; | ||
978 | unsigned short swd; | ||
979 | siginfo_t info; | 967 | siginfo_t info; |
968 | unsigned short cwd, swd; | ||
980 | 969 | ||
981 | /* | 970 | /* |
982 | * Save the info for the exception handler and clear the error. | 971 | * Save the info for the exception handler and clear the error. |
@@ -995,7 +984,7 @@ void math_error(void __user *ip) | |||
995 | * C1 reg you need in case of a stack fault, 0x040 is the stack | 984 | * C1 reg you need in case of a stack fault, 0x040 is the stack |
996 | * fault bit. We should only be taking one exception at a time, | 985 | * fault bit. We should only be taking one exception at a time, |
997 | * so if this combination doesn't produce any single exception, | 986 | * so if this combination doesn't produce any single exception, |
998 | * then we have a bad program that isn't syncronizing its FPU usage | 987 | * then we have a bad program that isn't synchronizing its FPU usage |
999 | * and it will suffer the consequences since we won't be able to | 988 | * and it will suffer the consequences since we won't be able to |
1000 | * fully reproduce the context of the exception | 989 | * fully reproduce the context of the exception |
1001 | */ | 990 | */ |
@@ -1004,7 +993,7 @@ void math_error(void __user *ip) | |||
1004 | switch (swd & ~cwd & 0x3f) { | 993 | switch (swd & ~cwd & 0x3f) { |
1005 | case 0x000: /* No unmasked exception */ | 994 | case 0x000: /* No unmasked exception */ |
1006 | return; | 995 | return; |
1007 | default: /* Multiple exceptions */ | 996 | default: /* Multiple exceptions */ |
1008 | break; | 997 | break; |
1009 | case 0x001: /* Invalid Op */ | 998 | case 0x001: /* Invalid Op */ |
1010 | /* | 999 | /* |
@@ -1040,8 +1029,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code) | |||
1040 | static void simd_math_error(void __user *ip) | 1029 | static void simd_math_error(void __user *ip) |
1041 | { | 1030 | { |
1042 | struct task_struct *task; | 1031 | struct task_struct *task; |
1043 | unsigned short mxcsr; | ||
1044 | siginfo_t info; | 1032 | siginfo_t info; |
1033 | unsigned short mxcsr; | ||
1045 | 1034 | ||
1046 | /* | 1035 | /* |
1047 | * Save the info for the exception handler and clear the error. | 1036 | * Save the info for the exception handler and clear the error. |
@@ -1117,7 +1106,7 @@ void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) | |||
1117 | 1106 | ||
1118 | unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | 1107 | unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) |
1119 | { | 1108 | { |
1120 | struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; | 1109 | struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); |
1121 | unsigned long base = (kesp - uesp) & -THREAD_SIZE; | 1110 | unsigned long base = (kesp - uesp) & -THREAD_SIZE; |
1122 | unsigned long new_kesp = kesp - base; | 1111 | unsigned long new_kesp = kesp - base; |
1123 | unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; | 1112 | unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; |
@@ -1196,19 +1185,16 @@ void __init trap_init(void) | |||
1196 | early_iounmap(p, 4); | 1185 | early_iounmap(p, 4); |
1197 | #endif | 1186 | #endif |
1198 | 1187 | ||
1199 | #ifdef CONFIG_X86_LOCAL_APIC | 1188 | set_trap_gate(0, ÷_error); |
1200 | init_apic_mappings(); | 1189 | set_intr_gate(1, &debug); |
1201 | #endif | 1190 | set_intr_gate(2, &nmi); |
1202 | set_trap_gate(0, ÷_error); | 1191 | set_system_intr_gate(3, &int3); /* int3 can be called from all */ |
1203 | set_intr_gate(1, &debug); | 1192 | set_system_gate(4, &overflow); /* int4 can be called from all */ |
1204 | set_intr_gate(2, &nmi); | 1193 | set_trap_gate(5, &bounds); |
1205 | set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ | 1194 | set_trap_gate(6, &invalid_op); |
1206 | set_system_gate(4, &overflow); | 1195 | set_trap_gate(7, &device_not_available); |
1207 | set_trap_gate(5, &bounds); | 1196 | set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); |
1208 | set_trap_gate(6, &invalid_op); | 1197 | set_trap_gate(9, &coprocessor_segment_overrun); |
1209 | set_trap_gate(7, &device_not_available); | ||
1210 | set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); | ||
1211 | set_trap_gate(9, &coprocessor_segment_overrun); | ||
1212 | set_trap_gate(10, &invalid_TSS); | 1198 | set_trap_gate(10, &invalid_TSS); |
1213 | set_trap_gate(11, &segment_not_present); | 1199 | set_trap_gate(11, &segment_not_present); |
1214 | set_trap_gate(12, &stack_segment); | 1200 | set_trap_gate(12, &stack_segment); |
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index adff76ea97c4..3f18d73f420c 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c | |||
@@ -10,73 +10,56 @@ | |||
10 | * 'Traps.c' handles hardware traps and faults after we have saved some | 10 | * 'Traps.c' handles hardware traps and faults after we have saved some |
11 | * state in 'entry.S'. | 11 | * state in 'entry.S'. |
12 | */ | 12 | */ |
13 | #include <linux/sched.h> | 13 | #include <linux/moduleparam.h> |
14 | #include <linux/interrupt.h> | ||
15 | #include <linux/kallsyms.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/kprobes.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/utsname.h> | ||
20 | #include <linux/kdebug.h> | ||
14 | #include <linux/kernel.h> | 21 | #include <linux/kernel.h> |
22 | #include <linux/module.h> | ||
23 | #include <linux/ptrace.h> | ||
15 | #include <linux/string.h> | 24 | #include <linux/string.h> |
25 | #include <linux/unwind.h> | ||
26 | #include <linux/delay.h> | ||
16 | #include <linux/errno.h> | 27 | #include <linux/errno.h> |
17 | #include <linux/ptrace.h> | 28 | #include <linux/kexec.h> |
29 | #include <linux/sched.h> | ||
18 | #include <linux/timer.h> | 30 | #include <linux/timer.h> |
19 | #include <linux/mm.h> | ||
20 | #include <linux/init.h> | 31 | #include <linux/init.h> |
21 | #include <linux/delay.h> | ||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/interrupt.h> | ||
24 | #include <linux/kallsyms.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/moduleparam.h> | ||
27 | #include <linux/nmi.h> | ||
28 | #include <linux/kprobes.h> | ||
29 | #include <linux/kexec.h> | ||
30 | #include <linux/unwind.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | #include <linux/bug.h> | 32 | #include <linux/bug.h> |
33 | #include <linux/kdebug.h> | 33 | #include <linux/nmi.h> |
34 | #include <linux/utsname.h> | 34 | #include <linux/mm.h> |
35 | |||
36 | #include <mach_traps.h> | ||
37 | 35 | ||
38 | #if defined(CONFIG_EDAC) | 36 | #if defined(CONFIG_EDAC) |
39 | #include <linux/edac.h> | 37 | #include <linux/edac.h> |
40 | #endif | 38 | #endif |
41 | 39 | ||
42 | #include <asm/system.h> | 40 | #include <asm/stacktrace.h> |
43 | #include <asm/io.h> | 41 | #include <asm/processor.h> |
44 | #include <asm/atomic.h> | ||
45 | #include <asm/debugreg.h> | 42 | #include <asm/debugreg.h> |
43 | #include <asm/atomic.h> | ||
44 | #include <asm/system.h> | ||
45 | #include <asm/unwind.h> | ||
46 | #include <asm/desc.h> | 46 | #include <asm/desc.h> |
47 | #include <asm/i387.h> | 47 | #include <asm/i387.h> |
48 | #include <asm/processor.h> | 48 | #include <asm/nmi.h> |
49 | #include <asm/unwind.h> | ||
50 | #include <asm/smp.h> | 49 | #include <asm/smp.h> |
50 | #include <asm/io.h> | ||
51 | #include <asm/pgalloc.h> | 51 | #include <asm/pgalloc.h> |
52 | #include <asm/pda.h> | ||
53 | #include <asm/proto.h> | 52 | #include <asm/proto.h> |
54 | #include <asm/nmi.h> | 53 | #include <asm/pda.h> |
55 | #include <asm/stacktrace.h> | 54 | #include <asm/traps.h> |
56 | 55 | ||
57 | asmlinkage void divide_error(void); | 56 | #include <mach_traps.h> |
58 | asmlinkage void debug(void); | ||
59 | asmlinkage void nmi(void); | ||
60 | asmlinkage void int3(void); | ||
61 | asmlinkage void overflow(void); | ||
62 | asmlinkage void bounds(void); | ||
63 | asmlinkage void invalid_op(void); | ||
64 | asmlinkage void device_not_available(void); | ||
65 | asmlinkage void double_fault(void); | ||
66 | asmlinkage void coprocessor_segment_overrun(void); | ||
67 | asmlinkage void invalid_TSS(void); | ||
68 | asmlinkage void segment_not_present(void); | ||
69 | asmlinkage void stack_segment(void); | ||
70 | asmlinkage void general_protection(void); | ||
71 | asmlinkage void page_fault(void); | ||
72 | asmlinkage void coprocessor_error(void); | ||
73 | asmlinkage void simd_coprocessor_error(void); | ||
74 | asmlinkage void reserved(void); | ||
75 | asmlinkage void alignment_check(void); | ||
76 | asmlinkage void machine_check(void); | ||
77 | asmlinkage void spurious_interrupt_bug(void); | ||
78 | 57 | ||
58 | int panic_on_unrecovered_nmi; | ||
59 | int kstack_depth_to_print = 12; | ||
79 | static unsigned int code_bytes = 64; | 60 | static unsigned int code_bytes = 64; |
61 | static int ignore_nmis; | ||
62 | static int die_counter; | ||
80 | 63 | ||
81 | static inline void conditional_sti(struct pt_regs *regs) | 64 | static inline void conditional_sti(struct pt_regs *regs) |
82 | { | 65 | { |
@@ -100,34 +83,9 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) | |||
100 | dec_preempt_count(); | 83 | dec_preempt_count(); |
101 | } | 84 | } |
102 | 85 | ||
103 | int kstack_depth_to_print = 12; | ||
104 | |||
105 | void printk_address(unsigned long address, int reliable) | 86 | void printk_address(unsigned long address, int reliable) |
106 | { | 87 | { |
107 | #ifdef CONFIG_KALLSYMS | 88 | printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); |
108 | unsigned long offset = 0, symsize; | ||
109 | const char *symname; | ||
110 | char *modname; | ||
111 | char *delim = ":"; | ||
112 | char namebuf[KSYM_NAME_LEN]; | ||
113 | char reliab[4] = ""; | ||
114 | |||
115 | symname = kallsyms_lookup(address, &symsize, &offset, | ||
116 | &modname, namebuf); | ||
117 | if (!symname) { | ||
118 | printk(" [<%016lx>]\n", address); | ||
119 | return; | ||
120 | } | ||
121 | if (!reliable) | ||
122 | strcpy(reliab, "? "); | ||
123 | |||
124 | if (!modname) | ||
125 | modname = delim = ""; | ||
126 | printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", | ||
127 | address, reliab, delim, modname, delim, symname, offset, symsize); | ||
128 | #else | ||
129 | printk(" [<%016lx>]\n", address); | ||
130 | #endif | ||
131 | } | 89 | } |
132 | 90 | ||
133 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | 91 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, |
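The open-coded kallsyms lookup collapses into a single printk() because the `%pS` pointer format already resolves a kernel text address to symbol+offset (and degrades to a bare hex address when CONFIG_KALLSYMS is off). A minimal sketch of the idiom, mirroring the one-liner above:

```c
/* prints e.g. " [<ffffffff8020a5f0>] do_something+0x40/0x90" */
static void print_return_address(unsigned long addr, int reliable)
{
	printk(" [<%016lx>] %s%pS\n",
	       addr, reliable ? "" : "? ", (void *)addr);
}
```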
@@ -204,8 +162,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | |||
204 | return NULL; | 162 | return NULL; |
205 | } | 163 | } |
206 | 164 | ||
207 | #define MSG(txt) ops->warning(data, txt) | ||
208 | |||
209 | /* | 165 | /* |
210 | * x86-64 can have up to three kernel stacks: | 166 | * x86-64 can have up to three kernel stacks: |
211 | * process stack | 167 | * process stack |
@@ -232,11 +188,11 @@ struct stack_frame { | |||
232 | unsigned long return_address; | 188 | unsigned long return_address; |
233 | }; | 189 | }; |
234 | 190 | ||
235 | 191 | static inline unsigned long | |
236 | static inline unsigned long print_context_stack(struct thread_info *tinfo, | 192 | print_context_stack(struct thread_info *tinfo, |
237 | unsigned long *stack, unsigned long bp, | 193 | unsigned long *stack, unsigned long bp, |
238 | const struct stacktrace_ops *ops, void *data, | 194 | const struct stacktrace_ops *ops, void *data, |
239 | unsigned long *end) | 195 | unsigned long *end) |
240 | { | 196 | { |
241 | struct stack_frame *frame = (struct stack_frame *)bp; | 197 | struct stack_frame *frame = (struct stack_frame *)bp; |
242 | 198 | ||
@@ -258,7 +214,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, | |||
258 | return bp; | 214 | return bp; |
259 | } | 215 | } |
260 | 216 | ||
261 | void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | 217 | void dump_trace(struct task_struct *task, struct pt_regs *regs, |
262 | unsigned long *stack, unsigned long bp, | 218 | unsigned long *stack, unsigned long bp, |
263 | const struct stacktrace_ops *ops, void *data) | 219 | const struct stacktrace_ops *ops, void *data) |
264 | { | 220 | { |
@@ -267,36 +223,34 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | |||
267 | unsigned used = 0; | 223 | unsigned used = 0; |
268 | struct thread_info *tinfo; | 224 | struct thread_info *tinfo; |
269 | 225 | ||
270 | if (!tsk) | 226 | if (!task) |
271 | tsk = current; | 227 | task = current; |
272 | tinfo = task_thread_info(tsk); | ||
273 | 228 | ||
274 | if (!stack) { | 229 | if (!stack) { |
275 | unsigned long dummy; | 230 | unsigned long dummy; |
276 | stack = &dummy; | 231 | stack = &dummy; |
277 | if (tsk && tsk != current) | 232 | if (task && task != current) |
278 | stack = (unsigned long *)tsk->thread.sp; | 233 | stack = (unsigned long *)task->thread.sp; |
279 | } | 234 | } |
280 | 235 | ||
281 | #ifdef CONFIG_FRAME_POINTER | 236 | #ifdef CONFIG_FRAME_POINTER |
282 | if (!bp) { | 237 | if (!bp) { |
283 | if (tsk == current) { | 238 | if (task == current) { |
284 | /* Grab bp right from our regs */ | 239 | /* Grab bp right from our regs */ |
285 | asm("movq %%rbp, %0" : "=r" (bp):); | 240 | asm("movq %%rbp, %0" : "=r" (bp) :); |
286 | } else { | 241 | } else { |
287 | /* bp is the last reg pushed by switch_to */ | 242 | /* bp is the last reg pushed by switch_to */ |
288 | bp = *(unsigned long *) tsk->thread.sp; | 243 | bp = *(unsigned long *) task->thread.sp; |
289 | } | 244 | } |
290 | } | 245 | } |
291 | #endif | 246 | #endif |
292 | 247 | ||
293 | |||
294 | |||
295 | /* | 248 | /* |
296 | * Print function call entries in all stacks, starting at the | 249 | * Print function call entries in all stacks, starting at the |
297 | * current stack address. If the stacks consist of nested | 250 | * current stack address. If the stacks consist of nested |
298 | * exceptions | 251 | * exceptions |
299 | */ | 252 | */ |
253 | tinfo = task_thread_info(task); | ||
300 | for (;;) { | 254 | for (;;) { |
301 | char *id; | 255 | char *id; |
302 | unsigned long *estack_end; | 256 | unsigned long *estack_end; |
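For reference, the frame-pointer walk that print_context_stack() performs is simple once the struct stack_frame layout above is in hand: each saved rbp points at the caller's frame, and the word after it is the return address. A hedged sketch, with the stack-bounds and reliability checks of the real code omitted:

```c
struct stack_frame_sketch {
	struct stack_frame_sketch *next_frame;	/* saved rbp of the caller */
	unsigned long return_address;
};

static void walk_frames_sketch(unsigned long bp,
			       void (*emit)(unsigned long addr))
{
	struct stack_frame_sketch *frame = (struct stack_frame_sketch *)bp;

	while (frame) {		/* real code also validates stack bounds */
		emit(frame->return_address);
		frame = frame->next_frame;
	}
}
```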
@@ -381,18 +335,24 @@ static const struct stacktrace_ops print_trace_ops = { | |||
381 | .address = print_trace_address, | 335 | .address = print_trace_address, |
382 | }; | 336 | }; |
383 | 337 | ||
384 | void | 338 | static void |
385 | show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, | 339 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, |
386 | unsigned long bp) | 340 | unsigned long *stack, unsigned long bp, char *log_lvl) |
387 | { | 341 | { |
388 | printk("\nCall Trace:\n"); | 342 | printk("\nCall Trace:\n"); |
389 | dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); | 343 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); |
390 | printk("\n"); | 344 | printk("\n"); |
391 | } | 345 | } |
392 | 346 | ||
347 | void show_trace(struct task_struct *task, struct pt_regs *regs, | ||
348 | unsigned long *stack, unsigned long bp) | ||
349 | { | ||
350 | show_trace_log_lvl(task, regs, stack, bp, ""); | ||
351 | } | ||
352 | |||
393 | static void | 353 | static void |
394 | _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, | 354 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, |
395 | unsigned long bp) | 355 | unsigned long *sp, unsigned long bp, char *log_lvl) |
396 | { | 356 | { |
397 | unsigned long *stack; | 357 | unsigned long *stack; |
398 | int i; | 358 | int i; |
@@ -404,14 +364,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, | |||
404 | // back trace for this cpu. | 364 | // back trace for this cpu. |
405 | 365 | ||
406 | if (sp == NULL) { | 366 | if (sp == NULL) { |
407 | if (tsk) | 367 | if (task) |
408 | sp = (unsigned long *)tsk->thread.sp; | 368 | sp = (unsigned long *)task->thread.sp; |
409 | else | 369 | else |
410 | sp = (unsigned long *)&sp; | 370 | sp = (unsigned long *)&sp; |
411 | } | 371 | } |
412 | 372 | ||
413 | stack = sp; | 373 | stack = sp; |
414 | for(i=0; i < kstack_depth_to_print; i++) { | 374 | for (i = 0; i < kstack_depth_to_print; i++) { |
415 | if (stack >= irqstack && stack <= irqstack_end) { | 375 | if (stack >= irqstack && stack <= irqstack_end) { |
416 | if (stack == irqstack_end) { | 376 | if (stack == irqstack_end) { |
417 | stack = (unsigned long *) (irqstack_end[-1]); | 377 | stack = (unsigned long *) (irqstack_end[-1]); |
@@ -426,12 +386,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, | |||
426 | printk(" %016lx", *stack++); | 386 | printk(" %016lx", *stack++); |
427 | touch_nmi_watchdog(); | 387 | touch_nmi_watchdog(); |
428 | } | 388 | } |
429 | show_trace(tsk, regs, sp, bp); | 389 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); |
430 | } | 390 | } |
431 | 391 | ||
432 | void show_stack(struct task_struct *tsk, unsigned long * sp) | 392 | void show_stack(struct task_struct *task, unsigned long *sp) |
433 | { | 393 | { |
434 | _show_stack(tsk, NULL, sp, 0); | 394 | show_stack_log_lvl(task, NULL, sp, 0, ""); |
435 | } | 395 | } |
436 | 396 | ||
437 | /* | 397 | /* |
@@ -439,8 +399,8 @@ void show_stack(struct task_struct *tsk, unsigned long * sp) | |||
439 | */ | 399 | */ |
440 | void dump_stack(void) | 400 | void dump_stack(void) |
441 | { | 401 | { |
442 | unsigned long dummy; | ||
443 | unsigned long bp = 0; | 402 | unsigned long bp = 0; |
403 | unsigned long stack; | ||
444 | 404 | ||
445 | #ifdef CONFIG_FRAME_POINTER | 405 | #ifdef CONFIG_FRAME_POINTER |
446 | if (!bp) | 406 | if (!bp) |
@@ -452,7 +412,7 @@ void dump_stack(void) | |||
452 | init_utsname()->release, | 412 | init_utsname()->release, |
453 | (int)strcspn(init_utsname()->version, " "), | 413 | (int)strcspn(init_utsname()->version, " "), |
454 | init_utsname()->version); | 414 | init_utsname()->version); |
455 | show_trace(NULL, NULL, &dummy, bp); | 415 | show_trace(NULL, NULL, &stack, bp); |
456 | } | 416 | } |
457 | 417 | ||
458 | EXPORT_SYMBOL(dump_stack); | 418 | EXPORT_SYMBOL(dump_stack); |
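dump_stack() remains the exported one-call entry point: it prints a Pid/comm/kernel-version banner and then the trace from the current frame. A hypothetical use while debugging a driver path (the condition and names are assumed for illustration):

```c
static void my_driver_op_sketch(int state)
{
	if (state < 0) {		/* hypothetical unexpected state */
		printk(KERN_WARNING "mydrv: bad state %d\n", state);
		dump_stack();		/* banner + call trace to dmesg */
	}
}
```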
@@ -463,12 +423,8 @@ void show_registers(struct pt_regs *regs) | |||
463 | unsigned long sp; | 423 | unsigned long sp; |
464 | const int cpu = smp_processor_id(); | 424 | const int cpu = smp_processor_id(); |
465 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | 425 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; |
466 | u8 *ip; | ||
467 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
468 | unsigned int code_len = code_bytes; | ||
469 | 426 | ||
470 | sp = regs->sp; | 427 | sp = regs->sp; |
471 | ip = (u8 *) regs->ip - code_prologue; | ||
472 | printk("CPU %d ", cpu); | 428 | printk("CPU %d ", cpu); |
473 | __show_regs(regs); | 429 | __show_regs(regs); |
474 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | 430 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", |
@@ -479,15 +435,22 @@ void show_registers(struct pt_regs *regs) | |||
479 | * time of the fault.. | 435 | * time of the fault.. |
480 | */ | 436 | */ |
481 | if (!user_mode(regs)) { | 437 | if (!user_mode(regs)) { |
438 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
439 | unsigned int code_len = code_bytes; | ||
482 | unsigned char c; | 440 | unsigned char c; |
441 | u8 *ip; | ||
442 | |||
483 | printk("Stack: "); | 443 | printk("Stack: "); |
484 | _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); | 444 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, |
445 | regs->bp, ""); | ||
485 | printk("\n"); | 446 | printk("\n"); |
486 | 447 | ||
487 | printk(KERN_EMERG "Code: "); | 448 | printk(KERN_EMERG "Code: "); |
449 | |||
450 | ip = (u8 *)regs->ip - code_prologue; | ||
488 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | 451 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { |
489 | /* try starting at RIP */ | 452 | /* try starting at RIP */ |
490 | ip = (u8 *) regs->ip; | 453 | ip = (u8 *)regs->ip; |
491 | code_len = code_len - code_prologue + 1; | 454 | code_len = code_len - code_prologue + 1; |
492 | } | 455 | } |
493 | for (i = 0; i < code_len; i++, ip++) { | 456 | for (i = 0; i < code_len; i++, ip++) { |
@@ -503,7 +466,7 @@ void show_registers(struct pt_regs *regs) | |||
503 | } | 466 | } |
504 | } | 467 | } |
505 | printk("\n"); | 468 | printk("\n"); |
506 | } | 469 | } |
507 | 470 | ||
508 | int is_valid_bugaddr(unsigned long ip) | 471 | int is_valid_bugaddr(unsigned long ip) |
509 | { | 472 | { |
@@ -543,7 +506,7 @@ unsigned __kprobes long oops_begin(void) | |||
543 | } | 506 | } |
544 | 507 | ||
545 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | 508 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) |
546 | { | 509 | { |
547 | die_owner = -1; | 510 | die_owner = -1; |
548 | bust_spinlocks(0); | 511 | bust_spinlocks(0); |
549 | die_nest_count--; | 512 | die_nest_count--; |
@@ -561,10 +524,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | |||
561 | do_exit(signr); | 524 | do_exit(signr); |
562 | } | 525 | } |
563 | 526 | ||
564 | int __kprobes __die(const char * str, struct pt_regs * regs, long err) | 527 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) |
565 | { | 528 | { |
566 | static int die_counter; | 529 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); |
567 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); | ||
568 | #ifdef CONFIG_PREEMPT | 530 | #ifdef CONFIG_PREEMPT |
569 | printk("PREEMPT "); | 531 | printk("PREEMPT "); |
570 | #endif | 532 | #endif |
@@ -575,8 +537,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) | |||
575 | printk("DEBUG_PAGEALLOC"); | 537 | printk("DEBUG_PAGEALLOC"); |
576 | #endif | 538 | #endif |
577 | printk("\n"); | 539 | printk("\n"); |
578 | if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | 540 | if (notify_die(DIE_OOPS, str, regs, err, |
541 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
579 | return 1; | 542 | return 1; |
543 | |||
580 | show_registers(regs); | 544 | show_registers(regs); |
581 | add_taint(TAINT_DIE); | 545 | add_taint(TAINT_DIE); |
582 | /* Executive summary in case the oops scrolled away */ | 546 | /* Executive summary in case the oops scrolled away */ |
@@ -588,7 +552,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) | |||
588 | return 0; | 552 | return 0; |
589 | } | 553 | } |
590 | 554 | ||
591 | void die(const char * str, struct pt_regs * regs, long err) | 555 | void die(const char *str, struct pt_regs *regs, long err) |
592 | { | 556 | { |
593 | unsigned long flags = oops_begin(); | 557 | unsigned long flags = oops_begin(); |
594 | 558 | ||
@@ -605,8 +569,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) | |||
605 | { | 569 | { |
606 | unsigned long flags; | 570 | unsigned long flags; |
607 | 571 | ||
608 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == | 572 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) |
609 | NOTIFY_STOP) | ||
610 | return; | 573 | return; |
611 | 574 | ||
612 | flags = oops_begin(); | 575 | flags = oops_begin(); |
@@ -614,7 +577,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) | |||
614 | * We are in trouble anyway, let's at least try | 577 | * We are in trouble anyway, let's at least try |
615 | * to get a message out. | 578 | * to get a message out. |
616 | */ | 579 | */ |
617 | printk(str, smp_processor_id()); | 580 | printk(KERN_EMERG "%s", str); |
581 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
582 | smp_processor_id(), regs->ip); | ||
618 | show_registers(regs); | 583 | show_registers(regs); |
619 | if (kexec_should_crash(current)) | 584 | if (kexec_should_crash(current)) |
620 | crash_kexec(regs); | 585 | crash_kexec(regs); |
@@ -626,44 +591,44 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) | |||
626 | do_exit(SIGBUS); | 591 | do_exit(SIGBUS); |
627 | } | 592 | } |
628 | 593 | ||
629 | static void __kprobes do_trap(int trapnr, int signr, char *str, | 594 | static void __kprobes |
630 | struct pt_regs * regs, long error_code, | 595 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, |
631 | siginfo_t *info) | 596 | long error_code, siginfo_t *info) |
632 | { | 597 | { |
633 | struct task_struct *tsk = current; | 598 | struct task_struct *tsk = current; |
634 | 599 | ||
635 | if (user_mode(regs)) { | 600 | if (!user_mode(regs)) |
636 | /* | 601 | goto kernel_trap; |
637 | * We want error_code and trap_no set for userspace | ||
638 | * faults and kernelspace faults which result in | ||
639 | * die(), but not kernelspace faults which are fixed | ||
640 | * up. die() gives the process no chance to handle | ||
641 | * the signal and notice the kernel fault information, | ||
642 | * so that won't result in polluting the information | ||
643 | * about previously queued, but not yet delivered, | ||
644 | * faults. See also do_general_protection below. | ||
645 | */ | ||
646 | tsk->thread.error_code = error_code; | ||
647 | tsk->thread.trap_no = trapnr; | ||
648 | |||
649 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
650 | printk_ratelimit()) { | ||
651 | printk(KERN_INFO | ||
652 | "%s[%d] trap %s ip:%lx sp:%lx error:%lx", | ||
653 | tsk->comm, tsk->pid, str, | ||
654 | regs->ip, regs->sp, error_code); | ||
655 | print_vma_addr(" in ", regs->ip); | ||
656 | printk("\n"); | ||
657 | } | ||
658 | 602 | ||
659 | if (info) | 603 | /* |
660 | force_sig_info(signr, info, tsk); | 604 | * We want error_code and trap_no set for userspace faults and |
661 | else | 605 | * kernelspace faults which result in die(), but not |
662 | force_sig(signr, tsk); | 606 | * kernelspace faults which are fixed up. die() gives the |
663 | return; | 607 | * process no chance to handle the signal and notice the |
608 | * kernel fault information, so that won't result in polluting | ||
609 | * the information about previously queued, but not yet | ||
610 | * delivered, faults. See also do_general_protection below. | ||
611 | */ | ||
612 | tsk->thread.error_code = error_code; | ||
613 | tsk->thread.trap_no = trapnr; | ||
614 | |||
615 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
616 | printk_ratelimit()) { | ||
617 | printk(KERN_INFO | ||
618 | "%s[%d] trap %s ip:%lx sp:%lx error:%lx", | ||
619 | tsk->comm, tsk->pid, str, | ||
620 | regs->ip, regs->sp, error_code); | ||
621 | print_vma_addr(" in ", regs->ip); | ||
622 | printk("\n"); | ||
664 | } | 623 | } |
665 | 624 | ||
625 | if (info) | ||
626 | force_sig_info(signr, info, tsk); | ||
627 | else | ||
628 | force_sig(signr, tsk); | ||
629 | return; | ||
666 | 630 | ||
631 | kernel_trap: | ||
667 | if (!fixup_exception(regs)) { | 632 | if (!fixup_exception(regs)) { |
668 | tsk->thread.error_code = error_code; | 633 | tsk->thread.error_code = error_code; |
669 | tsk->thread.trap_no = trapnr; | 634 | tsk->thread.trap_no = trapnr; |
@@ -673,41 +638,39 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, | |||
673 | } | 638 | } |
674 | 639 | ||
675 | #define DO_ERROR(trapnr, signr, str, name) \ | 640 | #define DO_ERROR(trapnr, signr, str, name) \ |
676 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | 641 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ |
677 | { \ | 642 | { \ |
678 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 643 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
679 | == NOTIFY_STOP) \ | 644 | == NOTIFY_STOP) \ |
680 | return; \ | 645 | return; \ |
681 | conditional_sti(regs); \ | 646 | conditional_sti(regs); \ |
682 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | 647 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ |
683 | } | 648 | } |
684 | 649 | ||
685 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | 650 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ |
686 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | 651 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ |
687 | { \ | 652 | { \ |
688 | siginfo_t info; \ | 653 | siginfo_t info; \ |
689 | info.si_signo = signr; \ | 654 | info.si_signo = signr; \ |
690 | info.si_errno = 0; \ | 655 | info.si_errno = 0; \ |
691 | info.si_code = sicode; \ | 656 | info.si_code = sicode; \ |
692 | info.si_addr = (void __user *)siaddr; \ | 657 | info.si_addr = (void __user *)siaddr; \ |
693 | trace_hardirqs_fixup(); \ | 658 | trace_hardirqs_fixup(); \ |
694 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 659 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
695 | == NOTIFY_STOP) \ | 660 | == NOTIFY_STOP) \ |
696 | return; \ | 661 | return; \ |
697 | conditional_sti(regs); \ | 662 | conditional_sti(regs); \ |
698 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | 663 | do_trap(trapnr, signr, str, regs, error_code, &info); \ |
699 | } | 664 | } |
700 | 665 | ||
701 | DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | 666 | DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) |
702 | DO_ERROR( 4, SIGSEGV, "overflow", overflow) | 667 | DO_ERROR(4, SIGSEGV, "overflow", overflow) |
703 | DO_ERROR( 5, SIGSEGV, "bounds", bounds) | 668 | DO_ERROR(5, SIGSEGV, "bounds", bounds) |
704 | DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) | 669 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) |
705 | DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) | 670 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) |
706 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
707 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | 671 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) |
708 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | 672 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) |
709 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | 673 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) |
710 | DO_ERROR(18, SIGSEGV, "reserved", reserved) | ||
711 | 674 | ||
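These macros stamp out one nearly identical handler per vector. As an illustration (not part of the patch), expanding DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) with the definition above yields roughly:

```c
asmlinkage void do_invalid_TSS(struct pt_regs *regs, long error_code)
{
	if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
			== NOTIFY_STOP)
		return;
	conditional_sti(regs);
	do_trap(10, SIGSEGV, "invalid TSS", regs, error_code, NULL);
}
```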
712 | /* Runs on IST stack */ | 675 | /* Runs on IST stack */ |
713 | asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) | 676 | asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) |
@@ -737,31 +700,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) | |||
737 | die(str, regs, error_code); | 700 | die(str, regs, error_code); |
738 | } | 701 | } |
739 | 702 | ||
740 | asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, | 703 | asmlinkage void __kprobes |
741 | long error_code) | 704 | do_general_protection(struct pt_regs *regs, long error_code) |
742 | { | 705 | { |
743 | struct task_struct *tsk = current; | 706 | struct task_struct *tsk; |
744 | 707 | ||
745 | conditional_sti(regs); | 708 | conditional_sti(regs); |
746 | 709 | ||
747 | if (user_mode(regs)) { | 710 | tsk = current; |
748 | tsk->thread.error_code = error_code; | 711 | if (!user_mode(regs)) |
749 | tsk->thread.trap_no = 13; | 712 | goto gp_in_kernel; |
750 | |||
751 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
752 | printk_ratelimit()) { | ||
753 | printk(KERN_INFO | ||
754 | "%s[%d] general protection ip:%lx sp:%lx error:%lx", | ||
755 | tsk->comm, tsk->pid, | ||
756 | regs->ip, regs->sp, error_code); | ||
757 | print_vma_addr(" in ", regs->ip); | ||
758 | printk("\n"); | ||
759 | } | ||
760 | 713 | ||
761 | force_sig(SIGSEGV, tsk); | 714 | tsk->thread.error_code = error_code; |
762 | return; | 715 | tsk->thread.trap_no = 13; |
763 | } | 716 | |
717 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
718 | printk_ratelimit()) { | ||
719 | printk(KERN_INFO | ||
720 | "%s[%d] general protection ip:%lx sp:%lx error:%lx", | ||
721 | tsk->comm, tsk->pid, | ||
722 | regs->ip, regs->sp, error_code); | ||
723 | print_vma_addr(" in ", regs->ip); | ||
724 | printk("\n"); | ||
725 | } | ||
726 | |||
727 | force_sig(SIGSEGV, tsk); | ||
728 | return; | ||
764 | 729 | ||
730 | gp_in_kernel: | ||
765 | if (fixup_exception(regs)) | 731 | if (fixup_exception(regs)) |
766 | return; | 732 | return; |
767 | 733 | ||
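On the kernel-mode path the handler now tries fixup_exception() before anything else. That helper consults the exception table, which pairs each instruction that is allowed to fault (for example a user-memory access inside copy_from_user()) with a recovery address. A hedged, single-entry sketch of the mechanism; the real table is sorted and searched, and the struct name here is assumed:

```c
struct extable_entry_sketch {
	unsigned long insn;	/* instruction that may fault */
	unsigned long fixup;	/* where to resume instead of oopsing */
};

static int fixup_exception_sketch(struct pt_regs *regs,
				  const struct extable_entry_sketch *e)
{
	if (regs->ip == e->insn) {
		regs->ip = e->fixup;	/* jump to the recovery stub */
		return 1;		/* handled, no oops */
	}
	return 0;
}
```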
@@ -774,14 +740,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, | |||
774 | } | 740 | } |
775 | 741 | ||
776 | static notrace __kprobes void | 742 | static notrace __kprobes void |
777 | mem_parity_error(unsigned char reason, struct pt_regs * regs) | 743 | mem_parity_error(unsigned char reason, struct pt_regs *regs) |
778 | { | 744 | { |
779 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | 745 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", |
780 | reason); | 746 | reason); |
781 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | 747 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); |
782 | 748 | ||
783 | #if defined(CONFIG_EDAC) | 749 | #if defined(CONFIG_EDAC) |
784 | if(edac_handler_set()) { | 750 | if (edac_handler_set()) { |
785 | edac_atomic_assert_error(); | 751 | edac_atomic_assert_error(); |
786 | return; | 752 | return; |
787 | } | 753 | } |
@@ -798,7 +764,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) | |||
798 | } | 764 | } |
799 | 765 | ||
800 | static notrace __kprobes void | 766 | static notrace __kprobes void |
801 | io_check_error(unsigned char reason, struct pt_regs * regs) | 767 | io_check_error(unsigned char reason, struct pt_regs *regs) |
802 | { | 768 | { |
803 | printk("NMI: IOCK error (debug interrupt?)\n"); | 769 | printk("NMI: IOCK error (debug interrupt?)\n"); |
804 | show_registers(regs); | 770 | show_registers(regs); |
@@ -828,14 +794,14 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | |||
828 | 794 | ||
829 | /* Runs on IST stack. This code must keep interrupts off all the time. | 795 | /* Runs on IST stack. This code must keep interrupts off all the time. |
830 | Nested NMIs are prevented by the CPU. */ | 796 | Nested NMIs are prevented by the CPU. */ |
831 | asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) | 797 | asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) |
832 | { | 798 | { |
833 | unsigned char reason = 0; | 799 | unsigned char reason = 0; |
834 | int cpu; | 800 | int cpu; |
835 | 801 | ||
836 | cpu = smp_processor_id(); | 802 | cpu = smp_processor_id(); |
837 | 803 | ||
838 | /* Only the BSP gets external NMIs from the system. */ | 804 | /* Only the BSP gets external NMIs from the system. */ |
839 | if (!cpu) | 805 | if (!cpu) |
840 | reason = get_nmi_reason(); | 806 | reason = get_nmi_reason(); |
841 | 807 | ||
@@ -847,32 +813,57 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
847 | * Ok, so this is none of the documented NMI sources, | 813 | * Ok, so this is none of the documented NMI sources, |
848 | * so it must be the NMI watchdog. | 814 | * so it must be the NMI watchdog. |
849 | */ | 815 | */ |
850 | if (nmi_watchdog_tick(regs,reason)) | 816 | if (nmi_watchdog_tick(regs, reason)) |
851 | return; | 817 | return; |
852 | if (!do_nmi_callback(regs,cpu)) | 818 | if (!do_nmi_callback(regs, cpu)) |
853 | unknown_nmi_error(reason, regs); | 819 | unknown_nmi_error(reason, regs); |
854 | 820 | ||
855 | return; | 821 | return; |
856 | } | 822 | } |
857 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | 823 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) |
858 | return; | 824 | return; |
859 | 825 | ||
860 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | 826 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ |
861 | |||
862 | if (reason & 0x80) | 827 | if (reason & 0x80) |
863 | mem_parity_error(reason, regs); | 828 | mem_parity_error(reason, regs); |
864 | if (reason & 0x40) | 829 | if (reason & 0x40) |
865 | io_check_error(reason, regs); | 830 | io_check_error(reason, regs); |
866 | } | 831 | } |
867 | 832 | ||
833 | asmlinkage notrace __kprobes void | ||
834 | do_nmi(struct pt_regs *regs, long error_code) | ||
835 | { | ||
836 | nmi_enter(); | ||
837 | |||
838 | add_pda(__nmi_count, 1); | ||
839 | |||
840 | if (!ignore_nmis) | ||
841 | default_do_nmi(regs); | ||
842 | |||
843 | nmi_exit(); | ||
844 | } | ||
845 | |||
846 | void stop_nmi(void) | ||
847 | { | ||
848 | acpi_nmi_disable(); | ||
849 | ignore_nmis++; | ||
850 | } | ||
851 | |||
852 | void restart_nmi(void) | ||
853 | { | ||
854 | ignore_nmis--; | ||
855 | acpi_nmi_enable(); | ||
856 | } | ||
857 | |||
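do_nmi() now honours the ignore_nmis count, and stop_nmi()/restart_nmi() give other code a way to bracket sections that must not observe NMIs; the counter nests, and the ACPI NMI source is masked as well. A hypothetical pairing (the caller name is assumed):

```c
static void reconfigure_fragile_state_sketch(void)
{
	stop_nmi();		/* mask ACPI NMI source, bump ignore_nmis */
	/* ... touch state an NMI handler must not see half-updated ... */
	restart_nmi();		/* drop ignore_nmis, unmask the source */
}
```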
868 | /* runs on IST stack. */ | 858 | /* runs on IST stack. */ |
869 | asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) | 859 | asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) |
870 | { | 860 | { |
871 | trace_hardirqs_fixup(); | 861 | trace_hardirqs_fixup(); |
872 | 862 | ||
873 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { | 863 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) |
864 | == NOTIFY_STOP) | ||
874 | return; | 865 | return; |
875 | } | 866 | |
876 | preempt_conditional_sti(regs); | 867 | preempt_conditional_sti(regs); |
877 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | 868 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); |
878 | preempt_conditional_cli(regs); | 869 | preempt_conditional_cli(regs); |
@@ -903,8 +894,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | |||
903 | asmlinkage void __kprobes do_debug(struct pt_regs * regs, | 894 | asmlinkage void __kprobes do_debug(struct pt_regs * regs, |
904 | unsigned long error_code) | 895 | unsigned long error_code) |
905 | { | 896 | { |
906 | unsigned long condition; | ||
907 | struct task_struct *tsk = current; | 897 | struct task_struct *tsk = current; |
898 | unsigned long condition; | ||
908 | siginfo_t info; | 899 | siginfo_t info; |
909 | 900 | ||
910 | trace_hardirqs_fixup(); | 901 | trace_hardirqs_fixup(); |
@@ -925,21 +916,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, | |||
925 | 916 | ||
926 | /* Mask out spurious debug traps due to lazy DR7 setting */ | 917 | /* Mask out spurious debug traps due to lazy DR7 setting */ |
927 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | 918 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { |
928 | if (!tsk->thread.debugreg7) { | 919 | if (!tsk->thread.debugreg7) |
929 | goto clear_dr7; | 920 | goto clear_dr7; |
930 | } | ||
931 | } | 921 | } |
932 | 922 | ||
933 | tsk->thread.debugreg6 = condition; | 923 | tsk->thread.debugreg6 = condition; |
934 | 924 | ||
935 | |||
936 | /* | 925 | /* |
937 | * Single-stepping through TF: make sure we ignore any events in | 926 | * Single-stepping through TF: make sure we ignore any events in |
938 | * kernel space (but re-enable TF when returning to user mode). | 927 | * kernel space (but re-enable TF when returning to user mode). |
939 | */ | 928 | */ |
940 | if (condition & DR_STEP) { | 929 | if (condition & DR_STEP) { |
941 | if (!user_mode(regs)) | 930 | if (!user_mode(regs)) |
942 | goto clear_TF_reenable; | 931 | goto clear_TF_reenable; |
943 | } | 932 | } |
944 | 933 | ||
945 | /* Ok, finally something we can handle */ | 934 | /* Ok, finally something we can handle */ |
@@ -952,7 +941,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, | |||
952 | force_sig_info(SIGTRAP, &info, tsk); | 941 | force_sig_info(SIGTRAP, &info, tsk); |
953 | 942 | ||
954 | clear_dr7: | 943 | clear_dr7: |
955 | set_debugreg(0UL, 7); | 944 | set_debugreg(0, 7); |
956 | preempt_conditional_cli(regs); | 945 | preempt_conditional_cli(regs); |
957 | return; | 946 | return; |
958 | 947 | ||
@@ -960,6 +949,7 @@ clear_TF_reenable: | |||
960 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | 949 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); |
961 | regs->flags &= ~X86_EFLAGS_TF; | 950 | regs->flags &= ~X86_EFLAGS_TF; |
962 | preempt_conditional_cli(regs); | 951 | preempt_conditional_cli(regs); |
952 | return; | ||
963 | } | 953 | } |
964 | 954 | ||
965 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | 955 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) |
@@ -982,7 +972,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | |||
982 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | 972 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) |
983 | { | 973 | { |
984 | void __user *ip = (void __user *)(regs->ip); | 974 | void __user *ip = (void __user *)(regs->ip); |
985 | struct task_struct * task; | 975 | struct task_struct *task; |
986 | siginfo_t info; | 976 | siginfo_t info; |
987 | unsigned short cwd, swd; | 977 | unsigned short cwd, swd; |
988 | 978 | ||
@@ -1015,30 +1005,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) | |||
1015 | cwd = get_fpu_cwd(task); | 1005 | cwd = get_fpu_cwd(task); |
1016 | swd = get_fpu_swd(task); | 1006 | swd = get_fpu_swd(task); |
1017 | switch (swd & ~cwd & 0x3f) { | 1007 | switch (swd & ~cwd & 0x3f) { |
1018 | case 0x000: | 1008 | case 0x000: /* No unmasked exception */ |
1019 | default: | 1009 | default: /* Multiple exceptions */ |
1020 | break; | 1010 | break; |
1021 | case 0x001: /* Invalid Op */ | 1011 | case 0x001: /* Invalid Op */ |
1022 | /* | 1012 | /* |
1023 | * swd & 0x240 == 0x040: Stack Underflow | 1013 | * swd & 0x240 == 0x040: Stack Underflow |
1024 | * swd & 0x240 == 0x240: Stack Overflow | 1014 | * swd & 0x240 == 0x240: Stack Overflow |
1025 | * User must clear the SF bit (0x40) if set | 1015 | * User must clear the SF bit (0x40) if set |
1026 | */ | 1016 | */ |
1027 | info.si_code = FPE_FLTINV; | 1017 | info.si_code = FPE_FLTINV; |
1028 | break; | 1018 | break; |
1029 | case 0x002: /* Denormalize */ | 1019 | case 0x002: /* Denormalize */ |
1030 | case 0x010: /* Underflow */ | 1020 | case 0x010: /* Underflow */ |
1031 | info.si_code = FPE_FLTUND; | 1021 | info.si_code = FPE_FLTUND; |
1032 | break; | 1022 | break; |
1033 | case 0x004: /* Zero Divide */ | 1023 | case 0x004: /* Zero Divide */ |
1034 | info.si_code = FPE_FLTDIV; | 1024 | info.si_code = FPE_FLTDIV; |
1035 | break; | 1025 | break; |
1036 | case 0x008: /* Overflow */ | 1026 | case 0x008: /* Overflow */ |
1037 | info.si_code = FPE_FLTOVF; | 1027 | info.si_code = FPE_FLTOVF; |
1038 | break; | 1028 | break; |
1039 | case 0x020: /* Precision */ | 1029 | case 0x020: /* Precision */ |
1040 | info.si_code = FPE_FLTRES; | 1030 | info.si_code = FPE_FLTRES; |
1041 | break; | 1031 | break; |
1042 | } | 1032 | } |
1043 | force_sig_info(SIGFPE, &info, task); | 1033 | force_sig_info(SIGFPE, &info, task); |
1044 | } | 1034 | } |
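The switch keys on `swd & ~cwd & 0x3f`: the low six bits of the x87 status word are the exception flags, the same bit positions in the control word are their masks, so the expression keeps only exceptions that both fired and are unmasked. A standalone decode sketch (bit names per the x87 architecture; the helper name is assumed):

```c
#define X87_IE	0x001	/* invalid operation */
#define X87_DE	0x002	/* denormalized operand */
#define X87_ZE	0x004	/* zero divide */
#define X87_OE	0x008	/* overflow */
#define X87_UE	0x010	/* underflow */
#define X87_PE	0x020	/* precision */

static unsigned unmasked_x87_sketch(unsigned short swd, unsigned short cwd)
{
	return swd & ~cwd & 0x3f;	/* fired AND not masked */
}
```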
@@ -1051,7 +1041,7 @@ asmlinkage void bad_intr(void) | |||
1051 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | 1041 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) |
1052 | { | 1042 | { |
1053 | void __user *ip = (void __user *)(regs->ip); | 1043 | void __user *ip = (void __user *)(regs->ip); |
1054 | struct task_struct * task; | 1044 | struct task_struct *task; |
1055 | siginfo_t info; | 1045 | siginfo_t info; |
1056 | unsigned short mxcsr; | 1046 | unsigned short mxcsr; |
1057 | 1047 | ||
@@ -1079,25 +1069,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | |||
1079 | */ | 1069 | */ |
1080 | mxcsr = get_fpu_mxcsr(task); | 1070 | mxcsr = get_fpu_mxcsr(task); |
1081 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | 1071 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { |
1082 | case 0x000: | 1072 | case 0x000: |
1083 | default: | 1073 | default: |
1084 | break; | 1074 | break; |
1085 | case 0x001: /* Invalid Op */ | 1075 | case 0x001: /* Invalid Op */ |
1086 | info.si_code = FPE_FLTINV; | 1076 | info.si_code = FPE_FLTINV; |
1087 | break; | 1077 | break; |
1088 | case 0x002: /* Denormalize */ | 1078 | case 0x002: /* Denormalize */ |
1089 | case 0x010: /* Underflow */ | 1079 | case 0x010: /* Underflow */ |
1090 | info.si_code = FPE_FLTUND; | 1080 | info.si_code = FPE_FLTUND; |
1091 | break; | 1081 | break; |
1092 | case 0x004: /* Zero Divide */ | 1082 | case 0x004: /* Zero Divide */ |
1093 | info.si_code = FPE_FLTDIV; | 1083 | info.si_code = FPE_FLTDIV; |
1094 | break; | 1084 | break; |
1095 | case 0x008: /* Overflow */ | 1085 | case 0x008: /* Overflow */ |
1096 | info.si_code = FPE_FLTOVF; | 1086 | info.si_code = FPE_FLTOVF; |
1097 | break; | 1087 | break; |
1098 | case 0x020: /* Precision */ | 1088 | case 0x020: /* Precision */ |
1099 | info.si_code = FPE_FLTRES; | 1089 | info.si_code = FPE_FLTRES; |
1100 | break; | 1090 | break; |
1101 | } | 1091 | } |
1102 | force_sig_info(SIGFPE, &info, task); | 1092 | force_sig_info(SIGFPE, &info, task); |
1103 | } | 1093 | } |
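The MXCSR decode is the same idea with a twist: the SSE exception flags sit in bits 0-5 and their mask bits in 7-12, so shifting the mask field right by 7 lines it up with the flags before the and-not. In isolation:

```c
static unsigned unmasked_sse_sketch(unsigned int mxcsr)
{
	/* masks (bits 7-12) shifted down to overlay the flags (bits 0-5) */
	return ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f);
}
```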
@@ -1115,7 +1105,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | |||
1115 | } | 1105 | } |
1116 | 1106 | ||
1117 | /* | 1107 | /* |
1118 | * 'math_state_restore()' saves the current math information in the | 1108 | * 'math_state_restore()' saves the current math information in the |
1119 | * old math state array, and gets the new ones from the current task | 1109 | * old math state array, and gets the new ones from the current task |
1120 | * | 1110 | * |
1121 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | 1111 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. |
@@ -1140,7 +1130,7 @@ asmlinkage void math_state_restore(void) | |||
1140 | local_irq_disable(); | 1130 | local_irq_disable(); |
1141 | } | 1131 | } |
1142 | 1132 | ||
1143 | clts(); /* Allow maths ops (or we recurse) */ | 1133 | clts(); /* Allow maths ops (or we recurse) */ |
1144 | restore_fpu_checking(&me->thread.xstate->fxsave); | 1134 | restore_fpu_checking(&me->thread.xstate->fxsave); |
1145 | task_thread_info(me)->status |= TS_USEDFPU; | 1135 | task_thread_info(me)->status |= TS_USEDFPU; |
1146 | me->fpu_counter++; | 1136 | me->fpu_counter++; |
@@ -1149,64 +1139,61 @@ EXPORT_SYMBOL_GPL(math_state_restore); | |||
1149 | 1139 | ||
1150 | void __init trap_init(void) | 1140 | void __init trap_init(void) |
1151 | { | 1141 | { |
1152 | set_intr_gate(0,÷_error); | 1142 | set_intr_gate(0, ÷_error); |
1153 | set_intr_gate_ist(1,&debug,DEBUG_STACK); | 1143 | set_intr_gate_ist(1, &debug, DEBUG_STACK); |
1154 | set_intr_gate_ist(2,&nmi,NMI_STACK); | 1144 | set_intr_gate_ist(2, &nmi, NMI_STACK); |
1155 | set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ | 1145 | set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ |
1156 | set_system_gate(4,&overflow); /* int4 can be called from all */ | 1146 | set_system_gate(4, &overflow); /* int4 can be called from all */ |
1157 | set_intr_gate(5,&bounds); | 1147 | set_intr_gate(5, &bounds); |
1158 | set_intr_gate(6,&invalid_op); | 1148 | set_intr_gate(6, &invalid_op); |
1159 | set_intr_gate(7,&device_not_available); | 1149 | set_intr_gate(7, &device_not_available); |
1160 | set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); | 1150 | set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); |
1161 | set_intr_gate(9,&coprocessor_segment_overrun); | 1151 | set_intr_gate(9, &coprocessor_segment_overrun); |
1162 | set_intr_gate(10,&invalid_TSS); | 1152 | set_intr_gate(10, &invalid_TSS); |
1163 | set_intr_gate(11,&segment_not_present); | 1153 | set_intr_gate(11, &segment_not_present); |
1164 | set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); | 1154 | set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); |
1165 | set_intr_gate(13,&general_protection); | 1155 | set_intr_gate(13, &general_protection); |
1166 | set_intr_gate(14,&page_fault); | 1156 | set_intr_gate(14, &page_fault); |
1167 | set_intr_gate(15,&spurious_interrupt_bug); | 1157 | set_intr_gate(15, &spurious_interrupt_bug); |
1168 | set_intr_gate(16,&coprocessor_error); | 1158 | set_intr_gate(16, &coprocessor_error); |
1169 | set_intr_gate(17,&alignment_check); | 1159 | set_intr_gate(17, &alignment_check); |
1170 | #ifdef CONFIG_X86_MCE | 1160 | #ifdef CONFIG_X86_MCE |
1171 | set_intr_gate_ist(18,&machine_check, MCE_STACK); | 1161 | set_intr_gate_ist(18, &machine_check, MCE_STACK); |
1172 | #endif | 1162 | #endif |
1173 | set_intr_gate(19,&simd_coprocessor_error); | 1163 | set_intr_gate(19, &simd_coprocessor_error); |
1174 | 1164 | ||
1175 | #ifdef CONFIG_IA32_EMULATION | 1165 | #ifdef CONFIG_IA32_EMULATION |
1176 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | 1166 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); |
1177 | #endif | 1167 | #endif |
1178 | |||
1179 | /* | 1168 | /* |
1180 | * initialize the per thread extended state: | 1169 | * initialize the per thread extended state: |
1181 | */ | 1170 | */ |
1182 | init_thread_xstate(); | 1171 | init_thread_xstate(); |
1183 | /* | 1172 | /* |
1184 | * Should be a barrier for any external CPU state. | 1173 | * Should be a barrier for any external CPU state: |
1185 | */ | 1174 | */ |
1186 | cpu_init(); | 1175 | cpu_init(); |
1187 | } | 1176 | } |
1188 | 1177 | ||
1189 | |||
1190 | static int __init oops_setup(char *s) | 1178 | static int __init oops_setup(char *s) |
1191 | { | 1179 | { |
1192 | if (!s) | 1180 | if (!s) |
1193 | return -EINVAL; | 1181 | return -EINVAL; |
1194 | if (!strcmp(s, "panic")) | 1182 | if (!strcmp(s, "panic")) |
1195 | panic_on_oops = 1; | 1183 | panic_on_oops = 1; |
1196 | return 0; | 1184 | return 0; |
1197 | } | 1185 | } |
1198 | early_param("oops", oops_setup); | 1186 | early_param("oops", oops_setup); |
1199 | 1187 | ||
1200 | static int __init kstack_setup(char *s) | 1188 | static int __init kstack_setup(char *s) |
1201 | { | 1189 | { |
1202 | if (!s) | 1190 | if (!s) |
1203 | return -EINVAL; | 1191 | return -EINVAL; |
1204 | kstack_depth_to_print = simple_strtoul(s,NULL,0); | 1192 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); |
1205 | return 0; | 1193 | return 0; |
1206 | } | 1194 | } |
1207 | early_param("kstack", kstack_setup); | 1195 | early_param("kstack", kstack_setup); |
1208 | 1196 | ||
1209 | |||
1210 | static int __init code_bytes_setup(char *s) | 1197 | static int __init code_bytes_setup(char *s) |
1211 | { | 1198 | { |
1212 | code_bytes = simple_strtoul(s, NULL, 0); | 1199 | code_bytes = simple_strtoul(s, NULL, 0); |
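All three handlers are early_param hooks, so they run from parse_early_param() before most of boot; booting with `oops=panic kstack=32`, for instance, makes any oops fatal and prints 32 stack words per trace. Adding a knob of the same shape looks like this (the parameter name here is hypothetical):

```c
static unsigned int trace_words = 8;	/* hypothetical tunable */

static int __init trace_words_setup(char *s)
{
	if (!s)
		return -EINVAL;
	trace_words = simple_strtoul(s, NULL, 0);
	return 0;
}
early_param("trace_words", trace_words_setup);
```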
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c new file mode 100644 index 000000000000..7603c0553909 --- /dev/null +++ b/arch/x86/kernel/tsc.c | |||
@@ -0,0 +1,535 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/timer.h> | ||
6 | #include <linux/acpi_pmtmr.h> | ||
7 | #include <linux/cpufreq.h> | ||
8 | #include <linux/dmi.h> | ||
9 | #include <linux/delay.h> | ||
10 | #include <linux/clocksource.h> | ||
11 | #include <linux/percpu.h> | ||
12 | |||
13 | #include <asm/hpet.h> | ||
14 | #include <asm/timer.h> | ||
15 | #include <asm/vgtod.h> | ||
16 | #include <asm/time.h> | ||
17 | #include <asm/delay.h> | ||
18 | |||
19 | unsigned int cpu_khz; /* TSC clocks / usec, not used here */ | ||
20 | EXPORT_SYMBOL(cpu_khz); | ||
21 | unsigned int tsc_khz; | ||
22 | EXPORT_SYMBOL(tsc_khz); | ||
23 | |||
24 | /* | ||
25 | * TSC can be unstable due to cpufreq or due to unsynced TSCs | ||
26 | */ | ||
27 | static int tsc_unstable; | ||
28 | |||
29 | /* native_sched_clock() is called before tsc_init(), so | ||
30 | we must start with the TSC soft disabled to prevent | ||
31 | erroneous rdtsc usage on !cpu_has_tsc processors */ | ||
32 | static int tsc_disabled = -1; | ||
33 | |||
34 | /* | ||
35 | * Scheduler clock - returns current time in nanosec units. | ||
36 | */ | ||
37 | u64 native_sched_clock(void) | ||
38 | { | ||
39 | u64 this_offset; | ||
40 | |||
41 | /* | ||
42 | * Fall back to jiffies if there's no TSC available: | ||
43 | * ( But note that we still use it if the TSC is marked | ||
44 | * unstable. We do this because unlike Time Of Day, | ||
45 | * the scheduler clock tolerates small errors and it's | ||
46 | * very important for it to be as fast as the platform | ||
47 | * can achieve it. ) | ||
48 | */ | ||
49 | if (unlikely(tsc_disabled)) { | ||
50 | /* No locking but a rare wrong value is not a big deal: */ | ||
51 | return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); | ||
52 | } | ||
53 | |||
54 | /* read the Time Stamp Counter: */ | ||
55 | rdtscll(this_offset); | ||
56 | |||
57 | /* return the value in ns */ | ||
58 | return cycles_2_ns(this_offset); | ||
59 | } | ||
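A note on the fallback path above: until tsc_init() flips tsc_disabled, elapsed time is derived purely from the tick counter, so sched_clock() resolution is limited to one jiffy. A minimal userspace sketch of the same arithmetic (the HZ value and tick count are illustrative assumptions, not taken from any particular config):

#include <stdio.h>
#include <stdint.h>

#define HZ 250			/* assumed tick rate; CONFIG_HZ varies */

/* Convert elapsed ticks to nanoseconds, as the fallback above does. */
static uint64_t jiffies_to_ns(uint64_t ticks)
{
	return ticks * (1000000000ULL / HZ);
}

int main(void)
{
	/* 1250 ticks at 250 Hz -> 5 seconds -> 5e9 ns, in 4 ms steps */
	printf("%llu ns\n", (unsigned long long)jiffies_to_ns(1250));
	return 0;
}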
60 | |||
61 | /* We need to define a real function for sched_clock, to override the | ||
62 | weak default version */ | ||
63 | #ifdef CONFIG_PARAVIRT | ||
64 | unsigned long long sched_clock(void) | ||
65 | { | ||
66 | return paravirt_sched_clock(); | ||
67 | } | ||
68 | #else | ||
69 | unsigned long long | ||
70 | sched_clock(void) __attribute__((alias("native_sched_clock"))); | ||
71 | #endif | ||
72 | |||
73 | int check_tsc_unstable(void) | ||
74 | { | ||
75 | return tsc_unstable; | ||
76 | } | ||
77 | EXPORT_SYMBOL_GPL(check_tsc_unstable); | ||
78 | |||
79 | #ifdef CONFIG_X86_TSC | ||
80 | int __init notsc_setup(char *str) | ||
81 | { | ||
82 | printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " | ||
83 | "cannot disable TSC completely.\n"); | ||
84 | tsc_disabled = 1; | ||
85 | return 1; | ||
86 | } | ||
87 | #else | ||
88 | /* | ||
89 | * disable flag for tsc. Takes effect by clearing the TSC cpu flag | ||
90 | * in cpu/common.c | ||
91 | */ | ||
92 | int __init notsc_setup(char *str) | ||
93 | { | ||
94 | setup_clear_cpu_cap(X86_FEATURE_TSC); | ||
95 | return 1; | ||
96 | } | ||
97 | #endif | ||
98 | |||
99 | __setup("notsc", notsc_setup); | ||
100 | |||
101 | #define MAX_RETRIES 5 | ||
102 | #define SMI_TRESHOLD 50000 | ||
103 | |||
104 | /* | ||
105 | * Read TSC and the reference counters. Take care of SMI disturbance | ||
106 | */ | ||
107 | static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) | ||
108 | { | ||
109 | u64 t1, t2; | ||
110 | int i; | ||
111 | |||
112 | for (i = 0; i < MAX_RETRIES; i++) { | ||
113 | t1 = get_cycles(); | ||
114 | if (hpet) | ||
115 | *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; | ||
116 | else | ||
117 | *pm = acpi_pm_read_early(); | ||
118 | t2 = get_cycles(); | ||
119 | if ((t2 - t1) < SMI_TRESHOLD) | ||
120 | return t2; | ||
121 | } | ||
122 | return ULLONG_MAX; | ||
123 | } | ||
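tsc_read_refs() uses a classic bracketing pattern: read the TSC before and after the slow reference read, and trust the sample only if the bracket is tight, since an SMI in between would inflate it. A standalone sketch of the pattern with stubbed counters (the stubs and their costs are invented for illustration; only the retry/threshold structure mirrors the code above):

#include <stdint.h>
#include <stdio.h>

#define MAX_RETRIES 5
#define SMI_THRESHOLD 50000	/* max TSC cycles a clean sample may span */

/* Stand-ins for rdtsc and the reference-counter read. */
static uint64_t fake_tsc;
static uint64_t read_tsc_stub(void) { return fake_tsc; }
static uint32_t read_ref_stub(uint64_t cost) { fake_tsc += cost; return 42; }

/* Retry until one sample is bracketed tightly enough to be trusted. */
static int sample_ref(uint32_t *ref, uint64_t cost_per_read)
{
	int i;

	for (i = 0; i < MAX_RETRIES; i++) {
		uint64_t t1 = read_tsc_stub();
		*ref = read_ref_stub(cost_per_read);
		uint64_t t2 = read_tsc_stub();

		if (t2 - t1 < SMI_THRESHOLD)
			return 0;	/* clean sample */
	}
	return -1;			/* persistently disturbed */
}

int main(void)
{
	uint32_t ref;

	printf("fast read:  %d\n", sample_ref(&ref, 1000));	/* 0 */
	printf("slow reads: %d\n", sample_ref(&ref, 100000));	/* -1 */
	return 0;
}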
124 | |||
125 | /** | ||
126 | * native_calibrate_tsc - calibrate the tsc on boot | ||
127 | */ | ||
128 | unsigned long native_calibrate_tsc(void) | ||
129 | { | ||
130 | unsigned long flags; | ||
131 | u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; | ||
132 | int hpet = is_hpet_enabled(); | ||
133 | unsigned int tsc_khz_val = 0; | ||
134 | |||
135 | local_irq_save(flags); | ||
136 | |||
137 | tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); | ||
138 | |||
139 | outb((inb(0x61) & ~0x02) | 0x01, 0x61); | ||
140 | |||
141 | outb(0xb0, 0x43); | ||
142 | outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); | ||
143 | outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); | ||
144 | tr1 = get_cycles(); | ||
145 | while ((inb(0x61) & 0x20) == 0); | ||
146 | tr2 = get_cycles(); | ||
147 | |||
148 | tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); | ||
149 | |||
150 | local_irq_restore(flags); | ||
151 | |||
152 | /* | ||
153 | * Preset the result with the raw and inaccurate PIT | ||
154 | * calibration value | ||
155 | */ | ||
156 | delta = (tr2 - tr1); | ||
157 | do_div(delta, 50); | ||
158 | tsc_khz_val = delta; | ||
159 | |||
160 | /* hpet or pmtimer available? */ | ||
161 | if (!hpet && !pm1 && !pm2) { | ||
162 | printk(KERN_INFO "TSC calibrated against PIT\n"); | ||
163 | goto out; | ||
164 | } | ||
165 | |||
166 | /* Check whether the sampling was disturbed by an SMI */ | ||
167 | if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { | ||
168 | printk(KERN_WARNING "TSC calibration disturbed by SMI, " | ||
169 | "using PIT calibration result\n"); | ||
170 | goto out; | ||
171 | } | ||
172 | |||
173 | tsc2 = (tsc2 - tsc1) * 1000000LL; | ||
174 | |||
175 | if (hpet) { | ||
176 | printk(KERN_INFO "TSC calibrated against HPET\n"); | ||
177 | if (hpet2 < hpet1) | ||
178 | hpet2 += 0x100000000ULL; | ||
179 | hpet2 -= hpet1; | ||
180 | tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); | ||
181 | do_div(tsc1, 1000000); | ||
182 | } else { | ||
183 | printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); | ||
184 | if (pm2 < pm1) | ||
185 | pm2 += (u64)ACPI_PM_OVRRUN; | ||
186 | pm2 -= pm1; | ||
187 | tsc1 = pm2 * 1000000000LL; | ||
188 | do_div(tsc1, PMTMR_TICKS_PER_SEC); | ||
189 | } | ||
190 | |||
191 | do_div(tsc2, tsc1); | ||
192 | tsc_khz_val = tsc2; | ||
193 | |||
194 | out: | ||
195 | return tsc_khz_val; | ||
196 | } | ||
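The arithmetic above works out as follows: the PIT gate is programmed for 50 ms, so the TSC cycles counted across the gate divided by 50 is already kHz; the HPET/PM-timer cross-check converts the reference delta to nanoseconds and computes cycles * 10^6 / ns, which is kHz as well. A worked example with invented sample values (not measurements):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative samples for a ~2.4 GHz TSC; not measured values. */
	uint64_t pit_cycles = 120000000;  /* TSC cycles across the 50 ms PIT gate */
	uint64_t tsc_delta  = 240001000;  /* TSC cycles between the reference reads */
	uint64_t ref_ns     = 100000500;  /* elapsed ns per HPET/PM timer (~100 ms) */

	/* Raw PIT preset: cycles per 50 ms, i.e. cycles-per-ms == kHz. */
	uint64_t pit_khz = pit_cycles / 50;

	/* Reference-corrected value: cycles * 1e6 / ns == kHz. */
	uint64_t ref_khz = tsc_delta * 1000000ULL / ref_ns;

	printf("PIT preset: %llu kHz, reference: %llu kHz\n",
	       (unsigned long long)pit_khz, (unsigned long long)ref_khz);
	return 0;
}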
197 | |||
198 | |||
199 | #ifdef CONFIG_X86_32 | ||
200 | /* Only called from the Powernow K7 cpu freq driver */ | ||
201 | int recalibrate_cpu_khz(void) | ||
202 | { | ||
203 | #ifndef CONFIG_SMP | ||
204 | unsigned long cpu_khz_old = cpu_khz; | ||
205 | |||
206 | if (cpu_has_tsc) { | ||
207 | tsc_khz = calibrate_tsc(); | ||
208 | cpu_khz = tsc_khz; | ||
209 | cpu_data(0).loops_per_jiffy = | ||
210 | cpufreq_scale(cpu_data(0).loops_per_jiffy, | ||
211 | cpu_khz_old, cpu_khz); | ||
212 | return 0; | ||
213 | } else | ||
214 | return -ENODEV; | ||
215 | #else | ||
216 | return -ENODEV; | ||
217 | #endif | ||
218 | } | ||
219 | |||
220 | EXPORT_SYMBOL(recalibrate_cpu_khz); | ||
221 | |||
222 | #endif /* CONFIG_X86_32 */ | ||
223 | |||
224 | /* Accelerators for sched_clock() | ||
225 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
226 | * basic equation: | ||
227 | * ns = cycles / (freq / ns_per_sec) | ||
228 | * ns = cycles * (ns_per_sec / freq) | ||
229 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
230 | * ns = cycles * (10^6 / cpu_khz) | ||
231 | * | ||
232 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
233 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
234 | * ns = cycles * cyc2ns_scale / SC | ||
235 | * | ||
236 | * And since SC is a constant power of two, we can convert the div | ||
237 | * into a shift. | ||
238 | * | ||
239 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
240 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
241 | * (mathieu.desnoyers@polymtl.ca) | ||
242 | * | ||
243 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
244 | */ | ||
245 | |||
246 | DEFINE_PER_CPU(unsigned long, cyc2ns); | ||
247 | |||
248 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | ||
249 | { | ||
250 | unsigned long long tsc_now, ns_now; | ||
251 | unsigned long flags, *scale; | ||
252 | |||
253 | local_irq_save(flags); | ||
254 | sched_clock_idle_sleep_event(); | ||
255 | |||
256 | scale = &per_cpu(cyc2ns, cpu); | ||
257 | |||
258 | rdtscll(tsc_now); | ||
259 | ns_now = __cycles_2_ns(tsc_now); | ||
260 | |||
261 | if (cpu_khz) | ||
262 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; | ||
263 | |||
264 | sched_clock_idle_wakeup_event(0); | ||
265 | local_irq_restore(flags); | ||
266 | } | ||
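set_cyc2ns_scale() implements the multiply-and-shift scheme from the big comment above: divide once at calibration time, then the hot path is one multiply and one shift. A self-contained sketch with a worked number (the 10-bit scale factor matches the comment's SC = 2^10, but check CYC2NS_SCALE_FACTOR in your tree before relying on it):

#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* assumed; SC = 2^10 per the comment above */
#define NSEC_PER_MSEC 1000000UL

int main(void)
{
	unsigned long cpu_khz = 2400000;	/* 2.4 GHz, for illustration */

	/* Precomputed once per (re)calibration: */
	unsigned long scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;

	/* Hot path: one multiply and one shift per conversion. */
	uint64_t cycles = 4800000000ULL;	/* two seconds worth of cycles */
	uint64_t ns = (cycles * scale) >> CYC2NS_SCALE_FACTOR;

	/* Expect ~2e9 ns; truncating the scale costs ~0.16% here. */
	printf("scale=%lu, %llu cycles -> %llu ns\n",
	       scale, (unsigned long long)cycles, (unsigned long long)ns);
	return 0;
}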
267 | |||
268 | #ifdef CONFIG_CPU_FREQ | ||
269 | |||
270 | /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency | ||
271 | * changes. | ||
272 | * | ||
273 | * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's | ||
274 | * not that important because current Opteron setups do not support | ||
275 | * scaling on SMP anyway. | ||
276 | * | ||
277 | * Should fix up last_tsc too. Currently gettimeofday in the | ||
278 | * first tick after the change will be slightly wrong. | ||
279 | */ | ||
280 | |||
281 | static unsigned int ref_freq; | ||
282 | static unsigned long loops_per_jiffy_ref; | ||
283 | static unsigned long tsc_khz_ref; | ||
284 | |||
285 | static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | ||
286 | void *data) | ||
287 | { | ||
288 | struct cpufreq_freqs *freq = data; | ||
289 | unsigned long *lpj, dummy; | ||
290 | |||
291 | if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) | ||
292 | return 0; | ||
293 | |||
294 | lpj = &dummy; | ||
295 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
296 | #ifdef CONFIG_SMP | ||
297 | lpj = &cpu_data(freq->cpu).loops_per_jiffy; | ||
298 | #else | ||
299 | lpj = &boot_cpu_data.loops_per_jiffy; | ||
300 | #endif | ||
301 | |||
302 | if (!ref_freq) { | ||
303 | ref_freq = freq->old; | ||
304 | loops_per_jiffy_ref = *lpj; | ||
305 | tsc_khz_ref = tsc_khz; | ||
306 | } | ||
307 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
308 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
309 | (val == CPUFREQ_RESUMECHANGE)) { | ||
310 | *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); | ||
311 | |||
312 | tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); | ||
313 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
314 | mark_tsc_unstable("cpufreq changes"); | ||
315 | } | ||
316 | |||
317 | set_cyc2ns_scale(tsc_khz_ref, freq->cpu); | ||
318 | |||
319 | return 0; | ||
320 | } | ||
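cpufreq_scale() used above is plain linear rescaling: a value calibrated at ref_freq is multiplied by new_freq/ref_freq. A hedged standalone sketch (this helper is a simplification, the kernel version splits the multiply to avoid overflow on 32-bit; the frequencies are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Rescale a frequency-proportional value from ref kHz to new kHz. */
static unsigned long scale_by_freq(unsigned long old, unsigned int ref_khz,
				   unsigned int new_khz)
{
	return (uint64_t)old * new_khz / ref_khz;
}

int main(void)
{
	unsigned int ref_freq = 2400000;	/* calibration frequency, kHz */
	unsigned long lpj_ref = 4800000;	/* loops_per_jiffy at ref_freq */

	/* CPU throttles to 1.6 GHz: both lpj and tsc_khz shrink by 2/3. */
	unsigned int new_freq = 1600000;
	printf("lpj: %lu -> %lu\n", lpj_ref,
	       scale_by_freq(lpj_ref, ref_freq, new_freq));
	printf("tsc_khz: %u -> %lu\n", ref_freq,
	       scale_by_freq(ref_freq, ref_freq, new_freq));
	return 0;
}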
321 | |||
322 | static struct notifier_block time_cpufreq_notifier_block = { | ||
323 | .notifier_call = time_cpufreq_notifier | ||
324 | }; | ||
325 | |||
326 | static int __init cpufreq_tsc(void) | ||
327 | { | ||
328 | cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
329 | CPUFREQ_TRANSITION_NOTIFIER); | ||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | core_initcall(cpufreq_tsc); | ||
334 | |||
335 | #endif /* CONFIG_CPU_FREQ */ | ||
336 | |||
337 | /* clocksource code */ | ||
338 | |||
339 | static struct clocksource clocksource_tsc; | ||
340 | |||
341 | /* | ||
342 | * We compare the TSC to the cycle_last value in the clocksource | ||
343 | * structure to avoid a nasty time-warp. This can be observed in a | ||
344 | * very small window right after one CPU updated cycle_last under | ||
345 | * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which | ||
346 | * is smaller than the cycle_last reference value due to a TSC which | ||
347 | * is slightly behind. This delta is nowhere else observable, but in | ||
348 | * that case it results in a forward time jump in the range of hours | ||
349 | * due to the unsigned delta calculation of the time keeping core | ||
350 | * code, which is necessary to support wrapping clocksources like pm | ||
351 | * timer. | ||
352 | */ | ||
353 | static cycle_t read_tsc(void) | ||
354 | { | ||
355 | cycle_t ret = (cycle_t)get_cycles(); | ||
356 | |||
357 | return ret >= clocksource_tsc.cycle_last ? | ||
358 | ret : clocksource_tsc.cycle_last; | ||
359 | } | ||
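To see why the clamp is needed: the timekeeping core computes (now - cycle_last) in unsigned arithmetic, so a TSC read that lands just below a freshly updated cycle_last wraps into an enormous positive delta instead of a small negative one. A tiny demo with invented cycle values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cycle_last = 1000000;	/* just written by another CPU */
	uint64_t tsc_now    =  999900;	/* this CPU's TSC is 100 cycles behind */

	uint64_t raw_delta = tsc_now - cycle_last;	/* wraps! */
	uint64_t clamped   = tsc_now >= cycle_last ? tsc_now : cycle_last;

	printf("raw delta:     %llu cycles (wrapped; a huge bogus forward step)\n",
	       (unsigned long long)raw_delta);
	printf("clamped delta: %llu cycles\n",
	       (unsigned long long)(clamped - cycle_last));
	return 0;
}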
360 | |||
361 | #ifdef CONFIG_X86_64 | ||
362 | static cycle_t __vsyscall_fn vread_tsc(void) | ||
363 | { | ||
364 | cycle_t ret = (cycle_t)vget_cycles(); | ||
365 | |||
366 | return ret >= __vsyscall_gtod_data.clock.cycle_last ? | ||
367 | ret : __vsyscall_gtod_data.clock.cycle_last; | ||
368 | } | ||
369 | #endif | ||
370 | |||
371 | static struct clocksource clocksource_tsc = { | ||
372 | .name = "tsc", | ||
373 | .rating = 300, | ||
374 | .read = read_tsc, | ||
375 | .mask = CLOCKSOURCE_MASK(64), | ||
376 | .shift = 22, | ||
377 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | ||
378 | CLOCK_SOURCE_MUST_VERIFY, | ||
379 | #ifdef CONFIG_X86_64 | ||
380 | .vread = vread_tsc, | ||
381 | #endif | ||
382 | }; | ||
383 | |||
384 | void mark_tsc_unstable(char *reason) | ||
385 | { | ||
386 | if (!tsc_unstable) { | ||
387 | tsc_unstable = 1; | ||
388 | printk("Marking TSC unstable due to %s\n", reason); | ||
389 | /* Change only the rating, when not registered */ | ||
390 | if (clocksource_tsc.mult) | ||
391 | clocksource_change_rating(&clocksource_tsc, 0); | ||
392 | else | ||
393 | clocksource_tsc.rating = 0; | ||
394 | } | ||
395 | } | ||
396 | |||
397 | EXPORT_SYMBOL_GPL(mark_tsc_unstable); | ||
398 | |||
399 | static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) | ||
400 | { | ||
401 | printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", | ||
402 | d->ident); | ||
403 | tsc_unstable = 1; | ||
404 | return 0; | ||
405 | } | ||
406 | |||
407 | /* List of systems that have known TSC problems */ | ||
408 | static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { | ||
409 | { | ||
410 | .callback = dmi_mark_tsc_unstable, | ||
411 | .ident = "IBM Thinkpad 380XD", | ||
412 | .matches = { | ||
413 | DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), | ||
414 | DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), | ||
415 | }, | ||
416 | }, | ||
417 | {} | ||
418 | }; | ||
419 | |||
420 | /* | ||
421 | * Geode_LX - the OLPC CPU possibly has a very reliable TSC | ||
422 | */ | ||
423 | #ifdef CONFIG_MGEODE_LX | ||
424 | /* RTSC counts during suspend */ | ||
425 | #define RTSC_SUSP 0x100 | ||
426 | |||
427 | static void __init check_geode_tsc_reliable(void) | ||
428 | { | ||
429 | unsigned long res_low, res_high; | ||
430 | |||
431 | rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); | ||
432 | if (res_low & RTSC_SUSP) | ||
433 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; | ||
434 | } | ||
435 | #else | ||
436 | static inline void check_geode_tsc_reliable(void) { } | ||
437 | #endif | ||
438 | |||
439 | /* | ||
440 | * Make an educated guess if the TSC is trustworthy and synchronized | ||
441 | * over all CPUs. | ||
442 | */ | ||
443 | __cpuinit int unsynchronized_tsc(void) | ||
444 | { | ||
445 | if (!cpu_has_tsc || tsc_unstable) | ||
446 | return 1; | ||
447 | |||
448 | #ifdef CONFIG_SMP | ||
449 | if (apic_is_clustered_box()) | ||
450 | return 1; | ||
451 | #endif | ||
452 | |||
453 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | ||
454 | return 0; | ||
455 | /* | ||
456 | * Intel systems are normally all synchronized. | ||
457 | * Exceptions must mark TSC as unstable: | ||
458 | */ | ||
459 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { | ||
460 | /* assume multi socket systems are not synchronized: */ | ||
461 | if (num_possible_cpus() > 1) | ||
462 | tsc_unstable = 1; | ||
463 | } | ||
464 | |||
465 | return tsc_unstable; | ||
466 | } | ||
467 | |||
468 | static void __init init_tsc_clocksource(void) | ||
469 | { | ||
470 | clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, | ||
471 | clocksource_tsc.shift); | ||
472 | /* lower the rating if we already know it's unstable: */ | ||
473 | if (check_tsc_unstable()) { | ||
474 | clocksource_tsc.rating = 0; | ||
475 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; | ||
476 | } | ||
477 | clocksource_register(&clocksource_tsc); | ||
478 | } | ||
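clocksource_khz2mult() picks mult so that ns = (cycles * mult) >> shift. With the fixed shift of 22 used in the struct above, that works out to mult = (10^6 << 22) / tsc_khz, rounded. A quick numeric check (the frequency is illustrative; the khz/2 addend mirrors the helper's rounding):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t shift = 22;
	uint64_t khz = 2400000;	/* illustrative */

	/* mult = (1e6 << shift) / khz: ns per cycle in 2^-22 units. */
	uint64_t mult = ((1000000ULL << shift) + khz / 2) / khz;

	/* Convert one second's worth of cycles back to nanoseconds: */
	uint64_t cycles = khz * 1000;
	uint64_t ns = (cycles * mult) >> shift;

	printf("mult=%llu, %llu cycles -> %llu ns (expect ~1e9)\n",
	       (unsigned long long)mult, (unsigned long long)cycles,
	       (unsigned long long)ns);
	return 0;
}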
479 | |||
480 | void __init tsc_init(void) | ||
481 | { | ||
482 | u64 lpj; | ||
483 | int cpu; | ||
484 | |||
485 | if (!cpu_has_tsc) | ||
486 | return; | ||
487 | |||
488 | tsc_khz = calibrate_tsc(); | ||
489 | cpu_khz = tsc_khz; | ||
490 | |||
491 | if (!tsc_khz) { | ||
492 | mark_tsc_unstable("could not calculate TSC khz"); | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | #ifdef CONFIG_X86_64 | ||
497 | if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && | ||
498 | (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) | ||
499 | cpu_khz = calibrate_cpu(); | ||
500 | #endif | ||
501 | |||
502 | lpj = ((u64)tsc_khz * 1000); | ||
503 | do_div(lpj, HZ); | ||
504 | lpj_fine = lpj; | ||
505 | |||
506 | printk("Detected %lu.%03lu MHz processor.\n", | ||
507 | (unsigned long)cpu_khz / 1000, | ||
508 | (unsigned long)cpu_khz % 1000); | ||
509 | |||
510 | /* | ||
511 | * Secondary CPUs do not run through tsc_init(), so set up | ||
512 | * all the scale factors for all CPUs, assuming the same | ||
513 | * speed as the bootup CPU. (cpufreq notifiers will fix this | ||
514 | * up if their speed diverges) | ||
515 | */ | ||
516 | for_each_possible_cpu(cpu) | ||
517 | set_cyc2ns_scale(cpu_khz, cpu); | ||
518 | |||
519 | if (tsc_disabled > 0) | ||
520 | return; | ||
521 | |||
522 | /* now allow native_sched_clock() to use rdtsc */ | ||
523 | tsc_disabled = 0; | ||
524 | |||
525 | use_tsc_delay(); | ||
526 | /* Check and install the TSC clocksource */ | ||
527 | dmi_check_system(bad_tsc_dmi_table); | ||
528 | |||
529 | if (unsynchronized_tsc()) | ||
530 | mark_tsc_unstable("TSCs unsynchronized"); | ||
531 | |||
532 | check_geode_tsc_reliable(); | ||
533 | init_tsc_clocksource(); | ||
534 | } | ||
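The lpj_fine value seeded above lets the boot code skip the usual delay-loop calibration: loops per jiffy is simply TSC cycles per second divided by HZ. Worked numbers under an assumed HZ (CONFIG_HZ varies by config):

#include <stdint.h>
#include <stdio.h>

#define HZ 250	/* assumed; depends on CONFIG_HZ */

int main(void)
{
	uint64_t tsc_khz = 2400000;		/* 2.4 GHz */
	uint64_t lpj = tsc_khz * 1000 / HZ;	/* cycles per tick */

	printf("lpj_fine = %llu\n", (unsigned long long)lpj);	/* 9600000 */
	return 0;
}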
535 | |||
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c deleted file mode 100644 index 068759db63dd..000000000000 --- a/arch/x86/kernel/tsc_32.c +++ /dev/null | |||
@@ -1,453 +0,0 @@ | |||
1 | #include <linux/sched.h> | ||
2 | #include <linux/clocksource.h> | ||
3 | #include <linux/workqueue.h> | ||
4 | #include <linux/cpufreq.h> | ||
5 | #include <linux/jiffies.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/dmi.h> | ||
8 | #include <linux/percpu.h> | ||
9 | |||
10 | #include <asm/delay.h> | ||
11 | #include <asm/tsc.h> | ||
12 | #include <asm/io.h> | ||
13 | #include <asm/timer.h> | ||
14 | |||
15 | #include "mach_timer.h" | ||
16 | |||
17 | static int tsc_disabled; | ||
18 | |||
19 | /* | ||
20 | * On some systems the TSC frequency does not | ||
21 | * change with the cpu frequency. So we need | ||
22 | * an extra value to store the TSC freq | ||
23 | */ | ||
24 | unsigned int tsc_khz; | ||
25 | EXPORT_SYMBOL_GPL(tsc_khz); | ||
26 | |||
27 | #ifdef CONFIG_X86_TSC | ||
28 | static int __init tsc_setup(char *str) | ||
29 | { | ||
30 | printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " | ||
31 | "cannot disable TSC completely.\n"); | ||
32 | tsc_disabled = 1; | ||
33 | return 1; | ||
34 | } | ||
35 | #else | ||
36 | /* | ||
37 | * disable flag for tsc. Takes effect by clearing the TSC cpu flag | ||
38 | * in cpu/common.c | ||
39 | */ | ||
40 | static int __init tsc_setup(char *str) | ||
41 | { | ||
42 | setup_clear_cpu_cap(X86_FEATURE_TSC); | ||
43 | return 1; | ||
44 | } | ||
45 | #endif | ||
46 | |||
47 | __setup("notsc", tsc_setup); | ||
48 | |||
49 | /* | ||
50 | * code to mark and check if the TSC is unstable | ||
51 | * due to cpufreq or due to unsynced TSCs | ||
52 | */ | ||
53 | static int tsc_unstable; | ||
54 | |||
55 | int check_tsc_unstable(void) | ||
56 | { | ||
57 | return tsc_unstable; | ||
58 | } | ||
59 | EXPORT_SYMBOL_GPL(check_tsc_unstable); | ||
60 | |||
61 | /* Accelerators for sched_clock() | ||
62 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
63 | * basic equation: | ||
64 | * ns = cycles / (freq / ns_per_sec) | ||
65 | * ns = cycles * (ns_per_sec / freq) | ||
66 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
67 | * ns = cycles * (10^6 / cpu_khz) | ||
68 | * | ||
69 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
70 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
71 | * ns = cycles * cyc2ns_scale / SC | ||
72 | * | ||
73 | * And since SC is a constant power of two, we can convert the div | ||
74 | * into a shift. | ||
75 | * | ||
76 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
77 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
78 | * (mathieu.desnoyers@polymtl.ca) | ||
79 | * | ||
80 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
81 | */ | ||
82 | |||
83 | DEFINE_PER_CPU(unsigned long, cyc2ns); | ||
84 | |||
85 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | ||
86 | { | ||
87 | unsigned long long tsc_now, ns_now; | ||
88 | unsigned long flags, *scale; | ||
89 | |||
90 | local_irq_save(flags); | ||
91 | sched_clock_idle_sleep_event(); | ||
92 | |||
93 | scale = &per_cpu(cyc2ns, cpu); | ||
94 | |||
95 | rdtscll(tsc_now); | ||
96 | ns_now = __cycles_2_ns(tsc_now); | ||
97 | |||
98 | if (cpu_khz) | ||
99 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; | ||
100 | |||
101 | /* | ||
102 | * Start smoothly with the new frequency: | ||
103 | */ | ||
104 | sched_clock_idle_wakeup_event(0); | ||
105 | local_irq_restore(flags); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Scheduler clock - returns current time in nanosec units. | ||
110 | */ | ||
111 | unsigned long long native_sched_clock(void) | ||
112 | { | ||
113 | unsigned long long this_offset; | ||
114 | |||
115 | /* | ||
116 | * Fall back to jiffies if there's no TSC available: | ||
117 | * ( But note that we still use it if the TSC is marked | ||
118 | * unstable. We do this because unlike Time Of Day, | ||
119 | * the scheduler clock tolerates small errors and it's | ||
120 | * very important for it to be as fast as the platform | ||
121 | * can achieve it. ) | ||
122 | */ | ||
123 | if (unlikely(tsc_disabled)) | ||
124 | /* No locking but a rare wrong value is not a big deal: */ | ||
125 | return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); | ||
126 | |||
127 | /* read the Time Stamp Counter: */ | ||
128 | rdtscll(this_offset); | ||
129 | |||
130 | /* return the value in ns */ | ||
131 | return cycles_2_ns(this_offset); | ||
132 | } | ||
133 | |||
134 | /* We need to define a real function for sched_clock, to override the | ||
135 | weak default version */ | ||
136 | #ifdef CONFIG_PARAVIRT | ||
137 | unsigned long long sched_clock(void) | ||
138 | { | ||
139 | return paravirt_sched_clock(); | ||
140 | } | ||
141 | #else | ||
142 | unsigned long long sched_clock(void) | ||
143 | __attribute__((alias("native_sched_clock"))); | ||
144 | #endif | ||
145 | |||
146 | unsigned long native_calculate_cpu_khz(void) | ||
147 | { | ||
148 | unsigned long long start, end; | ||
149 | unsigned long count; | ||
150 | u64 delta64 = (u64)ULLONG_MAX; | ||
151 | int i; | ||
152 | unsigned long flags; | ||
153 | |||
154 | local_irq_save(flags); | ||
155 | |||
156 | /* run 3 times to ensure the cache is warm and to get an accurate reading */ | ||
157 | for (i = 0; i < 3; i++) { | ||
158 | mach_prepare_counter(); | ||
159 | rdtscll(start); | ||
160 | mach_countup(&count); | ||
161 | rdtscll(end); | ||
162 | |||
163 | /* | ||
164 | * Error: ECTCNEVERSET | ||
165 | * The CTC wasn't reliable: we got a hit on the very first read, | ||
166 | * or the CPU was so fast/slow that the quotient wouldn't fit in | ||
167 | * 32 bits. | ||
168 | */ | ||
169 | if (count <= 1) | ||
170 | continue; | ||
171 | |||
172 | /* cpu freq too slow: */ | ||
173 | if ((end - start) <= CALIBRATE_TIME_MSEC) | ||
174 | continue; | ||
175 | |||
176 | /* | ||
177 | * We want the minimum time of all runs in case one of them | ||
178 | * is inaccurate due to SMI or other delay | ||
179 | */ | ||
180 | delta64 = min(delta64, (end - start)); | ||
181 | } | ||
182 | |||
183 | /* cpu freq too fast (or every run was bad): */ | ||
184 | if (delta64 > (1ULL<<32)) | ||
185 | goto err; | ||
186 | |||
187 | delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ | ||
188 | do_div(delta64, CALIBRATE_TIME_MSEC); | ||
189 | |||
190 | local_irq_restore(flags); | ||
191 | return (unsigned long)delta64; | ||
192 | err: | ||
193 | local_irq_restore(flags); | ||
194 | return 0; | ||
195 | } | ||
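The deleted 32-bit calibrator takes a different tack from the new unified one: it repeats the PIT measurement three times and keeps the minimum delta, on the theory that disturbances (SMIs, cache misses) can only inflate a sample, never shrink it. A sketch of that best-of-N pattern with stubbed measurements (the sample values are invented, and the 50 ms gate is borrowed from the new calibrator; the 32-bit code's CALIBRATE_TIME_MSEC may differ):

#include <stdint.h>
#include <stdio.h>

static uint64_t measure_stub(int run)
{
	/* Invented samples: run 1 was disturbed by an SMI. */
	static const uint64_t samples[] = { 120000400, 125000000, 120000100 };
	return samples[run];
}

int main(void)
{
	uint64_t best = UINT64_MAX;
	int i;

	/* Keep the minimum: disturbances only ever add cycles. */
	for (i = 0; i < 3; i++) {
		uint64_t d = measure_stub(i);
		if (d < best)
			best = d;
	}
	printf("best delta: %llu cycles over 50 ms -> %llu kHz\n",
	       (unsigned long long)best, (unsigned long long)(best / 50));
	return 0;
}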
196 | |||
197 | int recalibrate_cpu_khz(void) | ||
198 | { | ||
199 | #ifndef CONFIG_SMP | ||
200 | unsigned long cpu_khz_old = cpu_khz; | ||
201 | |||
202 | if (cpu_has_tsc) { | ||
203 | cpu_khz = calculate_cpu_khz(); | ||
204 | tsc_khz = cpu_khz; | ||
205 | cpu_data(0).loops_per_jiffy = | ||
206 | cpufreq_scale(cpu_data(0).loops_per_jiffy, | ||
207 | cpu_khz_old, cpu_khz); | ||
208 | return 0; | ||
209 | } else | ||
210 | return -ENODEV; | ||
211 | #else | ||
212 | return -ENODEV; | ||
213 | #endif | ||
214 | } | ||
215 | |||
216 | EXPORT_SYMBOL(recalibrate_cpu_khz); | ||
217 | |||
218 | #ifdef CONFIG_CPU_FREQ | ||
219 | |||
220 | /* | ||
221 | * if the CPU frequency is scaled, TSC-based delays will need a different | ||
222 | * loops_per_jiffy value to function properly. | ||
223 | */ | ||
224 | static unsigned int ref_freq; | ||
225 | static unsigned long loops_per_jiffy_ref; | ||
226 | static unsigned long cpu_khz_ref; | ||
227 | |||
228 | static int | ||
229 | time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) | ||
230 | { | ||
231 | struct cpufreq_freqs *freq = data; | ||
232 | |||
233 | if (!ref_freq) { | ||
234 | if (!freq->old) { | ||
235 | ref_freq = freq->new; | ||
236 | return 0; | ||
237 | } | ||
238 | ref_freq = freq->old; | ||
239 | loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy; | ||
240 | cpu_khz_ref = cpu_khz; | ||
241 | } | ||
242 | |||
243 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
244 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
245 | (val == CPUFREQ_RESUMECHANGE)) { | ||
246 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
247 | cpu_data(freq->cpu).loops_per_jiffy = | ||
248 | cpufreq_scale(loops_per_jiffy_ref, | ||
249 | ref_freq, freq->new); | ||
250 | |||
251 | if (cpu_khz) { | ||
252 | |||
253 | if (num_online_cpus() == 1) | ||
254 | cpu_khz = cpufreq_scale(cpu_khz_ref, | ||
255 | ref_freq, freq->new); | ||
256 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { | ||
257 | tsc_khz = cpu_khz; | ||
258 | set_cyc2ns_scale(cpu_khz, freq->cpu); | ||
259 | /* | ||
260 | * TSC based sched_clock turns | ||
261 | * to junk w/ cpufreq | ||
262 | */ | ||
263 | mark_tsc_unstable("cpufreq changes"); | ||
264 | } | ||
265 | } | ||
266 | } | ||
267 | |||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | static struct notifier_block time_cpufreq_notifier_block = { | ||
272 | .notifier_call = time_cpufreq_notifier | ||
273 | }; | ||
274 | |||
275 | static int __init cpufreq_tsc(void) | ||
276 | { | ||
277 | return cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
278 | CPUFREQ_TRANSITION_NOTIFIER); | ||
279 | } | ||
280 | core_initcall(cpufreq_tsc); | ||
281 | |||
282 | #endif | ||
283 | |||
284 | /* clock source code */ | ||
285 | |||
286 | static unsigned long current_tsc_khz; | ||
287 | static struct clocksource clocksource_tsc; | ||
288 | |||
289 | /* | ||
290 | * We compare the TSC to the cycle_last value in the clocksource | ||
291 | * structure to avoid a nasty time-warp issue. This can be observed in | ||
292 | * a very small window right after one CPU updated cycle_last under | ||
293 | * xtime lock and the other CPU reads a TSC value which is smaller | ||
295 | * than the cycle_last reference value due to a TSC which is slightly | ||
295 | * behind. This delta is nowhere else observable, but in that case it | ||
296 | * results in a forward time jump in the range of hours due to the | ||
297 | * unsigned delta calculation of the time keeping core code, which is | ||
298 | * necessary to support wrapping clocksources like pm timer. | ||
299 | */ | ||
300 | static cycle_t read_tsc(void) | ||
301 | { | ||
302 | cycle_t ret; | ||
303 | |||
304 | rdtscll(ret); | ||
305 | |||
306 | return ret >= clocksource_tsc.cycle_last ? | ||
307 | ret : clocksource_tsc.cycle_last; | ||
308 | } | ||
309 | |||
310 | static struct clocksource clocksource_tsc = { | ||
311 | .name = "tsc", | ||
312 | .rating = 300, | ||
313 | .read = read_tsc, | ||
314 | .mask = CLOCKSOURCE_MASK(64), | ||
315 | .mult = 0, /* to be set */ | ||
316 | .shift = 22, | ||
317 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | ||
318 | CLOCK_SOURCE_MUST_VERIFY, | ||
319 | }; | ||
320 | |||
321 | void mark_tsc_unstable(char *reason) | ||
322 | { | ||
323 | if (!tsc_unstable) { | ||
324 | tsc_unstable = 1; | ||
325 | printk("Marking TSC unstable due to: %s.\n", reason); | ||
326 | /* Can be called before registration */ | ||
327 | if (clocksource_tsc.mult) | ||
328 | clocksource_change_rating(&clocksource_tsc, 0); | ||
329 | else | ||
330 | clocksource_tsc.rating = 0; | ||
331 | } | ||
332 | } | ||
333 | EXPORT_SYMBOL_GPL(mark_tsc_unstable); | ||
334 | |||
335 | static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) | ||
336 | { | ||
337 | printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", | ||
338 | d->ident); | ||
339 | tsc_unstable = 1; | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | /* List of systems that have known TSC problems */ | ||
344 | static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { | ||
345 | { | ||
346 | .callback = dmi_mark_tsc_unstable, | ||
347 | .ident = "IBM Thinkpad 380XD", | ||
348 | .matches = { | ||
349 | DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), | ||
350 | DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), | ||
351 | }, | ||
352 | }, | ||
353 | {} | ||
354 | }; | ||
355 | |||
356 | /* | ||
357 | * Make an educated guess if the TSC is trustworthy and synchronized | ||
358 | * over all CPUs. | ||
359 | */ | ||
360 | __cpuinit int unsynchronized_tsc(void) | ||
361 | { | ||
362 | if (!cpu_has_tsc || tsc_unstable) | ||
363 | return 1; | ||
364 | |||
365 | /* Anything with constant TSC should be synchronized */ | ||
366 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | ||
367 | return 0; | ||
368 | |||
369 | /* | ||
370 | * Intel systems are normally all synchronized. | ||
371 | * Exceptions must mark TSC as unstable: | ||
372 | */ | ||
373 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { | ||
374 | /* assume multi socket systems are not synchronized: */ | ||
375 | if (num_possible_cpus() > 1) | ||
376 | tsc_unstable = 1; | ||
377 | } | ||
378 | return tsc_unstable; | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * Geode_LX - the OLPC CPU possibly has a very reliable TSC | ||
383 | */ | ||
384 | #ifdef CONFIG_MGEODE_LX | ||
385 | /* RTSC counts during suspend */ | ||
386 | #define RTSC_SUSP 0x100 | ||
387 | |||
388 | static void __init check_geode_tsc_reliable(void) | ||
389 | { | ||
390 | unsigned long res_low, res_high; | ||
391 | |||
392 | rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); | ||
393 | if (res_low & RTSC_SUSP) | ||
394 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; | ||
395 | } | ||
396 | #else | ||
397 | static inline void check_geode_tsc_reliable(void) { } | ||
398 | #endif | ||
399 | |||
400 | |||
401 | void __init tsc_init(void) | ||
402 | { | ||
403 | int cpu; | ||
404 | |||
405 | if (!cpu_has_tsc || tsc_disabled) { | ||
406 | /* Disable the TSC in case of !cpu_has_tsc */ | ||
407 | tsc_disabled = 1; | ||
408 | return; | ||
409 | } | ||
410 | |||
411 | cpu_khz = calculate_cpu_khz(); | ||
412 | tsc_khz = cpu_khz; | ||
413 | |||
414 | if (!cpu_khz) { | ||
415 | mark_tsc_unstable("could not calculate TSC khz"); | ||
416 | /* | ||
417 | * We need to disable the TSC completely in this case | ||
418 | * to prevent sched_clock() from using it. | ||
419 | */ | ||
420 | tsc_disabled = 1; | ||
421 | return; | ||
422 | } | ||
423 | |||
424 | printk("Detected %lu.%03lu MHz processor.\n", | ||
425 | (unsigned long)cpu_khz / 1000, | ||
426 | (unsigned long)cpu_khz % 1000); | ||
427 | |||
428 | /* | ||
429 | * Secondary CPUs do not run through tsc_init(), so set up | ||
430 | * all the scale factors for all CPUs, assuming the same | ||
431 | * speed as the bootup CPU. (cpufreq notifiers will fix this | ||
432 | * up if their speed diverges) | ||
433 | */ | ||
434 | for_each_possible_cpu(cpu) | ||
435 | set_cyc2ns_scale(cpu_khz, cpu); | ||
436 | |||
437 | use_tsc_delay(); | ||
438 | |||
439 | /* Check and install the TSC clocksource */ | ||
440 | dmi_check_system(bad_tsc_dmi_table); | ||
441 | |||
442 | unsynchronized_tsc(); | ||
443 | check_geode_tsc_reliable(); | ||
444 | current_tsc_khz = tsc_khz; | ||
445 | clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, | ||
446 | clocksource_tsc.shift); | ||
447 | /* lower the rating if we already know it's unstable: */ | ||
448 | if (check_tsc_unstable()) { | ||
449 | clocksource_tsc.rating = 0; | ||
450 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; | ||
451 | } | ||
452 | clocksource_register(&clocksource_tsc); | ||
453 | } | ||
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c deleted file mode 100644 index 1784b8077a12..000000000000 --- a/arch/x86/kernel/tsc_64.c +++ /dev/null | |||
@@ -1,357 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/interrupt.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/clocksource.h> | ||
6 | #include <linux/time.h> | ||
7 | #include <linux/acpi.h> | ||
8 | #include <linux/cpufreq.h> | ||
9 | #include <linux/acpi_pmtmr.h> | ||
10 | |||
11 | #include <asm/hpet.h> | ||
12 | #include <asm/timex.h> | ||
13 | #include <asm/timer.h> | ||
14 | #include <asm/vgtod.h> | ||
15 | |||
16 | static int notsc __initdata = 0; | ||
17 | |||
18 | unsigned int cpu_khz; /* TSC clocks / usec, not used here */ | ||
19 | EXPORT_SYMBOL(cpu_khz); | ||
20 | unsigned int tsc_khz; | ||
21 | EXPORT_SYMBOL(tsc_khz); | ||
22 | |||
23 | /* Accelerators for sched_clock() | ||
24 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
25 | * basic equation: | ||
26 | * ns = cycles / (freq / ns_per_sec) | ||
27 | * ns = cycles * (ns_per_sec / freq) | ||
28 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
29 | * ns = cycles * (10^6 / cpu_khz) | ||
30 | * | ||
31 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
32 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
33 | * ns = cycles * cyc2ns_scale / SC | ||
34 | * | ||
35 | * And since SC is a constant power of two, we can convert the div | ||
36 | * into a shift. | ||
37 | * | ||
38 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
39 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
40 | * (mathieu.desnoyers@polymtl.ca) | ||
41 | * | ||
42 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
43 | */ | ||
44 | DEFINE_PER_CPU(unsigned long, cyc2ns); | ||
45 | |||
46 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | ||
47 | { | ||
48 | unsigned long long tsc_now, ns_now; | ||
49 | unsigned long flags, *scale; | ||
50 | |||
51 | local_irq_save(flags); | ||
52 | sched_clock_idle_sleep_event(); | ||
53 | |||
54 | scale = &per_cpu(cyc2ns, cpu); | ||
55 | |||
56 | rdtscll(tsc_now); | ||
57 | ns_now = __cycles_2_ns(tsc_now); | ||
58 | |||
59 | if (cpu_khz) | ||
60 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; | ||
61 | |||
62 | sched_clock_idle_wakeup_event(0); | ||
63 | local_irq_restore(flags); | ||
64 | } | ||
65 | |||
66 | unsigned long long native_sched_clock(void) | ||
67 | { | ||
68 | unsigned long a = 0; | ||
69 | |||
70 | /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, | ||
71 | * which means it is not completely exact and may not be monotonic | ||
72 | * between CPUs. But the errors should be too small to matter for | ||
73 | * scheduling purposes. | ||
74 | */ | ||
75 | |||
76 | rdtscll(a); | ||
77 | return cycles_2_ns(a); | ||
78 | } | ||
79 | |||
80 | /* We need to define a real function for sched_clock, to override the | ||
81 | weak default version */ | ||
82 | #ifdef CONFIG_PARAVIRT | ||
83 | unsigned long long sched_clock(void) | ||
84 | { | ||
85 | return paravirt_sched_clock(); | ||
86 | } | ||
87 | #else | ||
88 | unsigned long long | ||
89 | sched_clock(void) __attribute__((alias("native_sched_clock"))); | ||
90 | #endif | ||
91 | |||
92 | |||
93 | static int tsc_unstable; | ||
94 | |||
95 | int check_tsc_unstable(void) | ||
96 | { | ||
97 | return tsc_unstable; | ||
98 | } | ||
99 | EXPORT_SYMBOL_GPL(check_tsc_unstable); | ||
100 | |||
101 | #ifdef CONFIG_CPU_FREQ | ||
102 | |||
103 | /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency | ||
104 | * changes. | ||
105 | * | ||
106 | * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's | ||
107 | * not that important because current Opteron setups do not support | ||
108 | * scaling on SMP anyway. | ||
109 | * | ||
110 | * Should fix up last_tsc too. Currently gettimeofday in the | ||
111 | * first tick after the change will be slightly wrong. | ||
112 | */ | ||
113 | |||
114 | static unsigned int ref_freq; | ||
115 | static unsigned long loops_per_jiffy_ref; | ||
116 | static unsigned long tsc_khz_ref; | ||
117 | |||
118 | static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | ||
119 | void *data) | ||
120 | { | ||
121 | struct cpufreq_freqs *freq = data; | ||
122 | unsigned long *lpj, dummy; | ||
123 | |||
124 | if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) | ||
125 | return 0; | ||
126 | |||
127 | lpj = &dummy; | ||
128 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
129 | #ifdef CONFIG_SMP | ||
130 | lpj = &cpu_data(freq->cpu).loops_per_jiffy; | ||
131 | #else | ||
132 | lpj = &boot_cpu_data.loops_per_jiffy; | ||
133 | #endif | ||
134 | |||
135 | if (!ref_freq) { | ||
136 | ref_freq = freq->old; | ||
137 | loops_per_jiffy_ref = *lpj; | ||
138 | tsc_khz_ref = tsc_khz; | ||
139 | } | ||
140 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
141 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
142 | (val == CPUFREQ_RESUMECHANGE)) { | ||
143 | *lpj = | ||
144 | cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); | ||
145 | |||
146 | tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); | ||
147 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
148 | mark_tsc_unstable("cpufreq changes"); | ||
149 | } | ||
150 | |||
151 | set_cyc2ns_scale(tsc_khz_ref, freq->cpu); | ||
152 | |||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | static struct notifier_block time_cpufreq_notifier_block = { | ||
157 | .notifier_call = time_cpufreq_notifier | ||
158 | }; | ||
159 | |||
160 | static int __init cpufreq_tsc(void) | ||
161 | { | ||
162 | cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
163 | CPUFREQ_TRANSITION_NOTIFIER); | ||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | core_initcall(cpufreq_tsc); | ||
168 | |||
169 | #endif | ||
170 | |||
171 | #define MAX_RETRIES 5 | ||
172 | #define SMI_TRESHOLD 50000 | ||
173 | |||
174 | /* | ||
175 | * Read TSC and the reference counters. Take care of SMI disturbance | ||
176 | */ | ||
177 | static unsigned long __init tsc_read_refs(unsigned long *pm, | ||
178 | unsigned long *hpet) | ||
179 | { | ||
180 | unsigned long t1, t2; | ||
181 | int i; | ||
182 | |||
183 | for (i = 0; i < MAX_RETRIES; i++) { | ||
184 | t1 = get_cycles(); | ||
185 | if (hpet) | ||
186 | *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; | ||
187 | else | ||
188 | *pm = acpi_pm_read_early(); | ||
189 | t2 = get_cycles(); | ||
190 | if ((t2 - t1) < SMI_TRESHOLD) | ||
191 | return t2; | ||
192 | } | ||
193 | return ULONG_MAX; | ||
194 | } | ||
195 | |||
196 | /** | ||
197 | * tsc_calibrate - calibrate the tsc on boot | ||
198 | */ | ||
199 | void __init tsc_calibrate(void) | ||
200 | { | ||
201 | unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; | ||
202 | int hpet = is_hpet_enabled(), cpu; | ||
203 | |||
204 | local_irq_save(flags); | ||
205 | |||
206 | tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); | ||
207 | |||
208 | outb((inb(0x61) & ~0x02) | 0x01, 0x61); | ||
209 | |||
210 | outb(0xb0, 0x43); | ||
211 | outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); | ||
212 | outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); | ||
213 | tr1 = get_cycles(); | ||
214 | while ((inb(0x61) & 0x20) == 0); | ||
215 | tr2 = get_cycles(); | ||
216 | |||
217 | tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); | ||
218 | |||
219 | local_irq_restore(flags); | ||
220 | |||
221 | /* | ||
222 | * Preset the result with the raw and inaccurate PIT | ||
223 | * calibration value | ||
224 | */ | ||
225 | tsc_khz = (tr2 - tr1) / 50; | ||
226 | |||
227 | /* hpet or pmtimer available? */ | ||
228 | if (!hpet && !pm1 && !pm2) { | ||
229 | printk(KERN_INFO "TSC calibrated against PIT\n"); | ||
230 | goto out; | ||
231 | } | ||
232 | |||
233 | /* Check whether the sampling was disturbed by an SMI */ | ||
234 | if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { | ||
235 | printk(KERN_WARNING "TSC calibration disturbed by SMI, " | ||
236 | "using PIT calibration result\n"); | ||
237 | goto out; | ||
238 | } | ||
239 | |||
240 | tsc2 = (tsc2 - tsc1) * 1000000L; | ||
241 | |||
242 | if (hpet) { | ||
243 | printk(KERN_INFO "TSC calibrated against HPET\n"); | ||
244 | if (hpet2 < hpet1) | ||
245 | hpet2 += 0x100000000; | ||
246 | hpet2 -= hpet1; | ||
247 | tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; | ||
248 | } else { | ||
249 | printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); | ||
250 | if (pm2 < pm1) | ||
251 | pm2 += ACPI_PM_OVRRUN; | ||
252 | pm2 -= pm1; | ||
253 | tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC; | ||
254 | } | ||
255 | |||
256 | tsc_khz = tsc2 / tsc1; | ||
257 | |||
258 | out: | ||
259 | for_each_possible_cpu(cpu) | ||
260 | set_cyc2ns_scale(tsc_khz, cpu); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Make an educated guess if the TSC is trustworthy and synchronized | ||
265 | * over all CPUs. | ||
266 | */ | ||
267 | __cpuinit int unsynchronized_tsc(void) | ||
268 | { | ||
269 | if (tsc_unstable) | ||
270 | return 1; | ||
271 | |||
272 | #ifdef CONFIG_SMP | ||
273 | if (apic_is_clustered_box()) | ||
274 | return 1; | ||
275 | #endif | ||
276 | |||
277 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | ||
278 | return 0; | ||
279 | |||
280 | /* Assume multi socket systems are not synchronized */ | ||
281 | return num_present_cpus() > 1; | ||
282 | } | ||
283 | |||
284 | int __init notsc_setup(char *s) | ||
285 | { | ||
286 | notsc = 1; | ||
287 | return 1; | ||
288 | } | ||
289 | |||
290 | __setup("notsc", notsc_setup); | ||
291 | |||
292 | static struct clocksource clocksource_tsc; | ||
293 | |||
294 | /* | ||
295 | * We compare the TSC to the cycle_last value in the clocksource | ||
296 | * structure to avoid a nasty time-warp. This can be observed in a | ||
297 | * very small window right after one CPU updated cycle_last under | ||
298 | * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which | ||
299 | * is smaller than the cycle_last reference value due to a TSC which | ||
300 | * is slightly behind. This delta is nowhere else observable, but in | ||
301 | * that case it results in a forward time jump in the range of hours | ||
302 | * due to the unsigned delta calculation of the time keeping core | ||
303 | * code, which is necessary to support wrapping clocksources like pm | ||
304 | * timer. | ||
305 | */ | ||
306 | static cycle_t read_tsc(void) | ||
307 | { | ||
308 | cycle_t ret = (cycle_t)get_cycles(); | ||
309 | |||
310 | return ret >= clocksource_tsc.cycle_last ? | ||
311 | ret : clocksource_tsc.cycle_last; | ||
312 | } | ||
313 | |||
314 | static cycle_t __vsyscall_fn vread_tsc(void) | ||
315 | { | ||
316 | cycle_t ret = (cycle_t)vget_cycles(); | ||
317 | |||
318 | return ret >= __vsyscall_gtod_data.clock.cycle_last ? | ||
319 | ret : __vsyscall_gtod_data.clock.cycle_last; | ||
320 | } | ||
321 | |||
322 | static struct clocksource clocksource_tsc = { | ||
323 | .name = "tsc", | ||
324 | .rating = 300, | ||
325 | .read = read_tsc, | ||
326 | .mask = CLOCKSOURCE_MASK(64), | ||
327 | .shift = 22, | ||
328 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | ||
329 | CLOCK_SOURCE_MUST_VERIFY, | ||
330 | .vread = vread_tsc, | ||
331 | }; | ||
332 | |||
333 | void mark_tsc_unstable(char *reason) | ||
334 | { | ||
335 | if (!tsc_unstable) { | ||
336 | tsc_unstable = 1; | ||
337 | printk("Marking TSC unstable due to %s\n", reason); | ||
338 | /* Change only the rating, when not registered */ | ||
339 | if (clocksource_tsc.mult) | ||
340 | clocksource_change_rating(&clocksource_tsc, 0); | ||
341 | else | ||
342 | clocksource_tsc.rating = 0; | ||
343 | } | ||
344 | } | ||
345 | EXPORT_SYMBOL_GPL(mark_tsc_unstable); | ||
346 | |||
347 | void __init init_tsc_clocksource(void) | ||
348 | { | ||
349 | if (!notsc) { | ||
350 | clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, | ||
351 | clocksource_tsc.shift); | ||
352 | if (check_tsc_unstable()) | ||
353 | clocksource_tsc.rating = 0; | ||
354 | |||
355 | clocksource_register(&clocksource_tsc); | ||
356 | } | ||
357 | } | ||
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c new file mode 100644 index 000000000000..41e01b145c48 --- /dev/null +++ b/arch/x86/kernel/visws_quirks.c | |||
@@ -0,0 +1,707 @@ | |||
1 | /* | ||
2 | * SGI Visual Workstation support and quirks, unmaintained. | ||
3 | * | ||
4 | * Split out from setup.c by davej@suse.de | ||
5 | * | ||
6 | * Copyright (C) 1999 Bent Hagemark, Ingo Molnar | ||
7 | * | ||
8 | * SGI Visual Workstation interrupt controller | ||
9 | * | ||
10 | * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC | ||
11 | * which serves as the main interrupt controller in the system. Non-legacy | ||
12 | * hardware in the system uses this controller directly. Legacy devices | ||
13 | * are connected to the PIIX4, which in turn has its 8259(s) connected to | ||
14 | * one of the Cobalt APIC entries. | ||
15 | * | ||
16 | * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com | ||
17 | * | ||
18 | * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> | ||
19 | */ | ||
20 | #include <linux/kernel_stat.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/smp.h> | ||
27 | #include <linux/pci.h> | ||
28 | #include <linux/pci_ids.h> | ||
29 | |||
30 | #include <asm/visws/cobalt.h> | ||
31 | #include <asm/visws/lithium.h> | ||
32 | #include <asm/visws/piix4.h> | ||
33 | #include <asm/arch_hooks.h> | ||
34 | #include <asm/fixmap.h> | ||
35 | #include <asm/reboot.h> | ||
36 | #include <asm/setup.h> | ||
37 | #include <asm/e820.h> | ||
38 | #include <asm/smp.h> | ||
39 | #include <asm/io.h> | ||
40 | #include <asm/apic.h> | ||
41 | #include <asm/i8259.h> | ||
42 | #include <asm/irq_vectors.h> | ||
43 | |||
44 | #include <mach_ipi.h> | ||
45 | #include "mach_apic.h" | ||
46 | |||
47 | extern int no_broadcast; | ||
68 | char visws_board_type = -1; | ||
69 | char visws_board_rev = -1; | ||
70 | |||
71 | int is_visws_box(void) | ||
72 | { | ||
73 | return visws_board_type >= 0; | ||
74 | } | ||
75 | |||
76 | static int __init visws_time_init(void) | ||
77 | { | ||
78 | printk(KERN_INFO "Starting Cobalt Timer system clock\n"); | ||
79 | |||
80 | /* Set the countdown value */ | ||
81 | co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); | ||
82 | |||
83 | /* Start the timer */ | ||
84 | co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); | ||
85 | |||
86 | /* Enable (unmask) the timer interrupt */ | ||
87 | co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); | ||
88 | |||
89 | /* | ||
90 | * Zero return means the generic timer setup code will set up | ||
91 | * the standard vector: | ||
92 | */ | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static int __init visws_pre_intr_init(void) | ||
97 | { | ||
98 | init_VISWS_APIC_irqs(); | ||
99 | |||
100 | /* | ||
101 | * We don't want ISA irqs to be set up by the generic code: | ||
102 | */ | ||
103 | return 1; | ||
104 | } | ||
105 | |||
106 | /* Quirk for machine specific memory setup. */ | ||
107 | |||
108 | #define MB (1024 * 1024) | ||
109 | |||
110 | unsigned long sgivwfb_mem_phys; | ||
111 | unsigned long sgivwfb_mem_size; | ||
112 | EXPORT_SYMBOL(sgivwfb_mem_phys); | ||
113 | EXPORT_SYMBOL(sgivwfb_mem_size); | ||
114 | |||
115 | long long mem_size __initdata = 0; | ||
116 | |||
117 | static char * __init visws_memory_setup(void) | ||
118 | { | ||
119 | long long gfx_mem_size = 8 * MB; | ||
120 | |||
121 | mem_size = boot_params.alt_mem_k; | ||
122 | |||
123 | if (!mem_size) { | ||
124 | printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); | ||
125 | mem_size = 128 * MB; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * This hardcodes the graphics memory to 8 MB; it really | ||
130 | * should be sized dynamically (or at least set as a | ||
131 | * boot param). | ||
132 | */ | ||
133 | if (!sgivwfb_mem_size) { | ||
134 | printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); | ||
135 | sgivwfb_mem_size = 8 * MB; | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * Trim to nearest MB | ||
140 | */ | ||
141 | sgivwfb_mem_size &= ~((1 << 20) - 1); | ||
142 | sgivwfb_mem_phys = mem_size - gfx_mem_size; | ||
143 | |||
144 | e820_add_region(0, LOWMEMSIZE(), E820_RAM); | ||
145 | e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); | ||
146 | e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); | ||
147 | |||
148 | return "PROM"; | ||
149 | } | ||
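The e820 layout built above carves the framebuffer out of the top of RAM: conventional low memory first, then main RAM from the 1 MB mark up to (top - framebuffer), then the framebuffer itself marked reserved. A sketch of the layout arithmetic (the low-memory and 1 MB constants stand in for LOWMEMSIZE() and HIGH_MEMORY and are assumptions; the sizes are illustrative):

#include <stdio.h>

#define MB (1024UL * 1024UL)

int main(void)
{
	unsigned long mem_size = 128 * MB;	/* reported by the PROM */
	unsigned long fb_size  = 8 * MB;	/* framebuffer carve-out */

	unsigned long fb_phys = mem_size - fb_size;

	/* Assumed PC-style constants, for illustration only: */
	unsigned long low_mem  = 636 * 1024;	/* stand-in for LOWMEMSIZE() */
	unsigned long high_mem = 1024 * 1024;	/* stand-in for HIGH_MEMORY */

	printf("RAM:      0x%08lx - 0x%08lx\n", 0UL, low_mem);
	printf("RAM:      0x%08lx - 0x%08lx\n", high_mem, fb_phys);
	printf("RESERVED: 0x%08lx - 0x%08lx (framebuffer)\n",
	       fb_phys, fb_phys + fb_size);
	return 0;
}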
150 | |||
151 | static void visws_machine_emergency_restart(void) | ||
152 | { | ||
153 | /* | ||
154 | * Visual Workstations restart after this | ||
155 | * register is poked on the PIIX4 | ||
156 | */ | ||
157 | outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); | ||
158 | } | ||
159 | |||
160 | static void visws_machine_power_off(void) | ||
161 | { | ||
162 | unsigned short pm_status; | ||
163 | /* extern unsigned int pci_bus0; */ | ||
164 | |||
165 | while ((pm_status = inw(PMSTS_PORT)) & 0x100) | ||
166 | outw(pm_status, PMSTS_PORT); | ||
167 | |||
168 | outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); | ||
169 | |||
170 | mdelay(10); | ||
171 | |||
172 | #define PCI_CONF1_ADDRESS(bus, devfn, reg) \ | ||
173 | (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) | ||
174 | |||
175 | /* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ | ||
176 | outl(PIIX_SPECIAL_STOP, 0xCFC); | ||
177 | } | ||
178 | |||
179 | static int __init visws_get_smp_config(unsigned int early) | ||
180 | { | ||
181 | /* | ||
182 | * Prevent MP-table parsing by the generic code: | ||
183 | */ | ||
184 | return 1; | ||
185 | } | ||
186 | |||
187 | extern unsigned int __cpuinitdata maxcpus; | ||
188 | |||
189 | /* | ||
190 | * The Visual Workstation is Intel MP compliant in the hardware | ||
191 | * sense, but it doesn't have a BIOS(-configuration table). | ||
192 | * No problem for Linux. | ||
193 | */ | ||
194 | |||
195 | static void __init MP_processor_info(struct mpc_config_processor *m) | ||
196 | { | ||
197 | int ver, logical_apicid; | ||
198 | physid_mask_t apic_cpus; | ||
199 | |||
200 | if (!(m->mpc_cpuflag & CPU_ENABLED)) | ||
201 | return; | ||
202 | |||
203 | logical_apicid = m->mpc_apicid; | ||
204 | printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", | ||
205 | m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", | ||
206 | m->mpc_apicid, | ||
207 | (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, | ||
208 | (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, | ||
209 | m->mpc_apicver); | ||
210 | |||
211 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) | ||
212 | boot_cpu_physical_apicid = m->mpc_apicid; | ||
213 | |||
214 | ver = m->mpc_apicver; | ||
215 | if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) { | ||
216 | printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", | ||
217 | m->mpc_apicid, MAX_APICS); | ||
218 | return; | ||
219 | } | ||
220 | |||
221 | apic_cpus = apicid_to_cpu_present(m->mpc_apicid); | ||
222 | physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); | ||
223 | /* | ||
224 | * Validate version | ||
225 | */ | ||
226 | if (ver == 0x0) { | ||
227 | printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " | ||
228 | "fixing up to 0x10. (tell your hw vendor)\n", | ||
229 | m->mpc_apicid); | ||
230 | ver = 0x10; | ||
231 | } | ||
232 | apic_version[m->mpc_apicid] = ver; | ||
233 | } | ||
234 | |||
235 | static int __init visws_find_smp_config(unsigned int reserve) | ||
236 | { | ||
237 | struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); | ||
238 | unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); | ||
239 | |||
240 | if (ncpus > CO_CPU_MAX) { | ||
241 | printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", | ||
242 | ncpus, mp); | ||
243 | |||
244 | ncpus = CO_CPU_MAX; | ||
245 | } | ||
246 | |||
247 | if (ncpus > maxcpus) | ||
248 | ncpus = maxcpus; | ||
249 | |||
250 | #ifdef CONFIG_X86_LOCAL_APIC | ||
251 | smp_found_config = 1; | ||
252 | #endif | ||
253 | while (ncpus--) | ||
254 | MP_processor_info(mp++); | ||
255 | |||
256 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
257 | |||
258 | return 1; | ||
259 | } | ||
260 | |||
261 | static int visws_trap_init(void); | ||
262 | |||
263 | static struct x86_quirks visws_x86_quirks __initdata = { | ||
264 | .arch_time_init = visws_time_init, | ||
265 | .arch_pre_intr_init = visws_pre_intr_init, | ||
266 | .arch_memory_setup = visws_memory_setup, | ||
267 | .arch_intr_init = NULL, | ||
268 | .arch_trap_init = visws_trap_init, | ||
269 | .mach_get_smp_config = visws_get_smp_config, | ||
270 | .mach_find_smp_config = visws_find_smp_config, | ||
271 | }; | ||
272 | |||
273 | void __init visws_early_detect(void) | ||
274 | { | ||
275 | int raw; | ||
276 | |||
277 | visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) | ||
278 | >> PIIX_GPI_BD_SHIFT; | ||
279 | |||
280 | if (visws_board_type < 0) | ||
281 | return; | ||
282 | |||
283 | /* | ||
284 | * Install special quirks for timer, interrupt and memory setup: | ||
285 | * Fall back to generic behavior for traps: | ||
286 | * Override generic MP-table parsing: | ||
287 | */ | ||
288 | x86_quirks = &visws_x86_quirks; | ||
289 | |||
290 | /* | ||
291 | * Install reboot quirks: | ||
292 | */ | ||
293 | pm_power_off = visws_machine_power_off; | ||
294 | machine_ops.emergency_restart = visws_machine_emergency_restart; | ||
295 | |||
296 | /* | ||
297 | * Do not use broadcast IPIs: | ||
298 | */ | ||
299 | no_broadcast = 0; | ||
300 | |||
301 | #ifdef CONFIG_X86_IO_APIC | ||
302 | /* | ||
303 | * Turn off IO-APIC detection and initialization: | ||
304 | */ | ||
305 | skip_ioapic_setup = 1; | ||
306 | #endif | ||
307 | |||
308 | /* | ||
309 | * Get Board rev. | ||
310 | * First, we have to initialize the 307 part to allow us access | ||
311 | * to the GPIO registers. Let's map them at 0x0fc0 which is right | ||
312 | * after the PIIX4 PM section. | ||
313 | */ | ||
314 | outb_p(SIO_DEV_SEL, SIO_INDEX); | ||
315 | outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ | ||
316 | |||
317 | outb_p(SIO_DEV_MSB, SIO_INDEX); | ||
318 | outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ | ||
319 | |||
320 | outb_p(SIO_DEV_LSB, SIO_INDEX); | ||
321 | outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ | ||
322 | |||
323 | outb_p(SIO_DEV_ENB, SIO_INDEX); | ||
324 | outb_p(1, SIO_DATA); /* Enable GPIO registers. */ | ||
325 | |||
326 | /* | ||
327 | * Now, we have to map the power management section to write | ||
328 | * a bit which enables access to the GPIO registers. | ||
329 | * What lunatic came up with this shit? | ||
330 | */ | ||
331 | outb_p(SIO_DEV_SEL, SIO_INDEX); | ||
332 | outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ | ||
333 | |||
334 | outb_p(SIO_DEV_MSB, SIO_INDEX); | ||
335 | outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ | ||
336 | |||
337 | outb_p(SIO_DEV_LSB, SIO_INDEX); | ||
338 | outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ | ||
339 | |||
340 | outb_p(SIO_DEV_ENB, SIO_INDEX); | ||
341 | outb_p(1, SIO_DATA); /* Enable PM registers. */ | ||
342 | |||
343 | /* | ||
344 | * Now, write the PM register which enables the GPIO registers. | ||
345 | */ | ||
346 | outb_p(SIO_PM_FER2, SIO_PM_INDEX); | ||
347 | outb_p(SIO_PM_GP_EN, SIO_PM_DATA); | ||
348 | |||
349 | /* | ||
350 | * Now, initialize the GPIO registers. | ||
351 | * We want them all to be inputs, which is the | ||
352 | * power-on default, so we can leave them alone | ||
353 | * and simply read the board revision. | ||
354 | */ | ||
355 | raw = inb_p(SIO_GP_DATA1); | ||
356 | raw &= 0x7f; /* 7 bits of valid board revision ID. */ | ||
357 | |||
358 | if (visws_board_type == VISWS_320) { | ||
359 | if (raw < 0x6) { | ||
360 | visws_board_rev = 4; | ||
361 | } else if (raw < 0xc) { | ||
362 | visws_board_rev = 5; | ||
363 | } else { | ||
364 | visws_board_rev = 6; | ||
365 | } | ||
366 | } else if (visws_board_type == VISWS_540) { | ||
367 | visws_board_rev = 2; | ||
368 | } else { | ||
369 | visws_board_rev = raw; | ||
370 | } | ||
371 | |||
372 | printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", | ||
373 | (visws_board_type == VISWS_320 ? "320" : | ||
374 | (visws_board_type == VISWS_540 ? "540" : | ||
375 | "unknown")), visws_board_rev); | ||
376 | } | ||
377 | |||
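The long run of outb_p() pairs above follows the classic Super I/O index/data idiom: write a register index to SIO_INDEX, then its value to SIO_DATA. A hypothetical helper (not present in this file) makes the pattern explicit:

	static void __init sio_write(u8 reg, u8 val)
	{
		outb_p(reg, SIO_INDEX);	/* select a config register */
		outb_p(val, SIO_DATA);	/* program its value */
	}

With such a helper, each device-enable sequence above would collapse to calls like sio_write(SIO_DEV_SEL, SIO_GP_DEV) followed by sio_write(SIO_DEV_ENB, 1).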
378 | #define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) | ||
379 | #define BCD (LI_INTB | LI_INTC | LI_INTD) | ||
380 | #define ALLDEVS (A01234 | BCD) | ||
381 | |||
382 | static __init void lithium_init(void) | ||
383 | { | ||
384 | set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); | ||
385 | set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); | ||
386 | |||
387 | if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || | ||
388 | (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { | ||
389 | printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); | ||
390 | /* panic("This machine is not SGI Visual Workstation 320/540"); */ | ||
391 | } | ||
392 | |||
393 | if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || | ||
394 | (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { | ||
395 | printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); | ||
396 | /* panic("This machine is not SGI Visual Workstation 320/540"); */ | ||
397 | } | ||
398 | |||
399 | li_pcia_write16(LI_PCI_INTEN, ALLDEVS); | ||
400 | li_pcib_write16(LI_PCI_INTEN, ALLDEVS); | ||
401 | } | ||
402 | |||
403 | static __init void cobalt_init(void) | ||
404 | { | ||
405 | /* | ||
406 | * On a normal SMP PC the local APIC is used only for SMP, but | ||
407 | * here we must set it up unconditionally to start the Cobalt clock | ||
408 | */ | ||
409 | set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); | ||
410 | setup_local_APIC(); | ||
411 | printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", | ||
412 | (unsigned int)apic_read(APIC_LVR), | ||
413 | (unsigned int)apic_read(APIC_ID)); | ||
414 | |||
415 | set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); | ||
416 | set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); | ||
417 | printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", | ||
418 | co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); | ||
419 | |||
420 | /* Enable the Cobalt APIC, being careful NOT to change the ID! */ | ||
421 | co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); | ||
422 | |||
423 | printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", | ||
424 | co_apic_read(CO_APIC_ID)); | ||
425 | } | ||
426 | |||
427 | static int __init visws_trap_init(void) | ||
428 | { | ||
429 | lithium_init(); | ||
430 | cobalt_init(); | ||
431 | |||
432 | return 1; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * IRQ controller / APIC support: | ||
437 | */ | ||
438 | |||
439 | static DEFINE_SPINLOCK(cobalt_lock); | ||
440 | |||
441 | /* | ||
442 | * Set the given Cobalt APIC Redirection Table entry to point | ||
443 | * to the given IDT vector/index. | ||
444 | */ | ||
445 | static inline void co_apic_set(int entry, int irq) | ||
446 | { | ||
447 | co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); | ||
448 | co_apic_write(CO_APIC_HI(entry), 0); | ||
449 | } | ||
450 | |||
451 | /* | ||
452 | * Cobalt (IO)-APIC functions to handle PCI devices. | ||
453 | */ | ||
454 | static inline int co_apic_ide0_hack(void) | ||
455 | { | ||
456 | extern char visws_board_type; | ||
457 | extern char visws_board_rev; | ||
458 | |||
459 | if (visws_board_type == VISWS_320 && visws_board_rev == 5) | ||
460 | return 5; | ||
461 | return CO_APIC_IDE0; | ||
462 | } | ||
463 | |||
464 | static int is_co_apic(unsigned int irq) | ||
465 | { | ||
466 | if (IS_CO_APIC(irq)) | ||
467 | return CO_APIC(irq); | ||
468 | |||
469 | switch (irq) { | ||
470 | case 0: return CO_APIC_CPU; | ||
471 | case CO_IRQ_IDE0: return co_apic_ide0_hack(); | ||
472 | case CO_IRQ_IDE1: return CO_APIC_IDE1; | ||
473 | default: return -1; | ||
474 | } | ||
475 | } | ||
476 | |||
477 | |||
478 | /* | ||
479 | * This is the SGI Cobalt (IO-)APIC: | ||
480 | */ | ||
481 | |||
482 | static void enable_cobalt_irq(unsigned int irq) | ||
483 | { | ||
484 | co_apic_set(is_co_apic(irq), irq); | ||
485 | } | ||
486 | |||
487 | static void disable_cobalt_irq(unsigned int irq) | ||
488 | { | ||
489 | int entry = is_co_apic(irq); | ||
490 | |||
491 | co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); | ||
492 | co_apic_read(CO_APIC_LO(entry)); | ||
493 | } | ||
494 | |||
495 | /* | ||
496 | * "irq" really just serves to identify the device. Here is where we | ||
497 | * map this to the Cobalt APIC entry where it's physically wired. | ||
498 | * This is called via request_irq -> setup_irq -> irq_desc->startup() | ||
499 | */ | ||
500 | static unsigned int startup_cobalt_irq(unsigned int irq) | ||
501 | { | ||
502 | unsigned long flags; | ||
503 | |||
504 | spin_lock_irqsave(&cobalt_lock, flags); | ||
505 | if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) | ||
506 | irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); | ||
507 | enable_cobalt_irq(irq); | ||
508 | spin_unlock_irqrestore(&cobalt_lock, flags); | ||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static void ack_cobalt_irq(unsigned int irq) | ||
513 | { | ||
514 | unsigned long flags; | ||
515 | |||
516 | spin_lock_irqsave(&cobalt_lock, flags); | ||
517 | disable_cobalt_irq(irq); | ||
518 | apic_write(APIC_EOI, APIC_EIO_ACK); | ||
519 | spin_unlock_irqrestore(&cobalt_lock, flags); | ||
520 | } | ||
521 | |||
522 | static void end_cobalt_irq(unsigned int irq) | ||
523 | { | ||
524 | unsigned long flags; | ||
525 | |||
526 | spin_lock_irqsave(&cobalt_lock, flags); | ||
527 | if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) | ||
528 | enable_cobalt_irq(irq); | ||
529 | spin_unlock_irqrestore(&cobalt_lock, flags); | ||
530 | } | ||
531 | |||
532 | static struct irq_chip cobalt_irq_type = { | ||
533 | .typename = "Cobalt-APIC", | ||
534 | .startup = startup_cobalt_irq, | ||
535 | .shutdown = disable_cobalt_irq, | ||
536 | .enable = enable_cobalt_irq, | ||
537 | .disable = disable_cobalt_irq, | ||
538 | .ack = ack_cobalt_irq, | ||
539 | .end = end_cobalt_irq, | ||
540 | }; | ||
541 | |||
542 | |||
543 | /* | ||
544 | * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt | ||
545 | * -- not in the manner expected by the code in i8259.c. | ||
546 | * | ||
547 | * There is a single 'master' physical interrupt source that gets sent | ||
548 | * to the CPU, but in the chipset there are various 'virtual' interrupts | ||
549 | * waiting to be handled. We represent this to Linux through a 'master' | ||
550 | * interrupt-controller type and a special virtual interrupt | ||
551 | * controller. Device drivers only ever see the virtual interrupt sources. | ||
552 | */ | ||
553 | static unsigned int startup_piix4_master_irq(unsigned int irq) | ||
554 | { | ||
555 | init_8259A(0); | ||
556 | |||
557 | return startup_cobalt_irq(irq); | ||
558 | } | ||
559 | |||
560 | static void end_piix4_master_irq(unsigned int irq) | ||
561 | { | ||
562 | unsigned long flags; | ||
563 | |||
564 | spin_lock_irqsave(&cobalt_lock, flags); | ||
565 | enable_cobalt_irq(irq); | ||
566 | spin_unlock_irqrestore(&cobalt_lock, flags); | ||
567 | } | ||
568 | |||
569 | static struct irq_chip piix4_master_irq_type = { | ||
570 | .typename = "PIIX4-master", | ||
571 | .startup = startup_piix4_master_irq, | ||
572 | .ack = ack_cobalt_irq, | ||
573 | .end = end_piix4_master_irq, | ||
574 | }; | ||
575 | |||
576 | |||
577 | static struct irq_chip piix4_virtual_irq_type = { | ||
578 | .typename = "PIIX4-virtual", | ||
579 | .shutdown = disable_8259A_irq, | ||
580 | .enable = enable_8259A_irq, | ||
581 | .disable = disable_8259A_irq, | ||
582 | }; | ||
583 | |||
584 | |||
585 | /* | ||
586 | * PIIX4-8259 master/virtual functions to handle interrupt requests | ||
587 | * from legacy devices: floppy, parallel, serial, rtc. | ||
588 | * | ||
589 | * None of these get Cobalt APIC entries, nor do they have IDT | ||
590 | * entries. These interrupts are purely virtual and distributed from | ||
591 | * the 'master' interrupt source: CO_IRQ_8259. | ||
592 | * | ||
593 | * When the 8259 interrupts, its handler figures out which of these | ||
594 | * devices is interrupting and dispatches to that device's handler. | ||
595 | * | ||
596 | * CAREFUL: devices see only the 'virtual' interrupt, so disable_irq()/ | ||
597 | * enable_irq() act on the right irq. The 'master' irq is never directly | ||
598 | * manipulated by any driver. | ||
599 | */ | ||
600 | static irqreturn_t piix4_master_intr(int irq, void *dev_id) | ||
601 | { | ||
602 | int realirq; | ||
603 | irq_desc_t *desc; | ||
604 | unsigned long flags; | ||
605 | |||
606 | spin_lock_irqsave(&i8259A_lock, flags); | ||
607 | |||
608 | /* Find out what's interrupting in the PIIX4 master 8259 */ | ||
609 | outb(0x0c, 0x20); /* OCW3 Poll command */ | ||
610 | realirq = inb(0x20); | ||
611 | |||
612 | /* | ||
613 | * Bit 7 == 0 means invalid/spurious | ||
614 | */ | ||
615 | if (unlikely(!(realirq & 0x80))) | ||
616 | goto out_unlock; | ||
617 | |||
618 | realirq &= 7; | ||
619 | |||
620 | if (unlikely(realirq == 2)) { | ||
621 | outb(0x0c, 0xa0); | ||
622 | realirq = inb(0xa0); | ||
623 | |||
624 | if (unlikely(!(realirq & 0x80))) | ||
625 | goto out_unlock; | ||
626 | |||
627 | realirq = (realirq & 7) + 8; | ||
628 | } | ||
629 | |||
630 | /* mask and ack interrupt */ | ||
631 | cached_irq_mask |= 1 << realirq; | ||
632 | if (unlikely(realirq > 7)) { | ||
633 | inb(0xa1); | ||
634 | outb(cached_slave_mask, 0xa1); | ||
635 | outb(0x60 + (realirq & 7), 0xa0); | ||
636 | outb(0x60 + 2, 0x20); | ||
637 | } else { | ||
638 | inb(0x21); | ||
639 | outb(cached_master_mask, 0x21); | ||
640 | outb(0x60 + realirq, 0x20); | ||
641 | } | ||
642 | |||
643 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
644 | |||
645 | desc = irq_desc + realirq; | ||
646 | |||
647 | /* | ||
648 | * handle this 'virtual interrupt' as a Cobalt one now. | ||
649 | */ | ||
650 | kstat_cpu(smp_processor_id()).irqs[realirq]++; | ||
651 | |||
652 | if (likely(desc->action != NULL)) | ||
653 | handle_IRQ_event(realirq, desc->action); | ||
654 | |||
655 | if (!(desc->status & IRQ_DISABLED)) | ||
656 | enable_8259A_irq(realirq); | ||
657 | |||
658 | return IRQ_HANDLED; | ||
659 | |||
660 | out_unlock: | ||
661 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
662 | return IRQ_NONE; | ||
663 | } | ||
664 | |||
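The register dance above is the standard 8259 poll protocol: writing OCW3 with the poll bit set (0x0c) to a PIC's command port makes the next read of that same port return the highest-priority pending IRQ, with bit 7 set when the result is valid. As a standalone sketch (illustration only, not code from this commit):

	static int i8259_poll_irq(u16 port)
	{
		int v;

		outb(0x0c, port);	/* OCW3: poll command */
		v = inb(port);
		return (v & 0x80) ? (v & 7) : -1;	/* bit 7 clear: spurious */
	}

piix4_master_intr() performs this twice: once on the master PIC (port 0x20), and again on the slave (port 0xa0) when the master reports IRQ 2, the cascade line.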
665 | static struct irqaction master_action = { | ||
666 | .handler = piix4_master_intr, | ||
667 | .name = "PIIX4-8259", | ||
668 | }; | ||
669 | |||
670 | static struct irqaction cascade_action = { | ||
671 | .handler = no_action, | ||
672 | .name = "cascade", | ||
673 | }; | ||
674 | |||
675 | |||
676 | void init_VISWS_APIC_irqs(void) | ||
677 | { | ||
678 | int i; | ||
679 | |||
680 | for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { | ||
681 | irq_desc[i].status = IRQ_DISABLED; | ||
682 | irq_desc[i].action = NULL; | ||
683 | irq_desc[i].depth = 1; | ||
684 | |||
685 | if (i == 0) { | ||
686 | irq_desc[i].chip = &cobalt_irq_type; | ||
687 | } | ||
688 | else if (i == CO_IRQ_IDE0) { | ||
689 | irq_desc[i].chip = &cobalt_irq_type; | ||
690 | } | ||
691 | else if (i == CO_IRQ_IDE1) { | ||
692 | irq_desc[i].chip = &cobalt_irq_type; | ||
693 | } | ||
694 | else if (i == CO_IRQ_8259) { | ||
695 | irq_desc[i].chip = &piix4_master_irq_type; | ||
696 | } | ||
697 | else if (i < CO_IRQ_APIC0) { | ||
698 | irq_desc[i].chip = &piix4_virtual_irq_type; | ||
699 | } | ||
700 | else if (IS_CO_APIC(i)) { | ||
701 | irq_desc[i].chip = &cobalt_irq_type; | ||
702 | } | ||
703 | } | ||
704 | |||
705 | setup_irq(CO_IRQ_8259, &master_action); | ||
706 | setup_irq(2, &cascade_action); | ||
707 | } | ||
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 956f38927aa7..0a1b1a9d922d 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -151,7 +151,7 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, | |||
151 | insns, ip); | 151 | insns, ip); |
152 | case PARAVIRT_PATCH(pv_cpu_ops.iret): | 152 | case PARAVIRT_PATCH(pv_cpu_ops.iret): |
153 | return patch_internal(VMI_CALL_IRET, len, insns, ip); | 153 | return patch_internal(VMI_CALL_IRET, len, insns, ip); |
154 | case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): | 154 | case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): |
155 | return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); | 155 | return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); |
156 | default: | 156 | default: |
157 | break; | 157 | break; |
@@ -896,7 +896,7 @@ static inline int __init activate_vmi(void) | |||
896 | * the backend. They are performance critical anyway, so requiring | 896 | * the backend. They are performance critical anyway, so requiring |
897 | * a patch is not a big problem. | 897 | * a patch is not a big problem. |
898 | */ | 898 | */ |
899 | pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0; | 899 | pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; |
900 | pv_cpu_ops.iret = (void *)0xbadbab0; | 900 | pv_cpu_ops.iret = (void *)0xbadbab0; |
901 | 901 | ||
902 | #ifdef CONFIG_SMP | 902 | #ifdef CONFIG_SMP |
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void) | |||
906 | #ifdef CONFIG_X86_LOCAL_APIC | 906 | #ifdef CONFIG_X86_LOCAL_APIC |
907 | para_fill(pv_apic_ops.apic_read, APICRead); | 907 | para_fill(pv_apic_ops.apic_read, APICRead); |
908 | para_fill(pv_apic_ops.apic_write, APICWrite); | 908 | para_fill(pv_apic_ops.apic_write, APICWrite); |
909 | para_fill(pv_apic_ops.apic_write_atomic, APICWrite); | ||
910 | #endif | 909 | #endif |
911 | 910 | ||
912 | /* | 911 | /* |
@@ -932,7 +931,7 @@ static inline int __init activate_vmi(void) | |||
932 | pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; | 931 | pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; |
933 | #endif | 932 | #endif |
934 | pv_time_ops.sched_clock = vmi_sched_clock; | 933 | pv_time_ops.sched_clock = vmi_sched_clock; |
935 | pv_time_ops.get_cpu_khz = vmi_cpu_khz; | 934 | pv_time_ops.get_tsc_khz = vmi_tsc_khz; |
936 | 935 | ||
937 | /* We have true wallclock functions; disable CMOS clock sync */ | 936 | /* We have true wallclock functions; disable CMOS clock sync */ |
938 | no_sync_cmos_clock = 1; | 937 | no_sync_cmos_clock = 1; |
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index a2b030780aa9..6953859fe289 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c | |||
@@ -33,8 +33,7 @@ | |||
33 | #include <asm/apic.h> | 33 | #include <asm/apic.h> |
34 | #include <asm/timer.h> | 34 | #include <asm/timer.h> |
35 | #include <asm/i8253.h> | 35 | #include <asm/i8253.h> |
36 | 36 | #include <asm/irq_vectors.h> | |
37 | #include <irq_vectors.h> | ||
38 | 37 | ||
39 | #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) | 38 | #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) |
40 | #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) | 39 | #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) |
@@ -70,8 +69,8 @@ unsigned long long vmi_sched_clock(void) | |||
70 | return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); | 69 | return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); |
71 | } | 70 | } |
72 | 71 | ||
73 | /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ | 72 | /* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ |
74 | unsigned long vmi_cpu_khz(void) | 73 | unsigned long vmi_tsc_khz(void) |
75 | { | 74 | { |
76 | unsigned long long khz; | 75 | unsigned long long khz; |
77 | khz = vmi_timer_ops.get_cycle_frequency(); | 76 | khz = vmi_timer_ops.get_cycle_frequency(); |
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed083a1e9..cdb2363697d2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S | |||
@@ -49,23 +49,14 @@ SECTIONS | |||
49 | _etext = .; /* End of text section */ | 49 | _etext = .; /* End of text section */ |
50 | } :text = 0x9090 | 50 | } :text = 0x9090 |
51 | 51 | ||
52 | NOTES :text :note | ||
53 | |||
52 | . = ALIGN(16); /* Exception table */ | 54 | . = ALIGN(16); /* Exception table */ |
53 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { | 55 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { |
54 | __start___ex_table = .; | 56 | __start___ex_table = .; |
55 | *(__ex_table) | 57 | *(__ex_table) |
56 | __stop___ex_table = .; | 58 | __stop___ex_table = .; |
57 | } | 59 | } :text = 0x9090 |
58 | |||
59 | NOTES :text :note | ||
60 | |||
61 | BUG_TABLE :text | ||
62 | |||
63 | . = ALIGN(4); | ||
64 | .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { | ||
65 | __tracedata_start = .; | ||
66 | *(.tracedata) | ||
67 | __tracedata_end = .; | ||
68 | } | ||
69 | 60 | ||
70 | RODATA | 61 | RODATA |
71 | 62 | ||
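NOTES here is the generic linker-script helper from include/asm-generic/vmlinux.lds.h; moving it up next to the text section, with the :text :note program headers supplied at the use site, keeps the build-time note sections (e.g. .note.ABI-tag) in a sane place. For orientation only, the macro of this era expands to roughly the following sketch (the exact definition lives in the generic header, not in this diff); the same movement is repeated in the 64-bit script below.

	#define NOTES							\
		.notes : AT(ADDR(.notes) - LOAD_OFFSET) {		\
			VMLINUX_SYMBOL(__start_notes) = .;		\
			*(.note.*)					\
			VMLINUX_SYMBOL(__stop_notes) = .;		\
		}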
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674b06a5..63e5c1a22e88 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S | |||
@@ -19,7 +19,7 @@ PHDRS { | |||
19 | data PT_LOAD FLAGS(7); /* RWE */ | 19 | data PT_LOAD FLAGS(7); /* RWE */ |
20 | user PT_LOAD FLAGS(7); /* RWE */ | 20 | user PT_LOAD FLAGS(7); /* RWE */ |
21 | data.init PT_LOAD FLAGS(7); /* RWE */ | 21 | data.init PT_LOAD FLAGS(7); /* RWE */ |
22 | note PT_NOTE FLAGS(4); /* R__ */ | 22 | note PT_NOTE FLAGS(0); /* ___ */ |
23 | } | 23 | } |
24 | SECTIONS | 24 | SECTIONS |
25 | { | 25 | { |
@@ -40,26 +40,17 @@ SECTIONS | |||
40 | _etext = .; /* End of text section */ | 40 | _etext = .; /* End of text section */ |
41 | } :text = 0x9090 | 41 | } :text = 0x9090 |
42 | 42 | ||
43 | NOTES :text :note | ||
44 | |||
43 | . = ALIGN(16); /* Exception table */ | 45 | . = ALIGN(16); /* Exception table */ |
44 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { | 46 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { |
45 | __start___ex_table = .; | 47 | __start___ex_table = .; |
46 | *(__ex_table) | 48 | *(__ex_table) |
47 | __stop___ex_table = .; | 49 | __stop___ex_table = .; |
48 | } | 50 | } :text = 0x9090 |
49 | |||
50 | NOTES :text :note | ||
51 | |||
52 | BUG_TABLE :text | ||
53 | 51 | ||
54 | RODATA | 52 | RODATA |
55 | 53 | ||
56 | . = ALIGN(4); | ||
57 | .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { | ||
58 | __tracedata_start = .; | ||
59 | *(.tracedata) | ||
60 | __tracedata_end = .; | ||
61 | } | ||
62 | |||
63 | . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ | 54 | . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ |
64 | /* Data */ | 55 | /* Data */ |
65 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | 56 | .data : AT(ADDR(.data) - LOAD_OFFSET) { |
@@ -177,6 +168,7 @@ SECTIONS | |||
177 | *(.con_initcall.init) | 168 | *(.con_initcall.init) |
178 | } | 169 | } |
179 | __con_initcall_end = .; | 170 | __con_initcall_end = .; |
171 | . = ALIGN(16); | ||
180 | __x86cpuvendor_start = .; | 172 | __x86cpuvendor_start = .; |
181 | .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { | 173 | .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { |
182 | *(.x86cpuvendor.init) | 174 | *(.x86cpuvendor.init) |
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index ba8c0b75ab0a..0c029e8959c7 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c | |||
@@ -15,9 +15,12 @@ | |||
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/pci_ids.h> | 16 | #include <linux/pci_ids.h> |
17 | #include <linux/pci_regs.h> | 17 | #include <linux/pci_regs.h> |
18 | |||
19 | #include <asm/apic.h> | ||
18 | #include <asm/pci-direct.h> | 20 | #include <asm/pci-direct.h> |
19 | #include <asm/io.h> | 21 | #include <asm/io.h> |
20 | #include <asm/paravirt.h> | 22 | #include <asm/paravirt.h> |
23 | #include <asm/setup.h> | ||
21 | 24 | ||
22 | #if defined CONFIG_PCI && defined CONFIG_PARAVIRT | 25 | #if defined CONFIG_PCI && defined CONFIG_PARAVIRT |
23 | /* | 26 | /* |
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 61efa2f7d564..0b8b6690a86d 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
@@ -42,7 +42,8 @@ | |||
42 | #include <asm/topology.h> | 42 | #include <asm/topology.h> |
43 | #include <asm/vgtod.h> | 43 | #include <asm/vgtod.h> |
44 | 44 | ||
45 | #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) | 45 | #define __vsyscall(nr) \ |
46 | __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace | ||
46 | #define __syscall_clobber "r11","cx","memory" | 47 | #define __syscall_clobber "r11","cx","memory" |
47 | 48 | ||
48 | /* | 49 | /* |
@@ -249,7 +250,7 @@ static ctl_table kernel_root_table2[] = { | |||
249 | doesn't violate that. We'll find out if it does. */ | 250 | doesn't violate that. We'll find out if it does. */ |
250 | static void __cpuinit vsyscall_set_cpu(int cpu) | 251 | static void __cpuinit vsyscall_set_cpu(int cpu) |
251 | { | 252 | { |
252 | unsigned long *d; | 253 | unsigned long d; |
253 | unsigned long node = 0; | 254 | unsigned long node = 0; |
254 | #ifdef CONFIG_NUMA | 255 | #ifdef CONFIG_NUMA |
255 | node = cpu_to_node(cpu); | 256 | node = cpu_to_node(cpu); |
@@ -260,11 +261,11 @@ static void __cpuinit vsyscall_set_cpu(int cpu) | |||
260 | /* Store cpu number in limit so that it can be loaded quickly | 261 | /* Store cpu number in limit so that it can be loaded quickly |
261 | in user space in vgetcpu. | 262 | in user space in vgetcpu. |
262 | 12 bits for the CPU and 8 bits for the node. */ | 263 | 12 bits for the CPU and 8 bits for the node. */ |
263 | d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU); | 264 | d = 0x0f40000000000ULL; |
264 | *d = 0x0f40000000000ULL; | 265 | d |= cpu; |
265 | *d |= cpu; | 266 | d |= (node & 0xf) << 12; |
266 | *d |= (node & 0xf) << 12; | 267 | d |= (node >> 4) << 48; |
267 | *d |= (node >> 4) << 48; | 268 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); |
268 | } | 269 | } |
269 | 270 | ||
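The GDT entry built above is consumed from user space by the vgetcpu vsyscall, which loads this descriptor's segment limit with the LSL instruction. The decode side looks roughly like this (a sketch modeled on the vgetcpu code; the field split matches the comment above, 12 bits of CPU and 8 bits of node):

	unsigned int p, cpu, node;

	/* LSL yields the segment limit of the GDT_ENTRY_PER_CPU slot */
	asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	cpu  = p & 0xfff;	/* low 12 bits: CPU number */
	node = p >> 12;		/* next 8 bits: node number */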
270 | static void __cpuinit cpu_vsyscall_init(void *arg) | 271 | static void __cpuinit cpu_vsyscall_init(void *arg) |
@@ -278,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) | |||
278 | { | 279 | { |
279 | long cpu = (long)arg; | 280 | long cpu = (long)arg; |
280 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) | 281 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) |
281 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); | 282 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); |
282 | return NOTIFY_DONE; | 283 | return NOTIFY_DONE; |
283 | } | 284 | } |
284 | 285 | ||
@@ -301,7 +302,7 @@ static int __init vsyscall_init(void) | |||
301 | #ifdef CONFIG_SYSCTL | 302 | #ifdef CONFIG_SYSCTL |
302 | register_sysctl_table(kernel_root_table2); | 303 | register_sysctl_table(kernel_root_table2); |
303 | #endif | 304 | #endif |
304 | on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); | 305 | on_each_cpu(cpu_vsyscall_init, NULL, 1); |
305 | hotcpu_notifier(cpu_vsyscall_notifier, 0); | 306 | hotcpu_notifier(cpu_vsyscall_notifier, 0); |
306 | return 0; | 307 | return 0; |
307 | } | 308 | } |
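The dropped-argument changes above track an API cleanup merged in this window: the unused nonatomic/retry parameter was removed from the cross-CPU call helpers. The updated call sites imply signatures of roughly this shape (paraphrased for illustration, not quoted from a header):

	int smp_call_function_single(int cpu, void (*func)(void *info),
				     void *info, int wait);
	int on_each_cpu(void (*func)(void *info), void *info, int wait);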
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index f6c05d0410fb..b545f371b5f5 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c | |||
@@ -2,13 +2,20 @@ | |||
2 | All C exports should go in the respective C files. */ | 2 | All C exports should go in the respective C files. */ |
3 | 3 | ||
4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
5 | #include <net/checksum.h> | ||
6 | #include <linux/smp.h> | 5 | #include <linux/smp.h> |
7 | 6 | ||
7 | #include <net/checksum.h> | ||
8 | |||
8 | #include <asm/processor.h> | 9 | #include <asm/processor.h> |
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | 10 | #include <asm/pgtable.h> |
11 | #include <asm/uaccess.h> | ||
11 | #include <asm/desc.h> | 12 | #include <asm/desc.h> |
13 | #include <asm/ftrace.h> | ||
14 | |||
15 | #ifdef CONFIG_FTRACE | ||
16 | /* mcount is defined in assembly */ | ||
17 | EXPORT_SYMBOL(mcount); | ||
18 | #endif | ||
12 | 19 | ||
13 | EXPORT_SYMBOL(kernel_thread); | 20 | EXPORT_SYMBOL(kernel_thread); |
14 | 21 | ||
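The new mcount export pairs with the CONFIG_FTRACE build changes: with ftrace enabled the kernel is compiled with -pg, so every module function gains a compiler-generated call to mcount at entry, and modules can only link if the symbol is exported. Schematically (an illustration of gcc -pg instrumentation, not real source):

	void some_module_function(void)
	{
		mcount();	/* call inserted at function entry by gcc -pg */
		/* ... normal function body ... */
	}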
@@ -53,8 +60,3 @@ EXPORT_SYMBOL(init_level4_pgt); | |||
53 | EXPORT_SYMBOL(load_gs_index); | 60 | EXPORT_SYMBOL(load_gs_index); |
54 | 61 | ||
55 | EXPORT_SYMBOL(_proxy_pda); | 62 | EXPORT_SYMBOL(_proxy_pda); |
56 | |||
57 | #ifdef CONFIG_PARAVIRT | ||
58 | /* Virtualized guests may want to use it */ | ||
59 | EXPORT_SYMBOL_GPL(cpu_gdt_descr); | ||
60 | #endif | ||