Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 9
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 14
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.h | 6
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 6
-rw-r--r--  arch/x86/kernel/alternative.c | 23
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 2764
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 1572
-rw-r--r--  arch/x86/kernel/apb_timer.c | 410
-rw-r--r--  arch/x86/kernel/apic/apic.c | 27
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 91
-rw-r--r--  arch/x86/kernel/apm_32.c | 8
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 1
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 18
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 152
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 288
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 10
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 182
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 168
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 14
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 385
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 119
-rw-r--r--  arch/x86/kernel/devicetree.c | 60
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 37
-rw-r--r--  arch/x86/kernel/entry_64.S | 84
-rw-r--r--  arch/x86/kernel/hpet.c | 14
-rw-r--r--  arch/x86/kernel/i8253.c | 99
-rw-r--r--  arch/x86/kernel/irqinit.c | 3
-rw-r--r--  arch/x86/kernel/kgdb.c | 4
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 21
-rw-r--r--  arch/x86/kernel/ptrace.c | 5
-rw-r--r--  arch/x86/kernel/quirks.c | 5
-rw-r--r--  arch/x86/kernel/reboot.c | 32
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 2
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 2
-rw-r--r--  arch/x86/kernel/signal.c | 56
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/kernel/stacktrace.c | 2
-rw-r--r--  arch/x86/kernel/tboot.c | 1
-rw-r--r--  arch/x86/kernel/time.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 6
-rw-r--r--  arch/x86/kernel/tsc.c | 26
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 49
-rw-r--r--  arch/x86/kernel/vread_tsc_64.c | 36
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 310
-rw-r--r--  arch/x86/kernel/vsyscall_emu_64.S | 27
47 files changed, 1335 insertions, 5825 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90b06d4daee2..04105574c8e9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,17 +24,12 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
-CFLAGS_vread_tsc_64.o := $(nostackp)
 CFLAGS_paravirt.o := $(nostackp)
 GCOV_PROFILE_vsyscall_64.o := n
 GCOV_PROFILE_hpet.o := n
 GCOV_PROFILE_tsc.o := n
-GCOV_PROFILE_vread_tsc_64.o := n
 GCOV_PROFILE_paravirt.o := n
 
-# vread_tsc_64 is hot and should be fully optimized:
-CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
-
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -43,7 +38,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-y += probe_roms.o
 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
 obj-y += bootflag.o e820.o
 obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -123,7 +119,6 @@ ifeq ($(CONFIG_X86_64),y)
 
         obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
         obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
-        obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
 
         obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
         obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index ead21b663117..b4fd836e4053 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -28,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */
 pmode_cr4:      .long   0       /* Saved %cr4 */
 pmode_efer:     .quad   0       /* Saved EFER */
 pmode_gdt:      .quad   0
+pmode_misc_en:  .quad   0       /* Saved MISC_ENABLE MSR */
+pmode_behavior: .long   0       /* Wakeup behavior flags */
 realmode_flags: .long   0
 real_magic:     .long   0
 trampoline_segment:     .word 0
@@ -91,6 +93,18 @@ wakeup_code:
         /* Call the C code */
         calll   main
 
+        /* Restore MISC_ENABLE before entering protected mode, in case
+           BIOS decided to clear XD_DISABLE during S3. */
+        movl    pmode_behavior, %eax
+        btl     $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+        jnc     1f
+
+        movl    pmode_misc_en, %eax
+        movl    pmode_misc_en + 4, %edx
+        movl    $MSR_IA32_MISC_ENABLE, %ecx
+        wrmsr
+1:
+
         /* Do any other stuff... */
 
 #ifndef CONFIG_64BIT
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index e1828c07e79c..97a29e1430e3 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
         u32 pmode_efer_low;     /* Protected mode EFER */
         u32 pmode_efer_high;
         u64 pmode_gdt;
+        u32 pmode_misc_en_low;  /* Protected mode MISC_ENABLE */
+        u32 pmode_misc_en_high;
+        u32 pmode_behavior;     /* Wakeup routine behavior flags */
         u32 realmode_flags;
         u32 real_magic;
         u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
@@ -39,4 +42,7 @@ extern struct wakeup_header wakeup_header;
 #define WAKEUP_HEADER_SIGNATURE 0x51ee1111
 #define WAKEUP_END_SIGNATURE    0x65a22c82
 
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
+
 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 18a857ba7a25..103b6ab368d3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -77,6 +77,12 @@ int acpi_suspend_lowlevel(void)
 
         header->pmode_cr0 = read_cr0();
         header->pmode_cr4 = read_cr4_safe();
+        header->pmode_behavior = 0;
+        if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+                        &header->pmode_misc_en_low,
+                        &header->pmode_misc_en_high))
+                header->pmode_behavior |=
+                        (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
         header->realmode_flags = acpi_realmode_flags;
         header->real_magic = 0x12345678;
 
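
Taken together, the wakeup.h, wakeup.S and sleep.c hunks above implement a save-on-suspend / conditionally-restore-on-resume scheme for MSR_IA32_MISC_ENABLE: the suspend path records the MSR only if it can be read, notes that fact in a behavior flag word, and the real-mode wakeup stub restores the MSR only when that flag bit is set. Below is a minimal, self-contained C sketch of the same pattern; the helper names and values are placeholders for illustration, not the kernel's own API (the real code uses rdmsr_safe()/wrmsr and the assembly in wakeup.S).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESTORE_MISC_ENABLE_BIT 0  /* mirrors WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE */

struct wakeup_state {
        uint32_t misc_en_low, misc_en_high;  /* saved MSR halves */
        uint32_t behavior;                   /* flags consumed by the resume path */
};

/* Placeholder accessors for this sketch only. */
static bool read_msr(uint32_t *lo, uint32_t *hi)
{
        *lo = 0x800; *hi = 0;   /* pretend the read succeeded with some value */
        return true;            /* rdmsr_safe() instead returns 0 on success */
}

static void write_msr(uint32_t lo, uint32_t hi)
{
        printf("restoring MSR: hi=%#x lo=%#x\n", (unsigned)hi, (unsigned)lo);
}

static void save_on_suspend(struct wakeup_state *s)
{
        s->behavior = 0;
        /* Only promise a restore if the MSR could actually be read. */
        if (read_msr(&s->misc_en_low, &s->misc_en_high))
                s->behavior |= 1u << RESTORE_MISC_ENABLE_BIT;
}

static void restore_on_resume(const struct wakeup_state *s)
{
        /* Equivalent of the btl/jnc test added to wakeup.S. */
        if (s->behavior & (1u << RESTORE_MISC_ENABLE_BIT))
                write_msr(s->misc_en_low, s->misc_en_high);
}

int main(void)
{
        struct wakeup_state s;

        save_on_suspend(&s);
        restore_on_resume(&s);
        return 0;
}
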
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a81f2d52f869..c63822816249 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
 #include <asm/pgtable.h>
 #include <asm/mce.h>
 #include <asm/nmi.h>
-#include <asm/vsyscall.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
@@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
-extern char __vsyscall_0;
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -263,6 +261,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
                                          struct alt_instr *end)
 {
         struct alt_instr *a;
+        u8 *instr, *replacement;
         u8 insnbuf[MAX_PATCH_LEN];
 
         DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
@@ -276,25 +275,23 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
          * order.
          */
         for (a = start; a < end; a++) {
-                u8 *instr = a->instr;
+                instr = (u8 *)&a->instr_offset + a->instr_offset;
+                replacement = (u8 *)&a->repl_offset + a->repl_offset;
                 BUG_ON(a->replacementlen > a->instrlen);
                 BUG_ON(a->instrlen > sizeof(insnbuf));
                 BUG_ON(a->cpuid >= NCAPINTS*32);
                 if (!boot_cpu_has(a->cpuid))
                         continue;
-#ifdef CONFIG_X86_64
-                /* vsyscall code is not mapped yet. resolve it manually. */
-                if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
-                        instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
-                        DPRINTK("%s: vsyscall fixup: %p => %p\n",
-                                __func__, a->instr, instr);
-                }
-#endif
-                memcpy(insnbuf, a->replacement, a->replacementlen);
+
+                memcpy(insnbuf, replacement, a->replacementlen);
+
+                /* 0xe8 is a relative jump; fix the offset. */
                 if (*insnbuf == 0xe8 && a->replacementlen == 5)
-                        *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
+                        *(s32 *)(insnbuf + 1) += replacement - instr;
+
                 add_nops(insnbuf + a->replacementlen,
                          a->instrlen - a->replacementlen);
+
                 text_poke_early(instr, insnbuf, a->instrlen);
         }
 }
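
In the apply_alternatives() hunk above, the patch site and replacement bytes are now located through self-relative s32 offsets (pointer = address of the offset field + stored offset), which is why the old vsyscall address fixup can be dropped. The surviving special case handles a 5-byte relative call (opcode 0xe8): when the replacement bytes are copied to a different address, the rel32 displacement must be re-biased by (replacement - instr) so the call still reaches the same absolute target. A small self-contained C sketch of that displacement arithmetic, illustrative only and not the kernel code:

#include <stdint.h>
#include <string.h>

/*
 * Copy a 5-byte "call rel32" (opcode 0xe8) from src to dst while keeping it
 * aimed at the same absolute target.  Since target = location + 5 + rel32,
 * moving the instruction means rel32 must grow by (src - dst), which is
 * exactly the "+= replacement - instr" adjustment in the hunk above.
 */
static void copy_call(uint8_t *dst, const uint8_t *src)
{
        int32_t rel;

        memcpy(dst, src, 5);
        if (dst[0] == 0xe8) {
                memcpy(&rel, src + 1, sizeof(rel));
                rel += (int32_t)(src - dst);
                memcpy(dst + 1, &rel, sizeof(rel));
        }
}

int main(void)
{
        uint8_t buf[16] = { 0xe8, 0, 0, 0, 0 };  /* call at buf, rel32 == 0 */

        copy_call(buf + 8, buf);                 /* move it 8 bytes forward */
        /* buf[9..12] now holds rel32 == -8, so (buf+8) + 5 - 8 == buf + 5,
         * the original target. */
        return 0;
}
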
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 7c3a95e54ec5..000000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2764 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/bitmap.h>
23#include <linux/slab.h>
24#include <linux/debugfs.h>
25#include <linux/scatterlist.h>
26#include <linux/dma-mapping.h>
27#include <linux/iommu-helper.h>
28#include <linux/iommu.h>
29#include <linux/delay.h>
30#include <asm/proto.h>
31#include <asm/iommu.h>
32#include <asm/gart.h>
33#include <asm/dma.h>
34#include <asm/amd_iommu_proto.h>
35#include <asm/amd_iommu_types.h>
36#include <asm/amd_iommu.h>
37
38#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
39
40#define LOOP_TIMEOUT 100000
41
42static DEFINE_RWLOCK(amd_iommu_devtable_lock);
43
44/* A list of preallocated protection domains */
45static LIST_HEAD(iommu_pd_list);
46static DEFINE_SPINLOCK(iommu_pd_list_lock);
47
48/*
49 * Domain for untranslated devices - only allocated
50 * if iommu=pt passed on kernel cmd line.
51 */
52static struct protection_domain *pt_domain;
53
54static struct iommu_ops amd_iommu_ops;
55
56/*
57 * general struct to manage commands send to an IOMMU
58 */
59struct iommu_cmd {
60 u32 data[4];
61};
62
63static void update_domain(struct protection_domain *domain);
64
65/****************************************************************************
66 *
67 * Helper functions
68 *
69 ****************************************************************************/
70
71static inline u16 get_device_id(struct device *dev)
72{
73 struct pci_dev *pdev = to_pci_dev(dev);
74
75 return calc_devid(pdev->bus->number, pdev->devfn);
76}
77
78static struct iommu_dev_data *get_dev_data(struct device *dev)
79{
80 return dev->archdata.iommu;
81}
82
83/*
84 * In this function the list of preallocated protection domains is traversed to
85 * find the domain for a specific device
86 */
87static struct dma_ops_domain *find_protection_domain(u16 devid)
88{
89 struct dma_ops_domain *entry, *ret = NULL;
90 unsigned long flags;
91 u16 alias = amd_iommu_alias_table[devid];
92
93 if (list_empty(&iommu_pd_list))
94 return NULL;
95
96 spin_lock_irqsave(&iommu_pd_list_lock, flags);
97
98 list_for_each_entry(entry, &iommu_pd_list, list) {
99 if (entry->target_dev == devid ||
100 entry->target_dev == alias) {
101 ret = entry;
102 break;
103 }
104 }
105
106 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
107
108 return ret;
109}
110
111/*
112 * This function checks if the driver got a valid device from the caller to
113 * avoid dereferencing invalid pointers.
114 */
115static bool check_device(struct device *dev)
116{
117 u16 devid;
118
119 if (!dev || !dev->dma_mask)
120 return false;
121
122 /* No device or no PCI device */
123 if (dev->bus != &pci_bus_type)
124 return false;
125
126 devid = get_device_id(dev);
127
128 /* Out of our scope? */
129 if (devid > amd_iommu_last_bdf)
130 return false;
131
132 if (amd_iommu_rlookup_table[devid] == NULL)
133 return false;
134
135 return true;
136}
137
138static int iommu_init_device(struct device *dev)
139{
140 struct iommu_dev_data *dev_data;
141 struct pci_dev *pdev;
142 u16 devid, alias;
143
144 if (dev->archdata.iommu)
145 return 0;
146
147 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
148 if (!dev_data)
149 return -ENOMEM;
150
151 dev_data->dev = dev;
152
153 devid = get_device_id(dev);
154 alias = amd_iommu_alias_table[devid];
155 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
156 if (pdev)
157 dev_data->alias = &pdev->dev;
158 else {
159 kfree(dev_data);
160 return -ENOTSUPP;
161 }
162
163 atomic_set(&dev_data->bind, 0);
164
165 dev->archdata.iommu = dev_data;
166
167
168 return 0;
169}
170
171static void iommu_ignore_device(struct device *dev)
172{
173 u16 devid, alias;
174
175 devid = get_device_id(dev);
176 alias = amd_iommu_alias_table[devid];
177
178 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
179 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
180
181 amd_iommu_rlookup_table[devid] = NULL;
182 amd_iommu_rlookup_table[alias] = NULL;
183}
184
185static void iommu_uninit_device(struct device *dev)
186{
187 kfree(dev->archdata.iommu);
188}
189
190void __init amd_iommu_uninit_devices(void)
191{
192 struct pci_dev *pdev = NULL;
193
194 for_each_pci_dev(pdev) {
195
196 if (!check_device(&pdev->dev))
197 continue;
198
199 iommu_uninit_device(&pdev->dev);
200 }
201}
202
203int __init amd_iommu_init_devices(void)
204{
205 struct pci_dev *pdev = NULL;
206 int ret = 0;
207
208 for_each_pci_dev(pdev) {
209
210 if (!check_device(&pdev->dev))
211 continue;
212
213 ret = iommu_init_device(&pdev->dev);
214 if (ret == -ENOTSUPP)
215 iommu_ignore_device(&pdev->dev);
216 else if (ret)
217 goto out_free;
218 }
219
220 return 0;
221
222out_free:
223
224 amd_iommu_uninit_devices();
225
226 return ret;
227}
228#ifdef CONFIG_AMD_IOMMU_STATS
229
230/*
231 * Initialization code for statistics collection
232 */
233
234DECLARE_STATS_COUNTER(compl_wait);
235DECLARE_STATS_COUNTER(cnt_map_single);
236DECLARE_STATS_COUNTER(cnt_unmap_single);
237DECLARE_STATS_COUNTER(cnt_map_sg);
238DECLARE_STATS_COUNTER(cnt_unmap_sg);
239DECLARE_STATS_COUNTER(cnt_alloc_coherent);
240DECLARE_STATS_COUNTER(cnt_free_coherent);
241DECLARE_STATS_COUNTER(cross_page);
242DECLARE_STATS_COUNTER(domain_flush_single);
243DECLARE_STATS_COUNTER(domain_flush_all);
244DECLARE_STATS_COUNTER(alloced_io_mem);
245DECLARE_STATS_COUNTER(total_map_requests);
246
247static struct dentry *stats_dir;
248static struct dentry *de_fflush;
249
250static void amd_iommu_stats_add(struct __iommu_counter *cnt)
251{
252 if (stats_dir == NULL)
253 return;
254
255 cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
256 &cnt->value);
257}
258
259static void amd_iommu_stats_init(void)
260{
261 stats_dir = debugfs_create_dir("amd-iommu", NULL);
262 if (stats_dir == NULL)
263 return;
264
265 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
266 (u32 *)&amd_iommu_unmap_flush);
267
268 amd_iommu_stats_add(&compl_wait);
269 amd_iommu_stats_add(&cnt_map_single);
270 amd_iommu_stats_add(&cnt_unmap_single);
271 amd_iommu_stats_add(&cnt_map_sg);
272 amd_iommu_stats_add(&cnt_unmap_sg);
273 amd_iommu_stats_add(&cnt_alloc_coherent);
274 amd_iommu_stats_add(&cnt_free_coherent);
275 amd_iommu_stats_add(&cross_page);
276 amd_iommu_stats_add(&domain_flush_single);
277 amd_iommu_stats_add(&domain_flush_all);
278 amd_iommu_stats_add(&alloced_io_mem);
279 amd_iommu_stats_add(&total_map_requests);
280}
281
282#endif
283
284/****************************************************************************
285 *
286 * Interrupt handling functions
287 *
288 ****************************************************************************/
289
290static void dump_dte_entry(u16 devid)
291{
292 int i;
293
294 for (i = 0; i < 8; ++i)
295 pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
296 amd_iommu_dev_table[devid].data[i]);
297}
298
299static void dump_command(unsigned long phys_addr)
300{
301 struct iommu_cmd *cmd = phys_to_virt(phys_addr);
302 int i;
303
304 for (i = 0; i < 4; ++i)
305 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
306}
307
308static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
309{
310 u32 *event = __evt;
311 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
312 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
313 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
314 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
315 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
316
317 printk(KERN_ERR "AMD-Vi: Event logged [");
318
319 switch (type) {
320 case EVENT_TYPE_ILL_DEV:
321 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
322 "address=0x%016llx flags=0x%04x]\n",
323 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
324 address, flags);
325 dump_dte_entry(devid);
326 break;
327 case EVENT_TYPE_IO_FAULT:
328 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
329 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
330 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
331 domid, address, flags);
332 break;
333 case EVENT_TYPE_DEV_TAB_ERR:
334 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
335 "address=0x%016llx flags=0x%04x]\n",
336 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
337 address, flags);
338 break;
339 case EVENT_TYPE_PAGE_TAB_ERR:
340 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
341 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
342 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
343 domid, address, flags);
344 break;
345 case EVENT_TYPE_ILL_CMD:
346 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
347 dump_command(address);
348 break;
349 case EVENT_TYPE_CMD_HARD_ERR:
350 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
351 "flags=0x%04x]\n", address, flags);
352 break;
353 case EVENT_TYPE_IOTLB_INV_TO:
354 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
355 "address=0x%016llx]\n",
356 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
357 address);
358 break;
359 case EVENT_TYPE_INV_DEV_REQ:
360 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
361 "address=0x%016llx flags=0x%04x]\n",
362 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
363 address, flags);
364 break;
365 default:
366 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
367 }
368}
369
370static void iommu_poll_events(struct amd_iommu *iommu)
371{
372 u32 head, tail;
373 unsigned long flags;
374
375 spin_lock_irqsave(&iommu->lock, flags);
376
377 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
378 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
379
380 while (head != tail) {
381 iommu_print_event(iommu, iommu->evt_buf + head);
382 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
383 }
384
385 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
386
387 spin_unlock_irqrestore(&iommu->lock, flags);
388}
389
390irqreturn_t amd_iommu_int_thread(int irq, void *data)
391{
392 struct amd_iommu *iommu;
393
394 for_each_iommu(iommu)
395 iommu_poll_events(iommu);
396
397 return IRQ_HANDLED;
398}
399
400irqreturn_t amd_iommu_int_handler(int irq, void *data)
401{
402 return IRQ_WAKE_THREAD;
403}
404
405/****************************************************************************
406 *
407 * IOMMU command queuing functions
408 *
409 ****************************************************************************/
410
411static int wait_on_sem(volatile u64 *sem)
412{
413 int i = 0;
414
415 while (*sem == 0 && i < LOOP_TIMEOUT) {
416 udelay(1);
417 i += 1;
418 }
419
420 if (i == LOOP_TIMEOUT) {
421 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
422 return -EIO;
423 }
424
425 return 0;
426}
427
428static void copy_cmd_to_buffer(struct amd_iommu *iommu,
429 struct iommu_cmd *cmd,
430 u32 tail)
431{
432 u8 *target;
433
434 target = iommu->cmd_buf + tail;
435 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
436
437 /* Copy command to buffer */
438 memcpy(target, cmd, sizeof(*cmd));
439
440 /* Tell the IOMMU about it */
441 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
442}
443
444static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
445{
446 WARN_ON(address & 0x7ULL);
447
448 memset(cmd, 0, sizeof(*cmd));
449 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
450 cmd->data[1] = upper_32_bits(__pa(address));
451 cmd->data[2] = 1;
452 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
453}
454
455static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
456{
457 memset(cmd, 0, sizeof(*cmd));
458 cmd->data[0] = devid;
459 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
460}
461
462static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
463 size_t size, u16 domid, int pde)
464{
465 u64 pages;
466 int s;
467
468 pages = iommu_num_pages(address, size, PAGE_SIZE);
469 s = 0;
470
471 if (pages > 1) {
472 /*
473 * If we have to flush more than one page, flush all
474 * TLB entries for this domain
475 */
476 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
477 s = 1;
478 }
479
480 address &= PAGE_MASK;
481
482 memset(cmd, 0, sizeof(*cmd));
483 cmd->data[1] |= domid;
484 cmd->data[2] = lower_32_bits(address);
485 cmd->data[3] = upper_32_bits(address);
486 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
487 if (s) /* size bit - we flush more than one 4kb page */
488 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
489 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
490 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
491}
492
493static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
494 u64 address, size_t size)
495{
496 u64 pages;
497 int s;
498
499 pages = iommu_num_pages(address, size, PAGE_SIZE);
500 s = 0;
501
502 if (pages > 1) {
503 /*
504 * If we have to flush more than one page, flush all
505 * TLB entries for this domain
506 */
507 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
508 s = 1;
509 }
510
511 address &= PAGE_MASK;
512
513 memset(cmd, 0, sizeof(*cmd));
514 cmd->data[0] = devid;
515 cmd->data[0] |= (qdep & 0xff) << 24;
516 cmd->data[1] = devid;
517 cmd->data[2] = lower_32_bits(address);
518 cmd->data[3] = upper_32_bits(address);
519 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
520 if (s)
521 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
522}
523
524static void build_inv_all(struct iommu_cmd *cmd)
525{
526 memset(cmd, 0, sizeof(*cmd));
527 CMD_SET_TYPE(cmd, CMD_INV_ALL);
528}
529
530/*
531 * Writes the command to the IOMMUs command buffer and informs the
532 * hardware about the new command.
533 */
534static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
535{
536 u32 left, tail, head, next_tail;
537 unsigned long flags;
538
539 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
540
541again:
542 spin_lock_irqsave(&iommu->lock, flags);
543
544 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
545 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
546 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
547 left = (head - next_tail) % iommu->cmd_buf_size;
548
549 if (left <= 2) {
550 struct iommu_cmd sync_cmd;
551 volatile u64 sem = 0;
552 int ret;
553
554 build_completion_wait(&sync_cmd, (u64)&sem);
555 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
556
557 spin_unlock_irqrestore(&iommu->lock, flags);
558
559 if ((ret = wait_on_sem(&sem)) != 0)
560 return ret;
561
562 goto again;
563 }
564
565 copy_cmd_to_buffer(iommu, cmd, tail);
566
567 /* We need to sync now to make sure all commands are processed */
568 iommu->need_sync = true;
569
570 spin_unlock_irqrestore(&iommu->lock, flags);
571
572 return 0;
573}
574
575/*
576 * This function queues a completion wait command into the command
577 * buffer of an IOMMU
578 */
579static int iommu_completion_wait(struct amd_iommu *iommu)
580{
581 struct iommu_cmd cmd;
582 volatile u64 sem = 0;
583 int ret;
584
585 if (!iommu->need_sync)
586 return 0;
587
588 build_completion_wait(&cmd, (u64)&sem);
589
590 ret = iommu_queue_command(iommu, &cmd);
591 if (ret)
592 return ret;
593
594 return wait_on_sem(&sem);
595}
596
597static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
598{
599 struct iommu_cmd cmd;
600
601 build_inv_dte(&cmd, devid);
602
603 return iommu_queue_command(iommu, &cmd);
604}
605
606static void iommu_flush_dte_all(struct amd_iommu *iommu)
607{
608 u32 devid;
609
610 for (devid = 0; devid <= 0xffff; ++devid)
611 iommu_flush_dte(iommu, devid);
612
613 iommu_completion_wait(iommu);
614}
615
616/*
617 * This function uses heavy locking and may disable irqs for some time. But
618 * this is no issue because it is only called during resume.
619 */
620static void iommu_flush_tlb_all(struct amd_iommu *iommu)
621{
622 u32 dom_id;
623
624 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
625 struct iommu_cmd cmd;
626 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
627 dom_id, 1);
628 iommu_queue_command(iommu, &cmd);
629 }
630
631 iommu_completion_wait(iommu);
632}
633
634static void iommu_flush_all(struct amd_iommu *iommu)
635{
636 struct iommu_cmd cmd;
637
638 build_inv_all(&cmd);
639
640 iommu_queue_command(iommu, &cmd);
641 iommu_completion_wait(iommu);
642}
643
644void iommu_flush_all_caches(struct amd_iommu *iommu)
645{
646 if (iommu_feature(iommu, FEATURE_IA)) {
647 iommu_flush_all(iommu);
648 } else {
649 iommu_flush_dte_all(iommu);
650 iommu_flush_tlb_all(iommu);
651 }
652}
653
654/*
655 * Command send function for flushing on-device TLB
656 */
657static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
658{
659 struct pci_dev *pdev = to_pci_dev(dev);
660 struct amd_iommu *iommu;
661 struct iommu_cmd cmd;
662 u16 devid;
663 int qdep;
664
665 qdep = pci_ats_queue_depth(pdev);
666 devid = get_device_id(dev);
667 iommu = amd_iommu_rlookup_table[devid];
668
669 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
670
671 return iommu_queue_command(iommu, &cmd);
672}
673
674/*
675 * Command send function for invalidating a device table entry
676 */
677static int device_flush_dte(struct device *dev)
678{
679 struct amd_iommu *iommu;
680 struct pci_dev *pdev;
681 u16 devid;
682 int ret;
683
684 pdev = to_pci_dev(dev);
685 devid = get_device_id(dev);
686 iommu = amd_iommu_rlookup_table[devid];
687
688 ret = iommu_flush_dte(iommu, devid);
689 if (ret)
690 return ret;
691
692 if (pci_ats_enabled(pdev))
693 ret = device_flush_iotlb(dev, 0, ~0UL);
694
695 return ret;
696}
697
698/*
699 * TLB invalidation function which is called from the mapping functions.
700 * It invalidates a single PTE if the range to flush is within a single
701 * page. Otherwise it flushes the whole TLB of the IOMMU.
702 */
703static void __domain_flush_pages(struct protection_domain *domain,
704 u64 address, size_t size, int pde)
705{
706 struct iommu_dev_data *dev_data;
707 struct iommu_cmd cmd;
708 int ret = 0, i;
709
710 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
711
712 for (i = 0; i < amd_iommus_present; ++i) {
713 if (!domain->dev_iommu[i])
714 continue;
715
716 /*
717 * Devices of this domain are behind this IOMMU
718 * We need a TLB flush
719 */
720 ret |= iommu_queue_command(amd_iommus[i], &cmd);
721 }
722
723 list_for_each_entry(dev_data, &domain->dev_list, list) {
724 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
725
726 if (!pci_ats_enabled(pdev))
727 continue;
728
729 ret |= device_flush_iotlb(dev_data->dev, address, size);
730 }
731
732 WARN_ON(ret);
733}
734
735static void domain_flush_pages(struct protection_domain *domain,
736 u64 address, size_t size)
737{
738 __domain_flush_pages(domain, address, size, 0);
739}
740
741/* Flush the whole IO/TLB for a given protection domain */
742static void domain_flush_tlb(struct protection_domain *domain)
743{
744 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
745}
746
747/* Flush the whole IO/TLB for a given protection domain - including PDE */
748static void domain_flush_tlb_pde(struct protection_domain *domain)
749{
750 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
751}
752
753static void domain_flush_complete(struct protection_domain *domain)
754{
755 int i;
756
757 for (i = 0; i < amd_iommus_present; ++i) {
758 if (!domain->dev_iommu[i])
759 continue;
760
761 /*
762 * Devices of this domain are behind this IOMMU
763 * We need to wait for completion of all commands.
764 */
765 iommu_completion_wait(amd_iommus[i]);
766 }
767}
768
769
770/*
771 * This function flushes the DTEs for all devices in domain
772 */
773static void domain_flush_devices(struct protection_domain *domain)
774{
775 struct iommu_dev_data *dev_data;
776 unsigned long flags;
777
778 spin_lock_irqsave(&domain->lock, flags);
779
780 list_for_each_entry(dev_data, &domain->dev_list, list)
781 device_flush_dte(dev_data->dev);
782
783 spin_unlock_irqrestore(&domain->lock, flags);
784}
785
786/****************************************************************************
787 *
788 * The functions below are used the create the page table mappings for
789 * unity mapped regions.
790 *
791 ****************************************************************************/
792
793/*
794 * This function is used to add another level to an IO page table. Adding
795 * another level increases the size of the address space by 9 bits to a size up
796 * to 64 bits.
797 */
798static bool increase_address_space(struct protection_domain *domain,
799 gfp_t gfp)
800{
801 u64 *pte;
802
803 if (domain->mode == PAGE_MODE_6_LEVEL)
804 /* address space already 64 bit large */
805 return false;
806
807 pte = (void *)get_zeroed_page(gfp);
808 if (!pte)
809 return false;
810
811 *pte = PM_LEVEL_PDE(domain->mode,
812 virt_to_phys(domain->pt_root));
813 domain->pt_root = pte;
814 domain->mode += 1;
815 domain->updated = true;
816
817 return true;
818}
819
820static u64 *alloc_pte(struct protection_domain *domain,
821 unsigned long address,
822 unsigned long page_size,
823 u64 **pte_page,
824 gfp_t gfp)
825{
826 int level, end_lvl;
827 u64 *pte, *page;
828
829 BUG_ON(!is_power_of_2(page_size));
830
831 while (address > PM_LEVEL_SIZE(domain->mode))
832 increase_address_space(domain, gfp);
833
834 level = domain->mode - 1;
835 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
836 address = PAGE_SIZE_ALIGN(address, page_size);
837 end_lvl = PAGE_SIZE_LEVEL(page_size);
838
839 while (level > end_lvl) {
840 if (!IOMMU_PTE_PRESENT(*pte)) {
841 page = (u64 *)get_zeroed_page(gfp);
842 if (!page)
843 return NULL;
844 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
845 }
846
847 /* No level skipping support yet */
848 if (PM_PTE_LEVEL(*pte) != level)
849 return NULL;
850
851 level -= 1;
852
853 pte = IOMMU_PTE_PAGE(*pte);
854
855 if (pte_page && level == end_lvl)
856 *pte_page = pte;
857
858 pte = &pte[PM_LEVEL_INDEX(level, address)];
859 }
860
861 return pte;
862}
863
864/*
865 * This function checks if there is a PTE for a given dma address. If
866 * there is one, it returns the pointer to it.
867 */
868static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
869{
870 int level;
871 u64 *pte;
872
873 if (address > PM_LEVEL_SIZE(domain->mode))
874 return NULL;
875
876 level = domain->mode - 1;
877 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
878
879 while (level > 0) {
880
881 /* Not Present */
882 if (!IOMMU_PTE_PRESENT(*pte))
883 return NULL;
884
885 /* Large PTE */
886 if (PM_PTE_LEVEL(*pte) == 0x07) {
887 unsigned long pte_mask, __pte;
888
889 /*
890 * If we have a series of large PTEs, make
891 * sure to return a pointer to the first one.
892 */
893 pte_mask = PTE_PAGE_SIZE(*pte);
894 pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
895 __pte = ((unsigned long)pte) & pte_mask;
896
897 return (u64 *)__pte;
898 }
899
900 /* No level skipping support yet */
901 if (PM_PTE_LEVEL(*pte) != level)
902 return NULL;
903
904 level -= 1;
905
906 /* Walk to the next level */
907 pte = IOMMU_PTE_PAGE(*pte);
908 pte = &pte[PM_LEVEL_INDEX(level, address)];
909 }
910
911 return pte;
912}
913
914/*
915 * Generic mapping functions. It maps a physical address into a DMA
916 * address space. It allocates the page table pages if necessary.
917 * In the future it can be extended to a generic mapping function
918 * supporting all features of AMD IOMMU page tables like level skipping
919 * and full 64 bit address spaces.
920 */
921static int iommu_map_page(struct protection_domain *dom,
922 unsigned long bus_addr,
923 unsigned long phys_addr,
924 int prot,
925 unsigned long page_size)
926{
927 u64 __pte, *pte;
928 int i, count;
929
930 if (!(prot & IOMMU_PROT_MASK))
931 return -EINVAL;
932
933 bus_addr = PAGE_ALIGN(bus_addr);
934 phys_addr = PAGE_ALIGN(phys_addr);
935 count = PAGE_SIZE_PTE_COUNT(page_size);
936 pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
937
938 for (i = 0; i < count; ++i)
939 if (IOMMU_PTE_PRESENT(pte[i]))
940 return -EBUSY;
941
942 if (page_size > PAGE_SIZE) {
943 __pte = PAGE_SIZE_PTE(phys_addr, page_size);
944 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
945 } else
946 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
947
948 if (prot & IOMMU_PROT_IR)
949 __pte |= IOMMU_PTE_IR;
950 if (prot & IOMMU_PROT_IW)
951 __pte |= IOMMU_PTE_IW;
952
953 for (i = 0; i < count; ++i)
954 pte[i] = __pte;
955
956 update_domain(dom);
957
958 return 0;
959}
960
961static unsigned long iommu_unmap_page(struct protection_domain *dom,
962 unsigned long bus_addr,
963 unsigned long page_size)
964{
965 unsigned long long unmap_size, unmapped;
966 u64 *pte;
967
968 BUG_ON(!is_power_of_2(page_size));
969
970 unmapped = 0;
971
972 while (unmapped < page_size) {
973
974 pte = fetch_pte(dom, bus_addr);
975
976 if (!pte) {
977 /*
978 * No PTE for this address
979 * move forward in 4kb steps
980 */
981 unmap_size = PAGE_SIZE;
982 } else if (PM_PTE_LEVEL(*pte) == 0) {
983 /* 4kb PTE found for this address */
984 unmap_size = PAGE_SIZE;
985 *pte = 0ULL;
986 } else {
987 int count, i;
988
989 /* Large PTE found which maps this address */
990 unmap_size = PTE_PAGE_SIZE(*pte);
991 count = PAGE_SIZE_PTE_COUNT(unmap_size);
992 for (i = 0; i < count; i++)
993 pte[i] = 0ULL;
994 }
995
996 bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
997 unmapped += unmap_size;
998 }
999
1000 BUG_ON(!is_power_of_2(unmapped));
1001
1002 return unmapped;
1003}
1004
1005/*
1006 * This function checks if a specific unity mapping entry is needed for
1007 * this specific IOMMU.
1008 */
1009static int iommu_for_unity_map(struct amd_iommu *iommu,
1010 struct unity_map_entry *entry)
1011{
1012 u16 bdf, i;
1013
1014 for (i = entry->devid_start; i <= entry->devid_end; ++i) {
1015 bdf = amd_iommu_alias_table[i];
1016 if (amd_iommu_rlookup_table[bdf] == iommu)
1017 return 1;
1018 }
1019
1020 return 0;
1021}
1022
1023/*
1024 * This function actually applies the mapping to the page table of the
1025 * dma_ops domain.
1026 */
1027static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
1028 struct unity_map_entry *e)
1029{
1030 u64 addr;
1031 int ret;
1032
1033 for (addr = e->address_start; addr < e->address_end;
1034 addr += PAGE_SIZE) {
1035 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
1036 PAGE_SIZE);
1037 if (ret)
1038 return ret;
1039 /*
1040 * if unity mapping is in aperture range mark the page
1041 * as allocated in the aperture
1042 */
1043 if (addr < dma_dom->aperture_size)
1044 __set_bit(addr >> PAGE_SHIFT,
1045 dma_dom->aperture[0]->bitmap);
1046 }
1047
1048 return 0;
1049}
1050
1051/*
1052 * Init the unity mappings for a specific IOMMU in the system
1053 *
1054 * Basically iterates over all unity mapping entries and applies them to
1055 * the default domain DMA of that IOMMU if necessary.
1056 */
1057static int iommu_init_unity_mappings(struct amd_iommu *iommu)
1058{
1059 struct unity_map_entry *entry;
1060 int ret;
1061
1062 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
1063 if (!iommu_for_unity_map(iommu, entry))
1064 continue;
1065 ret = dma_ops_unity_map(iommu->default_dom, entry);
1066 if (ret)
1067 return ret;
1068 }
1069
1070 return 0;
1071}
1072
1073/*
1074 * Inits the unity mappings required for a specific device
1075 */
1076static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
1077 u16 devid)
1078{
1079 struct unity_map_entry *e;
1080 int ret;
1081
1082 list_for_each_entry(e, &amd_iommu_unity_map, list) {
1083 if (!(devid >= e->devid_start && devid <= e->devid_end))
1084 continue;
1085 ret = dma_ops_unity_map(dma_dom, e);
1086 if (ret)
1087 return ret;
1088 }
1089
1090 return 0;
1091}
1092
1093/****************************************************************************
1094 *
1095 * The next functions belong to the address allocator for the dma_ops
1096 * interface functions. They work like the allocators in the other IOMMU
1097 * drivers. Its basically a bitmap which marks the allocated pages in
1098 * the aperture. Maybe it could be enhanced in the future to a more
1099 * efficient allocator.
1100 *
1101 ****************************************************************************/
1102
1103/*
1104 * The address allocator core functions.
1105 *
1106 * called with domain->lock held
1107 */
1108
1109/*
1110 * Used to reserve address ranges in the aperture (e.g. for exclusion
1111 * ranges.
1112 */
1113static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1114 unsigned long start_page,
1115 unsigned int pages)
1116{
1117 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1118
1119 if (start_page + pages > last_page)
1120 pages = last_page - start_page;
1121
1122 for (i = start_page; i < start_page + pages; ++i) {
1123 int index = i / APERTURE_RANGE_PAGES;
1124 int page = i % APERTURE_RANGE_PAGES;
1125 __set_bit(page, dom->aperture[index]->bitmap);
1126 }
1127}
1128
1129/*
1130 * This function is used to add a new aperture range to an existing
1131 * aperture in case of dma_ops domain allocation or address allocation
1132 * failure.
1133 */
1134static int alloc_new_range(struct dma_ops_domain *dma_dom,
1135 bool populate, gfp_t gfp)
1136{
1137 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
1138 struct amd_iommu *iommu;
1139 unsigned long i;
1140
1141#ifdef CONFIG_IOMMU_STRESS
1142 populate = false;
1143#endif
1144
1145 if (index >= APERTURE_MAX_RANGES)
1146 return -ENOMEM;
1147
1148 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
1149 if (!dma_dom->aperture[index])
1150 return -ENOMEM;
1151
1152 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
1153 if (!dma_dom->aperture[index]->bitmap)
1154 goto out_free;
1155
1156 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
1157
1158 if (populate) {
1159 unsigned long address = dma_dom->aperture_size;
1160 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
1161 u64 *pte, *pte_page;
1162
1163 for (i = 0; i < num_ptes; ++i) {
1164 pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
1165 &pte_page, gfp);
1166 if (!pte)
1167 goto out_free;
1168
1169 dma_dom->aperture[index]->pte_pages[i] = pte_page;
1170
1171 address += APERTURE_RANGE_SIZE / 64;
1172 }
1173 }
1174
1175 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1176
1177 /* Initialize the exclusion range if necessary */
1178 for_each_iommu(iommu) {
1179 if (iommu->exclusion_start &&
1180 iommu->exclusion_start >= dma_dom->aperture[index]->offset
1181 && iommu->exclusion_start < dma_dom->aperture_size) {
1182 unsigned long startpage;
1183 int pages = iommu_num_pages(iommu->exclusion_start,
1184 iommu->exclusion_length,
1185 PAGE_SIZE);
1186 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1187 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1188 }
1189 }
1190
1191 /*
1192 * Check for areas already mapped as present in the new aperture
1193 * range and mark those pages as reserved in the allocator. Such
1194 * mappings may already exist as a result of requested unity
1195 * mappings for devices.
1196 */
1197 for (i = dma_dom->aperture[index]->offset;
1198 i < dma_dom->aperture_size;
1199 i += PAGE_SIZE) {
1200 u64 *pte = fetch_pte(&dma_dom->domain, i);
1201 if (!pte || !IOMMU_PTE_PRESENT(*pte))
1202 continue;
1203
1204 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
1205 }
1206
1207 update_domain(&dma_dom->domain);
1208
1209 return 0;
1210
1211out_free:
1212 update_domain(&dma_dom->domain);
1213
1214 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
1215
1216 kfree(dma_dom->aperture[index]);
1217 dma_dom->aperture[index] = NULL;
1218
1219 return -ENOMEM;
1220}
1221
1222static unsigned long dma_ops_area_alloc(struct device *dev,
1223 struct dma_ops_domain *dom,
1224 unsigned int pages,
1225 unsigned long align_mask,
1226 u64 dma_mask,
1227 unsigned long start)
1228{
1229 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
1230 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
1231 int i = start >> APERTURE_RANGE_SHIFT;
1232 unsigned long boundary_size;
1233 unsigned long address = -1;
1234 unsigned long limit;
1235
1236 next_bit >>= PAGE_SHIFT;
1237
1238 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
1239 PAGE_SIZE) >> PAGE_SHIFT;
1240
1241 for (;i < max_index; ++i) {
1242 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
1243
1244 if (dom->aperture[i]->offset >= dma_mask)
1245 break;
1246
1247 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
1248 dma_mask >> PAGE_SHIFT);
1249
1250 address = iommu_area_alloc(dom->aperture[i]->bitmap,
1251 limit, next_bit, pages, 0,
1252 boundary_size, align_mask);
1253 if (address != -1) {
1254 address = dom->aperture[i]->offset +
1255 (address << PAGE_SHIFT);
1256 dom->next_address = address + (pages << PAGE_SHIFT);
1257 break;
1258 }
1259
1260 next_bit = 0;
1261 }
1262
1263 return address;
1264}
1265
1266static unsigned long dma_ops_alloc_addresses(struct device *dev,
1267 struct dma_ops_domain *dom,
1268 unsigned int pages,
1269 unsigned long align_mask,
1270 u64 dma_mask)
1271{
1272 unsigned long address;
1273
1274#ifdef CONFIG_IOMMU_STRESS
1275 dom->next_address = 0;
1276 dom->need_flush = true;
1277#endif
1278
1279 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1280 dma_mask, dom->next_address);
1281
1282 if (address == -1) {
1283 dom->next_address = 0;
1284 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1285 dma_mask, 0);
1286 dom->need_flush = true;
1287 }
1288
1289 if (unlikely(address == -1))
1290 address = DMA_ERROR_CODE;
1291
1292 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
1293
1294 return address;
1295}
1296
1297/*
1298 * The address free function.
1299 *
1300 * called with domain->lock held
1301 */
1302static void dma_ops_free_addresses(struct dma_ops_domain *dom,
1303 unsigned long address,
1304 unsigned int pages)
1305{
1306 unsigned i = address >> APERTURE_RANGE_SHIFT;
1307 struct aperture_range *range = dom->aperture[i];
1308
1309 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
1310
1311#ifdef CONFIG_IOMMU_STRESS
1312 if (i < 4)
1313 return;
1314#endif
1315
1316 if (address >= dom->next_address)
1317 dom->need_flush = true;
1318
1319 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
1320
1321 bitmap_clear(range->bitmap, address, pages);
1322
1323}
1324
1325/****************************************************************************
1326 *
1327 * The next functions belong to the domain allocation. A domain is
1328 * allocated for every IOMMU as the default domain. If device isolation
1329 * is enabled, every device get its own domain. The most important thing
1330 * about domains is the page table mapping the DMA address space they
1331 * contain.
1332 *
1333 ****************************************************************************/
1334
1335/*
1336 * This function adds a protection domain to the global protection domain list
1337 */
1338static void add_domain_to_list(struct protection_domain *domain)
1339{
1340 unsigned long flags;
1341
1342 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1343 list_add(&domain->list, &amd_iommu_pd_list);
1344 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1345}
1346
1347/*
1348 * This function removes a protection domain to the global
1349 * protection domain list
1350 */
1351static void del_domain_from_list(struct protection_domain *domain)
1352{
1353 unsigned long flags;
1354
1355 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1356 list_del(&domain->list);
1357 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1358}
1359
1360static u16 domain_id_alloc(void)
1361{
1362 unsigned long flags;
1363 int id;
1364
1365 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1366 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1367 BUG_ON(id == 0);
1368 if (id > 0 && id < MAX_DOMAIN_ID)
1369 __set_bit(id, amd_iommu_pd_alloc_bitmap);
1370 else
1371 id = 0;
1372 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1373
1374 return id;
1375}
1376
1377static void domain_id_free(int id)
1378{
1379 unsigned long flags;
1380
1381 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1382 if (id > 0 && id < MAX_DOMAIN_ID)
1383 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
1384 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1385}
1386
1387static void free_pagetable(struct protection_domain *domain)
1388{
1389 int i, j;
1390 u64 *p1, *p2, *p3;
1391
1392 p1 = domain->pt_root;
1393
1394 if (!p1)
1395 return;
1396
1397 for (i = 0; i < 512; ++i) {
1398 if (!IOMMU_PTE_PRESENT(p1[i]))
1399 continue;
1400
1401 p2 = IOMMU_PTE_PAGE(p1[i]);
1402 for (j = 0; j < 512; ++j) {
1403 if (!IOMMU_PTE_PRESENT(p2[j]))
1404 continue;
1405 p3 = IOMMU_PTE_PAGE(p2[j]);
1406 free_page((unsigned long)p3);
1407 }
1408
1409 free_page((unsigned long)p2);
1410 }
1411
1412 free_page((unsigned long)p1);
1413
1414 domain->pt_root = NULL;
1415}
1416
1417/*
1418 * Free a domain, only used if something went wrong in the
1419 * allocation path and we need to free an already allocated page table
1420 */
1421static void dma_ops_domain_free(struct dma_ops_domain *dom)
1422{
1423 int i;
1424
1425 if (!dom)
1426 return;
1427
1428 del_domain_from_list(&dom->domain);
1429
1430 free_pagetable(&dom->domain);
1431
1432 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
1433 if (!dom->aperture[i])
1434 continue;
1435 free_page((unsigned long)dom->aperture[i]->bitmap);
1436 kfree(dom->aperture[i]);
1437 }
1438
1439 kfree(dom);
1440}
1441
1442/*
1443 * Allocates a new protection domain usable for the dma_ops functions.
1444 * It also initializes the page table and the address allocator data
1445 * structures required for the dma_ops interface
1446 */
1447static struct dma_ops_domain *dma_ops_domain_alloc(void)
1448{
1449 struct dma_ops_domain *dma_dom;
1450
1451 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
1452 if (!dma_dom)
1453 return NULL;
1454
1455 spin_lock_init(&dma_dom->domain.lock);
1456
1457 dma_dom->domain.id = domain_id_alloc();
1458 if (dma_dom->domain.id == 0)
1459 goto free_dma_dom;
1460 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1461 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1462 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1463 dma_dom->domain.flags = PD_DMA_OPS_MASK;
1464 dma_dom->domain.priv = dma_dom;
1465 if (!dma_dom->domain.pt_root)
1466 goto free_dma_dom;
1467
1468 dma_dom->need_flush = false;
1469 dma_dom->target_dev = 0xffff;
1470
1471 add_domain_to_list(&dma_dom->domain);
1472
1473 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1474 goto free_dma_dom;
1475
1476 /*
1477 * mark the first page as allocated so we never return 0 as
1478 * a valid dma-address. So we can use 0 as error value
1479 */
1480 dma_dom->aperture[0]->bitmap[0] = 1;
1481 dma_dom->next_address = 0;
1482
1483
1484 return dma_dom;
1485
1486free_dma_dom:
1487 dma_ops_domain_free(dma_dom);
1488
1489 return NULL;
1490}
1491
1492/*
1493 * little helper function to check whether a given protection domain is a
1494 * dma_ops domain
1495 */
1496static bool dma_ops_domain(struct protection_domain *domain)
1497{
1498 return domain->flags & PD_DMA_OPS_MASK;
1499}
1500
1501static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1502{
1503 u64 pte_root = virt_to_phys(domain->pt_root);
1504 u32 flags = 0;
1505
1506 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1507 << DEV_ENTRY_MODE_SHIFT;
1508 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1509
1510 if (ats)
1511 flags |= DTE_FLAG_IOTLB;
1512
1513 amd_iommu_dev_table[devid].data[3] |= flags;
1514 amd_iommu_dev_table[devid].data[2] = domain->id;
1515 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1516 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1517}
1518
1519static void clear_dte_entry(u16 devid)
1520{
1521 /* remove entry from the device table seen by the hardware */
1522 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1523 amd_iommu_dev_table[devid].data[1] = 0;
1524 amd_iommu_dev_table[devid].data[2] = 0;
1525
1526 amd_iommu_apply_erratum_63(devid);
1527}
1528
1529static void do_attach(struct device *dev, struct protection_domain *domain)
1530{
1531 struct iommu_dev_data *dev_data;
1532 struct amd_iommu *iommu;
1533 struct pci_dev *pdev;
1534 bool ats = false;
1535 u16 devid;
1536
1537 devid = get_device_id(dev);
1538 iommu = amd_iommu_rlookup_table[devid];
1539 dev_data = get_dev_data(dev);
1540 pdev = to_pci_dev(dev);
1541
1542 if (amd_iommu_iotlb_sup)
1543 ats = pci_ats_enabled(pdev);
1544
1545 /* Update data structures */
1546 dev_data->domain = domain;
1547 list_add(&dev_data->list, &domain->dev_list);
1548 set_dte_entry(devid, domain, ats);
1549
1550 /* Do reference counting */
1551 domain->dev_iommu[iommu->index] += 1;
1552 domain->dev_cnt += 1;
1553
1554 /* Flush the DTE entry */
1555 device_flush_dte(dev);
1556}
1557
1558static void do_detach(struct device *dev)
1559{
1560 struct iommu_dev_data *dev_data;
1561 struct amd_iommu *iommu;
1562 u16 devid;
1563
1564 devid = get_device_id(dev);
1565 iommu = amd_iommu_rlookup_table[devid];
1566 dev_data = get_dev_data(dev);
1567
1568 /* decrease reference counters */
1569 dev_data->domain->dev_iommu[iommu->index] -= 1;
1570 dev_data->domain->dev_cnt -= 1;
1571
1572 /* Update data structures */
1573 dev_data->domain = NULL;
1574 list_del(&dev_data->list);
1575 clear_dte_entry(devid);
1576
1577 /* Flush the DTE entry */
1578 device_flush_dte(dev);
1579}
1580
1581/*
1582 * If a device is not yet associated with a domain, this function does
1583 * assigns it visible for the hardware
1584 */
1585static int __attach_device(struct device *dev,
1586 struct protection_domain *domain)
1587{
1588 struct iommu_dev_data *dev_data, *alias_data;
1589 int ret;
1590
1591 dev_data = get_dev_data(dev);
1592 alias_data = get_dev_data(dev_data->alias);
1593
1594 if (!alias_data)
1595 return -EINVAL;
1596
1597 /* lock domain */
1598 spin_lock(&domain->lock);
1599
1600 /* Some sanity checks */
1601 ret = -EBUSY;
1602 if (alias_data->domain != NULL &&
1603 alias_data->domain != domain)
1604 goto out_unlock;
1605
1606 if (dev_data->domain != NULL &&
1607 dev_data->domain != domain)
1608 goto out_unlock;
1609
1610 /* Do real assignment */
1611 if (dev_data->alias != dev) {
1612 alias_data = get_dev_data(dev_data->alias);
1613 if (alias_data->domain == NULL)
1614 do_attach(dev_data->alias, domain);
1615
1616 atomic_inc(&alias_data->bind);
1617 }
1618
1619 if (dev_data->domain == NULL)
1620 do_attach(dev, domain);
1621
1622 atomic_inc(&dev_data->bind);
1623
1624 ret = 0;
1625
1626out_unlock:
1627
1628 /* ready */
1629 spin_unlock(&domain->lock);
1630
1631 return ret;
1632}
1633
1634/*
1635 * If a device is not yet associated with a domain, this function does
1636 * assigns it visible for the hardware
1637 */
1638static int attach_device(struct device *dev,
1639 struct protection_domain *domain)
1640{
1641 struct pci_dev *pdev = to_pci_dev(dev);
1642 unsigned long flags;
1643 int ret;
1644
1645 if (amd_iommu_iotlb_sup)
1646 pci_enable_ats(pdev, PAGE_SHIFT);
1647
1648 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1649 ret = __attach_device(dev, domain);
1650 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1651
1652 /*
1653 * We might boot into a crash-kernel here. The crashed kernel
1654 * left the caches in the IOMMU dirty. So we have to flush
1655 * here to evict all dirty stuff.
1656 */
1657 domain_flush_tlb_pde(domain);
1658
1659 return ret;
1660}
1661
1662/*
1663 * Removes a device from a protection domain (unlocked)
1664 */
1665static void __detach_device(struct device *dev)
1666{
1667 struct iommu_dev_data *dev_data = get_dev_data(dev);
1668 struct iommu_dev_data *alias_data;
1669 struct protection_domain *domain;
1670 unsigned long flags;
1671
1672 BUG_ON(!dev_data->domain);
1673
1674 domain = dev_data->domain;
1675
1676 spin_lock_irqsave(&domain->lock, flags);
1677
1678 if (dev_data->alias != dev) {
1679 alias_data = get_dev_data(dev_data->alias);
1680 if (atomic_dec_and_test(&alias_data->bind))
1681 do_detach(dev_data->alias);
1682 }
1683
1684 if (atomic_dec_and_test(&dev_data->bind))
1685 do_detach(dev);
1686
1687 spin_unlock_irqrestore(&domain->lock, flags);
1688
1689 /*
1690 * If we run in passthrough mode the device must be assigned to the
1691 * passthrough domain if it is detached from any other domain.
1692 * Make sure we can deassign from the pt_domain itself.
1693 */
1694 if (iommu_pass_through &&
1695 (dev_data->domain == NULL && domain != pt_domain))
1696 __attach_device(dev, pt_domain);
1697}
1698
1699/*
1700 * Removes a device from a protection domain (with devtable_lock held)
1701 */
1702static void detach_device(struct device *dev)
1703{
1704 struct pci_dev *pdev = to_pci_dev(dev);
1705 unsigned long flags;
1706
1707 /* lock device table */
1708 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1709 __detach_device(dev);
1710 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1711
1712 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1713 pci_disable_ats(pdev);
1714}
1715
1716/*
1717 * Find out the protection domain structure for a given PCI device. This
1718 * will give us the pointer to the page table root for example.
1719 */
1720static struct protection_domain *domain_for_device(struct device *dev)
1721{
1722 struct protection_domain *dom;
1723 struct iommu_dev_data *dev_data, *alias_data;
1724 unsigned long flags;
1725 u16 devid;
1726
1727 devid = get_device_id(dev);
1728 dev_data = get_dev_data(dev);
1729 alias_data = get_dev_data(dev_data->alias);
1730 if (!alias_data)
1731 return NULL;
1732
1733 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1734 dom = dev_data->domain;
1735 if (dom == NULL &&
1736 alias_data->domain != NULL) {
1737 __attach_device(dev, alias_data->domain);
1738 dom = alias_data->domain;
1739 }
1740
1741 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1742
1743 return dom;
1744}
1745
1746static int device_change_notifier(struct notifier_block *nb,
1747 unsigned long action, void *data)
1748{
1749 struct device *dev = data;
1750 u16 devid;
1751 struct protection_domain *domain;
1752 struct dma_ops_domain *dma_domain;
1753 struct amd_iommu *iommu;
1754 unsigned long flags;
1755
1756 if (!check_device(dev))
1757 return 0;
1758
1759 devid = get_device_id(dev);
1760 iommu = amd_iommu_rlookup_table[devid];
1761
1762 switch (action) {
1763 case BUS_NOTIFY_UNBOUND_DRIVER:
1764
1765 domain = domain_for_device(dev);
1766
1767 if (!domain)
1768 goto out;
1769 if (iommu_pass_through)
1770 break;
1771 detach_device(dev);
1772 break;
1773 case BUS_NOTIFY_ADD_DEVICE:
1774
1775 iommu_init_device(dev);
1776
1777 domain = domain_for_device(dev);
1778
1779 /* allocate a protection domain if a device is added */
1780 dma_domain = find_protection_domain(devid);
1781 if (dma_domain)
1782 goto out;
1783 dma_domain = dma_ops_domain_alloc();
1784 if (!dma_domain)
1785 goto out;
1786 dma_domain->target_dev = devid;
1787
1788 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1789 list_add_tail(&dma_domain->list, &iommu_pd_list);
1790 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1791
1792 break;
1793 case BUS_NOTIFY_DEL_DEVICE:
1794
1795 iommu_uninit_device(dev);
1796
1797 default:
1798 goto out;
1799 }
1800
1801 device_flush_dte(dev);
1802 iommu_completion_wait(iommu);
1803
1804out:
1805 return 0;
1806}
1807
1808static struct notifier_block device_nb = {
1809 .notifier_call = device_change_notifier,
1810};
1811
1812void amd_iommu_init_notifier(void)
1813{
1814 bus_register_notifier(&pci_bus_type, &device_nb);
1815}
1816
1817/*****************************************************************************
1818 *
1819 * The next functions belong to the dma_ops mapping/unmapping code.
1820 *
1821 *****************************************************************************/
1822
1823/*
1824 * In the dma_ops path we only have the struct device. This function
1825 * finds the corresponding IOMMU, the protection domain and the
1826 * requestor id for a given device.
1827 * If the device is not yet associated with a domain this is also done
1828 * in this function.
1829 */
1830static struct protection_domain *get_domain(struct device *dev)
1831{
1832 struct protection_domain *domain;
1833 struct dma_ops_domain *dma_dom;
1834 u16 devid = get_device_id(dev);
1835
1836 if (!check_device(dev))
1837 return ERR_PTR(-EINVAL);
1838
1839 domain = domain_for_device(dev);
1840 if (domain != NULL && !dma_ops_domain(domain))
1841 return ERR_PTR(-EBUSY);
1842
1843 if (domain != NULL)
1844 return domain;
1845
1846	/* Device not bound yet - bind it */
1847 dma_dom = find_protection_domain(devid);
1848 if (!dma_dom)
1849 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1850 attach_device(dev, &dma_dom->domain);
1851 DUMP_printk("Using protection domain %d for device %s\n",
1852 dma_dom->domain.id, dev_name(dev));
1853
1854 return &dma_dom->domain;
1855}
1856
1857static void update_device_table(struct protection_domain *domain)
1858{
1859 struct iommu_dev_data *dev_data;
1860
1861 list_for_each_entry(dev_data, &domain->dev_list, list) {
1862 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1863 u16 devid = get_device_id(dev_data->dev);
1864 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1865 }
1866}
1867
1868static void update_domain(struct protection_domain *domain)
1869{
1870 if (!domain->updated)
1871 return;
1872
1873 update_device_table(domain);
1874
1875 domain_flush_devices(domain);
1876 domain_flush_tlb_pde(domain);
1877
1878 domain->updated = false;
1879}
1880
1881/*
1882 * This function fetches the PTE for a given address in the aperture
1883 */
1884static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1885 unsigned long address)
1886{
1887 struct aperture_range *aperture;
1888 u64 *pte, *pte_page;
1889
1890 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1891 if (!aperture)
1892 return NULL;
1893
1894 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1895 if (!pte) {
1896 pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
1897 GFP_ATOMIC);
1898 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1899 } else
1900 pte += PM_LEVEL_INDEX(0, address);
1901
1902 update_domain(&dom->domain);
1903
1904 return pte;
1905}
1906
1907/*
1908 * This is the generic map function. It maps one 4kb page at paddr to
1909 * the given address in the DMA address space for the domain.
1910 */
1911static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1912 unsigned long address,
1913 phys_addr_t paddr,
1914 int direction)
1915{
1916 u64 *pte, __pte;
1917
1918 WARN_ON(address > dom->aperture_size);
1919
1920 paddr &= PAGE_MASK;
1921
1922 pte = dma_ops_get_pte(dom, address);
1923 if (!pte)
1924 return DMA_ERROR_CODE;
1925
1926 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1927
1928 if (direction == DMA_TO_DEVICE)
1929 __pte |= IOMMU_PTE_IR;
1930 else if (direction == DMA_FROM_DEVICE)
1931 __pte |= IOMMU_PTE_IW;
1932 else if (direction == DMA_BIDIRECTIONAL)
1933 __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
1934
1935 WARN_ON(*pte);
1936
1937 *pte = __pte;
1938
1939 return (dma_addr_t)address;
1940}
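
A minimal sketch of the PTE value the function above writes, shown for the
DMA_BIDIRECTIONAL case; the helper name is illustrative and not part of the
original file, and it assumes the IOMMU_PTE_* masks from the driver headers:

	/* Sketch only: same PTE composition as dma_ops_domain_map() above,
	 * for a bidirectional mapping of the page containing paddr. */
	static u64 example_bidir_pte(phys_addr_t paddr)
	{
		return (paddr & PAGE_MASK) | IOMMU_PTE_P | IOMMU_PTE_FC |
		       IOMMU_PTE_IR | IOMMU_PTE_IW;
	}
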
1941
1942/*
1943 * The generic unmapping function for one page in the DMA address space.
1944 */
1945static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1946 unsigned long address)
1947{
1948 struct aperture_range *aperture;
1949 u64 *pte;
1950
1951 if (address >= dom->aperture_size)
1952 return;
1953
1954 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1955 if (!aperture)
1956 return;
1957
1958 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1959 if (!pte)
1960 return;
1961
1962 pte += PM_LEVEL_INDEX(0, address);
1963
1964 WARN_ON(!*pte);
1965
1966 *pte = 0ULL;
1967}
1968
1969/*
1970 * This function contains common code for mapping of a physically
1971 * contiguous memory region into DMA address space. It is used by all
1972 * mapping functions provided with this IOMMU driver.
1973 * Must be called with the domain lock held.
1974 */
1975static dma_addr_t __map_single(struct device *dev,
1976 struct dma_ops_domain *dma_dom,
1977 phys_addr_t paddr,
1978 size_t size,
1979 int dir,
1980 bool align,
1981 u64 dma_mask)
1982{
1983 dma_addr_t offset = paddr & ~PAGE_MASK;
1984 dma_addr_t address, start, ret;
1985 unsigned int pages;
1986 unsigned long align_mask = 0;
1987 int i;
1988
1989 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
1990 paddr &= PAGE_MASK;
1991
1992 INC_STATS_COUNTER(total_map_requests);
1993
1994 if (pages > 1)
1995 INC_STATS_COUNTER(cross_page);
1996
1997 if (align)
1998 align_mask = (1UL << get_order(size)) - 1;
1999
2000retry:
2001 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
2002 dma_mask);
2003 if (unlikely(address == DMA_ERROR_CODE)) {
2004 /*
2005 * setting next_address here will let the address
2006		 * allocator only scan the newly allocated range in the
2007 * first run. This is a small optimization.
2008 */
2009 dma_dom->next_address = dma_dom->aperture_size;
2010
2011 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
2012 goto out;
2013
2014 /*
2015 * aperture was successfully enlarged by 128 MB, try
2016 * allocation again
2017 */
2018 goto retry;
2019 }
2020
2021 start = address;
2022 for (i = 0; i < pages; ++i) {
2023 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
2024 if (ret == DMA_ERROR_CODE)
2025 goto out_unmap;
2026
2027 paddr += PAGE_SIZE;
2028 start += PAGE_SIZE;
2029 }
2030 address += offset;
2031
2032 ADD_STATS_COUNTER(alloced_io_mem, size);
2033
2034 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
2035 domain_flush_tlb(&dma_dom->domain);
2036 dma_dom->need_flush = false;
2037 } else if (unlikely(amd_iommu_np_cache))
2038 domain_flush_pages(&dma_dom->domain, address, size);
2039
2040out:
2041 return address;
2042
2043out_unmap:
2044
2045 for (--i; i >= 0; --i) {
2046 start -= PAGE_SIZE;
2047 dma_ops_domain_unmap(dma_dom, start);
2048 }
2049
2050 dma_ops_free_addresses(dma_dom, address, pages);
2051
2052 return DMA_ERROR_CODE;
2053}
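
	/*
	 * Worked example of the offset/page arithmetic above (numbers are
	 * illustrative): for paddr 0x12345678 and size 0x1800, offset is
	 * 0x678 and pages = iommu_num_pages(...) = 2 (0x678 + 0x1800
	 * rounded up to whole 4k pages); the returned dma_addr is the
	 * aperture address handed out by dma_ops_alloc_addresses() plus
	 * the 0x678 offset.
	 */
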
2054
2055/*
2056 * Does the reverse of the __map_single function. Must be called with
2057 * the domain lock held too
2058 */
2059static void __unmap_single(struct dma_ops_domain *dma_dom,
2060 dma_addr_t dma_addr,
2061 size_t size,
2062 int dir)
2063{
2064 dma_addr_t flush_addr;
2065 dma_addr_t i, start;
2066 unsigned int pages;
2067
2068 if ((dma_addr == DMA_ERROR_CODE) ||
2069 (dma_addr + size > dma_dom->aperture_size))
2070 return;
2071
2072 flush_addr = dma_addr;
2073 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
2074 dma_addr &= PAGE_MASK;
2075 start = dma_addr;
2076
2077 for (i = 0; i < pages; ++i) {
2078 dma_ops_domain_unmap(dma_dom, start);
2079 start += PAGE_SIZE;
2080 }
2081
2082 SUB_STATS_COUNTER(alloced_io_mem, size);
2083
2084 dma_ops_free_addresses(dma_dom, dma_addr, pages);
2085
2086 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
2087 domain_flush_pages(&dma_dom->domain, flush_addr, size);
2088 dma_dom->need_flush = false;
2089 }
2090}
2091
2092/*
2093 * The exported map_single function for dma_ops.
2094 */
2095static dma_addr_t map_page(struct device *dev, struct page *page,
2096 unsigned long offset, size_t size,
2097 enum dma_data_direction dir,
2098 struct dma_attrs *attrs)
2099{
2100 unsigned long flags;
2101 struct protection_domain *domain;
2102 dma_addr_t addr;
2103 u64 dma_mask;
2104 phys_addr_t paddr = page_to_phys(page) + offset;
2105
2106 INC_STATS_COUNTER(cnt_map_single);
2107
2108 domain = get_domain(dev);
2109 if (PTR_ERR(domain) == -EINVAL)
2110 return (dma_addr_t)paddr;
2111 else if (IS_ERR(domain))
2112 return DMA_ERROR_CODE;
2113
2114 dma_mask = *dev->dma_mask;
2115
2116 spin_lock_irqsave(&domain->lock, flags);
2117
2118 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
2119 dma_mask);
2120 if (addr == DMA_ERROR_CODE)
2121 goto out;
2122
2123 domain_flush_complete(domain);
2124
2125out:
2126 spin_unlock_irqrestore(&domain->lock, flags);
2127
2128 return addr;
2129}
2130
2131/*
2132 * The exported unmap_single function for dma_ops.
2133 */
2134static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2135 enum dma_data_direction dir, struct dma_attrs *attrs)
2136{
2137 unsigned long flags;
2138 struct protection_domain *domain;
2139
2140 INC_STATS_COUNTER(cnt_unmap_single);
2141
2142 domain = get_domain(dev);
2143 if (IS_ERR(domain))
2144 return;
2145
2146 spin_lock_irqsave(&domain->lock, flags);
2147
2148 __unmap_single(domain->priv, dma_addr, size, dir);
2149
2150 domain_flush_complete(domain);
2151
2152 spin_unlock_irqrestore(&domain->lock, flags);
2153}
2154
2155/*
2156 * This is a special map_sg function which is used if we have to map a
2157 * device which is not handled by an AMD IOMMU in the system.
2158 */
2159static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
2160 int nelems, int dir)
2161{
2162 struct scatterlist *s;
2163 int i;
2164
2165 for_each_sg(sglist, s, nelems, i) {
2166 s->dma_address = (dma_addr_t)sg_phys(s);
2167 s->dma_length = s->length;
2168 }
2169
2170 return nelems;
2171}
2172
2173/*
2174 * The exported map_sg function for dma_ops (handles scatter-gather
2175 * lists).
2176 */
2177static int map_sg(struct device *dev, struct scatterlist *sglist,
2178 int nelems, enum dma_data_direction dir,
2179 struct dma_attrs *attrs)
2180{
2181 unsigned long flags;
2182 struct protection_domain *domain;
2183 int i;
2184 struct scatterlist *s;
2185 phys_addr_t paddr;
2186 int mapped_elems = 0;
2187 u64 dma_mask;
2188
2189 INC_STATS_COUNTER(cnt_map_sg);
2190
2191 domain = get_domain(dev);
2192 if (PTR_ERR(domain) == -EINVAL)
2193 return map_sg_no_iommu(dev, sglist, nelems, dir);
2194 else if (IS_ERR(domain))
2195 return 0;
2196
2197 dma_mask = *dev->dma_mask;
2198
2199 spin_lock_irqsave(&domain->lock, flags);
2200
2201 for_each_sg(sglist, s, nelems, i) {
2202 paddr = sg_phys(s);
2203
2204 s->dma_address = __map_single(dev, domain->priv,
2205 paddr, s->length, dir, false,
2206 dma_mask);
2207
2208 if (s->dma_address) {
2209 s->dma_length = s->length;
2210 mapped_elems++;
2211 } else
2212 goto unmap;
2213 }
2214
2215 domain_flush_complete(domain);
2216
2217out:
2218 spin_unlock_irqrestore(&domain->lock, flags);
2219
2220 return mapped_elems;
2221unmap:
2222 for_each_sg(sglist, s, mapped_elems, i) {
2223 if (s->dma_address)
2224 __unmap_single(domain->priv, s->dma_address,
2225 s->dma_length, dir);
2226 s->dma_address = s->dma_length = 0;
2227 }
2228
2229 mapped_elems = 0;
2230
2231 goto out;
2232}
2233
2234/*
2235 * The exported unmap_sg function for dma_ops (handles scatter-gather
2236 * lists).
2237 */
2238static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2239 int nelems, enum dma_data_direction dir,
2240 struct dma_attrs *attrs)
2241{
2242 unsigned long flags;
2243 struct protection_domain *domain;
2244 struct scatterlist *s;
2245 int i;
2246
2247 INC_STATS_COUNTER(cnt_unmap_sg);
2248
2249 domain = get_domain(dev);
2250 if (IS_ERR(domain))
2251 return;
2252
2253 spin_lock_irqsave(&domain->lock, flags);
2254
2255 for_each_sg(sglist, s, nelems, i) {
2256 __unmap_single(domain->priv, s->dma_address,
2257 s->dma_length, dir);
2258 s->dma_address = s->dma_length = 0;
2259 }
2260
2261 domain_flush_complete(domain);
2262
2263 spin_unlock_irqrestore(&domain->lock, flags);
2264}
2265
2266/*
2267 * The exported alloc_coherent function for dma_ops.
2268 */
2269static void *alloc_coherent(struct device *dev, size_t size,
2270 dma_addr_t *dma_addr, gfp_t flag)
2271{
2272 unsigned long flags;
2273 void *virt_addr;
2274 struct protection_domain *domain;
2275 phys_addr_t paddr;
2276 u64 dma_mask = dev->coherent_dma_mask;
2277
2278 INC_STATS_COUNTER(cnt_alloc_coherent);
2279
2280 domain = get_domain(dev);
2281 if (PTR_ERR(domain) == -EINVAL) {
2282 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2283 *dma_addr = __pa(virt_addr);
2284 return virt_addr;
2285 } else if (IS_ERR(domain))
2286 return NULL;
2287
2288 dma_mask = dev->coherent_dma_mask;
2289 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2290 flag |= __GFP_ZERO;
2291
2292 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2293 if (!virt_addr)
2294 return NULL;
2295
2296 paddr = virt_to_phys(virt_addr);
2297
2298 if (!dma_mask)
2299 dma_mask = *dev->dma_mask;
2300
2301 spin_lock_irqsave(&domain->lock, flags);
2302
2303 *dma_addr = __map_single(dev, domain->priv, paddr,
2304 size, DMA_BIDIRECTIONAL, true, dma_mask);
2305
2306 if (*dma_addr == DMA_ERROR_CODE) {
2307 spin_unlock_irqrestore(&domain->lock, flags);
2308 goto out_free;
2309 }
2310
2311 domain_flush_complete(domain);
2312
2313 spin_unlock_irqrestore(&domain->lock, flags);
2314
2315 return virt_addr;
2316
2317out_free:
2318
2319 free_pages((unsigned long)virt_addr, get_order(size));
2320
2321 return NULL;
2322}
2323
2324/*
2325 * The exported free_coherent function for dma_ops.
2326 */
2327static void free_coherent(struct device *dev, size_t size,
2328 void *virt_addr, dma_addr_t dma_addr)
2329{
2330 unsigned long flags;
2331 struct protection_domain *domain;
2332
2333 INC_STATS_COUNTER(cnt_free_coherent);
2334
2335 domain = get_domain(dev);
2336 if (IS_ERR(domain))
2337 goto free_mem;
2338
2339 spin_lock_irqsave(&domain->lock, flags);
2340
2341 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2342
2343 domain_flush_complete(domain);
2344
2345 spin_unlock_irqrestore(&domain->lock, flags);
2346
2347free_mem:
2348 free_pages((unsigned long)virt_addr, get_order(size));
2349}
2350
2351/*
2352 * This function is called by the DMA layer to find out if we can handle a
2353 * particular device. It is part of the dma_ops.
2354 */
2355static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2356{
2357 return check_device(dev);
2358}
2359
2360/*
2361 * The function for pre-allocating protection domains.
2362 *
2363 * Once the driver core informs the DMA layer when a driver grabs a
2364 * device, we won't need to preallocate the protection domains anymore.
2365 * For now we have to.
2366 */
2367static void prealloc_protection_domains(void)
2368{
2369 struct pci_dev *dev = NULL;
2370 struct dma_ops_domain *dma_dom;
2371 u16 devid;
2372
2373 for_each_pci_dev(dev) {
2374
2375 /* Do we handle this device? */
2376 if (!check_device(&dev->dev))
2377 continue;
2378
2379 /* Is there already any domain for it? */
2380 if (domain_for_device(&dev->dev))
2381 continue;
2382
2383 devid = get_device_id(&dev->dev);
2384
2385 dma_dom = dma_ops_domain_alloc();
2386 if (!dma_dom)
2387 continue;
2388 init_unity_mappings_for_device(dma_dom, devid);
2389 dma_dom->target_dev = devid;
2390
2391 attach_device(&dev->dev, &dma_dom->domain);
2392
2393 list_add_tail(&dma_dom->list, &iommu_pd_list);
2394 }
2395}
2396
2397static struct dma_map_ops amd_iommu_dma_ops = {
2398 .alloc_coherent = alloc_coherent,
2399 .free_coherent = free_coherent,
2400 .map_page = map_page,
2401 .unmap_page = unmap_page,
2402 .map_sg = map_sg,
2403 .unmap_sg = unmap_sg,
2404 .dma_supported = amd_iommu_dma_supported,
2405};
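
As a rough sketch (the helper below is illustrative, not part of this file):
once device_dma_ops_init() has pointed dev->archdata.dma_ops at the structure
above, an ordinary streaming mapping made by a driver is dispatched through
the per-device dma_ops and ends up in map_page():

	/* Illustrative only: dma_map_page() resolves the per-device dma_ops
	 * installed above and calls ->map_page(), i.e. map_page(). */
	static dma_addr_t example_stream_map(struct device *dev,
					     struct page *page)
	{
		return dma_map_page(dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
	}
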
2406
2407static unsigned device_dma_ops_init(void)
2408{
2409 struct pci_dev *pdev = NULL;
2410 unsigned unhandled = 0;
2411
2412 for_each_pci_dev(pdev) {
2413 if (!check_device(&pdev->dev)) {
2414 unhandled += 1;
2415 continue;
2416 }
2417
2418 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
2419 }
2420
2421 return unhandled;
2422}
2423
2424/*
2425 * The function which hooks the AMD IOMMU driver into dma_ops.
2426 */
2427
2428void __init amd_iommu_init_api(void)
2429{
2430 register_iommu(&amd_iommu_ops);
2431}
2432
2433int __init amd_iommu_init_dma_ops(void)
2434{
2435 struct amd_iommu *iommu;
2436 int ret, unhandled;
2437
2438 /*
2439 * first allocate a default protection domain for every IOMMU we
2440 * found in the system. Devices not assigned to any other
2441 * protection domain will be assigned to the default one.
2442 */
2443 for_each_iommu(iommu) {
2444 iommu->default_dom = dma_ops_domain_alloc();
2445 if (iommu->default_dom == NULL)
2446 return -ENOMEM;
2447 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
2448 ret = iommu_init_unity_mappings(iommu);
2449 if (ret)
2450 goto free_domains;
2451 }
2452
2453 /*
2454 * Pre-allocate the protection domains for each device.
2455 */
2456 prealloc_protection_domains();
2457
2458 iommu_detected = 1;
2459 swiotlb = 0;
2460
2461	/* Make our dma_ops finally visible to drivers */
2462 unhandled = device_dma_ops_init();
2463 if (unhandled && max_pfn > MAX_DMA32_PFN) {
2464 /* There are unhandled devices - initialize swiotlb for them */
2465 swiotlb = 1;
2466 }
2467
2468 amd_iommu_stats_init();
2469
2470 return 0;
2471
2472free_domains:
2473
2474 for_each_iommu(iommu) {
2475 if (iommu->default_dom)
2476 dma_ops_domain_free(iommu->default_dom);
2477 }
2478
2479 return ret;
2480}
2481
2482/*****************************************************************************
2483 *
2484 * The following functions belong to the exported interface of AMD IOMMU
2485 *
2486 * This interface allows access to lower level functions of the IOMMU
2487 * like protection domain handling and assignment of devices to domains
2488 * which is not possible with the dma_ops interface.
2489 *
2490 *****************************************************************************/
2491
2492static void cleanup_domain(struct protection_domain *domain)
2493{
2494 struct iommu_dev_data *dev_data, *next;
2495 unsigned long flags;
2496
2497 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2498
2499 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2500 struct device *dev = dev_data->dev;
2501
2502 __detach_device(dev);
2503 atomic_set(&dev_data->bind, 0);
2504 }
2505
2506 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2507}
2508
2509static void protection_domain_free(struct protection_domain *domain)
2510{
2511 if (!domain)
2512 return;
2513
2514 del_domain_from_list(domain);
2515
2516 if (domain->id)
2517 domain_id_free(domain->id);
2518
2519 kfree(domain);
2520}
2521
2522static struct protection_domain *protection_domain_alloc(void)
2523{
2524 struct protection_domain *domain;
2525
2526 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2527 if (!domain)
2528 return NULL;
2529
2530 spin_lock_init(&domain->lock);
2531 mutex_init(&domain->api_lock);
2532 domain->id = domain_id_alloc();
2533 if (!domain->id)
2534 goto out_err;
2535 INIT_LIST_HEAD(&domain->dev_list);
2536
2537 add_domain_to_list(domain);
2538
2539 return domain;
2540
2541out_err:
2542 kfree(domain);
2543
2544 return NULL;
2545}
2546
2547static int amd_iommu_domain_init(struct iommu_domain *dom)
2548{
2549 struct protection_domain *domain;
2550
2551 domain = protection_domain_alloc();
2552 if (!domain)
2553 goto out_free;
2554
2555 domain->mode = PAGE_MODE_3_LEVEL;
2556 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2557 if (!domain->pt_root)
2558 goto out_free;
2559
2560 dom->priv = domain;
2561
2562 return 0;
2563
2564out_free:
2565 protection_domain_free(domain);
2566
2567 return -ENOMEM;
2568}
2569
2570static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2571{
2572 struct protection_domain *domain = dom->priv;
2573
2574 if (!domain)
2575 return;
2576
2577 if (domain->dev_cnt > 0)
2578 cleanup_domain(domain);
2579
2580 BUG_ON(domain->dev_cnt != 0);
2581
2582 free_pagetable(domain);
2583
2584 protection_domain_free(domain);
2585
2586 dom->priv = NULL;
2587}
2588
2589static void amd_iommu_detach_device(struct iommu_domain *dom,
2590 struct device *dev)
2591{
2592 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2593 struct amd_iommu *iommu;
2594 u16 devid;
2595
2596 if (!check_device(dev))
2597 return;
2598
2599 devid = get_device_id(dev);
2600
2601 if (dev_data->domain != NULL)
2602 detach_device(dev);
2603
2604 iommu = amd_iommu_rlookup_table[devid];
2605 if (!iommu)
2606 return;
2607
2608 device_flush_dte(dev);
2609 iommu_completion_wait(iommu);
2610}
2611
2612static int amd_iommu_attach_device(struct iommu_domain *dom,
2613 struct device *dev)
2614{
2615 struct protection_domain *domain = dom->priv;
2616 struct iommu_dev_data *dev_data;
2617 struct amd_iommu *iommu;
2618 int ret;
2619 u16 devid;
2620
2621 if (!check_device(dev))
2622 return -EINVAL;
2623
2624 dev_data = dev->archdata.iommu;
2625
2626 devid = get_device_id(dev);
2627
2628 iommu = amd_iommu_rlookup_table[devid];
2629 if (!iommu)
2630 return -EINVAL;
2631
2632 if (dev_data->domain)
2633 detach_device(dev);
2634
2635 ret = attach_device(dev, domain);
2636
2637 iommu_completion_wait(iommu);
2638
2639 return ret;
2640}
2641
2642static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
2643 phys_addr_t paddr, int gfp_order, int iommu_prot)
2644{
2645 unsigned long page_size = 0x1000UL << gfp_order;
2646 struct protection_domain *domain = dom->priv;
2647 int prot = 0;
2648 int ret;
2649
2650 if (iommu_prot & IOMMU_READ)
2651 prot |= IOMMU_PROT_IR;
2652 if (iommu_prot & IOMMU_WRITE)
2653 prot |= IOMMU_PROT_IW;
2654
2655 mutex_lock(&domain->api_lock);
2656 ret = iommu_map_page(domain, iova, paddr, prot, page_size);
2657 mutex_unlock(&domain->api_lock);
2658
2659 return ret;
2660}
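
	/*
	 * The gfp_order argument encodes the mapping size as
	 * 0x1000UL << gfp_order: order 0 is a single 4k page, order 9
	 * would be 2M (0x200000), which iommu_map_page() then installs
	 * if that page size is usable.
	 */
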
2661
2662static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2663 int gfp_order)
2664{
2665 struct protection_domain *domain = dom->priv;
2666 unsigned long page_size, unmap_size;
2667
2668 page_size = 0x1000UL << gfp_order;
2669
2670 mutex_lock(&domain->api_lock);
2671 unmap_size = iommu_unmap_page(domain, iova, page_size);
2672 mutex_unlock(&domain->api_lock);
2673
2674 domain_flush_tlb_pde(domain);
2675
2676 return get_order(unmap_size);
2677}
2678
2679static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2680 unsigned long iova)
2681{
2682 struct protection_domain *domain = dom->priv;
2683 unsigned long offset_mask;
2684 phys_addr_t paddr;
2685 u64 *pte, __pte;
2686
2687 pte = fetch_pte(domain, iova);
2688
2689 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2690 return 0;
2691
2692 if (PM_PTE_LEVEL(*pte) == 0)
2693 offset_mask = PAGE_SIZE - 1;
2694 else
2695 offset_mask = PTE_PAGE_SIZE(*pte) - 1;
2696
2697 __pte = *pte & PM_ADDR_MASK;
2698 paddr = (__pte & ~offset_mask) | (iova & offset_mask);
2699
2700 return paddr;
2701}
2702
2703static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
2704 unsigned long cap)
2705{
2706 switch (cap) {
2707 case IOMMU_CAP_CACHE_COHERENCY:
2708 return 1;
2709 }
2710
2711 return 0;
2712}
2713
2714static struct iommu_ops amd_iommu_ops = {
2715 .domain_init = amd_iommu_domain_init,
2716 .domain_destroy = amd_iommu_domain_destroy,
2717 .attach_dev = amd_iommu_attach_device,
2718 .detach_dev = amd_iommu_detach_device,
2719 .map = amd_iommu_map,
2720 .unmap = amd_iommu_unmap,
2721 .iova_to_phys = amd_iommu_iova_to_phys,
2722 .domain_has_cap = amd_iommu_domain_has_cap,
2723};
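
A hedged sketch of how a consumer of this interface (device assignment code,
for example) would exercise the ops registered above; the helper name is
illustrative and the iommu_* wrapper signatures are assumed to be the
gfp_order-based ones of this kernel generation:

	/* Sketch: allocate a domain, attach a device, map one 4k page. */
	static int example_assign(struct device *dev, unsigned long iova,
				  phys_addr_t paddr)
	{
		struct iommu_domain *dom;
		int ret;

		dom = iommu_domain_alloc();		/* -> amd_iommu_domain_init() */
		if (!dom)
			return -ENOMEM;

		ret = iommu_attach_device(dom, dev);	/* -> amd_iommu_attach_device() */
		if (ret)
			goto out_free;

		/* gfp_order 0 == one 4k page; -> amd_iommu_map() */
		ret = iommu_map(dom, iova, paddr, 0, IOMMU_READ | IOMMU_WRITE);
		if (ret)
			iommu_detach_device(dom, dev);
	out_free:
		if (ret)
			iommu_domain_free(dom);
		return ret;
	}
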
2724
2725/*****************************************************************************
2726 *
2727 * The next functions do a basic initialization of IOMMU for pass through
2728 * mode
2729 *
2730 * In passthrough mode the IOMMU is initialized and enabled but not used for
2731 * DMA-API translation.
2732 *
2733 *****************************************************************************/
2734
2735int __init amd_iommu_init_passthrough(void)
2736{
2737 struct amd_iommu *iommu;
2738 struct pci_dev *dev = NULL;
2739 u16 devid;
2740
2741 /* allocate passthrough domain */
2742 pt_domain = protection_domain_alloc();
2743 if (!pt_domain)
2744 return -ENOMEM;
2745
2746 pt_domain->mode |= PAGE_MODE_NONE;
2747
2748 for_each_pci_dev(dev) {
2749 if (!check_device(&dev->dev))
2750 continue;
2751
2752 devid = get_device_id(&dev->dev);
2753
2754 iommu = amd_iommu_rlookup_table[devid];
2755 if (!iommu)
2756 continue;
2757
2758 attach_device(&dev->dev, pt_domain);
2759 }
2760
2761 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
2762
2763 return 0;
2764}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index bfc8453bd98d..000000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1572 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/acpi.h>
22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
29#include <asm/amd_iommu_types.h>
30#include <asm/amd_iommu.h>
31#include <asm/iommu.h>
32#include <asm/gart.h>
33#include <asm/x86_init.h>
34#include <asm/iommu_table.h>
35/*
36 * definitions for the ACPI scanning code
37 */
38#define IVRS_HEADER_LENGTH 48
39
40#define ACPI_IVHD_TYPE 0x10
41#define ACPI_IVMD_TYPE_ALL 0x20
42#define ACPI_IVMD_TYPE 0x21
43#define ACPI_IVMD_TYPE_RANGE 0x22
44
45#define IVHD_DEV_ALL 0x01
46#define IVHD_DEV_SELECT 0x02
47#define IVHD_DEV_SELECT_RANGE_START 0x03
48#define IVHD_DEV_RANGE_END 0x04
49#define IVHD_DEV_ALIAS 0x42
50#define IVHD_DEV_ALIAS_RANGE 0x43
51#define IVHD_DEV_EXT_SELECT 0x46
52#define IVHD_DEV_EXT_SELECT_RANGE 0x47
53
54#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
55#define IVHD_FLAG_PASSPW_EN_MASK 0x02
56#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
57#define IVHD_FLAG_ISOC_EN_MASK 0x08
58
59#define IVMD_FLAG_EXCL_RANGE 0x08
60#define IVMD_FLAG_UNITY_MAP 0x01
61
62#define ACPI_DEVFLAG_INITPASS 0x01
63#define ACPI_DEVFLAG_EXTINT 0x02
64#define ACPI_DEVFLAG_NMI 0x04
65#define ACPI_DEVFLAG_SYSMGT1 0x10
66#define ACPI_DEVFLAG_SYSMGT2 0x20
67#define ACPI_DEVFLAG_LINT0 0x40
68#define ACPI_DEVFLAG_LINT1 0x80
69#define ACPI_DEVFLAG_ATSDIS 0x10000000
70
71/*
72 * ACPI table definitions
73 *
74 * These data structures are laid over the table to parse the important values
75 * out of it.
76 */
77
78/*
79 * structure describing one IOMMU in the ACPI table. Typically followed by one
80 * or more ivhd_entrys.
81 */
82struct ivhd_header {
83 u8 type;
84 u8 flags;
85 u16 length;
86 u16 devid;
87 u16 cap_ptr;
88 u64 mmio_phys;
89 u16 pci_seg;
90 u16 info;
91 u32 reserved;
92} __attribute__((packed));
93
94/*
95 * A device entry describing which devices a specific IOMMU translates and
96 * which requestor ids they use.
97 */
98struct ivhd_entry {
99 u8 type;
100 u16 devid;
101 u8 flags;
102 u32 ext;
103} __attribute__((packed));
104
105/*
106 * An AMD IOMMU memory definition structure. It defines things like exclusion
107 * ranges for devices and regions that should be unity mapped.
108 */
109struct ivmd_header {
110 u8 type;
111 u8 flags;
112 u16 length;
113 u16 devid;
114 u16 aux;
115 u64 resv;
116 u64 range_start;
117 u64 range_length;
118} __attribute__((packed));
119
120bool amd_iommu_dump;
121
122static int __initdata amd_iommu_detected;
123static bool __initdata amd_iommu_disabled;
124
125u16 amd_iommu_last_bdf; /* largest PCI device id we have
126 to handle */
127LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
128 we find in ACPI */
129bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
130
131LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
132 system */
133
134/* Array to assign indices to IOMMUs */
135struct amd_iommu *amd_iommus[MAX_IOMMUS];
136int amd_iommus_present;
137
138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
141
142/*
143 * The ACPI table parsing functions set this variable on an error
144 */
145static int __initdata amd_iommu_init_err;
146
147/*
148 * List of protection domains - used during resume
149 */
150LIST_HEAD(amd_iommu_pd_list);
151spinlock_t amd_iommu_pd_lock;
152
153/*
154 * Pointer to the device table which is shared by all AMD IOMMUs.
155 * It is indexed by the PCI device id or the HT unit id and contains
156 * information about the domain the device belongs to as well as the
157 * page table root pointer.
158 */
159struct dev_table_entry *amd_iommu_dev_table;
160
161/*
162 * The alias table is a driver specific data structure which contains the
163 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
164 * More than one device can share the same requestor id.
165 */
166u16 *amd_iommu_alias_table;
167
168/*
169 * The rlookup table is used to find the IOMMU which is responsible
170 * for a specific device. It is also indexed by the PCI device id.
171 */
172struct amd_iommu **amd_iommu_rlookup_table;
173
174/*
175 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
176 * to know which ones are already in use.
177 */
178unsigned long *amd_iommu_pd_alloc_bitmap;
179
180static u32 dev_table_size; /* size of the device table */
181static u32 alias_table_size; /* size of the alias table */
182static u32 rlookup_table_size; /* size of the rlookup table */
183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
190static inline void update_last_devid(u16 devid)
191{
192 if (devid > amd_iommu_last_bdf)
193 amd_iommu_last_bdf = devid;
194}
195
196static inline unsigned long tbl_size(int entry_size)
197{
198 unsigned shift = PAGE_SHIFT +
199 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
200
201 return 1UL << shift;
202}
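
/*
 * Worked example (illustrative numbers): with amd_iommu_last_bdf == 0xffff
 * and a 32-byte device table entry, tbl_size(32) rounds 0x10000 * 32 bytes
 * up to a whole power-of-two number of pages and returns 2MB.
 */
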
203
204/* Access to l1 and l2 indexed register spaces */
205
206static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
207{
208 u32 val;
209
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
211 pci_read_config_dword(iommu->dev, 0xfc, &val);
212 return val;
213}
214
215static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
216{
217 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
218 pci_write_config_dword(iommu->dev, 0xfc, val);
219 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
220}
221
222static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
223{
224 u32 val;
225
226 pci_write_config_dword(iommu->dev, 0xf0, address);
227 pci_read_config_dword(iommu->dev, 0xf4, &val);
228 return val;
229}
230
231static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
232{
233 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
234 pci_write_config_dword(iommu->dev, 0xf4, val);
235}
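
/*
 * Both register spaces above are reached through an index/data window in
 * PCI config space: the register address (plus a write-enable bit for
 * writes) goes into the index register (0xf8 for L1, 0xf0 for L2) and the
 * payload is then read from or written to the data register (0xfc for L1,
 * 0xf4 for L2).
 */
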
236
237/****************************************************************************
238 *
239 * AMD IOMMU MMIO register space handling functions
240 *
241 * These functions are used to program the IOMMU device registers in
242 * MMIO space required for that driver.
243 *
244 ****************************************************************************/
245
246/*
247 * This function sets the exclusion range in the IOMMU. DMA accesses to the
248 * exclusion range are passed through untranslated
249 */
250static void iommu_set_exclusion_range(struct amd_iommu *iommu)
251{
252 u64 start = iommu->exclusion_start & PAGE_MASK;
253 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
254 u64 entry;
255
256 if (!iommu->exclusion_start)
257 return;
258
259 entry = start | MMIO_EXCL_ENABLE_MASK;
260 memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
261 &entry, sizeof(entry));
262
263 entry = limit;
264 memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
265 &entry, sizeof(entry));
266}
267
268/* Programs the physical address of the device table into the IOMMU hardware */
269static void __init iommu_set_device_table(struct amd_iommu *iommu)
270{
271 u64 entry;
272
273 BUG_ON(iommu->mmio_base == NULL);
274
275 entry = virt_to_phys(amd_iommu_dev_table);
276 entry |= (dev_table_size >> 12) - 1;
277 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
278 &entry, sizeof(entry));
279}
280
281/* Generic functions to enable/disable certain features of the IOMMU. */
282static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
283{
284 u32 ctrl;
285
286 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
287 ctrl |= (1 << bit);
288 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
289}
290
291static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
292{
293 u32 ctrl;
294
295 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
296 ctrl &= ~(1 << bit);
297 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
298}
299
300/* Function to enable the hardware */
301static void iommu_enable(struct amd_iommu *iommu)
302{
303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
321}
322
323static void iommu_disable(struct amd_iommu *iommu)
324{
325 /* Disable command buffer */
326 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
327
328 /* Disable event logging and event interrupts */
329 iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
330 iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
331
332 /* Disable IOMMU hardware itself */
333 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
334}
335
336/*
337 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
338 * the system has one.
339 */
340static u8 * __init iommu_map_mmio_space(u64 address)
341{
342 u8 *ret;
343
344 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
345 pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
346 address);
347 pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
348 return NULL;
349 }
350
351 ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
352 if (ret != NULL)
353 return ret;
354
355 release_mem_region(address, MMIO_REGION_LENGTH);
356
357 return NULL;
358}
359
360static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
361{
362 if (iommu->mmio_base)
363 iounmap(iommu->mmio_base);
364 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
365}
366
367/****************************************************************************
368 *
369 * The functions below belong to the first pass of AMD IOMMU ACPI table
370 * parsing. In this pass we try to find out the highest device id this
371 * code has to handle. Upon this information the size of the shared data
372 * structures is determined later.
373 *
374 ****************************************************************************/
375
376/*
377 * This function calculates the length of a given IVHD entry
378 */
379static inline int ivhd_entry_length(u8 *ivhd)
380{
381 return 0x04 << (*ivhd >> 6);
382}
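
/*
 * Worked example: the top two bits of the entry type select the entry
 * length as 0x04 << (type >> 6). Type 0x02 (IVHD_DEV_SELECT) gives 4
 * bytes, type 0x42 (IVHD_DEV_ALIAS, top bits 01) gives 8 bytes, and top
 * bits 10/11 give 16/32 bytes.
 */
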
383
384/*
385 * This function reads the last device id the IOMMU has to handle from the PCI
386 * capability header for this IOMMU
387 */
388static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
389{
390 u32 cap;
391
392 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
393 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
394
395 return 0;
396}
397
398/*
399 * After reading the highest device id from the IOMMU PCI capability header
400 * this function checks if there is a higher device id defined in the ACPI table
401 */
402static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
403{
404 u8 *p = (void *)h, *end = (void *)h;
405 struct ivhd_entry *dev;
406
407 p += sizeof(*h);
408 end += h->length;
409
410 find_last_devid_on_pci(PCI_BUS(h->devid),
411 PCI_SLOT(h->devid),
412 PCI_FUNC(h->devid),
413 h->cap_ptr);
414
415 while (p < end) {
416 dev = (struct ivhd_entry *)p;
417 switch (dev->type) {
418 case IVHD_DEV_SELECT:
419 case IVHD_DEV_RANGE_END:
420 case IVHD_DEV_ALIAS:
421 case IVHD_DEV_EXT_SELECT:
422 /* all the above subfield types refer to device ids */
423 update_last_devid(dev->devid);
424 break;
425 default:
426 break;
427 }
428 p += ivhd_entry_length(p);
429 }
430
431 WARN_ON(p != end);
432
433 return 0;
434}
435
436/*
437 * Iterate over all IVHD entries in the ACPI table and find the highest device
438 * id which we need to handle. This is the first of three functions which parse
439 * the ACPI table. So we check the checksum here.
440 */
441static int __init find_last_devid_acpi(struct acpi_table_header *table)
442{
443 int i;
444 u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
445 struct ivhd_header *h;
446
447 /*
448 * Validate checksum here so we don't need to do it when
449 * we actually parse the table
450 */
451 for (i = 0; i < table->length; ++i)
452 checksum += p[i];
453 if (checksum != 0) {
454 /* ACPI table corrupt */
455 amd_iommu_init_err = -ENODEV;
456 return 0;
457 }
458
459 p += IVRS_HEADER_LENGTH;
460
461 end += table->length;
462 while (p < end) {
463 h = (struct ivhd_header *)p;
464 switch (h->type) {
465 case ACPI_IVHD_TYPE:
466 find_last_devid_from_ivhd(h);
467 break;
468 default:
469 break;
470 }
471 p += h->length;
472 }
473 WARN_ON(p != end);
474
475 return 0;
476}
477
478/****************************************************************************
479 *
480 * The following functions belong to the code path which parses the ACPI table
481 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
482 * data structures, initialize the device/alias/rlookup table and also
483 * basically initialize the hardware.
484 *
485 ****************************************************************************/
486
487/*
488 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
489 * write commands to that buffer later and the IOMMU will execute them
490 * asynchronously
491 */
492static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
493{
494 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
495 get_order(CMD_BUFFER_SIZE));
496
497 if (cmd_buf == NULL)
498 return NULL;
499
500 iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
501
502 return cmd_buf;
503}
504
505/*
506 * This function resets the command buffer if the IOMMU stopped fetching
507 * commands from it.
508 */
509void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
510{
511 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
512
513 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
514 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
515
516 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
517}
518
519/*
520 * This function writes the command buffer address to the hardware and
521 * enables it.
522 */
523static void iommu_enable_command_buffer(struct amd_iommu *iommu)
524{
525 u64 entry;
526
527 BUG_ON(iommu->cmd_buf == NULL);
528
529 entry = (u64)virt_to_phys(iommu->cmd_buf);
530 entry |= MMIO_CMD_SIZE_512;
531
532 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
533 &entry, sizeof(entry));
534
535 amd_iommu_reset_cmd_buffer(iommu);
536 iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
537}
538
539static void __init free_command_buffer(struct amd_iommu *iommu)
540{
541 free_pages((unsigned long)iommu->cmd_buf,
542 get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
543}
544
545/* allocates the memory where the IOMMU will log its events to */
546static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
547{
548 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
549 get_order(EVT_BUFFER_SIZE));
550
551 if (iommu->evt_buf == NULL)
552 return NULL;
553
554 iommu->evt_buf_size = EVT_BUFFER_SIZE;
555
556 return iommu->evt_buf;
557}
558
559static void iommu_enable_event_buffer(struct amd_iommu *iommu)
560{
561 u64 entry;
562
563 BUG_ON(iommu->evt_buf == NULL);
564
565 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
566
567 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
568 &entry, sizeof(entry));
569
570 /* set head and tail to zero manually */
571 writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
572 writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
573
574 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
575}
576
577static void __init free_event_buffer(struct amd_iommu *iommu)
578{
579 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
580}
581
582/* sets a specific bit in the device table entry. */
583static void set_dev_entry_bit(u16 devid, u8 bit)
584{
585 int i = (bit >> 5) & 0x07;
586 int _bit = bit & 0x1f;
587
588 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
589}
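
/*
 * Worked example (bit number is illustrative): for bit 98 (0x62),
 * i = (98 >> 5) & 0x07 == 3 and _bit = 98 & 0x1f == 2, so bit 2 of
 * data[3] in the 256-bit device table entry gets set.
 */
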
590
591static int get_dev_entry_bit(u16 devid, u8 bit)
592{
593 int i = (bit >> 5) & 0x07;
594 int _bit = bit & 0x1f;
595
596 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
597}
598
599
600void amd_iommu_apply_erratum_63(u16 devid)
601{
602 int sysmgt;
603
604 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
605 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
606
607 if (sysmgt == 0x01)
608 set_dev_entry_bit(devid, DEV_ENTRY_IW);
609}
610
611/* Writes the specific IOMMU for a device into the rlookup table */
612static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
613{
614 amd_iommu_rlookup_table[devid] = iommu;
615}
616
617/*
618 * This function takes the device specific flags read from the ACPI
619 * table and sets up the device table entry with that information
620 */
621static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
622 u16 devid, u32 flags, u32 ext_flags)
623{
624 if (flags & ACPI_DEVFLAG_INITPASS)
625 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
626 if (flags & ACPI_DEVFLAG_EXTINT)
627 set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
628 if (flags & ACPI_DEVFLAG_NMI)
629 set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
630 if (flags & ACPI_DEVFLAG_SYSMGT1)
631 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
632 if (flags & ACPI_DEVFLAG_SYSMGT2)
633 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
634 if (flags & ACPI_DEVFLAG_LINT0)
635 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
636 if (flags & ACPI_DEVFLAG_LINT1)
637 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
638
639 amd_iommu_apply_erratum_63(devid);
640
641 set_iommu_for_device(iommu, devid);
642}
643
644/*
645 * Reads the device exclusion range from ACPI and initializes the IOMMU with
646 * it
647 */
648static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
649{
650 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
651
652 if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
653 return;
654
655 if (iommu) {
656 /*
656		 * We can only configure exclusion ranges per IOMMU, not
658 * per device. But we can enable the exclusion range per
659 * device. This is done here
660 */
661 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
662 iommu->exclusion_start = m->range_start;
663 iommu->exclusion_length = m->range_length;
664 }
665}
666
667/*
668 * This function reads some important data from the IOMMU PCI space and
669 * initializes the driver data structure with it. It reads the hardware
670 * capabilities and the first/last device entries
671 */
672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
673{
674 int cap_ptr = iommu->cap_ptr;
675 u32 range, misc, low, high;
676 int i, j;
677
678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
679 &iommu->cap);
680 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
681 &range);
682 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
683 &misc);
684
685 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
686 MMIO_GET_FD(range));
687 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
688 MMIO_GET_LD(range));
689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
690
691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
692 amd_iommu_iotlb_sup = false;
693
694 /* read extended feature bits */
695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
700 if (!is_rd890_iommu(iommu->dev))
701 return;
702
703 /*
704 * Some rd890 systems may not be fully reconfigured by the BIOS, so
705 * it's necessary for us to store this information so it can be
706 * reprogrammed on resume
707 */
708
709 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
710 &iommu->stored_addr_lo);
711 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
712 &iommu->stored_addr_hi);
713
714 /* Low bit locks writes to configuration space */
715 iommu->stored_addr_lo &= ~1;
716
717 for (i = 0; i < 6; i++)
718 for (j = 0; j < 0x12; j++)
719 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
720
721 for (i = 0; i < 0x83; i++)
722 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
723}
724
725/*
726 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
727 * initializes the hardware and our data structures with it.
728 */
729static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
730 struct ivhd_header *h)
731{
732 u8 *p = (u8 *)h;
733 u8 *end = p, flags = 0;
734 u16 devid = 0, devid_start = 0, devid_to = 0;
735 u32 dev_i, ext_flags = 0;
736 bool alias = false;
737 struct ivhd_entry *e;
738
739 /*
740 * First save the recommended feature enable bits from ACPI
741 */
742 iommu->acpi_flags = h->flags;
743
744 /*
745 * Done. Now parse the device entries
746 */
747 p += sizeof(struct ivhd_header);
748 end += h->length;
749
750
751 while (p < end) {
752 e = (struct ivhd_entry *)p;
753 switch (e->type) {
754 case IVHD_DEV_ALL:
755
756 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
757 " last device %02x:%02x.%x flags: %02x\n",
758 PCI_BUS(iommu->first_device),
759 PCI_SLOT(iommu->first_device),
760 PCI_FUNC(iommu->first_device),
761 PCI_BUS(iommu->last_device),
762 PCI_SLOT(iommu->last_device),
763 PCI_FUNC(iommu->last_device),
764 e->flags);
765
766 for (dev_i = iommu->first_device;
767 dev_i <= iommu->last_device; ++dev_i)
768 set_dev_entry_from_acpi(iommu, dev_i,
769 e->flags, 0);
770 break;
771 case IVHD_DEV_SELECT:
772
773 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
774 "flags: %02x\n",
775 PCI_BUS(e->devid),
776 PCI_SLOT(e->devid),
777 PCI_FUNC(e->devid),
778 e->flags);
779
780 devid = e->devid;
781 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
782 break;
783 case IVHD_DEV_SELECT_RANGE_START:
784
785 DUMP_printk(" DEV_SELECT_RANGE_START\t "
786 "devid: %02x:%02x.%x flags: %02x\n",
787 PCI_BUS(e->devid),
788 PCI_SLOT(e->devid),
789 PCI_FUNC(e->devid),
790 e->flags);
791
792 devid_start = e->devid;
793 flags = e->flags;
794 ext_flags = 0;
795 alias = false;
796 break;
797 case IVHD_DEV_ALIAS:
798
799 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
800 "flags: %02x devid_to: %02x:%02x.%x\n",
801 PCI_BUS(e->devid),
802 PCI_SLOT(e->devid),
803 PCI_FUNC(e->devid),
804 e->flags,
805 PCI_BUS(e->ext >> 8),
806 PCI_SLOT(e->ext >> 8),
807 PCI_FUNC(e->ext >> 8));
808
809 devid = e->devid;
810 devid_to = e->ext >> 8;
811 set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
812 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
813 amd_iommu_alias_table[devid] = devid_to;
814 break;
815 case IVHD_DEV_ALIAS_RANGE:
816
817 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
818 "devid: %02x:%02x.%x flags: %02x "
819 "devid_to: %02x:%02x.%x\n",
820 PCI_BUS(e->devid),
821 PCI_SLOT(e->devid),
822 PCI_FUNC(e->devid),
823 e->flags,
824 PCI_BUS(e->ext >> 8),
825 PCI_SLOT(e->ext >> 8),
826 PCI_FUNC(e->ext >> 8));
827
828 devid_start = e->devid;
829 flags = e->flags;
830 devid_to = e->ext >> 8;
831 ext_flags = 0;
832 alias = true;
833 break;
834 case IVHD_DEV_EXT_SELECT:
835
836 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
837 "flags: %02x ext: %08x\n",
838 PCI_BUS(e->devid),
839 PCI_SLOT(e->devid),
840 PCI_FUNC(e->devid),
841 e->flags, e->ext);
842
843 devid = e->devid;
844 set_dev_entry_from_acpi(iommu, devid, e->flags,
845 e->ext);
846 break;
847 case IVHD_DEV_EXT_SELECT_RANGE:
848
849 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
850 "%02x:%02x.%x flags: %02x ext: %08x\n",
851 PCI_BUS(e->devid),
852 PCI_SLOT(e->devid),
853 PCI_FUNC(e->devid),
854 e->flags, e->ext);
855
856 devid_start = e->devid;
857 flags = e->flags;
858 ext_flags = e->ext;
859 alias = false;
860 break;
861 case IVHD_DEV_RANGE_END:
862
863 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
864 PCI_BUS(e->devid),
865 PCI_SLOT(e->devid),
866 PCI_FUNC(e->devid));
867
868 devid = e->devid;
869 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
870 if (alias) {
871 amd_iommu_alias_table[dev_i] = devid_to;
872 set_dev_entry_from_acpi(iommu,
873 devid_to, flags, ext_flags);
874 }
875 set_dev_entry_from_acpi(iommu, dev_i,
876 flags, ext_flags);
877 }
878 break;
879 default:
880 break;
881 }
882
883 p += ivhd_entry_length(p);
884 }
885}
886
887/* Initializes the device->iommu mapping for the driver */
888static int __init init_iommu_devices(struct amd_iommu *iommu)
889{
890 u32 i;
891
892 for (i = iommu->first_device; i <= iommu->last_device; ++i)
893 set_iommu_for_device(iommu, i);
894
895 return 0;
896}
897
898static void __init free_iommu_one(struct amd_iommu *iommu)
899{
900 free_command_buffer(iommu);
901 free_event_buffer(iommu);
902 iommu_unmap_mmio_space(iommu);
903}
904
905static void __init free_iommu_all(void)
906{
907 struct amd_iommu *iommu, *next;
908
909 for_each_iommu_safe(iommu, next) {
910 list_del(&iommu->list);
911 free_iommu_one(iommu);
912 kfree(iommu);
913 }
914}
915
916/*
917 * This function glues the initialization of one IOMMU together,
918 * allocates the command buffer and programs the hardware. It does
919 * NOT enable the IOMMU. This is done afterwards.
920 */
921static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
922{
923 spin_lock_init(&iommu->lock);
924
925 /* Add IOMMU to internal data structures */
926 list_add_tail(&iommu->list, &amd_iommu_list);
927 iommu->index = amd_iommus_present++;
928
929 if (unlikely(iommu->index >= MAX_IOMMUS)) {
930 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
931 return -ENOSYS;
932 }
933
934 /* Index is fine - add IOMMU to the array */
935 amd_iommus[iommu->index] = iommu;
936
937 /*
938 * Copy data from ACPI table entry to the iommu struct
939 */
940 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
941 if (!iommu->dev)
942 return 1;
943
944 iommu->cap_ptr = h->cap_ptr;
945 iommu->pci_seg = h->pci_seg;
946 iommu->mmio_phys = h->mmio_phys;
947 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
948 if (!iommu->mmio_base)
949 return -ENOMEM;
950
951 iommu->cmd_buf = alloc_command_buffer(iommu);
952 if (!iommu->cmd_buf)
953 return -ENOMEM;
954
955 iommu->evt_buf = alloc_event_buffer(iommu);
956 if (!iommu->evt_buf)
957 return -ENOMEM;
958
959 iommu->int_enabled = false;
960
961 init_iommu_from_pci(iommu);
962 init_iommu_from_acpi(iommu, h);
963 init_iommu_devices(iommu);
964
965 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
966 amd_iommu_np_cache = true;
967
968 return pci_enable_device(iommu->dev);
969}
970
971/*
972 * Iterates over all IOMMU entries in the ACPI table, allocates the
973 * IOMMU structure and initializes it with init_iommu_one()
974 */
975static int __init init_iommu_all(struct acpi_table_header *table)
976{
977 u8 *p = (u8 *)table, *end = (u8 *)table;
978 struct ivhd_header *h;
979 struct amd_iommu *iommu;
980 int ret;
981
982 end += table->length;
983 p += IVRS_HEADER_LENGTH;
984
985 while (p < end) {
986 h = (struct ivhd_header *)p;
987 switch (*p) {
988 case ACPI_IVHD_TYPE:
989
990 DUMP_printk("device: %02x:%02x.%01x cap: %04x "
991 "seg: %d flags: %01x info %04x\n",
992 PCI_BUS(h->devid), PCI_SLOT(h->devid),
993 PCI_FUNC(h->devid), h->cap_ptr,
994 h->pci_seg, h->flags, h->info);
995 DUMP_printk(" mmio-addr: %016llx\n",
996 h->mmio_phys);
997
998 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
999 if (iommu == NULL) {
1000 amd_iommu_init_err = -ENOMEM;
1001 return 0;
1002 }
1003
1004 ret = init_iommu_one(iommu, h);
1005 if (ret) {
1006 amd_iommu_init_err = ret;
1007 return 0;
1008 }
1009 break;
1010 default:
1011 break;
1012 }
1013 p += h->length;
1014
1015 }
1016 WARN_ON(p != end);
1017
1018 return 0;
1019}
1020
1021/****************************************************************************
1022 *
1023 * The following functions initialize the MSI interrupts for all IOMMUs
1024 * in the system. It's a bit challenging because there could be multiple
1025 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
1026 * pci_dev.
1027 *
1028 ****************************************************************************/
1029
1030static int iommu_setup_msi(struct amd_iommu *iommu)
1031{
1032 int r;
1033
1034 if (pci_enable_msi(iommu->dev))
1035 return 1;
1036
1037 r = request_threaded_irq(iommu->dev->irq,
1038 amd_iommu_int_handler,
1039 amd_iommu_int_thread,
1040 0, "AMD-Vi",
1041 iommu->dev);
1042
1043 if (r) {
1044 pci_disable_msi(iommu->dev);
1045 return 1;
1046 }
1047
1048 iommu->int_enabled = true;
1049 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
1050
1051 return 0;
1052}
1053
1054static int iommu_init_msi(struct amd_iommu *iommu)
1055{
1056 if (iommu->int_enabled)
1057 return 0;
1058
1059 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
1060 return iommu_setup_msi(iommu);
1061
1062 return 1;
1063}
1064
1065/****************************************************************************
1066 *
1067 * The next functions belong to the third pass of parsing the ACPI
1068 * table. In this last pass the memory mapping requirements are
1069 * gathered (like exclusion and unity mapping ranges).
1070 *
1071 ****************************************************************************/
1072
1073static void __init free_unity_maps(void)
1074{
1075 struct unity_map_entry *entry, *next;
1076
1077 list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
1078 list_del(&entry->list);
1079 kfree(entry);
1080 }
1081}
1082
1083/* called when we find an exclusion range definition in ACPI */
1084static int __init init_exclusion_range(struct ivmd_header *m)
1085{
1086 int i;
1087
1088 switch (m->type) {
1089 case ACPI_IVMD_TYPE:
1090 set_device_exclusion_range(m->devid, m);
1091 break;
1092 case ACPI_IVMD_TYPE_ALL:
1093 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1094 set_device_exclusion_range(i, m);
1095 break;
1096 case ACPI_IVMD_TYPE_RANGE:
1097 for (i = m->devid; i <= m->aux; ++i)
1098 set_device_exclusion_range(i, m);
1099 break;
1100 default:
1101 break;
1102 }
1103
1104 return 0;
1105}
1106
1107/* called for unity map ACPI definition */
1108static int __init init_unity_map_range(struct ivmd_header *m)
1109{
1110	struct unity_map_entry *e = NULL;
1111 char *s;
1112
1113 e = kzalloc(sizeof(*e), GFP_KERNEL);
1114 if (e == NULL)
1115 return -ENOMEM;
1116
1117 switch (m->type) {
1118 default:
1119 kfree(e);
1120 return 0;
1121 case ACPI_IVMD_TYPE:
1122		s = "IVMD_TYPE\t\t\t";
1123 e->devid_start = e->devid_end = m->devid;
1124 break;
1125 case ACPI_IVMD_TYPE_ALL:
1126 s = "IVMD_TYPE_ALL\t\t";
1127 e->devid_start = 0;
1128 e->devid_end = amd_iommu_last_bdf;
1129 break;
1130 case ACPI_IVMD_TYPE_RANGE:
1131 s = "IVMD_TYPE_RANGE\t\t";
1132 e->devid_start = m->devid;
1133 e->devid_end = m->aux;
1134 break;
1135 }
1136 e->address_start = PAGE_ALIGN(m->range_start);
1137 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
1138 e->prot = m->flags >> 1;
1139
1140 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
1141 " range_start: %016llx range_end: %016llx flags: %x\n", s,
1142 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
1143 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
1144 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
1145 e->address_start, e->address_end, m->flags);
1146
1147 list_add_tail(&e->list, &amd_iommu_unity_map);
1148
1149 return 0;
1150}
1151
1152/* iterates over all memory definitions we find in the ACPI table */
1153static int __init init_memory_definitions(struct acpi_table_header *table)
1154{
1155 u8 *p = (u8 *)table, *end = (u8 *)table;
1156 struct ivmd_header *m;
1157
1158 end += table->length;
1159 p += IVRS_HEADER_LENGTH;
1160
1161 while (p < end) {
1162 m = (struct ivmd_header *)p;
1163 if (m->flags & IVMD_FLAG_EXCL_RANGE)
1164 init_exclusion_range(m);
1165 else if (m->flags & IVMD_FLAG_UNITY_MAP)
1166 init_unity_map_range(m);
1167
1168 p += m->length;
1169 }
1170
1171 return 0;
1172}
1173
1174/*
1175 * Init the device table to not allow DMA access for devices and
1176 * suppress all page faults
1177 */
1178static void init_device_table(void)
1179{
1180 u32 devid;
1181
1182 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
1183 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
1184 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
1185 }
1186}
1187
1188static void iommu_init_flags(struct amd_iommu *iommu)
1189{
1190 iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
1191 iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
1192 iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
1193
1194 iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
1195 iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
1196 iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
1197
1198 iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
1199 iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
1200 iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
1201
1202 iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
1203 iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
1204 iommu_feature_disable(iommu, CONTROL_ISOC_EN);
1205
1206 /*
1207 * make IOMMU memory accesses cache coherent
1208 */
1209 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1210}
1211
1212static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
1213{
1214 int i, j;
1215 u32 ioc_feature_control;
1216 struct pci_dev *pdev = NULL;
1217
1218 /* RD890 BIOSes may not have completely reconfigured the iommu */
1219 if (!is_rd890_iommu(iommu->dev))
1220 return;
1221
1222 /*
1223 * First, we need to ensure that the iommu is enabled. This is
1224 * controlled by a register in the northbridge
1225 */
1226 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1227
1228 if (!pdev)
1229 return;
1230
1231 /* Select Northbridge indirect register 0x75 and enable writing */
1232 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1233 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1234
1235 /* Enable the iommu */
1236 if (!(ioc_feature_control & 0x1))
1237 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1238
1239 pci_dev_put(pdev);
1240
1241 /* Restore the iommu BAR */
1242 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1243 iommu->stored_addr_lo);
1244 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1245 iommu->stored_addr_hi);
1246
1247 /* Restore the l1 indirect regs for each of the 6 l1s */
1248 for (i = 0; i < 6; i++)
1249 for (j = 0; j < 0x12; j++)
1250 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1251
1252 /* Restore the l2 indirect regs */
1253 for (i = 0; i < 0x83; i++)
1254 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1255
1256 /* Lock PCI setup registers */
1257 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1258 iommu->stored_addr_lo | 1);
1259}
1260
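As an aside, the RD890 quirk above goes through the northbridge's indirect register interface: an index is written to config offset 0x60 and the data is accessed at offset 0x64. A minimal sketch of that access pattern follows; the helper names are hypothetical and the role of bit 7 in the index (write enable) is inferred from the comment in the function above.

/* Illustrative sketch only -- not part of this file. */
#include <linux/pci.h>

static u32 nb_indirect_read(struct pci_dev *nb, u8 reg)
{
	u32 val;

	/* Select the indirect register; bit 7 clear means read access. */
	pci_write_config_dword(nb, 0x60, reg);
	pci_read_config_dword(nb, 0x64, &val);

	return val;
}

static void nb_indirect_write(struct pci_dev *nb, u8 reg, u32 val)
{
	/* Bit 7 of the index enables writing through the data register. */
	pci_write_config_dword(nb, 0x60, reg | (1 << 7));
	pci_write_config_dword(nb, 0x64, val);
}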
1261/*
1262 * This function finally enables all IOMMUs found in the system after
1263 * they have been initialized
1264 */
1265static void enable_iommus(void)
1266{
1267 struct amd_iommu *iommu;
1268
1269 for_each_iommu(iommu) {
1270 iommu_disable(iommu);
1271 iommu_init_flags(iommu);
1272 iommu_set_device_table(iommu);
1273 iommu_enable_command_buffer(iommu);
1274 iommu_enable_event_buffer(iommu);
1275 iommu_set_exclusion_range(iommu);
1276 iommu_init_msi(iommu);
1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1279 }
1280}
1281
1282static void disable_iommus(void)
1283{
1284 struct amd_iommu *iommu;
1285
1286 for_each_iommu(iommu)
1287 iommu_disable(iommu);
1288}
1289
1290/*
1291 * Suspend/Resume support
1292 * the IOMMUs are disabled on suspend and re-initialized on resume
1293 */
1294
1295static void amd_iommu_resume(void)
1296{
1297 struct amd_iommu *iommu;
1298
1299 for_each_iommu(iommu)
1300 iommu_apply_resume_quirks(iommu);
1301
1302 /* re-load the hardware */
1303 enable_iommus();
1304
1305 /*
1306 * we have to flush after the IOMMUs are enabled because a
1307 * disabled IOMMU will never execute the commands we send
1308 */
1309 for_each_iommu(iommu)
1310 iommu_flush_all_caches(iommu);
1311}
1312
1313static int amd_iommu_suspend(void)
1314{
1315 /* disable IOMMUs to go out of the way for BIOS */
1316 disable_iommus();
1317
1318 return 0;
1319}
1320
1321static struct syscore_ops amd_iommu_syscore_ops = {
1322 .suspend = amd_iommu_suspend,
1323 .resume = amd_iommu_resume,
1324};
1325
1326/*
1327 * This is the core init function for AMD IOMMU hardware in the system.
1328 * This function is called from the generic x86 DMA layer initialization
1329 * code.
1330 *
1331 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
1332 * three times:
1333 *
1334 * 1 pass) Find the highest PCI device id the driver has to handle.
1335 * Based on this information the size of the data structures that
1336 * need to be allocated is determined.
1337 *
1338 * 2 pass) Initialize the data structures just allocated with the
1339 * information in the ACPI table about available AMD IOMMUs
1340 * in the system. It also maps the PCI devices in the
1341 * system to specific IOMMUs
1342 *
1343 * 3 pass) After the basic data structures are allocated and
1344 * initialized we update them with information about memory
1345 * remapping requirements parsed out of the ACPI table in
1346 * this last pass.
1347 *
1348 * After that the hardware is initialized and ready to go. In the last
1349 * step we do some Linux specific things like registering the driver in
1350 * the dma_ops interface and initializing the suspend/resume support
1351 * functions. Finally we print some information about the AMD IOMMUs and
1352 * the driver state and enable the hardware.
1353 */
1354static int __init amd_iommu_init(void)
1355{
1356 int i, ret = 0;
1357
1358 /*
1359 * First parse ACPI tables to find the largest Bus/Dev/Func
1360 * we need to handle. Based on this information the shared data
1361 * structures for the IOMMUs in the system will be allocated.
1362 */
1363 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1364 return -ENODEV;
1365
1366 ret = amd_iommu_init_err;
1367 if (ret)
1368 goto out;
1369
1370 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1371 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1372 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
1373
1374 ret = -ENOMEM;
1375
1376 /* Device table - directly used by all IOMMUs */
1377 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1378 get_order(dev_table_size));
1379 if (amd_iommu_dev_table == NULL)
1380 goto out;
1381
1382 /*
1383 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
1384 * IOMMU sees for that device
1385 */
1386 amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
1387 get_order(alias_table_size));
1388 if (amd_iommu_alias_table == NULL)
1389 goto free;
1390
1391 /* IOMMU rlookup table - find the IOMMU for a specific device */
1392 amd_iommu_rlookup_table = (void *)__get_free_pages(
1393 GFP_KERNEL | __GFP_ZERO,
1394 get_order(rlookup_table_size));
1395 if (amd_iommu_rlookup_table == NULL)
1396 goto free;
1397
1398 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1399 GFP_KERNEL | __GFP_ZERO,
1400 get_order(MAX_DOMAIN_ID/8));
1401 if (amd_iommu_pd_alloc_bitmap == NULL)
1402 goto free;
1403
1404 /* init the device table */
1405 init_device_table();
1406
1407 /*
1408 * let all alias entries point to themselves
1409 */
1410 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1411 amd_iommu_alias_table[i] = i;
1412
1413 /*
1414 * never allocate domain 0 because it's used as the non-allocated and
1415 * error value placeholder
1416 */
1417 amd_iommu_pd_alloc_bitmap[0] = 1;
1418
1419 spin_lock_init(&amd_iommu_pd_lock);
1420
1421 /*
1422 * now that the data structures are allocated and basically initialized,
1423 * start the real ACPI table scan
1424 */
1425 ret = -ENODEV;
1426 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1427 goto free;
1428
1429 if (amd_iommu_init_err) {
1430 ret = amd_iommu_init_err;
1431 goto free;
1432 }
1433
1434 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1435 goto free;
1436
1437 if (amd_iommu_init_err) {
1438 ret = amd_iommu_init_err;
1439 goto free;
1440 }
1441
1442 ret = amd_iommu_init_devices();
1443 if (ret)
1444 goto free;
1445
1446 enable_iommus();
1447
1448 if (iommu_pass_through)
1449 ret = amd_iommu_init_passthrough();
1450 else
1451 ret = amd_iommu_init_dma_ops();
1452
1453 if (ret)
1454 goto free_disable;
1455
1456 amd_iommu_init_api();
1457
1458 amd_iommu_init_notifier();
1459
1460 register_syscore_ops(&amd_iommu_syscore_ops);
1461
1462 if (iommu_pass_through)
1463 goto out;
1464
1465 if (amd_iommu_unmap_flush)
1466 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1467 else
1468 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1469
1470 x86_platform.iommu_shutdown = disable_iommus;
1471out:
1472 return ret;
1473
1474free_disable:
1475 disable_iommus();
1476
1477free:
1478 amd_iommu_uninit_devices();
1479
1480 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1481 get_order(MAX_DOMAIN_ID/8));
1482
1483 free_pages((unsigned long)amd_iommu_rlookup_table,
1484 get_order(rlookup_table_size));
1485
1486 free_pages((unsigned long)amd_iommu_alias_table,
1487 get_order(alias_table_size));
1488
1489 free_pages((unsigned long)amd_iommu_dev_table,
1490 get_order(dev_table_size));
1491
1492 free_iommu_all();
1493
1494 free_unity_maps();
1495
1496#ifdef CONFIG_GART_IOMMU
1497 /*
1498 * We failed to initialize the AMD IOMMU - try fallback to GART
1499 * if possible.
1500 */
1501 gart_iommu_init();
1502
1503#endif
1504
1505 goto out;
1506}
1507
1508/****************************************************************************
1509 *
1510 * Early detect code. This code runs at IOMMU detection time in the DMA
1511 * layer. It just checks whether there is an IVRS ACPI table to detect AMD
1512 * IOMMUs
1513 *
1514 ****************************************************************************/
1515static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1516{
1517 return 0;
1518}
1519
1520int __init amd_iommu_detect(void)
1521{
1522 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1523 return -ENODEV;
1524
1525 if (amd_iommu_disabled)
1526 return -ENODEV;
1527
1528 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1529 iommu_detected = 1;
1530 amd_iommu_detected = 1;
1531 x86_init.iommu.iommu_init = amd_iommu_init;
1532
1533 /* Make sure ACS will be enabled */
1534 pci_request_acs();
1535 return 1;
1536 }
1537 return -ENODEV;
1538}
1539
1540/****************************************************************************
1541 *
1542 * Parsing functions for the AMD IOMMU specific kernel command line
1543 * options.
1544 *
1545 ****************************************************************************/
1546
1547static int __init parse_amd_iommu_dump(char *str)
1548{
1549 amd_iommu_dump = true;
1550
1551 return 1;
1552}
1553
1554static int __init parse_amd_iommu_options(char *str)
1555{
1556 for (; *str; ++str) {
1557 if (strncmp(str, "fullflush", 9) == 0)
1558 amd_iommu_unmap_flush = true;
1559 if (strncmp(str, "off", 3) == 0)
1560 amd_iommu_disabled = true;
1561 }
1562
1563 return 1;
1564}
1565
1566__setup("amd_iommu_dump", parse_amd_iommu_dump);
1567__setup("amd_iommu=", parse_amd_iommu_options);
1568
1569IOMMU_INIT_FINISH(amd_iommu_detect,
1570 gart_iommu_hole_init,
1571 0,
1572 0);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 289e92862fd9..afdc3f756dea 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
27 * timer, but by default APB timer has higher rating than local APIC timers. 27 * timer, but by default APB timer has higher rating than local APIC timers.
28 */ 28 */
29 29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h> 30#include <linux/delay.h>
31#include <linux/dw_apb_timer.h>
33#include <linux/errno.h> 32#include <linux/errno.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/slab.h> 34#include <linux/slab.h>
37#include <linux/pm.h> 35#include <linux/pm.h>
38#include <linux/pci.h>
39#include <linux/sfi.h> 36#include <linux/sfi.h>
40#include <linux/interrupt.h> 37#include <linux/interrupt.h>
41#include <linux/cpu.h> 38#include <linux/cpu.h>
@@ -44,76 +41,48 @@
44#include <asm/fixmap.h> 41#include <asm/fixmap.h>
45#include <asm/apb_timer.h> 42#include <asm/apb_timer.h>
46#include <asm/mrst.h> 43#include <asm/mrst.h>
44#include <asm/time.h>
47 45
48#define APBT_MASK CLOCKSOURCE_MASK(32)
49#define APBT_SHIFT 22
50#define APBT_CLOCKEVENT_RATING 110 46#define APBT_CLOCKEVENT_RATING 110
51#define APBT_CLOCKSOURCE_RATING 250 47#define APBT_CLOCKSOURCE_RATING 250
52#define APBT_MIN_DELTA_USEC 200
53 48
54#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
55#define APBT_CLOCKEVENT0_NUM (0) 49#define APBT_CLOCKEVENT0_NUM (0)
56#define APBT_CLOCKEVENT1_NUM (1)
57#define APBT_CLOCKSOURCE_NUM (2) 50#define APBT_CLOCKSOURCE_NUM (2)
58 51
59static unsigned long apbt_address; 52static phys_addr_t apbt_address;
60static int apb_timer_block_enabled; 53static int apb_timer_block_enabled;
61static void __iomem *apbt_virt_address; 54static void __iomem *apbt_virt_address;
62static int phy_cs_timer_id;
63 55
64/* 56/*
65 * Common DW APB timer info 57 * Common DW APB timer info
66 */ 58 */
67static uint64_t apbt_freq; 59static unsigned long apbt_freq;
68
69static void apbt_set_mode(enum clock_event_mode mode,
70 struct clock_event_device *evt);
71static int apbt_next_event(unsigned long delta,
72 struct clock_event_device *evt);
73static cycle_t apbt_read_clocksource(struct clocksource *cs);
74static void apbt_restart_clocksource(struct clocksource *cs);
75 60
76struct apbt_dev { 61struct apbt_dev {
77 struct clock_event_device evt; 62 struct dw_apb_clock_event_device *timer;
78 unsigned int num; 63 unsigned int num;
79 int cpu; 64 int cpu;
80 unsigned int irq; 65 unsigned int irq;
81 unsigned int tick; 66 char name[10];
82 unsigned int count;
83 unsigned int flags;
84 char name[10];
85}; 67};
86 68
87static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); 69static struct dw_apb_clocksource *clocksource_apbt;
88 70
89#ifdef CONFIG_SMP 71static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
90static unsigned int apbt_num_timers_used;
91static struct apbt_dev *apbt_devs;
92#endif
93
94static inline unsigned long apbt_readl_reg(unsigned long a)
95{ 72{
96 return readl(apbt_virt_address + a); 73 return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
97} 74}
98 75
99static inline void apbt_writel_reg(unsigned long d, unsigned long a) 76static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
100{
101 writel(d, apbt_virt_address + a);
102}
103
104static inline unsigned long apbt_readl(int n, unsigned long a)
105{
106 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
107}
108 77
109static inline void apbt_writel(int n, unsigned long d, unsigned long a) 78#ifdef CONFIG_SMP
110{ 79static unsigned int apbt_num_timers_used;
111 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); 80#endif
112}
113 81
114static inline void apbt_set_mapping(void) 82static inline void apbt_set_mapping(void)
115{ 83{
116 struct sfi_timer_table_entry *mtmr; 84 struct sfi_timer_table_entry *mtmr;
85 int phy_cs_timer_id = 0;
117 86
118 if (apbt_virt_address) { 87 if (apbt_virt_address) {
119 pr_debug("APBT base already mapped\n"); 88 pr_debug("APBT base already mapped\n");
@@ -125,21 +94,18 @@ static inline void apbt_set_mapping(void)
125 APBT_CLOCKEVENT0_NUM); 94 APBT_CLOCKEVENT0_NUM);
126 return; 95 return;
127 } 96 }
128 apbt_address = (unsigned long)mtmr->phys_addr; 97 apbt_address = (phys_addr_t)mtmr->phys_addr;
129 if (!apbt_address) { 98 if (!apbt_address) {
130 printk(KERN_WARNING "No timer base from SFI, use default\n"); 99 printk(KERN_WARNING "No timer base from SFI, use default\n");
131 apbt_address = APBT_DEFAULT_BASE; 100 apbt_address = APBT_DEFAULT_BASE;
132 } 101 }
133 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); 102 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
134 if (apbt_virt_address) { 103 if (!apbt_virt_address) {
135 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ 104 pr_debug("Failed mapping APBT phy address at %lu\n",\
136 (void *)apbt_address, (void *)apbt_virt_address); 105 (unsigned long)apbt_address);
137 } else {
138 pr_debug("Failed mapping APBT phy address at %p\n",\
139 (void *)apbt_address);
140 goto panic_noapbt; 106 goto panic_noapbt;
141 } 107 }
142 apbt_freq = mtmr->freq_hz / USEC_PER_SEC; 108 apbt_freq = mtmr->freq_hz;
143 sfi_free_mtmr(mtmr); 109 sfi_free_mtmr(mtmr);
144 110
145 /* Now figure out the physical timer id for clocksource device */ 111 /* Now figure out the physical timer id for clocksource device */
@@ -148,9 +114,14 @@ static inline void apbt_set_mapping(void)
148 goto panic_noapbt; 114 goto panic_noapbt;
149 115
150 /* Now figure out the physical timer id */ 116 /* Now figure out the physical timer id */
151 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) 117 pr_debug("Use timer %d for clocksource\n",
152 / APBTMRS_REG_SIZE; 118 (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
153 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); 119 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
120 APBTMRS_REG_SIZE;
121
122 clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
123 "apbt0", apbt_virt_address + phy_cs_timer_id *
124 APBTMRS_REG_SIZE, apbt_freq);
154 return; 125 return;
155 126
156panic_noapbt: 127panic_noapbt:
@@ -172,82 +143,6 @@ static inline int is_apbt_capable(void)
172 return apbt_virt_address ? 1 : 0; 143 return apbt_virt_address ? 1 : 0;
173} 144}
174 145
175static struct clocksource clocksource_apbt = {
176 .name = "apbt",
177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK,
180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
181 .resume = apbt_restart_clocksource,
182};
183
184/* boot APB clock event device */
185static struct clock_event_device apbt_clockevent = {
186 .name = "apbt0",
187 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
188 .set_mode = apbt_set_mode,
189 .set_next_event = apbt_next_event,
190 .shift = APBT_SHIFT,
191 .irq = 0,
192 .rating = APBT_CLOCKEVENT_RATING,
193};
194
195/*
196 * start count down from 0xffff_ffff. this is done by toggling the enable bit
197 * then load initial load count to ~0.
198 */
199static void apbt_start_counter(int n)
200{
201 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
202
203 ctrl &= ~APBTMR_CONTROL_ENABLE;
204 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
205 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
206 /* enable, mask interrupt */
207 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
208 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
209 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
210 /* read it once to get cached counter value initialized */
211 apbt_read_clocksource(&clocksource_apbt);
212}
213
214static irqreturn_t apbt_interrupt_handler(int irq, void *data)
215{
216 struct apbt_dev *dev = (struct apbt_dev *)data;
217 struct clock_event_device *aevt = &dev->evt;
218
219 if (!aevt->event_handler) {
220 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
221 dev->num);
222 return IRQ_NONE;
223 }
224 aevt->event_handler(aevt);
225 return IRQ_HANDLED;
226}
227
228static void apbt_restart_clocksource(struct clocksource *cs)
229{
230 apbt_start_counter(phy_cs_timer_id);
231}
232
233static void apbt_enable_int(int n)
234{
235 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
236 /* clear pending intr */
237 apbt_readl(n, APBTMR_N_EOI);
238 ctrl &= ~APBTMR_CONTROL_INT;
239 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
240}
241
242static void apbt_disable_int(int n)
243{
244 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
245
246 ctrl |= APBTMR_CONTROL_INT;
247 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
248}
249
250
251static int __init apbt_clockevent_register(void) 146static int __init apbt_clockevent_register(void)
252{ 147{
253 struct sfi_timer_table_entry *mtmr; 148 struct sfi_timer_table_entry *mtmr;
@@ -260,45 +155,21 @@ static int __init apbt_clockevent_register(void)
260 return -ENODEV; 155 return -ENODEV;
261 } 156 }
262 157
263 /*
264 * We need to calculate the scaled math multiplication factor for
265 * nanosecond to apbt tick conversion.
266 * mult = (nsec/cycle)*2^APBT_SHIFT
267 */
268 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
269 , NSEC_PER_SEC, APBT_SHIFT);
270
271 /* Calculate the min / max delta */
272 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
273 &apbt_clockevent);
274 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
275 APBT_MIN_DELTA_USEC*apbt_freq,
276 &apbt_clockevent);
277 /*
278 * Start apbt with the boot cpu mask and make it
279 * global if not used for per cpu timer.
280 */
281 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
282 adev->num = smp_processor_id(); 158 adev->num = smp_processor_id();
283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 159 adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
160 mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
161 APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
162 adev_virt_addr(adev), 0, apbt_freq);
163 /* Firmware does EOI handling for us. */
164 adev->timer->eoi = NULL;
284 165
285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 166 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
286 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100; 167 global_clock_event = &adev->timer->ced;
287 global_clock_event = &adev->evt;
288 printk(KERN_DEBUG "%s clockevent registered as global\n", 168 printk(KERN_DEBUG "%s clockevent registered as global\n",
289 global_clock_event->name); 169 global_clock_event->name);
290 } 170 }
291 171
292 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, 172 dw_apb_clockevent_register(adev->timer);
293 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
294 apbt_clockevent.name, adev)) {
295 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
296 apbt_clockevent.irq);
297 }
298
299 clockevents_register_device(&adev->evt);
300 /* Start APBT 0 interrupts */
301 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
302 173
303 sfi_free_mtmr(mtmr); 174 sfi_free_mtmr(mtmr);
304 return 0; 175 return 0;
@@ -316,52 +187,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); 187 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
317 /* APB timer irqs are set up as mp_irqs, timer is edge type */ 188 /* APB timer irqs are set up as mp_irqs, timer is edge type */
318 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); 189 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
319
320 if (system_state == SYSTEM_BOOTING) {
321 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED |
323 IRQF_NOBALANCING,
324 adev->name, adev)) {
325 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
326 adev->num);
327 }
328 } else
329 enable_irq(adev->irq);
330} 190}
331 191
332/* Should be called with per cpu */ 192/* Should be called with per cpu */
333void apbt_setup_secondary_clock(void) 193void apbt_setup_secondary_clock(void)
334{ 194{
335 struct apbt_dev *adev; 195 struct apbt_dev *adev;
336 struct clock_event_device *aevt;
337 int cpu; 196 int cpu;
338 197
339 /* Don't register boot CPU clockevent */ 198 /* Don't register boot CPU clockevent */
340 cpu = smp_processor_id(); 199 cpu = smp_processor_id();
341 if (!cpu) 200 if (!cpu)
342 return; 201 return;
343 /*
344 * We need to calculate the scaled math multiplication factor for
345 * nanosecond to apbt tick conversion.
346 * mult = (nsec/cycle)*2^APBT_SHIFT
347 */
348 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
349 adev = &per_cpu(cpu_apbt_dev, cpu);
350 aevt = &adev->evt;
351 202
352 memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); 203 adev = &__get_cpu_var(cpu_apbt_dev);
353 aevt->cpumask = cpumask_of(cpu); 204 if (!adev->timer) {
354 aevt->name = adev->name; 205 adev->timer = dw_apb_clockevent_init(cpu, adev->name,
355 aevt->mode = CLOCK_EVT_MODE_UNUSED; 206 APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
207 adev->irq, apbt_freq);
208 adev->timer->eoi = NULL;
209 } else {
210 dw_apb_clockevent_resume(adev->timer);
211 }
356 212
357 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", 213 printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
358 cpu, aevt->name, *(u32 *)aevt->cpumask); 214 cpu, adev->name, adev->cpu);
359 215
360 apbt_setup_irq(adev); 216 apbt_setup_irq(adev);
361 217 dw_apb_clockevent_register(adev->timer);
362 clockevents_register_device(aevt);
363
364 apbt_enable_int(cpu);
365 218
366 return; 219 return;
367} 220}
@@ -384,13 +237,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
384 237
385 switch (action & 0xf) { 238 switch (action & 0xf) {
386 case CPU_DEAD: 239 case CPU_DEAD:
387 disable_irq(adev->irq); 240 dw_apb_clockevent_pause(adev->timer);
388 apbt_disable_int(cpu);
389 if (system_state == SYSTEM_RUNNING) { 241 if (system_state == SYSTEM_RUNNING) {
390 pr_debug("skipping APBT CPU %lu offline\n", cpu); 242 pr_debug("skipping APBT CPU %lu offline\n", cpu);
391 } else if (adev) { 243 } else if (adev) {
392 pr_debug("APBT clockevent for cpu %lu offline\n", cpu); 244 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
393 free_irq(adev->irq, adev); 245 dw_apb_clockevent_stop(adev->timer);
394 } 246 }
395 break; 247 break;
396 default: 248 default:
@@ -415,116 +267,16 @@ void apbt_setup_secondary_clock(void) {}
415 267
416#endif /* CONFIG_SMP */ 268#endif /* CONFIG_SMP */
417 269
418static void apbt_set_mode(enum clock_event_mode mode,
419 struct clock_event_device *evt)
420{
421 unsigned long ctrl;
422 uint64_t delta;
423 int timer_num;
424 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
425
426 BUG_ON(!apbt_virt_address);
427
428 timer_num = adev->num;
429 pr_debug("%s CPU %d timer %d mode=%d\n",
430 __func__, first_cpu(*evt->cpumask), timer_num, mode);
431
432 switch (mode) {
433 case CLOCK_EVT_MODE_PERIODIC:
434 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
435 delta >>= apbt_clockevent.shift;
436 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
437 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
438 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
439 /*
440 * DW APB p. 46, have to disable timer before load counter,
441 * may cause sync problem.
442 */
443 ctrl &= ~APBTMR_CONTROL_ENABLE;
444 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
445 udelay(1);
446 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
447 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
448 ctrl |= APBTMR_CONTROL_ENABLE;
449 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
450 break;
451 /* APB timer does not have one-shot mode, use free running mode */
452 case CLOCK_EVT_MODE_ONESHOT:
453 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
454 /*
455 * set free running mode, this mode will let timer reload max
456 * timeout which will give time (3min on 25MHz clock) to rearm
457 * the next event, therefore emulate the one-shot mode.
458 */
459 ctrl &= ~APBTMR_CONTROL_ENABLE;
460 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
461
462 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
463 /* write again to set free running mode */
464 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
465
466 /*
467 * DW APB p. 46, load counter with all 1s before starting free
468 * running mode.
469 */
470 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
471 ctrl &= ~APBTMR_CONTROL_INT;
472 ctrl |= APBTMR_CONTROL_ENABLE;
473 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
474 break;
475
476 case CLOCK_EVT_MODE_UNUSED:
477 case CLOCK_EVT_MODE_SHUTDOWN:
478 apbt_disable_int(timer_num);
479 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
480 ctrl &= ~APBTMR_CONTROL_ENABLE;
481 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
482 break;
483
484 case CLOCK_EVT_MODE_RESUME:
485 apbt_enable_int(timer_num);
486 break;
487 }
488}
489
490static int apbt_next_event(unsigned long delta,
491 struct clock_event_device *evt)
492{
493 unsigned long ctrl;
494 int timer_num;
495
496 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
497
498 timer_num = adev->num;
499 /* Disable timer */
500 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
501 ctrl &= ~APBTMR_CONTROL_ENABLE;
502 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
503 /* write new count */
504 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
505 ctrl |= APBTMR_CONTROL_ENABLE;
506 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
507 return 0;
508}
509
510static cycle_t apbt_read_clocksource(struct clocksource *cs)
511{
512 unsigned long current_count;
513
514 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
515 return (cycle_t)~current_count;
516}
517
518static int apbt_clocksource_register(void) 270static int apbt_clocksource_register(void)
519{ 271{
520 u64 start, now; 272 u64 start, now;
521 cycle_t t1; 273 cycle_t t1;
522 274
523 /* Start the counter, use timer 2 as source, timer 0/1 for event */ 275 /* Start the counter, use timer 2 as source, timer 0/1 for event */
524 apbt_start_counter(phy_cs_timer_id); 276 dw_apb_clocksource_start(clocksource_apbt);
525 277
526 /* Verify whether apbt counter works */ 278 /* Verify whether apbt counter works */
527 t1 = apbt_read_clocksource(&clocksource_apbt); 279 t1 = dw_apb_clocksource_read(clocksource_apbt);
528 rdtscll(start); 280 rdtscll(start);
529 281
530 /* 282 /*
@@ -539,10 +291,10 @@ static int apbt_clocksource_register(void)
539 } while ((now - start) < 200000UL); 291 } while ((now - start) < 200000UL);
540 292
541 /* APBT is the only always on clocksource, it has to work! */ 293 /* APBT is the only always on clocksource, it has to work! */
542 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 294 if (t1 == dw_apb_clocksource_read(clocksource_apbt))
543 panic("APBT counter not counting. APBT disabled\n"); 295 panic("APBT counter not counting. APBT disabled\n");
544 296
545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); 297 dw_apb_clocksource_register(clocksource_apbt);
546 298
547 return 0; 299 return 0;
548} 300}
@@ -566,10 +318,7 @@ void __init apbt_time_init(void)
566 if (apb_timer_block_enabled) 318 if (apb_timer_block_enabled)
567 return; 319 return;
568 apbt_set_mapping(); 320 apbt_set_mapping();
569 if (apbt_virt_address) { 321 if (!apbt_virt_address)
570 pr_debug("Found APBT version 0x%lx\n",\
571 apbt_readl_reg(APBTMRS_COMP_VERSION));
572 } else
573 goto out_noapbt; 322 goto out_noapbt;
574 /* 323 /*
575 * Read the frequency and check for a sane value, for ESL model 324 * Read the frequency and check for a sane value, for ESL model
@@ -577,7 +326,7 @@ void __init apbt_time_init(void)
577 */ 326 */
578 327
579 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { 328 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
580 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); 329 pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
581 goto out_noapbt; 330 goto out_noapbt;
582 } 331 }
583 if (apbt_clocksource_register()) { 332 if (apbt_clocksource_register()) {
@@ -603,30 +352,20 @@ void __init apbt_time_init(void)
603 } else { 352 } else {
604 percpu_timer = 0; 353 percpu_timer = 0;
605 apbt_num_timers_used = 1; 354 apbt_num_timers_used = 1;
606 adev = &per_cpu(cpu_apbt_dev, 0);
607 adev->flags &= ~APBT_DEV_USED;
608 } 355 }
609 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); 356 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
610 357
611 /* here we set up per CPU timer data structure */ 358 /* here we set up per CPU timer data structure */
612 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
613 GFP_KERNEL);
614 if (!apbt_devs) {
615 printk(KERN_ERR "Failed to allocate APB timer devices\n");
616 return;
617 }
618 for (i = 0; i < apbt_num_timers_used; i++) { 359 for (i = 0; i < apbt_num_timers_used; i++) {
619 adev = &per_cpu(cpu_apbt_dev, i); 360 adev = &per_cpu(cpu_apbt_dev, i);
620 adev->num = i; 361 adev->num = i;
621 adev->cpu = i; 362 adev->cpu = i;
622 p_mtmr = sfi_get_mtmr(i); 363 p_mtmr = sfi_get_mtmr(i);
623 if (p_mtmr) { 364 if (p_mtmr)
624 adev->tick = p_mtmr->freq_hz;
625 adev->irq = p_mtmr->irq; 365 adev->irq = p_mtmr->irq;
626 } else 366 else
627 printk(KERN_ERR "Failed to get timer for cpu %d\n", i); 367 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
628 adev->count = 0; 368 snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
629 sprintf(adev->name, "apbt%d", i);
630 } 369 }
631#endif 370#endif
632 371
@@ -638,17 +377,8 @@ out_noapbt:
638 panic("failed to enable APB timer\n"); 377 panic("failed to enable APB timer\n");
639} 378}
640 379
641static inline void apbt_disable(int n)
642{
643 if (is_apbt_capable()) {
644 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
645 ctrl &= ~APBTMR_CONTROL_ENABLE;
646 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
647 }
648}
649
650/* called before apb_timer_enable, use early map */ 380/* called before apb_timer_enable, use early map */
651unsigned long apbt_quick_calibrate() 381unsigned long apbt_quick_calibrate(void)
652{ 382{
653 int i, scale; 383 int i, scale;
654 u64 old, new; 384 u64 old, new;
@@ -657,31 +387,31 @@ unsigned long apbt_quick_calibrate()
657 u32 loop, shift; 387 u32 loop, shift;
658 388
659 apbt_set_mapping(); 389 apbt_set_mapping();
660 apbt_start_counter(phy_cs_timer_id); 390 dw_apb_clocksource_start(clocksource_apbt);
661 391
662 /* check if the timer can count down, otherwise return */ 392 /* check if the timer can count down, otherwise return */
663 old = apbt_read_clocksource(&clocksource_apbt); 393 old = dw_apb_clocksource_read(clocksource_apbt);
664 i = 10000; 394 i = 10000;
665 while (--i) { 395 while (--i) {
666 if (old != apbt_read_clocksource(&clocksource_apbt)) 396 if (old != dw_apb_clocksource_read(clocksource_apbt))
667 break; 397 break;
668 } 398 }
669 if (!i) 399 if (!i)
670 goto failed; 400 goto failed;
671 401
672 /* count 16 ms */ 402 /* count 16 ms */
673 loop = (apbt_freq * 1000) << 4; 403 loop = (apbt_freq / 1000) << 4;
674 404
675 /* restart the timer to ensure it won't get to 0 in the calibration */ 405 /* restart the timer to ensure it won't get to 0 in the calibration */
676 apbt_start_counter(phy_cs_timer_id); 406 dw_apb_clocksource_start(clocksource_apbt);
677 407
678 old = apbt_read_clocksource(&clocksource_apbt); 408 old = dw_apb_clocksource_read(clocksource_apbt);
679 old += loop; 409 old += loop;
680 410
681 t1 = __native_read_tsc(); 411 t1 = __native_read_tsc();
682 412
683 do { 413 do {
684 new = apbt_read_clocksource(&clocksource_apbt); 414 new = dw_apb_clocksource_read(clocksource_apbt);
685 } while (new < old); 415 } while (new < old);
686 416
687 t2 = __native_read_tsc(); 417 t2 = __native_read_tsc();
@@ -693,7 +423,7 @@ unsigned long apbt_quick_calibrate()
693 return 0; 423 return 0;
694 } 424 }
695 scale = (int)div_u64((t2 - t1), loop >> shift); 425 scale = (int)div_u64((t2 - t1), loop >> shift);
696 khz = (scale * apbt_freq * 1000) >> shift; 426 khz = (scale * (apbt_freq / 1000)) >> shift;
697 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); 427 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
698 return khz; 428 return khz;
699failed: 429failed:
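The hunks above convert apb_timer.c from open-coded register accessors to the shared dw_apb_timer driver. A minimal sketch of the resulting registration sequence, using only the dw_apb_* calls visible in the diff; the base address, IRQ and frequency are assumed to come from firmware (SFI) as in apbt_set_mapping(), and the ratings and names are illustrative:

/* Sketch only, under the assumptions stated above. */
#include <linux/dw_apb_timer.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/smp.h>

static void __init example_apbt_register(void __iomem *base, int irq,
					 unsigned long freq)
{
	struct dw_apb_clocksource *cs;
	struct dw_apb_clock_event_device *ced;

	/* One free-running timer channel becomes the clocksource. */
	cs = dw_apb_clocksource_init(250, "apbt-cs", base, freq);
	dw_apb_clocksource_start(cs);
	dw_apb_clocksource_register(cs);

	/* Another channel drives a (per-CPU) clock event device. */
	ced = dw_apb_clockevent_init(smp_processor_id(), "apbt-evt",
				     110, base, irq, freq);
	dw_apb_clockevent_register(ced);
}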
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b9338b8cf420..b24be38c8cf8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
27#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/i8253.h>
30#include <linux/dmar.h> 31#include <linux/dmar.h>
31#include <linux/init.h> 32#include <linux/init.h>
32#include <linux/cpu.h> 33#include <linux/cpu.h>
@@ -39,7 +40,6 @@
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/atomic.h> 41#include <asm/atomic.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
42#include <asm/i8253.h>
43#include <asm/i8259.h> 43#include <asm/i8259.h>
44#include <asm/proto.h> 44#include <asm/proto.h>
45#include <asm/apic.h> 45#include <asm/apic.h>
@@ -48,6 +48,7 @@
48#include <asm/hpet.h> 48#include <asm/hpet.h>
49#include <asm/idle.h> 49#include <asm/idle.h>
50#include <asm/mtrr.h> 50#include <asm/mtrr.h>
51#include <asm/time.h>
51#include <asm/smp.h> 52#include <asm/smp.h>
52#include <asm/mce.h> 53#include <asm/mce.h>
53#include <asm/tsc.h> 54#include <asm/tsc.h>
@@ -1429,7 +1430,7 @@ void enable_x2apic(void)
1429 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1430 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1430 if (!(msr & X2APIC_ENABLE)) { 1431 if (!(msr & X2APIC_ENABLE)) {
1431 printk_once(KERN_INFO "Enabling x2apic\n"); 1432 printk_once(KERN_INFO "Enabling x2apic\n");
1432 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1433 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
1433 } 1434 }
1434} 1435}
1435#endif /* CONFIG_X86_X2APIC */ 1436#endif /* CONFIG_X86_X2APIC */
@@ -1943,10 +1944,28 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1943 1944
1944void __cpuinit generic_processor_info(int apicid, int version) 1945void __cpuinit generic_processor_info(int apicid, int version)
1945{ 1946{
1946 int cpu; 1947 int cpu, max = nr_cpu_ids;
1948 bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
1949 phys_cpu_present_map);
1950
1951 /*
1952 * If boot cpu has not been detected yet, then only allow upto
1953 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
1954 */
1955 if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
1956 apicid != boot_cpu_physical_apicid) {
1957 int thiscpu = max + disabled_cpus - 1;
1958
1959 pr_warning(
1960 "ACPI: NR_CPUS/possible_cpus limit of %i almost"
1961 " reached. Keeping one slot for boot cpu."
1962 " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
1963
1964 disabled_cpus++;
1965 return;
1966 }
1947 1967
1948 if (num_processors >= nr_cpu_ids) { 1968 if (num_processors >= nr_cpu_ids) {
1949 int max = nr_cpu_ids;
1950 int thiscpu = max + disabled_cpus; 1969 int thiscpu = max + disabled_cpus;
1951 1970
1952 pr_warning( 1971 pr_warning(
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e5293394b548..8eb863e27ea6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1295,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
1295 * irq handler will do the explicit EOI to the io-apic. 1295 * irq handler will do the explicit EOI to the io-apic.
1296 */ 1296 */
1297 ir_entry->vector = pin; 1297 ir_entry->vector = pin;
1298
1299 apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
1300 "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
1301 "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
1302 "Avail:%X Vector:%02X Dest:%08X "
1303 "SID:%04X SQ:%X SVT:%X)\n",
1304 apic_id, irte.present, irte.fpd, irte.dst_mode,
1305 irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
1306 irte.avail, irte.vector, irte.dest_id,
1307 irte.sid, irte.sq, irte.svt);
1298 } else { 1308 } else {
1299 entry->delivery_mode = apic->irq_delivery_mode; 1309 entry->delivery_mode = apic->irq_delivery_mode;
1300 entry->dest_mode = apic->irq_dest_mode; 1310 entry->dest_mode = apic->irq_dest_mode;
@@ -1337,9 +1347,9 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1337 1347
1338 apic_printk(APIC_VERBOSE,KERN_DEBUG 1348 apic_printk(APIC_VERBOSE,KERN_DEBUG
1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1349 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1340 "IRQ %d Mode:%i Active:%i)\n", 1350 "IRQ %d Mode:%i Active:%i Dest:%d)\n",
1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, 1351 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1342 irq, trigger, polarity); 1352 irq, trigger, polarity, dest);
1343 1353
1344 1354
1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, 1355 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
@@ -1522,10 +1532,12 @@ __apicdebuginit(void) print_IO_APIC(void)
1522 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); 1532 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1523 1533
1524 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 1534 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1525 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1535 printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
1536 reg_01.bits.entries);
1526 1537
1527 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1538 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1528 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1539 printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
1540 reg_01.bits.version);
1529 1541
1530 /* 1542 /*
1531 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, 1543 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1550,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void)
1550 1562
1551 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1563 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1552 1564
1553 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1565 if (intr_remapping_enabled) {
1554 " Stat Dmod Deli Vect:\n"); 1566 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
1567 " Pol Stat Indx2 Zero Vect:\n");
1568 } else {
1569 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1570 " Stat Dmod Deli Vect:\n");
1571 }
1555 1572
1556 for (i = 0; i <= reg_01.bits.entries; i++) { 1573 for (i = 0; i <= reg_01.bits.entries; i++) {
1557 struct IO_APIC_route_entry entry; 1574 if (intr_remapping_enabled) {
1558 1575 struct IO_APIC_route_entry entry;
1559 entry = ioapic_read_entry(apic, i); 1576 struct IR_IO_APIC_route_entry *ir_entry;
1560 1577
1561 printk(KERN_DEBUG " %02x %03X ", 1578 entry = ioapic_read_entry(apic, i);
1562 i, 1579 ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
1563 entry.dest 1580 printk(KERN_DEBUG " %02x %04X ",
1564 ); 1581 i,
1582 ir_entry->index
1583 );
1584 printk("%1d %1d %1d %1d %1d "
1585 "%1d %1d %X %02X\n",
1586 ir_entry->format,
1587 ir_entry->mask,
1588 ir_entry->trigger,
1589 ir_entry->irr,
1590 ir_entry->polarity,
1591 ir_entry->delivery_status,
1592 ir_entry->index2,
1593 ir_entry->zero,
1594 ir_entry->vector
1595 );
1596 } else {
1597 struct IO_APIC_route_entry entry;
1565 1598
1566 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", 1599 entry = ioapic_read_entry(apic, i);
1567 entry.mask, 1600 printk(KERN_DEBUG " %02x %02X ",
1568 entry.trigger, 1601 i,
1569 entry.irr, 1602 entry.dest
1570 entry.polarity, 1603 );
1571 entry.delivery_status, 1604 printk("%1d %1d %1d %1d %1d "
1572 entry.dest_mode, 1605 "%1d %1d %02X\n",
1573 entry.delivery_mode, 1606 entry.mask,
1574 entry.vector 1607 entry.trigger,
1575 ); 1608 entry.irr,
1609 entry.polarity,
1610 entry.delivery_status,
1611 entry.dest_mode,
1612 entry.delivery_mode,
1613 entry.vector
1614 );
1615 }
1576 } 1616 }
1577 } 1617 }
1618
1578 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1619 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1579 for_each_active_irq(irq) { 1620 for_each_active_irq(irq) {
1580 struct irq_pin_list *entry; 1621 struct irq_pin_list *entry;
@@ -1792,7 +1833,7 @@ __apicdebuginit(int) print_ICs(void)
1792 return 0; 1833 return 0;
1793} 1834}
1794 1835
1795fs_initcall(print_ICs); 1836late_initcall(print_ICs);
1796 1837
1797 1838
1798/* Where if anywhere is the i8259 connect in external int mode */ 1839/* Where if anywhere is the i8259 connect in external int mode */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 965a7666c283..0371c484bb8a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,11 +229,11 @@
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h> 230#include <linux/acpi.h>
231#include <linux/syscore_ops.h> 231#include <linux/syscore_ops.h>
232#include <linux/i8253.h>
232 233
233#include <asm/system.h> 234#include <asm/system.h>
234#include <asm/uaccess.h> 235#include <asm/uaccess.h>
235#include <asm/desc.h> 236#include <asm/desc.h>
236#include <asm/i8253.h>
237#include <asm/olpc.h> 237#include <asm/olpc.h>
238#include <asm/paravirt.h> 238#include <asm/paravirt.h>
239#include <asm/reboot.h> 239#include <asm/reboot.h>
@@ -1220,11 +1220,11 @@ static void reinit_timer(void)
1220 1220
1221 raw_spin_lock_irqsave(&i8253_lock, flags); 1221 raw_spin_lock_irqsave(&i8253_lock, flags);
1222 /* set the clock to HZ */ 1222 /* set the clock to HZ */
1223 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1223 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1224 udelay(10); 1224 udelay(10);
1225 outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */ 1225 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
1226 udelay(10); 1226 udelay(10);
1227 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ 1227 outb_p(LATCH >> 8, PIT_CH0); /* MSB */
1228 udelay(10); 1228 udelay(10);
1229 raw_spin_unlock_irqrestore(&i8253_lock, flags); 1229 raw_spin_unlock_irqrestore(&i8253_lock, flags);
1230#endif 1230#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6fc..395a10e68067 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
63 BLANK(); 63 BLANK();
64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
65 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); 65 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
66 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
67 66
68 BLANK(); 67 BLANK();
69 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 68 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8a..755f64fb0743 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
32 */ 32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] = 33static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM 35#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm, 36 &x86_hyper_xen_hvm,
39#endif 37#endif
38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv,
40}; 40};
41 41
42const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1edf5ba4fb2b..ed6086eedf1d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
456 456
457 if (cpu_has(c, X86_FEATURE_VMX)) 457 if (cpu_has(c, X86_FEATURE_VMX))
458 detect_vmx_virtcap(c); 458 detect_vmx_virtcap(c);
459
460 /*
461 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
462 * x86_energy_perf_policy(8) is available to change it at run-time
463 */
464 if (cpu_has(c, X86_FEATURE_EPB)) {
465 u64 epb;
466
467 rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
468 if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
469 printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
470 " Set to 'normal', was 'performance'\n"
471 "ENERGY_PERF_BIAS: View and update with"
472 " x86_energy_perf_policy(8)\n");
473 epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
474 wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
475 }
476 }
459} 477}
460 478
461#ifdef CONFIG_X86_32 479#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336a..7395d5f4272d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
43 unsigned char covered; 43 unsigned char covered;
44 char *msg; 44 char *msg;
45} severities[] = { 45} severities[] = {
46#define KERNEL .context = IN_KERNEL 46#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
47#define USER .context = IN_USER 47#define KERNEL .context = IN_KERNEL
48#define SER .ser = SER_REQUIRED 48#define USER .context = IN_USER
49#define NOSER .ser = NO_SER 49#define SER .ser = SER_REQUIRED
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY 50#define NOSER .ser = NO_SER
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } 51#define BITCLR(x) .mask = x, .result = 0
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } 52#define BITSET(x) .mask = x, .result = x
53#define MCGMASK(x, res, s, m, r...) \ 53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } 54#define MASK(x, y) .mask = x, .result = y
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff 57#define MCACOD 0xffff
60 58
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"), 59 MCESEV(
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"), 60 NO, "Invalid",
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), 61 BITCLR(MCI_STATUS_VAL)
62 ),
63 MCESEV(
64 NO, "Not enabled",
65 BITCLR(MCI_STATUS_EN)
66 ),
67 MCESEV(
68 PANIC, "Processor context corrupt",
69 BITSET(MCI_STATUS_PCC)
70 ),
64 /* When MCIP is not set something is very confused */ 71 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), 72 MCESEV(
73 PANIC, "MCIP not set in MCA handler",
74 MCGMASK(MCG_STATUS_MCIP, 0)
75 ),
66 /* Neither return not error IP -- no chance to recover -> PANIC */ 76 /* Neither return not error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, 77 MCESEV(
68 "Neither restart nor error IP"), 78 PANIC, "Neither restart nor error IP",
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", 79 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
70 KERNEL), 80 ),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), 81 MCESEV(
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, 82 PANIC, "In kernel and no restart IP",
73 "Spurious not enabled", SER), 83 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
84 ),
85 MCESEV(
86 KEEP, "Corrected error",
87 NOSER, BITCLR(MCI_STATUS_UC)
88 ),
74 89
75 /* ignore OVER for UCNA */ 90 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, 91 MCESEV(
77 "Uncorrected no action required", SER), 92 KEEP, "Uncorrected no action required",
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, 93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
79 "Illegal combination (UCNA with AR=1)", SER), 94 ),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), 95 MCESEV(
96 PANIC, "Illegal combination (UCNA with AR=1)",
97 SER,
98 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
99 ),
100 MCESEV(
101 KEEP, "Non signalled machine check",
102 SER, BITCLR(MCI_STATUS_S)
103 ),
81 104
82 /* AR add known MCACODs here */ 105 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, 106 MCESEV(
84 "Action required with lost events", SER), 107 PANIC, "Action required with lost events",
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, 108 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
86 "Action required; unknown MCACOD", SER), 109 ),
110 MCESEV(
111 PANIC, "Action required: unknown MCACOD",
112 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
113 ),
87 114
88 /* known AO MCACODs: */ 115 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, 116 MCESEV(
90 "Action optional: memory scrubbing error", SER), 117 AO, "Action optional: memory scrubbing error",
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, 118 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
92 "Action optional: last level cache writeback error", SER), 119 ),
93 120 MCESEV(
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, 121 AO, "Action optional: last level cache writeback error",
95 "Action optional unknown MCACOD", SER), 122 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, 123 ),
97 "Action optional with lost events", SER), 124 MCESEV(
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), 125 SOME, "Action optional: unknown MCACOD",
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"), 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
100 BITSET(0, SOME, "No match") /* always matches. keep at end */ 127 ),
128 MCESEV(
129 SOME, "Action optional with lost events",
130 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
131 ),
132
133 MCESEV(
134 PANIC, "Overflowed uncorrected",
135 BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
136 ),
137 MCESEV(
138 UC, "Uncorrected",
139 BITSET(MCI_STATUS_UC)
140 ),
141 MCESEV(
142 SOME, "No match",
143 BITSET(0)
144 ) /* always matches. keep at end */
101}; 145};
102 146
103/* 147/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
112 return IN_KERNEL; 156 return IN_KERNEL;
113} 157}
114 158
115int mce_severity(struct mce *a, int tolerant, char **msg) 159int mce_severity(struct mce *m, int tolerant, char **msg)
116{ 160{
117 enum context ctx = error_context(a); 161 enum context ctx = error_context(m);
118 struct severity *s; 162 struct severity *s;
119 163
120 for (s = severities;; s++) { 164 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result) 165 if ((m->status & s->mask) != s->result)
122 continue; 166 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres) 167 if ((m->mcgstatus & s->mcgmask) != s->mcgres)
124 continue; 168 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser) 169 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue; 170 continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
197 241
198static int __init severities_debugfs_init(void) 242static int __init severities_debugfs_init(void)
199{ 243{
200 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 244 struct dentry *dmce, *fsev;
201 245
202 dmce = mce_get_debugfs_dir(); 246 dmce = mce_get_debugfs_dir();
203 if (dmce == NULL) 247 if (!dmce)
204 goto err_out; 248 goto err_out;
205 fseverities_coverage = debugfs_create_file("severities-coverage", 249
206 0444, dmce, NULL, 250 fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
207 &severities_coverage_fops); 251 &severities_coverage_fops);
208 if (fseverities_coverage == NULL) 252 if (!fsev)
209 goto err_out; 253 goto err_out;
210 254
211 return 0; 255 return 0;
@@ -214,4 +258,4 @@ err_out:
214 return -ENOMEM; 258 return -ENOMEM;
215} 259}
216late_initcall(severities_debugfs_init); 260late_initcall(severities_debugfs_init);
217#endif 261#endif /* CONFIG_DEBUG_FS */
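
An aside on the table-driven matching that both the old MASK()/MCGMASK() entries and the new MCESEV() entries express: an entry matches when (status & mask) == result, subject to the optional MCG_STATUS, kernel-context and SER_REQUIRED conditions, and the first matching entry wins, which is why the catch-all BITSET(0) entry has to stay at the end. A minimal userspace sketch of the idea, with simplified, illustrative field names and bit values rather than the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified, illustrative rule table: not the kernel's field layout. */
    /* Bit 62 stands in for "overflow", bit 61 for "uncorrected".          */
    #define ST_OVER (1ULL << 62)
    #define ST_UC   (1ULL << 61)

    enum severity { SEV_SOME, SEV_UC, SEV_PANIC };

    struct sev_rule {
        uint64_t mask;      /* bits of the status word to inspect */
        uint64_t result;    /* value those bits must have         */
        enum severity sev;
        const char *msg;
    };

    static const struct sev_rule rules[] = {
        { ST_OVER | ST_UC, ST_OVER | ST_UC, SEV_PANIC, "overflowed uncorrected" },
        { ST_UC,           ST_UC,           SEV_UC,    "uncorrected"            },
        { 0,               0,               SEV_SOME,  "no match" }, /* catch-all: keep last */
    };

    static enum severity classify(uint64_t status, const char **msg)
    {
        /* First match wins; the catch-all entry guarantees termination. */
        for (int i = 0; ; i++) {
            if ((status & rules[i].mask) != rules[i].result)
                continue;
            *msg = rules[i].msg;
            return rules[i].sev;
        }
    }

    int main(void)
    {
        const char *msg;
        enum severity sev = classify(ST_UC, &msg);

        printf("severity %d: %s\n", sev, msg);
        return 0;
    }

The MCESEV() rewrite above changes only how such entries are written down (severity and message first, conditions after), not this matching model.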
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b6464d..08363b042122 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
10#include <linux/thread_info.h> 10#include <linux/thread_info.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/miscdevice.h> 12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h> 13#include <linux/ratelimit.h>
15#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
16#include <linux/rcupdate.h> 15#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
38#include <linux/mm.h> 37#include <linux/mm.h>
39#include <linux/debugfs.h> 38#include <linux/debugfs.h>
40#include <linux/edac_mce.h> 39#include <linux/edac_mce.h>
40#include <linux/irq_work.h>
41 41
42#include <asm/processor.h> 42#include <asm/processor.h>
43#include <asm/hw_irq.h>
44#include <asm/apic.h>
45#include <asm/idle.h>
46#include <asm/ipi.h>
47#include <asm/mce.h> 43#include <asm/mce.h>
48#include <asm/msr.h> 44#include <asm/msr.h>
49 45
50#include "mce-internal.h" 46#include "mce-internal.h"
51 47
52static DEFINE_MUTEX(mce_read_mutex); 48static DEFINE_MUTEX(mce_chrdev_read_mutex);
53 49
54#define rcu_dereference_check_mce(p) \ 50#define rcu_dereference_check_mce(p) \
55 rcu_dereference_index_check((p), \ 51 rcu_dereference_index_check((p), \
56 rcu_read_lock_sched_held() || \ 52 rcu_read_lock_sched_held() || \
57 lockdep_is_held(&mce_read_mutex)) 53 lockdep_is_held(&mce_chrdev_read_mutex))
58 54
59#define CREATE_TRACE_POINTS 55#define CREATE_TRACE_POINTS
60#include <trace/events/mce.h> 56#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
94static char mce_helper[128]; 90static char mce_helper[128];
95static char *mce_helper_argv[2] = { mce_helper, NULL }; 91static char *mce_helper_argv[2] = { mce_helper, NULL };
96 92
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
94
98static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 96static int cpu_missing;
100 97
@@ -373,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
373} 370}
374 371
375/* 372/*
373 * Collect all global (w.r.t. this processor) status about this machine
374 * check into our "mce" struct so that we can use it later to assess
375 * the severity of the problem as we read per-bank specific details.
376 */
377static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
378{
379 mce_setup(m);
380
381 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
382 if (regs) {
383 /*
384 * Get the address of the instruction at the time of
385 * the machine check error.
386 */
387 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
388 m->ip = regs->ip;
389 m->cs = regs->cs;
390 }
391 /* Use accurate RIP reporting if available. */
392 if (rip_msr)
393 m->ip = mce_rdmsrl(rip_msr);
394 }
395}
396
397/*
376 * Simple lockless ring to communicate PFNs from the exception handler with the 398 * Simple lockless ring to communicate PFNs from the exception handler with the
377 * process context work function. This is vastly simplified because there's 399 * process context work function. This is vastly simplified because there's
378 * only a single reader and a single writer. 400 * only a single reader and a single writer.
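
The "simple lockless ring" referred to in that comment works precisely because there is exactly one producer (the exception handler) and one consumer (the process-context work function), so each index is only ever written by one side. A minimal single-producer/single-consumer ring in plain C11, with illustrative names and sizes and none of the kernel's NMI-nesting concerns, to show why no lock is needed:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define RING_SIZE 16    /* power of two, illustrative */

    struct pfn_ring {
        uint64_t slots[RING_SIZE];
        _Atomic unsigned head;  /* written only by the producer */
        _Atomic unsigned tail;  /* written only by the consumer */
    };

    /* Producer side: returns false when the ring is full (event dropped). */
    static bool ring_add(struct pfn_ring *r, uint64_t pfn)
    {
        unsigned head = atomic_load_explicit(&r->head, memory_order_relaxed);
        unsigned tail = atomic_load_explicit(&r->tail, memory_order_acquire);

        if (head - tail == RING_SIZE)
            return false;
        r->slots[head % RING_SIZE] = pfn;
        /* publish the slot contents before moving head forward */
        atomic_store_explicit(&r->head, head + 1, memory_order_release);
        return true;
    }

    /* Consumer side: returns false when the ring is empty. */
    static bool ring_get(struct pfn_ring *r, uint64_t *pfn)
    {
        unsigned tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
        unsigned head = atomic_load_explicit(&r->head, memory_order_acquire);

        if (tail == head)
            return false;
        *pfn = r->slots[tail % RING_SIZE];
        atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
        return true;
    }

    static struct pfn_ring ring;

    int main(void)
    {
        uint64_t pfn = 0;

        ring_add(&ring, 0x1234);
        return (ring_get(&ring, &pfn) && pfn == 0x1234) ? 0 : 1;
    }

The free-running head/tail indices (reduced modulo the power-of-two size only on access) are a common way to distinguish "full" from "empty" without wasting a slot.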
@@ -443,40 +465,13 @@ static void mce_schedule_work(void)
443 } 465 }
444} 466}
445 467
446/* 468DEFINE_PER_CPU(struct irq_work, mce_irq_work);
447 * Get the address of the instruction at the time of the machine check
448 * error.
449 */
450static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
451{
452
453 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
454 m->ip = regs->ip;
455 m->cs = regs->cs;
456 } else {
457 m->ip = 0;
458 m->cs = 0;
459 }
460 if (rip_msr)
461 m->ip = mce_rdmsrl(rip_msr);
462}
463 469
464#ifdef CONFIG_X86_LOCAL_APIC 470static void mce_irq_work_cb(struct irq_work *entry)
465/*
466 * Called after interrupts have been reenabled again
467 * when a MCE happened during an interrupts off region
468 * in the kernel.
469 */
470asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
471{ 471{
472 ack_APIC_irq();
473 exit_idle();
474 irq_enter();
475 mce_notify_irq(); 472 mce_notify_irq();
476 mce_schedule_work(); 473 mce_schedule_work();
477 irq_exit();
478} 474}
479#endif
480 475
481static void mce_report_event(struct pt_regs *regs) 476static void mce_report_event(struct pt_regs *regs)
482{ 477{
@@ -492,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
492 return; 487 return;
493 } 488 }
494 489
495#ifdef CONFIG_X86_LOCAL_APIC 490 irq_work_queue(&__get_cpu_var(mce_irq_work));
496 /*
497 * Without APIC do not notify. The event will be picked
498 * up eventually.
499 */
500 if (!cpu_has_apic)
501 return;
502
503 /*
504 * When interrupts are disabled we cannot use
505 * kernel services safely. Trigger an self interrupt
506 * through the APIC to instead do the notification
507 * after interrupts are reenabled again.
508 */
509 apic->send_IPI_self(MCE_SELF_VECTOR);
510
511 /*
512 * Wait for idle afterwards again so that we don't leave the
513 * APIC in a non idle state because the normal APIC writes
514 * cannot exclude us.
515 */
516 apic_wait_icr_idle();
517#endif
518} 491}
519 492
520DEFINE_PER_CPU(unsigned, mce_poll_count); 493DEFINE_PER_CPU(unsigned, mce_poll_count);
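
The switch from the hand-rolled self-IPI to irq_work above boils down to one idea: from a context where the real work cannot be done safely, just record that work is pending and let a safe context pick it up later. A rough userspace analogue of that deferral pattern only (not the kernel's irq_work API), using a signal handler as the "unsafe" context and the main loop as the "safe" one; all names here are made up for the sketch:

    #include <signal.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static atomic_bool work_pending;

    /* "Unsafe" context: only note that something needs doing. */
    static void on_alarm(int sig)
    {
        (void)sig;
        atomic_store(&work_pending, true);
    }

    /* "Safe" context: drain the pending flag and do the real work. */
    static void do_pending_work(void)
    {
        if (atomic_exchange(&work_pending, false))
            printf("handling deferred event\n");
    }

    int main(void)
    {
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = on_alarm;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGALRM, &sa, NULL);
        alarm(1);

        for (int i = 0; i < 3; i++) {
            sleep(1);           /* stand-in for "until the next safe point" */
            do_pending_work();
        }
        return 0;
    }

The test-and-clear in do_pending_work() plays the same role as the test_and_clear_bit() in mce_notify_irq(): the flag can be set any number of times before the safe context runs, but the work is done once per drain.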
@@ -541,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
541 514
542 percpu_inc(mce_poll_count); 515 percpu_inc(mce_poll_count);
543 516
544 mce_setup(&m); 517 mce_gather_info(&m, NULL);
545 518
546 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
547 for (i = 0; i < banks; i++) { 519 for (i = 0; i < banks; i++) {
548 if (!mce_banks[i].ctl || !test_bit(i, *b)) 520 if (!mce_banks[i].ctl || !test_bit(i, *b))
549 continue; 521 continue;
@@ -879,9 +851,9 @@ static int mce_usable_address(struct mce *m)
879{ 851{
880 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 852 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
881 return 0; 853 return 0;
882 if ((m->misc & 0x3f) > PAGE_SHIFT) 854 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
883 return 0; 855 return 0;
884 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 856 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
885 return 0; 857 return 0;
886 return 1; 858 return 1;
887} 859}
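
The mce_usable_address() change above replaces open-coded shifts with named accessors; from the removed lines, the address LSB lives in bits 0-5 of MCi_MISC and the address mode in bits 6-8. A sketch of equivalent extraction helpers, where the surrounding test program and the exact value used for "physical address" are assumptions for illustration, not taken from the kernel headers:

    #include <stdint.h>
    #include <stdio.h>

    /* Field layout taken from the open-coded version being removed:     */
    /*   bits 0-5: least significant valid bit of the recorded address   */
    /*   bits 6-8: address mode                                          */
    #define ADDR_LSB(misc)   ((misc) & 0x3f)
    #define ADDR_MODE(misc)  (((misc) >> 6) & 0x7)
    #define ADDR_MODE_PHYS   2   /* assumed value, standing in for MCI_MISC_ADDR_PHYS */

    #define PAGE_SHIFT       12

    static int usable_address(uint64_t misc)
    {
        if (ADDR_LSB(misc) > PAGE_SHIFT)
            return 0;           /* address too imprecise to name a page */
        if (ADDR_MODE(misc) != ADDR_MODE_PHYS)
            return 0;           /* only physical addresses are usable here */
        return 1;
    }

    int main(void)
    {
        uint64_t misc = ((uint64_t)ADDR_MODE_PHYS << 6) | 6;  /* physical, valid down to bit 6 */

        printf("usable: %d\n", usable_address(misc));
        return 0;
    }

Naming the fields makes the two checks read as policy ("is the address precise enough, and is it physical?") rather than as magic masks.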
@@ -942,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
942 if (!banks) 914 if (!banks)
943 goto out; 915 goto out;
944 916
945 mce_setup(&m); 917 mce_gather_info(&m, regs);
946 918
947 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
948 final = &__get_cpu_var(mces_seen); 919 final = &__get_cpu_var(mces_seen);
949 *final = m; 920 *final = m;
950 921
@@ -1028,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1028 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 999 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1029 mce_ring_add(m.addr >> PAGE_SHIFT); 1000 mce_ring_add(m.addr >> PAGE_SHIFT);
1030 1001
1031 mce_get_rip(&m, regs);
1032 mce_log(&m); 1002 mce_log(&m);
1033 1003
1034 if (severity > worst) { 1004 if (severity > worst) {
@@ -1190,7 +1160,8 @@ int mce_notify_irq(void)
1190 clear_thread_flag(TIF_MCE_NOTIFY); 1160 clear_thread_flag(TIF_MCE_NOTIFY);
1191 1161
1192 if (test_and_clear_bit(0, &mce_need_notify)) { 1162 if (test_and_clear_bit(0, &mce_need_notify)) {
1193 wake_up_interruptible(&mce_wait); 1163 /* wake processes polling /dev/mcelog */
1164 wake_up_interruptible(&mce_chrdev_wait);
1194 1165
1195 /* 1166 /*
1196 * There is no risk of missing notifications because 1167 * There is no risk of missing notifications because
@@ -1363,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1363 return 0; 1334 return 0;
1364} 1335}
1365 1336
1366static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1337static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1367{ 1338{
1368 if (c->x86 != 5) 1339 if (c->x86 != 5)
1369 return; 1340 return 0;
1341
1370 switch (c->x86_vendor) { 1342 switch (c->x86_vendor) {
1371 case X86_VENDOR_INTEL: 1343 case X86_VENDOR_INTEL:
1372 intel_p5_mcheck_init(c); 1344 intel_p5_mcheck_init(c);
1345 return 1;
1373 break; 1346 break;
1374 case X86_VENDOR_CENTAUR: 1347 case X86_VENDOR_CENTAUR:
1375 winchip_mcheck_init(c); 1348 winchip_mcheck_init(c);
1349 return 1;
1376 break; 1350 break;
1377 } 1351 }
1352
1353 return 0;
1378} 1354}
1379 1355
1380static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1356static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1428,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1428 if (mce_disabled) 1404 if (mce_disabled)
1429 return; 1405 return;
1430 1406
1431 __mcheck_cpu_ancient_init(c); 1407 if (__mcheck_cpu_ancient_init(c))
1408 return;
1432 1409
1433 if (!mce_available(c)) 1410 if (!mce_available(c))
1434 return; 1411 return;
@@ -1444,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1444 __mcheck_cpu_init_vendor(c); 1421 __mcheck_cpu_init_vendor(c);
1445 __mcheck_cpu_init_timer(); 1422 __mcheck_cpu_init_timer();
1446 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447 1424 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1448} 1425}
1449 1426
1450/* 1427/*
1451 * Character device to read and clear the MCE log. 1428 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1452 */ 1429 */
1453 1430
1454static DEFINE_SPINLOCK(mce_state_lock); 1431static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1455static int open_count; /* #times opened */ 1432static int mce_chrdev_open_count; /* #times opened */
1456static int open_exclu; /* already open exclusive? */ 1433static int mce_chrdev_open_exclu; /* already open exclusive? */
1457 1434
1458static int mce_open(struct inode *inode, struct file *file) 1435static int mce_chrdev_open(struct inode *inode, struct file *file)
1459{ 1436{
1460 spin_lock(&mce_state_lock); 1437 spin_lock(&mce_chrdev_state_lock);
1461 1438
1462 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1439 if (mce_chrdev_open_exclu ||
1463 spin_unlock(&mce_state_lock); 1440 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1441 spin_unlock(&mce_chrdev_state_lock);
1464 1442
1465 return -EBUSY; 1443 return -EBUSY;
1466 } 1444 }
1467 1445
1468 if (file->f_flags & O_EXCL) 1446 if (file->f_flags & O_EXCL)
1469 open_exclu = 1; 1447 mce_chrdev_open_exclu = 1;
1470 open_count++; 1448 mce_chrdev_open_count++;
1471 1449
1472 spin_unlock(&mce_state_lock); 1450 spin_unlock(&mce_chrdev_state_lock);
1473 1451
1474 return nonseekable_open(inode, file); 1452 return nonseekable_open(inode, file);
1475} 1453}
1476 1454
1477static int mce_release(struct inode *inode, struct file *file) 1455static int mce_chrdev_release(struct inode *inode, struct file *file)
1478{ 1456{
1479 spin_lock(&mce_state_lock); 1457 spin_lock(&mce_chrdev_state_lock);
1480 1458
1481 open_count--; 1459 mce_chrdev_open_count--;
1482 open_exclu = 0; 1460 mce_chrdev_open_exclu = 0;
1483 1461
1484 spin_unlock(&mce_state_lock); 1462 spin_unlock(&mce_chrdev_state_lock);
1485 1463
1486 return 0; 1464 return 0;
1487} 1465}
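
The renamed open/release pair above implements a small policy worth spelling out: any number of plain opens may coexist, but an O_EXCL open only succeeds when nobody else has the device open, and while an exclusive open is active every other open fails with -EBUSY. A userspace sketch of the same accounting with a mutex in place of the spinlock; function and variable names are illustrative:

    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static int  open_count;     /* number of current opens           */
    static bool open_exclu;     /* is one of them an exclusive open? */

    static int log_open(bool exclusive)
    {
        int ret = 0;

        pthread_mutex_lock(&state_lock);
        if (open_exclu || (open_count && exclusive)) {
            /* held exclusively, or we want exclusivity and it is already open */
            ret = -EBUSY;
        } else {
            if (exclusive)
                open_exclu = true;
            open_count++;
        }
        pthread_mutex_unlock(&state_lock);
        return ret;
    }

    static void log_release(void)
    {
        pthread_mutex_lock(&state_lock);
        open_count--;
        open_exclu = false;
        pthread_mutex_unlock(&state_lock);
    }

    int main(void)
    {
        int a = log_open(false);    /* plain open: ok                 */
        int b = log_open(true);     /* exclusive while open: refused  */

        log_release();
        return (a == 0 && b == -EBUSY) ? 0 : 1;
    }

The rename in the diff does not change this logic; it only groups the state and helpers under the mce_chrdev_* prefix.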
@@ -1530,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1530 return 0; 1508 return 0;
1531} 1509}
1532 1510
1533static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1511static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1534 loff_t *off) 1512 size_t usize, loff_t *off)
1535{ 1513{
1536 char __user *buf = ubuf; 1514 char __user *buf = ubuf;
1537 unsigned long *cpu_tsc; 1515 unsigned long *cpu_tsc;
@@ -1542,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1542 if (!cpu_tsc) 1520 if (!cpu_tsc)
1543 return -ENOMEM; 1521 return -ENOMEM;
1544 1522
1545 mutex_lock(&mce_read_mutex); 1523 mutex_lock(&mce_chrdev_read_mutex);
1546 1524
1547 if (!mce_apei_read_done) { 1525 if (!mce_apei_read_done) {
1548 err = __mce_read_apei(&buf, usize); 1526 err = __mce_read_apei(&buf, usize);
@@ -1562,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1562 do { 1540 do {
1563 for (i = prev; i < next; i++) { 1541 for (i = prev; i < next; i++) {
1564 unsigned long start = jiffies; 1542 unsigned long start = jiffies;
1543 struct mce *m = &mcelog.entry[i];
1565 1544
1566 while (!mcelog.entry[i].finished) { 1545 while (!m->finished) {
1567 if (time_after_eq(jiffies, start + 2)) { 1546 if (time_after_eq(jiffies, start + 2)) {
1568 memset(mcelog.entry + i, 0, 1547 memset(m, 0, sizeof(*m));
1569 sizeof(struct mce));
1570 goto timeout; 1548 goto timeout;
1571 } 1549 }
1572 cpu_relax(); 1550 cpu_relax();
1573 } 1551 }
1574 smp_rmb(); 1552 smp_rmb();
1575 err |= copy_to_user(buf, mcelog.entry + i, 1553 err |= copy_to_user(buf, m, sizeof(*m));
1576 sizeof(struct mce)); 1554 buf += sizeof(*m);
1577 buf += sizeof(struct mce);
1578timeout: 1555timeout:
1579 ; 1556 ;
1580 } 1557 }
@@ -1594,13 +1571,13 @@ timeout:
1594 on_each_cpu(collect_tscs, cpu_tsc, 1); 1571 on_each_cpu(collect_tscs, cpu_tsc, 1);
1595 1572
1596 for (i = next; i < MCE_LOG_LEN; i++) { 1573 for (i = next; i < MCE_LOG_LEN; i++) {
1597 if (mcelog.entry[i].finished && 1574 struct mce *m = &mcelog.entry[i];
1598 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1575
1599 err |= copy_to_user(buf, mcelog.entry+i, 1576 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1600 sizeof(struct mce)); 1577 err |= copy_to_user(buf, m, sizeof(*m));
1601 smp_rmb(); 1578 smp_rmb();
1602 buf += sizeof(struct mce); 1579 buf += sizeof(*m);
1603 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1580 memset(m, 0, sizeof(*m));
1604 } 1581 }
1605 } 1582 }
1606 1583
@@ -1608,15 +1585,15 @@ timeout:
1608 err = -EFAULT; 1585 err = -EFAULT;
1609 1586
1610out: 1587out:
1611 mutex_unlock(&mce_read_mutex); 1588 mutex_unlock(&mce_chrdev_read_mutex);
1612 kfree(cpu_tsc); 1589 kfree(cpu_tsc);
1613 1590
1614 return err ? err : buf - ubuf; 1591 return err ? err : buf - ubuf;
1615} 1592}
1616 1593
1617static unsigned int mce_poll(struct file *file, poll_table *wait) 1594static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1618{ 1595{
1619 poll_wait(file, &mce_wait, wait); 1596 poll_wait(file, &mce_chrdev_wait, wait);
1620 if (rcu_access_index(mcelog.next)) 1597 if (rcu_access_index(mcelog.next))
1621 return POLLIN | POLLRDNORM; 1598 return POLLIN | POLLRDNORM;
1622 if (!mce_apei_read_done && apei_check_mce()) 1599 if (!mce_apei_read_done && apei_check_mce())
@@ -1624,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1624 return 0; 1601 return 0;
1625} 1602}
1626 1603
1627static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1604static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1605 unsigned long arg)
1628{ 1606{
1629 int __user *p = (int __user *)arg; 1607 int __user *p = (int __user *)arg;
1630 1608
@@ -1652,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1652 1630
1653/* Modified in mce-inject.c, so not static or const */ 1631/* Modified in mce-inject.c, so not static or const */
1654struct file_operations mce_chrdev_ops = { 1632struct file_operations mce_chrdev_ops = {
1655 .open = mce_open, 1633 .open = mce_chrdev_open,
1656 .release = mce_release, 1634 .release = mce_chrdev_release,
1657 .read = mce_read, 1635 .read = mce_chrdev_read,
1658 .poll = mce_poll, 1636 .poll = mce_chrdev_poll,
1659 .unlocked_ioctl = mce_ioctl, 1637 .unlocked_ioctl = mce_chrdev_ioctl,
1660 .llseek = no_llseek, 1638 .llseek = no_llseek,
1661}; 1639};
1662EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1640EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1663 1641
1664static struct miscdevice mce_log_device = { 1642static struct miscdevice mce_chrdev_device = {
1665 MISC_MCELOG_MINOR, 1643 MISC_MCELOG_MINOR,
1666 "mcelog", 1644 "mcelog",
1667 &mce_chrdev_ops, 1645 &mce_chrdev_ops,
@@ -1719,7 +1697,7 @@ int __init mcheck_init(void)
1719} 1697}
1720 1698
1721/* 1699/*
1722 * Sysfs support 1700 * mce_syscore: PM support
1723 */ 1701 */
1724 1702
1725/* 1703/*
@@ -1739,12 +1717,12 @@ static int mce_disable_error_reporting(void)
1739 return 0; 1717 return 0;
1740} 1718}
1741 1719
1742static int mce_suspend(void) 1720static int mce_syscore_suspend(void)
1743{ 1721{
1744 return mce_disable_error_reporting(); 1722 return mce_disable_error_reporting();
1745} 1723}
1746 1724
1747static void mce_shutdown(void) 1725static void mce_syscore_shutdown(void)
1748{ 1726{
1749 mce_disable_error_reporting(); 1727 mce_disable_error_reporting();
1750} 1728}
@@ -1754,18 +1732,22 @@ static void mce_shutdown(void)
1754 * Only one CPU is active at this time, the others get re-added later using 1732 * Only one CPU is active at this time, the others get re-added later using
1755 * CPU hotplug: 1733 * CPU hotplug:
1756 */ 1734 */
1757static void mce_resume(void) 1735static void mce_syscore_resume(void)
1758{ 1736{
1759 __mcheck_cpu_init_generic(); 1737 __mcheck_cpu_init_generic();
1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1738 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1761} 1739}
1762 1740
1763static struct syscore_ops mce_syscore_ops = { 1741static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend, 1742 .suspend = mce_syscore_suspend,
1765 .shutdown = mce_shutdown, 1743 .shutdown = mce_syscore_shutdown,
1766 .resume = mce_resume, 1744 .resume = mce_syscore_resume,
1767}; 1745};
1768 1746
1747/*
1748 * mce_sysdev: Sysfs support
1749 */
1750
1769static void mce_cpu_restart(void *data) 1751static void mce_cpu_restart(void *data)
1770{ 1752{
1771 del_timer_sync(&__get_cpu_var(mce_timer)); 1753 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1801,11 +1783,11 @@ static void mce_enable_ce(void *all)
1801 __mcheck_cpu_init_timer(); 1783 __mcheck_cpu_init_timer();
1802} 1784}
1803 1785
1804static struct sysdev_class mce_sysclass = { 1786static struct sysdev_class mce_sysdev_class = {
1805 .name = "machinecheck", 1787 .name = "machinecheck",
1806}; 1788};
1807 1789
1808DEFINE_PER_CPU(struct sys_device, mce_dev); 1790DEFINE_PER_CPU(struct sys_device, mce_sysdev);
1809 1791
1810__cpuinitdata 1792__cpuinitdata
1811void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1793void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1934,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
1934 &mce_cmci_disabled 1916 &mce_cmci_disabled
1935}; 1917};
1936 1918
1937static struct sysdev_attribute *mce_attrs[] = { 1919static struct sysdev_attribute *mce_sysdev_attrs[] = {
1938 &attr_tolerant.attr, 1920 &attr_tolerant.attr,
1939 &attr_check_interval.attr, 1921 &attr_check_interval.attr,
1940 &attr_trigger, 1922 &attr_trigger,
@@ -1945,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
1945 NULL 1927 NULL
1946}; 1928};
1947 1929
1948static cpumask_var_t mce_dev_initialized; 1930static cpumask_var_t mce_sysdev_initialized;
1949 1931
1950/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1932/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1951static __cpuinit int mce_create_device(unsigned int cpu) 1933static __cpuinit int mce_sysdev_create(unsigned int cpu)
1952{ 1934{
1935 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1953 int err; 1936 int err;
1954 int i, j; 1937 int i, j;
1955 1938
1956 if (!mce_available(&boot_cpu_data)) 1939 if (!mce_available(&boot_cpu_data))
1957 return -EIO; 1940 return -EIO;
1958 1941
1959 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1942 memset(&sysdev->kobj, 0, sizeof(struct kobject));
1960 per_cpu(mce_dev, cpu).id = cpu; 1943 sysdev->id = cpu;
1961 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1944 sysdev->cls = &mce_sysdev_class;
1962 1945
1963 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1946 err = sysdev_register(sysdev);
1964 if (err) 1947 if (err)
1965 return err; 1948 return err;
1966 1949
1967 for (i = 0; mce_attrs[i]; i++) { 1950 for (i = 0; mce_sysdev_attrs[i]; i++) {
1968 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1951 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
1969 if (err) 1952 if (err)
1970 goto error; 1953 goto error;
1971 } 1954 }
1972 for (j = 0; j < banks; j++) { 1955 for (j = 0; j < banks; j++) {
1973 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1956 err = sysdev_create_file(sysdev, &mce_banks[j].attr);
1974 &mce_banks[j].attr);
1975 if (err) 1957 if (err)
1976 goto error2; 1958 goto error2;
1977 } 1959 }
1978 cpumask_set_cpu(cpu, mce_dev_initialized); 1960 cpumask_set_cpu(cpu, mce_sysdev_initialized);
1979 1961
1980 return 0; 1962 return 0;
1981error2: 1963error2:
1982 while (--j >= 0) 1964 while (--j >= 0)
1983 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1965 sysdev_remove_file(sysdev, &mce_banks[j].attr);
1984error: 1966error:
1985 while (--i >= 0) 1967 while (--i >= 0)
1986 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1968 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
1987 1969
1988 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1970 sysdev_unregister(sysdev);
1989 1971
1990 return err; 1972 return err;
1991} 1973}
1992 1974
1993static __cpuinit void mce_remove_device(unsigned int cpu) 1975static __cpuinit void mce_sysdev_remove(unsigned int cpu)
1994{ 1976{
1977 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1995 int i; 1978 int i;
1996 1979
1997 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1980 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
1998 return; 1981 return;
1999 1982
2000 for (i = 0; mce_attrs[i]; i++) 1983 for (i = 0; mce_sysdev_attrs[i]; i++)
2001 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1984 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
2002 1985
2003 for (i = 0; i < banks; i++) 1986 for (i = 0; i < banks; i++)
2004 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1987 sysdev_remove_file(sysdev, &mce_banks[i].attr);
2005 1988
2006 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1989 sysdev_unregister(sysdev);
2007 cpumask_clear_cpu(cpu, mce_dev_initialized); 1990 cpumask_clear_cpu(cpu, mce_sysdev_initialized);
2008} 1991}
2009 1992
2010/* Make sure there are no machine checks on offlined CPUs. */ 1993/* Make sure there are no machine checks on offlined CPUs. */
@@ -2054,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2054 switch (action) { 2037 switch (action) {
2055 case CPU_ONLINE: 2038 case CPU_ONLINE:
2056 case CPU_ONLINE_FROZEN: 2039 case CPU_ONLINE_FROZEN:
2057 mce_create_device(cpu); 2040 mce_sysdev_create(cpu);
2058 if (threshold_cpu_callback) 2041 if (threshold_cpu_callback)
2059 threshold_cpu_callback(action, cpu); 2042 threshold_cpu_callback(action, cpu);
2060 break; 2043 break;
@@ -2062,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2062 case CPU_DEAD_FROZEN: 2045 case CPU_DEAD_FROZEN:
2063 if (threshold_cpu_callback) 2046 if (threshold_cpu_callback)
2064 threshold_cpu_callback(action, cpu); 2047 threshold_cpu_callback(action, cpu);
2065 mce_remove_device(cpu); 2048 mce_sysdev_remove(cpu);
2066 break; 2049 break;
2067 case CPU_DOWN_PREPARE: 2050 case CPU_DOWN_PREPARE:
2068 case CPU_DOWN_PREPARE_FROZEN: 2051 case CPU_DOWN_PREPARE_FROZEN:
@@ -2116,27 +2099,28 @@ static __init int mcheck_init_device(void)
2116 if (!mce_available(&boot_cpu_data)) 2099 if (!mce_available(&boot_cpu_data))
2117 return -EIO; 2100 return -EIO;
2118 2101
2119 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2102 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
2120 2103
2121 mce_init_banks(); 2104 mce_init_banks();
2122 2105
2123 err = sysdev_class_register(&mce_sysclass); 2106 err = sysdev_class_register(&mce_sysdev_class);
2124 if (err) 2107 if (err)
2125 return err; 2108 return err;
2126 2109
2127 for_each_online_cpu(i) { 2110 for_each_online_cpu(i) {
2128 err = mce_create_device(i); 2111 err = mce_sysdev_create(i);
2129 if (err) 2112 if (err)
2130 return err; 2113 return err;
2131 } 2114 }
2132 2115
2133 register_syscore_ops(&mce_syscore_ops); 2116 register_syscore_ops(&mce_syscore_ops);
2134 register_hotcpu_notifier(&mce_cpu_notifier); 2117 register_hotcpu_notifier(&mce_cpu_notifier);
2135 misc_register(&mce_log_device); 2118
2119 /* register character device /dev/mcelog */
2120 misc_register(&mce_chrdev_device);
2136 2121
2137 return err; 2122 return err;
2138} 2123}
2139
2140device_initcall(mcheck_init_device); 2124device_initcall(mcheck_init_device);
2141 2125
2142/* 2126/*
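
One pattern in mce_sysdev_create() earlier in this file's diff recurs all over the kernel and is easy to miss in the rename noise: attributes are registered in a loop, and on failure everything registered so far is torn down in reverse with while (--i >= 0) before the device itself is unregistered. A standalone sketch of that unwind shape, with made-up resource names standing in for sysdev_create_file()/sysdev_remove_file():

    #include <stdio.h>

    #define NR_ATTRS 4

    static int create_attr(int i)
    {
        if (i == 2)
            return -1;      /* simulate a failure partway through */
        printf("created attr %d\n", i);
        return 0;
    }

    static void remove_attr(int i)
    {
        printf("removed attr %d\n", i);
    }

    static int create_device(void)
    {
        int i, err = 0;

        for (i = 0; i < NR_ATTRS; i++) {
            err = create_attr(i);
            if (err)
                goto error;
        }
        return 0;

    error:
        /* unwind in reverse: only what was actually created is removed */
        while (--i >= 0)
            remove_attr(i);
        return err;
    }

    int main(void)
    {
        return create_device() ? 1 : 0;
    }

Caching the per-CPU device pointer in a local (sysdev) as the patch does keeps each of those unwind lines short enough to stay on one line.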
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad35143..f5474218cffe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
548 if (!b) 548 if (!b)
549 goto out; 549 goto out;
550 550
551 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, 551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
552 b->kobj, name); 552 b->kobj, name);
553 if (err) 553 if (err)
554 goto out; 554 goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571 goto out; 571 goto out;
572 } 572 }
573 573
574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); 574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
575 if (!b->kobj) 575 if (!b->kobj)
576 goto out_free; 576 goto out_free;
577 577
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
591 if (i == cpu) 591 if (i == cpu)
592 continue; 592 continue;
593 593
594 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, 594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
595 b->kobj, name); 595 b->kobj, name);
596 if (err) 596 if (err)
597 goto out; 597 goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
669#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
670 /* sibling symlink */ 670 /* sibling symlink */
671 if (shared_bank[bank] && b->blocks->cpu != cpu) { 671 if (shared_bank[bank] && b->blocks->cpu != cpu) {
672 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); 672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
673 per_cpu(threshold_banks, cpu)[bank] = NULL; 673 per_cpu(threshold_banks, cpu)[bank] = NULL;
674 674
675 return; 675 return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
681 if (i == cpu) 681 if (i == cpu)
682 continue; 682 continue;
683 683
684 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); 684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 685 per_cpu(threshold_banks, i)[bank] = NULL;
686 } 686 }
687 687
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d1..08119a37e53c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
79static int have_wrcomb(void) 79static int have_wrcomb(void)
80{ 80{
81 struct pci_dev *dev; 81 struct pci_dev *dev;
82 u8 rev;
83 82
84 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); 83 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
85 if (dev != NULL) { 84 if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
89 * chipsets to be tagged 88 * chipsets to be tagged
90 */ 89 */
91 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 90 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
92 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { 91 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
93 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 92 dev->revision <= 5) {
94 if (rev <= 5) { 93 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
95 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); 94 pci_dev_put(dev);
96 pci_dev_put(dev); 95 return 0;
97 return 0;
98 }
99 } 96 }
100 /* 97 /*
101 * Intel 450NX errata # 23. Non ascending cacheline evictions to 98 * Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,55 +134,43 @@ static void __init init_table(void)
137} 134}
138 135
139struct set_mtrr_data { 136struct set_mtrr_data {
140 atomic_t count;
141 atomic_t gate;
142 unsigned long smp_base; 137 unsigned long smp_base;
143 unsigned long smp_size; 138 unsigned long smp_size;
144 unsigned int smp_reg; 139 unsigned int smp_reg;
145 mtrr_type smp_type; 140 mtrr_type smp_type;
146}; 141};
147 142
148static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
149
150/** 143/**
151 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. 144 * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
145 * by all the CPUs.
152 * @info: pointer to mtrr configuration data 146 * @info: pointer to mtrr configuration data
153 * 147 *
154 * Returns nothing. 148 * Returns nothing.
155 */ 149 */
156static int mtrr_work_handler(void *info) 150static int mtrr_rendezvous_handler(void *info)
157{ 151{
158#ifdef CONFIG_SMP 152#ifdef CONFIG_SMP
159 struct set_mtrr_data *data = info; 153 struct set_mtrr_data *data = info;
160 unsigned long flags;
161
162 atomic_dec(&data->count);
163 while (!atomic_read(&data->gate))
164 cpu_relax();
165
166 local_irq_save(flags);
167
168 atomic_dec(&data->count);
169 while (atomic_read(&data->gate))
170 cpu_relax();
171 154
172 /* The master has cleared me to execute */ 155 /*
156 * We use this same function to initialize the mtrrs during boot,
157 * resume, runtime cpu online and on an explicit request to set a
158 * specific MTRR.
159 *
160 * During boot or suspend, the state of the boot cpu's mtrrs has been
161 * saved, and we want to replicate that across all the cpus that come
162 * online (either at the end of boot or resume or during a runtime cpu
163 * online). If we're doing that, @reg is set to something special and on
164 * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
165 * started the boot/resume sequence, this might be a duplicate
166 * set_all()).
167 */
173 if (data->smp_reg != ~0U) { 168 if (data->smp_reg != ~0U) {
174 mtrr_if->set(data->smp_reg, data->smp_base, 169 mtrr_if->set(data->smp_reg, data->smp_base,
175 data->smp_size, data->smp_type); 170 data->smp_size, data->smp_type);
176 } else if (mtrr_aps_delayed_init) { 171 } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
177 /*
178 * Initialize the MTRRs inaddition to the synchronisation.
179 */
180 mtrr_if->set_all(); 172 mtrr_if->set_all();
181 } 173 }
182
183 atomic_dec(&data->count);
184 while (!atomic_read(&data->gate))
185 cpu_relax();
186
187 atomic_dec(&data->count);
188 local_irq_restore(flags);
189#endif 174#endif
190 return 0; 175 return 0;
191} 176}
@@ -223,20 +208,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
223 * 14. Wait for buddies to catch up 208 * 14. Wait for buddies to catch up
224 * 15. Enable interrupts. 209 * 15. Enable interrupts.
225 * 210 *
226 * What does that mean for us? Well, first we set data.count to the number 211 * What does that mean for us? Well, stop_machine() will ensure that
227 * of CPUs. As each CPU announces that it started the rendezvous handler by 212 * the rendezvous handler is started on each CPU. And in lockstep they
228 * decrementing the count, We reset data.count and set the data.gate flag 213 * do the state transition of disabling interrupts, updating MTRR's
229 * allowing all the cpu's to proceed with the work. As each cpu disables 214 * (the CPU vendors may each do it differently, so we call mtrr_if->set()
230 * interrupts, it'll decrement data.count once. We wait until it hits 0 and 215 * callback and let them take care of it.) and enabling interrupts.
231 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
232 * are waiting for that flag to be cleared. Once it's cleared, each
233 * CPU goes through the transition of updating MTRRs.
234 * The CPU vendors may each do it differently,
235 * so we call mtrr_if->set() callback and let them take care of it.
236 * When they're done, they again decrement data->count and wait for data.gate
237 * to be set.
238 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
239 * Everyone then enables interrupts and we all continue on.
240 * 216 *
241 * Note that the mechanism is the same for UP systems, too; all the SMP stuff 217 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
242 * becomes nops. 218 * becomes nops.
@@ -244,92 +220,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
244static void 220static void
245set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) 221set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
246{ 222{
247 struct set_mtrr_data data; 223 struct set_mtrr_data data = { .smp_reg = reg,
248 unsigned long flags; 224 .smp_base = base,
249 int cpu; 225 .smp_size = size,
250 226 .smp_type = type
251 preempt_disable(); 227 };
252
253 data.smp_reg = reg;
254 data.smp_base = base;
255 data.smp_size = size;
256 data.smp_type = type;
257 atomic_set(&data.count, num_booting_cpus() - 1);
258
259 /* Make sure data.count is visible before unleashing other CPUs */
260 smp_wmb();
261 atomic_set(&data.gate, 0);
262
263 /* Start the ball rolling on other CPUs */
264 for_each_online_cpu(cpu) {
265 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
266
267 if (cpu == smp_processor_id())
268 continue;
269
270 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
271 }
272
273
274 while (atomic_read(&data.count))
275 cpu_relax();
276
277 /* Ok, reset count and toggle gate */
278 atomic_set(&data.count, num_booting_cpus() - 1);
279 smp_wmb();
280 atomic_set(&data.gate, 1);
281
282 local_irq_save(flags);
283
284 while (atomic_read(&data.count))
285 cpu_relax();
286
287 /* Ok, reset count and toggle gate */
288 atomic_set(&data.count, num_booting_cpus() - 1);
289 smp_wmb();
290 atomic_set(&data.gate, 0);
291
292 /* Do our MTRR business */
293
294 /*
295 * HACK!
296 *
297 * We use this same function to initialize the mtrrs during boot,
298 * resume, runtime cpu online and on an explicit request to set a
299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
310 */
311 if (reg != ~0U)
312 mtrr_if->set(reg, base, size, type);
313 else
314 mtrr_if->set_all();
315 228
316 /* Wait for the others */ 229 stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
317 while (atomic_read(&data.count)) 230}
318 cpu_relax();
319
320 atomic_set(&data.count, num_booting_cpus() - 1);
321 smp_wmb();
322 atomic_set(&data.gate, 1);
323
324 /*
325 * Wait here for everyone to have seen the gate change
326 * So we're the last ones to touch 'data'
327 */
328 while (atomic_read(&data.count))
329 cpu_relax();
330 231
331 local_irq_restore(flags); 232static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
332 preempt_enable(); 233 unsigned long size, mtrr_type type)
234{
235 struct set_mtrr_data data = { .smp_reg = reg,
236 .smp_base = base,
237 .smp_size = size,
238 .smp_type = type
239 };
240
241 stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
242 cpu_callout_mask);
333} 243}
334 244
335/** 245/**
@@ -783,7 +693,7 @@ void mtrr_ap_init(void)
783 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 693 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
784 * lock to prevent mtrr entry changes 694 * lock to prevent mtrr entry changes
785 */ 695 */
786 set_mtrr(~0U, 0, 0, 0); 696 set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
787} 697}
788 698
789/** 699/**
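
The net effect of switching set_mtrr() to stop_machine() above is that the hand-rolled count/gate rendezvous disappears: stop_machine() already guarantees that the handler runs on every CPU at the same time with interrupts off, so each CPU only has to decide whether to program one register or replay the whole saved MTRR state. As a loose userspace analogue of the rendezvous idea only (no interrupt masking, and not the kernel API), a barrier makes every thread apply its update inside the same window:

    #include <pthread.h>
    #include <stdio.h>

    #define NR_THREADS 4

    static pthread_barrier_t gate;

    static void *rendezvous(void *arg)
    {
        long id = (long)arg;

        pthread_barrier_wait(&gate);    /* everyone enters together       */
        printf("thread %ld applying update\n", id);
        pthread_barrier_wait(&gate);    /* nobody leaves until all are done */
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NR_THREADS];

        pthread_barrier_init(&gate, NULL, NR_THREADS);
        for (long i = 0; i < NR_THREADS; i++)
            pthread_create(&t[i], NULL, rendezvous, (void *)i);
        for (int i = 0; i < NR_THREADS; i++)
            pthread_join(t[i], NULL);
        pthread_barrier_destroy(&gate);
        return 0;
    }

Pushing the synchronization into one well-tested primitive is exactly what lets the patch delete the atomic count, the gate flag and the long comment describing their handshaking.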
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b4b179..4ee3abf20ed6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/highmem.h>
26#include <linux/cpu.h> 25#include <linux/cpu.h>
27#include <linux/bitops.h> 26#include <linux/bitops.h>
28 27
@@ -45,38 +44,27 @@ do { \
45#endif 44#endif
46 45
47/* 46/*
48 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context 47 * | NHM/WSM | SNB |
48 * register -------------------------------
49 * | HT | no HT | HT | no HT |
50 *-----------------------------------------
51 * offcore | core | core | cpu | core |
52 * lbr_sel | core | core | cpu | core |
53 * ld_lat | cpu | core | cpu | core |
54 *-----------------------------------------
55 *
56 * Given that there is a small number of shared regs,
57 * we can pre-allocate their slot in the per-cpu
58 * per-core reg tables.
49 */ 59 */
50static unsigned long 60enum extra_reg_type {
51copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 61 EXTRA_REG_NONE = -1, /* not used */
52{
53 unsigned long offset, addr = (unsigned long)from;
54 unsigned long size, len = 0;
55 struct page *page;
56 void *map;
57 int ret;
58
59 do {
60 ret = __get_user_pages_fast(addr, 1, 0, &page);
61 if (!ret)
62 break;
63
64 offset = addr & (PAGE_SIZE - 1);
65 size = min(PAGE_SIZE - offset, n - len);
66
67 map = kmap_atomic(page);
68 memcpy(to, map+offset, size);
69 kunmap_atomic(map);
70 put_page(page);
71 62
72 len += size; 63 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
73 to += size; 64 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
74 addr += size;
75 65
76 } while (len < n); 66 EXTRA_REG_MAX /* number of entries needed */
77 67};
78 return len;
79}
80 68
81struct event_constraint { 69struct event_constraint {
82 union { 70 union {
@@ -132,11 +120,10 @@ struct cpu_hw_events {
132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 120 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
133 121
134 /* 122 /*
135 * Intel percore register state. 123 * manage shared (per-core, per-cpu) registers
136 * Coordinate shared resources between HT threads. 124 * used on Intel NHM/WSM/SNB
137 */ 125 */
138 int percore_used; /* Used by this CPU? */ 126 struct intel_shared_regs *shared_regs;
139 struct intel_percore *per_core;
140 127
141 /* 128 /*
142 * AMD specific bits 129 * AMD specific bits
@@ -187,26 +174,45 @@ struct cpu_hw_events {
187 for ((e) = (c); (e)->weight; (e)++) 174 for ((e) = (c); (e)->weight; (e)++)
188 175
189/* 176/*
177 * Per register state.
178 */
179struct er_account {
180 raw_spinlock_t lock; /* per-core: protect structure */
181 u64 config; /* extra MSR config */
182 u64 reg; /* extra MSR number */
183 atomic_t ref; /* reference count */
184};
185
186/*
190 * Extra registers for specific events. 187 * Extra registers for specific events.
188 *
191 * Some events need large masks and require external MSRs. 189 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers. 190 * Those extra MSRs end up being shared for all events on
191 * a PMU and sometimes between PMU of sibling HT threads.
192 * In either case, the kernel needs to handle conflicting
193 * accesses to those extra, shared, regs. The data structure
194 * to manage those registers is stored in cpu_hw_event.
193 */ 195 */
194struct extra_reg { 196struct extra_reg {
195 unsigned int event; 197 unsigned int event;
196 unsigned int msr; 198 unsigned int msr;
197 u64 config_mask; 199 u64 config_mask;
198 u64 valid_mask; 200 u64 valid_mask;
201 int idx; /* per_xxx->regs[] reg index */
199}; 202};
200 203
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \ 204#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
202 .event = (e), \ 205 .event = (e), \
203 .msr = (ms), \ 206 .msr = (ms), \
204 .config_mask = (m), \ 207 .config_mask = (m), \
205 .valid_mask = (vm), \ 208 .valid_mask = (vm), \
209 .idx = EXTRA_REG_##i \
206 } 210 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ 211
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) 212#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) 213 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
214
215#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
210 216
211union perf_capabilities { 217union perf_capabilities {
212 struct { 218 struct {
@@ -252,7 +258,6 @@ struct x86_pmu {
252 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 258 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
253 struct perf_event *event); 259 struct perf_event *event);
254 struct event_constraint *event_constraints; 260 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
256 void (*quirks)(void); 261 void (*quirks)(void);
257 int perfctr_second_write; 262 int perfctr_second_write;
258 263
@@ -286,8 +291,12 @@ struct x86_pmu {
286 * Extra registers for events 291 * Extra registers for events
287 */ 292 */
288 struct extra_reg *extra_regs; 293 struct extra_reg *extra_regs;
294 unsigned int er_flags;
289}; 295};
290 296
297#define ERF_NO_HT_SHARING 1
298#define ERF_HAS_RSP_1 2
299
291static struct x86_pmu x86_pmu __read_mostly; 300static struct x86_pmu x86_pmu __read_mostly;
292 301
293static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 302static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -393,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
393 */ 402 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event) 403static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{ 404{
405 struct hw_perf_event_extra *reg;
396 struct extra_reg *er; 406 struct extra_reg *er;
397 407
398 event->hw.extra_reg = 0; 408 reg = &event->hw.extra_reg;
399 event->hw.extra_config = 0;
400 409
401 if (!x86_pmu.extra_regs) 410 if (!x86_pmu.extra_regs)
402 return 0; 411 return 0;
@@ -406,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
406 continue; 415 continue;
407 if (event->attr.config1 & ~er->valid_mask) 416 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL; 417 return -EINVAL;
409 event->hw.extra_reg = er->msr; 418
410 event->hw.extra_config = event->attr.config1; 419 reg->idx = er->idx;
420 reg->config = event->attr.config1;
421 reg->reg = er->msr;
411 break; 422 break;
412 } 423 }
413 return 0; 424 return 0;
@@ -706,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
706 event->hw.last_cpu = -1; 717 event->hw.last_cpu = -1;
707 event->hw.last_tag = ~0ULL; 718 event->hw.last_tag = ~0ULL;
708 719
720 /* mark unused */
721 event->hw.extra_reg.idx = EXTRA_REG_NONE;
722
709 return x86_pmu.hw_config(event); 723 return x86_pmu.hw_config(event);
710} 724}
711 725
@@ -747,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, 761static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask) 762 u64 enable_mask)
749{ 763{
750 if (hwc->extra_reg) 764 if (hwc->extra_reg.reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config); 765 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask); 766 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753} 767}
754 768
@@ -1332,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1332 if (!x86_perf_event_set_period(event)) 1346 if (!x86_perf_event_set_period(event))
1333 continue; 1347 continue;
1334 1348
1335 if (perf_event_overflow(event, 1, &data, regs)) 1349 if (perf_event_overflow(event, &data, regs))
1336 x86_pmu_stop(event, 0); 1350 x86_pmu_stop(event, 0);
1337 } 1351 }
1338 1352
@@ -1637,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
1637 perf_pmu_enable(pmu); 1651 perf_pmu_enable(pmu);
1638 return 0; 1652 return 0;
1639} 1653}
1654/*
1655 * a fake_cpuc is used to validate event groups. Due to
1656 * the extra reg logic, we need to also allocate a fake
1657 * per_core and per_cpu structure. Otherwise, group events
1658 * using extra reg may conflict without the kernel being
1659 * able to catch this when the last event gets added to
1660 * the group.
1661 */
1662static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1663{
1664 kfree(cpuc->shared_regs);
1665 kfree(cpuc);
1666}
1667
1668static struct cpu_hw_events *allocate_fake_cpuc(void)
1669{
1670 struct cpu_hw_events *cpuc;
1671 int cpu = raw_smp_processor_id();
1672
1673 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1674 if (!cpuc)
1675 return ERR_PTR(-ENOMEM);
1676
1677 /* only needed, if we have extra_regs */
1678 if (x86_pmu.extra_regs) {
1679 cpuc->shared_regs = allocate_shared_regs(cpu);
1680 if (!cpuc->shared_regs)
1681 goto error;
1682 }
1683 return cpuc;
1684error:
1685 free_fake_cpuc(cpuc);
1686 return ERR_PTR(-ENOMEM);
1687}
1640 1688
1641/* 1689/*
1642 * validate that we can schedule this event 1690 * validate that we can schedule this event
@@ -1647,9 +1695,9 @@ static int validate_event(struct perf_event *event)
1647 struct event_constraint *c; 1695 struct event_constraint *c;
1648 int ret = 0; 1696 int ret = 0;
1649 1697
1650 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); 1698 fake_cpuc = allocate_fake_cpuc();
1651 if (!fake_cpuc) 1699 if (IS_ERR(fake_cpuc))
1652 return -ENOMEM; 1700 return PTR_ERR(fake_cpuc);
1653 1701
1654 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1702 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1655 1703
@@ -1659,7 +1707,7 @@ static int validate_event(struct perf_event *event)
1659 if (x86_pmu.put_event_constraints) 1707 if (x86_pmu.put_event_constraints)
1660 x86_pmu.put_event_constraints(fake_cpuc, event); 1708 x86_pmu.put_event_constraints(fake_cpuc, event);
1661 1709
1662 kfree(fake_cpuc); 1710 free_fake_cpuc(fake_cpuc);
1663 1711
1664 return ret; 1712 return ret;
1665} 1713}
@@ -1679,36 +1727,32 @@ static int validate_group(struct perf_event *event)
1679{ 1727{
1680 struct perf_event *leader = event->group_leader; 1728 struct perf_event *leader = event->group_leader;
1681 struct cpu_hw_events *fake_cpuc; 1729 struct cpu_hw_events *fake_cpuc;
1682 int ret, n; 1730 int ret = -ENOSPC, n;
1683
1684 ret = -ENOMEM;
1685 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1686 if (!fake_cpuc)
1687 goto out;
1688 1731
1732 fake_cpuc = allocate_fake_cpuc();
1733 if (IS_ERR(fake_cpuc))
1734 return PTR_ERR(fake_cpuc);
1689 /* 1735 /*
1690 * the event is not yet connected with its 1736 * the event is not yet connected with its
1691 * siblings therefore we must first collect 1737 * siblings therefore we must first collect
1692 * existing siblings, then add the new event 1738 * existing siblings, then add the new event
1693 * before we can simulate the scheduling 1739 * before we can simulate the scheduling
1694 */ 1740 */
1695 ret = -ENOSPC;
1696 n = collect_events(fake_cpuc, leader, true); 1741 n = collect_events(fake_cpuc, leader, true);
1697 if (n < 0) 1742 if (n < 0)
1698 goto out_free; 1743 goto out;
1699 1744
1700 fake_cpuc->n_events = n; 1745 fake_cpuc->n_events = n;
1701 n = collect_events(fake_cpuc, event, false); 1746 n = collect_events(fake_cpuc, event, false);
1702 if (n < 0) 1747 if (n < 0)
1703 goto out_free; 1748 goto out;
1704 1749
1705 fake_cpuc->n_events = n; 1750 fake_cpuc->n_events = n;
1706 1751
1707 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); 1752 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1708 1753
1709out_free:
1710 kfree(fake_cpuc);
1711out: 1754out:
1755 free_fake_cpuc(fake_cpuc);
1712 return ret; 1756 return ret;
1713} 1757}
1714 1758
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219e..941caa2e449b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
89 [ C(RESULT_MISS) ] = -1, 89 [ C(RESULT_MISS) ] = -1,
90 }, 90 },
91 }, 91 },
92 [ C(NODE) ] = {
93 [ C(OP_READ) ] = {
94 [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
95 [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
96 },
97 [ C(OP_WRITE) ] = {
98 [ C(RESULT_ACCESS) ] = -1,
99 [ C(RESULT_MISS) ] = -1,
100 },
101 [ C(OP_PREFETCH) ] = {
102 [ C(RESULT_ACCESS) ] = -1,
103 [ C(RESULT_MISS) ] = -1,
104 },
105 },
92}; 106};
93 107
94/* 108/*
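
The C(NODE) entries added above extend the generic hardware-cache event table, which is indexed by cache type, operation and result; an entry of -1 marks a combination the CPU cannot count, and the 0xb8e9/0x98e9 codes are the local+remote and remote-only "CPU Request to Memory" events named in the comments. A small sketch of how such a three-dimensional table is consumed, with the enum and error values chosen for illustration rather than taken from the perf core:

    #include <stdint.h>
    #include <stdio.h>

    enum cache_type { C_L1D, C_NODE, C_MAX };
    enum cache_op   { OP_READ, OP_WRITE, OP_PREFETCH, OP_MAX };
    enum cache_res  { RES_ACCESS, RES_MISS, RES_MAX };

    /* Illustrative table: -1 marks combinations the CPU cannot count. */
    static const int64_t cache_event_ids[C_MAX][OP_MAX][RES_MAX] = {
        [C_NODE] = {
            [OP_READ]     = { [RES_ACCESS] = 0xb8e9, [RES_MISS] = 0x98e9 },
            [OP_WRITE]    = { [RES_ACCESS] = -1,     [RES_MISS] = -1     },
            [OP_PREFETCH] = { [RES_ACCESS] = -1,     [RES_MISS] = -1     },
        },
    };

    /* Returns the raw event code, or a negative value if unsupported/unset. */
    static int64_t lookup(enum cache_type t, enum cache_op op, enum cache_res res)
    {
        int64_t config = cache_event_ids[t][op][res];

        if (config == -1)
            return -1;      /* combination not supported on this CPU  */
        if (config == 0)
            return -2;      /* no event wired up for this combination */
        return config;
    }

    int main(void)
    {
        printf("node read access: %#llx\n",
               (unsigned long long)lookup(C_NODE, OP_READ, RES_ACCESS));
        printf("node write miss:  %lld\n",
               (long long)lookup(C_NODE, OP_WRITE, RES_MISS));
        return 0;
    }

The Intel variants of the same change (below) instead point the NODE entries at the 0x01b7 offcore-response event, whose detail lives in the shared extra MSRs discussed earlier.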
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c48..45fbb8f7f549 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/* 3/*
15 * Per core state 4 * Per core/cpu state
16 * This used to coordinate shared registers for HT threads. 5 *
6 * Used to coordinate shared registers between HT threads or
7 * among events on a single PMU.
17 */ 8 */
18struct intel_percore { 9struct intel_shared_regs {
19 raw_spinlock_t lock; /* protect structure */ 10 struct er_account regs[EXTRA_REG_MAX];
20 struct er_account regs[MAX_EXTRA_REGS]; 11 int refcnt; /* per-core: #HT threads */
21 int refcnt; /* number of threads */ 12 unsigned core_id; /* per-core: core id */
22 unsigned core_id;
23}; 13};
24 14
25/* 15/*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
88 78
89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = 79static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{ 80{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 81 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
92 EVENT_EXTRA_END 82 EVENT_EXTRA_END
93}; 83};
94 84
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly = 85static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
102{ 86{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -116,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END 105 EVENT_CONSTRAINT_END
@@ -125,15 +107,13 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
125 107
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly = 108static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{ 109{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 110 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), 111 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
130 EVENT_EXTRA_END 112 EVENT_EXTRA_END
131}; 113};
132 114
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = 115static struct event_constraint intel_v1_event_constraints[] __read_mostly =
134{ 116{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END 117 EVENT_CONSTRAINT_END
138}; 118};
139 119
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
145 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
146}; 126};
147 127
128static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
129 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
130 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
131 EVENT_EXTRA_END
132};
133
148static u64 intel_pmu_event_map(int hw_event) 134static u64 intel_pmu_event_map(int hw_event)
149{ 135{
150 return intel_perfmon_event_map[hw_event]; 136 return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
245 [ C(RESULT_MISS) ] = -1, 231 [ C(RESULT_MISS) ] = -1,
246 }, 232 },
247 }, 233 },
234 [ C(NODE) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = -1,
237 [ C(RESULT_MISS) ] = -1,
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248
248}; 249};
249 250
250static __initconst const u64 westmere_hw_cache_event_ids 251static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
346 [ C(RESULT_MISS) ] = -1, 347 [ C(RESULT_MISS) ] = -1,
347 }, 348 },
348 }, 349 },
350 [ C(NODE) ] = {
351 [ C(OP_READ) ] = {
352 [ C(RESULT_ACCESS) ] = 0x01b7,
353 [ C(RESULT_MISS) ] = 0x01b7,
354 },
355 [ C(OP_WRITE) ] = {
356 [ C(RESULT_ACCESS) ] = 0x01b7,
357 [ C(RESULT_MISS) ] = 0x01b7,
358 },
359 [ C(OP_PREFETCH) ] = {
360 [ C(RESULT_ACCESS) ] = 0x01b7,
361 [ C(RESULT_MISS) ] = 0x01b7,
362 },
363 },
349}; 364};
350 365
351/* 366/*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, 413 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, 414 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 }, 415 },
401 } 416 },
417 [ C(NODE) ] = {
418 [ C(OP_READ) ] = {
419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
421 },
422 [ C(OP_WRITE) ] = {
423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
425 },
426 [ C(OP_PREFETCH) ] = {
427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
429 },
430 },
402}; 431};
403 432
404static __initconst const u64 nehalem_hw_cache_event_ids 433static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
500 [ C(RESULT_MISS) ] = -1, 529 [ C(RESULT_MISS) ] = -1,
501 }, 530 },
502 }, 531 },
532 [ C(NODE) ] = {
533 [ C(OP_READ) ] = {
534 [ C(RESULT_ACCESS) ] = 0x01b7,
535 [ C(RESULT_MISS) ] = 0x01b7,
536 },
537 [ C(OP_WRITE) ] = {
538 [ C(RESULT_ACCESS) ] = 0x01b7,
539 [ C(RESULT_MISS) ] = 0x01b7,
540 },
541 [ C(OP_PREFETCH) ] = {
542 [ C(RESULT_ACCESS) ] = 0x01b7,
543 [ C(RESULT_MISS) ] = 0x01b7,
544 },
545 },
503}; 546};
504 547
505static __initconst const u64 core2_hw_cache_event_ids 548static __initconst const u64 core2_hw_cache_event_ids
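
The new C(NODE) rows above follow the same [cache][op][result] layout as the rest of these tables: -1 marks a combination the hardware can never count, 0 means no encoding on this model, and 0x01b7 routes the request through OFF_CORE_RESPONSE_0, whose detail bits live in the separate extra-regs tables. A minimal userspace sketch of that lookup, using simplified stand-in enums rather than the kernel's real definitions:

/*
 * Minimal sketch of the [cache][op][result] lookup these tables implement.
 * Enum and variable names are simplified stand-ins, not kernel definitions.
 */
#include <stdint.h>
#include <stdio.h>

enum cache_id  { C_L1D, C_NODE, C_MAX };
enum cache_op  { OP_READ, OP_WRITE, OP_PREFETCH, OP_MAX };
enum cache_res { RES_ACCESS, RES_MISS, RES_MAX };

/* -1: can never be counted; 0: no encoding on this model;
 * 0x01b7: go through OFF_CORE_RESPONSE_0 plus an extra-reg value */
static const int64_t cache_events[C_MAX][OP_MAX][RES_MAX] = {
	[C_NODE] = {
		[OP_READ]     = { [RES_ACCESS] = 0x01b7, [RES_MISS] = 0x01b7 },
		[OP_WRITE]    = { [RES_ACCESS] = 0x01b7, [RES_MISS] = 0x01b7 },
		[OP_PREFETCH] = { [RES_ACCESS] = 0x01b7, [RES_MISS] = 0x01b7 },
	},
};

static int lookup(enum cache_id c, enum cache_op op, enum cache_res r,
		  uint64_t *config)
{
	int64_t v = cache_events[c][op][r];

	if (v == -1 || v == 0)	/* invalid or unsupported combination */
		return -1;
	*config = (uint64_t)v;
	return 0;
}

int main(void)
{
	uint64_t config;

	if (!lookup(C_NODE, OP_READ, RES_MISS, &config))
		printf("NODE read miss -> raw event 0x%llx\n",
		       (unsigned long long)config);
	return 0;
}
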
@@ -1003,7 +1046,7 @@ again:
1003 1046
1004 data.period = event->hw.last_period; 1047 data.period = event->hw.last_period;
1005 1048
1006 if (perf_event_overflow(event, 1, &data, regs)) 1049 if (perf_event_overflow(event, &data, regs))
1007 x86_pmu_stop(event, 0); 1050 x86_pmu_stop(event, 0);
1008 } 1051 }
1009 1052
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
1037 return NULL; 1080 return NULL;
1038} 1081}
1039 1082
1083static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1084{
1085 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
1086 return false;
1087
1088 if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
1089 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1090 event->hw.config |= 0x01bb;
1091 event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
1092 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1093 } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
1094 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1095 event->hw.config |= 0x01b7;
1096 event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
1097 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1098 }
1099
1100 if (event->hw.extra_reg.idx == orig_idx)
1101 return false;
1102
1103 return true;
1104}
1105
1106/*
1107 * manage allocation of shared extra msr for certain events
1108 *
1109 * sharing can be:
1110 * per-cpu: to be shared between the various events on a single PMU
1111 * per-core: per-cpu + shared by HT threads
1112 */
1040static struct event_constraint * 1113static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1114__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1115 struct perf_event *event)
1042{ 1116{
1043 struct hw_perf_event *hwc = &event->hw; 1117 struct event_constraint *c = &emptyconstraint;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; 1118 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era; 1119 struct er_account *era;
1048 int i; 1120 unsigned long flags;
1049 int free_slot; 1121 int orig_idx = reg->idx;
1050 int found;
1051 1122
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc) 1123 /* already allocated shared msr */
1053 return NULL; 1124 if (reg->alloc)
1125 return &unconstrained;
1054 1126
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) { 1127again:
1056 if (e != c->code) 1128 era = &cpuc->shared_regs->regs[reg->idx];
1057 continue; 1129 /*
1130 * we use spin_lock_irqsave() to avoid lockdep issues when
1131 * passing a fake cpuc
1132 */
1133 raw_spin_lock_irqsave(&era->lock, flags);
1134
1135 if (!atomic_read(&era->ref) || era->config == reg->config) {
1136
1137 /* lock in msr value */
1138 era->config = reg->config;
1139 era->reg = reg->reg;
1140
1141 /* one more user */
1142 atomic_inc(&era->ref);
1143
1144 /* no need to reallocate during incremental event scheduling */
1145 reg->alloc = 1;
1058 1146
1059 /* 1147 /*
1060 * Allocate resource per core. 1148 * All events using extra_reg are unconstrained.
1149 * Avoids calling x86_get_event_constraints()
1150 *
1151 * Must revisit if extra_reg controlling events
1152 * ever have constraints. Worst case we go through
1153 * the regular event constraint table.
1061 */ 1154 */
1062 pc = cpuc->per_core; 1155 c = &unconstrained;
1063 if (!pc) 1156 } else if (intel_try_alt_er(event, orig_idx)) {
1064 break; 1157 raw_spin_unlock(&era->lock);
1065 c = &emptyconstraint; 1158 goto again;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 } 1159 }
1160 raw_spin_unlock_irqrestore(&era->lock, flags);
1097 1161
1098 return NULL; 1162 return c;
1163}
1164
1165static void
1166__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
1167 struct hw_perf_event_extra *reg)
1168{
1169 struct er_account *era;
1170
1171 /*
1172 * only put the constraint if the extra reg was actually
1173 * allocated. This also takes care of events which do
1174 * not use an extra shared reg.
1175 */
1176 if (!reg->alloc)
1177 return;
1178
1179 era = &cpuc->shared_regs->regs[reg->idx];
1180
1181 /* one fewer user */
1182 atomic_dec(&era->ref);
1183
1184 /* allocate again next time */
1185 reg->alloc = 0;
1186}
1187
1188static struct event_constraint *
1189intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1190 struct perf_event *event)
1191{
1192 struct event_constraint *c = NULL;
1193
1194 if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
1195 c = __intel_shared_reg_get_constraints(cpuc, event);
1196
1197 return c;
1099} 1198}
1100 1199
1101static struct event_constraint * 1200static struct event_constraint *
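
Taken together, intel_try_alt_er() and __intel_shared_reg_get/put_constraints() turn each OFFCORE_RSP MSR into a refcounted lease: an event may take the register if it is free or already programmed with the same value, may fall back to the sibling MSR where ERF_HAS_RSP_1 is set, and otherwise receives the empty constraint. A rough userspace sketch of the lease logic, with stand-in types rather than the kernel structures:

/*
 * Userspace sketch of the shared extra-MSR accounting: an event may grab
 * the shared register if it is unused or already programmed with the same
 * value; otherwise it gets the empty constraint. Simplified stand-in types.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct er_account {
	pthread_mutex_t lock;
	int             ref;     /* number of events using this MSR */
	uint64_t        config;  /* value currently programmed */
};

/* try to take a reference; mirrors __intel_shared_reg_get_constraints() */
static bool shared_reg_get(struct er_account *era, uint64_t config)
{
	bool ok;

	pthread_mutex_lock(&era->lock);
	ok = (era->ref == 0 || era->config == config);
	if (ok) {
		era->config = config;
		era->ref++;
	}
	pthread_mutex_unlock(&era->lock);
	return ok;   /* false == "emptyconstraint": cannot schedule */
}

/* drop the reference again; mirrors __intel_shared_reg_put_constraints() */
static void shared_reg_put(struct er_account *era)
{
	pthread_mutex_lock(&era->lock);
	era->ref--;
	pthread_mutex_unlock(&era->lock);
}

int main(void)
{
	struct er_account rsp0 = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	printf("A(0x10): %d\n", shared_reg_get(&rsp0, 0x10)); /* 1: free      */
	printf("B(0x10): %d\n", shared_reg_get(&rsp0, 0x10)); /* 1: same cfg  */
	printf("C(0x20): %d\n", shared_reg_get(&rsp0, 0x20)); /* 0: conflict  */
	shared_reg_put(&rsp0);
	shared_reg_put(&rsp0);
	return 0;
}

The third call models the conflict that makes intel_try_alt_er() retry with the other OFFCORE_RSP register before giving up.
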
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
1111 if (c) 1210 if (c)
1112 return c; 1211 return c;
1113 1212
1114 c = intel_percore_constraints(cpuc, event); 1213 c = intel_shared_regs_constraints(cpuc, event);
1115 if (c) 1214 if (c)
1116 return c; 1215 return c;
1117 1216
1118 return x86_get_event_constraints(cpuc, event); 1217 return x86_get_event_constraints(cpuc, event);
1119} 1218}
1120 1219
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1220static void
1221intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event) 1222 struct perf_event *event)
1123{ 1223{
1124 struct extra_reg *er; 1224 struct hw_perf_event_extra *reg;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129 1225
1130 if (!cpuc->percore_used) 1226 reg = &event->hw.extra_reg;
1131 return; 1227 if (reg->idx != EXTRA_REG_NONE)
1132 1228 __intel_shared_reg_put_constraints(cpuc, reg);
1133 for (er = x86_pmu.extra_regs; er->msr; er++) { 1229}
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136 1230
1137 pc = cpuc->per_core; 1231static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1138 raw_spin_lock(&pc->lock); 1232 struct perf_event *event)
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) { 1233{
1140 era = &pc->regs[i]; 1234 intel_put_shared_regs_event_constraints(cpuc, event);
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157} 1235}
1158 1236
1159static int intel_pmu_hw_config(struct perf_event *event) 1237static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
1231 .event_constraints = intel_core_event_constraints, 1309 .event_constraints = intel_core_event_constraints,
1232}; 1310};
1233 1311
1312static struct intel_shared_regs *allocate_shared_regs(int cpu)
1313{
1314 struct intel_shared_regs *regs;
1315 int i;
1316
1317 regs = kzalloc_node(sizeof(struct intel_shared_regs),
1318 GFP_KERNEL, cpu_to_node(cpu));
1319 if (regs) {
1320 /*
1321 * initialize the locks to keep lockdep happy
1322 */
1323 for (i = 0; i < EXTRA_REG_MAX; i++)
1324 raw_spin_lock_init(&regs->regs[i].lock);
1325
1326 regs->core_id = -1;
1327 }
1328 return regs;
1329}
1330
1234static int intel_pmu_cpu_prepare(int cpu) 1331static int intel_pmu_cpu_prepare(int cpu)
1235{ 1332{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1333 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237 1334
1238 if (!cpu_has_ht_siblings()) 1335 if (!x86_pmu.extra_regs)
1239 return NOTIFY_OK; 1336 return NOTIFY_OK;
1240 1337
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), 1338 cpuc->shared_regs = allocate_shared_regs(cpu);
1242 GFP_KERNEL, cpu_to_node(cpu)); 1339 if (!cpuc->shared_regs)
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD; 1340 return NOTIFY_BAD;
1245 1341
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK; 1342 return NOTIFY_OK;
1249} 1343}
1250 1344
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
1260 */ 1354 */
1261 intel_pmu_lbr_reset(); 1355 intel_pmu_lbr_reset();
1262 1356
1263 if (!cpu_has_ht_siblings()) 1357 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
1264 return; 1358 return;
1265 1359
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1360 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; 1361 struct intel_shared_regs *pc;
1268 1362
1363 pc = per_cpu(cpu_hw_events, i).shared_regs;
1269 if (pc && pc->core_id == core_id) { 1364 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core); 1365 kfree(cpuc->shared_regs);
1271 cpuc->per_core = pc; 1366 cpuc->shared_regs = pc;
1272 break; 1367 break;
1273 } 1368 }
1274 } 1369 }
1275 1370
1276 cpuc->per_core->core_id = core_id; 1371 cpuc->shared_regs->core_id = core_id;
1277 cpuc->per_core->refcnt++; 1372 cpuc->shared_regs->refcnt++;
1278} 1373}
1279 1374
1280static void intel_pmu_cpu_dying(int cpu) 1375static void intel_pmu_cpu_dying(int cpu)
1281{ 1376{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1377 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core; 1378 struct intel_shared_regs *pc;
1284 1379
1380 pc = cpuc->shared_regs;
1285 if (pc) { 1381 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0) 1382 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc); 1383 kfree(pc);
1288 cpuc->per_core = NULL; 1384 cpuc->shared_regs = NULL;
1289 } 1385 }
1290 1386
1291 fini_debug_store_on_cpu(cpu); 1387 fini_debug_store_on_cpu(cpu);
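
The cpu_prepare/cpu_starting/cpu_dying changes give every CPU its own intel_shared_regs allocation up front, let a later hyperthread of the same core discard its copy and reuse its sibling's (matched via core_id), and free the block when the last user goes away. A standalone sketch of that hand-off, without locking and with stand-in names:

/*
 * Sketch of how siblings end up sharing one intel_shared_regs object:
 * the first CPU of a core keeps its allocation, later siblings free
 * theirs and take a reference on the existing one. Stand-in types.
 */
#include <stdio.h>
#include <stdlib.h>

struct shared_regs {
	int core_id;   /* -1 until claimed by a core */
	int refcnt;
};

struct cpu_state {
	struct shared_regs *shared;
};

#define NR_CPUS 4

static struct cpu_state cpus[NR_CPUS];
static const int cpu_core[NR_CPUS] = { 0, 0, 1, 1 };  /* two HT pairs */

static void cpu_starting(int cpu)
{
	int core = cpu_core[cpu], i;

	for (i = 0; i < NR_CPUS; i++) {
		struct shared_regs *pc = cpus[i].shared;

		if (i != cpu && pc && pc->core_id == core) {
			free(cpus[cpu].shared);      /* drop our duplicate */
			cpus[cpu].shared = pc;       /* share the sibling's */
			break;
		}
	}
	cpus[cpu].shared->core_id = core;
	cpus[cpu].shared->refcnt++;
}

static void cpu_dying(int cpu)
{
	struct shared_regs *pc = cpus[cpu].shared;

	if (pc && --pc->refcnt == 0)
		free(pc);
	cpus[cpu].shared = NULL;
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {            /* "cpu_prepare" */
		cpus[cpu].shared = calloc(1, sizeof(struct shared_regs));
		cpus[cpu].shared->core_id = -1;
	}
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_starting(cpu);

	printf("cpu0 and cpu1 share: %s\n",
	       cpus[0].shared == cpus[1].shared ? "yes" : "no");

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_dying(cpu);
	return 0;
}
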
@@ -1436,7 +1532,6 @@ static __init int intel_pmu_init(void)
1436 1532
1437 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1533 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; 1534 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1535 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1536 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442 1537
@@ -1481,10 +1576,10 @@ static __init int intel_pmu_init(void)
1481 intel_pmu_lbr_init_nhm(); 1576 intel_pmu_lbr_init_nhm();
1482 1577
1483 x86_pmu.event_constraints = intel_westmere_event_constraints; 1578 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1579 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 1580 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs; 1581 x86_pmu.extra_regs = intel_westmere_extra_regs;
1582 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1488 1583
1489 /* UOPS_ISSUED.STALLED_CYCLES */ 1584 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1585 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1502,6 +1597,10 @@ static __init int intel_pmu_init(void)
1502 1597
1503 x86_pmu.event_constraints = intel_snb_event_constraints; 1598 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events; 1599 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1600 x86_pmu.extra_regs = intel_snb_extra_regs;
1601 /* all extra regs are per-cpu when HT is on */
1602 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1603 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
1505 1604
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1605 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1606 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1512,11 +1611,19 @@ static __init int intel_pmu_init(void)
1512 break; 1611 break;
1513 1612
1514 default: 1613 default:
1515 /* 1614 switch (x86_pmu.version) {
1516 * default constraints for v2 and up 1615 case 1:
1517 */ 1616 x86_pmu.event_constraints = intel_v1_event_constraints;
1518 x86_pmu.event_constraints = intel_gen_event_constraints; 1617 pr_cont("generic architected perfmon v1, ");
1519 pr_cont("generic architected perfmon, "); 1618 break;
1619 default:
1620 /*
1621 * default constraints for v2 and up
1622 */
1623 x86_pmu.event_constraints = intel_gen_event_constraints;
1624 pr_cont("generic architected perfmon, ");
1625 break;
1626 }
1520 } 1627 }
1521 return 0; 1628 return 0;
1522} 1629}
@@ -1528,4 +1635,8 @@ static int intel_pmu_init(void)
1528 return 0; 1635 return 0;
1529} 1636}
1530 1637
1638static struct intel_shared_regs *allocate_shared_regs(int cpu)
1639{
1640 return NULL;
1641}
1531#endif /* CONFIG_CPU_SUP_INTEL */ 1642#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee25..1b1ef3addcfd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
340 */ 340 */
341 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
342 342
343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at)))
344 return 1; 344 return 1;
345 345
346 for (; at < top; at++) { 346 for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
616 else 616 else
617 regs.flags &= ~PERF_EFLAGS_EXACT; 617 regs.flags &= ~PERF_EFLAGS_EXACT;
618 618
619 if (perf_event_overflow(event, 1, &data, &regs)) 619 if (perf_event_overflow(event, &data, &regs))
620 x86_pmu_stop(event, 0); 620 x86_pmu_stop(event, 0);
621} 621}
622 622
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ead584fb6a7d..7809d2bcb209 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
554 [ C(RESULT_MISS) ] = -1, 554 [ C(RESULT_MISS) ] = -1,
555 }, 555 },
556 }, 556 },
557 [ C(NODE) ] = {
558 [ C(OP_READ) ] = {
559 [ C(RESULT_ACCESS) ] = -1,
560 [ C(RESULT_MISS) ] = -1,
561 },
562 [ C(OP_WRITE) ] = {
563 [ C(RESULT_ACCESS) ] = -1,
564 [ C(RESULT_MISS) ] = -1,
565 },
566 [ C(OP_PREFETCH) ] = {
567 [ C(RESULT_ACCESS) ] = -1,
568 [ C(RESULT_MISS) ] = -1,
569 },
570 },
557}; 571};
558 572
573/*
574 * Because Netburst is quite restricted in how many
575 * identical events may run simultaneously, we introduce event aliases,
576 * i.e. different events which have the same functionality but
577 * use non-intersecting resources (ESCR/CCCR/counter registers).
578 *
579 * This allows us to relax the restrictions a bit and run two or more
580 * identical events together.
581 *
582 * Never set any custom internal bits such as P4_CONFIG_HT,
583 * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC; they are
584 * either kept up to date automatically or not applicable at all.
585 */
586struct p4_event_alias {
587 u64 original;
588 u64 alternative;
589} p4_event_aliases[] = {
590 {
591 /*
592 * Non-halted cycles can be substituted with non-sleeping cycles (see
593 * Intel SDM Vol3b for details). We need this alias to be able
594 * to run nmi-watchdog and 'perf top' (or any other user space tool
595 * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
596 * simultaneously.
597 */
598 .original =
599 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
600 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
601 .alternative =
602 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
603 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
604 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
605 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
606 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
607 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
608 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
609 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
610 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
611 p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
612 P4_CCCR_COMPARE),
613 },
614};
615
616static u64 p4_get_alias_event(u64 config)
617{
618 u64 config_match;
619 int i;
620
621 /*
622 * Only an event carrying the special mark is allowed,
623 * so we can be sure it didn't come in as a malformed
624 * RAW event.
625 */
626 if (!(config & P4_CONFIG_ALIASABLE))
627 return 0;
628
629 config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
630
631 for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
632 if (config_match == p4_event_aliases[i].original) {
633 config_match = p4_event_aliases[i].alternative;
634 break;
635 } else if (config_match == p4_event_aliases[i].alternative) {
636 config_match = p4_event_aliases[i].original;
637 break;
638 }
639 }
640
641 if (i >= ARRAY_SIZE(p4_event_aliases))
642 return 0;
643
644 return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
645}
646
559static u64 p4_general_events[PERF_COUNT_HW_MAX] = { 647static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
560 /* non-halted CPU clocks */ 648 /* non-halted CPU clocks */
561 [PERF_COUNT_HW_CPU_CYCLES] = 649 [PERF_COUNT_HW_CPU_CYCLES] =
562 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | 650 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
563 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), 651 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
652 P4_CONFIG_ALIASABLE,
564 653
565 /* 654 /*
566 * retired instructions 655 * retired instructions
@@ -945,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
945 1034
946 if (!x86_perf_event_set_period(event)) 1035 if (!x86_perf_event_set_period(event))
947 continue; 1036 continue;
948 if (perf_event_overflow(event, 1, &data, regs)) 1037 if (perf_event_overflow(event, &data, regs))
949 x86_pmu_stop(event, 0); 1038 x86_pmu_stop(event, 0);
950 } 1039 }
951 1040
@@ -1120,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1120 struct p4_event_bind *bind; 1209 struct p4_event_bind *bind;
1121 unsigned int i, thread, num; 1210 unsigned int i, thread, num;
1122 int cntr_idx, escr_idx; 1211 int cntr_idx, escr_idx;
1212 u64 config_alias;
1213 int pass;
1123 1214
1124 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 1215 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1125 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); 1216 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1128,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1128 1219
1129 hwc = &cpuc->event_list[i]->hw; 1220 hwc = &cpuc->event_list[i]->hw;
1130 thread = p4_ht_thread(cpu); 1221 thread = p4_ht_thread(cpu);
1222 pass = 0;
1223
1224again:
1225 /*
1226 * It's possible to hit a circular lock
1227 * between original and alternative events
1228 * if both are scheduled already.
1229 */
1230 if (pass > 2)
1231 goto done;
1232
1131 bind = p4_config_get_bind(hwc->config); 1233 bind = p4_config_get_bind(hwc->config);
1132 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); 1234 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
1133 if (unlikely(escr_idx == -1)) 1235 if (unlikely(escr_idx == -1))
@@ -1141,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1141 } 1243 }
1142 1244
1143 cntr_idx = p4_next_cntr(thread, used_mask, bind); 1245 cntr_idx = p4_next_cntr(thread, used_mask, bind);
1144 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) 1246 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
1145 goto done; 1247 /*
1248 * Check whether an event alias is still available.
1249 */
1250 config_alias = p4_get_alias_event(hwc->config);
1251 if (!config_alias)
1252 goto done;
1253 hwc->config = config_alias;
1254 pass++;
1255 goto again;
1256 }
1146 1257
1147 p4_pmu_swap_config_ts(hwc, cpu); 1258 p4_pmu_swap_config_ts(hwc, cpu);
1148 if (assign) 1259 if (assign)
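
The two p4 hunks work together: p4_get_alias_event() maps an aliasable config onto its counterpart encoding, and the scheduler retries with that counterpart, at most twice, when the original ESCR/counter pair is already in use. A toy model of the control flow, with made-up config values standing in for the packed ESCR/CCCR encodings:

/*
 * Sketch of the aliasing idea: if the resources needed by an event are
 * busy, look up an alternative encoding that counts the same thing on
 * different resources and retry a bounded number of times.
 * Values and helpers are illustrative stand-ins only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct event_alias {
	uint64_t original;
	uint64_t alternative;
};

static const struct event_alias aliases[] = {
	{ .original = 0x100, .alternative = 0x200 },   /* stand-in configs */
};

/* return the "other" encoding for an aliasable config, 0 if none */
static uint64_t get_alias(uint64_t config)
{
	unsigned int i;

	for (i = 0; i < sizeof(aliases) / sizeof(aliases[0]); i++) {
		if (config == aliases[i].original)
			return aliases[i].alternative;
		if (config == aliases[i].alternative)
			return aliases[i].original;
	}
	return 0;
}

/* pretend resource check: only config 0x200 still has a free counter */
static bool resources_free(uint64_t config)
{
	return config == 0x200;
}

static bool schedule_event(uint64_t config)
{
	int pass = 0;

	while (pass <= 2) {                 /* avoid ping-ponging forever */
		if (resources_free(config))
			return true;
		config = get_alias(config);
		if (!config)
			return false;       /* no alias to fall back to */
		pass++;
	}
	return false;
}

int main(void)
{
	printf("scheduled via alias: %s\n",
	       schedule_event(0x100) ? "yes" : "no");
	return 0;
}
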
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 9aeb78a23de4..a621f3427685 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -134,6 +134,24 @@ static int __init add_bus_probe(void)
134module_init(add_bus_probe); 134module_init(add_bus_probe);
135 135
136#ifdef CONFIG_PCI 136#ifdef CONFIG_PCI
137struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
138{
139 struct device_node *np;
140
141 for_each_node_by_type(np, "pci") {
142 const void *prop;
143 unsigned int bus_min;
144
145 prop = of_get_property(np, "bus-range", NULL);
146 if (!prop)
147 continue;
148 bus_min = be32_to_cpup(prop);
149 if (bus->number == bus_min)
150 return np;
151 }
152 return NULL;
153}
154
137static int x86_of_pci_irq_enable(struct pci_dev *dev) 155static int x86_of_pci_irq_enable(struct pci_dev *dev)
138{ 156{
139 struct of_irq oirq; 157 struct of_irq oirq;
@@ -165,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
165 183
166void __cpuinit x86_of_pci_init(void) 184void __cpuinit x86_of_pci_init(void)
167{ 185{
168 struct device_node *np;
169
170 pcibios_enable_irq = x86_of_pci_irq_enable; 186 pcibios_enable_irq = x86_of_pci_irq_enable;
171 pcibios_disable_irq = x86_of_pci_irq_disable; 187 pcibios_disable_irq = x86_of_pci_irq_disable;
172
173 for_each_node_by_type(np, "pci") {
174 const void *prop;
175 struct pci_bus *bus;
176 unsigned int bus_min;
177 struct device_node *child;
178
179 prop = of_get_property(np, "bus-range", NULL);
180 if (!prop)
181 continue;
182 bus_min = be32_to_cpup(prop);
183
184 bus = pci_find_bus(0, bus_min);
185 if (!bus) {
186 printk(KERN_ERR "Can't find a node for bus %s.\n",
187 np->full_name);
188 continue;
189 }
190
191 if (bus->self)
192 bus->self->dev.of_node = np;
193 else
194 bus->dev.of_node = np;
195
196 for_each_child_of_node(np, child) {
197 struct pci_dev *dev;
198 u32 devfn;
199
200 prop = of_get_property(child, "reg", NULL);
201 if (!prop)
202 continue;
203
204 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
205 dev = pci_get_slot(bus, devfn);
206 if (!dev)
207 continue;
208 dev->dev.of_node = child;
209 pci_dev_put(dev);
210 }
211 }
212} 188}
213#endif 189#endif
214 190
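
The refactor moves the bus matching out of x86_of_pci_init() and into the pcibios_get_phb_of_node() hook, so the PCI core attaches the of_node while it builds the bus. The match itself is only "first big-endian cell of the bus-range property equals the bus number"; a tiny sketch of that decoding against fake property bytes, with no real OF calls:

/*
 * Sketch of the "bus-range" match: the property is a pair of big-endian
 * 32-bit cells, and the host bridge node is the one whose first cell
 * equals the bus number. Device-tree access is faked with a byte array.
 */
#include <stdint.h>
#include <stdio.h>

/* equivalent of be32_to_cpup() for a raw property pointer */
static uint32_t be32_read(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

int main(void)
{
	/* a fake "bus-range" property: <0x00 0x3f> */
	static const uint8_t bus_range[8] = { 0, 0, 0, 0x00, 0, 0, 0, 0x3f };
	unsigned int bus_number = 0;
	uint32_t bus_min = be32_read(bus_range);

	if (bus_number == bus_min)
		printf("bus %u is served by this host bridge node\n",
		       bus_number);
	return 0;
}
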
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d2..19853ad8afc5 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
105} 105}
106 106
107/* 107/*
108 * We are returning from the irq stack and go to the previous one.
109 * If the previous stack is also in the irq stack, then bp in the first
110 * frame of the irq stack points to the previous, interrupted one.
111 * Otherwise we have another level of indirection: We first save
112 * the bp of the previous stack, then we switch the stack to the irq one
113 * and save a new bp that links to the previous one.
114 * (See save_args())
115 */
116static inline unsigned long
117fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
118 unsigned long *irq_stack, unsigned long *irq_stack_end)
119{
120#ifdef CONFIG_FRAME_POINTER
121 struct stack_frame *frame = (struct stack_frame *)bp;
122 unsigned long next;
123
124 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
125 if (!probe_kernel_address(&frame->next_frame, next))
126 return next;
127 else
128 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
129 "callchain\n", &frame->next_frame);
130 }
131#endif
132 return bp;
133}
134
135/*
136 * x86-64 can have up to three kernel stacks: 108 * x86-64 can have up to three kernel stacks:
137 * process stack 109 * process stack
138 * interrupt stack 110 * interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
155 task = current; 127 task = current;
156 128
157 if (!stack) { 129 if (!stack) {
158 stack = &dummy; 130 if (regs)
159 if (task && task != current) 131 stack = (unsigned long *)regs->sp;
132 else if (task && task != current)
160 stack = (unsigned long *)task->thread.sp; 133 stack = (unsigned long *)task->thread.sp;
134 else
135 stack = &dummy;
161 } 136 }
162 137
163 if (!bp) 138 if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
205 * pointer (index -1 to end) in the IRQ stack: 180 * pointer (index -1 to end) in the IRQ stack:
206 */ 181 */
207 stack = (unsigned long *) (irq_stack_end[-1]); 182 stack = (unsigned long *) (irq_stack_end[-1]);
208 bp = fixup_bp_irq_link(bp, stack, irq_stack,
209 irq_stack_end);
210 irq_stack_end = NULL; 183 irq_stack_end = NULL;
211 ops->stack(data, "EOI"); 184 ops->stack(data, "EOI");
212 continue; 185 continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0c989e..e13329d800c8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
9/* 9/*
10 * entry.S contains the system-call and fault low-level handling routines. 10 * entry.S contains the system-call and fault low-level handling routines.
11 * 11 *
12 * Some of this is documented in Documentation/x86/entry_64.txt
13 *
12 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
14 * 16 *
@@ -297,27 +299,26 @@ ENDPROC(native_usergs_sysret64)
297 .endm 299 .endm
298 300
299/* save partial stack frame */ 301/* save partial stack frame */
300 .pushsection .kprobes.text, "ax" 302 .macro SAVE_ARGS_IRQ
301ENTRY(save_args)
302 XCPT_FRAME
303 cld 303 cld
304 /* 304 /* start from rbp in pt_regs and jump over */
305 * start from rbp in pt_regs and jump over 305 movq_cfi rdi, RDI-RBP
306 * return address. 306 movq_cfi rsi, RSI-RBP
307 */ 307 movq_cfi rdx, RDX-RBP
308 movq_cfi rdi, RDI+8-RBP 308 movq_cfi rcx, RCX-RBP
309 movq_cfi rsi, RSI+8-RBP 309 movq_cfi rax, RAX-RBP
310 movq_cfi rdx, RDX+8-RBP 310 movq_cfi r8, R8-RBP
311 movq_cfi rcx, RCX+8-RBP 311 movq_cfi r9, R9-RBP
312 movq_cfi rax, RAX+8-RBP 312 movq_cfi r10, R10-RBP
313 movq_cfi r8, R8+8-RBP 313 movq_cfi r11, R11-RBP
314 movq_cfi r9, R9+8-RBP 314
315 movq_cfi r10, R10+8-RBP 315 /* Save rbp so that we can unwind from get_irq_regs() */
316 movq_cfi r11, R11+8-RBP 316 movq_cfi rbp, 0
317 317
318 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ 318 /* Save previous stack value */
319 movq_cfi rbp, 8 /* push %rbp */ 319 movq %rsp, %rsi
320 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 320
321 leaq -RBP(%rsp),%rdi /* arg1 for handler */
321 testl $3, CS(%rdi) 322 testl $3, CS(%rdi)
322 je 1f 323 je 1f
323 SWAPGS 324 SWAPGS
@@ -329,19 +330,14 @@ ENTRY(save_args)
329 */ 330 */
3301: incl PER_CPU_VAR(irq_count) 3311: incl PER_CPU_VAR(irq_count)
331 jne 2f 332 jne 2f
332 popq_cfi %rax /* move return address... */
333 mov PER_CPU_VAR(irq_stack_ptr),%rsp 333 mov PER_CPU_VAR(irq_stack_ptr),%rsp
334 EMPTY_FRAME 0 334 EMPTY_FRAME 0
335 pushq_cfi %rbp /* backlink for unwinder */ 335
336 pushq_cfi %rax /* ... to the new stack */ 3362: /* Store previous stack value */
337 /* 337 pushq %rsi
338 * We entered an interrupt context - irqs are off: 338 /* We entered an interrupt context - irqs are off: */
339 */ 339 TRACE_IRQS_OFF
3402: TRACE_IRQS_OFF 340 .endm
341 ret
342 CFI_ENDPROC
343END(save_args)
344 .popsection
345 341
346ENTRY(save_rest) 342ENTRY(save_rest)
347 PARTIAL_FRAME 1 REST_SKIP+8 343 PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +469,7 @@ ENTRY(system_call_after_swapgs)
473 * and short: 469 * and short:
474 */ 470 */
475 ENABLE_INTERRUPTS(CLBR_NONE) 471 ENABLE_INTERRUPTS(CLBR_NONE)
476 SAVE_ARGS 8,1 472 SAVE_ARGS 8,0
477 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 473 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
478 movq %rcx,RIP-ARGOFFSET(%rsp) 474 movq %rcx,RIP-ARGOFFSET(%rsp)
479 CFI_REL_OFFSET rip,RIP-ARGOFFSET 475 CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +504,7 @@ sysret_check:
508 TRACE_IRQS_ON 504 TRACE_IRQS_ON
509 movq RIP-ARGOFFSET(%rsp),%rcx 505 movq RIP-ARGOFFSET(%rsp),%rcx
510 CFI_REGISTER rip,rcx 506 CFI_REGISTER rip,rcx
511 RESTORE_ARGS 0,-ARG_SKIP,1 507 RESTORE_ARGS 1,-ARG_SKIP,0
512 /*CFI_REGISTER rflags,r11*/ 508 /*CFI_REGISTER rflags,r11*/
513 movq PER_CPU_VAR(old_rsp), %rsp 509 movq PER_CPU_VAR(old_rsp), %rsp
514 USERGS_SYSRET64 510 USERGS_SYSRET64
@@ -791,7 +787,7 @@ END(interrupt)
791 /* reserve pt_regs for scratch regs and rbp */ 787 /* reserve pt_regs for scratch regs and rbp */
792 subq $ORIG_RAX-RBP, %rsp 788 subq $ORIG_RAX-RBP, %rsp
793 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP 789 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
794 call save_args 790 SAVE_ARGS_IRQ
795 PARTIAL_FRAME 0 791 PARTIAL_FRAME 0
796 call \func 792 call \func
797 .endm 793 .endm
@@ -814,15 +810,14 @@ ret_from_intr:
814 DISABLE_INTERRUPTS(CLBR_NONE) 810 DISABLE_INTERRUPTS(CLBR_NONE)
815 TRACE_IRQS_OFF 811 TRACE_IRQS_OFF
816 decl PER_CPU_VAR(irq_count) 812 decl PER_CPU_VAR(irq_count)
817 leaveq
818 813
819 CFI_RESTORE rbp 814 /* Restore saved previous stack */
815 popq %rsi
816 leaq 16(%rsi), %rsp
817
820 CFI_DEF_CFA_REGISTER rsp 818 CFI_DEF_CFA_REGISTER rsp
821 CFI_ADJUST_CFA_OFFSET -8 819 CFI_ADJUST_CFA_OFFSET -16
822 820
823 /* we did not save rbx, restore only from ARGOFFSET */
824 addq $8, %rsp
825 CFI_ADJUST_CFA_OFFSET -8
826exit_intr: 821exit_intr:
827 GET_THREAD_INFO(%rcx) 822 GET_THREAD_INFO(%rcx)
828 testl $3,CS-ARGOFFSET(%rsp) 823 testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +853,7 @@ retint_restore_args: /* return to kernel space */
858 */ 853 */
859 TRACE_IRQS_IRETQ 854 TRACE_IRQS_IRETQ
860restore_args: 855restore_args:
861 RESTORE_ARGS 0,8,0 856 RESTORE_ARGS 1,8,1
862 857
863irq_return: 858irq_return:
864 INTERRUPT_RETURN 859 INTERRUPT_RETURN
@@ -991,11 +986,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
991apicinterrupt THERMAL_APIC_VECTOR \ 986apicinterrupt THERMAL_APIC_VECTOR \
992 thermal_interrupt smp_thermal_interrupt 987 thermal_interrupt smp_thermal_interrupt
993 988
994#ifdef CONFIG_X86_MCE
995apicinterrupt MCE_SELF_VECTOR \
996 mce_self_interrupt smp_mce_self_interrupt
997#endif
998
999#ifdef CONFIG_SMP 989#ifdef CONFIG_SMP
1000apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 990apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1001 call_function_single_interrupt smp_call_function_single_interrupt 991 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1121,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1121zeroentry coprocessor_error do_coprocessor_error 1111zeroentry coprocessor_error do_coprocessor_error
1122errorentry alignment_check do_alignment_check 1112errorentry alignment_check do_alignment_check
1123zeroentry simd_coprocessor_error do_simd_coprocessor_error 1113zeroentry simd_coprocessor_error do_simd_coprocessor_error
1114zeroentry emulate_vsyscall do_emulate_vsyscall
1115
1124 1116
1125 /* Reload gs selector with exception handling */ 1117 /* Reload gs selector with exception handling */
1126 /* edi: new selector */ 1118 /* edi: new selector */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 6781765b3a0d..4aecc54236a9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
4#include <linux/sysdev.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/i8253.h>
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/hpet.h> 9#include <linux/hpet.h>
9#include <linux/init.h> 10#include <linux/init.h>
@@ -12,8 +13,8 @@
12#include <linux/io.h> 13#include <linux/io.h>
13 14
14#include <asm/fixmap.h> 15#include <asm/fixmap.h>
15#include <asm/i8253.h>
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17#include <asm/time.h>
17 18
18#define HPET_MASK CLOCKSOURCE_MASK(32) 19#define HPET_MASK CLOCKSOURCE_MASK(32)
19 20
@@ -71,7 +72,7 @@ static inline void hpet_set_mapping(void)
71{ 72{
72 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); 73 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
73#ifdef CONFIG_X86_64 74#ifdef CONFIG_X86_64
74 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); 75 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
75#endif 76#endif
76} 77}
77 78
@@ -738,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
738 return (cycle_t)hpet_readl(HPET_COUNTER); 739 return (cycle_t)hpet_readl(HPET_COUNTER);
739} 740}
740 741
741#ifdef CONFIG_X86_64
742static cycle_t __vsyscall_fn vread_hpet(void)
743{
744 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
745}
746#endif
747
748static struct clocksource clocksource_hpet = { 742static struct clocksource clocksource_hpet = {
749 .name = "hpet", 743 .name = "hpet",
750 .rating = 250, 744 .rating = 250,
@@ -753,7 +747,7 @@ static struct clocksource clocksource_hpet = {
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 747 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
754 .resume = hpet_resume_counter, 748 .resume = hpet_resume_counter,
755#ifdef CONFIG_X86_64 749#ifdef CONFIG_X86_64
756 .vread = vread_hpet, 750 .archdata = { .vclock_mode = VCLOCK_HPET },
757#endif 751#endif
758}; 752};
759 753
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index fb66dc9e36cb..f2b96de3c7c1 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,113 +3,24 @@
3 * 3 *
4 */ 4 */
5#include <linux/clockchips.h> 5#include <linux/clockchips.h>
6#include <linux/interrupt.h>
7#include <linux/spinlock.h>
8#include <linux/jiffies.h>
9#include <linux/module.h> 6#include <linux/module.h>
10#include <linux/timex.h> 7#include <linux/timex.h>
11#include <linux/delay.h> 8#include <linux/i8253.h>
12#include <linux/init.h>
13#include <linux/io.h>
14 9
15#include <asm/i8253.h>
16#include <asm/hpet.h> 10#include <asm/hpet.h>
11#include <asm/time.h>
17#include <asm/smp.h> 12#include <asm/smp.h>
18 13
19DEFINE_RAW_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock);
21
22/* 14/*
23 * HPET replaces the PIT, when enabled. So we need to know, which of 15 * HPET replaces the PIT, when enabled. So we need to know, which of
24 * the two timers is used 16 * the two timers is used
25 */ 17 */
26struct clock_event_device *global_clock_event; 18struct clock_event_device *global_clock_event;
27 19
28/*
29 * Initialize the PIT timer.
30 *
31 * This is also called after resume to bring the PIT into operation again.
32 */
33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt)
35{
36 raw_spin_lock(&i8253_lock);
37
38 switch (mode) {
39 case CLOCK_EVT_MODE_PERIODIC:
40 /* binary, mode 2, LSB/MSB, ch 0 */
41 outb_pit(0x34, PIT_MODE);
42 outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
43 outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
44 break;
45
46 case CLOCK_EVT_MODE_SHUTDOWN:
47 case CLOCK_EVT_MODE_UNUSED:
48 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
49 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
50 outb_pit(0x30, PIT_MODE);
51 outb_pit(0, PIT_CH0);
52 outb_pit(0, PIT_CH0);
53 }
54 break;
55
56 case CLOCK_EVT_MODE_ONESHOT:
57 /* One shot setup */
58 outb_pit(0x38, PIT_MODE);
59 break;
60
61 case CLOCK_EVT_MODE_RESUME:
62 /* Nothing to do here */
63 break;
64 }
65 raw_spin_unlock(&i8253_lock);
66}
67
68/*
69 * Program the next event in oneshot mode
70 *
71 * Delta is given in PIT ticks
72 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{
75 raw_spin_lock(&i8253_lock);
76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 raw_spin_unlock(&i8253_lock);
79
80 return 0;
81}
82
83/*
84 * On UP the PIT can serve all of the possible timer functions. On SMP systems
85 * it can be solely used for the global tick.
86 *
87 * The profiling and update capabilities are switched off once the local apic is
88 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
89 * !using_apic_timer decisions in do_timer_interrupt_hook()
90 */
91static struct clock_event_device pit_ce = {
92 .name = "pit",
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event,
96 .irq = 0,
97};
98
99/*
100 * Initialize the conversion factor and the min/max deltas of the clock event
101 * structure and register the clock event source with the framework.
102 */
103void __init setup_pit_timer(void) 20void __init setup_pit_timer(void)
104{ 21{
105 /* 22 clockevent_i8253_init(true);
106 * Start pit with the boot cpu mask and make it global after the 23 global_clock_event = &i8253_clockevent;
107 * IO_APIC has been initialized.
108 */
109 pit_ce.cpumask = cpumask_of(smp_processor_id());
110
111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
112 global_clock_event = &pit_ce;
113} 24}
114 25
115#ifndef CONFIG_X86_64 26#ifndef CONFIG_X86_64
@@ -123,7 +34,7 @@ static int __init init_pit_clocksource(void)
123 * - when local APIC timer is active (PIT is switched off) 34 * - when local APIC timer is active (PIT is switched off)
124 */ 35 */
125 if (num_possible_cpus() > 1 || is_hpet_enabled() || 36 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 37 i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
127 return 0; 38 return 0;
128 39
129 return clocksource_i8253_init(); 40 return clocksource_i8253_init();
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4ef993e..f09d4bbe2d2d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -272,9 +272,6 @@ static void __init apic_intr_init(void)
272#ifdef CONFIG_X86_MCE_THRESHOLD 272#ifdef CONFIG_X86_MCE_THRESHOLD
273 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 273 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
274#endif 274#endif
275#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
276 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
277#endif
278 275
279#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 276#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
280 /* self generated IPI for local APIC timer */ 277 /* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b5..00354d4919a9 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
608 return register_die_notifier(&kgdb_notifier); 608 return register_die_notifier(&kgdb_notifier);
609} 609}
610 610
611static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, 611static void kgdb_hw_overflow_handler(struct perf_event *event,
612 struct perf_sample_data *data, struct pt_regs *regs) 612 struct perf_sample_data *data, struct pt_regs *regs)
613{ 613{
614 struct task_struct *tsk = current; 614 struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
638 for (i = 0; i < HBP_NUM; i++) { 638 for (i = 0; i < HBP_NUM; i++) {
639 if (breakinfo[i].pev) 639 if (breakinfo[i].pev)
640 continue; 640 continue;
641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
642 if (IS_ERR((void * __force)breakinfo[i].pev)) { 642 if (IS_ERR((void * __force)breakinfo[i].pev)) {
643 printk(KERN_ERR "kgdb: Could not allocate hw" 643 printk(KERN_ERR "kgdb: Could not allocate hw"
644 "breakpoints\nDisabling the kernel debugger\n"); 644 "breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab16..591be0ee1934 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_CONTAINER_SECTION_HDR 8 69#define SECTION_HDR_SIZE 8
70#define UCODE_CONTAINER_HEADER_SIZE 12 70#define CONTAINER_HDR_SZ 12
71 71
72static struct equiv_cpu_entry *equiv_cpu_table; 72static struct equiv_cpu_entry *equiv_cpu_table;
73 73
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
158{ 158{
159 struct cpuinfo_x86 *c = &cpu_data(cpu); 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
160 unsigned int max_size, actual_size; 160 u32 max_size, actual_size;
161 161
162#define F1XH_MPB_MAX_SIZE 2048 162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824 163#define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
175 break; 175 break;
176 } 176 }
177 177
178 actual_size = buf[4] + (buf[5] << 8); 178 actual_size = *(u32 *)(buf + 4);
179 179
180 if (actual_size > size || actual_size > max_size) { 180 if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
181 pr_err("section size mismatch\n"); 181 pr_err("section size mismatch\n");
182 return 0; 182 return 0;
183 } 183 }
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
191 struct microcode_header_amd *mc = NULL; 191 struct microcode_header_amd *mc = NULL;
192 unsigned int actual_size = 0; 192 unsigned int actual_size = 0;
193 193
194 if (buf[0] != UCODE_UCODE_TYPE) { 194 if (*(u32 *)buf != UCODE_UCODE_TYPE) {
195 pr_err("invalid type field in container file section header\n"); 195 pr_err("invalid type field in container file section header\n");
196 goto out; 196 goto out;
197 } 197 }
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
204 if (!mc) 204 if (!mc)
205 goto out; 205 goto out;
206 206
207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); 207 get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; 208 *mc_size = actual_size + SECTION_HDR_SIZE;
209 209
210out: 210out:
211 return mc; 211 return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
229 return -ENOMEM; 229 return -ENOMEM;
230 } 230 }
231 231
232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size); 232 get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
233 233
234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 /* add header length */
235 return size + CONTAINER_HDR_SZ;
235} 236}
236 237
237static void free_equiv_cpu_table(void) 238static void free_equiv_cpu_table(void)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 807c2a2b80f1..82528799c5de 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
528 return ret; 528 return ret;
529} 529}
530 530
531static void ptrace_triggered(struct perf_event *bp, int nmi, 531static void ptrace_triggered(struct perf_event *bp,
532 struct perf_sample_data *data, 532 struct perf_sample_data *data,
533 struct pt_regs *regs) 533 struct pt_regs *regs)
534{ 534{
@@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
715 attr.bp_type = HW_BREAKPOINT_W; 715 attr.bp_type = HW_BREAKPOINT_W;
716 attr.disabled = 1; 716 attr.disabled = 1;
717 717
718 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); 718 bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
719 NULL, tsk);
719 720
720 /* 721 /*
721 * CHECKME: the previous code returned -EIO if the addr wasn't 722 * CHECKME: the previous code returned -EIO if the addr wasn't
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 8bbe8c56916d..b78643d0f9a5 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,7 +10,7 @@
10 10
11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) 11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
12{ 12{
13 u8 config, rev; 13 u8 config;
14 u16 word; 14 u16 word;
15 15
16 /* BIOS may enable hardware IRQ balancing for 16 /* BIOS may enable hardware IRQ balancing for
@@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
18 * based platforms. 18 * based platforms.
19 * Disable SW irqbalance/affinity on those platforms. 19 * Disable SW irqbalance/affinity on those platforms.
20 */ 20 */
21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 21 if (dev->revision > 0x9)
22 if (rev > 0x9)
23 return; 22 return;
24 23
25 /* enable access to config space*/ 24 /* enable access to config space*/
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0c016f727695..9242436e9937 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -294,6 +294,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), 294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 }, 295 },
296 }, 296 },
297 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot,
299 .ident = "Acer Aspire One A110",
300 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
302 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
303 },
304 },
297 { } 305 { }
298}; 306};
299 307
@@ -411,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
411 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), 419 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
412 }, 420 },
413 }, 421 },
422 { /* Handle problems with rebooting on the Latitude E6320. */
423 .callback = set_pci_reboot,
424 .ident = "Dell Latitude E6320",
425 .matches = {
426 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
427 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
428 },
429 },
430 { /* Handle problems with rebooting on the Latitude E5420. */
431 .callback = set_pci_reboot,
432 .ident = "Dell Latitude E5420",
433 .matches = {
434 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
435 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
436 },
437 },
438 { /* Handle problems with rebooting on the Latitude E6420. */
439 .callback = set_pci_reboot,
440 .ident = "Dell Latitude E6420",
441 .matches = {
442 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
443 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
444 },
445 },
414 { } 446 { }
415}; 447};
416 448
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11c..36818f8ec2be 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -97,6 +97,8 @@ relocate_kernel:
97 ret 97 ret
98 98
99identity_mapped: 99identity_mapped:
100 /* set return address to 0 if not preserving context */
101 pushl $0
100 /* store the start address on the stack */ 102 /* store the start address on the stack */
101 pushl %edx 103 pushl %edx
102 104
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d476..7a6f3b3be3cf 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -100,6 +100,8 @@ relocate_kernel:
100 ret 100 ret
101 101
102identity_mapped: 102identity_mapped:
103 /* set return address to 0 if not preserving context */
104 pushq $0
103 /* store the start address on the stack */ 105 /* store the start address on the stack */
104 pushq %rdx 106 pushq %rdx
105 107
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 40a24932a8a1..54ddaeb221c1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
485asmlinkage int 485asmlinkage int
486sys_sigsuspend(int history0, int history1, old_sigset_t mask) 486sys_sigsuspend(int history0, int history1, old_sigset_t mask)
487{ 487{
488 mask &= _BLOCKABLE; 488 sigset_t blocked;
489 spin_lock_irq(&current->sighand->siglock); 489
490 current->saved_sigmask = current->blocked; 490 current->saved_sigmask = current->blocked;
491 siginitset(&current->blocked, mask); 491
492 recalc_sigpending(); 492 mask &= _BLOCKABLE;
493 spin_unlock_irq(&current->sighand->siglock); 493 siginitset(&blocked, mask);
494 set_current_blocked(&blocked);
494 495
495 current->state = TASK_INTERRUPTIBLE; 496 current->state = TASK_INTERRUPTIBLE;
496 schedule(); 497 schedule();
497 set_restore_sigmask();
498 498
499 set_restore_sigmask();
499 return -ERESTARTNOHAND; 500 return -ERESTARTNOHAND;
500} 501}
501 502
@@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
572 goto badframe; 573 goto badframe;
573 574
574 sigdelsetmask(&set, ~_BLOCKABLE); 575 sigdelsetmask(&set, ~_BLOCKABLE);
575 spin_lock_irq(&current->sighand->siglock); 576 set_current_blocked(&set);
576 current->blocked = set;
577 recalc_sigpending();
578 spin_unlock_irq(&current->sighand->siglock);
579 577
580 if (restore_sigcontext(regs, &frame->sc, &ax)) 578 if (restore_sigcontext(regs, &frame->sc, &ax))
581 goto badframe; 579 goto badframe;
@@ -653,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
653 651
654static int 652static int
655setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 653setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
656 sigset_t *set, struct pt_regs *regs) 654 struct pt_regs *regs)
657{ 655{
658 int usig = signr_convert(sig); 656 int usig = signr_convert(sig);
657 sigset_t *set = &current->blocked;
659 int ret; 658 int ret;
660 659
660 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
661 set = &current->saved_sigmask;
662
661 /* Set up the stack frame */ 663 /* Set up the stack frame */
662 if (is_ia32) { 664 if (is_ia32) {
663 if (ka->sa.sa_flags & SA_SIGINFO) 665 if (ka->sa.sa_flags & SA_SIGINFO)
@@ -672,12 +674,13 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
672 return -EFAULT; 674 return -EFAULT;
673 } 675 }
674 676
677 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
675 return ret; 678 return ret;
676} 679}
677 680
678static int 681static int
679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
680 sigset_t *oldset, struct pt_regs *regs) 683 struct pt_regs *regs)
681{ 684{
682 sigset_t blocked; 685 sigset_t blocked;
683 int ret; 686 int ret;
@@ -712,20 +715,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
712 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 715 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
713 regs->flags &= ~X86_EFLAGS_TF; 716 regs->flags &= ~X86_EFLAGS_TF;
714 717
715 ret = setup_rt_frame(sig, ka, info, oldset, regs); 718 ret = setup_rt_frame(sig, ka, info, regs);
716 719
717 if (ret) 720 if (ret)
718 return ret; 721 return ret;
719 722
720#ifdef CONFIG_X86_64
721 /*
722 * This has nothing to do with segment registers,
723 * despite the name. This magic affects uaccess.h
724 * macros' behavior. Reset it to the normal setting.
725 */
726 set_fs(USER_DS);
727#endif
728
729 /* 723 /*
730 * Clear the direction flag as per the ABI for function entry. 724 * Clear the direction flag as per the ABI for function entry.
731 */ 725 */
@@ -767,7 +761,6 @@ static void do_signal(struct pt_regs *regs)
767 struct k_sigaction ka; 761 struct k_sigaction ka;
768 siginfo_t info; 762 siginfo_t info;
769 int signr; 763 int signr;
770 sigset_t *oldset;
771 764
772 /* 765 /*
773 * We want the common case to go fast, which is why we may in certain 766 * We want the common case to go fast, which is why we may in certain
@@ -779,23 +772,10 @@ static void do_signal(struct pt_regs *regs)
779 if (!user_mode(regs)) 772 if (!user_mode(regs))
780 return; 773 return;
781 774
782 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
783 oldset = &current->saved_sigmask;
784 else
785 oldset = &current->blocked;
786
787 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 775 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
788 if (signr > 0) { 776 if (signr > 0) {
789 /* Whee! Actually deliver the signal. */ 777 /* Whee! Actually deliver the signal. */
790 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 778 handle_signal(signr, &info, &ka, regs);
791 /*
792 * A signal was successfully delivered; the saved
793 * sigmask will have been stored in the signal frame,
794 * and will be restored by sigreturn, so we can simply
795 * clear the TS_RESTORE_SIGMASK flag.
796 */
797 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
798 }
799 return; 779 return;
800 } 780 }
801 781
@@ -823,7 +803,7 @@ static void do_signal(struct pt_regs *regs)
823 */ 803 */
824 if (current_thread_info()->status & TS_RESTORE_SIGMASK) { 804 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
825 current_thread_info()->status &= ~TS_RESTORE_SIGMASK; 805 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
826 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); 806 set_current_blocked(&current->saved_sigmask);
827 } 807 }
828} 808}
829 809
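
The signal.c changes concentrate the sigmask bookkeeping: setup_rt_frame() itself now picks the saved mask when TS_RESTORE_SIGMASK is pending and clears the flag once the frame has been written, while the open-coded siglock sequences are replaced by set_current_blocked(). A simplified single-task sketch of that flow, with stand-in fields rather than the real thread_info flags:

/*
 * Sketch of the sigmask bookkeeping after this refactor: the frame is
 * built from the saved mask when a restore is pending, and the flag is
 * cleared only once the frame has been written. Stand-in model.
 */
#include <stdbool.h>
#include <stdio.h>

struct task {
	unsigned long blocked;
	unsigned long saved_sigmask;
	bool          restore_sigmask;   /* stands in for TS_RESTORE_SIGMASK */
};

static int setup_rt_frame(struct task *t, unsigned long *frame_mask)
{
	/* pick the mask that must be restored on sigreturn */
	*frame_mask = t->restore_sigmask ? t->saved_sigmask : t->blocked;

	/* ... write the user-space frame here; pretend it succeeded ... */

	t->restore_sigmask = false;      /* the frame now owns the restore */
	return 0;
}

int main(void)
{
	struct task t = { .blocked = 0x2, .saved_sigmask = 0x1,
			  .restore_sigmask = true };
	unsigned long mask;

	setup_rt_frame(&t, &mask);
	printf("mask stored in frame: %#lx (restore flag now %d)\n",
	       mask, t.restore_sigmask);
	return 0;
}
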
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9fd3137230d4..9f548cb4a958 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -438,7 +438,7 @@ static void impress_friends(void)
438void __inquire_remote_apic(int apicid) 438void __inquire_remote_apic(int apicid)
439{ 439{
440 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 440 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
441 char *names[] = { "ID", "VERSION", "SPIV" }; 441 const char * const names[] = { "ID", "VERSION", "SPIV" };
442 int timeout; 442 int timeout;
443 u32 status; 443 u32 status;
444 444
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 55d9bc03f696..fdd0c6430e5a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
66} 66}
67EXPORT_SYMBOL_GPL(save_stack_trace); 67EXPORT_SYMBOL_GPL(save_stack_trace);
68 68
69void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) 69void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
70{ 70{
71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); 71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
72 if (trace->nr_entries < trace->max_entries) 72 if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 30ac65df7d4e..e07a2fc876b9 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
36#include <asm/bootparam.h> 36#include <asm/bootparam.h>
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
39#include <asm/swiotlb.h>
39#include <asm/fixmap.h> 40#include <asm/fixmap.h>
40#include <asm/proto.h> 41#include <asm/proto.h>
41#include <asm/setup.h> 42#include <asm/setup.h>
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 00cbb272627f..5a64d057be57 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,13 +11,13 @@
11 11
12#include <linux/clockchips.h> 12#include <linux/clockchips.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/i8253.h>
14#include <linux/time.h> 15#include <linux/time.h>
15#include <linux/mca.h> 16#include <linux/mca.h>
16 17
17#include <asm/vsyscall.h> 18#include <asm/vsyscall.h>
18#include <asm/x86_init.h> 19#include <asm/x86_init.h>
19#include <asm/i8259.h> 20#include <asm/i8259.h>
20#include <asm/i8253.h>
21#include <asm/timer.h> 21#include <asm/timer.h>
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9de..fbc097a085ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -872,6 +872,12 @@ void __init trap_init(void)
872 set_bit(SYSCALL_VECTOR, used_vectors); 872 set_bit(SYSCALL_VECTOR, used_vectors);
873#endif 873#endif
874 874
875#ifdef CONFIG_X86_64
876 BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
877 set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
878 set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
879#endif
880
875 /* 881 /*
876 * Should be a barrier for any external CPU state: 882 * Should be a barrier for any external CPU state:
877 */ 883 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6cc6922262af..db483369f10b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,7 +5,6 @@
5#include <linux/timer.h> 5#include <linux/timer.h>
6#include <linux/acpi_pmtmr.h> 6#include <linux/acpi_pmtmr.h>
7#include <linux/cpufreq.h> 7#include <linux/cpufreq.h>
8#include <linux/dmi.h>
9#include <linux/delay.h> 8#include <linux/delay.h>
10#include <linux/clocksource.h> 9#include <linux/clocksource.h>
11#include <linux/percpu.h> 10#include <linux/percpu.h>
@@ -777,7 +776,7 @@ static struct clocksource clocksource_tsc = {
777 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 776 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
778 CLOCK_SOURCE_MUST_VERIFY, 777 CLOCK_SOURCE_MUST_VERIFY,
779#ifdef CONFIG_X86_64 778#ifdef CONFIG_X86_64
780 .vread = vread_tsc, 779 .archdata = { .vclock_mode = VCLOCK_TSC },
781#endif 780#endif
782}; 781};
783 782
@@ -800,27 +799,6 @@ void mark_tsc_unstable(char *reason)
800 799
801EXPORT_SYMBOL_GPL(mark_tsc_unstable); 800EXPORT_SYMBOL_GPL(mark_tsc_unstable);
802 801
803static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
804{
805 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
806 d->ident);
807 tsc_unstable = 1;
808 return 0;
809}
810
811/* List of systems that have known TSC problems */
812static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
813 {
814 .callback = dmi_mark_tsc_unstable,
815 .ident = "IBM Thinkpad 380XD",
816 .matches = {
817 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
818 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
819 },
820 },
821 {}
822};
823
824static void __init check_system_tsc_reliable(void) 802static void __init check_system_tsc_reliable(void)
825{ 803{
826#ifdef CONFIG_MGEODE_LX 804#ifdef CONFIG_MGEODE_LX
@@ -1010,8 +988,6 @@ void __init tsc_init(void)
1010 lpj_fine = lpj; 988 lpj_fine = lpj;
1011 989
1012 use_tsc_delay(); 990 use_tsc_delay();
1013 /* Check and install the TSC clocksource */
1014 dmi_check_system(bad_tsc_dmi_table);
1015 991
1016 if (unsynchronized_tsc()) 992 if (unsynchronized_tsc())
1017 mark_tsc_unstable("TSCs unsynchronized"); 993 mark_tsc_unstable("TSCs unsynchronized");
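With the .vread function pointer gone, the TSC clocksource advertises its user-readability through archdata.vclock_mode, and update_vsyscall() (in the vsyscall_64.c hunks below) copies that mode into vsyscall_gtod_data. A hedged sketch of how a user-space reader is assumed to dispatch on the mode instead of calling through a kernel pointer (only VCLOCK_TSC appears in this diff; the fallback behaviour is an assumption):

notrace static cycle_t vread_cycles_sketch(void)
{
	switch (VVAR(vsyscall_gtod_data).clock.vclock_mode) {
	case VCLOCK_TSC:
		/* see the read_tsc_clamped() sketch after vread_tsc_64.c below */
		return read_tsc_clamped();
	default:
		/* no user-space clock: the caller falls back to a real syscall */
		return 0;
	}
}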
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 89aed99aafce..4aa9c54a9b76 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -161,50 +161,47 @@ SECTIONS
161 161
162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
170 164
171 . = ALIGN(4096); 165 . = ALIGN(4096);
172 __vsyscall_0 = .; 166 __vsyscall_0 = .;
173 167
174 . = VSYSCALL_ADDR; 168 . = VSYSCALL_ADDR;
175 .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { 169 .vsyscall : AT(VLOAD(.vsyscall)) {
176 *(.vsyscall_0) 170 *(.vsyscall_0)
177 } :user
178 171
179 . = ALIGN(L1_CACHE_BYTES); 172 . = 1024;
180 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
181 *(.vsyscall_fn)
182 }
183
184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
185 *(.vsyscall_1) 173 *(.vsyscall_1)
186 }
187 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
188 *(.vsyscall_2)
189 }
190 174
191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 175 . = 2048;
192 *(.vsyscall_3) 176 *(.vsyscall_2)
193 }
194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198 177
199 . = __vsyscall_0 + PAGE_SIZE; 178 . = 4096; /* Pad the whole page. */
179 } :user =0xcc
180 . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
200 181
201#undef VSYSCALL_ADDR 182#undef VSYSCALL_ADDR
202#undef VLOAD_OFFSET 183#undef VLOAD_OFFSET
203#undef VLOAD 184#undef VLOAD
204#undef VVIRT_OFFSET 185#undef VVIRT_OFFSET
205#undef VVIRT 186#undef VVIRT
187
188 __vvar_page = .;
189
190 .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
191
192 /* Place all vvars at the offsets in asm/vvar.h. */
193#define EMIT_VVAR(name, offset) \
194 . = offset; \
195 *(.vvar_ ## name)
196#define __VVAR_KERNEL_LDS
197#include <asm/vvar.h>
198#undef __VVAR_KERNEL_LDS
206#undef EMIT_VVAR 199#undef EMIT_VVAR
207 200
201 } :data
202
203 . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
204
208#endif /* CONFIG_X86_64 */ 205#endif /* CONFIG_X86_64 */
209 206
210 /* Init code and data - will be freed after init */ 207 /* Init code and data - will be freed after init */
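The simplified script collapses the per-slot output sections into a single .vsyscall section: three 1 KB slots at fixed offsets in one user-mapped page, the remainder filled with 0xcc (int3) by the "=0xcc" fill, and the vvar variables moved out to their own __vvar_page. Assuming the conventional fixmap base of 0xffffffffff600000 for VSYSCALL_ADDR (the value itself is not shown in this diff), the slot addresses work out to:

#define VSYSCALL_BASE	0xffffffffff600000UL	/* assumed VSYSCALL_ADDR */

static const unsigned long vgettimeofday_addr = VSYSCALL_BASE + 0 * 1024; /* .vsyscall_0 */
static const unsigned long vtime_addr         = VSYSCALL_BASE + 1 * 1024; /* .vsyscall_1 */
static const unsigned long vgetcpu_addr       = VSYSCALL_BASE + 2 * 1024; /* .vsyscall_2 */
/* Any other offset in the page hits 0xcc padding and traps. */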
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
deleted file mode 100644
index a81aa9e9894c..000000000000
--- a/arch/x86/kernel/vread_tsc_64.c
+++ /dev/null
@@ -1,36 +0,0 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
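The deleted barrier-then-clamp logic does not go away: it is assumed to move to the vDSO side, selected when vclock_mode == VCLOCK_TSC. A minimal sketch of the same monotonicity clamp, reusing the names from the file removed above:

static notrace cycle_t read_tsc_clamped(void)
{
	cycle_t ret, last;

	rdtsc_barrier();		/* order rdtsc against earlier loads */
	ret  = (cycle_t)vget_cycles();	/* raw TSC read */
	last = VVAR(vsyscall_gtod_data).clock.cycle_last;

	if (likely(ret >= last))
		return ret;

	/* A slightly stale TSC on this CPU must not make time go backwards. */
	return last;
}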
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3e682184d76c..dda7dff9cef7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 * Copyright 2003 Andi Kleen, SuSE Labs. 3 * Copyright 2003 Andi Kleen, SuSE Labs.
4 * 4 *
5 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
6 *
5 * Thanks to hpa@transmeta.com for some useful hint. 7 * Thanks to hpa@transmeta.com for some useful hint.
6 * Special thanks to Ingo Molnar for his early experience with 8 * Special thanks to Ingo Molnar for his early experience with
7 * a different vsyscall implementation for Linux/IA32 and for the name. 9 * a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
11 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid 13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
12 * jumping out of line if necessary. We cannot add more with this 14 * jumping out of line if necessary. We cannot add more with this
13 * mechanism because older kernels won't return -ENOSYS. 15 * mechanism because older kernels won't return -ENOSYS.
14 * If we want more than four we need a vDSO.
15 * 16 *
16 * Note: the concept clashes with user mode linux. If you use UML and 17 * Note: the concept clashes with user mode linux. UML users should
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 18 * use the vDSO.
18 */ 19 */
19 20
20/* Disable profiling for userspace code: */ 21/* Disable profiling for userspace code: */
@@ -32,9 +33,12 @@
32#include <linux/cpu.h> 33#include <linux/cpu.h>
33#include <linux/smp.h> 34#include <linux/smp.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/syscalls.h>
37#include <linux/ratelimit.h>
35 38
36#include <asm/vsyscall.h> 39#include <asm/vsyscall.h>
37#include <asm/pgtable.h> 40#include <asm/pgtable.h>
41#include <asm/compat.h>
38#include <asm/page.h> 42#include <asm/page.h>
39#include <asm/unistd.h> 43#include <asm/unistd.h>
40#include <asm/fixmap.h> 44#include <asm/fixmap.h>
@@ -44,16 +48,12 @@
44#include <asm/desc.h> 48#include <asm/desc.h>
45#include <asm/topology.h> 49#include <asm/topology.h>
46#include <asm/vgtod.h> 50#include <asm/vgtod.h>
47 51#include <asm/traps.h>
48#define __vsyscall(nr) \
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory"
51 52
52DEFINE_VVAR(int, vgetcpu_mode); 53DEFINE_VVAR(int, vgetcpu_mode);
53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = 54DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54{ 55{
55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 56 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
56 .sysctl_enabled = 1,
57}; 57};
58 58
59void update_vsyscall_tz(void) 59void update_vsyscall_tz(void)
@@ -72,179 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
72 unsigned long flags; 72 unsigned long flags;
73 73
74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); 74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
75
75 /* copy vsyscall data */ 76 /* copy vsyscall data */
76 vsyscall_gtod_data.clock.vread = clock->vread; 77 vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
77 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 78 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
78 vsyscall_gtod_data.clock.mask = clock->mask; 79 vsyscall_gtod_data.clock.mask = clock->mask;
79 vsyscall_gtod_data.clock.mult = mult; 80 vsyscall_gtod_data.clock.mult = mult;
80 vsyscall_gtod_data.clock.shift = clock->shift; 81 vsyscall_gtod_data.clock.shift = clock->shift;
81 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 82 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
82 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
83 vsyscall_gtod_data.wall_to_monotonic = *wtm; 84 vsyscall_gtod_data.wall_to_monotonic = *wtm;
84 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 85 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
86
85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 87 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
86} 88}
87 89
88/* RED-PEN may want to readd seq locking, but then the variable should be 90static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
89 * write-once. 91 const char *message)
90 */
91static __always_inline void do_get_tz(struct timezone * tz)
92{ 92{
93 *tz = VVAR(vsyscall_gtod_data).sys_tz; 93 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
94} 94 struct task_struct *tsk;
95 95
96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96 if (!show_unhandled_signals || !__ratelimit(&rs))
97{ 97 return;
98 int ret;
99 asm volatile("syscall"
100 : "=a" (ret)
101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
102 : __syscall_clobber );
103 return ret;
104}
105 98
106static __always_inline long time_syscall(long *t) 99 tsk = current;
107{
108 long secs;
109 asm volatile("syscall"
110 : "=a" (secs)
111 : "0" (__NR_time),"D" (t) : __syscall_clobber);
112 return secs;
113}
114 100
115static __always_inline void do_vgettimeofday(struct timeval * tv) 101 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
116{ 102 level, tsk->comm, task_pid_nr(tsk),
117 cycle_t now, base, mask, cycle_delta; 103 message, regs->ip - 2, regs->cs,
118 unsigned seq; 104 regs->sp, regs->ax, regs->si, regs->di);
119 unsigned long mult, shift, nsec;
120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
123
124 vread = VVAR(vsyscall_gtod_data).clock.vread;
125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
127 gettimeofday(tv,NULL);
128 return;
129 }
130
131 now = vread();
132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
133 mask = VVAR(vsyscall_gtod_data).clock.mask;
134 mult = VVAR(vsyscall_gtod_data).clock.mult;
135 shift = VVAR(vsyscall_gtod_data).clock.shift;
136
137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
140
141 /* calculate interval: */
142 cycle_delta = (now - base) & mask;
143 /* convert to nsecs: */
144 nsec += (cycle_delta * mult) >> shift;
145
146 while (nsec >= NSEC_PER_SEC) {
147 tv->tv_sec += 1;
148 nsec -= NSEC_PER_SEC;
149 }
150 tv->tv_usec = nsec / NSEC_PER_USEC;
151} 105}
152 106
153int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 107static int addr_to_vsyscall_nr(unsigned long addr)
154{ 108{
155 if (tv) 109 int nr;
156 do_vgettimeofday(tv);
157 if (tz)
158 do_get_tz(tz);
159 return 0;
160}
161 110
162/* This will break when the xtime seconds get inaccurate, but that is 111 if ((addr & ~0xC00UL) != VSYSCALL_START)
163 * unlikely */ 112 return -EINVAL;
164time_t __vsyscall(1) vtime(time_t *t)
165{
166 unsigned seq;
167 time_t result;
168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
169 return time_syscall(t);
170 113
171 do { 114 nr = (addr & 0xC00UL) >> 10;
172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); 115 if (nr >= 3)
116 return -EINVAL;
173 117
174 result = VVAR(vsyscall_gtod_data).wall_time_sec; 118 return nr;
119}
175 120
176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); 121void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
122{
123 struct task_struct *tsk;
124 unsigned long caller;
125 int vsyscall_nr;
126 long ret;
127
128 local_irq_enable();
129
130 /*
131 * Real 64-bit user mode code has cs == __USER_CS. Anything else
132 * is bogus.
133 */
134 if (regs->cs != __USER_CS) {
135 /*
136 * If we trapped from kernel mode, we might as well OOPS now
137 * instead of returning to some random address and OOPSing
138 * then.
139 */
140 BUG_ON(!user_mode(regs));
141
142 /* Compat mode and non-compat 32-bit CS should both segfault. */
143 warn_bad_vsyscall(KERN_WARNING, regs,
144 "illegal int 0xcc from 32-bit mode");
145 goto sigsegv;
146 }
177 147
178 if (t) 148 /*
179 *t = result; 149 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
180 return result; 150 * and int 0xcc is two bytes long.
181} 151 */
152 vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
153 if (vsyscall_nr < 0) {
154 warn_bad_vsyscall(KERN_WARNING, regs,
155 "illegal int 0xcc (exploit attempt?)");
156 goto sigsegv;
157 }
182 158
183/* Fast way to get current CPU and node. 159 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
184 This helps to do per node and per CPU caches in user space. 160 warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
185 The result is not guaranteed without CPU affinity, but usually 161 goto sigsegv;
186 works out because the scheduler tries to keep a thread on the same 162 }
187 CPU.
188 163
189 tcache must point to a two element sized long array. 164 tsk = current;
190 All arguments can be NULL. */ 165 if (seccomp_mode(&tsk->seccomp))
191long __vsyscall(2) 166 do_exit(SIGKILL);
192vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 167
193{ 168 switch (vsyscall_nr) {
194 unsigned int p; 169 case 0:
195 unsigned long j = 0; 170 ret = sys_gettimeofday(
196 171 (struct timeval __user *)regs->di,
197 /* Fast cache - only recompute value once per jiffies and avoid 172 (struct timezone __user *)regs->si);
198 relatively costly rdtscp/cpuid otherwise. 173 break;
199 This works because the scheduler usually keeps the process 174
200 on the same CPU and this syscall doesn't guarantee its 175 case 1:
201 results anyways. 176 ret = sys_time((time_t __user *)regs->di);
202 We do this here because otherwise user space would do it on 177 break;
203 its own in a likely inferior way (no access to jiffies). 178
204 If you don't like it pass NULL. */ 179 case 2:
205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { 180 ret = sys_getcpu((unsigned __user *)regs->di,
206 p = tcache->blob[1]; 181 (unsigned __user *)regs->si,
207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 182 0);
208 /* Load per CPU data from RDTSCP */ 183 break;
209 native_read_tscp(&p);
210 } else {
211 /* Load per CPU data from GDT */
212 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
213 } 184 }
214 if (tcache) { 185
215 tcache->blob[0] = j; 186 if (ret == -EFAULT) {
216 tcache->blob[1] = p; 187 /*
188 * Bad news -- userspace fed a bad pointer to a vsyscall.
189 *
190 * With a real vsyscall, that would have caused SIGSEGV.
191 * To make writing reliable exploits using the emulated
192 * vsyscalls harder, generate SIGSEGV here as well.
193 */
194 warn_bad_vsyscall(KERN_INFO, regs,
195 "vsyscall fault (exploit attempt?)");
196 goto sigsegv;
217 } 197 }
218 if (cpu)
219 *cpu = p & 0xfff;
220 if (node)
221 *node = p >> 12;
222 return 0;
223}
224 198
225static long __vsyscall(3) venosys_1(void) 199 regs->ax = ret;
226{
227 return -ENOSYS;
228}
229 200
230#ifdef CONFIG_SYSCTL 201 /* Emulate a ret instruction. */
231static ctl_table kernel_table2[] = { 202 regs->ip = caller;
232 { .procname = "vsyscall64", 203 regs->sp += 8;
233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
234 .mode = 0644,
235 .proc_handler = proc_dointvec },
236 {}
237};
238 204
239static ctl_table kernel_root_table2[] = { 205 local_irq_disable();
240 { .procname = "kernel", .mode = 0555, 206 return;
241 .child = kernel_table2 }, 207
242 {} 208sigsegv:
243}; 209 regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
244#endif 210 force_sig(SIGSEGV, current);
211 local_irq_disable();
212}
245 213
246/* Assume __initcall executes before all user space. Hopefully kmod 214/*
247 doesn't violate that. We'll find out if it does. */ 215 * Assume __initcall executes before all user space. Hopefully kmod
216 * doesn't violate that. We'll find out if it does.
217 */
248static void __cpuinit vsyscall_set_cpu(int cpu) 218static void __cpuinit vsyscall_set_cpu(int cpu)
249{ 219{
250 unsigned long d; 220 unsigned long d;
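update_vsyscall() above remains the only writer of vsyscall_gtod_data and still publishes it under the seqlock; the do_vgettimeofday() code removed above shows the matching reader pattern, which any consumer of the data is assumed to keep (sketch restricted to fields visible in this diff):

static void read_walltime_sketch(unsigned long *secs, unsigned long *nsec)
{
	unsigned seq;

	do {
		seq   = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
		*secs = VVAR(vsyscall_gtod_data).wall_time_sec;
		*nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
	} while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
}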
@@ -255,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
255 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 225 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
256 write_rdtscp_aux((node << 12) | cpu); 226 write_rdtscp_aux((node << 12) | cpu);
257 227
258 /* Store cpu number in limit so that it can be loaded quickly 228 /*
259 in user space in vgetcpu. 229 * Store cpu number in limit so that it can be loaded quickly
260 12 bits for the CPU and 8 bits for the node. */ 230 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
231 */
261 d = 0x0f40000000000ULL; 232 d = 0x0f40000000000ULL;
262 d |= cpu; 233 d |= cpu;
263 d |= (node & 0xf) << 12; 234 d |= (node & 0xf) << 12;
264 d |= (node >> 4) << 48; 235 d |= (node >> 4) << 48;
236
265 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); 237 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
266} 238}
267 239
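The GDT limit written above packs the CPU number into the low 12 bits and the node number above them; the decode side, taken from the vgetcpu() code removed earlier and assumed to live in user space from now on, reads it back like this:

static void user_getcpu_sketch(unsigned *cpu, unsigned *node)
{
	unsigned int p;

	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP)
		native_read_tscp(&p);	/* RDTSCP returns the same encoding */
	else
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));

	*cpu  = p & 0xfff;	/* low 12 bits: CPU */
	*node = p >> 12;	/* bits above: node */
}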
@@ -275,8 +247,10 @@ static int __cpuinit
275cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) 247cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
276{ 248{
277 long cpu = (long)arg; 249 long cpu = (long)arg;
250
278 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 251 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
279 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); 252 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
253
280 return NOTIFY_DONE; 254 return NOTIFY_DONE;
281} 255}
282 256
@@ -284,25 +258,23 @@ void __init map_vsyscall(void)
284{ 258{
285 extern char __vsyscall_0; 259 extern char __vsyscall_0;
286 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 260 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
261 extern char __vvar_page;
262 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
287 263
288 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ 264 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
289 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 265 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
266 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
267 BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
290} 268}
291 269
292static int __init vsyscall_init(void) 270static int __init vsyscall_init(void)
293{ 271{
294 BUG_ON(((unsigned long) &vgettimeofday != 272 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
295 VSYSCALL_ADDR(__NR_vgettimeofday))); 273
296 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
297 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
298 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
299#ifdef CONFIG_SYSCTL
300 register_sysctl_table(kernel_root_table2);
301#endif
302 on_each_cpu(cpu_vsyscall_init, NULL, 1); 274 on_each_cpu(cpu_vsyscall_init, NULL, 1);
303 /* notifier priority > KVM */ 275 /* notifier priority > KVM */
304 hotcpu_notifier(cpu_vsyscall_notifier, 30); 276 hotcpu_notifier(cpu_vsyscall_notifier, 30);
277
305 return 0; 278 return 0;
306} 279}
307
308__initcall(vsyscall_init); 280__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 000000000000..ffa845eae5ca
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
1/*
2 * vsyscall_emu_64.S: Vsyscall emulation page
3 *
4 * Copyright (c) 2011 Andy Lutomirski
5 *
6 * Subject to the GNU General Public License, version 2
7 */
8
9#include <linux/linkage.h>
10#include <asm/irq_vectors.h>
11
12/* The unused parts of the page are filled with 0xcc by the linker script. */
13
14.section .vsyscall_0, "a"
15ENTRY(vsyscall_0)
16 int $VSYSCALL_EMU_VECTOR
17END(vsyscall_0)
18
19.section .vsyscall_1, "a"
20ENTRY(vsyscall_1)
21 int $VSYSCALL_EMU_VECTOR
22END(vsyscall_1)
23
24.section .vsyscall_2, "a"
25ENTRY(vsyscall_2)
26 int $VSYSCALL_EMU_VECTOR
27END(vsyscall_2)
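Taken together, each 1 KB slot now holds only the int $VSYSCALL_EMU_VECTOR stub above, so a legacy binary calling the fixed vsyscall addresses traps into do_emulate_vsyscall(), which performs the real syscall and emulates the missing ret. A hedged user-space illustration (not part of the patch; the base address is the conventional one assumed in the linker-script note earlier):

#include <stddef.h>
#include <sys/time.h>

int main(void)
{
	struct timeval tv;
	long (*vgtod)(struct timeval *, struct timezone *) =
		(void *)0xffffffffff600000UL;	/* slot 0: gettimeofday */

	vgtod(&tv, NULL);	/* executes int $0xcc; the kernel emulates the ret */
	return 0;
}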