aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig37
-rw-r--r--arch/x86/include/asm/bigsmp/apic.h32
-rw-r--r--arch/x86/include/asm/bigsmp/ipi.h13
-rw-r--r--arch/x86/include/asm/desc.h10
-rw-r--r--arch/x86/include/asm/dma-mapping.h2
-rw-r--r--arch/x86/include/asm/es7000/apic.h82
-rw-r--r--arch/x86/include/asm/es7000/ipi.h12
-rw-r--r--arch/x86/include/asm/genapic_32.h13
-rw-r--r--arch/x86/include/asm/genapic_64.h14
-rw-r--r--arch/x86/include/asm/io_apic.h9
-rw-r--r--arch/x86/include/asm/iommu.h2
-rw-r--r--arch/x86/include/asm/ipi.h23
-rw-r--r--arch/x86/include/asm/irq.h3
-rw-r--r--arch/x86/include/asm/irq_vectors.h11
-rw-r--r--arch/x86/include/asm/kvm_host.h45
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h11
-rw-r--r--arch/x86/include/asm/mach-default/mach_apic.h28
-rw-r--r--arch/x86/include/asm/mach-default/mach_ipi.h18
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apic.h1
-rw-r--r--arch/x86/include/asm/mtrr.h25
-rw-r--r--arch/x86/include/asm/numaq/apic.h12
-rw-r--r--arch/x86/include/asm/numaq/ipi.h13
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/pci_64.h1
-rw-r--r--arch/x86/include/asm/smp.h6
-rw-r--r--arch/x86/include/asm/summit/apic.h55
-rw-r--r--arch/x86/include/asm/summit/ipi.h9
-rw-r--r--arch/x86/include/asm/svm.h (renamed from arch/x86/kvm/svm.h)0
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/include/asm/uaccess_32.h8
-rw-r--r--arch/x86/include/asm/uaccess_64.h6
-rw-r--r--arch/x86/include/asm/virtext.h132
-rw-r--r--arch/x86/include/asm/vmx.h (renamed from arch/x86/kvm/vmx.h)27
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/apic.c34
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c45
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c108
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h18
-rw-r--r--arch/x86/kernel/crash.c18
-rw-r--r--arch/x86/kernel/genapic_flat_64.c107
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c81
-rw-r--r--arch/x86/kernel/genx2apic_phys.c74
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c61
-rw-r--r--arch/x86/kernel/hpet.c15
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/io_apic.c1021
-rw-r--r--arch/x86/kernel/ipi.c28
-rw-r--r--arch/x86/kernel/irq.c6
-rw-r--r--arch/x86/kernel/irq_32.c15
-rw-r--r--arch/x86/kernel/irq_64.c17
-rw-r--r--arch/x86/kernel/irqinit_32.c19
-rw-r--r--arch/x86/kernel/irqinit_64.c16
-rw-r--r--arch/x86/kernel/kvmclock.c10
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/pci-dma.c13
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c29
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c67
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/setup_percpu.c19
-rw-r--r--arch/x86/kernel/smp.c8
-rw-r--r--arch/x86/kernel/smpboot.c33
-rw-r--r--arch/x86/kernel/tlb_32.c2
-rw-r--r--arch/x86/kernel/tlb_64.c2
-rw-r--r--arch/x86/kernel/traps.c12
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kvm/i8254.c19
-rw-r--r--arch/x86/kvm/i8259.c52
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_svm.h2
-rw-r--r--arch/x86/kvm/lapic.c58
-rw-r--r--arch/x86/kvm/mmu.c444
-rw-r--r--arch/x86/kvm/paging_tmpl.h44
-rw-r--r--arch/x86/kvm/svm.c48
-rw-r--r--arch/x86/kvm/vmx.c350
-rw-r--r--arch/x86/kvm/x86.c117
-rw-r--r--arch/x86/kvm/x86_emulate.c297
-rw-r--r--arch/x86/lguest/boot.c2
-rw-r--r--arch/x86/lguest/i386_head.S15
-rw-r--r--arch/x86/lib/usercopy_32.c8
-rw-r--r--arch/x86/lib/usercopy_64.c4
-rw-r--r--arch/x86/mach-generic/bigsmp.c5
-rw-r--r--arch/x86/mach-generic/es7000.c5
-rw-r--r--arch/x86/mach-generic/numaq.c5
-rw-r--r--arch/x86/mach-generic/summit.c5
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c9
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/numa_64.c4
-rw-r--r--arch/x86/mm/srat_64.c2
-rw-r--r--arch/x86/oprofile/op_model_amd.c89
-rw-r--r--arch/x86/xen/mmu.c20
-rw-r--r--arch/x86/xen/smp.c27
-rw-r--r--arch/x86/xen/suspend.c3
-rw-r--r--arch/x86/xen/time.c2
-rw-r--r--arch/x86/xen/xen-ops.h2
99 files changed, 2911 insertions, 1310 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 98a0ed52b5c3..249d1e0824b5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -247,6 +247,28 @@ config X86_HAS_BOOT_CPU_ID
247 def_bool y 247 def_bool y
248 depends on X86_VOYAGER 248 depends on X86_VOYAGER
249 249
250config SPARSE_IRQ
251 bool "Support sparse irq numbering"
252 depends on PCI_MSI || HT_IRQ
253 help
254 This enables support for sparse irqs. This is useful for distro
255 kernels that want to define a high CONFIG_NR_CPUS value but still
256 want to have low kernel memory footprint on smaller machines.
257
258 ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
259 out the irq_desc[] array in a more NUMA-friendly way. )
260
261 If you don't know what to do here, say N.
262
263config NUMA_MIGRATE_IRQ_DESC
264 bool "Move irq desc when changing irq smp_affinity"
265 depends on SPARSE_IRQ && NUMA
266 default n
267 help
268 This enables moving irq_desc to cpu/node that irq will use handled.
269
270 If you don't know what to do here, say N.
271
250config X86_FIND_SMP_CONFIG 272config X86_FIND_SMP_CONFIG
251 def_bool y 273 def_bool y
252 depends on X86_MPPARSE || X86_VOYAGER 274 depends on X86_MPPARSE || X86_VOYAGER
@@ -479,7 +501,7 @@ config HPET_TIMER
479 The HPET provides a stable time base on SMP 501 The HPET provides a stable time base on SMP
480 systems, unlike the TSC, but it is more expensive to access, 502 systems, unlike the TSC, but it is more expensive to access,
481 as it is off-chip. You can find the HPET spec at 503 as it is off-chip. You can find the HPET spec at
482 <http://www.intel.com/hardwaredesign/hpetspec.htm>. 504 <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
483 505
484 You can safely choose Y here. However, HPET will only be 506 You can safely choose Y here. However, HPET will only be
485 activated if the platform and the BIOS support this feature. 507 activated if the platform and the BIOS support this feature.
@@ -579,19 +601,20 @@ config IOMMU_HELPER
579 601
580config MAXSMP 602config MAXSMP
581 bool "Configure Maximum number of SMP Processors and NUMA Nodes" 603 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
582 depends on X86_64 && SMP && BROKEN 604 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
605 select CPUMASK_OFFSTACK
583 default n 606 default n
584 help 607 help
585 Configure maximum number of CPUS and NUMA Nodes for this architecture. 608 Configure maximum number of CPUS and NUMA Nodes for this architecture.
586 If unsure, say N. 609 If unsure, say N.
587 610
588config NR_CPUS 611config NR_CPUS
589 int "Maximum number of CPUs (2-512)" if !MAXSMP 612 int "Maximum number of CPUs" if SMP && !MAXSMP
590 range 2 512 613 range 2 512 if SMP && !MAXSMP
591 depends on SMP 614 default "1" if !SMP
592 default "4096" if MAXSMP 615 default "4096" if MAXSMP
593 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 616 default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
594 default "8" 617 default "8" if SMP
595 help 618 help
596 This allows you to specify the maximum number of CPUs which this 619 This allows you to specify the maximum number of CPUs which this
597 kernel will support. The maximum supported value is 512 and the 620 kernel will support. The maximum supported value is 512 and the
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
index ce547f24a1cd..d8dd9f537911 100644
--- a/arch/x86/include/asm/bigsmp/apic.h
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -9,12 +9,12 @@ static inline int apic_id_registered(void)
9 return (1); 9 return (1);
10} 10}
11 11
12static inline cpumask_t target_cpus(void) 12static inline const cpumask_t *target_cpus(void)
13{ 13{
14#ifdef CONFIG_SMP 14#ifdef CONFIG_SMP
15 return cpu_online_map; 15 return &cpu_online_map;
16#else 16#else
17 return cpumask_of_cpu(0); 17 return &cpumask_of_cpu(0);
18#endif 18#endif
19} 19}
20 20
@@ -79,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid)
79 79
80static inline int cpu_present_to_apicid(int mps_cpu) 80static inline int cpu_present_to_apicid(int mps_cpu)
81{ 81{
82 if (mps_cpu < NR_CPUS) 82 if (mps_cpu < nr_cpu_ids)
83 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); 83 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
84 84
85 return BAD_APICID; 85 return BAD_APICID;
@@ -94,7 +94,7 @@ extern u8 cpu_2_logical_apicid[];
94/* Mapping from cpu number to logical apicid */ 94/* Mapping from cpu number to logical apicid */
95static inline int cpu_to_logical_apicid(int cpu) 95static inline int cpu_to_logical_apicid(int cpu)
96{ 96{
97 if (cpu >= NR_CPUS) 97 if (cpu >= nr_cpu_ids)
98 return BAD_APICID; 98 return BAD_APICID;
99 return cpu_physical_id(cpu); 99 return cpu_physical_id(cpu);
100} 100}
@@ -119,16 +119,34 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
119} 119}
120 120
121/* As we are using single CPU as destination, pick only one CPU here */ 121/* As we are using single CPU as destination, pick only one CPU here */
122static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 122static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
123{ 123{
124 int cpu; 124 int cpu;
125 int apicid; 125 int apicid;
126 126
127 cpu = first_cpu(cpumask); 127 cpu = first_cpu(*cpumask);
128 apicid = cpu_to_logical_apicid(cpu); 128 apicid = cpu_to_logical_apicid(cpu);
129 return apicid; 129 return apicid;
130} 130}
131 131
132static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
133 const struct cpumask *andmask)
134{
135 int cpu;
136
137 /*
138 * We're using fixed IRQ delivery, can only return one phys APIC ID.
139 * May as well be the first.
140 */
141 for_each_cpu_and(cpu, cpumask, andmask)
142 if (cpumask_test_cpu(cpu, cpu_online_mask))
143 break;
144 if (cpu < nr_cpu_ids)
145 return cpu_to_logical_apicid(cpu);
146
147 return BAD_APICID;
148}
149
132static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) 150static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
133{ 151{
134 return cpuid_apic >> index_msb; 152 return cpuid_apic >> index_msb;
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
index 9404c535b7ec..27fcd01b3ae6 100644
--- a/arch/x86/include/asm/bigsmp/ipi.h
+++ b/arch/x86/include/asm/bigsmp/ipi.h
@@ -1,25 +1,22 @@
1#ifndef __ASM_MACH_IPI_H 1#ifndef __ASM_MACH_IPI_H
2#define __ASM_MACH_IPI_H 2#define __ASM_MACH_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t mask, int vector); 4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
10 11
11static inline void send_IPI_allbutself(int vector) 12static inline void send_IPI_allbutself(int vector)
12{ 13{
13 cpumask_t mask = cpu_online_map; 14 send_IPI_mask_allbutself(cpu_online_mask, vector);
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18} 15}
19 16
20static inline void send_IPI_all(int vector) 17static inline void send_IPI_all(int vector)
21{ 18{
22 send_IPI_mask(cpu_online_map, vector); 19 send_IPI_mask(cpu_online_mask, vector);
23} 20}
24 21
25#endif /* __ASM_MACH_IPI_H */ 22#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e6b82b17b072..dc27705f5443 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -320,16 +320,14 @@ static inline void set_intr_gate(unsigned int n, void *addr)
320 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); 320 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
321} 321}
322 322
323#define SYS_VECTOR_FREE 0
324#define SYS_VECTOR_ALLOCED 1
325
326extern int first_system_vector; 323extern int first_system_vector;
327extern char system_vectors[]; 324/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
325extern unsigned long used_vectors[];
328 326
329static inline void alloc_system_vector(int vector) 327static inline void alloc_system_vector(int vector)
330{ 328{
331 if (system_vectors[vector] == SYS_VECTOR_FREE) { 329 if (!test_bit(vector, used_vectors)) {
332 system_vectors[vector] = SYS_VECTOR_ALLOCED; 330 set_bit(vector, used_vectors);
333 if (first_system_vector > vector) 331 if (first_system_vector > vector)
334 first_system_vector = vector; 332 first_system_vector = vector;
335 } else 333 } else
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index dc22c0733282..4035357f5b9d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -65,7 +65,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
65 return dma_ops; 65 return dma_ops;
66 else 66 else
67 return dev->archdata.dma_ops; 67 return dev->archdata.dma_ops;
68#endif /* _ASM_X86_DMA_MAPPING_H */ 68#endif
69} 69}
70 70
71/* Make sure we keep the same behaviour */ 71/* Make sure we keep the same behaviour */
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
index e24ef876915f..51ac1230294e 100644
--- a/arch/x86/include/asm/es7000/apic.h
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -9,14 +9,14 @@ static inline int apic_id_registered(void)
9 return (1); 9 return (1);
10} 10}
11 11
12static inline cpumask_t target_cpus_cluster(void) 12static inline const cpumask_t *target_cpus_cluster(void)
13{ 13{
14 return CPU_MASK_ALL; 14 return &CPU_MASK_ALL;
15} 15}
16 16
17static inline cpumask_t target_cpus(void) 17static inline const cpumask_t *target_cpus(void)
18{ 18{
19 return cpumask_of_cpu(smp_processor_id()); 19 return &cpumask_of_cpu(smp_processor_id());
20} 20}
21 21
22#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) 22#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER)
@@ -80,9 +80,10 @@ extern int apic_version [MAX_APICS];
80static inline void setup_apic_routing(void) 80static inline void setup_apic_routing(void)
81{ 81{
82 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 82 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
83 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", 83 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
84 (apic_version[apic] == 0x14) ? 84 (apic_version[apic] == 0x14) ?
85 "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); 85 "Physical Cluster" : "Logical Cluster",
86 nr_ioapics, cpus_addr(*target_cpus())[0]);
86} 87}
87 88
88static inline int multi_timer_check(int apic, int irq) 89static inline int multi_timer_check(int apic, int irq)
@@ -100,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu)
100{ 101{
101 if (!mps_cpu) 102 if (!mps_cpu)
102 return boot_cpu_physical_apicid; 103 return boot_cpu_physical_apicid;
103 else if (mps_cpu < NR_CPUS) 104 else if (mps_cpu < nr_cpu_ids)
104 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); 105 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
105 else 106 else
106 return BAD_APICID; 107 return BAD_APICID;
@@ -120,9 +121,9 @@ extern u8 cpu_2_logical_apicid[];
120static inline int cpu_to_logical_apicid(int cpu) 121static inline int cpu_to_logical_apicid(int cpu)
121{ 122{
122#ifdef CONFIG_SMP 123#ifdef CONFIG_SMP
123 if (cpu >= NR_CPUS) 124 if (cpu >= nr_cpu_ids)
124 return BAD_APICID; 125 return BAD_APICID;
125 return (int)cpu_2_logical_apicid[cpu]; 126 return (int)cpu_2_logical_apicid[cpu];
126#else 127#else
127 return logical_smp_processor_id(); 128 return logical_smp_processor_id();
128#endif 129#endif
@@ -146,14 +147,15 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid)
146 return (1); 147 return (1);
147} 148}
148 149
149static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask) 150static inline unsigned int
151cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
150{ 152{
151 int num_bits_set; 153 int num_bits_set;
152 int cpus_found = 0; 154 int cpus_found = 0;
153 int cpu; 155 int cpu;
154 int apicid; 156 int apicid;
155 157
156 num_bits_set = cpus_weight(cpumask); 158 num_bits_set = cpumask_weight(cpumask);
157 /* Return id to all */ 159 /* Return id to all */
158 if (num_bits_set == NR_CPUS) 160 if (num_bits_set == NR_CPUS)
159 return 0xFF; 161 return 0xFF;
@@ -161,10 +163,10 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
161 * The cpus in the mask must all be on the apic cluster. If are not 163 * The cpus in the mask must all be on the apic cluster. If are not
162 * on the same apicid cluster return default value of TARGET_CPUS. 164 * on the same apicid cluster return default value of TARGET_CPUS.
163 */ 165 */
164 cpu = first_cpu(cpumask); 166 cpu = cpumask_first(cpumask);
165 apicid = cpu_to_logical_apicid(cpu); 167 apicid = cpu_to_logical_apicid(cpu);
166 while (cpus_found < num_bits_set) { 168 while (cpus_found < num_bits_set) {
167 if (cpu_isset(cpu, cpumask)) { 169 if (cpumask_test_cpu(cpu, cpumask)) {
168 int new_apicid = cpu_to_logical_apicid(cpu); 170 int new_apicid = cpu_to_logical_apicid(cpu);
169 if (apicid_cluster(apicid) != 171 if (apicid_cluster(apicid) !=
170 apicid_cluster(new_apicid)){ 172 apicid_cluster(new_apicid)){
@@ -179,14 +181,14 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
179 return apicid; 181 return apicid;
180} 182}
181 183
182static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 184static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
183{ 185{
184 int num_bits_set; 186 int num_bits_set;
185 int cpus_found = 0; 187 int cpus_found = 0;
186 int cpu; 188 int cpu;
187 int apicid; 189 int apicid;
188 190
189 num_bits_set = cpus_weight(cpumask); 191 num_bits_set = cpus_weight(*cpumask);
190 /* Return id to all */ 192 /* Return id to all */
191 if (num_bits_set == NR_CPUS) 193 if (num_bits_set == NR_CPUS)
192 return cpu_to_logical_apicid(0); 194 return cpu_to_logical_apicid(0);
@@ -194,10 +196,52 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
194 * The cpus in the mask must all be on the apic cluster. If are not 196 * The cpus in the mask must all be on the apic cluster. If are not
195 * on the same apicid cluster return default value of TARGET_CPUS. 197 * on the same apicid cluster return default value of TARGET_CPUS.
196 */ 198 */
197 cpu = first_cpu(cpumask); 199 cpu = first_cpu(*cpumask);
200 apicid = cpu_to_logical_apicid(cpu);
201 while (cpus_found < num_bits_set) {
202 if (cpu_isset(cpu, *cpumask)) {
203 int new_apicid = cpu_to_logical_apicid(cpu);
204 if (apicid_cluster(apicid) !=
205 apicid_cluster(new_apicid)){
206 printk ("%s: Not a valid mask!\n", __func__);
207 return cpu_to_logical_apicid(0);
208 }
209 apicid = new_apicid;
210 cpus_found++;
211 }
212 cpu++;
213 }
214 return apicid;
215}
216
217
218static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
219 const struct cpumask *andmask)
220{
221 int num_bits_set;
222 int cpus_found = 0;
223 int cpu;
224 int apicid = cpu_to_logical_apicid(0);
225 cpumask_var_t cpumask;
226
227 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
228 return apicid;
229
230 cpumask_and(cpumask, inmask, andmask);
231 cpumask_and(cpumask, cpumask, cpu_online_mask);
232
233 num_bits_set = cpumask_weight(cpumask);
234 /* Return id to all */
235 if (num_bits_set == NR_CPUS)
236 goto exit;
237 /*
238 * The cpus in the mask must all be on the apic cluster. If are not
239 * on the same apicid cluster return default value of TARGET_CPUS.
240 */
241 cpu = cpumask_first(cpumask);
198 apicid = cpu_to_logical_apicid(cpu); 242 apicid = cpu_to_logical_apicid(cpu);
199 while (cpus_found < num_bits_set) { 243 while (cpus_found < num_bits_set) {
200 if (cpu_isset(cpu, cpumask)) { 244 if (cpumask_test_cpu(cpu, cpumask)) {
201 int new_apicid = cpu_to_logical_apicid(cpu); 245 int new_apicid = cpu_to_logical_apicid(cpu);
202 if (apicid_cluster(apicid) != 246 if (apicid_cluster(apicid) !=
203 apicid_cluster(new_apicid)){ 247 apicid_cluster(new_apicid)){
@@ -209,6 +253,8 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
209 } 253 }
210 cpu++; 254 cpu++;
211 } 255 }
256exit:
257 free_cpumask_var(cpumask);
212 return apicid; 258 return apicid;
213} 259}
214 260
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
index 632a955fcc0a..7e8ed24d4b8a 100644
--- a/arch/x86/include/asm/es7000/ipi.h
+++ b/arch/x86/include/asm/es7000/ipi.h
@@ -1,24 +1,22 @@
1#ifndef __ASM_ES7000_IPI_H 1#ifndef __ASM_ES7000_IPI_H
2#define __ASM_ES7000_IPI_H 2#define __ASM_ES7000_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t mask, int vector); 4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
10 11
11static inline void send_IPI_allbutself(int vector) 12static inline void send_IPI_allbutself(int vector)
12{ 13{
13 cpumask_t mask = cpu_online_map; 14 send_IPI_mask_allbutself(cpu_online_mask, vector);
14 cpu_clear(smp_processor_id(), mask);
15 if (!cpus_empty(mask))
16 send_IPI_mask(mask, vector);
17} 15}
18 16
19static inline void send_IPI_all(int vector) 17static inline void send_IPI_all(int vector)
20{ 18{
21 send_IPI_mask(cpu_online_map, vector); 19 send_IPI_mask(cpu_online_mask, vector);
22} 20}
23 21
24#endif /* __ASM_ES7000_IPI_H */ 22#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
index 0ac17d33a8c7..746f37a7963a 100644
--- a/arch/x86/include/asm/genapic_32.h
+++ b/arch/x86/include/asm/genapic_32.h
@@ -24,7 +24,7 @@ struct genapic {
24 int (*probe)(void); 24 int (*probe)(void);
25 25
26 int (*apic_id_registered)(void); 26 int (*apic_id_registered)(void);
27 cpumask_t (*target_cpus)(void); 27 const struct cpumask *(*target_cpus)(void);
28 int int_delivery_mode; 28 int int_delivery_mode;
29 int int_dest_mode; 29 int int_dest_mode;
30 int ESR_DISABLE; 30 int ESR_DISABLE;
@@ -57,12 +57,16 @@ struct genapic {
57 57
58 unsigned (*get_apic_id)(unsigned long x); 58 unsigned (*get_apic_id)(unsigned long x);
59 unsigned long apic_id_mask; 59 unsigned long apic_id_mask;
60 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); 60 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
61 cpumask_t (*vector_allocation_domain)(int cpu); 61 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
62 const struct cpumask *andmask);
63 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
62 64
63#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
64 /* ipi */ 66 /* ipi */
65 void (*send_IPI_mask)(cpumask_t mask, int vector); 67 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
68 void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
69 int vector);
66 void (*send_IPI_allbutself)(int vector); 70 void (*send_IPI_allbutself)(int vector);
67 void (*send_IPI_all)(int vector); 71 void (*send_IPI_all)(int vector);
68#endif 72#endif
@@ -114,6 +118,7 @@ struct genapic {
114 APICFUNC(get_apic_id) \ 118 APICFUNC(get_apic_id) \
115 .apic_id_mask = APIC_ID_MASK, \ 119 .apic_id_mask = APIC_ID_MASK, \
116 APICFUNC(cpu_mask_to_apicid) \ 120 APICFUNC(cpu_mask_to_apicid) \
121 APICFUNC(cpu_mask_to_apicid_and) \
117 APICFUNC(vector_allocation_domain) \ 122 APICFUNC(vector_allocation_domain) \
118 APICFUNC(acpi_madt_oem_check) \ 123 APICFUNC(acpi_madt_oem_check) \
119 IPIFUNC(send_IPI_mask) \ 124 IPIFUNC(send_IPI_mask) \
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
index 2cae011668b7..adf32fb56aa6 100644
--- a/arch/x86/include/asm/genapic_64.h
+++ b/arch/x86/include/asm/genapic_64.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_GENAPIC_64_H 1#ifndef _ASM_X86_GENAPIC_64_H
2#define _ASM_X86_GENAPIC_64_H 2#define _ASM_X86_GENAPIC_64_H
3 3
4#include <linux/cpumask.h>
5
4/* 6/*
5 * Copyright 2004 James Cleverdon, IBM. 7 * Copyright 2004 James Cleverdon, IBM.
6 * Subject to the GNU Public License, v.2 8 * Subject to the GNU Public License, v.2
@@ -18,16 +20,20 @@ struct genapic {
18 u32 int_delivery_mode; 20 u32 int_delivery_mode;
19 u32 int_dest_mode; 21 u32 int_dest_mode;
20 int (*apic_id_registered)(void); 22 int (*apic_id_registered)(void);
21 cpumask_t (*target_cpus)(void); 23 const struct cpumask *(*target_cpus)(void);
22 cpumask_t (*vector_allocation_domain)(int cpu); 24 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
23 void (*init_apic_ldr)(void); 25 void (*init_apic_ldr)(void);
24 /* ipi */ 26 /* ipi */
25 void (*send_IPI_mask)(cpumask_t mask, int vector); 27 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
28 void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
29 int vector);
26 void (*send_IPI_allbutself)(int vector); 30 void (*send_IPI_allbutself)(int vector);
27 void (*send_IPI_all)(int vector); 31 void (*send_IPI_all)(int vector);
28 void (*send_IPI_self)(int vector); 32 void (*send_IPI_self)(int vector);
29 /* */ 33 /* */
30 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); 34 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
35 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
36 const struct cpumask *andmask);
31 unsigned int (*phys_pkg_id)(int index_msb); 37 unsigned int (*phys_pkg_id)(int index_msb);
32 unsigned int (*get_apic_id)(unsigned long x); 38 unsigned int (*get_apic_id)(unsigned long x);
33 unsigned long (*set_apic_id)(unsigned int id); 39 unsigned long (*set_apic_id)(unsigned int id);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index e475e009ae5d..7a1f44ac1f17 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -198,17 +198,14 @@ extern void restore_IO_APIC_setup(void);
198extern void reinit_intr_remapped_IO_APIC(int); 198extern void reinit_intr_remapped_IO_APIC(int);
199#endif 199#endif
200 200
201extern int probe_nr_irqs(void); 201extern void probe_nr_irqs_gsi(void);
202 202
203#else /* !CONFIG_X86_IO_APIC */ 203#else /* !CONFIG_X86_IO_APIC */
204#define io_apic_assign_pci_irqs 0 204#define io_apic_assign_pci_irqs 0
205static const int timer_through_8259 = 0; 205static const int timer_through_8259 = 0;
206static inline void ioapic_init_mappings(void) { } 206static inline void ioapic_init_mappings(void) { }
207 207
208static inline int probe_nr_irqs(void) 208static inline void probe_nr_irqs_gsi(void) { }
209{
210 return NR_IRQS;
211}
212#endif 209#endif
213 210
214#endif /* _ASM_X86_IO_APIC_H */ 211#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 295b13193f4d..a6ee9e6f530f 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -7,8 +7,6 @@ extern struct dma_mapping_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 7extern int force_iommu, no_iommu;
8extern int iommu_detected; 8extern int iommu_detected;
9 9
10extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
11
12/* 10 seconds */ 10/* 10 seconds */
13#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
14 12
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index f89dffb28aa9..c745a306f7d3 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector,
117 native_apic_mem_write(APIC_ICR, cfg); 117 native_apic_mem_write(APIC_ICR, cfg);
118} 118}
119 119
120static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) 120static inline void send_IPI_mask_sequence(const struct cpumask *mask,
121 int vector)
121{ 122{
122 unsigned long flags; 123 unsigned long flags;
123 unsigned long query_cpu; 124 unsigned long query_cpu;
@@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
128 * - mbligh 129 * - mbligh
129 */ 130 */
130 local_irq_save(flags); 131 local_irq_save(flags);
131 for_each_cpu_mask_nr(query_cpu, mask) { 132 for_each_cpu(query_cpu, mask) {
132 __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), 133 __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
133 vector, APIC_DEST_PHYSICAL); 134 vector, APIC_DEST_PHYSICAL);
134 } 135 }
135 local_irq_restore(flags); 136 local_irq_restore(flags);
136} 137}
137 138
139static inline void send_IPI_mask_allbutself(const struct cpumask *mask,
140 int vector)
141{
142 unsigned long flags;
143 unsigned int query_cpu;
144 unsigned int this_cpu = smp_processor_id();
145
146 /* See Hack comment above */
147
148 local_irq_save(flags);
149 for_each_cpu(query_cpu, mask)
150 if (query_cpu != this_cpu)
151 __send_IPI_dest_field(
152 per_cpu(x86_cpu_to_apicid, query_cpu),
153 vector, APIC_DEST_PHYSICAL);
154 local_irq_restore(flags);
155}
156
138#endif /* _ASM_X86_IPI_H */ 157#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 28e409fc73f3..592688ed04d3 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -33,7 +33,7 @@ static inline int irq_canonicalize(int irq)
33 33
34#ifdef CONFIG_HOTPLUG_CPU 34#ifdef CONFIG_HOTPLUG_CPU
35#include <linux/cpumask.h> 35#include <linux/cpumask.h>
36extern void fixup_irqs(cpumask_t map); 36extern void fixup_irqs(void);
37#endif 37#endif
38 38
39extern unsigned int do_IRQ(struct pt_regs *regs); 39extern unsigned int do_IRQ(struct pt_regs *regs);
@@ -42,5 +42,6 @@ extern void native_init_IRQ(void);
42 42
43/* Interrupt vector management */ 43/* Interrupt vector management */
44extern DECLARE_BITMAP(used_vectors, NR_VECTORS); 44extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
45extern int vector_used_by_percpu_irq(unsigned int vector);
45 46
46#endif /* _ASM_X86_IRQ_H */ 47#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..f7ff65032b9d 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -101,12 +101,23 @@
101#define LAST_VM86_IRQ 15 101#define LAST_VM86_IRQ 15
102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) 102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
103 103
104#define NR_IRQS_LEGACY 16
105
104#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) 106#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
107
108#ifndef CONFIG_SPARSE_IRQ
105# if NR_CPUS < MAX_IO_APICS 109# if NR_CPUS < MAX_IO_APICS
106# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) 110# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
107# else 111# else
108# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) 112# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
109# endif 113# endif
114#else
115# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
116# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
117# else
118# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
119# endif
120#endif
110 121
111#elif defined(CONFIG_X86_VOYAGER) 122#elif defined(CONFIG_X86_VOYAGER)
112 123
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8346be87cfa1..97215a458e5f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -21,6 +21,7 @@
21 21
22#include <asm/pvclock-abi.h> 22#include <asm/pvclock-abi.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/mtrr.h>
24 25
25#define KVM_MAX_VCPUS 16 26#define KVM_MAX_VCPUS 16
26#define KVM_MEMORY_SLOTS 32 27#define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
86#define KVM_MIN_FREE_MMU_PAGES 5 87#define KVM_MIN_FREE_MMU_PAGES 5
87#define KVM_REFILL_PAGES 25 88#define KVM_REFILL_PAGES 25
88#define KVM_MAX_CPUID_ENTRIES 40 89#define KVM_MAX_CPUID_ENTRIES 40
90#define KVM_NR_FIXED_MTRR_REGION 88
89#define KVM_NR_VAR_MTRR 8 91#define KVM_NR_VAR_MTRR 8
90 92
91extern spinlock_t kvm_lock; 93extern spinlock_t kvm_lock;
@@ -180,6 +182,8 @@ struct kvm_mmu_page {
180 struct list_head link; 182 struct list_head link;
181 struct hlist_node hash_link; 183 struct hlist_node hash_link;
182 184
185 struct list_head oos_link;
186
183 /* 187 /*
184 * The following two entries are used to key the shadow page in the 188 * The following two entries are used to key the shadow page in the
185 * hash table. 189 * hash table.
@@ -190,13 +194,16 @@ struct kvm_mmu_page {
190 u64 *spt; 194 u64 *spt;
191 /* hold the gfn of each spte inside spt */ 195 /* hold the gfn of each spte inside spt */
192 gfn_t *gfns; 196 gfn_t *gfns;
193 unsigned long slot_bitmap; /* One bit set per slot which has memory 197 /*
194 * in this shadow page. 198 * One bit set per slot which has memory
195 */ 199 * in this shadow page.
200 */
201 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
196 int multimapped; /* More than one parent_pte? */ 202 int multimapped; /* More than one parent_pte? */
197 int root_count; /* Currently serving as active root */ 203 int root_count; /* Currently serving as active root */
198 bool unsync; 204 bool unsync;
199 bool unsync_children; 205 bool global;
206 unsigned int unsync_children;
200 union { 207 union {
201 u64 *parent_pte; /* !multimapped */ 208 u64 *parent_pte; /* !multimapped */
202 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ 209 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch {
327 334
328 bool nmi_pending; 335 bool nmi_pending;
329 bool nmi_injected; 336 bool nmi_injected;
337 bool nmi_window_open;
330 338
331 u64 mtrr[0x100]; 339 struct mtrr_state_type mtrr_state;
340 u32 pat;
332}; 341};
333 342
334struct kvm_mem_alias { 343struct kvm_mem_alias {
@@ -350,11 +359,13 @@ struct kvm_arch{
350 */ 359 */
351 struct list_head active_mmu_pages; 360 struct list_head active_mmu_pages;
352 struct list_head assigned_dev_head; 361 struct list_head assigned_dev_head;
362 struct list_head oos_global_pages;
353 struct dmar_domain *intel_iommu_domain; 363 struct dmar_domain *intel_iommu_domain;
354 struct kvm_pic *vpic; 364 struct kvm_pic *vpic;
355 struct kvm_ioapic *vioapic; 365 struct kvm_ioapic *vioapic;
356 struct kvm_pit *vpit; 366 struct kvm_pit *vpit;
357 struct hlist_head irq_ack_notifier_list; 367 struct hlist_head irq_ack_notifier_list;
368 int vapics_in_nmi_mode;
358 369
359 int round_robin_prev_vcpu; 370 int round_robin_prev_vcpu;
360 unsigned int tss_addr; 371 unsigned int tss_addr;
@@ -378,6 +389,7 @@ struct kvm_vm_stat {
378 u32 mmu_recycled; 389 u32 mmu_recycled;
379 u32 mmu_cache_miss; 390 u32 mmu_cache_miss;
380 u32 mmu_unsync; 391 u32 mmu_unsync;
392 u32 mmu_unsync_global;
381 u32 remote_tlb_flush; 393 u32 remote_tlb_flush;
382 u32 lpages; 394 u32 lpages;
383}; 395};
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat {
397 u32 halt_exits; 409 u32 halt_exits;
398 u32 halt_wakeup; 410 u32 halt_wakeup;
399 u32 request_irq_exits; 411 u32 request_irq_exits;
412 u32 request_nmi_exits;
400 u32 irq_exits; 413 u32 irq_exits;
401 u32 host_state_reload; 414 u32 host_state_reload;
402 u32 efer_reload; 415 u32 efer_reload;
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat {
405 u32 insn_emulation_fail; 418 u32 insn_emulation_fail;
406 u32 hypercalls; 419 u32 hypercalls;
407 u32 irq_injections; 420 u32 irq_injections;
421 u32 nmi_injections;
408}; 422};
409 423
410struct descriptor_table { 424struct descriptor_table {
@@ -477,6 +491,7 @@ struct kvm_x86_ops {
477 491
478 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 492 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
479 int (*get_tdp_level)(void); 493 int (*get_tdp_level)(void);
494 int (*get_mt_mask_shift)(void);
480}; 495};
481 496
482extern struct kvm_x86_ops *kvm_x86_ops; 497extern struct kvm_x86_ops *kvm_x86_ops;
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
490void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 505void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
491void kvm_mmu_set_base_ptes(u64 base_pte); 506void kvm_mmu_set_base_ptes(u64 base_pte);
492void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 507void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
493 u64 dirty_mask, u64 nx_mask, u64 x_mask); 508 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
494 509
495int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 510int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
496void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 511void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector);
587 602
588void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 603void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
589void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 604void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
590 const u8 *new, int bytes); 605 const u8 *new, int bytes,
606 bool guest_initiated);
591int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 607int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
592void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 608void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
593int kvm_mmu_load(struct kvm_vcpu *vcpu); 609int kvm_mmu_load(struct kvm_vcpu *vcpu);
594void kvm_mmu_unload(struct kvm_vcpu *vcpu); 610void kvm_mmu_unload(struct kvm_vcpu *vcpu);
595void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 611void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
612void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
596 613
597int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 614int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
598 615
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void);
607int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 624int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
608int complete_pio(struct kvm_vcpu *vcpu); 625int complete_pio(struct kvm_vcpu *vcpu);
609 626
627struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
628
610static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 629static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
611{ 630{
612 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 631 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
702 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 721 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
703} 722}
704 723
705#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
706#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
707#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
708#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
709#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
710#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
711#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
712#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
713#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
714#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
715#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
716
717#define MSR_IA32_TIME_STAMP_COUNTER 0x010 724#define MSR_IA32_TIME_STAMP_COUNTER 0x010
718 725
719#define TSS_IOPB_BASE_OFFSET 0x66 726#define TSS_IOPB_BASE_OFFSET 0x66
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 25179a29f208..6a159732881a 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -123,6 +123,7 @@ struct decode_cache {
123 u8 ad_bytes; 123 u8 ad_bytes;
124 u8 rex_prefix; 124 u8 rex_prefix;
125 struct operand src; 125 struct operand src;
126 struct operand src2;
126 struct operand dst; 127 struct operand dst;
127 bool has_seg_override; 128 bool has_seg_override;
128 u8 seg_override; 129 u8 seg_override;
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt {
146 /* Register state before/after emulation. */ 147 /* Register state before/after emulation. */
147 struct kvm_vcpu *vcpu; 148 struct kvm_vcpu *vcpu;
148 149
149 /* Linear faulting address (if emulating a page-faulting instruction) */
150 unsigned long eflags; 150 unsigned long eflags;
151
152 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 151 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
153 int mode; 152 int mode;
154
155 u32 cs_base; 153 u32 cs_base;
156 154
157 /* decode cache */ 155 /* decode cache */
158
159 struct decode_cache decode; 156 struct decode_cache decode;
160}; 157};
161 158
162/* Repeat String Operation Prefix */ 159/* Repeat String Operation Prefix */
163#define REPE_PREFIX 1 160#define REPE_PREFIX 1
164#define REPNE_PREFIX 2 161#define REPNE_PREFIX 2
165 162
166/* Execution mode, passed to the emulator. */ 163/* Execution mode, passed to the emulator. */
167#define X86EMUL_MODE_REAL 0 /* Real mode. */ 164#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt {
170#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 167#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
171 168
172/* Host execution mode. */ 169/* Host execution mode. */
173#if defined(__i386__) 170#if defined(CONFIG_X86_32)
174#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 171#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
175#elif defined(CONFIG_X86_64) 172#elif defined(CONFIG_X86_64)
176#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 173#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
index 6cb3a467e067..cc09cbbee27e 100644
--- a/arch/x86/include/asm/mach-default/mach_apic.h
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -8,12 +8,12 @@
8 8
9#define APIC_DFR_VALUE (APIC_DFR_FLAT) 9#define APIC_DFR_VALUE (APIC_DFR_FLAT)
10 10
11static inline cpumask_t target_cpus(void) 11static inline const struct cpumask *target_cpus(void)
12{ 12{
13#ifdef CONFIG_SMP 13#ifdef CONFIG_SMP
14 return cpu_online_map; 14 return cpu_online_mask;
15#else 15#else
16 return cpumask_of_cpu(0); 16 return cpumask_of(0);
17#endif 17#endif
18} 18}
19 19
@@ -28,6 +28,7 @@ static inline cpumask_t target_cpus(void)
28#define apic_id_registered (genapic->apic_id_registered) 28#define apic_id_registered (genapic->apic_id_registered)
29#define init_apic_ldr (genapic->init_apic_ldr) 29#define init_apic_ldr (genapic->init_apic_ldr)
30#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) 30#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
31#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
31#define phys_pkg_id (genapic->phys_pkg_id) 32#define phys_pkg_id (genapic->phys_pkg_id)
32#define vector_allocation_domain (genapic->vector_allocation_domain) 33#define vector_allocation_domain (genapic->vector_allocation_domain)
33#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) 34#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
@@ -61,9 +62,19 @@ static inline int apic_id_registered(void)
61 return physid_isset(read_apic_id(), phys_cpu_present_map); 62 return physid_isset(read_apic_id(), phys_cpu_present_map);
62} 63}
63 64
64static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 65static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask)
65{ 66{
66 return cpus_addr(cpumask)[0]; 67 return cpumask_bits(cpumask)[0];
68}
69
70static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
71 const struct cpumask *andmask)
72{
73 unsigned long mask1 = cpumask_bits(cpumask)[0];
74 unsigned long mask2 = cpumask_bits(andmask)[0];
75 unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
76
77 return (unsigned int)(mask1 & mask2 & mask3);
67} 78}
68 79
69static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) 80static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
@@ -88,7 +99,7 @@ static inline int apicid_to_node(int logical_apicid)
88#endif 99#endif
89} 100}
90 101
91static inline cpumask_t vector_allocation_domain(int cpu) 102static inline void vector_allocation_domain(int cpu, struct cpumask *retmask)
92{ 103{
93 /* Careful. Some cpus do not strictly honor the set of cpus 104 /* Careful. Some cpus do not strictly honor the set of cpus
94 * specified in the interrupt destination when using lowest 105 * specified in the interrupt destination when using lowest
@@ -98,8 +109,7 @@ static inline cpumask_t vector_allocation_domain(int cpu)
98 * deliver interrupts to the wrong hyperthread when only one 109 * deliver interrupts to the wrong hyperthread when only one
99 * hyperthread was specified in the interrupt desitination. 110 * hyperthread was specified in the interrupt desitination.
100 */ 111 */
101 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 112 *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
102 return domain;
103} 113}
104#endif 114#endif
105 115
@@ -131,7 +141,7 @@ static inline int cpu_to_logical_apicid(int cpu)
131 141
132static inline int cpu_present_to_apicid(int mps_cpu) 142static inline int cpu_present_to_apicid(int mps_cpu)
133{ 143{
134 if (mps_cpu < NR_CPUS && cpu_present(mps_cpu)) 144 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
135 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); 145 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
136 else 146 else
137 return BAD_APICID; 147 return BAD_APICID;
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
index fabca01ebacf..191312d155da 100644
--- a/arch/x86/include/asm/mach-default/mach_ipi.h
+++ b/arch/x86/include/asm/mach-default/mach_ipi.h
@@ -4,7 +4,8 @@
4/* Avoid include hell */ 4/* Avoid include hell */
5#define NMI_VECTOR 0x02 5#define NMI_VECTOR 0x02
6 6
7void send_IPI_mask_bitmask(cpumask_t mask, int vector); 7void send_IPI_mask_bitmask(const struct cpumask *mask, int vector);
8void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
8void __send_IPI_shortcut(unsigned int shortcut, int vector); 9void __send_IPI_shortcut(unsigned int shortcut, int vector);
9 10
10extern int no_broadcast; 11extern int no_broadcast;
@@ -12,28 +13,27 @@ extern int no_broadcast;
12#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
13#include <asm/genapic.h> 14#include <asm/genapic.h>
14#define send_IPI_mask (genapic->send_IPI_mask) 15#define send_IPI_mask (genapic->send_IPI_mask)
16#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself)
15#else 17#else
16static inline void send_IPI_mask(cpumask_t mask, int vector) 18static inline void send_IPI_mask(const struct cpumask *mask, int vector)
17{ 19{
18 send_IPI_mask_bitmask(mask, vector); 20 send_IPI_mask_bitmask(mask, vector);
19} 21}
22void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
20#endif 23#endif
21 24
22static inline void __local_send_IPI_allbutself(int vector) 25static inline void __local_send_IPI_allbutself(int vector)
23{ 26{
24 if (no_broadcast || vector == NMI_VECTOR) { 27 if (no_broadcast || vector == NMI_VECTOR)
25 cpumask_t mask = cpu_online_map; 28 send_IPI_mask_allbutself(cpu_online_mask, vector);
26 29 else
27 cpu_clear(smp_processor_id(), mask);
28 send_IPI_mask(mask, vector);
29 } else
30 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); 30 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
31} 31}
32 32
33static inline void __local_send_IPI_all(int vector) 33static inline void __local_send_IPI_all(int vector)
34{ 34{
35 if (no_broadcast || vector == NMI_VECTOR) 35 if (no_broadcast || vector == NMI_VECTOR)
36 send_IPI_mask(cpu_online_map, vector); 36 send_IPI_mask(cpu_online_mask, vector);
37 else 37 else
38 __send_IPI_shortcut(APIC_DEST_ALLINC, vector); 38 __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
39} 39}
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
index e430f47df667..48553e958ad5 100644
--- a/arch/x86/include/asm/mach-generic/mach_apic.h
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -24,6 +24,7 @@
24#define check_phys_apicid_present (genapic->check_phys_apicid_present) 24#define check_phys_apicid_present (genapic->check_phys_apicid_present)
25#define check_apicid_used (genapic->check_apicid_used) 25#define check_apicid_used (genapic->check_apicid_used)
26#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) 26#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
27#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
27#define vector_allocation_domain (genapic->vector_allocation_domain) 28#define vector_allocation_domain (genapic->vector_allocation_domain)
28#define enable_apic_mode (genapic->enable_apic_mode) 29#define enable_apic_mode (genapic->enable_apic_mode)
29#define phys_pkg_id (genapic->phys_pkg_id) 30#define phys_pkg_id (genapic->phys_pkg_id)
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 7c1e4258b31e..cb988aab716d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -57,6 +57,31 @@ struct mtrr_gentry {
57}; 57};
58#endif /* !__i386__ */ 58#endif /* !__i386__ */
59 59
60struct mtrr_var_range {
61 u32 base_lo;
62 u32 base_hi;
63 u32 mask_lo;
64 u32 mask_hi;
65};
66
67/* In the Intel processor's MTRR interface, the MTRR type is always held in
68 an 8 bit field: */
69typedef u8 mtrr_type;
70
71#define MTRR_NUM_FIXED_RANGES 88
72#define MTRR_MAX_VAR_RANGES 256
73
74struct mtrr_state_type {
75 struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
76 mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
77 unsigned char enabled;
78 unsigned char have_fixed;
79 mtrr_type def_type;
80};
81
82#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
83#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
84
60/* These are the various ioctls */ 85/* These are the various ioctls */
61#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) 86#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
62#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) 87#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
index 0bf2a06b7a4e..c80f00d29965 100644
--- a/arch/x86/include/asm/numaq/apic.h
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -7,9 +7,9 @@
7 7
8#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) 8#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
9 9
10static inline cpumask_t target_cpus(void) 10static inline const cpumask_t *target_cpus(void)
11{ 11{
12 return CPU_MASK_ALL; 12 return &CPU_MASK_ALL;
13} 13}
14 14
15#define NO_BALANCE_IRQ (1) 15#define NO_BALANCE_IRQ (1)
@@ -122,7 +122,13 @@ static inline void enable_apic_mode(void)
122 * We use physical apicids here, not logical, so just return the default 122 * We use physical apicids here, not logical, so just return the default
123 * physical broadcast to stop people from breaking us 123 * physical broadcast to stop people from breaking us
124 */ 124 */
125static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 125static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
126{
127 return (int) 0xF;
128}
129
130static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
131 const struct cpumask *andmask)
126{ 132{
127 return (int) 0xF; 133 return (int) 0xF;
128} 134}
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
index 935588d286cf..a8374c652778 100644
--- a/arch/x86/include/asm/numaq/ipi.h
+++ b/arch/x86/include/asm/numaq/ipi.h
@@ -1,25 +1,22 @@
1#ifndef __ASM_NUMAQ_IPI_H 1#ifndef __ASM_NUMAQ_IPI_H
2#define __ASM_NUMAQ_IPI_H 2#define __ASM_NUMAQ_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t, int vector); 4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
10 11
11static inline void send_IPI_allbutself(int vector) 12static inline void send_IPI_allbutself(int vector)
12{ 13{
13 cpumask_t mask = cpu_online_map; 14 send_IPI_mask_allbutself(cpu_online_mask, vector);
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18} 15}
19 16
20static inline void send_IPI_all(int vector) 17static inline void send_IPI_all(int vector)
21{ 18{
22 send_IPI_mask(cpu_online_map, vector); 19 send_IPI_mask(cpu_online_mask, vector);
23} 20}
24 21
25#endif /* __ASM_NUMAQ_IPI_H */ 22#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 647781298e7e..66834c41c049 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -84,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
84static inline void early_quirks(void) { } 84static inline void early_quirks(void) { }
85#endif 85#endif
86 86
87extern void pci_iommu_alloc(void);
88
87#endif /* __KERNEL__ */ 89#endif /* __KERNEL__ */
88 90
89#ifdef CONFIG_X86_32 91#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index d02d936840a3..4da207982777 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void); 25extern void dma32_reserve_bootmem(void);
26extern void pci_iommu_alloc(void);
27 26
28/* The PCI address space does equal the physical memory 27/* The PCI address space does equal the physical memory
29 * address space. The networking and block device layers use 28 * address space. The networking and block device layers use
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d12811ce51d9..830b9fcb6427 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -60,7 +60,7 @@ struct smp_ops {
60 void (*cpu_die)(unsigned int cpu); 60 void (*cpu_die)(unsigned int cpu);
61 void (*play_dead)(void); 61 void (*play_dead)(void);
62 62
63 void (*send_call_func_ipi)(cpumask_t mask); 63 void (*send_call_func_ipi)(const struct cpumask *mask);
64 void (*send_call_func_single_ipi)(int cpu); 64 void (*send_call_func_single_ipi)(int cpu);
65}; 65};
66 66
@@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu)
125 125
126static inline void arch_send_call_function_ipi(cpumask_t mask) 126static inline void arch_send_call_function_ipi(cpumask_t mask)
127{ 127{
128 smp_ops.send_call_func_ipi(mask); 128 smp_ops.send_call_func_ipi(&mask);
129} 129}
130 130
131void cpu_disable_common(void); 131void cpu_disable_common(void);
@@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu);
138void native_play_dead(void); 138void native_play_dead(void);
139void play_dead_common(void); 139void play_dead_common(void);
140 140
141void native_send_call_func_ipi(cpumask_t mask); 141void native_send_call_func_ipi(const struct cpumask *mask);
142void native_send_call_func_single_ipi(int cpu); 142void native_send_call_func_single_ipi(int cpu);
143 143
144extern void prefill_possible_map(void); 144extern void prefill_possible_map(void);
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
index 9b3070f1c2ac..99327d1be49f 100644
--- a/arch/x86/include/asm/summit/apic.h
+++ b/arch/x86/include/asm/summit/apic.h
@@ -14,13 +14,13 @@
14 14
15#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) 15#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
16 16
17static inline cpumask_t target_cpus(void) 17static inline const cpumask_t *target_cpus(void)
18{ 18{
19 /* CPU_MASK_ALL (0xff) has undefined behaviour with 19 /* CPU_MASK_ALL (0xff) has undefined behaviour with
20 * dest_LowestPrio mode logical clustered apic interrupt routing 20 * dest_LowestPrio mode logical clustered apic interrupt routing
21 * Just start on cpu 0. IRQ balancing will spread load 21 * Just start on cpu 0. IRQ balancing will spread load
22 */ 22 */
23 return cpumask_of_cpu(0); 23 return &cpumask_of_cpu(0);
24} 24}
25 25
26#define INT_DELIVERY_MODE (dest_LowestPrio) 26#define INT_DELIVERY_MODE (dest_LowestPrio)
@@ -137,14 +137,14 @@ static inline void enable_apic_mode(void)
137{ 137{
138} 138}
139 139
140static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 140static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
141{ 141{
142 int num_bits_set; 142 int num_bits_set;
143 int cpus_found = 0; 143 int cpus_found = 0;
144 int cpu; 144 int cpu;
145 int apicid; 145 int apicid;
146 146
147 num_bits_set = cpus_weight(cpumask); 147 num_bits_set = cpus_weight(*cpumask);
148 /* Return id to all */ 148 /* Return id to all */
149 if (num_bits_set == NR_CPUS) 149 if (num_bits_set == NR_CPUS)
150 return (int) 0xFF; 150 return (int) 0xFF;
@@ -152,10 +152,10 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
152 * The cpus in the mask must all be on the apic cluster. If are not 152 * The cpus in the mask must all be on the apic cluster. If are not
153 * on the same apicid cluster return default value of TARGET_CPUS. 153 * on the same apicid cluster return default value of TARGET_CPUS.
154 */ 154 */
155 cpu = first_cpu(cpumask); 155 cpu = first_cpu(*cpumask);
156 apicid = cpu_to_logical_apicid(cpu); 156 apicid = cpu_to_logical_apicid(cpu);
157 while (cpus_found < num_bits_set) { 157 while (cpus_found < num_bits_set) {
158 if (cpu_isset(cpu, cpumask)) { 158 if (cpu_isset(cpu, *cpumask)) {
159 int new_apicid = cpu_to_logical_apicid(cpu); 159 int new_apicid = cpu_to_logical_apicid(cpu);
160 if (apicid_cluster(apicid) != 160 if (apicid_cluster(apicid) !=
161 apicid_cluster(new_apicid)){ 161 apicid_cluster(new_apicid)){
@@ -170,6 +170,49 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
170 return apicid; 170 return apicid;
171} 171}
172 172
173static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
174 const struct cpumask *andmask)
175{
176 int num_bits_set;
177 int cpus_found = 0;
178 int cpu;
179 int apicid = 0xFF;
180 cpumask_var_t cpumask;
181
182 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
183 return (int) 0xFF;
184
185 cpumask_and(cpumask, inmask, andmask);
186 cpumask_and(cpumask, cpumask, cpu_online_mask);
187
188 num_bits_set = cpumask_weight(cpumask);
189 /* Return id to all */
190 if (num_bits_set == nr_cpu_ids)
191 goto exit;
192 /*
193 * The cpus in the mask must all be on the apic cluster. If are not
194 * on the same apicid cluster return default value of TARGET_CPUS.
195 */
196 cpu = cpumask_first(cpumask);
197 apicid = cpu_to_logical_apicid(cpu);
198 while (cpus_found < num_bits_set) {
199 if (cpumask_test_cpu(cpu, cpumask)) {
200 int new_apicid = cpu_to_logical_apicid(cpu);
201 if (apicid_cluster(apicid) !=
202 apicid_cluster(new_apicid)){
203 printk ("%s: Not a valid mask!\n", __func__);
204 return 0xFF;
205 }
206 apicid = apicid | new_apicid;
207 cpus_found++;
208 }
209 cpu++;
210 }
211exit:
212 free_cpumask_var(cpumask);
213 return apicid;
214}
215
173/* cpuid returns the value latched in the HW at reset, not the APIC ID 216/* cpuid returns the value latched in the HW at reset, not the APIC ID
174 * register's value. For any box whose BIOS changes APIC IDs, like 217 * register's value. For any box whose BIOS changes APIC IDs, like
175 * clustered APIC systems, we must use hard_smp_processor_id. 218 * clustered APIC systems, we must use hard_smp_processor_id.
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
index 53bd1e7bd7b4..a8a2c24f50cc 100644
--- a/arch/x86/include/asm/summit/ipi.h
+++ b/arch/x86/include/asm/summit/ipi.h
@@ -1,9 +1,10 @@
1#ifndef __ASM_SUMMIT_IPI_H 1#ifndef __ASM_SUMMIT_IPI_H
2#define __ASM_SUMMIT_IPI_H 2#define __ASM_SUMMIT_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t mask, int vector); 4void send_IPI_mask_sequence(const cpumask_t *mask, int vector);
5void send_IPI_mask_allbutself(const cpumask_t *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const cpumask_t *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
@@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector)
14 cpu_clear(smp_processor_id(), mask); 15 cpu_clear(smp_processor_id(), mask);
15 16
16 if (!cpus_empty(mask)) 17 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector); 18 send_IPI_mask(&mask, vector);
18} 19}
19 20
20static inline void send_IPI_all(int vector) 21static inline void send_IPI_all(int vector)
21{ 22{
22 send_IPI_mask(cpu_online_map, vector); 23 send_IPI_mask(&cpu_online_map, vector);
23} 24}
24 25
25#endif /* __ASM_SUMMIT_IPI_H */ 26#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e869..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/include/asm/svm.h
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index ff386ff50ed7..79e31e9dcdda 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -226,6 +226,8 @@ extern cpumask_t cpu_coregroup_map(int cpu);
226#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) 226#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
227#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) 227#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
228#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) 228#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
229#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu))
230#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
229 231
230/* indicates that pointers to the topology cpumask_t maps are valid */ 232/* indicates that pointers to the topology cpumask_t maps are valid */
231#define arch_provides_topology_pointers yes 233#define arch_provides_topology_pointers yes
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 580c3ee6c58c..4340055b7559 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
157 int __ret_gu; \ 157 int __ret_gu; \
158 unsigned long __val_gu; \ 158 unsigned long __val_gu; \
159 __chk_user_ptr(ptr); \ 159 __chk_user_ptr(ptr); \
160 might_fault(); \
160 switch (sizeof(*(ptr))) { \ 161 switch (sizeof(*(ptr))) { \
161 case 1: \ 162 case 1: \
162 __get_user_x(1, __ret_gu, __val_gu, ptr); \ 163 __get_user_x(1, __ret_gu, __val_gu, ptr); \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
241 int __ret_pu; \ 242 int __ret_pu; \
242 __typeof__(*(ptr)) __pu_val; \ 243 __typeof__(*(ptr)) __pu_val; \
243 __chk_user_ptr(ptr); \ 244 __chk_user_ptr(ptr); \
245 might_fault(); \
244 __pu_val = x; \ 246 __pu_val = x; \
245 switch (sizeof(*(ptr))) { \ 247 switch (sizeof(*(ptr))) { \
246 case 1: \ 248 case 1: \
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index d095a3aeea1b..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
82static __always_inline unsigned long __must_check 82static __always_inline unsigned long __must_check
83__copy_to_user(void __user *to, const void *from, unsigned long n) 83__copy_to_user(void __user *to, const void *from, unsigned long n)
84{ 84{
85 might_sleep(); 85 might_fault();
86 return __copy_to_user_inatomic(to, from, n); 86 return __copy_to_user_inatomic(to, from, n);
87} 87}
88 88
89static __always_inline unsigned long 89static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
137static __always_inline unsigned long 137static __always_inline unsigned long
138__copy_from_user(void *to, const void __user *from, unsigned long n) 138__copy_from_user(void *to, const void __user *from, unsigned long n)
139{ 139{
140 might_sleep(); 140 might_fault();
141 if (__builtin_constant_p(n)) { 141 if (__builtin_constant_p(n)) {
142 unsigned long ret; 142 unsigned long ret;
143 143
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
159static __always_inline unsigned long __copy_from_user_nocache(void *to, 159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n) 160 const void __user *from, unsigned long n)
161{ 161{
162 might_sleep(); 162 might_fault();
163 if (__builtin_constant_p(n)) { 163 if (__builtin_constant_p(n)) {
164 unsigned long ret; 164 unsigned long ret;
165 165
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f8cfd00db450..84210c479fca 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,6 +29,8 @@ static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 29int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 30{
31 int ret = 0; 31 int ret = 0;
32
33 might_fault();
32 if (!__builtin_constant_p(size)) 34 if (!__builtin_constant_p(size))
33 return copy_user_generic(dst, (__force void *)src, size); 35 return copy_user_generic(dst, (__force void *)src, size);
34 switch (size) { 36 switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
71int __copy_to_user(void __user *dst, const void *src, unsigned size) 73int __copy_to_user(void __user *dst, const void *src, unsigned size)
72{ 74{
73 int ret = 0; 75 int ret = 0;
76
77 might_fault();
74 if (!__builtin_constant_p(size)) 78 if (!__builtin_constant_p(size))
75 return copy_user_generic((__force void *)dst, src, size); 79 return copy_user_generic((__force void *)dst, src, size);
76 switch (size) { 80 switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
113int __copy_in_user(void __user *dst, const void __user *src, unsigned size) 117int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
114{ 118{
115 int ret = 0; 119 int ret = 0;
120
121 might_fault();
116 if (!__builtin_constant_p(size)) 122 if (!__builtin_constant_p(size))
117 return copy_user_generic((__force void *)dst, 123 return copy_user_generic((__force void *)dst,
118 (__force void *)src, size); 124 (__force void *)src, size);
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 000000000000..593636275238
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,132 @@
1/* CPU virtualization extensions handling
2 *
3 * This should carry the code for handling CPU virtualization extensions
4 * that needs to live in the kernel core.
5 *
6 * Author: Eduardo Habkost <ehabkost@redhat.com>
7 *
8 * Copyright (C) 2008, Red Hat Inc.
9 *
10 * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 */
15#ifndef _ASM_X86_VIRTEX_H
16#define _ASM_X86_VIRTEX_H
17
18#include <asm/processor.h>
19#include <asm/system.h>
20
21#include <asm/vmx.h>
22#include <asm/svm.h>
23
24/*
25 * VMX functions:
26 */
27
28static inline int cpu_has_vmx(void)
29{
30 unsigned long ecx = cpuid_ecx(1);
31 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
32}
33
34
35/** Disable VMX on the current CPU
36 *
37 * vmxoff causes a undefined-opcode exception if vmxon was not run
38 * on the CPU previously. Only call this function if you know VMX
39 * is enabled.
40 */
41static inline void cpu_vmxoff(void)
42{
43 asm volatile (ASM_VMX_VMXOFF : : : "cc");
44 write_cr4(read_cr4() & ~X86_CR4_VMXE);
45}
46
47static inline int cpu_vmx_enabled(void)
48{
49 return read_cr4() & X86_CR4_VMXE;
50}
51
52/** Disable VMX if it is enabled on the current CPU
53 *
54 * You shouldn't call this if cpu_has_vmx() returns 0.
55 */
56static inline void __cpu_emergency_vmxoff(void)
57{
58 if (cpu_vmx_enabled())
59 cpu_vmxoff();
60}
61
62/** Disable VMX if it is supported and enabled on the current CPU
63 */
64static inline void cpu_emergency_vmxoff(void)
65{
66 if (cpu_has_vmx())
67 __cpu_emergency_vmxoff();
68}
69
70
71
72
73/*
74 * SVM functions:
75 */
76
77/** Check if the CPU has SVM support
78 *
79 * You can use the 'msg' arg to get a message describing the problem,
80 * if the function returns zero. Simply pass NULL if you are not interested
81 * on the messages; gcc should take care of not generating code for
82 * the messages on this case.
83 */
84static inline int cpu_has_svm(const char **msg)
85{
86 uint32_t eax, ebx, ecx, edx;
87
88 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
89 if (msg)
90 *msg = "not amd";
91 return 0;
92 }
93
94 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
95 if (eax < SVM_CPUID_FUNC) {
96 if (msg)
97 *msg = "can't execute cpuid_8000000a";
98 return 0;
99 }
100
101 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
102 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
103 if (msg)
104 *msg = "svm not available";
105 return 0;
106 }
107 return 1;
108}
109
110
111/** Disable SVM on the current CPU
112 *
113 * You should call this only if cpu_has_svm() returned true.
114 */
115static inline void cpu_svm_disable(void)
116{
117 uint64_t efer;
118
119 wrmsrl(MSR_VM_HSAVE_PA, 0);
120 rdmsrl(MSR_EFER, efer);
121 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
122}
123
124/** Makes sure SVM is disabled, if it is supported on the CPU
125 */
126static inline void cpu_emergency_svm_disable(void)
127{
128 if (cpu_has_svm(NULL))
129 cpu_svm_disable();
130}
131
132#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h
index ec5edc339da6..d0238e6151d8 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -63,10 +63,13 @@
63 63
64#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 64#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
65#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 65#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
66#define VM_EXIT_SAVE_IA32_PAT 0x00040000
67#define VM_EXIT_LOAD_IA32_PAT 0x00080000
66 68
67#define VM_ENTRY_IA32E_MODE 0x00000200 69#define VM_ENTRY_IA32E_MODE 0x00000200
68#define VM_ENTRY_SMM 0x00000400 70#define VM_ENTRY_SMM 0x00000400
69#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 71#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
72#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
70 73
71/* VMCS Encodings */ 74/* VMCS Encodings */
72enum vmcs_field { 75enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
112 VMCS_LINK_POINTER_HIGH = 0x00002801, 115 VMCS_LINK_POINTER_HIGH = 0x00002801,
113 GUEST_IA32_DEBUGCTL = 0x00002802, 116 GUEST_IA32_DEBUGCTL = 0x00002802,
114 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, 117 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
118 GUEST_IA32_PAT = 0x00002804,
119 GUEST_IA32_PAT_HIGH = 0x00002805,
115 GUEST_PDPTR0 = 0x0000280a, 120 GUEST_PDPTR0 = 0x0000280a,
116 GUEST_PDPTR0_HIGH = 0x0000280b, 121 GUEST_PDPTR0_HIGH = 0x0000280b,
117 GUEST_PDPTR1 = 0x0000280c, 122 GUEST_PDPTR1 = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
120 GUEST_PDPTR2_HIGH = 0x0000280f, 125 GUEST_PDPTR2_HIGH = 0x0000280f,
121 GUEST_PDPTR3 = 0x00002810, 126 GUEST_PDPTR3 = 0x00002810,
122 GUEST_PDPTR3_HIGH = 0x00002811, 127 GUEST_PDPTR3_HIGH = 0x00002811,
128 HOST_IA32_PAT = 0x00002c00,
129 HOST_IA32_PAT_HIGH = 0x00002c01,
123 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 130 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
124 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 131 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
125 EXCEPTION_BITMAP = 0x00004004, 132 EXCEPTION_BITMAP = 0x00004004,
@@ -331,8 +338,9 @@ enum vmcs_field {
331 338
332#define AR_RESERVD_MASK 0xfffe0f00 339#define AR_RESERVD_MASK 0xfffe0f00
333 340
334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 341#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 342#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
343#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
336 344
337#define VMX_NR_VPIDS (1 << 16) 345#define VMX_NR_VPIDS (1 << 16)
338#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 346#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
@@ -356,4 +364,19 @@ enum vmcs_field {
356 364
357#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 365#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
358 366
367
368#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
369#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
370#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
371#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
372#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
373#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
374#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
375#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
376#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
377#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
378#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
379
380
381
359#endif 382#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88dd768eab6d..d364df03c1d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -109,6 +109,8 @@ obj-$(CONFIG_MICROCODE) += microcode.o
109 109
110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
111 111
112obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
113
112### 114###
113# 64 bit specific files 115# 64 bit specific files
114ifeq ($(CONFIG_X86_64),y) 116ifeq ($(CONFIG_X86_64),y)
@@ -122,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y)
122 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 124 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
123 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 125 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
124 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 126 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
125 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
126 127
127 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 128 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
128endif 129endif
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 66198cbe464d..d652515e2855 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -119,8 +119,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
119 119
120int first_system_vector = 0xfe; 120int first_system_vector = 0xfe;
121 121
122char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
123
124/* 122/*
125 * Debug level, exported for io_apic.c 123 * Debug level, exported for io_apic.c
126 */ 124 */
@@ -142,7 +140,7 @@ static int lapic_next_event(unsigned long delta,
142 struct clock_event_device *evt); 140 struct clock_event_device *evt);
143static void lapic_timer_setup(enum clock_event_mode mode, 141static void lapic_timer_setup(enum clock_event_mode mode,
144 struct clock_event_device *evt); 142 struct clock_event_device *evt);
145static void lapic_timer_broadcast(cpumask_t mask); 143static void lapic_timer_broadcast(const cpumask_t *mask);
146static void apic_pm_activate(void); 144static void apic_pm_activate(void);
147 145
148/* 146/*
@@ -455,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
455/* 453/*
456 * Local APIC timer broadcast function 454 * Local APIC timer broadcast function
457 */ 455 */
458static void lapic_timer_broadcast(cpumask_t mask) 456static void lapic_timer_broadcast(const cpumask_t *mask)
459{ 457{
460#ifdef CONFIG_SMP 458#ifdef CONFIG_SMP
461 send_IPI_mask(mask, LOCAL_TIMER_VECTOR); 459 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
@@ -471,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void)
471 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 469 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
472 470
473 memcpy(levt, &lapic_clockevent, sizeof(*levt)); 471 memcpy(levt, &lapic_clockevent, sizeof(*levt));
474 levt->cpumask = cpumask_of_cpu(smp_processor_id()); 472 levt->cpumask = cpumask_of(smp_processor_id());
475 473
476 clockevents_register_device(levt); 474 clockevents_register_device(levt);
477} 475}
@@ -1807,28 +1805,32 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1807void __cpuinit generic_processor_info(int apicid, int version) 1805void __cpuinit generic_processor_info(int apicid, int version)
1808{ 1806{
1809 int cpu; 1807 int cpu;
1810 cpumask_t tmp_map;
1811 1808
1812 /* 1809 /*
1813 * Validate version 1810 * Validate version
1814 */ 1811 */
1815 if (version == 0x0) { 1812 if (version == 0x0) {
1816 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " 1813 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1817 "fixing up to 0x10. (tell your hw vendor)\n", 1814 "fixing up to 0x10. (tell your hw vendor)\n",
1818 version); 1815 version);
1819 version = 0x10; 1816 version = 0x10;
1820 } 1817 }
1821 apic_version[apicid] = version; 1818 apic_version[apicid] = version;
1822 1819
1823 if (num_processors >= NR_CPUS) { 1820 if (num_processors >= nr_cpu_ids) {
1824 pr_warning("WARNING: NR_CPUS limit of %i reached." 1821 int max = nr_cpu_ids;
1825 " Processor ignored.\n", NR_CPUS); 1822 int thiscpu = max + disabled_cpus;
1823
1824 pr_warning(
1825 "ACPI: NR_CPUS/possible_cpus limit of %i reached."
1826 " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
1827
1828 disabled_cpus++;
1826 return; 1829 return;
1827 } 1830 }
1828 1831
1829 num_processors++; 1832 num_processors++;
1830 cpus_complement(tmp_map, cpu_present_map); 1833 cpu = cpumask_next_zero(-1, cpu_present_mask);
1831 cpu = first_cpu(tmp_map);
1832 1834
1833 physid_set(apicid, phys_cpu_present_map); 1835 physid_set(apicid, phys_cpu_present_map);
1834 if (apicid == boot_cpu_physical_apicid) { 1836 if (apicid == boot_cpu_physical_apicid) {
@@ -1878,8 +1880,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
1878 } 1880 }
1879#endif 1881#endif
1880 1882
1881 cpu_set(cpu, cpu_possible_map); 1883 set_cpu_possible(cpu, true);
1882 cpu_set(cpu, cpu_present_map); 1884 set_cpu_present(cpu, true);
1883} 1885}
1884 1886
1885#ifdef CONFIG_X86_64 1887#ifdef CONFIG_X86_64
@@ -2081,7 +2083,7 @@ __cpuinit int apic_is_clustered_box(void)
2081 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); 2083 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2082 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 2084 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2083 2085
2084 for (i = 0; i < NR_CPUS; i++) { 2086 for (i = 0; i < nr_cpu_ids; i++) {
2085 /* are we being called early in kernel startup? */ 2087 /* are we being called early in kernel startup? */
2086 if (bios_cpu_apicid) { 2088 if (bios_cpu_apicid) {
2087 id = bios_cpu_apicid[i]; 2089 id = bios_cpu_apicid[i];
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 68b5d8681cbb..c6ecda64f5f1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
534 per_cpu(cpuid4_info, cpu) = NULL; 534 per_cpu(cpuid4_info, cpu) = NULL;
535} 535}
536 536
537static int __cpuinit detect_cache_attributes(unsigned int cpu) 537static void get_cpu_leaves(void *_retval)
538{ 538{
539 struct _cpuid4_info *this_leaf; 539 int j, *retval = _retval, cpu = smp_processor_id();
540 unsigned long j;
541 int retval;
542 cpumask_t oldmask;
543
544 if (num_cache_leaves == 0)
545 return -ENOENT;
546
547 per_cpu(cpuid4_info, cpu) = kzalloc(
548 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
549 if (per_cpu(cpuid4_info, cpu) == NULL)
550 return -ENOMEM;
551
552 oldmask = current->cpus_allowed;
553 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
554 if (retval)
555 goto out;
556 540
557 /* Do cpuid and store the results */ 541 /* Do cpuid and store the results */
558 for (j = 0; j < num_cache_leaves; j++) { 542 for (j = 0; j < num_cache_leaves; j++) {
543 struct _cpuid4_info *this_leaf;
559 this_leaf = CPUID4_INFO_IDX(cpu, j); 544 this_leaf = CPUID4_INFO_IDX(cpu, j);
560 retval = cpuid4_cache_lookup(j, this_leaf); 545 *retval = cpuid4_cache_lookup(j, this_leaf);
561 if (unlikely(retval < 0)) { 546 if (unlikely(*retval < 0)) {
562 int i; 547 int i;
563 548
564 for (i = 0; i < j; i++) 549 for (i = 0; i < j; i++)
@@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
567 } 552 }
568 cache_shared_cpu_map_setup(cpu, j); 553 cache_shared_cpu_map_setup(cpu, j);
569 } 554 }
570 set_cpus_allowed_ptr(current, &oldmask); 555}
556
557static int __cpuinit detect_cache_attributes(unsigned int cpu)
558{
559 int retval;
560
561 if (num_cache_leaves == 0)
562 return -ENOENT;
563
564 per_cpu(cpuid4_info, cpu) = kzalloc(
565 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
566 if (per_cpu(cpuid4_info, cpu) == NULL)
567 return -ENOMEM;
571 568
572out: 569 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
573 if (retval) { 570 if (retval) {
574 kfree(per_cpu(cpuid4_info, cpu)); 571 kfree(per_cpu(cpuid4_info, cpu));
575 per_cpu(cpuid4_info, cpu) = NULL; 572 per_cpu(cpuid4_info, cpu) = NULL;
@@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
626 cpumask_t *mask = &this_leaf->shared_cpu_map; 623 cpumask_t *mask = &this_leaf->shared_cpu_map;
627 624
628 n = type? 625 n = type?
629 cpulist_scnprintf(buf, len-2, *mask): 626 cpulist_scnprintf(buf, len-2, mask) :
630 cpumask_scnprintf(buf, len-2, *mask); 627 cpumask_scnprintf(buf, len-2, mask);
631 buf[n++] = '\n'; 628 buf[n++] = '\n';
632 buf[n] = '\0'; 629 buf[n] = '\0';
633 } 630 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 748c8f9e7a05..a5a5e0530370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
83 * CPU Initialization 83 * CPU Initialization
84 */ 84 */
85 85
86struct thresh_restart {
87 struct threshold_block *b;
88 int reset;
89 u16 old_limit;
90};
91
86/* must be called with correct cpu affinity */ 92/* must be called with correct cpu affinity */
87static void threshold_restart_bank(struct threshold_block *b, 93static long threshold_restart_bank(void *_tr)
88 int reset, u16 old_limit)
89{ 94{
95 struct thresh_restart *tr = _tr;
90 u32 mci_misc_hi, mci_misc_lo; 96 u32 mci_misc_hi, mci_misc_lo;
91 97
92 rdmsr(b->address, mci_misc_lo, mci_misc_hi); 98 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
93 99
94 if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 100 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
95 reset = 1; /* limit cannot be lower than err count */ 101 tr->reset = 1; /* limit cannot be lower than err count */
96 102
97 if (reset) { /* reset err count and overflow bit */ 103 if (tr->reset) { /* reset err count and overflow bit */
98 mci_misc_hi = 104 mci_misc_hi =
99 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 105 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
100 (THRESHOLD_MAX - b->threshold_limit); 106 (THRESHOLD_MAX - tr->b->threshold_limit);
101 } else if (old_limit) { /* change limit w/o reset */ 107 } else if (tr->old_limit) { /* change limit w/o reset */
102 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 108 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
103 (old_limit - b->threshold_limit); 109 (tr->old_limit - tr->b->threshold_limit);
104 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 110 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
105 (new_count & THRESHOLD_MAX); 111 (new_count & THRESHOLD_MAX);
106 } 112 }
107 113
108 b->interrupt_enable ? 114 tr->b->interrupt_enable ?
109 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 115 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
110 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 116 (mci_misc_hi &= ~MASK_INT_TYPE_HI);
111 117
112 mci_misc_hi |= MASK_COUNT_EN_HI; 118 mci_misc_hi |= MASK_COUNT_EN_HI;
113 wrmsr(b->address, mci_misc_lo, mci_misc_hi); 119 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
120 return 0;
114} 121}
115 122
116/* cpu init entry point, called from mce.c with preempt off */ 123/* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
120 unsigned int cpu = smp_processor_id(); 127 unsigned int cpu = smp_processor_id();
121 u8 lvt_off; 128 u8 lvt_off;
122 u32 low = 0, high = 0, address = 0; 129 u32 low = 0, high = 0, address = 0;
130 struct thresh_restart tr;
123 131
124 for (bank = 0; bank < NR_BANKS; ++bank) { 132 for (bank = 0; bank < NR_BANKS; ++bank) {
125 for (block = 0; block < NR_BLOCKS; ++block) { 133 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
162 wrmsr(address, low, high); 170 wrmsr(address, low, high);
163 171
164 threshold_defaults.address = address; 172 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0); 173 tr.b = &threshold_defaults;
174 tr.reset = 0;
175 tr.old_limit = 0;
176 threshold_restart_bank(&tr);
166 } 177 }
167 } 178 }
168} 179}
@@ -251,20 +262,6 @@ struct threshold_attr {
251 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 262 ssize_t(*store) (struct threshold_block *, const char *, size_t count);
252}; 263};
253 264
254static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
255 cpumask_t *newmask)
256{
257 *oldmask = current->cpus_allowed;
258 cpus_clear(*newmask);
259 cpu_set(cpu, *newmask);
260 set_cpus_allowed_ptr(current, newmask);
261}
262
263static void affinity_restore(const cpumask_t *oldmask)
264{
265 set_cpus_allowed_ptr(current, oldmask);
266}
267
268#define SHOW_FIELDS(name) \ 265#define SHOW_FIELDS(name) \
269static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 266static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
270{ \ 267{ \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
277 const char *buf, size_t count) 274 const char *buf, size_t count)
278{ 275{
279 char *end; 276 char *end;
280 cpumask_t oldmask, newmask; 277 struct thresh_restart tr;
281 unsigned long new = simple_strtoul(buf, &end, 0); 278 unsigned long new = simple_strtoul(buf, &end, 0);
282 if (end == buf) 279 if (end == buf)
283 return -EINVAL; 280 return -EINVAL;
284 b->interrupt_enable = !!new; 281 b->interrupt_enable = !!new;
285 282
286 affinity_set(b->cpu, &oldmask, &newmask); 283 tr.b = b;
287 threshold_restart_bank(b, 0, 0); 284 tr.reset = 0;
288 affinity_restore(&oldmask); 285 tr.old_limit = 0;
286 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
289 287
290 return end - buf; 288 return end - buf;
291} 289}
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
294 const char *buf, size_t count) 292 const char *buf, size_t count)
295{ 293{
296 char *end; 294 char *end;
297 cpumask_t oldmask, newmask; 295 struct thresh_restart tr;
298 u16 old;
299 unsigned long new = simple_strtoul(buf, &end, 0); 296 unsigned long new = simple_strtoul(buf, &end, 0);
300 if (end == buf) 297 if (end == buf)
301 return -EINVAL; 298 return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 new = THRESHOLD_MAX; 300 new = THRESHOLD_MAX;
304 if (new < 1) 301 if (new < 1)
305 new = 1; 302 new = 1;
306 old = b->threshold_limit; 303 tr.old_limit = b->threshold_limit;
307 b->threshold_limit = new; 304 b->threshold_limit = new;
305 tr.b = b;
306 tr.reset = 0;
308 307
309 affinity_set(b->cpu, &oldmask, &newmask); 308 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
310 threshold_restart_bank(b, 0, old);
311 affinity_restore(&oldmask);
312 309
313 return end - buf; 310 return end - buf;
314} 311}
315 312
316static ssize_t show_error_count(struct threshold_block *b, char *buf) 313static long local_error_count(void *_b)
317{ 314{
318 u32 high, low; 315 struct threshold_block *b = _b;
319 cpumask_t oldmask, newmask; 316 u32 low, high;
320 affinity_set(b->cpu, &oldmask, &newmask); 317
321 rdmsr(b->address, low, high); 318 rdmsr(b->address, low, high);
322 affinity_restore(&oldmask); 319 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
323 return sprintf(buf, "%x\n", 320}
324 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); 321
322static ssize_t show_error_count(struct threshold_block *b, char *buf)
323{
324 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
325} 325}
326 326
327static ssize_t store_error_count(struct threshold_block *b, 327static ssize_t store_error_count(struct threshold_block *b,
328 const char *buf, size_t count) 328 const char *buf, size_t count)
329{ 329{
330 cpumask_t oldmask, newmask; 330 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
331 affinity_set(b->cpu, &oldmask, &newmask); 331
332 threshold_restart_bank(b, 1, 0); 332 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
333 affinity_restore(&oldmask);
334 return 1; 333 return 1;
335} 334}
336 335
@@ -463,12 +462,19 @@ out_free:
463 return err; 462 return err;
464} 463}
465 464
465static long local_allocate_threshold_blocks(void *_bank)
466{
467 unsigned int *bank = _bank;
468
469 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
470 MSR_IA32_MC0_MISC + *bank * 4);
471}
472
466/* symlinks sibling shared banks to first core. first core owns dir/files. */ 473/* symlinks sibling shared banks to first core. first core owns dir/files. */
467static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) 474static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
468{ 475{
469 int i, err = 0; 476 int i, err = 0;
470 struct threshold_bank *b = NULL; 477 struct threshold_bank *b = NULL;
471 cpumask_t oldmask, newmask;
472 char name[32]; 478 char name[32];
473 479
474 sprintf(name, "threshold_bank%i", bank); 480 sprintf(name, "threshold_bank%i", bank);
@@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
519 525
520 per_cpu(threshold_banks, cpu)[bank] = b; 526 per_cpu(threshold_banks, cpu)[bank] = b;
521 527
522 affinity_set(cpu, &oldmask, &newmask); 528 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
523 err = allocate_threshold_blocks(cpu, bank, 0,
524 MSR_IA32_MC0_MISC + bank * 4);
525 affinity_restore(&oldmask);
526
527 if (err) 529 if (err)
528 goto out_free; 530 goto out_free;
529 531
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..b59ddcc88cd8 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
14#include <asm/pat.h> 14#include <asm/pat.h>
15#include "mtrr.h" 15#include "mtrr.h"
16 16
17struct mtrr_state {
18 struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
19 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
20 unsigned char enabled;
21 unsigned char have_fixed;
22 mtrr_type def_type;
23};
24
25struct fixed_range_block { 17struct fixed_range_block {
26 int base_msr; /* start address of an MTRR block */ 18 int base_msr; /* start address of an MTRR block */
27 int ranges; /* number of MTRRs in this block */ 19 int ranges; /* number of MTRRs in this block */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
35}; 27};
36 28
37static unsigned long smp_changes_mask; 29static unsigned long smp_changes_mask;
38static struct mtrr_state mtrr_state = {};
39static int mtrr_state_set; 30static int mtrr_state_set;
40u64 mtrr_tom2; 31u64 mtrr_tom2;
41 32
33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state);
35
42#undef MODULE_PARAM_PREFIX 36#undef MODULE_PARAM_PREFIX
43#define MODULE_PARAM_PREFIX "mtrr." 37#define MODULE_PARAM_PREFIX "mtrr."
44 38
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 44fcb237bd52..d259e5d2e054 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
49 49
50u32 num_var_ranges = 0; 50u32 num_var_ranges = 0;
51 51
52unsigned int mtrr_usage_table[MAX_VAR_RANGES]; 52unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
53static DEFINE_MUTEX(mtrr_mutex); 53static DEFINE_MUTEX(mtrr_mutex);
54 54
55u64 size_or_mask, size_and_mask; 55u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b23..ffd60409cc6d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
8#define MTRRcap_MSR 0x0fe 8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff 9#define MTRRdefType_MSR 0x2ff
10 10
11#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
12#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
13
14#define NUM_FIXED_RANGES 88
15#define MAX_VAR_RANGES 256
16#define MTRRfix64K_00000_MSR 0x250 11#define MTRRfix64K_00000_MSR 0x250
17#define MTRRfix16K_80000_MSR 0x258 12#define MTRRfix16K_80000_MSR 0x258
18#define MTRRfix16K_A0000_MSR 0x259 13#define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
29#define MTRR_CHANGE_MASK_VARIABLE 0x02 24#define MTRR_CHANGE_MASK_VARIABLE 0x02
30#define MTRR_CHANGE_MASK_DEFTYPE 0x04 25#define MTRR_CHANGE_MASK_DEFTYPE 0x04
31 26
32/* In the Intel processor's MTRR interface, the MTRR type is always held in 27extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
33 an 8 bit field: */
34typedef u8 mtrr_type;
35
36extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
37 28
38struct mtrr_ops { 29struct mtrr_ops {
39 u32 vendor; 30 u32 vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
70 u32 ccr3; 61 u32 ccr3;
71}; 62};
72 63
73struct mtrr_var_range {
74 u32 base_lo;
75 u32 base_hi;
76 u32 mask_lo;
77 u32 mask_hi;
78};
79
80void set_mtrr_done(struct set_mtrr_context *ctxt); 64void set_mtrr_done(struct set_mtrr_context *ctxt);
81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); 65void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); 66void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd7..c689d19e35ab 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/smp.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h>
29 30
30#include <mach_ipi.h> 31#include <mach_ipi.h>
31 32
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
49#endif 50#endif
50 crash_save_cpu(regs, cpu); 51 crash_save_cpu(regs, cpu);
51 52
53 /* Disable VMX or SVM if needed.
54 *
55 * We need to disable virtualization on all CPUs.
56 * Having VMX or SVM enabled on any CPU may break rebooting
57 * after the kdump kernel has finished its task.
58 */
59 cpu_emergency_vmxoff();
60 cpu_emergency_svm_disable();
61
52 disable_local_APIC(); 62 disable_local_APIC();
53} 63}
54 64
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
80 local_irq_disable(); 90 local_irq_disable();
81 91
82 kdump_nmi_shootdown_cpus(); 92 kdump_nmi_shootdown_cpus();
93
94 /* Booting kdump kernel with VMX or SVM enabled won't work,
95 * because (among other limitations) we can't disable paging
96 * with the virt flags.
97 */
98 cpu_emergency_vmxoff();
99 cpu_emergency_svm_disable();
100
83 lapic_shutdown(); 101 lapic_shutdown();
84#if defined(CONFIG_X86_IO_APIC) 102#if defined(CONFIG_X86_IO_APIC)
85 disable_IO_APIC(); 103 disable_IO_APIC();
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index c0262791bda4..34185488e4fb 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
30 return 1; 30 return 1;
31} 31}
32 32
33static cpumask_t flat_target_cpus(void) 33static const struct cpumask *flat_target_cpus(void)
34{ 34{
35 return cpu_online_map; 35 return cpu_online_mask;
36} 36}
37 37
38static cpumask_t flat_vector_allocation_domain(int cpu) 38static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
39{ 39{
40 /* Careful. Some cpus do not strictly honor the set of cpus 40 /* Careful. Some cpus do not strictly honor the set of cpus
41 * specified in the interrupt destination when using lowest 41 * specified in the interrupt destination when using lowest
@@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
45 * deliver interrupts to the wrong hyperthread when only one 45 * deliver interrupts to the wrong hyperthread when only one
46 * hyperthread was specified in the interrupt desitination. 46 * hyperthread was specified in the interrupt desitination.
47 */ 47 */
48 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 48 cpumask_clear(retmask);
49 return domain; 49 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
50} 50}
51 51
52/* 52/*
@@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void)
69 apic_write(APIC_LDR, val); 69 apic_write(APIC_LDR, val);
70} 70}
71 71
72static void flat_send_IPI_mask(cpumask_t cpumask, int vector) 72static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
73{ 73{
74 unsigned long mask = cpus_addr(cpumask)[0];
75 unsigned long flags; 74 unsigned long flags;
76 75
77 local_irq_save(flags); 76 local_irq_save(flags);
@@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
79 local_irq_restore(flags); 78 local_irq_restore(flags);
80} 79}
81 80
81static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
82{
83 unsigned long mask = cpumask_bits(cpumask)[0];
84
85 _flat_send_IPI_mask(mask, vector);
86}
87
88static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
89 int vector)
90{
91 unsigned long mask = cpumask_bits(cpumask)[0];
92 int cpu = smp_processor_id();
93
94 if (cpu < BITS_PER_LONG)
95 clear_bit(cpu, &mask);
96 _flat_send_IPI_mask(mask, vector);
97}
98
82static void flat_send_IPI_allbutself(int vector) 99static void flat_send_IPI_allbutself(int vector)
83{ 100{
101 int cpu = smp_processor_id();
84#ifdef CONFIG_HOTPLUG_CPU 102#ifdef CONFIG_HOTPLUG_CPU
85 int hotplug = 1; 103 int hotplug = 1;
86#else 104#else
87 int hotplug = 0; 105 int hotplug = 0;
88#endif 106#endif
89 if (hotplug || vector == NMI_VECTOR) { 107 if (hotplug || vector == NMI_VECTOR) {
90 cpumask_t allbutme = cpu_online_map; 108 if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
109 unsigned long mask = cpumask_bits(cpu_online_mask)[0];
91 110
92 cpu_clear(smp_processor_id(), allbutme); 111 if (cpu < BITS_PER_LONG)
112 clear_bit(cpu, &mask);
93 113
94 if (!cpus_empty(allbutme)) 114 _flat_send_IPI_mask(mask, vector);
95 flat_send_IPI_mask(allbutme, vector); 115 }
96 } else if (num_online_cpus() > 1) { 116 } else if (num_online_cpus() > 1) {
97 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); 117 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
98 } 118 }
@@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector)
101static void flat_send_IPI_all(int vector) 121static void flat_send_IPI_all(int vector)
102{ 122{
103 if (vector == NMI_VECTOR) 123 if (vector == NMI_VECTOR)
104 flat_send_IPI_mask(cpu_online_map, vector); 124 flat_send_IPI_mask(cpu_online_mask, vector);
105 else 125 else
106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 126 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
107} 127}
@@ -135,9 +155,18 @@ static int flat_apic_id_registered(void)
135 return physid_isset(read_xapic_id(), phys_cpu_present_map); 155 return physid_isset(read_xapic_id(), phys_cpu_present_map);
136} 156}
137 157
138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 158static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
159{
160 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
161}
162
163static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
164 const struct cpumask *andmask)
139{ 165{
140 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; 166 unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
167 unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
168
169 return mask1 & mask2;
141} 170}
142 171
143static unsigned int phys_pkg_id(int index_msb) 172static unsigned int phys_pkg_id(int index_msb)
@@ -157,8 +186,10 @@ struct genapic apic_flat = {
157 .send_IPI_all = flat_send_IPI_all, 186 .send_IPI_all = flat_send_IPI_all,
158 .send_IPI_allbutself = flat_send_IPI_allbutself, 187 .send_IPI_allbutself = flat_send_IPI_allbutself,
159 .send_IPI_mask = flat_send_IPI_mask, 188 .send_IPI_mask = flat_send_IPI_mask,
189 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
160 .send_IPI_self = apic_send_IPI_self, 190 .send_IPI_self = apic_send_IPI_self,
161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 191 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
192 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
162 .phys_pkg_id = phys_pkg_id, 193 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id, 194 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id, 195 .set_apic_id = set_apic_id,
@@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
188 return 0; 219 return 0;
189} 220}
190 221
191static cpumask_t physflat_target_cpus(void) 222static const struct cpumask *physflat_target_cpus(void)
192{ 223{
193 return cpu_online_map; 224 return cpu_online_mask;
194} 225}
195 226
196static cpumask_t physflat_vector_allocation_domain(int cpu) 227static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
197{ 228{
198 return cpumask_of_cpu(cpu); 229 cpumask_clear(retmask);
230 cpumask_set_cpu(cpu, retmask);
199} 231}
200 232
201static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) 233static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
202{ 234{
203 send_IPI_mask_sequence(cpumask, vector); 235 send_IPI_mask_sequence(cpumask, vector);
204} 236}
205 237
206static void physflat_send_IPI_allbutself(int vector) 238static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
239 int vector)
207{ 240{
208 cpumask_t allbutme = cpu_online_map; 241 send_IPI_mask_allbutself(cpumask, vector);
242}
209 243
210 cpu_clear(smp_processor_id(), allbutme); 244static void physflat_send_IPI_allbutself(int vector)
211 physflat_send_IPI_mask(allbutme, vector); 245{
246 send_IPI_mask_allbutself(cpu_online_mask, vector);
212} 247}
213 248
214static void physflat_send_IPI_all(int vector) 249static void physflat_send_IPI_all(int vector)
215{ 250{
216 physflat_send_IPI_mask(cpu_online_map, vector); 251 physflat_send_IPI_mask(cpu_online_mask, vector);
217} 252}
218 253
219static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) 254static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
220{ 255{
221 int cpu; 256 int cpu;
222 257
@@ -224,13 +259,31 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
224 * We're using fixed IRQ delivery, can only return one phys APIC ID. 259 * We're using fixed IRQ delivery, can only return one phys APIC ID.
225 * May as well be the first. 260 * May as well be the first.
226 */ 261 */
227 cpu = first_cpu(cpumask); 262 cpu = cpumask_first(cpumask);
228 if ((unsigned)cpu < nr_cpu_ids) 263 if ((unsigned)cpu < nr_cpu_ids)
229 return per_cpu(x86_cpu_to_apicid, cpu); 264 return per_cpu(x86_cpu_to_apicid, cpu);
230 else 265 else
231 return BAD_APICID; 266 return BAD_APICID;
232} 267}
233 268
269static unsigned int
270physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
271 const struct cpumask *andmask)
272{
273 int cpu;
274
275 /*
276 * We're using fixed IRQ delivery, can only return one phys APIC ID.
277 * May as well be the first.
278 */
279 for_each_cpu_and(cpu, cpumask, andmask)
280 if (cpumask_test_cpu(cpu, cpu_online_mask))
281 break;
282 if (cpu < nr_cpu_ids)
283 return per_cpu(x86_cpu_to_apicid, cpu);
284 return BAD_APICID;
285}
286
234struct genapic apic_physflat = { 287struct genapic apic_physflat = {
235 .name = "physical flat", 288 .name = "physical flat",
236 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 289 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
@@ -243,8 +296,10 @@ struct genapic apic_physflat = {
243 .send_IPI_all = physflat_send_IPI_all, 296 .send_IPI_all = physflat_send_IPI_all,
244 .send_IPI_allbutself = physflat_send_IPI_allbutself, 297 .send_IPI_allbutself = physflat_send_IPI_allbutself,
245 .send_IPI_mask = physflat_send_IPI_mask, 298 .send_IPI_mask = physflat_send_IPI_mask,
299 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
246 .send_IPI_self = apic_send_IPI_self, 300 .send_IPI_self = apic_send_IPI_self,
247 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 301 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
302 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
248 .phys_pkg_id = phys_pkg_id, 303 .phys_pkg_id = phys_pkg_id,
249 .get_apic_id = get_apic_id, 304 .get_apic_id = get_apic_id,
250 .set_apic_id = set_apic_id, 305 .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index f6a2c8eb48a6..6ce497cc372d 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
22 22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24 24
25static cpumask_t x2apic_target_cpus(void) 25static const struct cpumask *x2apic_target_cpus(void)
26{ 26{
27 return cpumask_of_cpu(0); 27 return cpumask_of(0);
28} 28}
29 29
30/* 30/*
31 * for now each logical cpu is in its own vector allocation domain. 31 * for now each logical cpu is in its own vector allocation domain.
32 */ 32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu) 33static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
34{ 34{
35 cpumask_t domain = CPU_MASK_NONE; 35 cpumask_clear(retmask);
36 cpu_set(cpu, domain); 36 cpumask_set_cpu(cpu, retmask);
37 return domain;
38} 37}
39 38
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, 39static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
56 * at once. We have 16 cpu's in a cluster. This will minimize IPI register 55 * at once. We have 16 cpu's in a cluster. This will minimize IPI register
57 * writes. 56 * writes.
58 */ 57 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector) 58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
60{ 59{
61 unsigned long flags; 60 unsigned long flags;
62 unsigned long query_cpu; 61 unsigned long query_cpu;
63 62
64 local_irq_save(flags); 63 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) { 64 for_each_cpu(query_cpu, mask)
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), 65 __x2apic_send_IPI_dest(
67 vector, APIC_DEST_LOGICAL); 66 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
68 } 67 vector, APIC_DEST_LOGICAL);
69 local_irq_restore(flags); 68 local_irq_restore(flags);
70} 69}
71 70
72static void x2apic_send_IPI_allbutself(int vector) 71static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
72 int vector)
73{ 73{
74 cpumask_t mask = cpu_online_map; 74 unsigned long flags;
75 unsigned long query_cpu;
76 unsigned long this_cpu = smp_processor_id();
75 77
76 cpu_clear(smp_processor_id(), mask); 78 local_irq_save(flags);
79 for_each_cpu(query_cpu, mask)
80 if (query_cpu != this_cpu)
81 __x2apic_send_IPI_dest(
82 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
83 vector, APIC_DEST_LOGICAL);
84 local_irq_restore(flags);
85}
86
87static void x2apic_send_IPI_allbutself(int vector)
88{
89 unsigned long flags;
90 unsigned long query_cpu;
91 unsigned long this_cpu = smp_processor_id();
77 92
78 if (!cpus_empty(mask)) 93 local_irq_save(flags);
79 x2apic_send_IPI_mask(mask, vector); 94 for_each_online_cpu(query_cpu)
95 if (query_cpu != this_cpu)
96 __x2apic_send_IPI_dest(
97 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
98 vector, APIC_DEST_LOGICAL);
99 local_irq_restore(flags);
80} 100}
81 101
82static void x2apic_send_IPI_all(int vector) 102static void x2apic_send_IPI_all(int vector)
83{ 103{
84 x2apic_send_IPI_mask(cpu_online_map, vector); 104 x2apic_send_IPI_mask(cpu_online_mask, vector);
85} 105}
86 106
87static int x2apic_apic_id_registered(void) 107static int x2apic_apic_id_registered(void)
@@ -89,21 +109,38 @@ static int x2apic_apic_id_registered(void)
89 return 1; 109 return 1;
90} 110}
91 111
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) 112static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
93{ 113{
94 int cpu; 114 int cpu;
95 115
96 /* 116 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID. 117 * We're using fixed IRQ delivery, can only return one logical APIC ID.
98 * May as well be the first. 118 * May as well be the first.
99 */ 119 */
100 cpu = first_cpu(cpumask); 120 cpu = cpumask_first(cpumask);
101 if ((unsigned)cpu < NR_CPUS) 121 if ((unsigned)cpu < nr_cpu_ids)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu); 122 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else 123 else
104 return BAD_APICID; 124 return BAD_APICID;
105} 125}
106 126
127static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
128 const struct cpumask *andmask)
129{
130 int cpu;
131
132 /*
133 * We're using fixed IRQ delivery, can only return one logical APIC ID.
134 * May as well be the first.
135 */
136 for_each_cpu_and(cpu, cpumask, andmask)
137 if (cpumask_test_cpu(cpu, cpu_online_mask))
138 break;
139 if (cpu < nr_cpu_ids)
140 return per_cpu(x86_cpu_to_logical_apicid, cpu);
141 return BAD_APICID;
142}
143
107static unsigned int get_apic_id(unsigned long x) 144static unsigned int get_apic_id(unsigned long x)
108{ 145{
109 unsigned int id; 146 unsigned int id;
@@ -150,8 +187,10 @@ struct genapic apic_x2apic_cluster = {
150 .send_IPI_all = x2apic_send_IPI_all, 187 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 188 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask, 189 .send_IPI_mask = x2apic_send_IPI_mask,
190 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
153 .send_IPI_self = x2apic_send_IPI_self, 191 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 192 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
193 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
155 .phys_pkg_id = phys_pkg_id, 194 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id, 195 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id, 196 .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index a177c7880ab5..21bcc0e098ba 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29 29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31 31
32static cpumask_t x2apic_target_cpus(void) 32static const struct cpumask *x2apic_target_cpus(void)
33{ 33{
34 return cpumask_of_cpu(0); 34 return cpumask_of(0);
35} 35}
36 36
37static cpumask_t x2apic_vector_allocation_domain(int cpu) 37static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
38{ 38{
39 cpumask_t domain = CPU_MASK_NONE; 39 cpumask_clear(retmask);
40 cpu_set(cpu, domain); 40 cpumask_set_cpu(cpu, retmask);
41 return domain;
42} 41}
43 42
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, 43static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
54 x2apic_icr_write(cfg, apicid); 53 x2apic_icr_write(cfg, apicid);
55} 54}
56 55
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector) 56static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{ 57{
59 unsigned long flags; 58 unsigned long flags;
60 unsigned long query_cpu; 59 unsigned long query_cpu;
61 60
62 local_irq_save(flags); 61 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) { 62 for_each_cpu(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 63 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL); 64 vector, APIC_DEST_PHYSICAL);
66 } 65 }
67 local_irq_restore(flags); 66 local_irq_restore(flags);
68} 67}
69 68
70static void x2apic_send_IPI_allbutself(int vector) 69static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
70 int vector)
71{ 71{
72 cpumask_t mask = cpu_online_map; 72 unsigned long flags;
73 unsigned long query_cpu;
74 unsigned long this_cpu = smp_processor_id();
75
76 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) {
78 if (query_cpu != this_cpu)
79 __x2apic_send_IPI_dest(
80 per_cpu(x86_cpu_to_apicid, query_cpu),
81 vector, APIC_DEST_PHYSICAL);
82 }
83 local_irq_restore(flags);
84}
73 85
74 cpu_clear(smp_processor_id(), mask); 86static void x2apic_send_IPI_allbutself(int vector)
87{
88 unsigned long flags;
89 unsigned long query_cpu;
90 unsigned long this_cpu = smp_processor_id();
75 91
76 if (!cpus_empty(mask)) 92 local_irq_save(flags);
77 x2apic_send_IPI_mask(mask, vector); 93 for_each_online_cpu(query_cpu)
94 if (query_cpu != this_cpu)
95 __x2apic_send_IPI_dest(
96 per_cpu(x86_cpu_to_apicid, query_cpu),
97 vector, APIC_DEST_PHYSICAL);
98 local_irq_restore(flags);
78} 99}
79 100
80static void x2apic_send_IPI_all(int vector) 101static void x2apic_send_IPI_all(int vector)
81{ 102{
82 x2apic_send_IPI_mask(cpu_online_map, vector); 103 x2apic_send_IPI_mask(cpu_online_mask, vector);
83} 104}
84 105
85static int x2apic_apic_id_registered(void) 106static int x2apic_apic_id_registered(void)
@@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void)
87 return 1; 108 return 1;
88} 109}
89 110
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) 111static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
91{ 112{
92 int cpu; 113 int cpu;
93 114
@@ -95,13 +116,30 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
95 * We're using fixed IRQ delivery, can only return one phys APIC ID. 116 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first. 117 * May as well be the first.
97 */ 118 */
98 cpu = first_cpu(cpumask); 119 cpu = cpumask_first(cpumask);
99 if ((unsigned)cpu < NR_CPUS) 120 if ((unsigned)cpu < nr_cpu_ids)
100 return per_cpu(x86_cpu_to_apicid, cpu); 121 return per_cpu(x86_cpu_to_apicid, cpu);
101 else 122 else
102 return BAD_APICID; 123 return BAD_APICID;
103} 124}
104 125
126static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
127 const struct cpumask *andmask)
128{
129 int cpu;
130
131 /*
132 * We're using fixed IRQ delivery, can only return one phys APIC ID.
133 * May as well be the first.
134 */
135 for_each_cpu_and(cpu, cpumask, andmask)
136 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break;
138 if (cpu < nr_cpu_ids)
139 return per_cpu(x86_cpu_to_apicid, cpu);
140 return BAD_APICID;
141}
142
105static unsigned int get_apic_id(unsigned long x) 143static unsigned int get_apic_id(unsigned long x)
106{ 144{
107 unsigned int id; 145 unsigned int id;
@@ -145,8 +183,10 @@ struct genapic apic_x2apic_phys = {
145 .send_IPI_all = x2apic_send_IPI_all, 183 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 184 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask, 185 .send_IPI_mask = x2apic_send_IPI_mask,
186 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
148 .send_IPI_self = x2apic_send_IPI_self, 187 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 188 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
189 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
150 .phys_pkg_id = phys_pkg_id, 190 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id, 191 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id, 192 .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index dece17289731..b193e082f6ce 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -79,16 +79,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
79 79
80/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 80/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
81 81
82static cpumask_t uv_target_cpus(void) 82static const struct cpumask *uv_target_cpus(void)
83{ 83{
84 return cpumask_of_cpu(0); 84 return cpumask_of(0);
85} 85}
86 86
87static cpumask_t uv_vector_allocation_domain(int cpu) 87static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
88{ 88{
89 cpumask_t domain = CPU_MASK_NONE; 89 cpumask_clear(retmask);
90 cpu_set(cpu, domain); 90 cpumask_set_cpu(cpu, retmask);
91 return domain;
92} 91}
93 92
94int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 93int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -127,28 +126,37 @@ static void uv_send_IPI_one(int cpu, int vector)
127 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 126 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
128} 127}
129 128
130static void uv_send_IPI_mask(cpumask_t mask, int vector) 129static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
131{ 130{
132 unsigned int cpu; 131 unsigned int cpu;
133 132
134 for_each_possible_cpu(cpu) 133 for_each_cpu(cpu, mask)
135 if (cpu_isset(cpu, mask)) 134 uv_send_IPI_one(cpu, vector);
135}
136
137static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
138{
139 unsigned int cpu;
140 unsigned int this_cpu = smp_processor_id();
141
142 for_each_cpu(cpu, mask)
143 if (cpu != this_cpu)
136 uv_send_IPI_one(cpu, vector); 144 uv_send_IPI_one(cpu, vector);
137} 145}
138 146
139static void uv_send_IPI_allbutself(int vector) 147static void uv_send_IPI_allbutself(int vector)
140{ 148{
141 cpumask_t mask = cpu_online_map; 149 unsigned int cpu;
142 150 unsigned int this_cpu = smp_processor_id();
143 cpu_clear(smp_processor_id(), mask);
144 151
145 if (!cpus_empty(mask)) 152 for_each_online_cpu(cpu)
146 uv_send_IPI_mask(mask, vector); 153 if (cpu != this_cpu)
154 uv_send_IPI_one(cpu, vector);
147} 155}
148 156
149static void uv_send_IPI_all(int vector) 157static void uv_send_IPI_all(int vector)
150{ 158{
151 uv_send_IPI_mask(cpu_online_map, vector); 159 uv_send_IPI_mask(cpu_online_mask, vector);
152} 160}
153 161
154static int uv_apic_id_registered(void) 162static int uv_apic_id_registered(void)
@@ -160,7 +168,7 @@ static void uv_init_apic_ldr(void)
160{ 168{
161} 169}
162 170
163static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 171static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
164{ 172{
165 int cpu; 173 int cpu;
166 174
@@ -168,13 +176,30 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
168 * We're using fixed IRQ delivery, can only return one phys APIC ID. 176 * We're using fixed IRQ delivery, can only return one phys APIC ID.
169 * May as well be the first. 177 * May as well be the first.
170 */ 178 */
171 cpu = first_cpu(cpumask); 179 cpu = cpumask_first(cpumask);
172 if ((unsigned)cpu < nr_cpu_ids) 180 if ((unsigned)cpu < nr_cpu_ids)
173 return per_cpu(x86_cpu_to_apicid, cpu); 181 return per_cpu(x86_cpu_to_apicid, cpu);
174 else 182 else
175 return BAD_APICID; 183 return BAD_APICID;
176} 184}
177 185
186static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
187 const struct cpumask *andmask)
188{
189 int cpu;
190
191 /*
192 * We're using fixed IRQ delivery, can only return one phys APIC ID.
193 * May as well be the first.
194 */
195 for_each_cpu_and(cpu, cpumask, andmask)
196 if (cpumask_test_cpu(cpu, cpu_online_mask))
197 break;
198 if (cpu < nr_cpu_ids)
199 return per_cpu(x86_cpu_to_apicid, cpu);
200 return BAD_APICID;
201}
202
178static unsigned int get_apic_id(unsigned long x) 203static unsigned int get_apic_id(unsigned long x)
179{ 204{
180 unsigned int id; 205 unsigned int id;
@@ -222,8 +247,10 @@ struct genapic apic_x2apic_uv_x = {
222 .send_IPI_all = uv_send_IPI_all, 247 .send_IPI_all = uv_send_IPI_all,
223 .send_IPI_allbutself = uv_send_IPI_allbutself, 248 .send_IPI_allbutself = uv_send_IPI_allbutself,
224 .send_IPI_mask = uv_send_IPI_mask, 249 .send_IPI_mask = uv_send_IPI_mask,
250 .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
225 .send_IPI_self = uv_send_IPI_self, 251 .send_IPI_self = uv_send_IPI_self,
226 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 252 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
253 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
227 .phys_pkg_id = phys_pkg_id, 254 .phys_pkg_id = phys_pkg_id,
228 .get_apic_id = get_apic_id, 255 .get_apic_id = get_apic_id,
229 .set_apic_id = set_apic_id, 256 .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3f0a3edf0a57..cd759ad90690 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -248,7 +248,7 @@ static void hpet_legacy_clockevent_register(void)
248 * Start hpet with the boot cpu mask and make it 248 * Start hpet with the boot cpu mask and make it
249 * global after the IO_APIC has been initialized. 249 * global after the IO_APIC has been initialized.
250 */ 250 */
251 hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); 251 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
252 clockevents_register_device(&hpet_clockevent); 252 clockevents_register_device(&hpet_clockevent);
253 global_clock_event = &hpet_clockevent; 253 global_clock_event = &hpet_clockevent;
254 printk(KERN_DEBUG "hpet clockevent registered\n"); 254 printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -303,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
303 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); 303 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
304 hpet_setup_msi_irq(hdev->irq); 304 hpet_setup_msi_irq(hdev->irq);
305 disable_irq(hdev->irq); 305 disable_irq(hdev->irq);
306 irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); 306 irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
307 enable_irq(hdev->irq); 307 enable_irq(hdev->irq);
308 } 308 }
309 break; 309 break;
@@ -451,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
451 return -1; 451 return -1;
452 452
453 disable_irq(dev->irq); 453 disable_irq(dev->irq);
454 irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); 454 irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
455 enable_irq(dev->irq); 455 enable_irq(dev->irq);
456 456
457 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", 457 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -502,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
502 /* 5 usec minimum reprogramming delta. */ 502 /* 5 usec minimum reprogramming delta. */
503 evt->min_delta_ns = 5000; 503 evt->min_delta_ns = 5000;
504 504
505 evt->cpumask = cpumask_of_cpu(hdev->cpu); 505 evt->cpumask = cpumask_of(hdev->cpu);
506 clockevents_register_device(evt); 506 clockevents_register_device(evt);
507} 507}
508 508
@@ -813,7 +813,7 @@ int __init hpet_enable(void)
813 813
814out_nohpet: 814out_nohpet:
815 hpet_clear_mapping(); 815 hpet_clear_mapping();
816 boot_hpet_disable = 1; 816 hpet_address = 0;
817 return 0; 817 return 0;
818} 818}
819 819
@@ -836,10 +836,11 @@ static __init int hpet_late_init(void)
836 836
837 hpet_address = force_hpet_address; 837 hpet_address = force_hpet_address;
838 hpet_enable(); 838 hpet_enable();
839 if (!hpet_virt_address)
840 return -ENODEV;
841 } 839 }
842 840
841 if (!hpet_virt_address)
842 return -ENODEV;
843
843 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 844 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
844 845
845 for_each_online_cpu(cpu) { 846 for_each_online_cpu(cpu) {
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c1b5e3ece1f2..10f92fb532f3 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
114 * Start pit with the boot cpu mask and make it global after the 114 * Start pit with the boot cpu mask and make it global after the
115 * IO_APIC has been initialized. 115 * IO_APIC has been initialized.
116 */ 116 */
117 pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); 117 pit_clockevent.cpumask = cpumask_of(smp_processor_id());
118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
119 pit_clockevent.shift); 119 pit_clockevent.shift);
120 pit_clockevent.max_delta_ns = 120 pit_clockevent.max_delta_ns =
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index d39918076bb4..df3bf269beab 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,7 +10,6 @@
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/desc.h> 11#include <asm/desc.h>
12 12
13static struct fs_struct init_fs = INIT_FS;
14static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
16struct mm_struct init_mm = INIT_MM(init_mm); 15struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index b8c8a8e99341..69911722b9d3 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,94 +108,277 @@ static int __init parse_noapic(char *str)
108early_param("noapic", parse_noapic); 108early_param("noapic", parse_noapic);
109 109
110struct irq_pin_list; 110struct irq_pin_list;
111
112/*
113 * This is performance-critical, we want to do it O(1)
114 *
115 * the indexing order of this array favors 1:1 mappings
116 * between pins and IRQs.
117 */
118
119struct irq_pin_list {
120 int apic, pin;
121 struct irq_pin_list *next;
122};
123
124static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
125{
126 struct irq_pin_list *pin;
127 int node;
128
129 node = cpu_to_node(cpu);
130
131 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
132 printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node);
133
134 return pin;
135}
136
111struct irq_cfg { 137struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin; 138 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain; 139 cpumask_var_t domain;
115 cpumask_t old_domain; 140 cpumask_var_t old_domain;
116 unsigned move_cleanup_count; 141 unsigned move_cleanup_count;
117 u8 vector; 142 u8 vector;
118 u8 move_in_progress : 1; 143 u8 move_in_progress : 1;
144#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
145 u8 move_desc_pending : 1;
146#endif
119}; 147};
120 148
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 149/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
150#ifdef CONFIG_SPARSE_IRQ
151static struct irq_cfg irq_cfgx[] = {
152#else
122static struct irq_cfg irq_cfgx[NR_IRQS] = { 153static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, 154#endif
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, 155 [0] = { .vector = IRQ0_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, 156 [1] = { .vector = IRQ1_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, 157 [2] = { .vector = IRQ2_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, 158 [3] = { .vector = IRQ3_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, 159 [4] = { .vector = IRQ4_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, 160 [5] = { .vector = IRQ5_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, 161 [6] = { .vector = IRQ6_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, 162 [7] = { .vector = IRQ7_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, 163 [8] = { .vector = IRQ8_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, 164 [9] = { .vector = IRQ9_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, 165 [10] = { .vector = IRQ10_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, 166 [11] = { .vector = IRQ11_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, 167 [12] = { .vector = IRQ12_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, 168 [13] = { .vector = IRQ13_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, 169 [14] = { .vector = IRQ14_VECTOR, },
170 [15] = { .vector = IRQ15_VECTOR, },
139}; 171};
140 172
141#define for_each_irq_cfg(irq, cfg) \ 173int __init arch_early_irq_init(void)
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) 174{
175 struct irq_cfg *cfg;
176 struct irq_desc *desc;
177 int count;
178 int i;
179
180 cfg = irq_cfgx;
181 count = ARRAY_SIZE(irq_cfgx);
182
183 for (i = 0; i < count; i++) {
184 desc = irq_to_desc(i);
185 desc->chip_data = &cfg[i];
186 alloc_bootmem_cpumask_var(&cfg[i].domain);
187 alloc_bootmem_cpumask_var(&cfg[i].old_domain);
188 if (i < NR_IRQS_LEGACY)
189 cpumask_setall(cfg[i].domain);
190 }
191
192 return 0;
193}
143 194
195#ifdef CONFIG_SPARSE_IRQ
144static struct irq_cfg *irq_cfg(unsigned int irq) 196static struct irq_cfg *irq_cfg(unsigned int irq)
145{ 197{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL; 198 struct irq_cfg *cfg = NULL;
199 struct irq_desc *desc;
200
201 desc = irq_to_desc(irq);
202 if (desc)
203 cfg = desc->chip_data;
204
205 return cfg;
147} 206}
148 207
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq) 208static struct irq_cfg *get_one_free_irq_cfg(int cpu)
150{ 209{
151 return irq_cfg(irq); 210 struct irq_cfg *cfg;
211 int node;
212
213 node = cpu_to_node(cpu);
214
215 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
216 if (cfg) {
217 /* FIXME: needs alloc_cpumask_var_node() */
218 if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
219 kfree(cfg);
220 cfg = NULL;
221 } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
222 free_cpumask_var(cfg->domain);
223 kfree(cfg);
224 cfg = NULL;
225 } else {
226 cpumask_clear(cfg->domain);
227 cpumask_clear(cfg->old_domain);
228 }
229 }
230 printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
231
232 return cfg;
152} 233}
153 234
154/* 235int arch_init_chip_data(struct irq_desc *desc, int cpu)
155 * Rough estimation of how many shared IRQs there are, can be changed 236{
156 * anytime. 237 struct irq_cfg *cfg;
157 */
158#define MAX_PLUS_SHARED_IRQS NR_IRQS
159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
160 238
161/* 239 cfg = desc->chip_data;
162 * This is performance-critical, we want to do it O(1) 240 if (!cfg) {
163 * 241 desc->chip_data = get_one_free_irq_cfg(cpu);
164 * the indexing order of this array favors 1:1 mappings 242 if (!desc->chip_data) {
165 * between pins and IRQs. 243 printk(KERN_ERR "can not alloc irq_cfg\n");
166 */ 244 BUG_ON(1);
245 }
246 }
167 247
168struct irq_pin_list { 248 return 0;
169 int apic, pin; 249}
170 struct irq_pin_list *next;
171};
172 250
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; 251#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
174static struct irq_pin_list *irq_2_pin_ptr;
175 252
176static void __init irq_2_pin_init(void) 253static void
254init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
177{ 255{
178 struct irq_pin_list *pin = irq_2_pin_head; 256 struct irq_pin_list *old_entry, *head, *tail, *entry;
179 int i; 257
258 cfg->irq_2_pin = NULL;
259 old_entry = old_cfg->irq_2_pin;
260 if (!old_entry)
261 return;
262
263 entry = get_one_free_irq_2_pin(cpu);
264 if (!entry)
265 return;
180 266
181 for (i = 1; i < PIN_MAP_SIZE; i++) 267 entry->apic = old_entry->apic;
182 pin[i-1].next = &pin[i]; 268 entry->pin = old_entry->pin;
269 head = entry;
270 tail = entry;
271 old_entry = old_entry->next;
272 while (old_entry) {
273 entry = get_one_free_irq_2_pin(cpu);
274 if (!entry) {
275 entry = head;
276 while (entry) {
277 head = entry->next;
278 kfree(entry);
279 entry = head;
280 }
281 /* still use the old one */
282 return;
283 }
284 entry->apic = old_entry->apic;
285 entry->pin = old_entry->pin;
286 tail->next = entry;
287 tail = entry;
288 old_entry = old_entry->next;
289 }
183 290
184 irq_2_pin_ptr = &pin[0]; 291 tail->next = NULL;
292 cfg->irq_2_pin = head;
185} 293}
186 294
187static struct irq_pin_list *get_one_free_irq_2_pin(void) 295static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
188{ 296{
189 struct irq_pin_list *pin = irq_2_pin_ptr; 297 struct irq_pin_list *entry, *next;
190 298
191 if (!pin) 299 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
192 panic("can not get more irq_2_pin\n"); 300 return;
193 301
194 irq_2_pin_ptr = pin->next; 302 entry = old_cfg->irq_2_pin;
195 pin->next = NULL; 303
196 return pin; 304 while (entry) {
305 next = entry->next;
306 kfree(entry);
307 entry = next;
308 }
309 old_cfg->irq_2_pin = NULL;
310}
311
312void arch_init_copy_chip_data(struct irq_desc *old_desc,
313 struct irq_desc *desc, int cpu)
314{
315 struct irq_cfg *cfg;
316 struct irq_cfg *old_cfg;
317
318 cfg = get_one_free_irq_cfg(cpu);
319
320 if (!cfg)
321 return;
322
323 desc->chip_data = cfg;
324
325 old_cfg = old_desc->chip_data;
326
327 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
328
329 init_copy_irq_2_pin(old_cfg, cfg, cpu);
197} 330}
198 331
332static void free_irq_cfg(struct irq_cfg *old_cfg)
333{
334 kfree(old_cfg);
335}
336
337void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
338{
339 struct irq_cfg *old_cfg, *cfg;
340
341 old_cfg = old_desc->chip_data;
342 cfg = desc->chip_data;
343
344 if (old_cfg == cfg)
345 return;
346
347 if (old_cfg) {
348 free_irq_2_pin(old_cfg, cfg);
349 free_irq_cfg(old_cfg);
350 old_desc->chip_data = NULL;
351 }
352}
353
354static void
355set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
356{
357 struct irq_cfg *cfg = desc->chip_data;
358
359 if (!cfg->move_in_progress) {
360 /* it means that domain is not changed */
361 if (!cpumask_intersects(&desc->affinity, mask))
362 cfg->move_desc_pending = 1;
363 }
364}
365#endif
366
367#else
368static struct irq_cfg *irq_cfg(unsigned int irq)
369{
370 return irq < nr_irqs ? irq_cfgx + irq : NULL;
371}
372
373#endif
374
375#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
376static inline void
377set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
378{
379}
380#endif
381
199struct io_apic { 382struct io_apic {
200 unsigned int index; 383 unsigned int index;
201 unsigned int unused[3]; 384 unsigned int unused[3];
@@ -237,11 +420,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
237 writel(value, &io_apic->data); 420 writel(value, &io_apic->data);
238} 421}
239 422
240static bool io_apic_level_ack_pending(unsigned int irq) 423static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
241{ 424{
242 struct irq_pin_list *entry; 425 struct irq_pin_list *entry;
243 unsigned long flags; 426 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
245 427
246 spin_lock_irqsave(&ioapic_lock, flags); 428 spin_lock_irqsave(&ioapic_lock, flags);
247 entry = cfg->irq_2_pin; 429 entry = cfg->irq_2_pin;
@@ -323,13 +505,32 @@ static void ioapic_mask_entry(int apic, int pin)
323} 505}
324 506
325#ifdef CONFIG_SMP 507#ifdef CONFIG_SMP
326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 508static void send_cleanup_vector(struct irq_cfg *cfg)
509{
510 cpumask_var_t cleanup_mask;
511
512 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
513 unsigned int i;
514 cfg->move_cleanup_count = 0;
515 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
516 cfg->move_cleanup_count++;
517 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
518 send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
519 } else {
520 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
521 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
522 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
523 free_cpumask_var(cleanup_mask);
524 }
525 cfg->move_in_progress = 0;
526}
527
528static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
327{ 529{
328 int apic, pin; 530 int apic, pin;
329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry; 531 struct irq_pin_list *entry;
532 u8 vector = cfg->vector;
331 533
332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin; 534 entry = cfg->irq_2_pin;
334 for (;;) { 535 for (;;) {
335 unsigned int reg; 536 unsigned int reg;
@@ -359,36 +560,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
359 } 560 }
360} 561}
361 562
362static int assign_irq_vector(int irq, cpumask_t mask); 563static int
564assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
363 565
364static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 566/*
567 * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
568 * of that, or returns BAD_APICID and leaves desc->affinity untouched.
569 */
570static unsigned int
571set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
572{
573 struct irq_cfg *cfg;
574 unsigned int irq;
575
576 if (!cpumask_intersects(mask, cpu_online_mask))
577 return BAD_APICID;
578
579 irq = desc->irq;
580 cfg = desc->chip_data;
581 if (assign_irq_vector(irq, cfg, mask))
582 return BAD_APICID;
583
584 cpumask_and(&desc->affinity, cfg->domain, mask);
585 set_extra_move_desc(desc, mask);
586 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
587}
588
589static void
590set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
365{ 591{
366 struct irq_cfg *cfg; 592 struct irq_cfg *cfg;
367 unsigned long flags; 593 unsigned long flags;
368 unsigned int dest; 594 unsigned int dest;
369 cpumask_t tmp; 595 unsigned int irq;
370 struct irq_desc *desc;
371 596
372 cpus_and(tmp, mask, cpu_online_map); 597 irq = desc->irq;
373 if (cpus_empty(tmp)) 598 cfg = desc->chip_data;
374 return;
375 599
376 cfg = irq_cfg(irq); 600 spin_lock_irqsave(&ioapic_lock, flags);
377 if (assign_irq_vector(irq, mask)) 601 dest = set_desc_affinity(desc, mask);
378 return; 602 if (dest != BAD_APICID) {
603 /* Only the high 8 bits are valid. */
604 dest = SET_APIC_LOGICAL_ID(dest);
605 __target_IO_APIC_irq(irq, dest, cfg);
606 }
607 spin_unlock_irqrestore(&ioapic_lock, flags);
608}
379 609
380 cpus_and(tmp, cfg->domain, mask); 610static void
381 dest = cpu_mask_to_apicid(tmp); 611set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
382 /* 612{
383 * Only the high 8 bits are valid. 613 struct irq_desc *desc;
384 */
385 dest = SET_APIC_LOGICAL_ID(dest);
386 614
387 desc = irq_to_desc(irq); 615 desc = irq_to_desc(irq);
388 spin_lock_irqsave(&ioapic_lock, flags); 616
389 __target_IO_APIC_irq(irq, dest, cfg->vector); 617 set_ioapic_affinity_irq_desc(desc, mask);
390 desc->affinity = mask;
391 spin_unlock_irqrestore(&ioapic_lock, flags);
392} 618}
393#endif /* CONFIG_SMP */ 619#endif /* CONFIG_SMP */
394 620
@@ -397,16 +623,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
397 * shared ISA-space IRQs, so we have to support them. We are super 623 * shared ISA-space IRQs, so we have to support them. We are super
398 * fast in the common case, and fast for shared ISA-space IRQs. 624 * fast in the common case, and fast for shared ISA-space IRQs.
399 */ 625 */
400static void add_pin_to_irq(unsigned int irq, int apic, int pin) 626static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
401{ 627{
402 struct irq_cfg *cfg;
403 struct irq_pin_list *entry; 628 struct irq_pin_list *entry;
404 629
405 /* first time to refer irq_cfg, so with new */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin; 630 entry = cfg->irq_2_pin;
408 if (!entry) { 631 if (!entry) {
409 entry = get_one_free_irq_2_pin(); 632 entry = get_one_free_irq_2_pin(cpu);
633 if (!entry) {
634 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
635 apic, pin);
636 return;
637 }
410 cfg->irq_2_pin = entry; 638 cfg->irq_2_pin = entry;
411 entry->apic = apic; 639 entry->apic = apic;
412 entry->pin = pin; 640 entry->pin = pin;
@@ -421,7 +649,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
421 entry = entry->next; 649 entry = entry->next;
422 } 650 }
423 651
424 entry->next = get_one_free_irq_2_pin(); 652 entry->next = get_one_free_irq_2_pin(cpu);
425 entry = entry->next; 653 entry = entry->next;
426 entry->apic = apic; 654 entry->apic = apic;
427 entry->pin = pin; 655 entry->pin = pin;
@@ -430,11 +658,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
430/* 658/*
431 * Reroute an IRQ to a different pin. 659 * Reroute an IRQ to a different pin.
432 */ 660 */
433static void __init replace_pin_at_irq(unsigned int irq, 661static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
434 int oldapic, int oldpin, 662 int oldapic, int oldpin,
435 int newapic, int newpin) 663 int newapic, int newpin)
436{ 664{
437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin; 665 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0; 666 int replaced = 0;
440 667
@@ -451,18 +678,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
451 678
452 /* why? call replace before add? */ 679 /* why? call replace before add? */
453 if (!replaced) 680 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin); 681 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
455} 682}
456 683
457static inline void io_apic_modify_irq(unsigned int irq, 684static inline void io_apic_modify_irq(struct irq_cfg *cfg,
458 int mask_and, int mask_or, 685 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry)) 686 void (*final)(struct irq_pin_list *entry))
460{ 687{
461 int pin; 688 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry; 689 struct irq_pin_list *entry;
464 690
465 cfg = irq_cfg(irq);
466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { 691 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
467 unsigned int reg; 692 unsigned int reg;
468 pin = entry->pin; 693 pin = entry->pin;
@@ -475,9 +700,9 @@ static inline void io_apic_modify_irq(unsigned int irq,
475 } 700 }
476} 701}
477 702
478static void __unmask_IO_APIC_irq(unsigned int irq) 703static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
479{ 704{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); 705 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
481} 706}
482 707
483#ifdef CONFIG_X86_64 708#ifdef CONFIG_X86_64
@@ -492,47 +717,64 @@ static void io_apic_sync(struct irq_pin_list *entry)
492 readl(&io_apic->data); 717 readl(&io_apic->data);
493} 718}
494 719
495static void __mask_IO_APIC_irq(unsigned int irq) 720static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
496{ 721{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 722 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498} 723}
499#else /* CONFIG_X86_32 */ 724#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq) 725static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
501{ 726{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); 727 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
503} 728}
504 729
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq) 730static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
506{ 731{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, 732 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL); 733 IO_APIC_REDIR_MASKED, NULL);
509} 734}
510 735
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq) 736static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
512{ 737{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 738 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 739 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515} 740}
516#endif /* CONFIG_X86_32 */ 741#endif /* CONFIG_X86_32 */
517 742
518static void mask_IO_APIC_irq (unsigned int irq) 743static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
519{ 744{
745 struct irq_cfg *cfg = desc->chip_data;
520 unsigned long flags; 746 unsigned long flags;
521 747
748 BUG_ON(!cfg);
749
522 spin_lock_irqsave(&ioapic_lock, flags); 750 spin_lock_irqsave(&ioapic_lock, flags);
523 __mask_IO_APIC_irq(irq); 751 __mask_IO_APIC_irq(cfg);
524 spin_unlock_irqrestore(&ioapic_lock, flags); 752 spin_unlock_irqrestore(&ioapic_lock, flags);
525} 753}
526 754
527static void unmask_IO_APIC_irq (unsigned int irq) 755static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
528{ 756{
757 struct irq_cfg *cfg = desc->chip_data;
529 unsigned long flags; 758 unsigned long flags;
530 759
531 spin_lock_irqsave(&ioapic_lock, flags); 760 spin_lock_irqsave(&ioapic_lock, flags);
532 __unmask_IO_APIC_irq(irq); 761 __unmask_IO_APIC_irq(cfg);
533 spin_unlock_irqrestore(&ioapic_lock, flags); 762 spin_unlock_irqrestore(&ioapic_lock, flags);
534} 763}
535 764
765static void mask_IO_APIC_irq(unsigned int irq)
766{
767 struct irq_desc *desc = irq_to_desc(irq);
768
769 mask_IO_APIC_irq_desc(desc);
770}
771static void unmask_IO_APIC_irq(unsigned int irq)
772{
773 struct irq_desc *desc = irq_to_desc(irq);
774
775 unmask_IO_APIC_irq_desc(desc);
776}
777
536static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 778static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
537{ 779{
538 struct IO_APIC_route_entry entry; 780 struct IO_APIC_route_entry entry;
@@ -809,7 +1051,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
809 */ 1051 */
810static int EISA_ELCR(unsigned int irq) 1052static int EISA_ELCR(unsigned int irq)
811{ 1053{
812 if (irq < 16) { 1054 if (irq < NR_IRQS_LEGACY) {
813 unsigned int port = 0x4d0 + (irq >> 3); 1055 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1; 1056 return (inb(port) >> (irq & 7)) & 1;
815 } 1057 }
@@ -1034,7 +1276,8 @@ void unlock_vector_lock(void)
1034 spin_unlock(&vector_lock); 1276 spin_unlock(&vector_lock);
1035} 1277}
1036 1278
1037static int __assign_irq_vector(int irq, cpumask_t mask) 1279static int
1280__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1038{ 1281{
1039 /* 1282 /*
1040 * NOTE! The local APIC isn't very good at handling 1283 * NOTE! The local APIC isn't very good at handling
@@ -1049,52 +1292,49 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
1049 */ 1292 */
1050 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1293 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1051 unsigned int old_vector; 1294 unsigned int old_vector;
1052 int cpu; 1295 int cpu, err;
1053 struct irq_cfg *cfg; 1296 cpumask_var_t tmp_mask;
1054
1055 cfg = irq_cfg(irq);
1056
1057 /* Only try and allocate irqs on cpus that are present */
1058 cpus_and(mask, mask, cpu_online_map);
1059 1297
1060 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1298 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
1061 return -EBUSY; 1299 return -EBUSY;
1062 1300
1301 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
1302 return -ENOMEM;
1303
1063 old_vector = cfg->vector; 1304 old_vector = cfg->vector;
1064 if (old_vector) { 1305 if (old_vector) {
1065 cpumask_t tmp; 1306 cpumask_and(tmp_mask, mask, cpu_online_mask);
1066 cpus_and(tmp, cfg->domain, mask); 1307 cpumask_and(tmp_mask, cfg->domain, tmp_mask);
1067 if (!cpus_empty(tmp)) 1308 if (!cpumask_empty(tmp_mask)) {
1309 free_cpumask_var(tmp_mask);
1068 return 0; 1310 return 0;
1311 }
1069 } 1312 }
1070 1313
1071 for_each_cpu_mask_nr(cpu, mask) { 1314 /* Only try and allocate irqs on cpus that are present */
1072 cpumask_t domain, new_mask; 1315 err = -ENOSPC;
1316 for_each_cpu_and(cpu, mask, cpu_online_mask) {
1073 int new_cpu; 1317 int new_cpu;
1074 int vector, offset; 1318 int vector, offset;
1075 1319
1076 domain = vector_allocation_domain(cpu); 1320 vector_allocation_domain(cpu, tmp_mask);
1077 cpus_and(new_mask, domain, cpu_online_map);
1078 1321
1079 vector = current_vector; 1322 vector = current_vector;
1080 offset = current_offset; 1323 offset = current_offset;
1081next: 1324next:
1082 vector += 8; 1325 vector += 8;
1083 if (vector >= first_system_vector) { 1326 if (vector >= first_system_vector) {
1084 /* If we run out of vectors on large boxen, must share them. */ 1327 /* If out of vectors on large boxen, must share them. */
1085 offset = (offset + 1) % 8; 1328 offset = (offset + 1) % 8;
1086 vector = FIRST_DEVICE_VECTOR + offset; 1329 vector = FIRST_DEVICE_VECTOR + offset;
1087 } 1330 }
1088 if (unlikely(current_vector == vector)) 1331 if (unlikely(current_vector == vector))
1089 continue; 1332 continue;
1090#ifdef CONFIG_X86_64 1333
1091 if (vector == IA32_SYSCALL_VECTOR) 1334 if (test_bit(vector, used_vectors))
1092 goto next;
1093#else
1094 if (vector == SYSCALL_VECTOR)
1095 goto next; 1335 goto next;
1096#endif 1336
1097 for_each_cpu_mask_nr(new_cpu, new_mask) 1337 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
1098 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1338 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
1099 goto next; 1339 goto next;
1100 /* Found one! */ 1340 /* Found one! */
@@ -1102,49 +1342,47 @@ next:
1102 current_offset = offset; 1342 current_offset = offset;
1103 if (old_vector) { 1343 if (old_vector) {
1104 cfg->move_in_progress = 1; 1344 cfg->move_in_progress = 1;
1105 cfg->old_domain = cfg->domain; 1345 cpumask_copy(cfg->old_domain, cfg->domain);
1106 } 1346 }
1107 for_each_cpu_mask_nr(new_cpu, new_mask) 1347 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
1108 per_cpu(vector_irq, new_cpu)[vector] = irq; 1348 per_cpu(vector_irq, new_cpu)[vector] = irq;
1109 cfg->vector = vector; 1349 cfg->vector = vector;
1110 cfg->domain = domain; 1350 cpumask_copy(cfg->domain, tmp_mask);
1111 return 0; 1351 err = 0;
1352 break;
1112 } 1353 }
1113 return -ENOSPC; 1354 free_cpumask_var(tmp_mask);
1355 return err;
1114} 1356}
1115 1357
1116static int assign_irq_vector(int irq, cpumask_t mask) 1358static int
1359assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1117{ 1360{
1118 int err; 1361 int err;
1119 unsigned long flags; 1362 unsigned long flags;
1120 1363
1121 spin_lock_irqsave(&vector_lock, flags); 1364 spin_lock_irqsave(&vector_lock, flags);
1122 err = __assign_irq_vector(irq, mask); 1365 err = __assign_irq_vector(irq, cfg, mask);
1123 spin_unlock_irqrestore(&vector_lock, flags); 1366 spin_unlock_irqrestore(&vector_lock, flags);
1124 return err; 1367 return err;
1125} 1368}
1126 1369
1127static void __clear_irq_vector(int irq) 1370static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1128{ 1371{
1129 struct irq_cfg *cfg;
1130 cpumask_t mask;
1131 int cpu, vector; 1372 int cpu, vector;
1132 1373
1133 cfg = irq_cfg(irq);
1134 BUG_ON(!cfg->vector); 1374 BUG_ON(!cfg->vector);
1135 1375
1136 vector = cfg->vector; 1376 vector = cfg->vector;
1137 cpus_and(mask, cfg->domain, cpu_online_map); 1377 for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
1138 for_each_cpu_mask_nr(cpu, mask)
1139 per_cpu(vector_irq, cpu)[vector] = -1; 1378 per_cpu(vector_irq, cpu)[vector] = -1;
1140 1379
1141 cfg->vector = 0; 1380 cfg->vector = 0;
1142 cpus_clear(cfg->domain); 1381 cpumask_clear(cfg->domain);
1143 1382
1144 if (likely(!cfg->move_in_progress)) 1383 if (likely(!cfg->move_in_progress))
1145 return; 1384 return;
1146 cpus_and(mask, cfg->old_domain, cpu_online_map); 1385 for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
1147 for_each_cpu_mask_nr(cpu, mask) {
1148 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; 1386 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
1149 vector++) { 1387 vector++) {
1150 if (per_cpu(vector_irq, cpu)[vector] != irq) 1388 if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1162,10 +1400,12 @@ void __setup_vector_irq(int cpu)
1162 /* This function must be called with vector_lock held */ 1400 /* This function must be called with vector_lock held */
1163 int irq, vector; 1401 int irq, vector;
1164 struct irq_cfg *cfg; 1402 struct irq_cfg *cfg;
1403 struct irq_desc *desc;
1165 1404
1166 /* Mark the inuse vectors */ 1405 /* Mark the inuse vectors */
1167 for_each_irq_cfg(irq, cfg) { 1406 for_each_irq_desc(irq, desc) {
1168 if (!cpu_isset(cpu, cfg->domain)) 1407 cfg = desc->chip_data;
1408 if (!cpumask_test_cpu(cpu, cfg->domain))
1169 continue; 1409 continue;
1170 vector = cfg->vector; 1410 vector = cfg->vector;
1171 per_cpu(vector_irq, cpu)[vector] = irq; 1411 per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1177,7 +1417,7 @@ void __setup_vector_irq(int cpu)
1177 continue; 1417 continue;
1178 1418
1179 cfg = irq_cfg(irq); 1419 cfg = irq_cfg(irq);
1180 if (!cpu_isset(cpu, cfg->domain)) 1420 if (!cpumask_test_cpu(cpu, cfg->domain))
1181 per_cpu(vector_irq, cpu)[vector] = -1; 1421 per_cpu(vector_irq, cpu)[vector] = -1;
1182 } 1422 }
1183} 1423}
@@ -1215,11 +1455,8 @@ static inline int IO_APIC_irq_trigger(int irq)
1215} 1455}
1216#endif 1456#endif
1217 1457
1218static void ioapic_register_intr(int irq, unsigned long trigger) 1458static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
1219{ 1459{
1220 struct irq_desc *desc;
1221
1222 desc = irq_to_desc(irq);
1223 1460
1224 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1461 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1225 trigger == IOAPIC_LEVEL) 1462 trigger == IOAPIC_LEVEL)
@@ -1311,23 +1548,22 @@ static int setup_ioapic_entry(int apic, int irq,
1311 return 0; 1548 return 0;
1312} 1549}
1313 1550
1314static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1551static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
1315 int trigger, int polarity) 1552 int trigger, int polarity)
1316{ 1553{
1317 struct irq_cfg *cfg; 1554 struct irq_cfg *cfg;
1318 struct IO_APIC_route_entry entry; 1555 struct IO_APIC_route_entry entry;
1319 cpumask_t mask; 1556 unsigned int dest;
1320 1557
1321 if (!IO_APIC_IRQ(irq)) 1558 if (!IO_APIC_IRQ(irq))
1322 return; 1559 return;
1323 1560
1324 cfg = irq_cfg(irq); 1561 cfg = desc->chip_data;
1325 1562
1326 mask = TARGET_CPUS; 1563 if (assign_irq_vector(irq, cfg, TARGET_CPUS))
1327 if (assign_irq_vector(irq, mask))
1328 return; 1564 return;
1329 1565
1330 cpus_and(mask, cfg->domain, mask); 1566 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
1331 1567
1332 apic_printk(APIC_VERBOSE,KERN_DEBUG 1568 apic_printk(APIC_VERBOSE,KERN_DEBUG
1333 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1569 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1337,16 +1573,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1337 1573
1338 1574
1339 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, 1575 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
1340 cpu_mask_to_apicid(mask), trigger, polarity, 1576 dest, trigger, polarity, cfg->vector)) {
1341 cfg->vector)) {
1342 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1577 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1343 mp_ioapics[apic].mp_apicid, pin); 1578 mp_ioapics[apic].mp_apicid, pin);
1344 __clear_irq_vector(irq); 1579 __clear_irq_vector(irq, cfg);
1345 return; 1580 return;
1346 } 1581 }
1347 1582
1348 ioapic_register_intr(irq, trigger); 1583 ioapic_register_intr(irq, desc, trigger);
1349 if (irq < 16) 1584 if (irq < NR_IRQS_LEGACY)
1350 disable_8259A_irq(irq); 1585 disable_8259A_irq(irq);
1351 1586
1352 ioapic_write_entry(apic, pin, entry); 1587 ioapic_write_entry(apic, pin, entry);
@@ -1356,6 +1591,9 @@ static void __init setup_IO_APIC_irqs(void)
1356{ 1591{
1357 int apic, pin, idx, irq; 1592 int apic, pin, idx, irq;
1358 int notcon = 0; 1593 int notcon = 0;
1594 struct irq_desc *desc;
1595 struct irq_cfg *cfg;
1596 int cpu = boot_cpu_id;
1359 1597
1360 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1598 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1361 1599
@@ -1387,9 +1625,15 @@ static void __init setup_IO_APIC_irqs(void)
1387 if (multi_timer_check(apic, irq)) 1625 if (multi_timer_check(apic, irq))
1388 continue; 1626 continue;
1389#endif 1627#endif
1390 add_pin_to_irq(irq, apic, pin); 1628 desc = irq_to_desc_alloc_cpu(irq, cpu);
1629 if (!desc) {
1630 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1631 continue;
1632 }
1633 cfg = desc->chip_data;
1634 add_pin_to_irq_cpu(cfg, cpu, apic, pin);
1391 1635
1392 setup_IO_APIC_irq(apic, pin, irq, 1636 setup_IO_APIC_irq(apic, pin, irq, desc,
1393 irq_trigger(idx), irq_polarity(idx)); 1637 irq_trigger(idx), irq_polarity(idx));
1394 } 1638 }
1395 } 1639 }
@@ -1448,6 +1692,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1448 union IO_APIC_reg_03 reg_03; 1692 union IO_APIC_reg_03 reg_03;
1449 unsigned long flags; 1693 unsigned long flags;
1450 struct irq_cfg *cfg; 1694 struct irq_cfg *cfg;
1695 struct irq_desc *desc;
1451 unsigned int irq; 1696 unsigned int irq;
1452 1697
1453 if (apic_verbosity == APIC_QUIET) 1698 if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1782,11 @@ __apicdebuginit(void) print_IO_APIC(void)
1537 } 1782 }
1538 } 1783 }
1539 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1784 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1540 for_each_irq_cfg(irq, cfg) { 1785 for_each_irq_desc(irq, desc) {
1541 struct irq_pin_list *entry = cfg->irq_2_pin; 1786 struct irq_pin_list *entry;
1787
1788 cfg = desc->chip_data;
1789 entry = cfg->irq_2_pin;
1542 if (!entry) 1790 if (!entry)
1543 continue; 1791 continue;
1544 printk(KERN_DEBUG "IRQ%d ", irq); 1792 printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,14 +2270,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2022{ 2270{
2023 int was_pending = 0; 2271 int was_pending = 0;
2024 unsigned long flags; 2272 unsigned long flags;
2273 struct irq_cfg *cfg;
2025 2274
2026 spin_lock_irqsave(&ioapic_lock, flags); 2275 spin_lock_irqsave(&ioapic_lock, flags);
2027 if (irq < 16) { 2276 if (irq < NR_IRQS_LEGACY) {
2028 disable_8259A_irq(irq); 2277 disable_8259A_irq(irq);
2029 if (i8259A_irq_pending(irq)) 2278 if (i8259A_irq_pending(irq))
2030 was_pending = 1; 2279 was_pending = 1;
2031 } 2280 }
2032 __unmask_IO_APIC_irq(irq); 2281 cfg = irq_cfg(irq);
2282 __unmask_IO_APIC_irq(cfg);
2033 spin_unlock_irqrestore(&ioapic_lock, flags); 2283 spin_unlock_irqrestore(&ioapic_lock, flags);
2034 2284
2035 return was_pending; 2285 return was_pending;
@@ -2043,7 +2293,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2043 unsigned long flags; 2293 unsigned long flags;
2044 2294
2045 spin_lock_irqsave(&vector_lock, flags); 2295 spin_lock_irqsave(&vector_lock, flags);
2046 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); 2296 send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2047 spin_unlock_irqrestore(&vector_lock, flags); 2297 spin_unlock_irqrestore(&vector_lock, flags);
2048 2298
2049 return 1; 2299 return 1;
@@ -2092,35 +2342,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2092 * as simple as edge triggered migration and we can do the irq migration 2342 * as simple as edge triggered migration and we can do the irq migration
2093 * with a simple atomic update to IO-APIC RTE. 2343 * with a simple atomic update to IO-APIC RTE.
2094 */ 2344 */
2095static void migrate_ioapic_irq(int irq, cpumask_t mask) 2345static void
2346migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2096{ 2347{
2097 struct irq_cfg *cfg; 2348 struct irq_cfg *cfg;
2098 struct irq_desc *desc;
2099 cpumask_t tmp, cleanup_mask;
2100 struct irte irte; 2349 struct irte irte;
2101 int modify_ioapic_rte; 2350 int modify_ioapic_rte;
2102 unsigned int dest; 2351 unsigned int dest;
2103 unsigned long flags; 2352 unsigned long flags;
2353 unsigned int irq;
2104 2354
2105 cpus_and(tmp, mask, cpu_online_map); 2355 if (!cpumask_intersects(mask, cpu_online_mask))
2106 if (cpus_empty(tmp))
2107 return; 2356 return;
2108 2357
2358 irq = desc->irq;
2109 if (get_irte(irq, &irte)) 2359 if (get_irte(irq, &irte))
2110 return; 2360 return;
2111 2361
2112 if (assign_irq_vector(irq, mask)) 2362 cfg = desc->chip_data;
2363 if (assign_irq_vector(irq, cfg, mask))
2113 return; 2364 return;
2114 2365
2115 cfg = irq_cfg(irq); 2366 set_extra_move_desc(desc, mask);
2116 cpus_and(tmp, cfg->domain, mask); 2367
2117 dest = cpu_mask_to_apicid(tmp); 2368 dest = cpu_mask_to_apicid_and(cfg->domain, mask);
2118 2369
2119 desc = irq_to_desc(irq);
2120 modify_ioapic_rte = desc->status & IRQ_LEVEL; 2370 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2121 if (modify_ioapic_rte) { 2371 if (modify_ioapic_rte) {
2122 spin_lock_irqsave(&ioapic_lock, flags); 2372 spin_lock_irqsave(&ioapic_lock, flags);
2123 __target_IO_APIC_irq(irq, dest, cfg->vector); 2373 __target_IO_APIC_irq(irq, dest, cfg);
2124 spin_unlock_irqrestore(&ioapic_lock, flags); 2374 spin_unlock_irqrestore(&ioapic_lock, flags);
2125 } 2375 }
2126 2376
@@ -2132,24 +2382,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
2132 */ 2382 */
2133 modify_irte(irq, &irte); 2383 modify_irte(irq, &irte);
2134 2384
2135 if (cfg->move_in_progress) { 2385 if (cfg->move_in_progress)
2136 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 2386 send_cleanup_vector(cfg);
2137 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2138 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2139 cfg->move_in_progress = 0;
2140 }
2141 2387
2142 desc->affinity = mask; 2388 cpumask_copy(&desc->affinity, mask);
2143} 2389}
2144 2390
2145static int migrate_irq_remapped_level(int irq) 2391static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2146{ 2392{
2147 int ret = -1; 2393 int ret = -1;
2148 struct irq_desc *desc = irq_to_desc(irq); 2394 struct irq_cfg *cfg = desc->chip_data;
2149 2395
2150 mask_IO_APIC_irq(irq); 2396 mask_IO_APIC_irq_desc(desc);
2151 2397
2152 if (io_apic_level_ack_pending(irq)) { 2398 if (io_apic_level_ack_pending(cfg)) {
2153 /* 2399 /*
2154 * Interrupt in progress. Migrating irq now will change the 2400 * Interrupt in progress. Migrating irq now will change the
2155 * vector information in the IO-APIC RTE and that will confuse 2401 * vector information in the IO-APIC RTE and that will confuse
@@ -2161,14 +2407,15 @@ static int migrate_irq_remapped_level(int irq)
2161 } 2407 }
2162 2408
2163 /* everthing is clear. we have right of way */ 2409 /* everthing is clear. we have right of way */
2164 migrate_ioapic_irq(irq, desc->pending_mask); 2410 migrate_ioapic_irq_desc(desc, &desc->pending_mask);
2165 2411
2166 ret = 0; 2412 ret = 0;
2167 desc->status &= ~IRQ_MOVE_PENDING; 2413 desc->status &= ~IRQ_MOVE_PENDING;
2168 cpus_clear(desc->pending_mask); 2414 cpumask_clear(&desc->pending_mask);
2169 2415
2170unmask: 2416unmask:
2171 unmask_IO_APIC_irq(irq); 2417 unmask_IO_APIC_irq_desc(desc);
2418
2172 return ret; 2419 return ret;
2173} 2420}
2174 2421
@@ -2189,7 +2436,7 @@ static void ir_irq_migration(struct work_struct *work)
2189 continue; 2436 continue;
2190 } 2437 }
2191 2438
2192 desc->chip->set_affinity(irq, desc->pending_mask); 2439 desc->chip->set_affinity(irq, &desc->pending_mask);
2193 spin_unlock_irqrestore(&desc->lock, flags); 2440 spin_unlock_irqrestore(&desc->lock, flags);
2194 } 2441 }
2195 } 2442 }
@@ -2198,18 +2445,24 @@ static void ir_irq_migration(struct work_struct *work)
2198/* 2445/*
2199 * Migrates the IRQ destination in the process context. 2446 * Migrates the IRQ destination in the process context.
2200 */ 2447 */
2201static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 2448static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2449 const struct cpumask *mask)
2202{ 2450{
2203 struct irq_desc *desc = irq_to_desc(irq);
2204
2205 if (desc->status & IRQ_LEVEL) { 2451 if (desc->status & IRQ_LEVEL) {
2206 desc->status |= IRQ_MOVE_PENDING; 2452 desc->status |= IRQ_MOVE_PENDING;
2207 desc->pending_mask = mask; 2453 cpumask_copy(&desc->pending_mask, mask);
2208 migrate_irq_remapped_level(irq); 2454 migrate_irq_remapped_level_desc(desc);
2209 return; 2455 return;
2210 } 2456 }
2211 2457
2212 migrate_ioapic_irq(irq, mask); 2458 migrate_ioapic_irq_desc(desc, mask);
2459}
2460static void set_ir_ioapic_affinity_irq(unsigned int irq,
2461 const struct cpumask *mask)
2462{
2463 struct irq_desc *desc = irq_to_desc(irq);
2464
2465 set_ir_ioapic_affinity_irq_desc(desc, mask);
2213} 2466}
2214#endif 2467#endif
2215 2468
@@ -2228,6 +2481,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2228 struct irq_cfg *cfg; 2481 struct irq_cfg *cfg;
2229 irq = __get_cpu_var(vector_irq)[vector]; 2482 irq = __get_cpu_var(vector_irq)[vector];
2230 2483
2484 if (irq == -1)
2485 continue;
2486
2231 desc = irq_to_desc(irq); 2487 desc = irq_to_desc(irq);
2232 if (!desc) 2488 if (!desc)
2233 continue; 2489 continue;
@@ -2237,7 +2493,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2237 if (!cfg->move_cleanup_count) 2493 if (!cfg->move_cleanup_count)
2238 goto unlock; 2494 goto unlock;
2239 2495
2240 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) 2496 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2241 goto unlock; 2497 goto unlock;
2242 2498
2243 __get_cpu_var(vector_irq)[vector] = -1; 2499 __get_cpu_var(vector_irq)[vector] = -1;
@@ -2249,28 +2505,44 @@ unlock:
2249 irq_exit(); 2505 irq_exit();
2250} 2506}
2251 2507
2252static void irq_complete_move(unsigned int irq) 2508static void irq_complete_move(struct irq_desc **descp)
2253{ 2509{
2254 struct irq_cfg *cfg = irq_cfg(irq); 2510 struct irq_desc *desc = *descp;
2511 struct irq_cfg *cfg = desc->chip_data;
2255 unsigned vector, me; 2512 unsigned vector, me;
2256 2513
2257 if (likely(!cfg->move_in_progress)) 2514 if (likely(!cfg->move_in_progress)) {
2515#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2516 if (likely(!cfg->move_desc_pending))
2517 return;
2518
2519 /* domain has not changed, but affinity did */
2520 me = smp_processor_id();
2521 if (cpu_isset(me, desc->affinity)) {
2522 *descp = desc = move_irq_desc(desc, me);
2523 /* get the new one */
2524 cfg = desc->chip_data;
2525 cfg->move_desc_pending = 0;
2526 }
2527#endif
2258 return; 2528 return;
2529 }
2259 2530
2260 vector = ~get_irq_regs()->orig_ax; 2531 vector = ~get_irq_regs()->orig_ax;
2261 me = smp_processor_id(); 2532 me = smp_processor_id();
2262 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { 2533#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2263 cpumask_t cleanup_mask; 2534 *descp = desc = move_irq_desc(desc, me);
2535 /* get the new one */
2536 cfg = desc->chip_data;
2537#endif
2264 2538
2265 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 2539 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2266 cfg->move_cleanup_count = cpus_weight(cleanup_mask); 2540 send_cleanup_vector(cfg);
2267 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2268 cfg->move_in_progress = 0;
2269 }
2270} 2541}
2271#else 2542#else
2272static inline void irq_complete_move(unsigned int irq) {} 2543static inline void irq_complete_move(struct irq_desc **descp) {}
2273#endif 2544#endif
2545
2274#ifdef CONFIG_INTR_REMAP 2546#ifdef CONFIG_INTR_REMAP
2275static void ack_x2apic_level(unsigned int irq) 2547static void ack_x2apic_level(unsigned int irq)
2276{ 2548{
@@ -2281,11 +2553,14 @@ static void ack_x2apic_edge(unsigned int irq)
2281{ 2553{
2282 ack_x2APIC_irq(); 2554 ack_x2APIC_irq();
2283} 2555}
2556
2284#endif 2557#endif
2285 2558
2286static void ack_apic_edge(unsigned int irq) 2559static void ack_apic_edge(unsigned int irq)
2287{ 2560{
2288 irq_complete_move(irq); 2561 struct irq_desc *desc = irq_to_desc(irq);
2562
2563 irq_complete_move(&desc);
2289 move_native_irq(irq); 2564 move_native_irq(irq);
2290 ack_APIC_irq(); 2565 ack_APIC_irq();
2291} 2566}
@@ -2294,18 +2569,21 @@ atomic_t irq_mis_count;
2294 2569
2295static void ack_apic_level(unsigned int irq) 2570static void ack_apic_level(unsigned int irq)
2296{ 2571{
2572 struct irq_desc *desc = irq_to_desc(irq);
2573
2297#ifdef CONFIG_X86_32 2574#ifdef CONFIG_X86_32
2298 unsigned long v; 2575 unsigned long v;
2299 int i; 2576 int i;
2300#endif 2577#endif
2578 struct irq_cfg *cfg;
2301 int do_unmask_irq = 0; 2579 int do_unmask_irq = 0;
2302 2580
2303 irq_complete_move(irq); 2581 irq_complete_move(&desc);
2304#ifdef CONFIG_GENERIC_PENDING_IRQ 2582#ifdef CONFIG_GENERIC_PENDING_IRQ
2305 /* If we are moving the irq we need to mask it */ 2583 /* If we are moving the irq we need to mask it */
2306 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2584 if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
2307 do_unmask_irq = 1; 2585 do_unmask_irq = 1;
2308 mask_IO_APIC_irq(irq); 2586 mask_IO_APIC_irq_desc(desc);
2309 } 2587 }
2310#endif 2588#endif
2311 2589
@@ -2329,7 +2607,8 @@ static void ack_apic_level(unsigned int irq)
2329 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2607 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2330 * The idea is from Manfred Spraul. --macro 2608 * The idea is from Manfred Spraul. --macro
2331 */ 2609 */
2332 i = irq_cfg(irq)->vector; 2610 cfg = desc->chip_data;
2611 i = cfg->vector;
2333 2612
2334 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2613 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2335#endif 2614#endif
@@ -2368,17 +2647,18 @@ static void ack_apic_level(unsigned int irq)
2368 * accurate and is causing problems then it is a hardware bug 2647 * accurate and is causing problems then it is a hardware bug
2369 * and you can go talk to the chipset vendor about it. 2648 * and you can go talk to the chipset vendor about it.
2370 */ 2649 */
2371 if (!io_apic_level_ack_pending(irq)) 2650 cfg = desc->chip_data;
2651 if (!io_apic_level_ack_pending(cfg))
2372 move_masked_irq(irq); 2652 move_masked_irq(irq);
2373 unmask_IO_APIC_irq(irq); 2653 unmask_IO_APIC_irq_desc(desc);
2374 } 2654 }
2375 2655
2376#ifdef CONFIG_X86_32 2656#ifdef CONFIG_X86_32
2377 if (!(v & (1 << (i & 0x1f)))) { 2657 if (!(v & (1 << (i & 0x1f)))) {
2378 atomic_inc(&irq_mis_count); 2658 atomic_inc(&irq_mis_count);
2379 spin_lock(&ioapic_lock); 2659 spin_lock(&ioapic_lock);
2380 __mask_and_edge_IO_APIC_irq(irq); 2660 __mask_and_edge_IO_APIC_irq(cfg);
2381 __unmask_and_level_IO_APIC_irq(irq); 2661 __unmask_and_level_IO_APIC_irq(cfg);
2382 spin_unlock(&ioapic_lock); 2662 spin_unlock(&ioapic_lock);
2383 } 2663 }
2384#endif 2664#endif
@@ -2429,20 +2709,19 @@ static inline void init_IO_APIC_traps(void)
2429 * Also, we've got to be careful not to trash gate 2709 * Also, we've got to be careful not to trash gate
2430 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2710 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2431 */ 2711 */
2432 for_each_irq_cfg(irq, cfg) { 2712 for_each_irq_desc(irq, desc) {
2433 if (IO_APIC_IRQ(irq) && !cfg->vector) { 2713 cfg = desc->chip_data;
2714 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2434 /* 2715 /*
2435 * Hmm.. We don't have an entry for this, 2716 * Hmm.. We don't have an entry for this,
2436 * so default to an old-fashioned 8259 2717 * so default to an old-fashioned 8259
2437 * interrupt if we can.. 2718 * interrupt if we can..
2438 */ 2719 */
2439 if (irq < 16) 2720 if (irq < NR_IRQS_LEGACY)
2440 make_8259A_irq(irq); 2721 make_8259A_irq(irq);
2441 else { 2722 else
2442 desc = irq_to_desc(irq);
2443 /* Strange. Oh, well.. */ 2723 /* Strange. Oh, well.. */
2444 desc->chip = &no_irq_chip; 2724 desc->chip = &no_irq_chip;
2445 }
2446 } 2725 }
2447 } 2726 }
2448} 2727}
@@ -2467,7 +2746,7 @@ static void unmask_lapic_irq(unsigned int irq)
2467 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2746 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2468} 2747}
2469 2748
2470static void ack_lapic_irq (unsigned int irq) 2749static void ack_lapic_irq(unsigned int irq)
2471{ 2750{
2472 ack_APIC_irq(); 2751 ack_APIC_irq();
2473} 2752}
@@ -2479,11 +2758,8 @@ static struct irq_chip lapic_chip __read_mostly = {
2479 .ack = ack_lapic_irq, 2758 .ack = ack_lapic_irq,
2480}; 2759};
2481 2760
2482static void lapic_register_intr(int irq) 2761static void lapic_register_intr(int irq, struct irq_desc *desc)
2483{ 2762{
2484 struct irq_desc *desc;
2485
2486 desc = irq_to_desc(irq);
2487 desc->status &= ~IRQ_LEVEL; 2763 desc->status &= ~IRQ_LEVEL;
2488 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2764 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2489 "edge"); 2765 "edge");
@@ -2587,7 +2863,9 @@ int timer_through_8259 __initdata;
2587 */ 2863 */
2588static inline void __init check_timer(void) 2864static inline void __init check_timer(void)
2589{ 2865{
2590 struct irq_cfg *cfg = irq_cfg(0); 2866 struct irq_desc *desc = irq_to_desc(0);
2867 struct irq_cfg *cfg = desc->chip_data;
2868 int cpu = boot_cpu_id;
2591 int apic1, pin1, apic2, pin2; 2869 int apic1, pin1, apic2, pin2;
2592 unsigned long flags; 2870 unsigned long flags;
2593 unsigned int ver; 2871 unsigned int ver;
@@ -2602,7 +2880,7 @@ static inline void __init check_timer(void)
2602 * get/set the timer IRQ vector: 2880 * get/set the timer IRQ vector:
2603 */ 2881 */
2604 disable_8259A_irq(0); 2882 disable_8259A_irq(0);
2605 assign_irq_vector(0, TARGET_CPUS); 2883 assign_irq_vector(0, cfg, TARGET_CPUS);
2606 2884
2607 /* 2885 /*
2608 * As IRQ0 is to be enabled in the 8259A, the virtual 2886 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2653,10 +2931,10 @@ static inline void __init check_timer(void)
2653 * Ok, does IRQ0 through the IOAPIC work? 2931 * Ok, does IRQ0 through the IOAPIC work?
2654 */ 2932 */
2655 if (no_pin1) { 2933 if (no_pin1) {
2656 add_pin_to_irq(0, apic1, pin1); 2934 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
2657 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2935 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2658 } 2936 }
2659 unmask_IO_APIC_irq(0); 2937 unmask_IO_APIC_irq_desc(desc);
2660 if (timer_irq_works()) { 2938 if (timer_irq_works()) {
2661 if (nmi_watchdog == NMI_IO_APIC) { 2939 if (nmi_watchdog == NMI_IO_APIC) {
2662 setup_nmi(); 2940 setup_nmi();
@@ -2682,9 +2960,9 @@ static inline void __init check_timer(void)
2682 /* 2960 /*
2683 * legacy devices should be connected to IO APIC #0 2961 * legacy devices should be connected to IO APIC #0
2684 */ 2962 */
2685 replace_pin_at_irq(0, apic1, pin1, apic2, pin2); 2963 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
2686 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2964 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2687 unmask_IO_APIC_irq(0); 2965 unmask_IO_APIC_irq_desc(desc);
2688 enable_8259A_irq(0); 2966 enable_8259A_irq(0);
2689 if (timer_irq_works()) { 2967 if (timer_irq_works()) {
2690 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2968 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2716,7 +2994,7 @@ static inline void __init check_timer(void)
2716 apic_printk(APIC_QUIET, KERN_INFO 2994 apic_printk(APIC_QUIET, KERN_INFO
2717 "...trying to set up timer as Virtual Wire IRQ...\n"); 2995 "...trying to set up timer as Virtual Wire IRQ...\n");
2718 2996
2719 lapic_register_intr(0); 2997 lapic_register_intr(0, desc);
2720 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2998 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
2721 enable_8259A_irq(0); 2999 enable_8259A_irq(0);
2722 3000
@@ -2901,22 +3179,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
2901 unsigned int irq; 3179 unsigned int irq;
2902 unsigned int new; 3180 unsigned int new;
2903 unsigned long flags; 3181 unsigned long flags;
2904 struct irq_cfg *cfg_new; 3182 struct irq_cfg *cfg_new = NULL;
2905 3183 int cpu = boot_cpu_id;
2906 irq_want = nr_irqs - 1; 3184 struct irq_desc *desc_new = NULL;
2907 3185
2908 irq = 0; 3186 irq = 0;
2909 spin_lock_irqsave(&vector_lock, flags); 3187 spin_lock_irqsave(&vector_lock, flags);
2910 for (new = irq_want; new > 0; new--) { 3188 for (new = irq_want; new < NR_IRQS; new++) {
2911 if (platform_legacy_irq(new)) 3189 if (platform_legacy_irq(new))
2912 continue; 3190 continue;
2913 cfg_new = irq_cfg(new); 3191
2914 if (cfg_new && cfg_new->vector != 0) 3192 desc_new = irq_to_desc_alloc_cpu(new, cpu);
3193 if (!desc_new) {
3194 printk(KERN_INFO "can not get irq_desc for %d\n", new);
2915 continue; 3195 continue;
2916 /* check if need to create one */ 3196 }
2917 if (!cfg_new) 3197 cfg_new = desc_new->chip_data;
2918 cfg_new = irq_cfg_alloc(new); 3198
2919 if (__assign_irq_vector(new, TARGET_CPUS) == 0) 3199 if (cfg_new->vector != 0)
3200 continue;
3201 if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
2920 irq = new; 3202 irq = new;
2921 break; 3203 break;
2922 } 3204 }
@@ -2924,15 +3206,21 @@ unsigned int create_irq_nr(unsigned int irq_want)
2924 3206
2925 if (irq > 0) { 3207 if (irq > 0) {
2926 dynamic_irq_init(irq); 3208 dynamic_irq_init(irq);
3209 /* restore it, in case dynamic_irq_init clear it */
3210 if (desc_new)
3211 desc_new->chip_data = cfg_new;
2927 } 3212 }
2928 return irq; 3213 return irq;
2929} 3214}
2930 3215
3216static int nr_irqs_gsi = NR_IRQS_LEGACY;
2931int create_irq(void) 3217int create_irq(void)
2932{ 3218{
3219 unsigned int irq_want;
2933 int irq; 3220 int irq;
2934 3221
2935 irq = create_irq_nr(nr_irqs - 1); 3222 irq_want = nr_irqs_gsi;
3223 irq = create_irq_nr(irq_want);
2936 3224
2937 if (irq == 0) 3225 if (irq == 0)
2938 irq = -1; 3226 irq = -1;
@@ -2943,14 +3231,22 @@ int create_irq(void)
2943void destroy_irq(unsigned int irq) 3231void destroy_irq(unsigned int irq)
2944{ 3232{
2945 unsigned long flags; 3233 unsigned long flags;
3234 struct irq_cfg *cfg;
3235 struct irq_desc *desc;
2946 3236
3237 /* store it, in case dynamic_irq_cleanup clear it */
3238 desc = irq_to_desc(irq);
3239 cfg = desc->chip_data;
2947 dynamic_irq_cleanup(irq); 3240 dynamic_irq_cleanup(irq);
3241 /* connect back irq_cfg */
3242 if (desc)
3243 desc->chip_data = cfg;
2948 3244
2949#ifdef CONFIG_INTR_REMAP 3245#ifdef CONFIG_INTR_REMAP
2950 free_irte(irq); 3246 free_irte(irq);
2951#endif 3247#endif
2952 spin_lock_irqsave(&vector_lock, flags); 3248 spin_lock_irqsave(&vector_lock, flags);
2953 __clear_irq_vector(irq); 3249 __clear_irq_vector(irq, cfg);
2954 spin_unlock_irqrestore(&vector_lock, flags); 3250 spin_unlock_irqrestore(&vector_lock, flags);
2955} 3251}
2956 3252
@@ -2963,16 +3259,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2963 struct irq_cfg *cfg; 3259 struct irq_cfg *cfg;
2964 int err; 3260 int err;
2965 unsigned dest; 3261 unsigned dest;
2966 cpumask_t tmp;
2967 3262
2968 tmp = TARGET_CPUS; 3263 cfg = irq_cfg(irq);
2969 err = assign_irq_vector(irq, tmp); 3264 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
2970 if (err) 3265 if (err)
2971 return err; 3266 return err;
2972 3267
2973 cfg = irq_cfg(irq); 3268 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
2974 cpus_and(tmp, cfg->domain, tmp);
2975 dest = cpu_mask_to_apicid(tmp);
2976 3269
2977#ifdef CONFIG_INTR_REMAP 3270#ifdef CONFIG_INTR_REMAP
2978 if (irq_remapped(irq)) { 3271 if (irq_remapped(irq)) {
@@ -3026,64 +3319,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3026} 3319}
3027 3320
3028#ifdef CONFIG_SMP 3321#ifdef CONFIG_SMP
3029static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3322static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3030{ 3323{
3324 struct irq_desc *desc = irq_to_desc(irq);
3031 struct irq_cfg *cfg; 3325 struct irq_cfg *cfg;
3032 struct msi_msg msg; 3326 struct msi_msg msg;
3033 unsigned int dest; 3327 unsigned int dest;
3034 cpumask_t tmp;
3035 struct irq_desc *desc;
3036 3328
3037 cpus_and(tmp, mask, cpu_online_map); 3329 dest = set_desc_affinity(desc, mask);
3038 if (cpus_empty(tmp)) 3330 if (dest == BAD_APICID)
3039 return; 3331 return;
3040 3332
3041 if (assign_irq_vector(irq, mask)) 3333 cfg = desc->chip_data;
3042 return;
3043 3334
3044 cfg = irq_cfg(irq); 3335 read_msi_msg_desc(desc, &msg);
3045 cpus_and(tmp, cfg->domain, mask);
3046 dest = cpu_mask_to_apicid(tmp);
3047
3048 read_msi_msg(irq, &msg);
3049 3336
3050 msg.data &= ~MSI_DATA_VECTOR_MASK; 3337 msg.data &= ~MSI_DATA_VECTOR_MASK;
3051 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3338 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3052 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3339 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3053 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3340 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3054 3341
3055 write_msi_msg(irq, &msg); 3342 write_msi_msg_desc(desc, &msg);
3056 desc = irq_to_desc(irq);
3057 desc->affinity = mask;
3058} 3343}
3059
3060#ifdef CONFIG_INTR_REMAP 3344#ifdef CONFIG_INTR_REMAP
3061/* 3345/*
3062 * Migrate the MSI irq to another cpumask. This migration is 3346 * Migrate the MSI irq to another cpumask. This migration is
3063 * done in the process context using interrupt-remapping hardware. 3347 * done in the process context using interrupt-remapping hardware.
3064 */ 3348 */
3065static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3349static void
3350ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3066{ 3351{
3067 struct irq_cfg *cfg; 3352 struct irq_desc *desc = irq_to_desc(irq);
3353 struct irq_cfg *cfg = desc->chip_data;
3068 unsigned int dest; 3354 unsigned int dest;
3069 cpumask_t tmp, cleanup_mask;
3070 struct irte irte; 3355 struct irte irte;
3071 struct irq_desc *desc;
3072
3073 cpus_and(tmp, mask, cpu_online_map);
3074 if (cpus_empty(tmp))
3075 return;
3076 3356
3077 if (get_irte(irq, &irte)) 3357 if (get_irte(irq, &irte))
3078 return; 3358 return;
3079 3359
3080 if (assign_irq_vector(irq, mask)) 3360 dest = set_desc_affinity(desc, mask);
3361 if (dest == BAD_APICID)
3081 return; 3362 return;
3082 3363
3083 cfg = irq_cfg(irq);
3084 cpus_and(tmp, cfg->domain, mask);
3085 dest = cpu_mask_to_apicid(tmp);
3086
3087 irte.vector = cfg->vector; 3364 irte.vector = cfg->vector;
3088 irte.dest_id = IRTE_DEST(dest); 3365 irte.dest_id = IRTE_DEST(dest);
3089 3366
@@ -3097,16 +3374,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
3097 * at the new destination. So, time to cleanup the previous 3374 * at the new destination. So, time to cleanup the previous
3098 * vector allocation. 3375 * vector allocation.
3099 */ 3376 */
3100 if (cfg->move_in_progress) { 3377 if (cfg->move_in_progress)
3101 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 3378 send_cleanup_vector(cfg);
3102 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
3103 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
3104 cfg->move_in_progress = 0;
3105 }
3106
3107 desc = irq_to_desc(irq);
3108 desc->affinity = mask;
3109} 3379}
3380
3110#endif 3381#endif
3111#endif /* CONFIG_SMP */ 3382#endif /* CONFIG_SMP */
3112 3383
@@ -3165,7 +3436,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3165} 3436}
3166#endif 3437#endif
3167 3438
3168static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) 3439static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3169{ 3440{
3170 int ret; 3441 int ret;
3171 struct msi_msg msg; 3442 struct msi_msg msg;
@@ -3174,7 +3445,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3174 if (ret < 0) 3445 if (ret < 0)
3175 return ret; 3446 return ret;
3176 3447
3177 set_irq_msi(irq, desc); 3448 set_irq_msi(irq, msidesc);
3178 write_msi_msg(irq, &msg); 3449 write_msi_msg(irq, &msg);
3179 3450
3180#ifdef CONFIG_INTR_REMAP 3451#ifdef CONFIG_INTR_REMAP
@@ -3194,26 +3465,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3194 return 0; 3465 return 0;
3195} 3466}
3196 3467
3197static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) 3468int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
3198{
3199 unsigned int irq;
3200
3201 irq = dev->bus->number;
3202 irq <<= 8;
3203 irq |= dev->devfn;
3204 irq <<= 12;
3205
3206 return irq;
3207}
3208
3209int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3210{ 3469{
3211 unsigned int irq; 3470 unsigned int irq;
3212 int ret; 3471 int ret;
3213 unsigned int irq_want; 3472 unsigned int irq_want;
3214 3473
3215 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3474 irq_want = nr_irqs_gsi;
3216
3217 irq = create_irq_nr(irq_want); 3475 irq = create_irq_nr(irq_want);
3218 if (irq == 0) 3476 if (irq == 0)
3219 return -1; 3477 return -1;
@@ -3227,7 +3485,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3227 goto error; 3485 goto error;
3228no_ir: 3486no_ir:
3229#endif 3487#endif
3230 ret = setup_msi_irq(dev, desc, irq); 3488 ret = setup_msi_irq(dev, msidesc, irq);
3231 if (ret < 0) { 3489 if (ret < 0) {
3232 destroy_irq(irq); 3490 destroy_irq(irq);
3233 return ret; 3491 return ret;
@@ -3245,7 +3503,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3245{ 3503{
3246 unsigned int irq; 3504 unsigned int irq;
3247 int ret, sub_handle; 3505 int ret, sub_handle;
3248 struct msi_desc *desc; 3506 struct msi_desc *msidesc;
3249 unsigned int irq_want; 3507 unsigned int irq_want;
3250 3508
3251#ifdef CONFIG_INTR_REMAP 3509#ifdef CONFIG_INTR_REMAP
@@ -3253,10 +3511,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3253 int index = 0; 3511 int index = 0;
3254#endif 3512#endif
3255 3513
3256 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3514 irq_want = nr_irqs_gsi;
3257 sub_handle = 0; 3515 sub_handle = 0;
3258 list_for_each_entry(desc, &dev->msi_list, list) { 3516 list_for_each_entry(msidesc, &dev->msi_list, list) {
3259 irq = create_irq_nr(irq_want--); 3517 irq = create_irq_nr(irq_want);
3518 irq_want++;
3260 if (irq == 0) 3519 if (irq == 0)
3261 return -1; 3520 return -1;
3262#ifdef CONFIG_INTR_REMAP 3521#ifdef CONFIG_INTR_REMAP
@@ -3288,7 +3547,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3288 } 3547 }
3289no_ir: 3548no_ir:
3290#endif 3549#endif
3291 ret = setup_msi_irq(dev, desc, irq); 3550 ret = setup_msi_irq(dev, msidesc, irq);
3292 if (ret < 0) 3551 if (ret < 0)
3293 goto error; 3552 goto error;
3294 sub_handle++; 3553 sub_handle++;
@@ -3307,24 +3566,18 @@ void arch_teardown_msi_irq(unsigned int irq)
3307 3566
3308#ifdef CONFIG_DMAR 3567#ifdef CONFIG_DMAR
3309#ifdef CONFIG_SMP 3568#ifdef CONFIG_SMP
3310static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) 3569static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3311{ 3570{
3571 struct irq_desc *desc = irq_to_desc(irq);
3312 struct irq_cfg *cfg; 3572 struct irq_cfg *cfg;
3313 struct msi_msg msg; 3573 struct msi_msg msg;
3314 unsigned int dest; 3574 unsigned int dest;
3315 cpumask_t tmp;
3316 struct irq_desc *desc;
3317 3575
3318 cpus_and(tmp, mask, cpu_online_map); 3576 dest = set_desc_affinity(desc, mask);
3319 if (cpus_empty(tmp)) 3577 if (dest == BAD_APICID)
3320 return; 3578 return;
3321 3579
3322 if (assign_irq_vector(irq, mask)) 3580 cfg = desc->chip_data;
3323 return;
3324
3325 cfg = irq_cfg(irq);
3326 cpus_and(tmp, cfg->domain, mask);
3327 dest = cpu_mask_to_apicid(tmp);
3328 3581
3329 dmar_msi_read(irq, &msg); 3582 dmar_msi_read(irq, &msg);
3330 3583
@@ -3334,9 +3587,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
3334 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3587 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3335 3588
3336 dmar_msi_write(irq, &msg); 3589 dmar_msi_write(irq, &msg);
3337 desc = irq_to_desc(irq);
3338 desc->affinity = mask;
3339} 3590}
3591
3340#endif /* CONFIG_SMP */ 3592#endif /* CONFIG_SMP */
3341 3593
3342struct irq_chip dmar_msi_type = { 3594struct irq_chip dmar_msi_type = {
@@ -3368,24 +3620,18 @@ int arch_setup_dmar_msi(unsigned int irq)
3368#ifdef CONFIG_HPET_TIMER 3620#ifdef CONFIG_HPET_TIMER
3369 3621
3370#ifdef CONFIG_SMP 3622#ifdef CONFIG_SMP
3371static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) 3623static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3372{ 3624{
3625 struct irq_desc *desc = irq_to_desc(irq);
3373 struct irq_cfg *cfg; 3626 struct irq_cfg *cfg;
3374 struct irq_desc *desc;
3375 struct msi_msg msg; 3627 struct msi_msg msg;
3376 unsigned int dest; 3628 unsigned int dest;
3377 cpumask_t tmp;
3378 3629
3379 cpus_and(tmp, mask, cpu_online_map); 3630 dest = set_desc_affinity(desc, mask);
3380 if (cpus_empty(tmp)) 3631 if (dest == BAD_APICID)
3381 return; 3632 return;
3382 3633
3383 if (assign_irq_vector(irq, mask)) 3634 cfg = desc->chip_data;
3384 return;
3385
3386 cfg = irq_cfg(irq);
3387 cpus_and(tmp, cfg->domain, mask);
3388 dest = cpu_mask_to_apicid(tmp);
3389 3635
3390 hpet_msi_read(irq, &msg); 3636 hpet_msi_read(irq, &msg);
3391 3637
@@ -3395,9 +3641,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
3395 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3641 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3396 3642
3397 hpet_msi_write(irq, &msg); 3643 hpet_msi_write(irq, &msg);
3398 desc = irq_to_desc(irq);
3399 desc->affinity = mask;
3400} 3644}
3645
3401#endif /* CONFIG_SMP */ 3646#endif /* CONFIG_SMP */
3402 3647
3403struct irq_chip hpet_msi_type = { 3648struct irq_chip hpet_msi_type = {
@@ -3450,28 +3695,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3450 write_ht_irq_msg(irq, &msg); 3695 write_ht_irq_msg(irq, &msg);
3451} 3696}
3452 3697
3453static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 3698static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3454{ 3699{
3700 struct irq_desc *desc = irq_to_desc(irq);
3455 struct irq_cfg *cfg; 3701 struct irq_cfg *cfg;
3456 unsigned int dest; 3702 unsigned int dest;
3457 cpumask_t tmp;
3458 struct irq_desc *desc;
3459 3703
3460 cpus_and(tmp, mask, cpu_online_map); 3704 dest = set_desc_affinity(desc, mask);
3461 if (cpus_empty(tmp)) 3705 if (dest == BAD_APICID)
3462 return; 3706 return;
3463 3707
3464 if (assign_irq_vector(irq, mask)) 3708 cfg = desc->chip_data;
3465 return;
3466
3467 cfg = irq_cfg(irq);
3468 cpus_and(tmp, cfg->domain, mask);
3469 dest = cpu_mask_to_apicid(tmp);
3470 3709
3471 target_ht_irq(irq, dest, cfg->vector); 3710 target_ht_irq(irq, dest, cfg->vector);
3472 desc = irq_to_desc(irq);
3473 desc->affinity = mask;
3474} 3711}
3712
3475#endif 3713#endif
3476 3714
3477static struct irq_chip ht_irq_chip = { 3715static struct irq_chip ht_irq_chip = {
@@ -3489,17 +3727,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3489{ 3727{
3490 struct irq_cfg *cfg; 3728 struct irq_cfg *cfg;
3491 int err; 3729 int err;
3492 cpumask_t tmp;
3493 3730
3494 tmp = TARGET_CPUS; 3731 cfg = irq_cfg(irq);
3495 err = assign_irq_vector(irq, tmp); 3732 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3496 if (!err) { 3733 if (!err) {
3497 struct ht_irq_msg msg; 3734 struct ht_irq_msg msg;
3498 unsigned dest; 3735 unsigned dest;
3499 3736
3500 cfg = irq_cfg(irq); 3737 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
3501 cpus_and(tmp, cfg->domain, tmp);
3502 dest = cpu_mask_to_apicid(tmp);
3503 3738
3504 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3739 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3505 3740
@@ -3535,7 +3770,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3535int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, 3770int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3536 unsigned long mmr_offset) 3771 unsigned long mmr_offset)
3537{ 3772{
3538 const cpumask_t *eligible_cpu = get_cpu_mask(cpu); 3773 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3539 struct irq_cfg *cfg; 3774 struct irq_cfg *cfg;
3540 int mmr_pnode; 3775 int mmr_pnode;
3541 unsigned long mmr_value; 3776 unsigned long mmr_value;
@@ -3543,7 +3778,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3543 unsigned long flags; 3778 unsigned long flags;
3544 int err; 3779 int err;
3545 3780
3546 err = assign_irq_vector(irq, *eligible_cpu); 3781 cfg = irq_cfg(irq);
3782
3783 err = assign_irq_vector(irq, cfg, eligible_cpu);
3547 if (err != 0) 3784 if (err != 0)
3548 return err; 3785 return err;
3549 3786
@@ -3552,8 +3789,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3552 irq_name); 3789 irq_name);
3553 spin_unlock_irqrestore(&vector_lock, flags); 3790 spin_unlock_irqrestore(&vector_lock, flags);
3554 3791
3555 cfg = irq_cfg(irq);
3556
3557 mmr_value = 0; 3792 mmr_value = 0;
3558 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3793 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3559 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3794 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3564,7 +3799,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3564 entry->polarity = 0; 3799 entry->polarity = 0;
3565 entry->trigger = 0; 3800 entry->trigger = 0;
3566 entry->mask = 0; 3801 entry->mask = 0;
3567 entry->dest = cpu_mask_to_apicid(*eligible_cpu); 3802 entry->dest = cpu_mask_to_apicid(eligible_cpu);
3568 3803
3569 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3804 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3570 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3805 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3605,9 +3840,16 @@ int __init io_apic_get_redir_entries (int ioapic)
3605 return reg_01.bits.entries; 3840 return reg_01.bits.entries;
3606} 3841}
3607 3842
3608int __init probe_nr_irqs(void) 3843void __init probe_nr_irqs_gsi(void)
3609{ 3844{
3610 return NR_IRQS; 3845 int idx;
3846 int nr = 0;
3847
3848 for (idx = 0; idx < nr_ioapics; idx++)
3849 nr += io_apic_get_redir_entries(idx) + 1;
3850
3851 if (nr > nr_irqs_gsi)
3852 nr_irqs_gsi = nr;
3611} 3853}
3612 3854
3613/* -------------------------------------------------------------------------- 3855/* --------------------------------------------------------------------------
@@ -3706,19 +3948,31 @@ int __init io_apic_get_version(int ioapic)
3706 3948
3707int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3949int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3708{ 3950{
3951 struct irq_desc *desc;
3952 struct irq_cfg *cfg;
3953 int cpu = boot_cpu_id;
3954
3709 if (!IO_APIC_IRQ(irq)) { 3955 if (!IO_APIC_IRQ(irq)) {
3710 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3956 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3711 ioapic); 3957 ioapic);
3712 return -EINVAL; 3958 return -EINVAL;
3713 } 3959 }
3714 3960
3961 desc = irq_to_desc_alloc_cpu(irq, cpu);
3962 if (!desc) {
3963 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3964 return 0;
3965 }
3966
3715 /* 3967 /*
3716 * IRQs < 16 are already in the irq_2_pin[] map 3968 * IRQs < 16 are already in the irq_2_pin[] map
3717 */ 3969 */
3718 if (irq >= 16) 3970 if (irq >= NR_IRQS_LEGACY) {
3719 add_pin_to_irq(irq, ioapic, pin); 3971 cfg = desc->chip_data;
3972 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
3973 }
3720 3974
3721 setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); 3975 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
3722 3976
3723 return 0; 3977 return 0;
3724} 3978}
@@ -3756,7 +4010,7 @@ void __init setup_ioapic_dest(void)
3756 int pin, ioapic, irq, irq_entry; 4010 int pin, ioapic, irq, irq_entry;
3757 struct irq_desc *desc; 4011 struct irq_desc *desc;
3758 struct irq_cfg *cfg; 4012 struct irq_cfg *cfg;
3759 cpumask_t mask; 4013 const struct cpumask *mask;
3760 4014
3761 if (skip_ioapic_setup == 1) 4015 if (skip_ioapic_setup == 1)
3762 return; 4016 return;
@@ -3772,9 +4026,10 @@ void __init setup_ioapic_dest(void)
3772 * when you have too many devices, because at that time only boot 4026 * when you have too many devices, because at that time only boot
3773 * cpu is online. 4027 * cpu is online.
3774 */ 4028 */
3775 cfg = irq_cfg(irq); 4029 desc = irq_to_desc(irq);
4030 cfg = desc->chip_data;
3776 if (!cfg->vector) { 4031 if (!cfg->vector) {
3777 setup_IO_APIC_irq(ioapic, pin, irq, 4032 setup_IO_APIC_irq(ioapic, pin, irq, desc,
3778 irq_trigger(irq_entry), 4033 irq_trigger(irq_entry),
3779 irq_polarity(irq_entry)); 4034 irq_polarity(irq_entry));
3780 continue; 4035 continue;
@@ -3784,19 +4039,18 @@ void __init setup_ioapic_dest(void)
3784 /* 4039 /*
3785 * Honour affinities which have been set in early boot 4040 * Honour affinities which have been set in early boot
3786 */ 4041 */
3787 desc = irq_to_desc(irq);
3788 if (desc->status & 4042 if (desc->status &
3789 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4043 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
3790 mask = desc->affinity; 4044 mask = &desc->affinity;
3791 else 4045 else
3792 mask = TARGET_CPUS; 4046 mask = TARGET_CPUS;
3793 4047
3794#ifdef CONFIG_INTR_REMAP 4048#ifdef CONFIG_INTR_REMAP
3795 if (intr_remapping_enabled) 4049 if (intr_remapping_enabled)
3796 set_ir_ioapic_affinity_irq(irq, mask); 4050 set_ir_ioapic_affinity_irq_desc(desc, mask);
3797 else 4051 else
3798#endif 4052#endif
3799 set_ioapic_affinity_irq(irq, mask); 4053 set_ioapic_affinity_irq_desc(desc, mask);
3800 } 4054 }
3801 4055
3802 } 4056 }
@@ -3845,7 +4099,6 @@ void __init ioapic_init_mappings(void)
3845 struct resource *ioapic_res; 4099 struct resource *ioapic_res;
3846 int i; 4100 int i;
3847 4101
3848 irq_2_pin_init();
3849 ioapic_res = ioapic_setup_resources(); 4102 ioapic_res = ioapic_setup_resources();
3850 for (i = 0; i < nr_ioapics; i++) { 4103 for (i = 0; i < nr_ioapics; i++) {
3851 if (smp_found_config) { 4104 if (smp_found_config) {
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index f1c688e46f35..285bbf8831fa 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
116/* 116/*
117 * This is only used on smaller machines. 117 * This is only used on smaller machines.
118 */ 118 */
119void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) 119void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
120{ 120{
121 unsigned long mask = cpus_addr(cpumask)[0]; 121 unsigned long mask = cpumask_bits(cpumask)[0];
122 unsigned long flags; 122 unsigned long flags;
123 123
124 local_irq_save(flags); 124 local_irq_save(flags);
125 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); 125 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
126 __send_IPI_dest_field(mask, vector); 126 __send_IPI_dest_field(mask, vector);
127 local_irq_restore(flags); 127 local_irq_restore(flags);
128} 128}
129 129
130void send_IPI_mask_sequence(cpumask_t mask, int vector) 130void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
131{ 131{
132 unsigned long flags; 132 unsigned long flags;
133 unsigned int query_cpu; 133 unsigned int query_cpu;
@@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
139 */ 139 */
140 140
141 local_irq_save(flags); 141 local_irq_save(flags);
142 for_each_possible_cpu(query_cpu) { 142 for_each_cpu(query_cpu, mask)
143 if (cpu_isset(query_cpu, mask)) { 143 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
144 local_irq_restore(flags);
145}
146
147void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
148{
149 unsigned long flags;
150 unsigned int query_cpu;
151 unsigned int this_cpu = smp_processor_id();
152
153 /* See Hack comment above */
154
155 local_irq_save(flags);
156 for_each_cpu(query_cpu, mask)
157 if (query_cpu != this_cpu)
144 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), 158 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
145 vector); 159 vector);
146 }
147 }
148 local_irq_restore(flags); 160 local_irq_restore(flags);
149} 161}
150 162
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..bce53e1352a0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -9,6 +9,7 @@
9#include <asm/apic.h> 9#include <asm/apic.h>
10#include <asm/io_apic.h> 10#include <asm/io_apic.h>
11#include <asm/smp.h> 11#include <asm/smp.h>
12#include <asm/irq.h>
12 13
13atomic_t irq_err_count; 14atomic_t irq_err_count;
14 15
@@ -118,6 +119,9 @@ int show_interrupts(struct seq_file *p, void *v)
118 } 119 }
119 120
120 desc = irq_to_desc(i); 121 desc = irq_to_desc(i);
122 if (!desc)
123 return 0;
124
121 spin_lock_irqsave(&desc->lock, flags); 125 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP 126#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i); 127 any_count = kstat_irqs(i);
@@ -187,3 +191,5 @@ u64 arch_irq_stat(void)
187#endif 191#endif
188 return sum; 192 return sum;
189} 193}
194
195EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de0..9dc5588f336a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs)
233#ifdef CONFIG_HOTPLUG_CPU 233#ifdef CONFIG_HOTPLUG_CPU
234#include <mach_apic.h> 234#include <mach_apic.h>
235 235
236void fixup_irqs(cpumask_t map) 236/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
237void fixup_irqs(void)
237{ 238{
238 unsigned int irq; 239 unsigned int irq;
239 static int warned; 240 static int warned;
240 struct irq_desc *desc; 241 struct irq_desc *desc;
241 242
242 for_each_irq_desc(irq, desc) { 243 for_each_irq_desc(irq, desc) {
243 cpumask_t mask; 244 const struct cpumask *affinity;
244 245
246 if (!desc)
247 continue;
245 if (irq == 2) 248 if (irq == 2)
246 continue; 249 continue;
247 250
248 cpus_and(mask, desc->affinity, map); 251 affinity = &desc->affinity;
249 if (any_online_cpu(mask) == NR_CPUS) { 252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
250 printk("Breaking affinity for irq %i\n", irq); 253 printk("Breaking affinity for irq %i\n", irq);
251 mask = map; 254 affinity = cpu_all_mask;
252 } 255 }
253 if (desc->chip->set_affinity) 256 if (desc->chip->set_affinity)
254 desc->chip->set_affinity(irq, mask); 257 desc->chip->set_affinity(irq, affinity);
255 else if (desc->action && !(warned++)) 258 else if (desc->action && !(warned++))
256 printk("Cannot set affinity for irq %i\n", irq); 259 printk("Cannot set affinity for irq %i\n", irq);
257 } 260 }
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1df869e5bd0b..6383d50f82ea 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -80,40 +80,43 @@ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
80} 80}
81 81
82#ifdef CONFIG_HOTPLUG_CPU 82#ifdef CONFIG_HOTPLUG_CPU
83void fixup_irqs(cpumask_t map) 83/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
84void fixup_irqs(void)
84{ 85{
85 unsigned int irq; 86 unsigned int irq;
86 static int warned; 87 static int warned;
87 struct irq_desc *desc; 88 struct irq_desc *desc;
88 89
89 for_each_irq_desc(irq, desc) { 90 for_each_irq_desc(irq, desc) {
90 cpumask_t mask;
91 int break_affinity = 0; 91 int break_affinity = 0;
92 int set_affinity = 1; 92 int set_affinity = 1;
93 const struct cpumask *affinity;
93 94
95 if (!desc)
96 continue;
94 if (irq == 2) 97 if (irq == 2)
95 continue; 98 continue;
96 99
97 /* interrupt's are disabled at this point */ 100 /* interrupt's are disabled at this point */
98 spin_lock(&desc->lock); 101 spin_lock(&desc->lock);
99 102
103 affinity = &desc->affinity;
100 if (!irq_has_action(irq) || 104 if (!irq_has_action(irq) ||
101 cpus_equal(desc->affinity, map)) { 105 cpumask_equal(affinity, cpu_online_mask)) {
102 spin_unlock(&desc->lock); 106 spin_unlock(&desc->lock);
103 continue; 107 continue;
104 } 108 }
105 109
106 cpus_and(mask, desc->affinity, map); 110 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
107 if (cpus_empty(mask)) {
108 break_affinity = 1; 111 break_affinity = 1;
109 mask = map; 112 affinity = cpu_all_mask;
110 } 113 }
111 114
112 if (desc->chip->mask) 115 if (desc->chip->mask)
113 desc->chip->mask(irq); 116 desc->chip->mask(irq);
114 117
115 if (desc->chip->set_affinity) 118 if (desc->chip->set_affinity)
116 desc->chip->set_affinity(irq, mask); 119 desc->chip->set_affinity(irq, affinity);
117 else if (!(warned++)) 120 else if (!(warned++))
118 set_affinity = 0; 121 set_affinity = 0;
119 122
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 607db63044a5..84723295f88a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
68 /* 68 /*
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < NR_IRQS_LEGACY; i++) {
72 /* first time call this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i); 72 struct irq_desc *desc = irq_to_desc(i);
74 73
75 desc->status = IRQ_DISABLED; 74 desc->status = IRQ_DISABLED;
@@ -111,6 +110,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
111 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 110 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
112}; 111};
113 112
113int vector_used_by_percpu_irq(unsigned int vector)
114{
115 int cpu;
116
117 for_each_online_cpu(cpu) {
118 if (per_cpu(vector_irq, cpu)[vector] != -1)
119 return 1;
120 }
121
122 return 0;
123}
124
114/* Overridden in paravirt.c */ 125/* Overridden in paravirt.c */
115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 126void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
116 127
@@ -147,10 +158,12 @@ void __init native_init_IRQ(void)
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 158 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
148 159
149 /* IPI for single call function */ 160 /* IPI for single call function */
150 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); 161 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
162 call_function_single_interrupt);
151 163
152 /* Low priority IPI to cleanup after moving an irq */ 164 /* Low priority IPI to cleanup after moving an irq */
153 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 165 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
166 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
154#endif 167#endif
155 168
156#ifdef CONFIG_X86_LOCAL_APIC 169#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8670b3ce626e..31ebfe38e96c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -69,6 +69,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
69 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 69 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
70}; 70};
71 71
72int vector_used_by_percpu_irq(unsigned int vector)
73{
74 int cpu;
75
76 for_each_online_cpu(cpu) {
77 if (per_cpu(vector_irq, cpu)[vector] != -1)
78 return 1;
79 }
80
81 return 0;
82}
83
72void __init init_ISA_irqs(void) 84void __init init_ISA_irqs(void)
73{ 85{
74 int i; 86 int i;
@@ -76,8 +88,7 @@ void __init init_ISA_irqs(void)
76 init_bsp_APIC(); 88 init_bsp_APIC();
77 init_8259A(0); 89 init_8259A(0);
78 90
79 for (i = 0; i < 16; i++) { 91 for (i = 0; i < NR_IRQS_LEGACY; i++) {
80 /* first time call this irq_desc */
81 struct irq_desc *desc = irq_to_desc(i); 92 struct irq_desc *desc = irq_to_desc(i);
82 93
83 desc->status = IRQ_DISABLED; 94 desc->status = IRQ_DISABLED;
@@ -122,6 +133,7 @@ static void __init smp_intr_init(void)
122 133
123 /* Low priority IPI to cleanup after moving an irq */ 134 /* Low priority IPI to cleanup after moving an irq */
124 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 135 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
136 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
125#endif 137#endif
126} 138}
127 139
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e169ae9b6a62..652fce6d2cce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
89 */ 89 */
90static unsigned long kvm_get_tsc_khz(void) 90static unsigned long kvm_get_tsc_khz(void)
91{ 91{
92 return preset_lpj; 92 struct pvclock_vcpu_time_info *src;
93 src = &per_cpu(hv_clock, 0);
94 return pvclock_tsc_khz(src);
93} 95}
94 96
95static void kvm_get_preset_lpj(void) 97static void kvm_get_preset_lpj(void)
96{ 98{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz; 99 unsigned long khz;
99 u64 lpj; 100 u64 lpj;
100 101
101 src = &per_cpu(hv_clock, 0); 102 khz = kvm_get_tsc_khz();
102 khz = pvclock_tsc_khz(src);
103 103
104 lpj = ((u64)khz * 1000); 104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ); 105 do_div(lpj, HZ);
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
194#endif 194#endif
195 kvm_get_preset_lpj(); 195 kvm_get_preset_lpj();
196 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
197 pv_info.paravirt_enabled = 1;
198 pv_info.name = "KVM";
197 } 199 }
198} 200}
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3b599518c322..c12314c9e86f 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
287 .set_mode = mfgpt_set_mode, 287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event, 288 .set_next_event = mfgpt_next_event,
289 .rating = 250, 289 .rating = 250,
290 .cpumask = CPU_MASK_ALL, 290 .cpumask = cpu_all_mask,
291 .shift = 32 291 .shift = 32
292}; 292};
293 293
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 7a3dfceb90e4..19a1044a0cd9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -101,11 +101,15 @@ static void __init dma32_free_bootmem(void)
101 dma32_bootmem_ptr = NULL; 101 dma32_bootmem_ptr = NULL;
102 dma32_bootmem_size = 0; 102 dma32_bootmem_size = 0;
103} 103}
104#endif
104 105
105void __init pci_iommu_alloc(void) 106void __init pci_iommu_alloc(void)
106{ 107{
108#ifdef CONFIG_X86_64
107 /* free the range so iommu could get some range less than 4G */ 109 /* free the range so iommu could get some range less than 4G */
108 dma32_free_bootmem(); 110 dma32_free_bootmem();
111#endif
112
109 /* 113 /*
110 * The order of these functions is important for 114 * The order of these functions is important for
111 * fall-back/fail-over reasons 115 * fall-back/fail-over reasons
@@ -121,15 +125,6 @@ void __init pci_iommu_alloc(void)
121 pci_swiotlb_init(); 125 pci_swiotlb_init();
122} 126}
123 127
124unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
125{
126 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
127
128 return size >> PAGE_SHIFT;
129}
130EXPORT_SYMBOL(iommu_nr_pages);
131#endif
132
133void *dma_generic_alloc_coherent(struct device *dev, size_t size, 128void *dma_generic_alloc_coherent(struct device *dev, size_t size,
134 dma_addr_t *dma_addr, gfp_t flag) 129 dma_addr_t *dma_addr, gfp_t flag)
135{ 130{
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111abb..242c3440687f 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
3#include <linux/pci.h> 3#include <linux/pci.h>
4#include <linux/cache.h> 4#include <linux/cache.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/swiotlb.h>
7#include <linux/bootmem.h>
6#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
7 9
8#include <asm/iommu.h> 10#include <asm/iommu.h>
@@ -11,6 +13,31 @@
11 13
12int swiotlb __read_mostly; 14int swiotlb __read_mostly;
13 15
16void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
37{
38 return 0;
39}
40
14static dma_addr_t 41static dma_addr_t
15swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, 42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
16 int direction) 43 int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
50void __init pci_swiotlb_init(void) 77void __init pci_swiotlb_init(void)
51{ 78{
52 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 79 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
80#ifdef CONFIG_X86_64
53 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 81 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
54 swiotlb = 1; 82 swiotlb = 1;
83#endif
55 if (swiotlb_force) 84 if (swiotlb_force)
56 swiotlb = 1; 85 swiotlb = 1;
57 if (swiotlb) { 86 if (swiotlb) {
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 67465ed89310..309949e9e1c1 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
168 ich_force_enable_hpet); 168 ich_force_enable_hpet);
169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
170 ich_force_enable_hpet); 170 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
172 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, 173DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
172 ich_force_enable_hpet); 174 ich_force_enable_hpet);
173 175
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a90913cccfb7..bf088c61fa40 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -13,6 +13,7 @@
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/pci_x86.h> 15#include <asm/pci_x86.h>
16#include <asm/virtext.h>
16 17
17#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
18# include <linux/dmi.h> 19# include <linux/dmi.h>
@@ -39,6 +40,12 @@ int reboot_force;
39static int reboot_cpu = -1; 40static int reboot_cpu = -1;
40#endif 41#endif
41 42
43/* This is set if we need to go through the 'emergency' path.
44 * When machine_emergency_restart() is called, we may be on
45 * an inconsistent state and won't be able to do a clean cleanup
46 */
47static int reboot_emergency;
48
42/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ 49/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
43bool port_cf9_safe = false; 50bool port_cf9_safe = false;
44 51
@@ -368,6 +375,48 @@ static inline void kb_wait(void)
368 } 375 }
369} 376}
370 377
378static void vmxoff_nmi(int cpu, struct die_args *args)
379{
380 cpu_emergency_vmxoff();
381}
382
383/* Use NMIs as IPIs to tell all CPUs to disable virtualization
384 */
385static void emergency_vmx_disable_all(void)
386{
387 /* Just make sure we won't change CPUs while doing this */
388 local_irq_disable();
389
390 /* We need to disable VMX on all CPUs before rebooting, otherwise
391 * we risk hanging up the machine, because the CPU ignore INIT
392 * signals when VMX is enabled.
393 *
394 * We can't take any locks and we may be on an inconsistent
395 * state, so we use NMIs as IPIs to tell the other CPUs to disable
396 * VMX and halt.
397 *
398 * For safety, we will avoid running the nmi_shootdown_cpus()
399 * stuff unnecessarily, but we don't have a way to check
400 * if other CPUs have VMX enabled. So we will call it only if the
401 * CPU we are running on has VMX enabled.
402 *
403 * We will miss cases where VMX is not enabled on all CPUs. This
404 * shouldn't do much harm because KVM always enable VMX on all
405 * CPUs anyway. But we can miss it on the small window where KVM
406 * is still enabling VMX.
407 */
408 if (cpu_has_vmx() && cpu_vmx_enabled()) {
409 /* Disable VMX on this CPU.
410 */
411 cpu_vmxoff();
412
413 /* Halt and disable VMX on the other CPUs */
414 nmi_shootdown_cpus(vmxoff_nmi);
415
416 }
417}
418
419
371void __attribute__((weak)) mach_reboot_fixups(void) 420void __attribute__((weak)) mach_reboot_fixups(void)
372{ 421{
373} 422}
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void)
376{ 425{
377 int i; 426 int i;
378 427
428 if (reboot_emergency)
429 emergency_vmx_disable_all();
430
379 /* Tell the BIOS if we want cold or warm reboot */ 431 /* Tell the BIOS if we want cold or warm reboot */
380 *((unsigned short *)__va(0x472)) = reboot_mode; 432 *((unsigned short *)__va(0x472)) = reboot_mode;
381 433
@@ -482,13 +534,19 @@ void native_machine_shutdown(void)
482#endif 534#endif
483} 535}
484 536
537static void __machine_emergency_restart(int emergency)
538{
539 reboot_emergency = emergency;
540 machine_ops.emergency_restart();
541}
542
485static void native_machine_restart(char *__unused) 543static void native_machine_restart(char *__unused)
486{ 544{
487 printk("machine restart\n"); 545 printk("machine restart\n");
488 546
489 if (!reboot_force) 547 if (!reboot_force)
490 machine_shutdown(); 548 machine_shutdown();
491 machine_emergency_restart(); 549 __machine_emergency_restart(0);
492} 550}
493 551
494static void native_machine_halt(void) 552static void native_machine_halt(void)
@@ -532,7 +590,7 @@ void machine_shutdown(void)
532 590
533void machine_emergency_restart(void) 591void machine_emergency_restart(void)
534{ 592{
535 machine_ops.emergency_restart(); 593 __machine_emergency_restart(1);
536} 594}
537 595
538void machine_restart(char *cmd) 596void machine_restart(char *cmd)
@@ -592,10 +650,7 @@ static int crash_nmi_callback(struct notifier_block *self,
592 650
593static void smp_send_nmi_allbutself(void) 651static void smp_send_nmi_allbutself(void)
594{ 652{
595 cpumask_t mask = cpu_online_map; 653 send_IPI_allbutself(NMI_VECTOR);
596 cpu_clear(safe_smp_processor_id(), mask);
597 if (!cpus_empty(mask))
598 send_IPI_mask(mask, NMI_VECTOR);
599} 654}
600 655
601static struct notifier_block crash_nmi_nb = { 656static struct notifier_block crash_nmi_nb = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 08e02e8453c9..ae0d8042cf69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -953,7 +953,7 @@ void __init setup_arch(char **cmdline_p)
953 ioapic_init_mappings(); 953 ioapic_init_mappings();
954 954
955 /* need to wait for io_apic is mapped */ 955 /* need to wait for io_apic is mapped */
956 nr_irqs = probe_nr_irqs(); 956 probe_nr_irqs_gsi();
957 957
958 kvm_guest_init(); 958 kvm_guest_init();
959 959
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3bb770..0b63b08e7530 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -152,6 +152,11 @@ void __init setup_per_cpu_areas(void)
152 old_size = PERCPU_ENOUGH_ROOM; 152 old_size = PERCPU_ENOUGH_ROOM;
153 align = max_t(unsigned long, PAGE_SIZE, align); 153 align = max_t(unsigned long, PAGE_SIZE, align);
154 size = roundup(old_size, align); 154 size = roundup(old_size, align);
155
156 printk(KERN_INFO
157 "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
158 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
159
155 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", 160 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
156 size); 161 size);
157 162
@@ -168,24 +173,24 @@ void __init setup_per_cpu_areas(void)
168 "cpu %d has no node %d or node-local memory\n", 173 "cpu %d has no node %d or node-local memory\n",
169 cpu, node); 174 cpu, node);
170 if (ptr) 175 if (ptr)
171 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", 176 printk(KERN_DEBUG
177 "per cpu data for cpu%d at %016lx\n",
172 cpu, __pa(ptr)); 178 cpu, __pa(ptr));
173 } 179 }
174 else { 180 else {
175 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 181 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
176 __pa(MAX_DMA_ADDRESS)); 182 __pa(MAX_DMA_ADDRESS));
177 if (ptr) 183 if (ptr)
178 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", 184 printk(KERN_DEBUG
179 cpu, node, __pa(ptr)); 185 "per cpu data for cpu%d on node%d "
186 "at %016lx\n",
187 cpu, node, __pa(ptr));
180 } 188 }
181#endif 189#endif
182 per_cpu_offset(cpu) = ptr - __per_cpu_start; 190 per_cpu_offset(cpu) = ptr - __per_cpu_start;
183 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 191 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
184 } 192 }
185 193
186 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
187 NR_CPUS, nr_cpu_ids, nr_node_ids);
188
189 /* Setup percpu data maps */ 194 /* Setup percpu data maps */
190 setup_per_cpu_maps(); 195 setup_per_cpu_maps();
191 196
@@ -282,7 +287,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
282 else 287 else
283 cpu_clear(cpu, *mask); 288 cpu_clear(cpu, *mask);
284 289
285 cpulist_scnprintf(buf, sizeof(buf), *mask); 290 cpulist_scnprintf(buf, sizeof(buf), mask);
286 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 291 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
287 enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); 292 enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
288 } 293 }
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 7e558db362c1..beea2649a240 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -118,22 +118,22 @@ static void native_smp_send_reschedule(int cpu)
118 WARN_ON(1); 118 WARN_ON(1);
119 return; 119 return;
120 } 120 }
121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 121 send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
122} 122}
123 123
124void native_send_call_func_single_ipi(int cpu) 124void native_send_call_func_single_ipi(int cpu)
125{ 125{
126 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); 126 send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
127} 127}
128 128
129void native_send_call_func_ipi(cpumask_t mask) 129void native_send_call_func_ipi(const struct cpumask *mask)
130{ 130{
131 cpumask_t allbutself; 131 cpumask_t allbutself;
132 132
133 allbutself = cpu_online_map; 133 allbutself = cpu_online_map;
134 cpu_clear(smp_processor_id(), allbutself); 134 cpu_clear(smp_processor_id(), allbutself);
135 135
136 if (cpus_equal(mask, allbutself) && 136 if (cpus_equal(*mask, allbutself) &&
137 cpus_equal(cpu_online_map, cpu_callout_map)) 137 cpus_equal(cpu_online_map, cpu_callout_map))
138 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 138 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
139 else 139 else
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f8500c969442..31869bf5fabd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -102,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings);
102/* Last level cache ID of each logical CPU */ 102/* Last level cache ID of each logical CPU */
103DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 103DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
104 104
105/* bitmap of online cpus */
106cpumask_t cpu_online_map __read_mostly;
107EXPORT_SYMBOL(cpu_online_map);
108
109cpumask_t cpu_callin_map; 105cpumask_t cpu_callin_map;
110cpumask_t cpu_callout_map; 106cpumask_t cpu_callout_map;
111cpumask_t cpu_possible_map;
112EXPORT_SYMBOL(cpu_possible_map);
113 107
114/* representing HT siblings of each logical CPU */ 108/* representing HT siblings of each logical CPU */
115DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); 109DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@ -1260,6 +1254,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1260 check_nmi_watchdog(); 1254 check_nmi_watchdog();
1261} 1255}
1262 1256
1257static int __initdata setup_possible_cpus = -1;
1258static int __init _setup_possible_cpus(char *str)
1259{
1260 get_option(&str, &setup_possible_cpus);
1261 return 0;
1262}
1263early_param("possible_cpus", _setup_possible_cpus);
1264
1265
1263/* 1266/*
1264 * cpu_possible_map should be static, it cannot change as cpu's 1267 * cpu_possible_map should be static, it cannot change as cpu's
1265 * are onlined, or offlined. The reason is per-cpu data-structures 1268 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1272,7 +1275,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1272 * 1275 *
1273 * Three ways to find out the number of additional hotplug CPUs: 1276 * Three ways to find out the number of additional hotplug CPUs:
1274 * - If the BIOS specified disabled CPUs in ACPI/mptables use that. 1277 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
1275 * - The user can overwrite it with additional_cpus=NUM 1278 * - The user can overwrite it with possible_cpus=NUM
1276 * - Otherwise don't reserve additional CPUs. 1279 * - Otherwise don't reserve additional CPUs.
1277 * We do this because additional CPUs waste a lot of memory. 1280 * We do this because additional CPUs waste a lot of memory.
1278 * -AK 1281 * -AK
@@ -1285,9 +1288,17 @@ __init void prefill_possible_map(void)
1285 if (!num_processors) 1288 if (!num_processors)
1286 num_processors = 1; 1289 num_processors = 1;
1287 1290
1288 possible = num_processors + disabled_cpus; 1291 if (setup_possible_cpus == -1)
1289 if (possible > NR_CPUS) 1292 possible = num_processors + disabled_cpus;
1290 possible = NR_CPUS; 1293 else
1294 possible = setup_possible_cpus;
1295
1296 if (possible > CONFIG_NR_CPUS) {
1297 printk(KERN_WARNING
1298 "%d Processors exceeds NR_CPUS limit of %d\n",
1299 possible, CONFIG_NR_CPUS);
1300 possible = CONFIG_NR_CPUS;
1301 }
1291 1302
1292 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1303 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
1293 possible, max_t(int, possible - num_processors, 0)); 1304 possible, max_t(int, possible - num_processors, 0));
@@ -1352,7 +1363,7 @@ void cpu_disable_common(void)
1352 lock_vector_lock(); 1363 lock_vector_lock();
1353 remove_cpu_from_maps(cpu); 1364 remove_cpu_from_maps(cpu);
1354 unlock_vector_lock(); 1365 unlock_vector_lock();
1355 fixup_irqs(cpu_online_map); 1366 fixup_irqs();
1356} 1367}
1357 1368
1358int native_cpu_disable(void) 1369int native_cpu_disable(void)
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 8da059f949be..ce5054642247 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -163,7 +163,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
163 * We have to send the IPI only to 163 * We have to send the IPI only to
164 * CPUs affected. 164 * CPUs affected.
165 */ 165 */
166 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); 166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
167 167
168 while (!cpus_empty(flush_cpumask)) 168 while (!cpus_empty(flush_cpumask))
169 /* nothing. lockup detection does not belong here */ 169 /* nothing. lockup detection does not belong here */
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 29887d7081a9..f8be6f1d2e48 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
191 * We have to send the IPI only to 191 * We have to send the IPI only to
192 * CPUs affected. 192 * CPUs affected.
193 */ 193 */
194 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); 194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
195 195
196 while (!cpus_empty(f->flush_cpumask)) 196 while (!cpus_empty(f->flush_cpumask))
197 cpu_relax(); 197 cpu_relax();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 961e26a69d55..ce6650eb64e9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -72,9 +72,6 @@
72 72
73#include "cpu/mcheck/mce.h" 73#include "cpu/mcheck/mce.h"
74 74
75DECLARE_BITMAP(used_vectors, NR_VECTORS);
76EXPORT_SYMBOL_GPL(used_vectors);
77
78asmlinkage int system_call(void); 75asmlinkage int system_call(void);
79 76
80/* Do we ignore FPU interrupts ? */ 77/* Do we ignore FPU interrupts ? */
@@ -89,6 +86,9 @@ gate_desc idt_table[256]
89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 86 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
90#endif 87#endif
91 88
89DECLARE_BITMAP(used_vectors, NR_VECTORS);
90EXPORT_SYMBOL_GPL(used_vectors);
91
92static int ignore_nmis; 92static int ignore_nmis;
93 93
94static inline void conditional_sti(struct pt_regs *regs) 94static inline void conditional_sti(struct pt_regs *regs)
@@ -946,9 +946,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
946 946
947void __init trap_init(void) 947void __init trap_init(void)
948{ 948{
949#ifdef CONFIG_X86_32
950 int i; 949 int i;
951#endif
952 950
953#ifdef CONFIG_EISA 951#ifdef CONFIG_EISA
954 void __iomem *p = early_ioremap(0x0FFFD9, 4); 952 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1005,11 +1003,15 @@ void __init trap_init(void)
1005 } 1003 }
1006 1004
1007 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 1005 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1006#endif
1008 1007
1009 /* Reserve all the builtin and the syscall vector: */ 1008 /* Reserve all the builtin and the syscall vector: */
1010 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1009 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1011 set_bit(i, used_vectors); 1010 set_bit(i, used_vectors);
1012 1011
1012#ifdef CONFIG_X86_64
1013 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
1014#else
1013 set_bit(SYSCALL_VECTOR, used_vectors); 1015 set_bit(SYSCALL_VECTOR, used_vectors);
1014#endif 1016#endif
1015 /* 1017 /*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 254ee07f8635..c4c1f9e09402 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 /* Upper bound is clockevent's use of ulong for cycle deltas. */ 226 /* Upper bound is clockevent's use of ulong for cycle deltas. */
227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); 227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
228 evt->min_delta_ns = clockevent_delta2ns(1, evt); 228 evt->min_delta_ns = clockevent_delta2ns(1, evt);
229 evt->cpumask = cpumask_of_cpu(cpu); 229 evt->cpumask = cpumask_of(cpu);
230 230
231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
232 evt->name, evt->mult, evt->shift); 232 evt->name, evt->mult, evt->shift);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59ebd37ad79e..e665d1c623ca 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
603 603
604static void __inject_pit_timer_intr(struct kvm *kvm) 604static void __inject_pit_timer_intr(struct kvm *kvm)
605{ 605{
606 struct kvm_vcpu *vcpu;
607 int i;
608
606 mutex_lock(&kvm->lock); 609 mutex_lock(&kvm->lock);
607 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 610 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
608 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 611 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
609 mutex_unlock(&kvm->lock); 612 mutex_unlock(&kvm->lock);
613
614 /*
615 * Provides NMI watchdog support via Virtual Wire mode.
616 * The route is: PIT -> PIC -> LVT0 in NMI mode.
617 *
618 * Note: Our Virtual Wire implementation is simplified, only
619 * propagating PIT interrupts to all VCPUs when they have set
620 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
621 * VCPU0, and only if its LVT0 is in EXTINT mode.
622 */
623 if (kvm->arch.vapics_in_nmi_mode > 0)
624 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
625 vcpu = kvm->vcpus[i];
626 if (vcpu)
627 kvm_apic_nmi_wd_deliver(vcpu);
628 }
610} 629}
611 630
612void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) 631void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1a..179dcb0103fd 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
26 * Port from Qemu. 26 * Port from Qemu.
27 */ 27 */
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/bitops.h>
29#include "irq.h" 30#include "irq.h"
30 31
31#include <linux/kvm_host.h> 32#include <linux/kvm_host.h>
32 33
34static void pic_lock(struct kvm_pic *s)
35{
36 spin_lock(&s->lock);
37}
38
39static void pic_unlock(struct kvm_pic *s)
40{
41 struct kvm *kvm = s->kvm;
42 unsigned acks = s->pending_acks;
43 bool wakeup = s->wakeup_needed;
44 struct kvm_vcpu *vcpu;
45
46 s->pending_acks = 0;
47 s->wakeup_needed = false;
48
49 spin_unlock(&s->lock);
50
51 while (acks) {
52 kvm_notify_acked_irq(kvm, __ffs(acks));
53 acks &= acks - 1;
54 }
55
56 if (wakeup) {
57 vcpu = s->kvm->vcpus[0];
58 if (vcpu)
59 kvm_vcpu_kick(vcpu);
60 }
61}
62
33static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 63static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
34{ 64{
35 s->isr &= ~(1 << irq); 65 s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
136 166
137void kvm_pic_update_irq(struct kvm_pic *s) 167void kvm_pic_update_irq(struct kvm_pic *s)
138{ 168{
169 pic_lock(s);
139 pic_update_irq(s); 170 pic_update_irq(s);
171 pic_unlock(s);
140} 172}
141 173
142void kvm_pic_set_irq(void *opaque, int irq, int level) 174void kvm_pic_set_irq(void *opaque, int irq, int level)
143{ 175{
144 struct kvm_pic *s = opaque; 176 struct kvm_pic *s = opaque;
145 177
178 pic_lock(s);
146 if (irq >= 0 && irq < PIC_NUM_PINS) { 179 if (irq >= 0 && irq < PIC_NUM_PINS) {
147 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 180 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
148 pic_update_irq(s); 181 pic_update_irq(s);
149 } 182 }
183 pic_unlock(s);
150} 184}
151 185
152/* 186/*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
172 int irq, irq2, intno; 206 int irq, irq2, intno;
173 struct kvm_pic *s = pic_irqchip(kvm); 207 struct kvm_pic *s = pic_irqchip(kvm);
174 208
209 pic_lock(s);
175 irq = pic_get_irq(&s->pics[0]); 210 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) { 211 if (irq >= 0) {
177 pic_intack(&s->pics[0], irq); 212 pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
196 intno = s->pics[0].irq_base + irq; 231 intno = s->pics[0].irq_base + irq;
197 } 232 }
198 pic_update_irq(s); 233 pic_update_irq(s);
234 pic_unlock(s);
199 kvm_notify_acked_irq(kvm, irq); 235 kvm_notify_acked_irq(kvm, irq);
200 236
201 return intno; 237 return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
203 239
204void kvm_pic_reset(struct kvm_kpic_state *s) 240void kvm_pic_reset(struct kvm_kpic_state *s)
205{ 241{
206 int irq, irqbase; 242 int irq, irqbase, n;
207 struct kvm *kvm = s->pics_state->irq_request_opaque; 243 struct kvm *kvm = s->pics_state->irq_request_opaque;
208 struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; 244 struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
209 245
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
214 250
215 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { 251 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
216 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) 252 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
217 if (s->irr & (1 << irq) || s->isr & (1 << irq)) 253 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
218 kvm_notify_acked_irq(kvm, irq+irqbase); 254 n = irq + irqbase;
255 s->pics_state->pending_acks |= 1 << n;
256 }
219 } 257 }
220 s->last_irr = 0; 258 s->last_irr = 0;
221 s->irr = 0; 259 s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
406 printk(KERN_ERR "PIC: non byte write\n"); 444 printk(KERN_ERR "PIC: non byte write\n");
407 return; 445 return;
408 } 446 }
447 pic_lock(s);
409 switch (addr) { 448 switch (addr) {
410 case 0x20: 449 case 0x20:
411 case 0x21: 450 case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
418 elcr_ioport_write(&s->pics[addr & 1], addr, data); 457 elcr_ioport_write(&s->pics[addr & 1], addr, data);
419 break; 458 break;
420 } 459 }
460 pic_unlock(s);
421} 461}
422 462
423static void picdev_read(struct kvm_io_device *this, 463static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
431 printk(KERN_ERR "PIC: non byte read\n"); 471 printk(KERN_ERR "PIC: non byte read\n");
432 return; 472 return;
433 } 473 }
474 pic_lock(s);
434 switch (addr) { 475 switch (addr) {
435 case 0x20: 476 case 0x20:
436 case 0x21: 477 case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
444 break; 485 break;
445 } 486 }
446 *(unsigned char *)val = data; 487 *(unsigned char *)val = data;
488 pic_unlock(s);
447} 489}
448 490
449/* 491/*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
459 s->output = level; 501 s->output = level;
460 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 502 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
461 s->pics[0].isr_ack &= ~(1 << irq); 503 s->pics[0].isr_ack &= ~(1 << irq);
462 kvm_vcpu_kick(vcpu); 504 s->wakeup_needed = true;
463 } 505 }
464} 506}
465 507
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
469 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 511 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
470 if (!s) 512 if (!s)
471 return NULL; 513 return NULL;
514 spin_lock_init(&s->lock);
515 s->kvm = kvm;
472 s->pics[0].elcr_mask = 0xf8; 516 s->pics[0].elcr_mask = 0xf8;
473 s->pics[1].elcr_mask = 0xde; 517 s->pics[1].elcr_mask = 0xde;
474 s->irq_request = pic_irq_request; 518 s->irq_request = pic_irq_request;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf31..2bf32a03ceec 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
25#include <linux/mm_types.h> 25#include <linux/mm_types.h>
26#include <linux/hrtimer.h> 26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
28#include <linux/spinlock.h>
28 29
29#include "iodev.h" 30#include "iodev.h"
30#include "ioapic.h" 31#include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
59}; 60};
60 61
61struct kvm_pic { 62struct kvm_pic {
63 spinlock_t lock;
64 bool wakeup_needed;
65 unsigned pending_acks;
66 struct kvm *kvm;
62 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
63 irq_request_func *irq_request; 68 irq_request_func *irq_request;
64 void *irq_request_opaque; 69 void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
87void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); 92void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
88void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); 93void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
89void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); 94void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
95void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
90void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); 96void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
91void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); 97void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
92void __kvm_migrate_timers(struct kvm_vcpu *vcpu); 98void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c036..8e5ee99551f6 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
7#include <linux/kvm_host.h> 7#include <linux/kvm_host.h>
8#include <asm/msr.h> 8#include <asm/msr.h>
9 9
10#include "svm.h" 10#include <asm/svm.h>
11 11
12static const u32 host_save_user_msrs[] = { 12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab48943..afac68c0815c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
130 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; 130 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
131} 131}
132 132
133static inline int apic_lvt_nmi_mode(u32 lvt_val)
134{
135 return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
136}
137
133static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { 138static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
134 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ 139 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
135 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 140 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
354 359
355 case APIC_DM_NMI: 360 case APIC_DM_NMI:
356 kvm_inject_nmi(vcpu); 361 kvm_inject_nmi(vcpu);
362 kvm_vcpu_kick(vcpu);
357 break; 363 break;
358 364
359 case APIC_DM_INIT: 365 case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
380 } 386 }
381 break; 387 break;
382 388
389 case APIC_DM_EXTINT:
390 /*
391 * Should only be called by kvm_apic_local_deliver() with LVT0,
392 * before NMI watchdog was enabled. Already handled by
393 * kvm_apic_accept_pic_intr().
394 */
395 break;
396
383 default: 397 default:
384 printk(KERN_ERR "TODO: unsupported delivery mode %x\n", 398 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
385 delivery_mode); 399 delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
663 apic->timer.period))); 677 apic->timer.period)));
664} 678}
665 679
680static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
681{
682 int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
683
684 if (apic_lvt_nmi_mode(lvt0_val)) {
685 if (!nmi_wd_enabled) {
686 apic_debug("Receive NMI setting on APIC_LVT0 "
687 "for cpu %d\n", apic->vcpu->vcpu_id);
688 apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
689 }
690 } else if (nmi_wd_enabled)
691 apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
692}
693
666static void apic_mmio_write(struct kvm_io_device *this, 694static void apic_mmio_write(struct kvm_io_device *this,
667 gpa_t address, int len, const void *data) 695 gpa_t address, int len, const void *data)
668{ 696{
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
743 apic_set_reg(apic, APIC_ICR2, val & 0xff000000); 771 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
744 break; 772 break;
745 773
774 case APIC_LVT0:
775 apic_manage_nmi_watchdog(apic, val);
746 case APIC_LVTT: 776 case APIC_LVTT:
747 case APIC_LVTTHMR: 777 case APIC_LVTTHMR:
748 case APIC_LVTPC: 778 case APIC_LVTPC:
749 case APIC_LVT0:
750 case APIC_LVT1: 779 case APIC_LVT1:
751 case APIC_LVTERR: 780 case APIC_LVTERR:
752 /* TODO: Check vector */ 781 /* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
961 return 0; 990 return 0;
962} 991}
963 992
964static int __inject_apic_timer_irq(struct kvm_lapic *apic) 993static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
994{
995 u32 reg = apic_get_reg(apic, lvt_type);
996 int vector, mode, trig_mode;
997
998 if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
999 vector = reg & APIC_VECTOR_MASK;
1000 mode = reg & APIC_MODE_MASK;
1001 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
1002 return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
1003 }
1004 return 0;
1005}
1006
1007void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
965{ 1008{
966 int vector; 1009 struct kvm_lapic *apic = vcpu->arch.apic;
967 1010
968 vector = apic_lvt_vector(apic, APIC_LVTT); 1011 if (apic)
969 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); 1012 kvm_apic_local_deliver(apic, APIC_LVT0);
970} 1013}
971 1014
972static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) 1015static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1061{ 1104{
1062 struct kvm_lapic *apic = vcpu->arch.apic; 1105 struct kvm_lapic *apic = vcpu->arch.apic;
1063 1106
1064 if (apic && apic_lvt_enabled(apic, APIC_LVTT) && 1107 if (apic && atomic_read(&apic->timer.pending) > 0) {
1065 atomic_read(&apic->timer.pending) > 0) { 1108 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1066 if (__inject_apic_timer_irq(apic))
1067 atomic_dec(&apic->timer.pending); 1109 atomic_dec(&apic->timer.pending);
1068 } 1110 }
1069} 1111}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 410ddbc1aa2e..83f11c7474a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
17 * 17 *
18 */ 18 */
19 19
20#include "vmx.h"
21#include "mmu.h" 20#include "mmu.h"
22 21
23#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
33#include <asm/page.h> 32#include <asm/page.h>
34#include <asm/cmpxchg.h> 33#include <asm/cmpxchg.h>
35#include <asm/io.h> 34#include <asm/io.h>
35#include <asm/vmx.h>
36 36
37/* 37/*
38 * When setting this variable to true it enables Two-Dimensional-Paging 38 * When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
168static u64 __read_mostly shadow_user_mask; 168static u64 __read_mostly shadow_user_mask;
169static u64 __read_mostly shadow_accessed_mask; 169static u64 __read_mostly shadow_accessed_mask;
170static u64 __read_mostly shadow_dirty_mask; 170static u64 __read_mostly shadow_dirty_mask;
171static u64 __read_mostly shadow_mt_mask;
171 172
172void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 173void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
173{ 174{
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
183EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 184EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
184 185
185void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 186void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
186 u64 dirty_mask, u64 nx_mask, u64 x_mask) 187 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
187{ 188{
188 shadow_user_mask = user_mask; 189 shadow_user_mask = user_mask;
189 shadow_accessed_mask = accessed_mask; 190 shadow_accessed_mask = accessed_mask;
190 shadow_dirty_mask = dirty_mask; 191 shadow_dirty_mask = dirty_mask;
191 shadow_nx_mask = nx_mask; 192 shadow_nx_mask = nx_mask;
192 shadow_x_mask = x_mask; 193 shadow_x_mask = x_mask;
194 shadow_mt_mask = mt_mask;
193} 195}
194EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 196EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
195 197
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
384{ 386{
385 int *write_count; 387 int *write_count;
386 388
387 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); 389 gfn = unalias_gfn(kvm, gfn);
390 write_count = slot_largepage_idx(gfn,
391 gfn_to_memslot_unaliased(kvm, gfn));
388 *write_count += 1; 392 *write_count += 1;
389} 393}
390 394
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
392{ 396{
393 int *write_count; 397 int *write_count;
394 398
395 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); 399 gfn = unalias_gfn(kvm, gfn);
400 write_count = slot_largepage_idx(gfn,
401 gfn_to_memslot_unaliased(kvm, gfn));
396 *write_count -= 1; 402 *write_count -= 1;
397 WARN_ON(*write_count < 0); 403 WARN_ON(*write_count < 0);
398} 404}
399 405
400static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) 406static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
401{ 407{
402 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 408 struct kvm_memory_slot *slot;
403 int *largepage_idx; 409 int *largepage_idx;
404 410
411 gfn = unalias_gfn(kvm, gfn);
412 slot = gfn_to_memslot_unaliased(kvm, gfn);
405 if (slot) { 413 if (slot) {
406 largepage_idx = slot_largepage_idx(gfn, slot); 414 largepage_idx = slot_largepage_idx(gfn, slot);
407 return *largepage_idx; 415 return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
613 return NULL; 621 return NULL;
614} 622}
615 623
616static void rmap_write_protect(struct kvm *kvm, u64 gfn) 624static int rmap_write_protect(struct kvm *kvm, u64 gfn)
617{ 625{
618 unsigned long *rmapp; 626 unsigned long *rmapp;
619 u64 *spte; 627 u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
659 spte = rmap_next(kvm, rmapp, spte); 667 spte = rmap_next(kvm, rmapp, spte);
660 } 668 }
661 669
662 if (write_protected) 670 return write_protected;
663 kvm_flush_remote_tlbs(kvm);
664} 671}
665 672
666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 673static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
786 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 793 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
787 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 794 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
788 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 795 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
796 INIT_LIST_HEAD(&sp->oos_link);
789 ASSERT(is_empty_shadow_page(sp->spt)); 797 ASSERT(is_empty_shadow_page(sp->spt));
790 sp->slot_bitmap = 0; 798 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
791 sp->multimapped = 0; 799 sp->multimapped = 0;
800 sp->global = 1;
792 sp->parent_pte = parent_pte; 801 sp->parent_pte = parent_pte;
793 --vcpu->kvm->arch.n_free_mmu_pages; 802 --vcpu->kvm->arch.n_free_mmu_pages;
794 return sp; 803 return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
900 struct kvm_mmu_page *sp = page_header(__pa(spte)); 909 struct kvm_mmu_page *sp = page_header(__pa(spte));
901 910
902 index = spte - sp->spt; 911 index = spte - sp->spt;
903 __set_bit(index, sp->unsync_child_bitmap); 912 if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
904 sp->unsync_children = 1; 913 sp->unsync_children++;
914 WARN_ON(!sp->unsync_children);
905} 915}
906 916
907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 917static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
928 938
929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 939static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
930{ 940{
931 sp->unsync_children = 1;
932 kvm_mmu_update_parents_unsync(sp); 941 kvm_mmu_update_parents_unsync(sp);
933 return 1; 942 return 1;
934} 943}
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
959{ 968{
960} 969}
961 970
971#define KVM_PAGE_ARRAY_NR 16
972
973struct kvm_mmu_pages {
974 struct mmu_page_and_offset {
975 struct kvm_mmu_page *sp;
976 unsigned int idx;
977 } page[KVM_PAGE_ARRAY_NR];
978 unsigned int nr;
979};
980
962#define for_each_unsync_children(bitmap, idx) \ 981#define for_each_unsync_children(bitmap, idx) \
963 for (idx = find_first_bit(bitmap, 512); \ 982 for (idx = find_first_bit(bitmap, 512); \
964 idx < 512; \ 983 idx < 512; \
965 idx = find_next_bit(bitmap, 512, idx+1)) 984 idx = find_next_bit(bitmap, 512, idx+1))
966 985
967static int mmu_unsync_walk(struct kvm_mmu_page *sp, 986int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
968 struct kvm_unsync_walk *walker) 987 int idx)
969{ 988{
970 int i, ret; 989 int i;
971 990
972 if (!sp->unsync_children) 991 if (sp->unsync)
973 return 0; 992 for (i=0; i < pvec->nr; i++)
993 if (pvec->page[i].sp == sp)
994 return 0;
995
996 pvec->page[pvec->nr].sp = sp;
997 pvec->page[pvec->nr].idx = idx;
998 pvec->nr++;
999 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1000}
1001
1002static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1003 struct kvm_mmu_pages *pvec)
1004{
1005 int i, ret, nr_unsync_leaf = 0;
974 1006
975 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1007 for_each_unsync_children(sp->unsync_child_bitmap, i) {
976 u64 ent = sp->spt[i]; 1008 u64 ent = sp->spt[i];
977 1009
978 if (is_shadow_present_pte(ent)) { 1010 if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
979 struct kvm_mmu_page *child; 1011 struct kvm_mmu_page *child;
980 child = page_header(ent & PT64_BASE_ADDR_MASK); 1012 child = page_header(ent & PT64_BASE_ADDR_MASK);
981 1013
982 if (child->unsync_children) { 1014 if (child->unsync_children) {
983 ret = mmu_unsync_walk(child, walker); 1015 if (mmu_pages_add(pvec, child, i))
984 if (ret) 1016 return -ENOSPC;
1017
1018 ret = __mmu_unsync_walk(child, pvec);
1019 if (!ret)
1020 __clear_bit(i, sp->unsync_child_bitmap);
1021 else if (ret > 0)
1022 nr_unsync_leaf += ret;
1023 else
985 return ret; 1024 return ret;
986 __clear_bit(i, sp->unsync_child_bitmap);
987 } 1025 }
988 1026
989 if (child->unsync) { 1027 if (child->unsync) {
990 ret = walker->entry(child, walker); 1028 nr_unsync_leaf++;
991 __clear_bit(i, sp->unsync_child_bitmap); 1029 if (mmu_pages_add(pvec, child, i))
992 if (ret) 1030 return -ENOSPC;
993 return ret;
994 } 1031 }
995 } 1032 }
996 } 1033 }
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
998 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) 1035 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
999 sp->unsync_children = 0; 1036 sp->unsync_children = 0;
1000 1037
1001 return 0; 1038 return nr_unsync_leaf;
1039}
1040
1041static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1042 struct kvm_mmu_pages *pvec)
1043{
1044 if (!sp->unsync_children)
1045 return 0;
1046
1047 mmu_pages_add(pvec, sp, 0);
1048 return __mmu_unsync_walk(sp, pvec);
1002} 1049}
1003 1050
1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1051static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1021 return NULL; 1068 return NULL;
1022} 1069}
1023 1070
1071static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
1072{
1073 list_del(&sp->oos_link);
1074 --kvm->stat.mmu_unsync_global;
1075}
1076
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1077static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{ 1078{
1026 WARN_ON(!sp->unsync); 1079 WARN_ON(!sp->unsync);
1027 sp->unsync = 0; 1080 sp->unsync = 0;
1081 if (sp->global)
1082 kvm_unlink_unsync_global(kvm, sp);
1028 --kvm->stat.mmu_unsync; 1083 --kvm->stat.mmu_unsync;
1029} 1084}
1030 1085
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1037 return 1; 1092 return 1;
1038 } 1093 }
1039 1094
1040 rmap_write_protect(vcpu->kvm, sp->gfn); 1095 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1096 kvm_flush_remote_tlbs(vcpu->kvm);
1041 kvm_unlink_unsync_page(vcpu->kvm, sp); 1097 kvm_unlink_unsync_page(vcpu->kvm, sp);
1042 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1098 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1043 kvm_mmu_zap_page(vcpu->kvm, sp); 1099 kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1048 return 0; 1104 return 0;
1049} 1105}
1050 1106
1051struct sync_walker { 1107struct mmu_page_path {
1052 struct kvm_vcpu *vcpu; 1108 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1053 struct kvm_unsync_walk walker; 1109 unsigned int idx[PT64_ROOT_LEVEL-1];
1054}; 1110};
1055 1111
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) 1112#define for_each_sp(pvec, sp, parents, i) \
1113 for (i = mmu_pages_next(&pvec, &parents, -1), \
1114 sp = pvec.page[i].sp; \
1115 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1116 i = mmu_pages_next(&pvec, &parents, i))
1117
1118int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
1119 int i)
1057{ 1120{
1058 struct sync_walker *sync_walk = container_of(walk, struct sync_walker, 1121 int n;
1059 walker);
1060 struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061 1122
1062 kvm_sync_page(vcpu, sp); 1123 for (n = i+1; n < pvec->nr; n++) {
1063 return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); 1124 struct kvm_mmu_page *sp = pvec->page[n].sp;
1125
1126 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1127 parents->idx[0] = pvec->page[n].idx;
1128 return n;
1129 }
1130
1131 parents->parent[sp->role.level-2] = sp;
1132 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1133 }
1134
1135 return n;
1064} 1136}
1065 1137
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1138void mmu_pages_clear_parents(struct mmu_page_path *parents)
1067{ 1139{
1068 struct sync_walker walker = { 1140 struct kvm_mmu_page *sp;
1069 .walker = { .entry = mmu_sync_fn, }, 1141 unsigned int level = 0;
1070 .vcpu = vcpu, 1142
1071 }; 1143 do {
1144 unsigned int idx = parents->idx[level];
1145
1146 sp = parents->parent[level];
1147 if (!sp)
1148 return;
1149
1150 --sp->unsync_children;
1151 WARN_ON((int)sp->unsync_children < 0);
1152 __clear_bit(idx, sp->unsync_child_bitmap);
1153 level++;
1154 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1155}
1156
1157static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1158 struct mmu_page_path *parents,
1159 struct kvm_mmu_pages *pvec)
1160{
1161 parents->parent[parent->role.level-1] = NULL;
1162 pvec->nr = 0;
1163}
1164
1165static void mmu_sync_children(struct kvm_vcpu *vcpu,
1166 struct kvm_mmu_page *parent)
1167{
1168 int i;
1169 struct kvm_mmu_page *sp;
1170 struct mmu_page_path parents;
1171 struct kvm_mmu_pages pages;
1172
1173 kvm_mmu_pages_init(parent, &parents, &pages);
1174 while (mmu_unsync_walk(parent, &pages)) {
1175 int protected = 0;
1072 1176
1073 while (mmu_unsync_walk(sp, &walker.walker)) 1177 for_each_sp(pages, sp, parents, i)
1178 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1179
1180 if (protected)
1181 kvm_flush_remote_tlbs(vcpu->kvm);
1182
1183 for_each_sp(pages, sp, parents, i) {
1184 kvm_sync_page(vcpu, sp);
1185 mmu_pages_clear_parents(&parents);
1186 }
1074 cond_resched_lock(&vcpu->kvm->mmu_lock); 1187 cond_resched_lock(&vcpu->kvm->mmu_lock);
1188 kvm_mmu_pages_init(parent, &parents, &pages);
1189 }
1075} 1190}
1076 1191
1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1192static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1129 sp->role = role; 1244 sp->role = role;
1130 hlist_add_head(&sp->hash_link, bucket); 1245 hlist_add_head(&sp->hash_link, bucket);
1131 if (!metaphysical) { 1246 if (!metaphysical) {
1132 rmap_write_protect(vcpu->kvm, gfn); 1247 if (rmap_write_protect(vcpu->kvm, gfn))
1248 kvm_flush_remote_tlbs(vcpu->kvm);
1133 account_shadowed(vcpu->kvm, gfn); 1249 account_shadowed(vcpu->kvm, gfn);
1134 } 1250 }
1135 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1251 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
1153 if (level == PT32E_ROOT_LEVEL) { 1269 if (level == PT32E_ROOT_LEVEL) {
1154 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1270 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155 shadow_addr &= PT64_BASE_ADDR_MASK; 1271 shadow_addr &= PT64_BASE_ADDR_MASK;
1272 if (!shadow_addr)
1273 return 1;
1156 --level; 1274 --level;
1157 } 1275 }
1158 1276
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1237 } 1355 }
1238} 1356}
1239 1357
1240struct zap_walker { 1358static int mmu_zap_unsync_children(struct kvm *kvm,
1241 struct kvm_unsync_walk walker; 1359 struct kvm_mmu_page *parent)
1242 struct kvm *kvm;
1243 int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{ 1360{
1248 struct zap_walker *zap_walk = container_of(walk, struct zap_walker, 1361 int i, zapped = 0;
1249 walker); 1362 struct mmu_page_path parents;
1250 kvm_mmu_zap_page(zap_walk->kvm, sp); 1363 struct kvm_mmu_pages pages;
1251 zap_walk->zapped = 1;
1252 return 0;
1253}
1254 1364
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) 1365 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1256{
1257 struct zap_walker walker = {
1258 .walker = { .entry = mmu_zap_fn, },
1259 .kvm = kvm,
1260 .zapped = 0,
1261 };
1262
1263 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264 return 0; 1366 return 0;
1265 mmu_unsync_walk(sp, &walker.walker); 1367
1266 return walker.zapped; 1368 kvm_mmu_pages_init(parent, &parents, &pages);
1369 while (mmu_unsync_walk(parent, &pages)) {
1370 struct kvm_mmu_page *sp;
1371
1372 for_each_sp(pages, sp, parents, i) {
1373 kvm_mmu_zap_page(kvm, sp);
1374 mmu_pages_clear_parents(&parents);
1375 }
1376 zapped += pages.nr;
1377 kvm_mmu_pages_init(parent, &parents, &pages);
1378 }
1379
1380 return zapped;
1267} 1381}
1268 1382
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1383static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1362 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1476 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1363 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1477 struct kvm_mmu_page *sp = page_header(__pa(pte));
1364 1478
1365 __set_bit(slot, &sp->slot_bitmap); 1479 __set_bit(slot, sp->slot_bitmap);
1366} 1480}
1367 1481
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp) 1482static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1393 return page; 1507 return page;
1394} 1508}
1395 1509
1510/*
1511 * The function is based on mtrr_type_lookup() in
1512 * arch/x86/kernel/cpu/mtrr/generic.c
1513 */
1514static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1515 u64 start, u64 end)
1516{
1517 int i;
1518 u64 base, mask;
1519 u8 prev_match, curr_match;
1520 int num_var_ranges = KVM_NR_VAR_MTRR;
1521
1522 if (!mtrr_state->enabled)
1523 return 0xFF;
1524
1525 /* Make end inclusive end, instead of exclusive */
1526 end--;
1527
1528 /* Look in fixed ranges. Just return the type as per start */
1529 if (mtrr_state->have_fixed && (start < 0x100000)) {
1530 int idx;
1531
1532 if (start < 0x80000) {
1533 idx = 0;
1534 idx += (start >> 16);
1535 return mtrr_state->fixed_ranges[idx];
1536 } else if (start < 0xC0000) {
1537 idx = 1 * 8;
1538 idx += ((start - 0x80000) >> 14);
1539 return mtrr_state->fixed_ranges[idx];
1540 } else if (start < 0x1000000) {
1541 idx = 3 * 8;
1542 idx += ((start - 0xC0000) >> 12);
1543 return mtrr_state->fixed_ranges[idx];
1544 }
1545 }
1546
1547 /*
1548 * Look in variable ranges
1549 * Look of multiple ranges matching this address and pick type
1550 * as per MTRR precedence
1551 */
1552 if (!(mtrr_state->enabled & 2))
1553 return mtrr_state->def_type;
1554
1555 prev_match = 0xFF;
1556 for (i = 0; i < num_var_ranges; ++i) {
1557 unsigned short start_state, end_state;
1558
1559 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1560 continue;
1561
1562 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1563 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1564 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1565 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1566
1567 start_state = ((start & mask) == (base & mask));
1568 end_state = ((end & mask) == (base & mask));
1569 if (start_state != end_state)
1570 return 0xFE;
1571
1572 if ((start & mask) != (base & mask))
1573 continue;
1574
1575 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1576 if (prev_match == 0xFF) {
1577 prev_match = curr_match;
1578 continue;
1579 }
1580
1581 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1582 curr_match == MTRR_TYPE_UNCACHABLE)
1583 return MTRR_TYPE_UNCACHABLE;
1584
1585 if ((prev_match == MTRR_TYPE_WRBACK &&
1586 curr_match == MTRR_TYPE_WRTHROUGH) ||
1587 (prev_match == MTRR_TYPE_WRTHROUGH &&
1588 curr_match == MTRR_TYPE_WRBACK)) {
1589 prev_match = MTRR_TYPE_WRTHROUGH;
1590 curr_match = MTRR_TYPE_WRTHROUGH;
1591 }
1592
1593 if (prev_match != curr_match)
1594 return MTRR_TYPE_UNCACHABLE;
1595 }
1596
1597 if (prev_match != 0xFF)
1598 return prev_match;
1599
1600 return mtrr_state->def_type;
1601}
1602
1603static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1604{
1605 u8 mtrr;
1606
1607 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1608 (gfn << PAGE_SHIFT) + PAGE_SIZE);
1609 if (mtrr == 0xfe || mtrr == 0xff)
1610 mtrr = MTRR_TYPE_WRBACK;
1611 return mtrr;
1612}
1613
1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1614static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1397{ 1615{
1398 unsigned index; 1616 unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1409 if (s->role.word != sp->role.word) 1627 if (s->role.word != sp->role.word)
1410 return 1; 1628 return 1;
1411 } 1629 }
1412 kvm_mmu_mark_parents_unsync(vcpu, sp);
1413 ++vcpu->kvm->stat.mmu_unsync; 1630 ++vcpu->kvm->stat.mmu_unsync;
1414 sp->unsync = 1; 1631 sp->unsync = 1;
1632
1633 if (sp->global) {
1634 list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
1635 ++vcpu->kvm->stat.mmu_unsync_global;
1636 } else
1637 kvm_mmu_mark_parents_unsync(vcpu, sp);
1638
1415 mmu_convert_notrap(sp); 1639 mmu_convert_notrap(sp);
1416 return 0; 1640 return 0;
1417} 1641}
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1661static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438 unsigned pte_access, int user_fault, 1662 unsigned pte_access, int user_fault,
1439 int write_fault, int dirty, int largepage, 1663 int write_fault, int dirty, int largepage,
1440 gfn_t gfn, pfn_t pfn, bool speculative, 1664 int global, gfn_t gfn, pfn_t pfn, bool speculative,
1441 bool can_unsync) 1665 bool can_unsync)
1442{ 1666{
1443 u64 spte; 1667 u64 spte;
1444 int ret = 0; 1668 int ret = 0;
1669 u64 mt_mask = shadow_mt_mask;
1670 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1671
1672 if (!(vcpu->arch.cr4 & X86_CR4_PGE))
1673 global = 0;
1674 if (!global && sp->global) {
1675 sp->global = 0;
1676 if (sp->unsync) {
1677 kvm_unlink_unsync_global(vcpu->kvm, sp);
1678 kvm_mmu_mark_parents_unsync(vcpu, sp);
1679 }
1680 }
1681
1445 /* 1682 /*
1446 * We don't set the accessed bit, since we sometimes want to see 1683 * We don't set the accessed bit, since we sometimes want to see
1447 * whether the guest actually used the pte (in order to detect 1684 * whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1460 spte |= shadow_user_mask; 1697 spte |= shadow_user_mask;
1461 if (largepage) 1698 if (largepage)
1462 spte |= PT_PAGE_SIZE_MASK; 1699 spte |= PT_PAGE_SIZE_MASK;
1700 if (mt_mask) {
1701 mt_mask = get_memory_type(vcpu, gfn) <<
1702 kvm_x86_ops->get_mt_mask_shift();
1703 spte |= mt_mask;
1704 }
1463 1705
1464 spte |= (u64)pfn << PAGE_SHIFT; 1706 spte |= (u64)pfn << PAGE_SHIFT;
1465 1707
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1474 1716
1475 spte |= PT_WRITABLE_MASK; 1717 spte |= PT_WRITABLE_MASK;
1476 1718
1719 /*
1720 * Optimization: for pte sync, if spte was writable the hash
1721 * lookup is unnecessary (and expensive). Write protection
1722 * is responsibility of mmu_get_page / kvm_sync_page.
1723 * Same reasoning can be applied to dirty page accounting.
1724 */
1725 if (!can_unsync && is_writeble_pte(*shadow_pte))
1726 goto set_pte;
1727
1477 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1728 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1478 pgprintk("%s: found shadow page for %lx, marking ro\n", 1729 pgprintk("%s: found shadow page for %lx, marking ro\n",
1479 __func__, gfn); 1730 __func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1746static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496 unsigned pt_access, unsigned pte_access, 1747 unsigned pt_access, unsigned pte_access,
1497 int user_fault, int write_fault, int dirty, 1748 int user_fault, int write_fault, int dirty,
1498 int *ptwrite, int largepage, gfn_t gfn, 1749 int *ptwrite, int largepage, int global,
1499 pfn_t pfn, bool speculative) 1750 gfn_t gfn, pfn_t pfn, bool speculative)
1500{ 1751{
1501 int was_rmapped = 0; 1752 int was_rmapped = 0;
1502 int was_writeble = is_writeble_pte(*shadow_pte); 1753 int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1529 } 1780 }
1530 } 1781 }
1531 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1782 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532 dirty, largepage, gfn, pfn, speculative, true)) { 1783 dirty, largepage, global, gfn, pfn, speculative, true)) {
1533 if (write_fault) 1784 if (write_fault)
1534 *ptwrite = 1; 1785 *ptwrite = 1;
1535 kvm_x86_ops->tlb_flush(vcpu); 1786 kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
1586 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { 1837 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 1838 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588 0, walk->write, 1, &walk->pt_write, 1839 0, walk->write, 1, &walk->pt_write,
1589 walk->largepage, gfn, walk->pfn, false); 1840 walk->largepage, 0, gfn, walk->pfn, false);
1590 ++vcpu->stat.pf_fixed; 1841 ++vcpu->stat.pf_fixed;
1591 return 1; 1842 return 1;
1592 } 1843 }
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1773 } 2024 }
1774} 2025}
1775 2026
2027static void mmu_sync_global(struct kvm_vcpu *vcpu)
2028{
2029 struct kvm *kvm = vcpu->kvm;
2030 struct kvm_mmu_page *sp, *n;
2031
2032 list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
2033 kvm_sync_page(vcpu, sp);
2034}
2035
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2036void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{ 2037{
1778 spin_lock(&vcpu->kvm->mmu_lock); 2038 spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1780 spin_unlock(&vcpu->kvm->mmu_lock); 2040 spin_unlock(&vcpu->kvm->mmu_lock);
1781} 2041}
1782 2042
2043void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
2044{
2045 spin_lock(&vcpu->kvm->mmu_lock);
2046 mmu_sync_global(vcpu);
2047 spin_unlock(&vcpu->kvm->mmu_lock);
2048}
2049
1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2050static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1784{ 2051{
1785 return vaddr; 2052 return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2178} 2445}
2179 2446
2180void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2447void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2181 const u8 *new, int bytes) 2448 const u8 *new, int bytes,
2449 bool guest_initiated)
2182{ 2450{
2183 gfn_t gfn = gpa >> PAGE_SHIFT; 2451 gfn_t gfn = gpa >> PAGE_SHIFT;
2184 struct kvm_mmu_page *sp; 2452 struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2204 kvm_mmu_free_some_pages(vcpu); 2472 kvm_mmu_free_some_pages(vcpu);
2205 ++vcpu->kvm->stat.mmu_pte_write; 2473 ++vcpu->kvm->stat.mmu_pte_write;
2206 kvm_mmu_audit(vcpu, "pre pte write"); 2474 kvm_mmu_audit(vcpu, "pre pte write");
2207 if (gfn == vcpu->arch.last_pt_write_gfn 2475 if (guest_initiated) {
2208 && !last_updated_pte_accessed(vcpu)) { 2476 if (gfn == vcpu->arch.last_pt_write_gfn
2209 ++vcpu->arch.last_pt_write_count; 2477 && !last_updated_pte_accessed(vcpu)) {
2210 if (vcpu->arch.last_pt_write_count >= 3) 2478 ++vcpu->arch.last_pt_write_count;
2211 flooded = 1; 2479 if (vcpu->arch.last_pt_write_count >= 3)
2212 } else { 2480 flooded = 1;
2213 vcpu->arch.last_pt_write_gfn = gfn; 2481 } else {
2214 vcpu->arch.last_pt_write_count = 1; 2482 vcpu->arch.last_pt_write_gfn = gfn;
2215 vcpu->arch.last_pte_updated = NULL; 2483 vcpu->arch.last_pt_write_count = 1;
2484 vcpu->arch.last_pte_updated = NULL;
2485 }
2216 } 2486 }
2217 index = kvm_page_table_hashfn(gfn); 2487 index = kvm_page_table_hashfn(gfn);
2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2488 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2352 2622
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 2623void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{ 2624{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva); 2625 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu); 2626 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg; 2627 ++vcpu->stat.invlpg;
2360} 2628}
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2451 int i; 2719 int i;
2452 u64 *pt; 2720 u64 *pt;
2453 2721
2454 if (!test_bit(slot, &sp->slot_bitmap)) 2722 if (!test_bit(slot, sp->slot_bitmap))
2455 continue; 2723 continue;
2456 2724
2457 pt = sp->spt; 2725 pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
2860 if (sp->role.metaphysical) 3128 if (sp->role.metaphysical)
2861 continue; 3129 continue;
2862 3130
2863 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
2864 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3131 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
3132 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
2865 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3133 rmapp = &slot->rmap[gfn - slot->base_gfn];
2866 if (*rmapp) 3134 if (*rmapp)
2867 printk(KERN_ERR "%s: (%s) shadow page has writable" 3135 printk(KERN_ERR "%s: (%s) shadow page has writable"
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 84eee43bbe74..9fd78b6e17ad 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
82 int *ptwrite; 82 int *ptwrite;
83 pfn_t pfn; 83 pfn_t pfn;
84 u64 *sptep; 84 u64 *sptep;
85 gpa_t pte_gpa;
85}; 86};
86 87
87static gfn_t gpte_to_gfn(pt_element_t gpte) 88static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
222 if (ret) 223 if (ret)
223 goto walk; 224 goto walk;
224 pte |= PT_DIRTY_MASK; 225 pte |= PT_DIRTY_MASK;
225 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); 226 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
226 walker->ptes[walker->level - 1] = pte; 227 walker->ptes[walker->level - 1] = pte;
227 } 228 }
228 229
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
274 return; 275 return;
275 kvm_get_pfn(pfn); 276 kvm_get_pfn(pfn);
276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 277 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
277 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 278 gpte & PT_DIRTY_MASK, NULL, largepage,
279 gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
278 pfn, true); 280 pfn, true);
279} 281}
280 282
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, 303 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault, 304 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 305 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, 306 sw->ptwrite, sw->largepage,
305 false); 307 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
308 gw->gfn, sw->pfn, false);
306 sw->sptep = sptep; 309 sw->sptep = sptep;
307 return 1; 310 return 1;
308 } 311 }
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
466 struct kvm_vcpu *vcpu, u64 addr, 469 struct kvm_vcpu *vcpu, u64 addr,
467 u64 *sptep, int level) 470 u64 *sptep, int level)
468{ 471{
472 struct shadow_walker *sw =
473 container_of(_sw, struct shadow_walker, walker);
469 474
470 if (level == PT_PAGE_TABLE_LEVEL) { 475 /* FIXME: properly handle invlpg on large guest pages */
471 if (is_shadow_present_pte(*sptep)) 476 if (level == PT_PAGE_TABLE_LEVEL ||
477 ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
479
480 sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
481 sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
482
483 if (is_shadow_present_pte(*sptep)) {
472 rmap_remove(vcpu->kvm, sptep); 484 rmap_remove(vcpu->kvm, sptep);
485 if (is_large_pte(*sptep))
486 --vcpu->kvm->stat.lpages;
487 }
473 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 488 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
474 return 1; 489 return 1;
475 } 490 }
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
480 495
481static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 496static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
482{ 497{
498 pt_element_t gpte;
483 struct shadow_walker walker = { 499 struct shadow_walker walker = {
484 .walker = { .entry = FNAME(shadow_invlpg_entry), }, 500 .walker = { .entry = FNAME(shadow_invlpg_entry), },
501 .pte_gpa = -1,
485 }; 502 };
486 503
504 spin_lock(&vcpu->kvm->mmu_lock);
487 walk_shadow(&walker.walker, vcpu, gva); 505 walk_shadow(&walker.walker, vcpu, gva);
506 spin_unlock(&vcpu->kvm->mmu_lock);
507 if (walker.pte_gpa == -1)
508 return;
509 if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
510 sizeof(pt_element_t)))
511 return;
512 if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
513 if (mmu_topup_memory_caches(vcpu))
514 return;
515 kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
516 sizeof(pt_element_t), 0);
517 }
488} 518}
489 519
490static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 520static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
580 nr_present++; 610 nr_present++;
581 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 611 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
582 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 612 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
583 is_dirty_pte(gpte), 0, gfn, 613 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
584 spte_to_pfn(sp->spt[i]), true, false); 614 spte_to_pfn(sp->spt[i]), true, false);
585 } 615 }
586 616
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d963..1452851ae258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
28 28
29#include <asm/desc.h> 29#include <asm/desc.h>
30 30
31#include <asm/virtext.h>
32
31#define __ex(x) __kvm_handle_fault_on_reboot(x) 33#define __ex(x) __kvm_handle_fault_on_reboot(x)
32 34
33MODULE_AUTHOR("Qumranet"); 35MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
245 247
246static int has_svm(void) 248static int has_svm(void)
247{ 249{
248 uint32_t eax, ebx, ecx, edx; 250 const char *msg;
249
250 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
251 printk(KERN_INFO "has_svm: not amd\n");
252 return 0;
253 }
254 251
255 cpuid(0x80000000, &eax, &ebx, &ecx, &edx); 252 if (!cpu_has_svm(&msg)) {
256 if (eax < SVM_CPUID_FUNC) { 253 printk(KERN_INFO "has_svn: %s\n", msg);
257 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
258 return 0; 254 return 0;
259 } 255 }
260 256
261 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
262 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
263 printk(KERN_DEBUG "has_svm: svm not available\n");
264 return 0;
265 }
266 return 1; 257 return 1;
267} 258}
268 259
269static void svm_hardware_disable(void *garbage) 260static void svm_hardware_disable(void *garbage)
270{ 261{
271 uint64_t efer; 262 cpu_svm_disable();
272
273 wrmsrl(MSR_VM_HSAVE_PA, 0);
274 rdmsrl(MSR_EFER, efer);
275 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
276} 263}
277 264
278static void svm_hardware_enable(void *garbage) 265static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
772 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 759 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
773 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 760 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
774 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 761 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
762
763 /*
764 * SVM always stores 0 for the 'G' bit in the CS selector in
765 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
766 * Intel's VMENTRY has a check on the 'G' bit.
767 */
768 if (seg == VCPU_SREG_CS)
769 var->g = s->limit > 0xfffff;
770
771 /*
772 * Work around a bug where the busy flag in the tr selector
773 * isn't exposed
774 */
775 if (seg == VCPU_SREG_TR)
776 var->type |= 0x2;
777
775 var->unusable = !var->present; 778 var->unusable = !var->present;
776} 779}
777 780
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1099 rep = (io_info & SVM_IOIO_REP_MASK) != 0; 1102 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1100 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; 1103 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1101 1104
1105 skip_emulated_instruction(&svm->vcpu);
1102 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1106 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1103} 1107}
1104 1108
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
1912#endif 1916#endif
1913} 1917}
1914 1918
1919static int svm_get_mt_mask_shift(void)
1920{
1921 return 0;
1922}
1923
1915static struct kvm_x86_ops svm_x86_ops = { 1924static struct kvm_x86_ops svm_x86_ops = {
1916 .cpu_has_kvm_support = has_svm, 1925 .cpu_has_kvm_support = has_svm,
1917 .disabled_by_bios = is_disabled, 1926 .disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
1967 1976
1968 .set_tss_addr = svm_set_tss_addr, 1977 .set_tss_addr = svm_set_tss_addr,
1969 .get_tdp_level = get_npt_level, 1978 .get_tdp_level = get_npt_level,
1979 .get_mt_mask_shift = svm_get_mt_mask_shift,
1970}; 1980};
1971 1981
1972static int __init svm_init(void) 1982static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4018b01e1f9..6259d7467648 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
16 */ 16 */
17 17
18#include "irq.h" 18#include "irq.h"
19#include "vmx.h"
20#include "mmu.h" 19#include "mmu.h"
21 20
22#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
31 30
32#include <asm/io.h> 31#include <asm/io.h>
33#include <asm/desc.h> 32#include <asm/desc.h>
33#include <asm/vmx.h>
34#include <asm/virtext.h>
34 35
35#define __ex(x) __kvm_handle_fault_on_reboot(x) 36#define __ex(x) __kvm_handle_fault_on_reboot(x)
36 37
@@ -90,6 +91,11 @@ struct vcpu_vmx {
90 } rmode; 91 } rmode;
91 int vpid; 92 int vpid;
92 bool emulation_required; 93 bool emulation_required;
94
95 /* Support for vnmi-less CPUs */
96 int soft_vnmi_blocked;
97 ktime_t entry_time;
98 s64 vnmi_blocked_time;
93}; 99};
94 100
95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 101static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
122 u32 vmentry_ctrl; 128 u32 vmentry_ctrl;
123} vmcs_config; 129} vmcs_config;
124 130
125struct vmx_capability { 131static struct vmx_capability {
126 u32 ept; 132 u32 ept;
127 u32 vpid; 133 u32 vpid;
128} vmx_capability; 134} vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
957 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); 963 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
958 964
959 break; 965 break;
966 case MSR_IA32_CR_PAT:
967 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
968 vmcs_write64(GUEST_IA32_PAT, data);
969 vcpu->arch.pat = data;
970 break;
971 }
972 /* Otherwise falls through to kvm_set_msr_common */
960 default: 973 default:
961 vmx_load_host_state(vmx); 974 vmx_load_host_state(vmx);
962 msr = find_msr_entry(vmx, msr_index); 975 msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
1032 1045
1033static __init int cpu_has_kvm_support(void) 1046static __init int cpu_has_kvm_support(void)
1034{ 1047{
1035 unsigned long ecx = cpuid_ecx(1); 1048 return cpu_has_vmx();
1036 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
1037} 1049}
1038 1050
1039static __init int vmx_disabled_by_bios(void) 1051static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
1079 __vcpu_clear(vmx); 1091 __vcpu_clear(vmx);
1080} 1092}
1081 1093
1082static void hardware_disable(void *garbage) 1094
1095/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
1096 * tricks.
1097 */
1098static void kvm_cpu_vmxoff(void)
1083{ 1099{
1084 vmclear_local_vcpus();
1085 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1100 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1086 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1101 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1087} 1102}
1088 1103
1104static void hardware_disable(void *garbage)
1105{
1106 vmclear_local_vcpus();
1107 kvm_cpu_vmxoff();
1108}
1109
1089static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1110static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1090 u32 msr, u32 *result) 1111 u32 msr, u32 *result)
1091{ 1112{
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1176#ifdef CONFIG_X86_64 1197#ifdef CONFIG_X86_64
1177 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 1198 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1178#endif 1199#endif
1179 opt = 0; 1200 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
1180 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 1201 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1181 &_vmexit_control) < 0) 1202 &_vmexit_control) < 0)
1182 return -EIO; 1203 return -EIO;
1183 1204
1184 min = opt = 0; 1205 min = 0;
1206 opt = VM_ENTRY_LOAD_IA32_PAT;
1185 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 1207 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1186 &_vmentry_control) < 0) 1208 &_vmentry_control) < 0)
1187 return -EIO; 1209 return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2087 */ 2109 */
2088static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 2110static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2089{ 2111{
2090 u32 host_sysenter_cs; 2112 u32 host_sysenter_cs, msr_low, msr_high;
2091 u32 junk; 2113 u32 junk;
2114 u64 host_pat;
2092 unsigned long a; 2115 unsigned long a;
2093 struct descriptor_table dt; 2116 struct descriptor_table dt;
2094 int i; 2117 int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2176 rdmsrl(MSR_IA32_SYSENTER_EIP, a); 2199 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2177 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ 2200 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2178 2201
2202 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2203 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2204 host_pat = msr_low | ((u64) msr_high << 32);
2205 vmcs_write64(HOST_IA32_PAT, host_pat);
2206 }
2207 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2208 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2209 host_pat = msr_low | ((u64) msr_high << 32);
2210 /* Write the default value follow host pat */
2211 vmcs_write64(GUEST_IA32_PAT, host_pat);
2212 /* Keep arch.pat sync with GUEST_IA32_PAT */
2213 vmx->vcpu.arch.pat = host_pat;
2214 }
2215
2179 for (i = 0; i < NR_VMX_MSR; ++i) { 2216 for (i = 0; i < NR_VMX_MSR; ++i) {
2180 u32 index = vmx_msr_index[i]; 2217 u32 index = vmx_msr_index[i];
2181 u32 data_low, data_high; 2218 u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2230 2267
2231 vmx->vcpu.arch.rmode.active = 0; 2268 vmx->vcpu.arch.rmode.active = 0;
2232 2269
2270 vmx->soft_vnmi_blocked = 0;
2271
2233 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 2272 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2234 kvm_set_cr8(&vmx->vcpu, 0); 2273 kvm_set_cr8(&vmx->vcpu, 0);
2235 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 2274 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
2335 return ret; 2374 return ret;
2336} 2375}
2337 2376
2377static void enable_irq_window(struct kvm_vcpu *vcpu)
2378{
2379 u32 cpu_based_vm_exec_control;
2380
2381 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2382 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2383 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2384}
2385
2386static void enable_nmi_window(struct kvm_vcpu *vcpu)
2387{
2388 u32 cpu_based_vm_exec_control;
2389
2390 if (!cpu_has_virtual_nmis()) {
2391 enable_irq_window(vcpu);
2392 return;
2393 }
2394
2395 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2396 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2397 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2398}
2399
2338static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 2400static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2339{ 2401{
2340 struct vcpu_vmx *vmx = to_vmx(vcpu); 2402 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2358 2420
2359static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2421static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2360{ 2422{
2423 struct vcpu_vmx *vmx = to_vmx(vcpu);
2424
2425 if (!cpu_has_virtual_nmis()) {
2426 /*
2427 * Tracking the NMI-blocked state in software is built upon
2428 * finding the next open IRQ window. This, in turn, depends on
2429 * well-behaving guests: They have to keep IRQs disabled at
2430 * least as long as the NMI handler runs. Otherwise we may
2431 * cause NMI nesting, maybe breaking the guest. But as this is
2432 * highly unlikely, we can live with the residual risk.
2433 */
2434 vmx->soft_vnmi_blocked = 1;
2435 vmx->vnmi_blocked_time = 0;
2436 }
2437
2438 ++vcpu->stat.nmi_injections;
2439 if (vcpu->arch.rmode.active) {
2440 vmx->rmode.irq.pending = true;
2441 vmx->rmode.irq.vector = NMI_VECTOR;
2442 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2443 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2444 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2445 INTR_INFO_VALID_MASK);
2446 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2447 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2448 return;
2449 }
2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2450 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2451 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2363} 2452}
2364 2453
2454static void vmx_update_window_states(struct kvm_vcpu *vcpu)
2455{
2456 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2457
2458 vcpu->arch.nmi_window_open =
2459 !(guest_intr & (GUEST_INTR_STATE_STI |
2460 GUEST_INTR_STATE_MOV_SS |
2461 GUEST_INTR_STATE_NMI));
2462 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2463 vcpu->arch.nmi_window_open = 0;
2464
2465 vcpu->arch.interrupt_window_open =
2466 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2467 !(guest_intr & (GUEST_INTR_STATE_STI |
2468 GUEST_INTR_STATE_MOV_SS)));
2469}
2470
2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2471static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2366{ 2472{
2367 int word_index = __ffs(vcpu->arch.irq_summary); 2473 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2374 kvm_queue_interrupt(vcpu, irq); 2480 kvm_queue_interrupt(vcpu, irq);
2375} 2481}
2376 2482
2377
2378static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2483static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2379 struct kvm_run *kvm_run) 2484 struct kvm_run *kvm_run)
2380{ 2485{
2381 u32 cpu_based_vm_exec_control; 2486 vmx_update_window_states(vcpu);
2382
2383 vcpu->arch.interrupt_window_open =
2384 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2386 2487
2387 if (vcpu->arch.interrupt_window_open && 2488 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) 2489 if (vcpu->arch.interrupt.pending) {
2389 kvm_do_inject_irq(vcpu); 2490 enable_nmi_window(vcpu);
2491 } else if (vcpu->arch.nmi_window_open) {
2492 vcpu->arch.nmi_pending = false;
2493 vcpu->arch.nmi_injected = true;
2494 } else {
2495 enable_nmi_window(vcpu);
2496 return;
2497 }
2498 }
2499 if (vcpu->arch.nmi_injected) {
2500 vmx_inject_nmi(vcpu);
2501 if (vcpu->arch.nmi_pending)
2502 enable_nmi_window(vcpu);
2503 else if (vcpu->arch.irq_summary
2504 || kvm_run->request_interrupt_window)
2505 enable_irq_window(vcpu);
2506 return;
2507 }
2390 2508
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) 2509 if (vcpu->arch.interrupt_window_open) {
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); 2510 if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2511 kvm_do_inject_irq(vcpu);
2393 2512
2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2513 if (vcpu->arch.interrupt.pending)
2514 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2515 }
2395 if (!vcpu->arch.interrupt_window_open && 2516 if (!vcpu->arch.interrupt_window_open &&
2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2517 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2397 /* 2518 enable_irq_window(vcpu);
2398 * Interrupts blocked. Wait for unblock.
2399 */
2400 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2401 else
2402 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2403 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2404} 2519}
2405 2520
2406static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 2521static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2407{ 2522{
2408 int ret; 2523 int ret;
2409 struct kvm_userspace_memory_region tss_mem = { 2524 struct kvm_userspace_memory_region tss_mem = {
2410 .slot = 8, 2525 .slot = TSS_PRIVATE_MEMSLOT,
2411 .guest_phys_addr = addr, 2526 .guest_phys_addr = addr,
2412 .memory_size = PAGE_SIZE * 3, 2527 .memory_size = PAGE_SIZE * 3,
2413 .flags = 0, 2528 .flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2492 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 2607 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2493 } 2608 }
2494 2609
2495 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ 2610 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2496 return 1; /* already handled by vmx_vcpu_run() */ 2611 return 1; /* already handled by vmx_vcpu_run() */
2497 2612
2498 if (is_no_device(intr_info)) { 2613 if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2581 rep = (exit_qualification & 32) != 0; 2696 rep = (exit_qualification & 32) != 0;
2582 port = exit_qualification >> 16; 2697 port = exit_qualification >> 16;
2583 2698
2699 skip_emulated_instruction(vcpu);
2584 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2700 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
2585} 2701}
2586 2702
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2767 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2883 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2768 2884
2769 KVMTRACE_0D(PEND_INTR, vcpu, handler); 2885 KVMTRACE_0D(PEND_INTR, vcpu, handler);
2886 ++vcpu->stat.irq_window_exits;
2770 2887
2771 /* 2888 /*
2772 * If the user space waits to inject interrupts, exit as soon as 2889 * If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2775 if (kvm_run->request_interrupt_window && 2892 if (kvm_run->request_interrupt_window &&
2776 !vcpu->arch.irq_summary) { 2893 !vcpu->arch.irq_summary) {
2777 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2894 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2778 ++vcpu->stat.irq_window_exits;
2779 return 0; 2895 return 0;
2780 } 2896 }
2781 return 1; 2897 return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2832 2948
2833static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2949static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2834{ 2950{
2951 struct vcpu_vmx *vmx = to_vmx(vcpu);
2835 unsigned long exit_qualification; 2952 unsigned long exit_qualification;
2836 u16 tss_selector; 2953 u16 tss_selector;
2837 int reason; 2954 int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2839 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2956 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2840 2957
2841 reason = (u32)exit_qualification >> 30; 2958 reason = (u32)exit_qualification >> 30;
2959 if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
2960 (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
2961 (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
2962 == INTR_TYPE_NMI_INTR) {
2963 vcpu->arch.nmi_injected = false;
2964 if (cpu_has_virtual_nmis())
2965 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2966 GUEST_INTR_STATE_NMI);
2967 }
2842 tss_selector = exit_qualification; 2968 tss_selector = exit_qualification;
2843 2969
2844 return kvm_task_switch(vcpu, tss_selector, reason); 2970 return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2927 while (!guest_state_valid(vcpu)) { 3053 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3054 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929 3055
2930 switch (err) { 3056 if (err == EMULATE_DO_MMIO)
2931 case EMULATE_DONE: 3057 break;
2932 break; 3058
2933 case EMULATE_DO_MMIO: 3059 if (err != EMULATE_DONE) {
2934 kvm_report_emulation_failure(vcpu, "mmio"); 3060 kvm_report_emulation_failure(vcpu, "emulation failure");
2935 /* TODO: Handle MMIO */ 3061 return;
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 } 3062 }
2941 3063
2942 if (signal_pending(current)) 3064 if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2948 local_irq_disable(); 3070 local_irq_disable();
2949 preempt_disable(); 3071 preempt_disable();
2950 3072
2951 /* Guest state should be valid now, no more emulation should be needed */ 3073 /* Guest state should be valid now except if we need to
2952 vmx->emulation_required = 0; 3074 * emulate an MMIO */
3075 if (guest_state_valid(vcpu))
3076 vmx->emulation_required = 0;
2953} 3077}
2954 3078
2955/* 3079/*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3120 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); 3121 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2998 3122
3123 /* If we need to emulate an MMIO from handle_invalid_guest_state
3124 * we just return 0 */
3125 if (vmx->emulation_required && emulate_invalid_guest_state)
3126 return 0;
3127
2999 /* Access CR3 don't cause VMExit in paging mode, so we need 3128 /* Access CR3 don't cause VMExit in paging mode, so we need
3000 * to sync with guest real CR3. */ 3129 * to sync with guest real CR3. */
3001 if (vm_need_ept() && is_paging(vcpu)) { 3130 if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3012 3141
3013 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 3142 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
3014 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 3143 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
3015 exit_reason != EXIT_REASON_EPT_VIOLATION)) 3144 exit_reason != EXIT_REASON_EPT_VIOLATION &&
3016 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 3145 exit_reason != EXIT_REASON_TASK_SWITCH))
3017 "exit reason is 0x%x\n", __func__, exit_reason); 3146 printk(KERN_WARNING "%s: unexpected, valid vectoring info "
3147 "(0x%x) and exit reason is 0x%x\n",
3148 __func__, vectoring_info, exit_reason);
3149
3150 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3151 if (vcpu->arch.interrupt_window_open) {
3152 vmx->soft_vnmi_blocked = 0;
3153 vcpu->arch.nmi_window_open = 1;
3154 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3155 vcpu->arch.nmi_pending) {
3156 /*
3157 * This CPU don't support us in finding the end of an
3158 * NMI-blocked window if the guest runs with IRQs
3159 * disabled. So we pull the trigger after 1 s of
3160 * futile waiting, but inform the user about this.
3161 */
3162 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
3163 "state on VCPU %d after 1 s timeout\n",
3164 __func__, vcpu->vcpu_id);
3165 vmx->soft_vnmi_blocked = 0;
3166 vmx->vcpu.arch.nmi_window_open = 1;
3167 }
3168 }
3169
3018 if (exit_reason < kvm_vmx_max_exit_handlers 3170 if (exit_reason < kvm_vmx_max_exit_handlers
3019 && kvm_vmx_exit_handlers[exit_reason]) 3171 && kvm_vmx_exit_handlers[exit_reason])
3020 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3172 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
3042 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); 3194 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3043} 3195}
3044 3196
3045static void enable_irq_window(struct kvm_vcpu *vcpu)
3046{
3047 u32 cpu_based_vm_exec_control;
3048
3049 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3050 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
3051 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3052}
3053
3054static void enable_nmi_window(struct kvm_vcpu *vcpu)
3055{
3056 u32 cpu_based_vm_exec_control;
3057
3058 if (!cpu_has_virtual_nmis())
3059 return;
3060
3061 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3062 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
3063 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3064}
3065
3066static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
3067{
3068 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3069 return !(guest_intr & (GUEST_INTR_STATE_NMI |
3070 GUEST_INTR_STATE_MOV_SS |
3071 GUEST_INTR_STATE_STI));
3072}
3073
3074static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
3075{
3076 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3077 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
3078 GUEST_INTR_STATE_STI)) &&
3079 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
3080}
3081
3082static void enable_intr_window(struct kvm_vcpu *vcpu)
3083{
3084 if (vcpu->arch.nmi_pending)
3085 enable_nmi_window(vcpu);
3086 else if (kvm_cpu_has_interrupt(vcpu))
3087 enable_irq_window(vcpu);
3088}
3089
3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3197static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3091{ 3198{
3092 u32 exit_intr_info; 3199 u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3109 if (unblock_nmi && vector != DF_VECTOR) 3216 if (unblock_nmi && vector != DF_VECTOR)
3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3217 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3111 GUEST_INTR_STATE_NMI); 3218 GUEST_INTR_STATE_NMI);
3112 } 3219 } else if (unlikely(vmx->soft_vnmi_blocked))
3220 vmx->vnmi_blocked_time +=
3221 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3113 3222
3114 idt_vectoring_info = vmx->idt_vectoring_info; 3223 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3224 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{ 3256{
3148 update_tpr_threshold(vcpu); 3257 update_tpr_threshold(vcpu);
3149 3258
3150 if (cpu_has_virtual_nmis()) { 3259 vmx_update_window_states(vcpu);
3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { 3260
3152 if (vcpu->arch.interrupt.pending) { 3261 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3153 enable_nmi_window(vcpu); 3262 if (vcpu->arch.interrupt.pending) {
3154 } else if (vmx_nmi_enabled(vcpu)) { 3263 enable_nmi_window(vcpu);
3155 vcpu->arch.nmi_pending = false; 3264 } else if (vcpu->arch.nmi_window_open) {
3156 vcpu->arch.nmi_injected = true; 3265 vcpu->arch.nmi_pending = false;
3157 } else { 3266 vcpu->arch.nmi_injected = true;
3158 enable_intr_window(vcpu); 3267 } else {
3159 return; 3268 enable_nmi_window(vcpu);
3160 }
3161 }
3162 if (vcpu->arch.nmi_injected) {
3163 vmx_inject_nmi(vcpu);
3164 enable_intr_window(vcpu);
3165 return; 3269 return;
3166 } 3270 }
3167 } 3271 }
3272 if (vcpu->arch.nmi_injected) {
3273 vmx_inject_nmi(vcpu);
3274 if (vcpu->arch.nmi_pending)
3275 enable_nmi_window(vcpu);
3276 else if (kvm_cpu_has_interrupt(vcpu))
3277 enable_irq_window(vcpu);
3278 return;
3279 }
3168 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { 3280 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3169 if (vmx_irq_enabled(vcpu)) 3281 if (vcpu->arch.interrupt_window_open)
3170 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); 3282 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3171 else 3283 else
3172 enable_irq_window(vcpu); 3284 enable_irq_window(vcpu);
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3174 if (vcpu->arch.interrupt.pending) { 3286 if (vcpu->arch.interrupt.pending) {
3175 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); 3287 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3176 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); 3288 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3289 if (kvm_cpu_has_interrupt(vcpu))
3290 enable_irq_window(vcpu);
3177 } 3291 }
3178} 3292}
3179 3293
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3213 struct vcpu_vmx *vmx = to_vmx(vcpu); 3327 struct vcpu_vmx *vmx = to_vmx(vcpu);
3214 u32 intr_info; 3328 u32 intr_info;
3215 3329
3330 /* Record the guest's net vcpu time for enforced NMI injections. */
3331 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3332 vmx->entry_time = ktime_get();
3333
3216 /* Handle invalid guest state instead of entering VMX */ 3334 /* Handle invalid guest state instead of entering VMX */
3217 if (vmx->emulation_required && emulate_invalid_guest_state) { 3335 if (vmx->emulation_required && emulate_invalid_guest_state) {
3218 handle_invalid_guest_state(vcpu, kvm_run); 3336 handle_invalid_guest_state(vcpu, kvm_run);
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3327 if (vmx->rmode.irq.pending) 3445 if (vmx->rmode.irq.pending)
3328 fixup_rmode_irq(vmx); 3446 fixup_rmode_irq(vmx);
3329 3447
3330 vcpu->arch.interrupt_window_open = 3448 vmx_update_window_states(vcpu);
3331 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3332 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
3333 3449
3334 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3450 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3335 vmx->launched = 1; 3451 vmx->launched = 1;
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3337 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3453 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3338 3454
3339 /* We need to handle NMIs before interrupts are enabled */ 3455 /* We need to handle NMIs before interrupts are enabled */
3340 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && 3456 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3341 (intr_info & INTR_INFO_VALID_MASK)) { 3457 (intr_info & INTR_INFO_VALID_MASK)) {
3342 KVMTRACE_0D(NMI, vcpu, handler); 3458 KVMTRACE_0D(NMI, vcpu, handler);
3343 asm("int $2"); 3459 asm("int $2");
@@ -3455,6 +3571,11 @@ static int get_ept_level(void)
3455 return VMX_EPT_DEFAULT_GAW + 1; 3571 return VMX_EPT_DEFAULT_GAW + 1;
3456} 3572}
3457 3573
3574static int vmx_get_mt_mask_shift(void)
3575{
3576 return VMX_EPT_MT_EPTE_SHIFT;
3577}
3578
3458static struct kvm_x86_ops vmx_x86_ops = { 3579static struct kvm_x86_ops vmx_x86_ops = {
3459 .cpu_has_kvm_support = cpu_has_kvm_support, 3580 .cpu_has_kvm_support = cpu_has_kvm_support,
3460 .disabled_by_bios = vmx_disabled_by_bios, 3581 .disabled_by_bios = vmx_disabled_by_bios,
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3510 3631
3511 .set_tss_addr = vmx_set_tss_addr, 3632 .set_tss_addr = vmx_set_tss_addr,
3512 .get_tdp_level = get_ept_level, 3633 .get_tdp_level = get_ept_level,
3634 .get_mt_mask_shift = vmx_get_mt_mask_shift,
3513}; 3635};
3514 3636
3515static int __init vmx_init(void) 3637static int __init vmx_init(void)
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void)
3566 bypass_guest_pf = 0; 3688 bypass_guest_pf = 0;
3567 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3689 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3568 VMX_EPT_WRITABLE_MASK | 3690 VMX_EPT_WRITABLE_MASK |
3569 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
3570 VMX_EPT_IGMT_BIT); 3691 VMX_EPT_IGMT_BIT);
3571 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 3692 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3572 VMX_EPT_EXECUTABLE_MASK); 3693 VMX_EPT_EXECUTABLE_MASK,
3694 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3573 kvm_enable_tdp(); 3695 kvm_enable_tdp();
3574 } else 3696 } else
3575 kvm_disable_tdp(); 3697 kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa2..0e6aa8141dcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -39,6 +39,7 @@
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/msr.h> 40#include <asm/msr.h>
41#include <asm/desc.h> 41#include <asm/desc.h>
42#include <asm/mtrr.h>
42 43
43#define MAX_IO_MSRS 256 44#define MAX_IO_MSRS 256
44#define CR0_RESERVED_BITS \ 45#define CR0_RESERVED_BITS \
@@ -86,6 +87,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
86 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 87 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
87 { "hypercalls", VCPU_STAT(hypercalls) }, 88 { "hypercalls", VCPU_STAT(hypercalls) },
88 { "request_irq", VCPU_STAT(request_irq_exits) }, 89 { "request_irq", VCPU_STAT(request_irq_exits) },
90 { "request_nmi", VCPU_STAT(request_nmi_exits) },
89 { "irq_exits", VCPU_STAT(irq_exits) }, 91 { "irq_exits", VCPU_STAT(irq_exits) },
90 { "host_state_reload", VCPU_STAT(host_state_reload) }, 92 { "host_state_reload", VCPU_STAT(host_state_reload) },
91 { "efer_reload", VCPU_STAT(efer_reload) }, 93 { "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +95,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
93 { "insn_emulation", VCPU_STAT(insn_emulation) }, 95 { "insn_emulation", VCPU_STAT(insn_emulation) },
94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 96 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) }, 97 { "irq_injections", VCPU_STAT(irq_injections) },
98 { "nmi_injections", VCPU_STAT(nmi_injections) },
96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 99 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
97 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 100 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 101 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
101 { "mmu_recycled", VM_STAT(mmu_recycled) }, 104 { "mmu_recycled", VM_STAT(mmu_recycled) },
102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 105 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) }, 106 { "mmu_unsync", VM_STAT(mmu_unsync) },
107 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 108 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
105 { "largepages", VM_STAT(lpages) }, 109 { "largepages", VM_STAT(lpages) },
106 { NULL } 110 { NULL }
@@ -312,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
312 kvm_x86_ops->set_cr0(vcpu, cr0); 316 kvm_x86_ops->set_cr0(vcpu, cr0);
313 vcpu->arch.cr0 = cr0; 317 vcpu->arch.cr0 = cr0;
314 318
319 kvm_mmu_sync_global(vcpu);
315 kvm_mmu_reset_context(vcpu); 320 kvm_mmu_reset_context(vcpu);
316 return; 321 return;
317} 322}
@@ -355,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
355 } 360 }
356 kvm_x86_ops->set_cr4(vcpu, cr4); 361 kvm_x86_ops->set_cr4(vcpu, cr4);
357 vcpu->arch.cr4 = cr4; 362 vcpu->arch.cr4 = cr4;
363 kvm_mmu_sync_global(vcpu);
358 kvm_mmu_reset_context(vcpu); 364 kvm_mmu_reset_context(vcpu);
359} 365}
360EXPORT_SYMBOL_GPL(kvm_set_cr4); 366EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +455,7 @@ static u32 msrs_to_save[] = {
449 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 455 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
450#endif 456#endif
451 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 457 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
452 MSR_IA32_PERF_STATUS, 458 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
453}; 459};
454 460
455static unsigned num_msrs_to_save; 461static unsigned num_msrs_to_save;
@@ -648,10 +654,38 @@ static bool msr_mtrr_valid(unsigned msr)
648 654
649static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 655static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
650{ 656{
657 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
658
651 if (!msr_mtrr_valid(msr)) 659 if (!msr_mtrr_valid(msr))
652 return 1; 660 return 1;
653 661
654 vcpu->arch.mtrr[msr - 0x200] = data; 662 if (msr == MSR_MTRRdefType) {
663 vcpu->arch.mtrr_state.def_type = data;
664 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
665 } else if (msr == MSR_MTRRfix64K_00000)
666 p[0] = data;
667 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
668 p[1 + msr - MSR_MTRRfix16K_80000] = data;
669 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
670 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
671 else if (msr == MSR_IA32_CR_PAT)
672 vcpu->arch.pat = data;
673 else { /* Variable MTRRs */
674 int idx, is_mtrr_mask;
675 u64 *pt;
676
677 idx = (msr - 0x200) / 2;
678 is_mtrr_mask = msr - 0x200 - 2 * idx;
679 if (!is_mtrr_mask)
680 pt =
681 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
682 else
683 pt =
684 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
685 *pt = data;
686 }
687
688 kvm_mmu_reset_context(vcpu);
655 return 0; 689 return 0;
656} 690}
657 691
@@ -747,10 +781,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
747 781
748static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 782static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
749{ 783{
784 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
785
750 if (!msr_mtrr_valid(msr)) 786 if (!msr_mtrr_valid(msr))
751 return 1; 787 return 1;
752 788
753 *pdata = vcpu->arch.mtrr[msr - 0x200]; 789 if (msr == MSR_MTRRdefType)
790 *pdata = vcpu->arch.mtrr_state.def_type +
791 (vcpu->arch.mtrr_state.enabled << 10);
792 else if (msr == MSR_MTRRfix64K_00000)
793 *pdata = p[0];
794 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
795 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
796 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
797 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
798 else if (msr == MSR_IA32_CR_PAT)
799 *pdata = vcpu->arch.pat;
800 else { /* Variable MTRRs */
801 int idx, is_mtrr_mask;
802 u64 *pt;
803
804 idx = (msr - 0x200) / 2;
805 is_mtrr_mask = msr - 0x200 - 2 * idx;
806 if (!is_mtrr_mask)
807 pt =
808 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
809 else
810 pt =
811 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
812 *pdata = *pt;
813 }
814
754 return 0; 815 return 0;
755} 816}
756 817
@@ -903,7 +964,6 @@ int kvm_dev_ioctl_check_extension(long ext)
903 case KVM_CAP_IRQCHIP: 964 case KVM_CAP_IRQCHIP:
904 case KVM_CAP_HLT: 965 case KVM_CAP_HLT:
905 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 966 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
906 case KVM_CAP_USER_MEMORY:
907 case KVM_CAP_SET_TSS_ADDR: 967 case KVM_CAP_SET_TSS_ADDR:
908 case KVM_CAP_EXT_CPUID: 968 case KVM_CAP_EXT_CPUID:
909 case KVM_CAP_CLOCKSOURCE: 969 case KVM_CAP_CLOCKSOURCE:
@@ -1188,6 +1248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1188 int t, times = entry->eax & 0xff; 1248 int t, times = entry->eax & 0xff;
1189 1249
1190 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1250 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1251 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1191 for (t = 1; t < times && *nent < maxnent; ++t) { 1252 for (t = 1; t < times && *nent < maxnent; ++t) {
1192 do_cpuid_1_ent(&entry[t], function, 0); 1253 do_cpuid_1_ent(&entry[t], function, 0);
1193 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1254 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1279,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1218 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1279 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1219 /* read more entries until level_type is zero */ 1280 /* read more entries until level_type is zero */
1220 for (i = 1; *nent < maxnent; ++i) { 1281 for (i = 1; *nent < maxnent; ++i) {
1221 level_type = entry[i - 1].ecx & 0xff; 1282 level_type = entry[i - 1].ecx & 0xff00;
1222 if (!level_type) 1283 if (!level_type)
1223 break; 1284 break;
1224 do_cpuid_1_ent(&entry[i], function, i); 1285 do_cpuid_1_ent(&entry[i], function, i);
@@ -1318,6 +1379,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1318 return 0; 1379 return 0;
1319} 1380}
1320 1381
1382static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1383{
1384 vcpu_load(vcpu);
1385 kvm_inject_nmi(vcpu);
1386 vcpu_put(vcpu);
1387
1388 return 0;
1389}
1390
1321static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1391static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1322 struct kvm_tpr_access_ctl *tac) 1392 struct kvm_tpr_access_ctl *tac)
1323{ 1393{
@@ -1377,6 +1447,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1377 r = 0; 1447 r = 0;
1378 break; 1448 break;
1379 } 1449 }
1450 case KVM_NMI: {
1451 r = kvm_vcpu_ioctl_nmi(vcpu);
1452 if (r)
1453 goto out;
1454 r = 0;
1455 break;
1456 }
1380 case KVM_SET_CPUID: { 1457 case KVM_SET_CPUID: {
1381 struct kvm_cpuid __user *cpuid_arg = argp; 1458 struct kvm_cpuid __user *cpuid_arg = argp;
1382 struct kvm_cpuid cpuid; 1459 struct kvm_cpuid cpuid;
@@ -1968,7 +2045,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1968 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2045 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1969 if (ret < 0) 2046 if (ret < 0)
1970 return 0; 2047 return 0;
1971 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 2048 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
1972 return 1; 2049 return 1;
1973} 2050}
1974 2051
@@ -2404,8 +2481,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2404 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2481 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2405 memcpy(vcpu->arch.pio_data, &val, 4); 2482 memcpy(vcpu->arch.pio_data, &val, 4);
2406 2483
2407 kvm_x86_ops->skip_emulated_instruction(vcpu);
2408
2409 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2484 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2410 if (pio_dev) { 2485 if (pio_dev) {
2411 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2486 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2616,7 @@ int kvm_arch_init(void *opaque)
2541 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2616 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2542 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2617 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2543 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2618 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2544 PT_DIRTY_MASK, PT64_NX_MASK, 0); 2619 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
2545 return 0; 2620 return 0;
2546 2621
2547out: 2622out:
@@ -2729,7 +2804,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2729 2804
2730 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2805 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2731 /* when no next entry is found, the current entry[i] is reselected */ 2806 /* when no next entry is found, the current entry[i] is reselected */
2732 for (j = i + 1; j == i; j = (j + 1) % nent) { 2807 for (j = i + 1; ; j = (j + 1) % nent) {
2733 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2808 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2734 if (ej->function == e->function) { 2809 if (ej->function == e->function) {
2735 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2810 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3048,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2973 pr_debug("vcpu %d received sipi with vector # %x\n", 3048 pr_debug("vcpu %d received sipi with vector # %x\n",
2974 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3049 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2975 kvm_lapic_reset(vcpu); 3050 kvm_lapic_reset(vcpu);
2976 r = kvm_x86_ops->vcpu_reset(vcpu); 3051 r = kvm_arch_vcpu_reset(vcpu);
2977 if (r) 3052 if (r)
2978 return r; 3053 return r;
2979 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3054 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3350,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3275 kvm_desct->padding = 0; 3350 kvm_desct->padding = 0;
3276} 3351}
3277 3352
3278static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, 3353static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3279 u16 selector, 3354 u16 selector,
3280 struct descriptor_table *dtable) 3355 struct descriptor_table *dtable)
3281{ 3356{
3282 if (selector & 1 << 2) { 3357 if (selector & 1 << 2) {
3283 struct kvm_segment kvm_seg; 3358 struct kvm_segment kvm_seg;
@@ -3302,7 +3377,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3302 struct descriptor_table dtable; 3377 struct descriptor_table dtable;
3303 u16 index = selector >> 3; 3378 u16 index = selector >> 3;
3304 3379
3305 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3380 get_segment_descriptor_dtable(vcpu, selector, &dtable);
3306 3381
3307 if (dtable.limit < index * 8 + 7) { 3382 if (dtable.limit < index * 8 + 7) {
3308 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3383 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3396,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3321 struct descriptor_table dtable; 3396 struct descriptor_table dtable;
3322 u16 index = selector >> 3; 3397 u16 index = selector >> 3;
3323 3398
3324 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3399 get_segment_descriptor_dtable(vcpu, selector, &dtable);
3325 3400
3326 if (dtable.limit < index * 8 + 7) 3401 if (dtable.limit < index * 8 + 7)
3327 return 1; 3402 return 1;
@@ -3900,6 +3975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3900 /* We do fxsave: this must be aligned. */ 3975 /* We do fxsave: this must be aligned. */
3901 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 3976 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3902 3977
3978 vcpu->arch.mtrr_state.have_fixed = 1;
3903 vcpu_load(vcpu); 3979 vcpu_load(vcpu);
3904 r = kvm_arch_vcpu_reset(vcpu); 3980 r = kvm_arch_vcpu_reset(vcpu);
3905 if (r == 0) 3981 if (r == 0)
@@ -3925,6 +4001,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3925 4001
3926int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4002int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3927{ 4003{
4004 vcpu->arch.nmi_pending = false;
4005 vcpu->arch.nmi_injected = false;
4006
3928 return kvm_x86_ops->vcpu_reset(vcpu); 4007 return kvm_x86_ops->vcpu_reset(vcpu);
3929} 4008}
3930 4009
@@ -4012,6 +4091,7 @@ struct kvm *kvm_arch_create_vm(void)
4012 return ERR_PTR(-ENOMEM); 4091 return ERR_PTR(-ENOMEM);
4013 4092
4014 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4093 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4094 INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4015 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4095 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4016 4096
4017 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4097 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4128,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
4048 4128
4049void kvm_arch_destroy_vm(struct kvm *kvm) 4129void kvm_arch_destroy_vm(struct kvm *kvm)
4050{ 4130{
4051 kvm_iommu_unmap_guest(kvm);
4052 kvm_free_all_assigned_devices(kvm); 4131 kvm_free_all_assigned_devices(kvm);
4132 kvm_iommu_unmap_guest(kvm);
4053 kvm_free_pit(kvm); 4133 kvm_free_pit(kvm);
4054 kfree(kvm->arch.vpic); 4134 kfree(kvm->arch.vpic);
4055 kfree(kvm->arch.vioapic); 4135 kfree(kvm->arch.vioapic);
@@ -4127,7 +4207,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
4127int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4207int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4128{ 4208{
4129 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4209 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4130 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; 4210 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4211 || vcpu->arch.nmi_pending;
4131} 4212}
4132 4213
4133static void vcpu_kick_intr(void *info) 4214static void vcpu_kick_intr(void *info)
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0da..d174db7a3370 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
59#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */
61#define SrcMask (7<<4) 62#define SrcMask (7<<4)
62/* Generic ModRM decode. */ 63/* Generic ModRM decode. */
63#define ModRM (1<<7) 64#define ModRM (1<<7)
@@ -70,17 +71,23 @@
70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 71#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 72#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
72#define GroupMask 0xff /* Group number stored in bits 0:7 */ 73#define GroupMask 0xff /* Group number stored in bits 0:7 */
74/* Source 2 operand type */
75#define Src2None (0<<29)
76#define Src2CL (1<<29)
77#define Src2ImmByte (2<<29)
78#define Src2One (3<<29)
79#define Src2Mask (7<<29)
73 80
74enum { 81enum {
75 Group1_80, Group1_81, Group1_82, Group1_83, 82 Group1_80, Group1_81, Group1_82, Group1_83,
76 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 83 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
77}; 84};
78 85
79static u16 opcode_table[256] = { 86static u32 opcode_table[256] = {
80 /* 0x00 - 0x07 */ 87 /* 0x00 - 0x07 */
81 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
82 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
83 0, 0, 0, 0, 90 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
84 /* 0x08 - 0x0F */ 91 /* 0x08 - 0x0F */
85 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
86 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
195 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 202 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
196}; 203};
197 204
198static u16 twobyte_table[256] = { 205static u32 twobyte_table[256] = {
199 /* 0x00 - 0x0F */ 206 /* 0x00 - 0x0F */
200 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, 207 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
201 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 208 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
230 /* 0x90 - 0x9F */ 237 /* 0x90 - 0x9F */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0xA0 - 0xA7 */ 239 /* 0xA0 - 0xA7 */
233 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 240 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
241 DstMem | SrcReg | Src2ImmByte | ModRM,
242 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
234 /* 0xA8 - 0xAF */ 243 /* 0xA8 - 0xAF */
235 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, 244 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
245 DstMem | SrcReg | Src2ImmByte | ModRM,
246 DstMem | SrcReg | Src2CL | ModRM,
247 ModRM, 0,
236 /* 0xB0 - 0xB7 */ 248 /* 0xB0 - 0xB7 */
237 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 249 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
238 DstMem | SrcReg | ModRM | BitOp, 250 DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
253 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254}; 266};
255 267
256static u16 group_table[] = { 268static u32 group_table[] = {
257 [Group1_80*8] = 269 [Group1_80*8] =
258 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 270 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
259 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 271 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
297 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 309 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
298}; 310};
299 311
300static u16 group2_table[] = { 312static u32 group2_table[] = {
301 [Group7*8] = 313 [Group7*8] =
302 SrcNone | ModRM, 0, 0, 0, 314 SrcNone | ModRM, 0, 0, SrcNone | ModRM,
303 SrcNone | ModRM | DstMem | Mov, 0, 315 SrcNone | ModRM | DstMem | Mov, 0,
304 SrcMem16 | ModRM | Mov, 0, 316 SrcMem16 | ModRM | Mov, 0,
305}; 317};
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
359 "andl %"_msk",%"_LO32 _tmp"; " \ 371 "andl %"_msk",%"_LO32 _tmp"; " \
360 "orl %"_LO32 _tmp",%"_sav"; " 372 "orl %"_LO32 _tmp",%"_sav"; "
361 373
374#ifdef CONFIG_X86_64
375#define ON64(x) x
376#else
377#define ON64(x)
378#endif
379
380#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
381 do { \
382 __asm__ __volatile__ ( \
383 _PRE_EFLAGS("0", "4", "2") \
384 _op _suffix " %"_x"3,%1; " \
385 _POST_EFLAGS("0", "4", "2") \
386 : "=m" (_eflags), "=m" ((_dst).val), \
387 "=&r" (_tmp) \
388 : _y ((_src).val), "i" (EFLAGS_MASK)); \
389 } while (0)
390
391
362/* Raw emulation: instruction has two explicit operands. */ 392/* Raw emulation: instruction has two explicit operands. */
363#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ 393#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
364 do { \ 394 do { \
365 unsigned long _tmp; \ 395 unsigned long _tmp; \
366 \ 396 \
367 switch ((_dst).bytes) { \ 397 switch ((_dst).bytes) { \
368 case 2: \ 398 case 2: \
369 __asm__ __volatile__ ( \ 399 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
370 _PRE_EFLAGS("0", "4", "2") \ 400 break; \
371 _op"w %"_wx"3,%1; " \ 401 case 4: \
372 _POST_EFLAGS("0", "4", "2") \ 402 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
373 : "=m" (_eflags), "=m" ((_dst).val), \ 403 break; \
374 "=&r" (_tmp) \ 404 case 8: \
375 : _wy ((_src).val), "i" (EFLAGS_MASK)); \ 405 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
376 break; \ 406 break; \
377 case 4: \ 407 } \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "4", "2") \
380 _op"l %"_lx"3,%1; " \
381 _POST_EFLAGS("0", "4", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_2op_8byte(_op, _src, _dst, \
388 _eflags, _qx, _qy); \
389 break; \
390 } \
391 } while (0) 408 } while (0)
392 409
393#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ 410#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
394 do { \ 411 do { \
395 unsigned long __tmp; \ 412 unsigned long _tmp; \
396 switch ((_dst).bytes) { \ 413 switch ((_dst).bytes) { \
397 case 1: \ 414 case 1: \
398 __asm__ __volatile__ ( \ 415 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
399 _PRE_EFLAGS("0", "4", "2") \
400 _op"b %"_bx"3,%1; " \
401 _POST_EFLAGS("0", "4", "2") \
402 : "=m" (_eflags), "=m" ((_dst).val), \
403 "=&r" (__tmp) \
404 : _by ((_src).val), "i" (EFLAGS_MASK)); \
405 break; \ 416 break; \
406 default: \ 417 default: \
407 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 418 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
425 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 436 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
426 "w", "r", _LO32, "r", "", "r") 437 "w", "r", _LO32, "r", "", "r")
427 438
428/* Instruction has only one explicit operand (no source operand). */ 439/* Instruction has three operands and one operand is stored in ECX register */
429#define emulate_1op(_op, _dst, _eflags) \ 440#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
430 do { \ 441 do { \
431 unsigned long _tmp; \ 442 unsigned long _tmp; \
432 \ 443 _type _clv = (_cl).val; \
433 switch ((_dst).bytes) { \ 444 _type _srcv = (_src).val; \
434 case 1: \ 445 _type _dstv = (_dst).val; \
435 __asm__ __volatile__ ( \ 446 \
436 _PRE_EFLAGS("0", "3", "2") \ 447 __asm__ __volatile__ ( \
437 _op"b %1; " \ 448 _PRE_EFLAGS("0", "5", "2") \
438 _POST_EFLAGS("0", "3", "2") \ 449 _op _suffix " %4,%1 \n" \
439 : "=m" (_eflags), "=m" ((_dst).val), \ 450 _POST_EFLAGS("0", "5", "2") \
440 "=&r" (_tmp) \ 451 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
441 : "i" (EFLAGS_MASK)); \ 452 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
442 break; \ 453 ); \
443 case 2: \ 454 \
444 __asm__ __volatile__ ( \ 455 (_cl).val = (unsigned long) _clv; \
445 _PRE_EFLAGS("0", "3", "2") \ 456 (_src).val = (unsigned long) _srcv; \
446 _op"w %1; " \ 457 (_dst).val = (unsigned long) _dstv; \
447 _POST_EFLAGS("0", "3", "2") \
448 : "=m" (_eflags), "=m" ((_dst).val), \
449 "=&r" (_tmp) \
450 : "i" (EFLAGS_MASK)); \
451 break; \
452 case 4: \
453 __asm__ __volatile__ ( \
454 _PRE_EFLAGS("0", "3", "2") \
455 _op"l %1; " \
456 _POST_EFLAGS("0", "3", "2") \
457 : "=m" (_eflags), "=m" ((_dst).val), \
458 "=&r" (_tmp) \
459 : "i" (EFLAGS_MASK)); \
460 break; \
461 case 8: \
462 __emulate_1op_8byte(_op, _dst, _eflags); \
463 break; \
464 } \
465 } while (0) 458 } while (0)
466 459
467/* Emulate an instruction with quadword operands (x86/64 only). */ 460#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
468#if defined(CONFIG_X86_64) 461 do { \
469#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ 462 switch ((_dst).bytes) { \
470 do { \ 463 case 2: \
471 __asm__ __volatile__ ( \ 464 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
472 _PRE_EFLAGS("0", "4", "2") \ 465 "w", unsigned short); \
473 _op"q %"_qx"3,%1; " \ 466 break; \
474 _POST_EFLAGS("0", "4", "2") \ 467 case 4: \
475 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ 468 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
476 : _qy ((_src).val), "i" (EFLAGS_MASK)); \ 469 "l", unsigned int); \
470 break; \
471 case 8: \
472 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
473 "q", unsigned long)); \
474 break; \
475 } \
477 } while (0) 476 } while (0)
478 477
479#define __emulate_1op_8byte(_op, _dst, _eflags) \ 478#define __emulate_1op(_op, _dst, _eflags, _suffix) \
480 do { \ 479 do { \
481 __asm__ __volatile__ ( \ 480 unsigned long _tmp; \
482 _PRE_EFLAGS("0", "3", "2") \ 481 \
483 _op"q %1; " \ 482 __asm__ __volatile__ ( \
484 _POST_EFLAGS("0", "3", "2") \ 483 _PRE_EFLAGS("0", "3", "2") \
485 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ 484 _op _suffix " %1; " \
486 : "i" (EFLAGS_MASK)); \ 485 _POST_EFLAGS("0", "3", "2") \
486 : "=m" (_eflags), "+m" ((_dst).val), \
487 "=&r" (_tmp) \
488 : "i" (EFLAGS_MASK)); \
487 } while (0) 489 } while (0)
488 490
489#elif defined(__i386__) 491/* Instruction has only one explicit operand (no source operand). */
490#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) 492#define emulate_1op(_op, _dst, _eflags) \
491#define __emulate_1op_8byte(_op, _dst, _eflags) 493 do { \
492#endif /* __i386__ */ 494 switch ((_dst).bytes) { \
495 case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
496 case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
497 case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
498 case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
499 } \
500 } while (0)
493 501
494/* Fetch next part of the instruction being emulated. */ 502/* Fetch next part of the instruction being emulated. */
495#define insn_fetch(_type, _size, _eip) \ 503#define insn_fetch(_type, _size, _eip) \
@@ -1041,6 +1049,33 @@ done_prefixes:
1041 c->src.bytes = 1; 1049 c->src.bytes = 1;
1042 c->src.val = insn_fetch(s8, 1, c->eip); 1050 c->src.val = insn_fetch(s8, 1, c->eip);
1043 break; 1051 break;
1052 case SrcOne:
1053 c->src.bytes = 1;
1054 c->src.val = 1;
1055 break;
1056 }
1057
1058 /*
1059 * Decode and fetch the second source operand: register, memory
1060 * or immediate.
1061 */
1062 switch (c->d & Src2Mask) {
1063 case Src2None:
1064 break;
1065 case Src2CL:
1066 c->src2.bytes = 1;
1067 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
1068 break;
1069 case Src2ImmByte:
1070 c->src2.type = OP_IMM;
1071 c->src2.ptr = (unsigned long *)c->eip;
1072 c->src2.bytes = 1;
1073 c->src2.val = insn_fetch(u8, 1, c->eip);
1074 break;
1075 case Src2One:
1076 c->src2.bytes = 1;
1077 c->src2.val = 1;
1078 break;
1044 } 1079 }
1045 1080
1046 /* Decode and fetch the destination operand: register or memory. */ 1081 /* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1100 c->regs[VCPU_REGS_RSP]); 1135 c->regs[VCPU_REGS_RSP]);
1101} 1136}
1102 1137
1103static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1138static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1104 struct x86_emulate_ops *ops) 1139 struct x86_emulate_ops *ops)
1105{ 1140{
1106 struct decode_cache *c = &ctxt->decode; 1141 struct decode_cache *c = &ctxt->decode;
1107 int rc; 1142 int rc;
1108 1143
1109 rc = ops->read_std(register_address(c, ss_base(ctxt), 1144 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1110 c->regs[VCPU_REGS_RSP]), 1145 c->regs[VCPU_REGS_RSP]),
1111 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1146 &c->src.val, c->src.bytes, ctxt->vcpu);
1112 if (rc != 0) 1147 if (rc != 0)
1113 return rc; 1148 return rc;
1114 1149
1115 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); 1150 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
1151 return rc;
1152}
1153
1154static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1155 struct x86_emulate_ops *ops)
1156{
1157 struct decode_cache *c = &ctxt->decode;
1158 int rc;
1116 1159
1160 c->src.bytes = c->dst.bytes;
1161 rc = emulate_pop(ctxt, ops);
1162 if (rc != 0)
1163 return rc;
1164 c->dst.val = c->src.val;
1117 return 0; 1165 return 0;
1118} 1166}
1119 1167
@@ -1415,24 +1463,15 @@ special_insn:
1415 emulate_1op("dec", c->dst, ctxt->eflags); 1463 emulate_1op("dec", c->dst, ctxt->eflags);
1416 break; 1464 break;
1417 case 0x50 ... 0x57: /* push reg */ 1465 case 0x50 ... 0x57: /* push reg */
1418 c->dst.type = OP_MEM; 1466 emulate_push(ctxt);
1419 c->dst.bytes = c->op_bytes;
1420 c->dst.val = c->src.val;
1421 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1422 -c->op_bytes);
1423 c->dst.ptr = (void *) register_address(
1424 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1425 break; 1467 break;
1426 case 0x58 ... 0x5f: /* pop reg */ 1468 case 0x58 ... 0x5f: /* pop reg */
1427 pop_instruction: 1469 pop_instruction:
1428 if ((rc = ops->read_std(register_address(c, ss_base(ctxt), 1470 c->src.bytes = c->op_bytes;
1429 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1471 rc = emulate_pop(ctxt, ops);
1430 c->op_bytes, ctxt->vcpu)) != 0) 1472 if (rc != 0)
1431 goto done; 1473 goto done;
1432 1474 c->dst.val = c->src.val;
1433 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1434 c->op_bytes);
1435 c->dst.type = OP_NONE; /* Disable writeback. */
1436 break; 1475 break;
1437 case 0x63: /* movsxd */ 1476 case 0x63: /* movsxd */
1438 if (ctxt->mode != X86EMUL_MODE_PROT64) 1477 if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
1591 emulate_push(ctxt); 1630 emulate_push(ctxt);
1592 break; 1631 break;
1593 case 0x9d: /* popf */ 1632 case 0x9d: /* popf */
1633 c->dst.type = OP_REG;
1594 c->dst.ptr = (unsigned long *) &ctxt->eflags; 1634 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1635 c->dst.bytes = c->op_bytes;
1595 goto pop_instruction; 1636 goto pop_instruction;
1596 case 0xa0 ... 0xa1: /* mov */ 1637 case 0xa0 ... 0xa1: /* mov */
1597 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1638 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
1689 emulate_grp2(ctxt); 1730 emulate_grp2(ctxt);
1690 break; 1731 break;
1691 case 0xc3: /* ret */ 1732 case 0xc3: /* ret */
1733 c->dst.type = OP_REG;
1692 c->dst.ptr = &c->eip; 1734 c->dst.ptr = &c->eip;
1735 c->dst.bytes = c->op_bytes;
1693 goto pop_instruction; 1736 goto pop_instruction;
1694 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 1737 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1695 mov: 1738 mov:
@@ -1778,7 +1821,7 @@ special_insn:
1778 c->eip = saved_eip; 1821 c->eip = saved_eip;
1779 goto cannot_emulate; 1822 goto cannot_emulate;
1780 } 1823 }
1781 return 0; 1824 break;
1782 case 0xf4: /* hlt */ 1825 case 0xf4: /* hlt */
1783 ctxt->vcpu->arch.halt_request = 1; 1826 ctxt->vcpu->arch.halt_request = 1;
1784 break; 1827 break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
1999 c->src.val &= (c->dst.bytes << 3) - 1; 2042 c->src.val &= (c->dst.bytes << 3) - 1;
2000 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); 2043 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
2001 break; 2044 break;
2045 case 0xa4: /* shld imm8, r, r/m */
2046 case 0xa5: /* shld cl, r, r/m */
2047 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2048 break;
2002 case 0xab: 2049 case 0xab:
2003 bts: /* bts */ 2050 bts: /* bts */
2004 /* only subword offset */ 2051 /* only subword offset */
2005 c->src.val &= (c->dst.bytes << 3) - 1; 2052 c->src.val &= (c->dst.bytes << 3) - 1;
2006 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 2053 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
2007 break; 2054 break;
2055 case 0xac: /* shrd imm8, r, r/m */
2056 case 0xad: /* shrd cl, r, r/m */
2057 emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
2058 break;
2008 case 0xae: /* clflush */ 2059 case 0xae: /* clflush */
2009 break; 2060 break;
2010 case 0xb0 ... 0xb1: /* cmpxchg */ 2061 case 0xb0 ... 0xb1: /* cmpxchg */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 50a779264bb1..a7ed208f81e3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -738,7 +738,7 @@ static void lguest_time_init(void)
738 738
739 /* We can't set cpumask in the initializer: damn C limitations! Set it 739 /* We can't set cpumask in the initializer: damn C limitations! Set it
740 * here and register our timer device. */ 740 * here and register our timer device. */
741 lguest_clockevent.cpumask = cpumask_of_cpu(0); 741 lguest_clockevent.cpumask = cpumask_of(0);
742 clockevents_register_device(&lguest_clockevent); 742 clockevents_register_device(&lguest_clockevent);
743 743
744 /* Finally, we unblock the timer interrupt. */ 744 /* Finally, we unblock the timer interrupt. */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 5c7cef34c9e7..10b9bd35a8ff 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -30,21 +30,6 @@ ENTRY(lguest_entry)
30 movl $lguest_data - __PAGE_OFFSET, %edx 30 movl $lguest_data - __PAGE_OFFSET, %edx
31 int $LGUEST_TRAP_ENTRY 31 int $LGUEST_TRAP_ENTRY
32 32
33 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
34 * instruction uses %esi implicitly as the source for the copy we're
35 * about to do. */
36 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
37
38 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
39 * This means the first 128M of kernel memory will be mapped at
40 * PAGE_OFFSET where the kernel expects to run. This will get it far
41 * enough through boot to switch to its own pagetables. */
42 movl $32, %ecx
43 movl %esi, %edi
44 addl $((__PAGE_OFFSET >> 22) * 4), %edi
45 rep
46 movsl
47
48 /* Set up the initial stack so we can run C code. */ 33 /* Set up the initial stack so we can run C code. */
49 movl $(init_thread_union+THREAD_SIZE),%esp 34 movl $(init_thread_union+THREAD_SIZE),%esp
50 35
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f6..4a20b2f9a381 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
39#define __do_strncpy_from_user(dst, src, count, res) \ 39#define __do_strncpy_from_user(dst, src, count, res) \
40do { \ 40do { \
41 int __d0, __d1, __d2; \ 41 int __d0, __d1, __d2; \
42 might_sleep(); \ 42 might_fault(); \
43 __asm__ __volatile__( \ 43 __asm__ __volatile__( \
44 " testl %1,%1\n" \ 44 " testl %1,%1\n" \
45 " jz 2f\n" \ 45 " jz 2f\n" \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
126#define __do_clear_user(addr,size) \ 126#define __do_clear_user(addr,size) \
127do { \ 127do { \
128 int __d0; \ 128 int __d0; \
129 might_sleep(); \ 129 might_fault(); \
130 __asm__ __volatile__( \ 130 __asm__ __volatile__( \
131 "0: rep; stosl\n" \ 131 "0: rep; stosl\n" \
132 " movl %2,%0\n" \ 132 " movl %2,%0\n" \
@@ -155,7 +155,7 @@ do { \
155unsigned long 155unsigned long
156clear_user(void __user *to, unsigned long n) 156clear_user(void __user *to, unsigned long n)
157{ 157{
158 might_sleep(); 158 might_fault();
159 if (access_ok(VERIFY_WRITE, to, n)) 159 if (access_ok(VERIFY_WRITE, to, n))
160 __do_clear_user(to, n); 160 __do_clear_user(to, n);
161 return n; 161 return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
197 unsigned long mask = -__addr_ok(s); 197 unsigned long mask = -__addr_ok(s);
198 unsigned long res, tmp; 198 unsigned long res, tmp;
199 199
200 might_sleep(); 200 might_fault();
201 201
202 __asm__ __volatile__( 202 __asm__ __volatile__(
203 " testl %0, %0\n" 203 " testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718b..64d6c84e6353 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
15#define __do_strncpy_from_user(dst,src,count,res) \ 15#define __do_strncpy_from_user(dst,src,count,res) \
16do { \ 16do { \
17 long __d0, __d1, __d2; \ 17 long __d0, __d1, __d2; \
18 might_sleep(); \ 18 might_fault(); \
19 __asm__ __volatile__( \ 19 __asm__ __volatile__( \
20 " testq %1,%1\n" \ 20 " testq %1,%1\n" \
21 " jz 2f\n" \ 21 " jz 2f\n" \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
64unsigned long __clear_user(void __user *addr, unsigned long size) 64unsigned long __clear_user(void __user *addr, unsigned long size)
65{ 65{
66 long __d0; 66 long __d0;
67 might_sleep(); 67 might_fault();
68 /* no memory constraint because it doesn't change any memory gcc knows 68 /* no memory constraint because it doesn't change any memory gcc knows
69 about */ 69 about */
70 asm volatile( 70 asm volatile(
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 3624a364b7f3..bc4c7840b2a8 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -42,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
42 { } 42 { }
43}; 43};
44 44
45static cpumask_t vector_allocation_domain(int cpu) 45static void vector_allocation_domain(int cpu, cpumask_t *retmask)
46{ 46{
47 return cpumask_of_cpu(cpu); 47 cpus_clear(*retmask);
48 cpu_set(cpu, *retmask);
48} 49}
49 50
50static int probe_bigsmp(void) 51static int probe_bigsmp(void)
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 7b4e6d0d1690..4ba5ccaa1584 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -87,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
87} 87}
88#endif 88#endif
89 89
90static cpumask_t vector_allocation_domain(int cpu) 90static void vector_allocation_domain(int cpu, cpumask_t *retmask)
91{ 91{
92 /* Careful. Some cpus do not strictly honor the set of cpus 92 /* Careful. Some cpus do not strictly honor the set of cpus
93 * specified in the interrupt destination when using lowest 93 * specified in the interrupt destination when using lowest
@@ -97,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu)
97 * deliver interrupts to the wrong hyperthread when only one 97 * deliver interrupts to the wrong hyperthread when only one
98 * hyperthread was specified in the interrupt desitination. 98 * hyperthread was specified in the interrupt desitination.
99 */ 99 */
100 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 100 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
101 return domain;
102} 101}
103 102
104struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); 103struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 71a309b122e6..511d7941364f 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
38 return 0; 38 return 0;
39} 39}
40 40
41static cpumask_t vector_allocation_domain(int cpu) 41static void vector_allocation_domain(int cpu, cpumask_t *retmask)
42{ 42{
43 /* Careful. Some cpus do not strictly honor the set of cpus 43 /* Careful. Some cpus do not strictly honor the set of cpus
44 * specified in the interrupt destination when using lowest 44 * specified in the interrupt destination when using lowest
@@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu)
48 * deliver interrupts to the wrong hyperthread when only one 48 * deliver interrupts to the wrong hyperthread when only one
49 * hyperthread was specified in the interrupt desitination. 49 * hyperthread was specified in the interrupt desitination.
50 */ 50 */
51 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 51 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
52 return domain;
53} 52}
54 53
55struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); 54struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 2c6d234e0009..2821ffc188b5 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -24,7 +24,7 @@ static int probe_summit(void)
24 return 0; 24 return 0;
25} 25}
26 26
27static cpumask_t vector_allocation_domain(int cpu) 27static void vector_allocation_domain(int cpu, cpumask_t *retmask)
28{ 28{
29 /* Careful. Some cpus do not strictly honor the set of cpus 29 /* Careful. Some cpus do not strictly honor the set of cpus
30 * specified in the interrupt destination when using lowest 30 * specified in the interrupt destination when using lowest
@@ -34,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu)
34 * deliver interrupts to the wrong hyperthread when only one 34 * deliver interrupts to the wrong hyperthread when only one
35 * hyperthread was specified in the interrupt desitination. 35 * hyperthread was specified in the interrupt desitination.
36 */ 36 */
37 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 37 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
38 return domain;
39} 38}
40 39
41struct genapic apic_summit = APIC_INIT("summit", probe_summit); 40struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 52145007bd7e..a5bc05492b1e 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1;
63/* Used for the invalidate map that's also checked in the spinlock */ 63/* Used for the invalidate map that's also checked in the spinlock */
64static volatile unsigned long smp_invalidate_needed; 64static volatile unsigned long smp_invalidate_needed;
65 65
66/* Bitmask of currently online CPUs - used by setup.c for
67 /proc/cpuinfo, visible externally but still physical */
68cpumask_t cpu_online_map = CPU_MASK_NONE;
69EXPORT_SYMBOL(cpu_online_map);
70
71/* Bitmask of CPUs present in the system - exported by i386_syms.c, used 66/* Bitmask of CPUs present in the system - exported by i386_syms.c, used
72 * by scheduler but indexed physically */ 67 * by scheduler but indexed physically */
73cpumask_t phys_cpu_present_map = CPU_MASK_NONE; 68cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
@@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
218/* This is for the new dynamic CPU boot code */ 213/* This is for the new dynamic CPU boot code */
219cpumask_t cpu_callin_map = CPU_MASK_NONE; 214cpumask_t cpu_callin_map = CPU_MASK_NONE;
220cpumask_t cpu_callout_map = CPU_MASK_NONE; 215cpumask_t cpu_callout_map = CPU_MASK_NONE;
221cpumask_t cpu_possible_map = CPU_MASK_NONE;
222EXPORT_SYMBOL(cpu_possible_map);
223 216
224/* The per processor IRQ masks (these are usually kept in sync) */ 217/* The per processor IRQ masks (these are usually kept in sync) */
225static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; 218static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
@@ -679,7 +672,7 @@ void __init smp_boot_cpus(void)
679 672
680 /* loop over all the extended VIC CPUs and boot them. The 673 /* loop over all the extended VIC CPUs and boot them. The
681 * Quad CPUs must be bootstrapped by their extended VIC cpu */ 674 * Quad CPUs must be bootstrapped by their extended VIC cpu */
682 for (i = 0; i < NR_CPUS; i++) { 675 for (i = 0; i < nr_cpu_ids; i++) {
683 if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) 676 if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
684 continue; 677 continue;
685 do_boot_cpu(i); 678 do_boot_cpu(i);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ad98b18f2b48..f99a6c6c432e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/pci.h>
24#include <linux/pfn.h> 25#include <linux/pfn.h>
25#include <linux/poison.h> 26#include <linux/poison.h>
26#include <linux/bootmem.h> 27#include <linux/bootmem.h>
@@ -971,6 +972,8 @@ void __init mem_init(void)
971 int codesize, reservedpages, datasize, initsize; 972 int codesize, reservedpages, datasize, initsize;
972 int tmp; 973 int tmp;
973 974
975 pci_iommu_alloc();
976
974#ifdef CONFIG_FLATMEM 977#ifdef CONFIG_FLATMEM
975 BUG_ON(!mem_map); 978 BUG_ON(!mem_map);
976#endif 979#endif
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cebcbf152d46..71a14f89f89e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -278,7 +278,7 @@ void __init numa_init_array(void)
278 int rr, i; 278 int rr, i;
279 279
280 rr = first_node(node_online_map); 280 rr = first_node(node_online_map);
281 for (i = 0; i < NR_CPUS; i++) { 281 for (i = 0; i < nr_cpu_ids; i++) {
282 if (early_cpu_to_node(i) != NUMA_NO_NODE) 282 if (early_cpu_to_node(i) != NUMA_NO_NODE)
283 continue; 283 continue;
284 numa_set_node(i, rr); 284 numa_set_node(i, rr);
@@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
549 memnodemap[0] = 0; 549 memnodemap[0] = 0;
550 node_set_online(0); 550 node_set_online(0);
551 node_set(0, node_possible_map); 551 node_set(0, node_possible_map);
552 for (i = 0; i < NR_CPUS; i++) 552 for (i = 0; i < nr_cpu_ids; i++)
553 numa_set_node(i, 0); 553 numa_set_node(i, 0);
554 e820_register_active_regions(0, start_pfn, last_pfn); 554 e820_register_active_regions(0, start_pfn, last_pfn);
555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 51c0a2fc14fe..09737c8af074 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
382 if (!node_online(i)) 382 if (!node_online(i))
383 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 383 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
384 384
385 for (i = 0; i < NR_CPUS; i++) { 385 for (i = 0; i < nr_cpu_ids; i++) {
386 int node = early_cpu_to_node(i); 386 int node = early_cpu_to_node(i);
387 387
388 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 509513760a6e..98658f25f542 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -65,11 +65,13 @@ static unsigned long reset_value[NUM_COUNTERS];
65#define IBS_FETCH_BEGIN 3 65#define IBS_FETCH_BEGIN 3
66#define IBS_OP_BEGIN 4 66#define IBS_OP_BEGIN 4
67 67
68/* The function interface needs to be fixed, something like add 68/*
69 data. Should then be added to linux/oprofile.h. */ 69 * The function interface needs to be fixed, something like add
70 * data. Should then be added to linux/oprofile.h.
71 */
70extern void 72extern void
71oprofile_add_ibs_sample(struct pt_regs *const regs, 73oprofile_add_ibs_sample(struct pt_regs * const regs,
72 unsigned int *const ibs_sample, int ibs_code); 74 unsigned int * const ibs_sample, int ibs_code);
73 75
74struct ibs_fetch_sample { 76struct ibs_fetch_sample {
75 /* MSRC001_1031 IBS Fetch Linear Address Register */ 77 /* MSRC001_1031 IBS Fetch Linear Address Register */
@@ -104,11 +106,6 @@ struct ibs_op_sample {
104 unsigned int ibs_dc_phys_high; 106 unsigned int ibs_dc_phys_high;
105}; 107};
106 108
107/*
108 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
109*/
110static void clear_ibs_nmi(void);
111
112static int ibs_allowed; /* AMD Family10h and later */ 109static int ibs_allowed; /* AMD Family10h and later */
113 110
114struct op_ibs_config { 111struct op_ibs_config {
@@ -223,7 +220,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
223 (unsigned int *)&ibs_fetch, 220 (unsigned int *)&ibs_fetch,
224 IBS_FETCH_BEGIN); 221 IBS_FETCH_BEGIN);
225 222
226 /*reenable the IRQ */ 223 /* reenable the IRQ */
227 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); 224 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
228 high &= ~IBS_FETCH_HIGH_VALID_BIT; 225 high &= ~IBS_FETCH_HIGH_VALID_BIT;
229 high |= IBS_FETCH_HIGH_ENABLE; 226 high |= IBS_FETCH_HIGH_ENABLE;
@@ -331,8 +328,10 @@ static void op_amd_stop(struct op_msrs const * const msrs)
331 unsigned int low, high; 328 unsigned int low, high;
332 int i; 329 int i;
333 330
334 /* Subtle: stop on all counters to avoid race with 331 /*
335 * setting our pm callback */ 332 * Subtle: stop on all counters to avoid race with setting our
333 * pm callback
334 */
336 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 335 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
337 if (!reset_value[i]) 336 if (!reset_value[i])
338 continue; 337 continue;
@@ -343,13 +342,15 @@ static void op_amd_stop(struct op_msrs const * const msrs)
343 342
344#ifdef CONFIG_OPROFILE_IBS 343#ifdef CONFIG_OPROFILE_IBS
345 if (ibs_allowed && ibs_config.fetch_enabled) { 344 if (ibs_allowed && ibs_config.fetch_enabled) {
346 low = 0; /* clear max count and enable */ 345 /* clear max count and enable */
346 low = 0;
347 high = 0; 347 high = 0;
348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); 348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
349 } 349 }
350 350
351 if (ibs_allowed && ibs_config.op_enabled) { 351 if (ibs_allowed && ibs_config.op_enabled) {
352 low = 0; /* clear max count and enable */ 352 /* clear max count and enable */
353 low = 0;
353 high = 0; 354 high = 0;
354 wrmsr(MSR_AMD64_IBSOPCTL, low, high); 355 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
355 } 356 }
@@ -370,18 +371,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
370 } 371 }
371} 372}
372 373
373#ifndef CONFIG_OPROFILE_IBS 374#ifdef CONFIG_OPROFILE_IBS
374
375/* no IBS support */
376
377static int op_amd_init(struct oprofile_operations *ops)
378{
379 return 0;
380}
381
382static void op_amd_exit(void) {}
383
384#else
385 375
386static u8 ibs_eilvt_off; 376static u8 ibs_eilvt_off;
387 377
@@ -395,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
395 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); 385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
396} 386}
397 387
398static int pfm_amd64_setup_eilvt(void) 388static int init_ibs_nmi(void)
399{ 389{
400#define IBSCTL_LVTOFFSETVAL (1 << 8) 390#define IBSCTL_LVTOFFSETVAL (1 << 8)
401#define IBSCTL 0x1cc 391#define IBSCTL 0x1cc
@@ -443,18 +433,22 @@ static int pfm_amd64_setup_eilvt(void)
443 return 0; 433 return 0;
444} 434}
445 435
446/* 436/* uninitialize the APIC for the IBS interrupts if needed */
447 * initialize the APIC for the IBS interrupts 437static void clear_ibs_nmi(void)
448 * if available (AMD Family10h rev B0 and later) 438{
449 */ 439 if (ibs_allowed)
450static void setup_ibs(void) 440 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
441}
442
443/* initialize the APIC for the IBS interrupts if available */
444static void ibs_init(void)
451{ 445{
452 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); 446 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
453 447
454 if (!ibs_allowed) 448 if (!ibs_allowed)
455 return; 449 return;
456 450
457 if (pfm_amd64_setup_eilvt()) { 451 if (init_ibs_nmi()) {
458 ibs_allowed = 0; 452 ibs_allowed = 0;
459 return; 453 return;
460 } 454 }
@@ -462,14 +456,12 @@ static void setup_ibs(void)
462 printk(KERN_INFO "oprofile: AMD IBS detected\n"); 456 printk(KERN_INFO "oprofile: AMD IBS detected\n");
463} 457}
464 458
465 459static void ibs_exit(void)
466/*
467 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
468 * rev B0 and later */
469static void clear_ibs_nmi(void)
470{ 460{
471 if (ibs_allowed) 461 if (!ibs_allowed)
472 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); 462 return;
463
464 clear_ibs_nmi();
473} 465}
474 466
475static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 467static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -519,7 +511,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
519 511
520static int op_amd_init(struct oprofile_operations *ops) 512static int op_amd_init(struct oprofile_operations *ops)
521{ 513{
522 setup_ibs(); 514 ibs_init();
523 create_arch_files = ops->create_files; 515 create_arch_files = ops->create_files;
524 ops->create_files = setup_ibs_files; 516 ops->create_files = setup_ibs_files;
525 return 0; 517 return 0;
@@ -527,10 +519,21 @@ static int op_amd_init(struct oprofile_operations *ops)
527 519
528static void op_amd_exit(void) 520static void op_amd_exit(void)
529{ 521{
530 clear_ibs_nmi(); 522 ibs_exit();
531} 523}
532 524
533#endif 525#else
526
527/* no IBS support */
528
529static int op_amd_init(struct oprofile_operations *ops)
530{
531 return 0;
532}
533
534static void op_amd_exit(void) {}
535
536#endif /* CONFIG_OPROFILE_IBS */
534 537
535struct op_x86_model_spec const op_amd_spec = { 538struct op_x86_model_spec const op_amd_spec = {
536 .init = op_amd_init, 539 .init = op_amd_init,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 773d68d3e912..503c240e26c7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1082,7 +1082,7 @@ static void drop_other_mm_ref(void *info)
1082 1082
1083static void xen_drop_mm_ref(struct mm_struct *mm) 1083static void xen_drop_mm_ref(struct mm_struct *mm)
1084{ 1084{
1085 cpumask_t mask; 1085 cpumask_var_t mask;
1086 unsigned cpu; 1086 unsigned cpu;
1087 1087
1088 if (current->active_mm == mm) { 1088 if (current->active_mm == mm) {
@@ -1094,7 +1094,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1094 } 1094 }
1095 1095
1096 /* Get the "official" set of cpus referring to our pagetable. */ 1096 /* Get the "official" set of cpus referring to our pagetable. */
1097 mask = mm->cpu_vm_mask; 1097 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1098 for_each_online_cpu(cpu) {
1099 if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
1100 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1101 continue;
1102 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1103 }
1104 return;
1105 }
1106 cpumask_copy(mask, &mm->cpu_vm_mask);
1098 1107
1099 /* It's possible that a vcpu may have a stale reference to our 1108 /* It's possible that a vcpu may have a stale reference to our
1100 cr3, because its in lazy mode, and it hasn't yet flushed 1109 cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1103,11 +1112,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1103 if needed. */ 1112 if needed. */
1104 for_each_online_cpu(cpu) { 1113 for_each_online_cpu(cpu) {
1105 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1114 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1106 cpu_set(cpu, mask); 1115 cpumask_set_cpu(cpu, mask);
1107 } 1116 }
1108 1117
1109 if (!cpus_empty(mask)) 1118 if (!cpumask_empty(mask))
1110 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1119 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1120 free_cpumask_var(mask);
1111} 1121}
1112#else 1122#else
1113static void xen_drop_mm_ref(struct mm_struct *mm) 1123static void xen_drop_mm_ref(struct mm_struct *mm)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index acd9b6705e02..c44e2069c7c7 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -33,7 +33,7 @@
33#include "xen-ops.h" 33#include "xen-ops.h"
34#include "mmu.h" 34#include "mmu.h"
35 35
36cpumask_t xen_cpu_initialized_map; 36cpumask_var_t xen_cpu_initialized_map;
37 37
38static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
39static DEFINE_PER_CPU(int, callfunc_irq); 39static DEFINE_PER_CPU(int, callfunc_irq);
@@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void)
158{ 158{
159 int i, rc; 159 int i, rc;
160 160
161 for (i = 0; i < NR_CPUS; i++) { 161 for (i = 0; i < nr_cpu_ids; i++) {
162 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 162 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
163 if (rc >= 0) { 163 if (rc >= 0) {
164 num_processors++; 164 num_processors++;
@@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
192 if (xen_smp_intr_init(0)) 192 if (xen_smp_intr_init(0))
193 BUG(); 193 BUG();
194 194
195 xen_cpu_initialized_map = cpumask_of_cpu(0); 195 if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
196 panic("could not allocate xen_cpu_initialized_map\n");
197
198 cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
196 199
197 /* Restrict the possible_map according to max_cpus. */ 200 /* Restrict the possible_map according to max_cpus. */
198 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { 201 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
199 for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--) 202 for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
200 continue; 203 continue;
201 cpu_clear(cpu, cpu_possible_map); 204 cpu_clear(cpu, cpu_possible_map);
202 } 205 }
@@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
221 struct vcpu_guest_context *ctxt; 224 struct vcpu_guest_context *ctxt;
222 struct desc_struct *gdt; 225 struct desc_struct *gdt;
223 226
224 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 227 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
225 return 0; 228 return 0;
226 229
227 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); 230 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
@@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu)
408 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 411 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
409} 412}
410 413
411static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 414static void xen_send_IPI_mask(const struct cpumask *mask,
415 enum ipi_vector vector)
412{ 416{
413 unsigned cpu; 417 unsigned cpu;
414 418
415 cpus_and(mask, mask, cpu_online_map); 419 for_each_cpu_and(cpu, mask, cpu_online_mask)
416
417 for_each_cpu_mask_nr(cpu, mask)
418 xen_send_IPI_one(cpu, vector); 420 xen_send_IPI_one(cpu, vector);
419} 421}
420 422
421static void xen_smp_send_call_function_ipi(cpumask_t mask) 423static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
422{ 424{
423 int cpu; 425 int cpu;
424 426
425 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 427 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
426 428
427 /* Make sure other vcpus get a chance to run if they need to. */ 429 /* Make sure other vcpus get a chance to run if they need to. */
428 for_each_cpu_mask_nr(cpu, mask) { 430 for_each_cpu(cpu, mask) {
429 if (xen_vcpu_stolen(cpu)) { 431 if (xen_vcpu_stolen(cpu)) {
430 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 432 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
431 break; 433 break;
@@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask)
435 437
436static void xen_smp_send_call_function_single_ipi(int cpu) 438static void xen_smp_send_call_function_single_ipi(int cpu)
437{ 439{
438 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 440 xen_send_IPI_mask(cpumask_of(cpu),
441 XEN_CALL_FUNCTION_SINGLE_VECTOR);
439} 442}
440 443
441static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 444static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 2a234db5949b..212ffe012b76 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled)
35 pfn_to_mfn(xen_start_info->console.domU.mfn); 35 pfn_to_mfn(xen_start_info->console.domU.mfn);
36 } else { 36 } else {
37#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
38 xen_cpu_initialized_map = cpu_online_map; 38 BUG_ON(xen_cpu_initialized_map == NULL);
39 cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
39#endif 40#endif
40 xen_vcpu_restore(); 41 xen_vcpu_restore();
41 } 42 }
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c9f7cda48ed7..65d75a6be0ba 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -437,7 +437,7 @@ void xen_setup_timer(int cpu)
437 evt = &per_cpu(xen_clock_events, cpu); 437 evt = &per_cpu(xen_clock_events, cpu);
438 memcpy(evt, xen_clockevent, sizeof(*evt)); 438 memcpy(evt, xen_clockevent, sizeof(*evt));
439 439
440 evt->cpumask = cpumask_of_cpu(cpu); 440 evt->cpumask = cpumask_of(cpu);
441 evt->irq = irq; 441 evt->irq = irq;
442 442
443 setup_runstate_info(cpu); 443 setup_runstate_info(cpu);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9e1afae8461f..c1f8faf0a2c5 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu); 58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu); 59void xen_uninit_lock_cpu(int cpu);
60 60
61extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_var_t xen_cpu_initialized_map;
62#else 62#else
63static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}
64#endif 64#endif