Diffstat (limited to 'arch/x86/kernel')
76 files changed, 2329 insertions, 1142 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b22083721..fedf32a8c3ec 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
 endif
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200)		+= scx200.o
 scx200-y			+= scx200_32.o
 
 obj-$(CONFIG_OLPC)		+= olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE)	+= olpc_ofw.o
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 
 microcode-y			:= microcode_core.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb7a5f052e2b..fb16f17e59be 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -61,7 +61,7 @@ struct cstate_entry {
 		unsigned int ecx;
 	} states[ACPI_PROCESSOR_MAX_POWER];
 };
-static struct cstate_entry *cpu_cstate_entry;	/* per CPU ptr */
+static struct cstate_entry __percpu *cpu_cstate_entry;	/* per CPU ptr */
 
 static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
 
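The __percpu marker above is a sparse annotation: it tags cpu_cstate_entry as a per-CPU allocation that must be reached through the per-CPU accessors rather than plain dereference, so sparse (a make C=1 build) can flag direct uses. A minimal sketch of that access pattern, with a reduced stand-in struct and a hypothetical init function, not code from this patch:

#include <linux/percpu.h>
#include <linux/smp.h>

struct cstate_entry { unsigned int eax, ecx; };		/* reduced stand-in */
static struct cstate_entry __percpu *cpu_cstate_entry;

static int __init percpu_example_init(void)		/* hypothetical */
{
	struct cstate_entry *entry;

	cpu_cstate_entry = alloc_percpu(struct cstate_entry);
	if (!cpu_cstate_entry)
		return -ENOMEM;

	/* per_cpu_ptr() turns the tagged pointer into a real one for one CPU */
	entry = per_cpu_ptr(cpu_cstate_entry, raw_smp_processor_id());
	entry->eax = 0;

	return 0;
}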
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
	movl	%eax, %ecx
	orl	%edx, %ecx
	jz	1f
-	movl	$0xc0000080, %ecx
+	movl	$MSR_EFER, %ecx
	wrmsr
1:
 
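The wakeup change is purely cosmetic: 0xc0000080 is the EFER MSR, and asm/msr-index.h already carries a named constant for it, to the best of my recollection along these lines:

#define MSR_EFER	0xc0000080 /* extended feature register */

Using the symbol keeps the resume assembly greppable against the rest of the kernel's MSR users.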
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c7..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
+		BUG_ON(a->cpuid >= NCAPINTS*32);
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 #ifdef CONFIG_X86_64
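The new BUG_ON documents an invariant of the alternatives table: a->cpuid is a feature-bit number, and boot_cpu_has() indexes a fixed-size capability bitmap, so any bit at or beyond NCAPINTS*32 would read past the array. A reduced model of that indexing (not the kernel's actual definitions; the NCAPINTS value is an assumption):

#define NCAPINTS 10			/* assumed number of 32-bit capability words */
static u32 x86_capability[NCAPINTS];

static int has_feature(unsigned int bit)
{
	/* out-of-bounds read unless bit < NCAPINTS * 32 - hence the BUG_ON */
	return (x86_capability[bit >> 5] >> (bit & 31)) & 1;
}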
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c6..679b6450382b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1953,6 +1953,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
			   size_t size,
			   int dir)
 {
+	dma_addr_t flush_addr;
 	dma_addr_t i, start;
 	unsigned int pages;
 
@@ -1960,6 +1961,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
 	    (dma_addr + size > dma_dom->aperture_size))
 		return;
 
+	flush_addr = dma_addr;
 	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
 	dma_addr &= PAGE_MASK;
 	start = dma_addr;
@@ -1974,7 +1976,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
 
 	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
-		iommu_flush_pages(&dma_dom->domain, dma_addr, size);
+		iommu_flush_pages(&dma_dom->domain, flush_addr, size);
 		dma_dom->need_flush = false;
 	}
 }
@@ -2572,6 +2574,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
				    unsigned long cap)
 {
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return 1;
+	}
+
 	return 0;
 }
 
@@ -2609,8 +2616,7 @@ int __init amd_iommu_init_passthrough(void)
 
 	pt_domain->mode |= PAGE_MODE_NONE;
 
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+	for_each_pci_dev(dev) {
 		if (!check_device(&dev->dev))
 			continue;
 
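The passthrough-init hunk is behavior-preserving: as far as I know, for_each_pci_dev() in linux/pci.h expands to the very loop it replaces,

#define for_each_pci_dev(d) \
	while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)

and pci_get_device() drops the reference on the device passed in while taking one on the device returned, so the iteration stays refcount-correct. The flush_addr fix in the earlier hunk is the substantive change: dma_addr is masked to a page boundary before the flush, so saving and flushing the original, unmasked address keeps the flushed range aligned with what the caller actually mapped.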
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3cc63e2b8dd4..5a170cbbbed8 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -632,6 +632,13 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
					MMIO_GET_LD(range));
 	iommu->evt_msi_num = MMIO_MSI_NUM(misc);
+
+	if (is_rd890_iommu(iommu->dev)) {
+		pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
+		pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
+		pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
+		pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
+	}
 }
 
 /*
@@ -649,29 +656,9 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	struct ivhd_entry *e;
 
 	/*
-	 * First set the recommended feature enable bits from ACPI
-	 * into the IOMMU control registers
+	 * First save the recommended feature enable bits from ACPI
 	 */
-	h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
-		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
-	h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
-	h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
-	h->flags & IVHD_FLAG_ISOC_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
-		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
-	/*
-	 * make IOMMU memory accesses cache coherent
-	 */
-	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+	iommu->acpi_flags = h->flags;
 
 	/*
	 * Done. Now parse the device entries
@@ -1116,6 +1103,40 @@ static void init_device_table(void)
 	}
 }
 
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+	iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+	iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+	/*
+	 * make IOMMU memory accesses cache coherent
+	 */
+	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+}
+
+static void iommu_apply_quirks(struct amd_iommu *iommu)
+{
+	if (is_rd890_iommu(iommu->dev)) {
+		pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
+		pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
+		pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
+		pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
+	}
+}
+
 /*
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
@@ -1126,6 +1147,8 @@ static void enable_iommus(void)
 
 	for_each_iommu(iommu) {
 		iommu_disable(iommu);
+		iommu_apply_quirks(iommu);
+		iommu_init_flags(iommu);
 		iommu_set_device_table(iommu);
 		iommu_enable_command_buffer(iommu);
 		iommu_enable_event_buffer(iommu);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
 
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
+#include <asm/mrst.h>
 
 #define APBT_MASK			CLOCKSOURCE_MASK(32)
 #define APBT_SHIFT			22
-#define APBT_CLOCKEVENT_RATING		150
+#define APBT_CLOCKEVENT_RATING		110
 #define APBT_CLOCKSOURCE_RATING		250
 #define APBT_MIN_DELTA_USEC		200
 
@@ -83,8 +84,6 @@ struct apbt_dev {
 	char name[10];
 };
 
-int disable_apbt_percpu __cpuinitdata;
-
 static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
 #ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
 };
 
 /*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp("apbt_only", arg) == 0)
-		disable_apbt_percpu = 0;
-	else if (strcmp("lapic_and_apbt", arg) == 0)
-		disable_apbt_percpu = 1;
-	else {
-		pr_warning("X86 MRST timer option %s not recognised"
-			   " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
-			   arg);
-		return -EINVAL;
-	}
-	return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
  * start count down from 0xffff_ffff. this is done by toggling the enable bit
  * then load initial load count to ~0.
  */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
 	adev->num = smp_processor_id();
 	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
 
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
 		global_clock_event = &adev->evt;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-	if (disable_apbt_percpu || !apb_timer_block_enabled)
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+	    !apb_timer_block_enabled)
 		return 0;
 	/* This notifier should be called after workqueue is ready */
 	hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
 	int timer_num;
 	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
 
+	BUG_ON(!apbt_virt_address);
+
 	timer_num = adev->num;
 	pr_debug("%s CPU %d timer %d mode=%d\n",
		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
 	}
 #ifdef CONFIG_SMP
 	/* kernel cmdline disable apb timer, so we will use lapic timers */
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		printk(KERN_INFO "apbt: disabled per cpu timer\n");
 		return;
 	}
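The dropped setup_x86_mrst_timer() parser does not disappear: the new mrst_timer_options flag it tested comes from asm/mrst.h, so the x86_mrst_timer= option handling presumably moves to the MRST platform code. A hedged sketch of the flag this file now compares against, where every enumerator except MRST_TIMER_LAPIC_APBT (which the hunks above use) is an assumption:

enum mrst_timer_options {
	MRST_TIMER_DEFAULT,		/* assumed */
	MRST_TIMER_APBT_ONLY,		/* assumed */
	MRST_TIMER_LAPIC_APBT,
};

extern enum mrst_timer_options mrst_timer_options;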
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
 	 * or BIOS forget to put that in reserved.
 	 * try to update e820 to make that region as reserved.
 	 */
-	u32 agp_aper_base = 0, agp_aper_order = 0;
+	u32 agp_aper_order = 0;
 	int i, fix, slot, valid_agp = 0;
 	u32 ctl;
 	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
 		return;
 
 	/* This is mostly duplicate of iommu_hole_init */
-	agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+	search_agp_bridge(&agp_aper_order, &valid_agp);
 
 	fix = 0;
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR)	+= hw_nmi.o
+
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 980508c79082..e3b534cda49a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void)
 		 * acpi lapic path already maps that address in
 		 * acpi_register_lapic_address()
 		 */
-		if (!acpi_lapic)
+		if (!acpi_lapic && !smp_found_config)
 			set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
 		apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87feb..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
  * GSI override for ES7000 platforms.
  */
 
-static unsigned int base;
 
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+u64 hw_nmi_get_sample_period(void)
+{
+	return (u64)(cpu_khz) * 1000 * 60;
+}
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+void arch_trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+	printk(KERN_INFO "sending NMI to all CPUs:\n");
+	apic->send_IPI_all(NMI_VECTOR);
+
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpumask_empty(to_cpumask(backtrace_mask)))
+			break;
+		mdelay(1);
+	}
+}
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+	int cpu = smp_processor_id();
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	regs = args->regs;
+
+	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+		static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+		arch_spin_lock(&lock);
+		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+		show_regs(regs);
+		dump_stack();
+		arch_spin_unlock(&lock);
+		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+		return NOTIFY_STOP;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static __read_mostly struct notifier_block backtrace_notifier = {
+	.notifier_call          = arch_trigger_all_cpu_backtrace_handler,
+	.next                   = NULL,
+	.priority               = 1
+};
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+	register_die_notifier(&backtrace_notifier);
+	return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
+#endif
+
+/* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
+atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
+int unknown_nmi_panic;
+void cpu_nmi_set_wd_enabled(void) { return; }
+void stop_apic_nmi_watchdog(void *unused) { return; }
+void setup_apic_nmi_watchdog(void *unused) { return; }
+int __init check_nmi_watchdog(void) { return 0; }
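On the sample period in hw_nmi.c: cpu_khz * 1000 is the core's cycle rate in Hz, so the returned value is sixty seconds' worth of unhalted cycles. As a worked example, on a nominal 2 GHz part cpu_khz is 2,000,000, giving 2,000,000 * 1000 * 60 = 1.2e11 cycles as the perf-event period between hardlockup-detector NMIs.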
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..5c5b8f3dddb5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -306,14 +306,19 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
 
 	old_cfg = old_desc->chip_data;
 
-	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+	cfg->vector = old_cfg->vector;
+	cfg->move_in_progress = old_cfg->move_in_progress;
+	cpumask_copy(cfg->domain, old_cfg->domain);
+	cpumask_copy(cfg->old_domain, old_cfg->old_domain);
 
 	init_copy_irq_2_pin(old_cfg, cfg, node);
 }
 
-static void free_irq_cfg(struct irq_cfg *old_cfg)
+static void free_irq_cfg(struct irq_cfg *cfg)
 {
-	kfree(old_cfg);
+	free_cpumask_var(cfg->domain);
+	free_cpumask_var(cfg->old_domain);
+	kfree(cfg);
 }
 
 void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -1728,6 +1733,8 @@ __apicdebuginit(void) print_IO_APIC(void)
 		struct irq_pin_list *entry;
 
 		cfg = desc->chip_data;
+		if (!cfg)
+			continue;
 		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
@@ -3397,7 +3404,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
 	cfg = desc->chip_data;
 
-	read_msi_msg_desc(desc, &msg);
+	get_cached_msi_msg_desc(desc, &msg);
 
 	msg.data &= ~MSI_DATA_VECTOR_MASK;
 	msg.data |= MSI_DATA_VECTOR(cfg->vector);
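The arch_init_copy_chip_data() rewrite matters because irq_cfg embeds cpumask_var_t members: with CONFIG_CPUMASK_OFFSTACK those are pointers, so a struct-wide memcpy() silently aliases the old descriptor's masks, and the reworked free_irq_cfg() would then free the same storage twice. A reduced sketch of the hazard, with a hypothetical struct rather than the real irq_cfg:

struct cfg_like {
	cpumask_var_t domain;		/* a pointer when masks are off-stack */
	unsigned int vector;
};

static void copy_cfg(struct cfg_like *dst, const struct cfg_like *src)
{
	/* WRONG: memcpy(dst, src, sizeof(*dst)) would leave dst->domain
	 * and src->domain pointing at the same storage */
	dst->vector = src->vector;
	cpumask_copy(dst->domain, src->domain);	/* dst->domain allocated beforehand
						 * with alloc_cpumask_var() */
}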
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 	int cpu = smp_processor_id();
 	int rc = 0;
 
-	/* check for other users first */
-	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-	    == NOTIFY_STOP) {
-		rc = 1;
-		touched = 1;
-	}
-
 	sum = get_timer_irqs(cpu);
 
 	if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..f744f54cb248 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 {
 	if (reason != DIE_NMI_IPI)
 		return NOTIFY_OK;
+
+	if (in_crash_kexec)
+		/* do nothing if entering the crash kernel */
+		return NOTIFY_OK;
 	/*
 	 * Use a lock so only one cpu prints at a time
 	 * to prevent intermixed output.
@@ -694,9 +698,11 @@ void __init uv_system_init(void)
 		for (j = 0; j < 64; j++) {
 			if (!test_bit(j, &present))
 				continue;
-			uv_blade_info[blade].pnode = (i * 64 + j);
+			pnode = (i * 64 + j);
+			uv_blade_info[blade].pnode = pnode;
 			uv_blade_info[blade].nr_possible_cpus = 0;
 			uv_blade_info[blade].nr_online_cpus = 0;
+			max_pnode = max(pnode, max_pnode);
 			blade++;
 		}
 	}
@@ -734,7 +740,6 @@ void __init uv_system_init(void)
 		uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
-		max_pnode = max(pnode, max_pnode);
 	}
 
 	/* Add blade/pnode info for nodes without cpus */
@@ -746,7 +751,6 @@ void __init uv_system_init(void)
 		pnode = (paddr >> m_val) & pnode_mask;
 		blade = boot_pnode_to_blade(pnode);
 		uv_node_to_blade[nid] = blade;
-		max_pnode = max(pnode, max_pnode);
 	}
 
 	map_gru_high(max_pnode);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
 
-obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
+obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 
-obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32)	+= bugs.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
 
 obj-$(CONFIG_CPU_SUP_INTEL)	+= intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..ba5f62f45f01 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		}
 
 	}
-	if (c->x86 == 0x10 || c->x86 == 0x11)
+	if (c->x86 >= 0x10)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
 	/* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 			num_cache_leaves = 3;
 	}
 
-	if (c->x86 >= 0xf && c->x86 <= 0x11)
+	if (c->x86 >= 0xf)
 		set_cpu_cap(c, X86_FEATURE_K8);
 
 	if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		fam10h_check_enable_mmcfg();
 	}
 
-	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+	if (c == &boot_cpu_data && c->x86 >= 0xf) {
 		unsigned long long tseg;
 
 		/*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
 };
 
 cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ *	AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ *			   AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ *			   AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+const int amd_erratum_400[] =
+	AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+			    AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
+
+const int amd_erratum_383[] =
+	AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+	struct cpuinfo_x86 *cpu = &current_cpu_data;
+	int osvw_id = *erratum++;
+	u32 range;
+	u32 ms;
+
+	/*
+	 * If called early enough that current_cpu_data hasn't been initialized
+	 * yet, fall back to boot_cpu_data.
+	 */
+	if (cpu->x86 == 0)
+		cpu = &boot_cpu_data;
+
+	if (cpu->x86_vendor != X86_VENDOR_AMD)
+		return false;
+
+	if (osvw_id >= 0 && osvw_id < 65536 &&
+	    cpu_has(cpu, X86_FEATURE_OSVW)) {
+		u64 osvw_len;
+
+		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+		if (osvw_id < osvw_len) {
+			u64 osvw_bits;
+
+			rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+			       osvw_bits);
+			return osvw_bits & (1ULL << (osvw_id & 0x3f));
+		}
+	}
+
+	/* OSVW unavailable or ID unknown, match family-model-stepping range */
+	ms = (cpu->x86_model << 4) | cpu->x86_mask;
+	while ((range = *erratum++))
+		if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+		    (ms >= AMD_MODEL_RANGE_START(range)) &&
+		    (ms <= AMD_MODEL_RANGE_END(range)))
+			return true;
+
+	return false;
+}
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
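A hedged usage sketch for the new errata interface - a caller elsewhere in the tree would guard a workaround roughly like this (the surrounding condition and comment are invented):

	if (cpu_has_amd_erratum(amd_erratum_400)) {
		/* apply the erratum-400 workaround here */
	}

The OSVW path above queries AMD's OS Visible Workaround MSRs first and only falls back to the family/model/stepping tables when those are unavailable, which is why each erratum array leads with an OSVW id.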
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
-	u8 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u8 *)ptr;
-	if (prev == old)
-		*(u8 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
-	u16 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u16 *)ptr;
-	if (prev == old)
-		*(u16 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
-	u32 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u32 *)ptr;
-	if (prev == old)
-		*(u32 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
-	u64 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u64 *)ptr;
-	if (prev == old)
-		*(u64 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
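This deletion pairs with the cpu/Makefile hunk above that stops building cmpxchg.o from this directory; presumably the 386/486 emulation is relocated (arch/x86/lib would be the natural home) rather than dropped outright, since kernels built without CONFIG_X86_CMPXCHG still need it. The API these helpers back is unchanged; for reference, the classic lock-free retry loop cmpxchg() enables looks like this (a sketch with simplified types):

static void atomic_inc_ulong(volatile unsigned long *p)
{
	unsigned long old, seen;

	seen = *p;
	for (;;) {
		old = seen;
		seen = cmpxchg(p, old, old + 1);
		if (seen == old)
			break;		/* our compare-and-swap won */
		/* another CPU raced us; retry against the fresh value */
	}
}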
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..f2f9ac7da25c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 static int __init x86_xsave_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
 	return 1;
 }
 __setup("noxsave", x86_xsave_setup);
 
+static int __init x86_xsaveopt_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -537,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
 		}
 	}
 }
 
-static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 {
 	u32 tfms, xlvl;
 	u32 ebx;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[4] = excap;
 	}
 
+	/* Additional Intel-defined flags: level 0x00000007 */
+	if (c->cpuid_level >= 0x00000007) {
+		u32 eax, ebx, ecx, edx;
+
+		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+		if (eax > 0)
+			c->x86_capability[9] = ebx;
+	}
+
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level >= 0x80000007)
 		c->x86_power = cpuid_edx(0x80000007);
 
+	init_scattered_cpuid_features(c);
 }
 
 static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 
 	get_model_name(c); /* Default name */
 
-	init_scattered_cpuid_features(c);
 	detect_nopl(c);
 }
 
@@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void)
 	dbg_restore_debug_regs();
 
 	fpu_init();
+	xsave_init();
 
 	raw_local_save_flags(kernel_eflags);
 
@@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void)
 	clear_used_math();
 	mxcsr_feature_mask_init();
 
-	/*
-	 * Boot processor to setup the FP and extended state context info.
-	 */
-	if (smp_processor_id() == boot_cpu_id)
-		init_thread_xstate();
-
+	fpu_init();
 	xsave_init();
 }
 #endif
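On the new CPUID leaf 7 probe in get_cpu_cap(): subleaf 0 is selected through ECX, and its EBX output becomes capability word 9. A hedged user-space sketch of the same query, where the inline-asm helper mirrors, but is not, the kernel's cpuid_count() (assumes a target where EBX is usable in inline asm):

#include <stdio.h>

static void cpuid_count(unsigned leaf, unsigned sub,
			unsigned *a, unsigned *b, unsigned *c, unsigned *d)
{
	asm volatile("cpuid"
		     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		     : "0" (leaf), "2" (sub));
}

int main(void)
{
	unsigned a, b, c, d;

	cpuid_count(0, 0, &a, &b, &c, &d);	/* leaf 0: max basic leaf in EAX */
	if (a >= 7) {
		cpuid_count(7, 0, &a, &b, &c, &d);
		printf("leaf 7 ebx (capability word 9): 0x%08x\n", b);
	}
	return 0;
}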
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..f668bb1f7d43 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,5 +33,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
			    *const __x86_cpu_dev_end[];
 
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
 
 #endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 246cd3afbb5f..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -72,7 +72,7 @@ struct acpi_cpufreq_data {
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
 /* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
+static struct acpi_processor_performance __percpu *acpi_perf_data;
 
 static struct cpufreq_driver acpi_cpufreq_driver;
 
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb622..4f6f679f2799 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
 	u32 output_offset;
 };
 
-static struct pcc_cpu *pcc_cpu_info;
+static struct pcc_cpu __percpu *pcc_cpu_info;
 
 static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
 {
@@ -368,16 +368,22 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
 		return -ENODEV;
 
 	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_BUFFER)
-		return -ENODEV;
+	if (out_obj->type != ACPI_TYPE_BUFFER) {
+		ret = -ENODEV;
+		goto out_free;
+	}
 
 	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
-	if (errors)
-		return -ENODEV;
+	if (errors) {
+		ret = -ENODEV;
+		goto out_free;
+	}
 
 	supported = *((u32 *)(out_obj->buffer.pointer + 4));
-	if (!(supported & 0x1))
-		return -ENODEV;
+	if (!(supported & 0x1)) {
+		ret = -ENODEV;
+		goto out_free;
+	}
 
 out_free:
 	kfree(output.pointer);
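The pcc_cpufreq_do_osc() change fixes a leak: output.pointer is a buffer that ACPI allocates for the caller, and the three early returns skipped the kfree() at out_free. The reshaped function is the standard centralized-exit idiom; a minimal generic sketch in plain C, with hypothetical names:

#include <stdlib.h>

static int header_valid(const char *buf) { return buf[0] != 0; }	/* hypothetical stand-in */

int parse_report(size_t size)
{
	int ret = 0;
	char *buf = malloc(size);	/* stands in for the ACPI-allocated buffer */

	if (!buf)
		return -1;		/* nothing to clean up yet */

	if (!header_valid(buf)) {
		ret = -1;
		goto out_free;		/* every failure path funnels here... */
	}

out_free:
	free(buf);			/* ...so the buffer is freed exactly once */
	return ret;
}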
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
+#ifdef CONFIG_XEN_PVHVM
+	&x86_hyper_xen_hvm,
+#endif
 };
 
 const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 85f69cdeae10..b4389441efbb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
 		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 		c->cpuid_level = cpuid_eax(0);
+		get_cpu_cap(c);
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf5..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
 	return l3;
 }
 
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
+					   int index)
 {
 	int node;
 
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	this_leaf->l3 = l3_caches[node];
 }
 
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
+{
+	unsigned int reg = 0;
+
+	pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+
+	/* check whether this slot is activated already */
+	if (reg & (3UL << 30))
+		return reg & 0xfff;
+
+	return -1;
+}
+
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
				  unsigned int slot)
 {
-	struct pci_dev *dev = this_leaf->l3->dev;
-	unsigned int reg = 0;
+	int index;
 
 	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
-	if (!dev)
-		return -EINVAL;
+	index = amd_get_l3_disable_slot(this_leaf->l3, slot);
+	if (index >= 0)
+		return sprintf(buf, "%d\n", index);
 
-	pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
-	return sprintf(buf, "0x%08x\n", reg);
+	return sprintf(buf, "FREE\n");
 }
 
 #define SHOW_CACHE_DISABLE(slot)					\
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
 	}
 }
 
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-				   const char *buf, size_t count,
-				   unsigned int slot)
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3:    L3 cache descriptor
+ * @cpu:   A CPU on the node containing the L3 cache
+ * @slot:  slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
+			    unsigned long index)
 {
-	struct pci_dev *dev = this_leaf->l3->dev;
-	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	unsigned long val = 0;
+	int ret = 0;
 
 #define SUBCACHE_MASK	(3UL << 20)
 #define SUBCACHE_INDEX	0xfff
 
-	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+	/*
+	 * check whether this slot is already used or
+	 * the index is already disabled
+	 */
+	ret = amd_get_l3_disable_slot(l3, slot);
+	if (ret >= 0)
 		return -EINVAL;
 
+	/*
+	 * check whether the other slot has disabled the
+	 * same index already
+	 */
+	if (index == amd_get_l3_disable_slot(l3, !slot))
+		return -EINVAL;
+
+	/* do not allow writes outside of allowed bits */
+	if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+	    ((index & SUBCACHE_INDEX) > l3->indices))
+		return -EINVAL;
+
+	amd_l3_disable_index(l3, cpu, slot, index);
+
+	return 0;
+}
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+				   const char *buf, size_t count,
+				   unsigned int slot)
+{
+	unsigned long val = 0;
+	int cpu, err = 0;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!dev)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
-	if (strict_strtoul(buf, 10, &val) < 0)
-		return -EINVAL;
+	cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
 
-	/* do not allow writes outside of allowed bits */
-	if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
+	if (strict_strtoul(buf, 10, &val) < 0)
 		return -EINVAL;
 
-	amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-
+	err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
+	if (err) {
+		if (err == -EEXIST)
+			printk(KERN_WARNING "L3 disable slot %d in use!\n",
+			       slot);
+		return err;
+	}
 	return count;
 }
 
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
 
 #else	/* CONFIG_CPU_SUP_AMD */
 static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
 {
 };
 #endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		amd_check_l3_disable(index, this_leaf);
+		amd_check_l3_disable(this_leaf, index);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
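The cacheinfo rework splits the sysfs store path into reusable helpers: amd_get_l3_disable_slot() reports what a slot holds and amd_set_l3_disable_slot() validates and programs it. A hedged call-site sketch, with l3, cpu in scope and the index value invented:

	int in_use = amd_get_l3_disable_slot(l3, 0);	/* >= 0: slot 0 already taken */

	if (in_use < 0) {
		int err = amd_set_l3_disable_slot(l3, cpu, 0, 0x7ba);
		if (err)
			pr_warning("disabling L3 index failed: %d\n", err);
	}

One quirk worth noting: the store path prints a message for -EEXIST, but the helper as shown returns -EINVAL for an occupied slot, so that branch looks unreachable in this version.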
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 745b54f9be89..8209472b27a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -80,7 +80,7 @@ int apei_write_mce(struct mce *m)
 	rcd.hdr.revision = CPER_RECORD_REV;
 	rcd.hdr.signature_end = CPER_SIG_END;
 	rcd.hdr.section_count = 1;
-	rcd.hdr.error_severity = CPER_SER_FATAL;
+	rcd.hdr.error_severity = CPER_SEV_FATAL;
 	/* timestamp, platform_id, partition_id are all invalid */
 	rcd.hdr.validation_bits = 0;
 	rcd.hdr.record_length = sizeof(rcd);
@@ -96,7 +96,7 @@ int apei_write_mce(struct mce *m)
 	rcd.sec_hdr.validation_bits = 0;
 	rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
 	rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
-	rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+	rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
 
 	memcpy(&rcd.mce, m, sizeof(*m));
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1970ef911c99..ed41562909fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -51,7 +51,7 @@ | |||
51 | static DEFINE_MUTEX(mce_read_mutex); | 51 | static DEFINE_MUTEX(mce_read_mutex); |
52 | 52 | ||
53 | #define rcu_dereference_check_mce(p) \ | 53 | #define rcu_dereference_check_mce(p) \ |
54 | rcu_dereference_check((p), \ | 54 | rcu_dereference_index_check((p), \ |
55 | rcu_read_lock_sched_held() || \ | 55 | rcu_read_lock_sched_held() || \ |
56 | lockdep_is_held(&mce_read_mutex)) | 56 | lockdep_is_held(&mce_read_mutex)) |
57 | 57 | ||
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); | |||
107 | static int default_decode_mce(struct notifier_block *nb, unsigned long val, | 107 | static int default_decode_mce(struct notifier_block *nb, unsigned long val, |
108 | void *data) | 108 | void *data) |
109 | { | 109 | { |
110 | pr_emerg("No human readable MCE decoding support on this CPU type.\n"); | 110 | pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); |
111 | pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); | 111 | pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); |
112 | 112 | ||
113 | return NOTIFY_STOP; | 113 | return NOTIFY_STOP; |
114 | } | 114 | } |
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce) | |||
211 | 211 | ||
212 | static void print_mce(struct mce *m) | 212 | static void print_mce(struct mce *m) |
213 | { | 213 | { |
214 | pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | 214 | pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", |
215 | m->extcpu, m->mcgstatus, m->bank, m->status); | 215 | m->extcpu, m->mcgstatus, m->bank, m->status); |
216 | 216 | ||
217 | if (m->ip) { | 217 | if (m->ip) { |
218 | pr_emerg("RIP%s %02x:<%016Lx> ", | 218 | pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", |
219 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | 219 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", |
220 | m->cs, m->ip); | 220 | m->cs, m->ip); |
221 | 221 | ||
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m) | |||
224 | pr_cont("\n"); | 224 | pr_cont("\n"); |
225 | } | 225 | } |
226 | 226 | ||
227 | pr_emerg("TSC %llx ", m->tsc); | 227 | pr_emerg(HW_ERR "TSC %llx ", m->tsc); |
228 | if (m->addr) | 228 | if (m->addr) |
229 | pr_cont("ADDR %llx ", m->addr); | 229 | pr_cont("ADDR %llx ", m->addr); |
230 | if (m->misc) | 230 | if (m->misc) |
231 | pr_cont("MISC %llx ", m->misc); | 231 | pr_cont("MISC %llx ", m->misc); |
232 | 232 | ||
233 | pr_cont("\n"); | 233 | pr_cont("\n"); |
234 | pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | 234 | pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", |
235 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); | 235 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); |
236 | 236 | ||
237 | /* | 237 | /* |
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m) | |||
241 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); | 241 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); |
242 | } | 242 | } |
243 | 243 | ||
244 | static void print_mce_head(void) | ||
245 | { | ||
246 | pr_emerg("\nHARDWARE ERROR\n"); | ||
247 | } | ||
248 | |||
249 | static void print_mce_tail(void) | ||
250 | { | ||
251 | pr_emerg("This is not a software problem!\n"); | ||
252 | } | ||
253 | |||
254 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | 244 | #define PANIC_TIMEOUT 5 /* 5 seconds */ |
255 | 245 | ||
256 | static atomic_t mce_paniced; | 246 | static atomic_t mce_paniced; |
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
291 | if (atomic_inc_return(&mce_fake_paniced) > 1) | 281 | if (atomic_inc_return(&mce_fake_paniced) > 1) |
292 | return; | 282 | return; |
293 | } | 283 | } |
294 | print_mce_head(); | ||
295 | /* First print corrected ones that are still unlogged */ | 284 | /* First print corrected ones that are still unlogged */ |
296 | for (i = 0; i < MCE_LOG_LEN; i++) { | 285 | for (i = 0; i < MCE_LOG_LEN; i++) { |
297 | struct mce *m = &mcelog.entry[i]; | 286 | struct mce *m = &mcelog.entry[i]; |
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
322 | apei_err = apei_write_mce(final); | 311 | apei_err = apei_write_mce(final); |
323 | } | 312 | } |
324 | if (cpu_missing) | 313 | if (cpu_missing) |
325 | printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); | 314 | pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); |
326 | print_mce_tail(); | ||
327 | if (exp) | 315 | if (exp) |
328 | printk(KERN_EMERG "Machine check: %s\n", exp); | 316 | pr_emerg(HW_ERR "Machine check: %s\n", exp); |
329 | if (!fake_panic) { | 317 | if (!fake_panic) { |
330 | if (panic_timeout == 0) | 318 | if (panic_timeout == 0) |
331 | panic_timeout = mce_panic_timeout; | 319 | panic_timeout = mce_panic_timeout; |
332 | panic(msg); | 320 | panic(msg); |
333 | } else | 321 | } else |
334 | printk(KERN_EMERG "Fake kernel panic: %s\n", msg); | 322 | pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); |
335 | } | 323 | } |
336 | 324 | ||
337 | /* Support code for software error injection */ | 325 | /* Support code for software error injection */ |
@@ -1221,7 +1209,7 @@ int mce_notify_irq(void) | |||
1221 | schedule_work(&mce_trigger_work); | 1209 | schedule_work(&mce_trigger_work); |
1222 | 1210 | ||
1223 | if (__ratelimit(&ratelimit)) | 1211 | if (__ratelimit(&ratelimit)) |
1224 | printk(KERN_INFO "Machine check events logged\n"); | 1212 | pr_info(HW_ERR "Machine check events logged\n"); |
1225 | 1213 | ||
1226 | return 1; | 1214 | return 1; |
1227 | } | 1215 | } |
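The mce.c hunks above are largely a logging cleanup: every printk/pr_* call gains the HW_ERR prefix so all hardware-error lines share one greppable marker. A minimal userspace sketch of the pattern, with pr_emerg() and the marker string stood in as assumptions (the real definitions live in the kernel headers of this tree):

```c
#include <stdio.h>

/* Illustrative stand-ins: in the kernel, HW_ERR lives in a shared
 * header and pr_emerg() wraps printk(KERN_EMERG ...). */
#define HW_ERR   "[Hardware Error]: "
#define pr_emerg(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)

int main(void)
{
	/* Adjacent string literals concatenate at compile time, so the
	 * prefix costs nothing extra at run time and every hardware
	 * error line greps alike. */
	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %llx\n",
		 0, 0x5ULL);
	return 0;
}
```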
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 224392d8fe8c..5e975298fa81 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
@@ -530,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
530 | err = -ENOMEM; | 530 | err = -ENOMEM; |
531 | goto out; | 531 | goto out; |
532 | } | 532 | } |
533 | if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { | 533 | if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { |
534 | kfree(b); | 534 | kfree(b); |
535 | err = -ENOMEM; | 535 | err = -ENOMEM; |
536 | goto out; | 536 | goto out; |
@@ -543,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
543 | #ifndef CONFIG_SMP | 543 | #ifndef CONFIG_SMP |
544 | cpumask_setall(b->cpus); | 544 | cpumask_setall(b->cpus); |
545 | #else | 545 | #else |
546 | cpumask_copy(b->cpus, c->llc_shared_map); | 546 | cpumask_set_cpu(cpu, b->cpus); |
547 | #endif | 547 | #endif |
548 | 548 | ||
549 | per_cpu(threshold_banks, cpu)[bank] = b; | 549 | per_cpu(threshold_banks, cpu)[bank] = b; |
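The mce_amd.c change pairs zalloc_cpumask_var() with cpumask_set_cpu(): a zeroed allocation guarantees the mask contains exactly the CPUs set afterwards, which matters once the copy from llc_shared_map is dropped. A userspace sketch of the same idea, with illustrative stand-ins for the cpumask API:

```c
#include <stdlib.h>

/* Userspace stand-in for the cpumask pattern in the hunk above:
 * allocating zeroed state means the only CPUs present in the mask
 * are the ones explicitly set afterwards. */
typedef struct { unsigned long bits[4]; } cpumask_t;	/* up to 256 CPUs */

static int zalloc_cpumask(cpumask_t **mask)
{
	*mask = calloc(1, sizeof(**mask));	/* zeroed, like zalloc_ */
	return *mask != NULL;
}

static void cpumask_set_cpu(int cpu, cpumask_t *mask)
{
	mask->bits[cpu / (8 * sizeof(long))] |=
		1UL << (cpu % (8 * sizeof(long)));
}

int main(void)
{
	cpumask_t *cpus;

	if (!zalloc_cpumask(&cpus))
		return 1;
	cpumask_set_cpu(3, cpus);	/* mask now contains exactly CPU 3 */
	free(cpus);
	return 0;
}
```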
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920a..6fcd0936194f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot) | |||
95 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 95 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
96 | 96 | ||
97 | /* Already owned by someone else? */ | 97 | /* Already owned by someone else? */ |
98 | if (val & CMCI_EN) { | 98 | if (val & MCI_CTL2_CMCI_EN) { |
99 | if (test_and_clear_bit(i, owned) && !boot) | 99 | if (test_and_clear_bit(i, owned) && !boot) |
100 | print_update("SHD", &hdr, i); | 100 | print_update("SHD", &hdr, i); |
101 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 101 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
102 | continue; | 102 | continue; |
103 | } | 103 | } |
104 | 104 | ||
105 | val |= CMCI_EN | CMCI_THRESHOLD; | 105 | val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; |
106 | val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; | ||
106 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 107 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
107 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 108 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
108 | 109 | ||
109 | /* Did the enable bit stick? -- the bank supports CMCI */ | 110 | /* Did the enable bit stick? -- the bank supports CMCI */ |
110 | if (val & CMCI_EN) { | 111 | if (val & MCI_CTL2_CMCI_EN) { |
111 | if (!test_and_set_bit(i, owned) && !boot) | 112 | if (!test_and_set_bit(i, owned) && !boot) |
112 | print_update("CMCI", &hdr, i); | 113 | print_update("CMCI", &hdr, i); |
113 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 114 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
@@ -155,7 +156,7 @@ void cmci_clear(void) | |||
155 | continue; | 156 | continue; |
156 | /* Disable CMCI */ | 157 | /* Disable CMCI */ |
157 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 158 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
158 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | 159 | val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); |
159 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 160 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
160 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | 161 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); |
161 | } | 162 | } |
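cmci_discover() above probes each bank by trying to set the (now properly named) MCI_CTL2_CMCI_EN bit and reading it back: a bank where the bit sticks supports CMCI, while a bank that already shows the bit is owned by another CPU. A sketch of that probe, with the MSR accessors stubbed out and bit positions that are illustrative rather than authoritative:

```c
#include <stdint.h>
#include <stdbool.h>

/* msr_read/msr_write stand in for rdmsrl/wrmsrl on
 * MSR_IA32_MCx_CTL2(bank); real code runs in the kernel. */
extern uint64_t msr_read(int bank);
extern void msr_write(int bank, uint64_t val);

#define CMCI_EN             (1ULL << 30)
#define CMCI_THRESHOLD_MASK 0x7fffULL
#define CMCI_THRESHOLD      1ULL

/* Returns true if the bank supports CMCI and this CPU now owns it. */
static bool cmci_try_claim(int bank)
{
	uint64_t val = msr_read(bank);

	if (val & CMCI_EN)
		return false;			/* owned by another CPU */

	val &= ~CMCI_THRESHOLD_MASK;		/* known threshold value */
	val |= CMCI_EN | CMCI_THRESHOLD;
	msr_write(bank, val);

	/* Did the enable bit stick? If not, the bank has no CMCI. */
	return msr_read(bank) & CMCI_EN;
}
```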
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3bf9716..d9368eeda309 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -34,15 +34,25 @@ | |||
34 | /* How long to wait between reporting thermal events */ | 34 | /* How long to wait between reporting thermal events */ |
35 | #define CHECK_INTERVAL (300 * HZ) | 35 | #define CHECK_INTERVAL (300 * HZ) |
36 | 36 | ||
37 | #define THERMAL_THROTTLING_EVENT 0 | ||
38 | #define POWER_LIMIT_EVENT 1 | ||
39 | |||
37 | /* | 40 | /* |
38 | * Current thermal throttling state: | 41 | * Current thermal event state: |
39 | */ | 42 | */ |
40 | struct thermal_state { | 43 | struct _thermal_state { |
41 | bool is_throttled; | 44 | bool new_event; |
42 | 45 | int event; | |
43 | u64 next_check; | 46 | u64 next_check; |
44 | unsigned long throttle_count; | 47 | unsigned long count; |
45 | unsigned long last_throttle_count; | 48 | unsigned long last_count; |
49 | }; | ||
50 | |||
51 | struct thermal_state { | ||
52 | struct _thermal_state core_throttle; | ||
53 | struct _thermal_state core_power_limit; | ||
54 | struct _thermal_state package_throttle; | ||
55 | struct _thermal_state package_power_limit; | ||
46 | }; | 56 | }; |
47 | 57 | ||
48 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); | 58 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); |
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly; | |||
53 | 63 | ||
54 | #ifdef CONFIG_SYSFS | 64 | #ifdef CONFIG_SYSFS |
55 | #define define_therm_throt_sysdev_one_ro(_name) \ | 65 | #define define_therm_throt_sysdev_one_ro(_name) \ |
56 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) | 66 | static SYSDEV_ATTR(_name, 0444, \ |
67 | therm_throt_sysdev_show_##_name, \ | ||
68 | NULL) \ | ||
57 | 69 | ||
58 | #define define_therm_throt_sysdev_show_func(name) \ | 70 | #define define_therm_throt_sysdev_show_func(event, name) \ |
59 | \ | 71 | \ |
60 | static ssize_t therm_throt_sysdev_show_##name( \ | 72 | static ssize_t therm_throt_sysdev_show_##event##_##name( \ |
61 | struct sys_device *dev, \ | 73 | struct sys_device *dev, \ |
62 | struct sysdev_attribute *attr, \ | 74 | struct sysdev_attribute *attr, \ |
63 | char *buf) \ | 75 | char *buf) \ |
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \ | |||
66 | ssize_t ret; \ | 78 | ssize_t ret; \ |
67 | \ | 79 | \ |
68 | preempt_disable(); /* CPU hotplug */ \ | 80 | preempt_disable(); /* CPU hotplug */ \ |
69 | if (cpu_online(cpu)) \ | 81 | if (cpu_online(cpu)) { \ |
70 | ret = sprintf(buf, "%lu\n", \ | 82 | ret = sprintf(buf, "%lu\n", \ |
71 | per_cpu(thermal_state, cpu).name); \ | 83 | per_cpu(thermal_state, cpu).event.name); \ |
72 | else \ | 84 | } else \ |
73 | ret = 0; \ | 85 | ret = 0; \ |
74 | preempt_enable(); \ | 86 | preempt_enable(); \ |
75 | \ | 87 | \ |
76 | return ret; \ | 88 | return ret; \ |
77 | } | 89 | } |
78 | 90 | ||
79 | define_therm_throt_sysdev_show_func(throttle_count); | 91 | define_therm_throt_sysdev_show_func(core_throttle, count); |
80 | define_therm_throt_sysdev_one_ro(throttle_count); | 92 | define_therm_throt_sysdev_one_ro(core_throttle_count); |
93 | |||
94 | define_therm_throt_sysdev_show_func(core_power_limit, count); | ||
95 | define_therm_throt_sysdev_one_ro(core_power_limit_count); | ||
96 | |||
97 | define_therm_throt_sysdev_show_func(package_throttle, count); | ||
98 | define_therm_throt_sysdev_one_ro(package_throttle_count); | ||
99 | |||
100 | define_therm_throt_sysdev_show_func(package_power_limit, count); | ||
101 | define_therm_throt_sysdev_one_ro(package_power_limit_count); | ||
81 | 102 | ||
82 | static struct attribute *thermal_throttle_attrs[] = { | 103 | static struct attribute *thermal_throttle_attrs[] = { |
83 | &attr_throttle_count.attr, | 104 | &attr_core_throttle_count.attr, |
84 | NULL | 105 | NULL |
85 | }; | 106 | }; |
86 | 107 | ||
87 | static struct attribute_group thermal_throttle_attr_group = { | 108 | static struct attribute_group thermal_attr_group = { |
88 | .attrs = thermal_throttle_attrs, | 109 | .attrs = thermal_throttle_attrs, |
89 | .name = "thermal_throttle" | 110 | .name = "thermal_throttle" |
90 | }; | 111 | }; |
91 | #endif /* CONFIG_SYSFS */ | 112 | #endif /* CONFIG_SYSFS */ |
92 | 113 | ||
114 | #define CORE_LEVEL 0 | ||
115 | #define PACKAGE_LEVEL 1 | ||
116 | |||
93 | /*** | 117 | /*** |
94 | * therm_throt_process - Process thermal throttling event from interrupt | 118 | * therm_throt_process - Process thermal throttling event from interrupt |
95 | * @curr: Whether the condition is current or not (boolean), since the | 119 | * @curr: Whether the condition is current or not (boolean), since the |
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { | |||
106 | * 1 : Event should be logged further, and a message has been | 130 | * 1 : Event should be logged further, and a message has been |
107 | * printed to the syslog. | 131 | * printed to the syslog. |
108 | */ | 132 | */ |
109 | static int therm_throt_process(bool is_throttled) | 133 | static int therm_throt_process(bool new_event, int event, int level) |
110 | { | 134 | { |
111 | struct thermal_state *state; | 135 | struct _thermal_state *state; |
112 | unsigned int this_cpu; | 136 | unsigned int this_cpu = smp_processor_id(); |
113 | bool was_throttled; | 137 | bool old_event; |
114 | u64 now; | 138 | u64 now; |
139 | struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); | ||
115 | 140 | ||
116 | this_cpu = smp_processor_id(); | ||
117 | now = get_jiffies_64(); | 141 | now = get_jiffies_64(); |
118 | state = &per_cpu(thermal_state, this_cpu); | 142 | if (level == CORE_LEVEL) { |
143 | if (event == THERMAL_THROTTLING_EVENT) | ||
144 | state = &pstate->core_throttle; | ||
145 | else if (event == POWER_LIMIT_EVENT) | ||
146 | state = &pstate->core_power_limit; | ||
147 | else | ||
148 | return 0; | ||
149 | } else if (level == PACKAGE_LEVEL) { | ||
150 | if (event == THERMAL_THROTTLING_EVENT) | ||
151 | state = &pstate->package_throttle; | ||
152 | else if (event == POWER_LIMIT_EVENT) | ||
153 | state = &pstate->package_power_limit; | ||
154 | else | ||
155 | return 0; | ||
156 | } else | ||
157 | return 0; | ||
119 | 158 | ||
120 | was_throttled = state->is_throttled; | 159 | old_event = state->new_event; |
121 | state->is_throttled = is_throttled; | 160 | state->new_event = new_event; |
122 | 161 | ||
123 | if (is_throttled) | 162 | if (new_event) |
124 | state->throttle_count++; | 163 | state->count++; |
125 | 164 | ||
126 | if (time_before64(now, state->next_check) && | 165 | if (time_before64(now, state->next_check) && |
127 | state->throttle_count != state->last_throttle_count) | 166 | state->count != state->last_count) |
128 | return 0; | 167 | return 0; |
129 | 168 | ||
130 | state->next_check = now + CHECK_INTERVAL; | 169 | state->next_check = now + CHECK_INTERVAL; |
131 | state->last_throttle_count = state->throttle_count; | 170 | state->last_count = state->count; |
132 | 171 | ||
133 | /* if we just entered the thermal event */ | 172 | /* if we just entered the thermal event */ |
134 | if (is_throttled) { | 173 | if (new_event) { |
135 | printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); | 174 | if (event == THERMAL_THROTTLING_EVENT) |
175 | printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", | ||
176 | this_cpu, | ||
177 | level == CORE_LEVEL ? "Core" : "Package", | ||
178 | state->count); | ||
179 | else | ||
180 | printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", | ||
181 | this_cpu, | ||
182 | level == CORE_LEVEL ? "Core" : "Package", | ||
183 | state->count); | ||
136 | 184 | ||
137 | add_taint(TAINT_MACHINE_CHECK); | 185 | add_taint(TAINT_MACHINE_CHECK); |
138 | return 1; | 186 | return 1; |
139 | } | 187 | } |
140 | if (was_throttled) { | 188 | if (old_event) { |
141 | printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); | 189 | if (event == THERMAL_THROTTLING_EVENT) |
190 | printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", | ||
191 | this_cpu, | ||
192 | level == CORE_LEVEL ? "Core" : "Package"); | ||
193 | else | ||
194 | printk(KERN_INFO "CPU%d: %s power limit normal\n", | ||
195 | this_cpu, | ||
196 | level == CORE_LEVEL ? "Core" : "Package"); | ||
142 | return 1; | 197 | return 1; |
143 | } | 198 | } |
144 | 199 | ||
@@ -147,15 +202,35 @@ static int therm_throt_process(bool is_throttled) | |||
147 | 202 | ||
148 | #ifdef CONFIG_SYSFS | 203 | #ifdef CONFIG_SYSFS |
149 | /* Add/Remove thermal_throttle interface for CPU device: */ | 204 | /* Add/Remove thermal_throttle interface for CPU device: */ |
150 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) | 205 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, |
206 | unsigned int cpu) | ||
151 | { | 207 | { |
152 | return sysfs_create_group(&sys_dev->kobj, | 208 | int err; |
153 | &thermal_throttle_attr_group); | 209 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
210 | |||
211 | err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); | ||
212 | if (err) | ||
213 | return err; | ||
214 | |||
215 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
216 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
217 | &attr_core_power_limit_count.attr, | ||
218 | thermal_attr_group.name); | ||
219 | if (cpu_has(c, X86_FEATURE_PTS)) | ||
220 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
221 | &attr_package_throttle_count.attr, | ||
222 | thermal_attr_group.name); | ||
223 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
224 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
225 | &attr_package_power_limit_count.attr, | ||
226 | thermal_attr_group.name); | ||
227 | |||
228 | return err; | ||
154 | } | 229 | } |
155 | 230 | ||
156 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) | 231 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) |
157 | { | 232 | { |
158 | sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); | 233 | sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); |
159 | } | 234 | } |
160 | 235 | ||
161 | /* Mutex protecting device creation against CPU hotplug: */ | 236 | /* Mutex protecting device creation against CPU hotplug: */ |
@@ -177,7 +252,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, | |||
177 | case CPU_UP_PREPARE: | 252 | case CPU_UP_PREPARE: |
178 | case CPU_UP_PREPARE_FROZEN: | 253 | case CPU_UP_PREPARE_FROZEN: |
179 | mutex_lock(&therm_cpu_lock); | 254 | mutex_lock(&therm_cpu_lock); |
180 | err = thermal_throttle_add_dev(sys_dev); | 255 | err = thermal_throttle_add_dev(sys_dev, cpu); |
181 | mutex_unlock(&therm_cpu_lock); | 256 | mutex_unlock(&therm_cpu_lock); |
182 | WARN_ON(err); | 257 | WARN_ON(err); |
183 | break; | 258 | break; |
@@ -213,7 +288,7 @@ static __init int thermal_throttle_init_device(void) | |||
213 | #endif | 288 | #endif |
214 | /* connect live CPUs to sysfs */ | 289 | /* connect live CPUs to sysfs */ |
215 | for_each_online_cpu(cpu) { | 290 | for_each_online_cpu(cpu) { |
216 | err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); | 291 | err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); |
217 | WARN_ON(err); | 292 | WARN_ON(err); |
218 | } | 293 | } |
219 | #ifdef CONFIG_HOTPLUG_CPU | 294 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -226,14 +301,50 @@ device_initcall(thermal_throttle_init_device); | |||
226 | 301 | ||
227 | #endif /* CONFIG_SYSFS */ | 302 | #endif /* CONFIG_SYSFS */ |
228 | 303 | ||
304 | /* | ||
305 | * Set up the two most significant bits to tag this thermal event | ||
306 | * type in the mce log. | ||
307 | * This is a temporary solution and may be changed in the future with | ||
308 | * the mce log infrastructure. | ||
309 | */ | ||
310 | #define CORE_THROTTLED (0) | ||
311 | #define CORE_POWER_LIMIT ((__u64)1 << 62) | ||
312 | #define PACKAGE_THROTTLED ((__u64)2 << 62) | ||
313 | #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) | ||
314 | |||
229 | /* Thermal transition interrupt handler */ | 315 | /* Thermal transition interrupt handler */ |
230 | static void intel_thermal_interrupt(void) | 316 | static void intel_thermal_interrupt(void) |
231 | { | 317 | { |
232 | __u64 msr_val; | 318 | __u64 msr_val; |
319 | struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); | ||
233 | 320 | ||
234 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 321 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
235 | if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) | 322 | |
236 | mce_log_therm_throt_event(msr_val); | 323 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, |
324 | THERMAL_THROTTLING_EVENT, | ||
325 | CORE_LEVEL) != 0) | ||
326 | mce_log_therm_throt_event(CORE_THROTTLED | msr_val); | ||
327 | |||
328 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
329 | if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, | ||
330 | POWER_LIMIT_EVENT, | ||
331 | CORE_LEVEL) != 0) | ||
332 | mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); | ||
333 | |||
334 | if (cpu_has(c, X86_FEATURE_PTS)) { | ||
335 | rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); | ||
336 | if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, | ||
337 | THERMAL_THROTTLING_EVENT, | ||
338 | PACKAGE_LEVEL) != 0) | ||
339 | mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); | ||
340 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
341 | if (therm_throt_process(msr_val & | ||
342 | PACKAGE_THERM_STATUS_POWER_LIMIT, | ||
343 | POWER_LIMIT_EVENT, | ||
344 | PACKAGE_LEVEL) != 0) | ||
345 | mce_log_therm_throt_event(PACKAGE_POWER_LIMIT | ||
346 | | msr_val); | ||
347 | } | ||
237 | } | 348 | } |
238 | 349 | ||
239 | static void unexpected_thermal_interrupt(void) | 350 | static void unexpected_thermal_interrupt(void) |
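The interrupt handler tags each logged status value with the two-bit event code defined above. A hypothetical consumer would decode it like this (the tag layout follows the hunk; the helper names are illustrative):

```c
#include <stdint.h>
#include <stdio.h>

/* Decoding counterpart for the temporary two-bit tag defined above
 * (CORE_THROTTLED .. PACKAGE_POWER_LIMIT). */
#define THERM_EVENT_SHIFT 62
#define THERM_EVENT_MASK  (3ULL << THERM_EVENT_SHIFT)

static const char *therm_event_name(uint64_t logged)
{
	static const char *names[4] = {
		"core throttled", "core power limit",
		"package throttled", "package power limit",
	};
	return names[(logged & THERM_EVENT_MASK) >> THERM_EVENT_SHIFT];
}

int main(void)
{
	uint64_t logged = (2ULL << 62) | 0x1;	/* PACKAGE_THROTTLED | status */

	printf("%s, status %#llx\n", therm_event_name(logged),
	       (unsigned long long)(logged & ~THERM_EVENT_MASK));
	return 0;
}
```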
@@ -335,8 +446,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c) | |||
335 | apic_write(APIC_LVTTHMR, h); | 446 | apic_write(APIC_LVTTHMR, h); |
336 | 447 | ||
337 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | 448 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); |
338 | wrmsr(MSR_IA32_THERM_INTERRUPT, | 449 | if (cpu_has(c, X86_FEATURE_PLN)) |
339 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | 450 | wrmsr(MSR_IA32_THERM_INTERRUPT, |
451 | l | (THERM_INT_LOW_ENABLE | ||
452 | | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); | ||
453 | else | ||
454 | wrmsr(MSR_IA32_THERM_INTERRUPT, | ||
455 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | ||
456 | |||
457 | if (cpu_has(c, X86_FEATURE_PTS)) { | ||
458 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); | ||
459 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
460 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, | ||
461 | l | (PACKAGE_THERM_INT_LOW_ENABLE | ||
462 | | PACKAGE_THERM_INT_HIGH_ENABLE | ||
463 | | PACKAGE_THERM_INT_PLN_ENABLE), h); | ||
464 | else | ||
465 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, | ||
466 | l | (PACKAGE_THERM_INT_LOW_ENABLE | ||
467 | | PACKAGE_THERM_INT_HIGH_ENABLE), h); | ||
468 | } | ||
340 | 469 | ||
341 | smp_thermal_vector = intel_thermal_interrupt; | 470 | smp_thermal_vector = intel_thermal_interrupt; |
342 | 471 | ||
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 16f41bbe46b6..d944bf6c50e9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/mshyperv.h> | 18 | #include <asm/mshyperv.h> |
19 | 19 | ||
20 | struct ms_hyperv_info ms_hyperv; | 20 | struct ms_hyperv_info ms_hyperv; |
21 | EXPORT_SYMBOL_GPL(ms_hyperv); | ||
21 | 22 | ||
22 | static bool __init ms_hyperv_platform(void) | 23 | static bool __init ms_hyperv_platform(void) |
23 | { | 24 | { |
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f012..c5f59d071425 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c | |||
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) | |||
632 | unsigned long gran_base, chunk_base, lose_base; | 632 | unsigned long gran_base, chunk_base, lose_base; |
633 | char gran_factor, chunk_factor, lose_factor; | 633 | char gran_factor, chunk_factor, lose_factor; |
634 | 634 | ||
635 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), | 635 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); |
636 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), | 636 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); |
637 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), | 637 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); |
638 | 638 | ||
639 | pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", | 639 | pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", |
640 | result[i].bad ? "*BAD*" : " ", | 640 | result[i].bad ? "*BAD*" : " ", |
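The mtrr_print_out_one_result() fix swaps trailing commas for semicolons. The old code compiled because the comma operator fuses the three assignments into a single expression statement; behavior was identical here, but the form invited bugs. For comparison:

```c
/* The comma operator made the original three "statements" one
 * expression, evaluated left to right; the code behaved the same but
 * read as a bug waiting to happen.  Equivalent forms: */
int a, b, c;

void before(void)
{
	a = 1,		/* comma operator: still one statement ... */
	b = 2,
	c = 3;		/* ... ending here */
}

void after(void)
{
	a = 1;		/* three independent statements */
	b = 2;
	c = 3;
}
```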
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61c..7d28d7d03885 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, | |||
433 | { | 433 | { |
434 | unsigned int mask_lo, mask_hi, base_lo, base_hi; | 434 | unsigned int mask_lo, mask_hi, base_lo, base_hi; |
435 | unsigned int tmp, hi; | 435 | unsigned int tmp, hi; |
436 | int cpu; | ||
437 | 436 | ||
438 | /* | 437 | /* |
439 | * get_mtrr doesn't need to update mtrr_state, also it could be called | 438 | * get_mtrr doesn't need to update mtrr_state, also it could be called |
440 | * from any cpu, so try to print it out directly. | 439 | * from any cpu, so try to print it out directly. |
441 | */ | 440 | */ |
442 | cpu = get_cpu(); | 441 | get_cpu(); |
443 | 442 | ||
444 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); | 443 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); |
445 | 444 | ||
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b602..01c0f3ee6cc3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -35,6 +35,7 @@ | |||
35 | 35 | ||
36 | #include <linux/types.h> /* FIXME: kvm_para.h needs this */ | 36 | #include <linux/types.h> /* FIXME: kvm_para.h needs this */ |
37 | 37 | ||
38 | #include <linux/stop_machine.h> | ||
38 | #include <linux/kvm_para.h> | 39 | #include <linux/kvm_para.h> |
39 | #include <linux/uaccess.h> | 40 | #include <linux/uaccess.h> |
40 | #include <linux/module.h> | 41 | #include <linux/module.h> |
@@ -143,22 +144,28 @@ struct set_mtrr_data { | |||
143 | mtrr_type smp_type; | 144 | mtrr_type smp_type; |
144 | }; | 145 | }; |
145 | 146 | ||
147 | static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); | ||
148 | |||
146 | /** | 149 | /** |
147 | * ipi_handler - Synchronisation handler. Executed by "other" CPUs. | 150 | * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. |
148 | * @info: pointer to mtrr configuration data | 151 | * @info: pointer to mtrr configuration data |
149 | * | 152 | * |
150 | * Returns nothing. | 153 | * Returns nothing. |
151 | */ | 154 | */ |
152 | static void ipi_handler(void *info) | 155 | static int mtrr_work_handler(void *info) |
153 | { | 156 | { |
154 | #ifdef CONFIG_SMP | 157 | #ifdef CONFIG_SMP |
155 | struct set_mtrr_data *data = info; | 158 | struct set_mtrr_data *data = info; |
156 | unsigned long flags; | 159 | unsigned long flags; |
157 | 160 | ||
161 | atomic_dec(&data->count); | ||
162 | while (!atomic_read(&data->gate)) | ||
163 | cpu_relax(); | ||
164 | |||
158 | local_irq_save(flags); | 165 | local_irq_save(flags); |
159 | 166 | ||
160 | atomic_dec(&data->count); | 167 | atomic_dec(&data->count); |
161 | while (!atomic_read(&data->gate)) | 168 | while (atomic_read(&data->gate)) |
162 | cpu_relax(); | 169 | cpu_relax(); |
163 | 170 | ||
164 | /* The master has cleared me to execute */ | 171 | /* The master has cleared me to execute */ |
@@ -173,12 +180,13 @@ static void ipi_handler(void *info) | |||
173 | } | 180 | } |
174 | 181 | ||
175 | atomic_dec(&data->count); | 182 | atomic_dec(&data->count); |
176 | while (atomic_read(&data->gate)) | 183 | while (!atomic_read(&data->gate)) |
177 | cpu_relax(); | 184 | cpu_relax(); |
178 | 185 | ||
179 | atomic_dec(&data->count); | 186 | atomic_dec(&data->count); |
180 | local_irq_restore(flags); | 187 | local_irq_restore(flags); |
181 | #endif | 188 | #endif |
189 | return 0; | ||
182 | } | 190 | } |
183 | 191 | ||
184 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) | 192 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) |
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) | |||
198 | * | 206 | * |
199 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: | 207 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: |
200 | * | 208 | * |
201 | * 1. Send IPI to do the following: | 209 | * 1. Queue work to do the following on all processors: |
202 | * 2. Disable Interrupts | 210 | * 2. Disable Interrupts |
203 | * 3. Wait for all procs to do so | 211 | * 3. Wait for all procs to do so |
204 | * 4. Enter no-fill cache mode | 212 | * 4. Enter no-fill cache mode |
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) | |||
215 | * 15. Enable interrupts. | 223 | * 15. Enable interrupts. |
216 | * | 224 | * |
217 | * What does that mean for us? Well, first we set data.count to the number | 225 | * What does that mean for us? Well, first we set data.count to the number |
218 | * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait | 226 | * of CPUs. As each CPU announces that it started the rendezvous handler by |
219 | * until it hits 0 and proceed. We set the data.gate flag and reset data.count. | 227 | * decrementing the count, we reset data.count and set the data.gate flag, |
220 | * Meanwhile, they are waiting for that flag to be set. Once it's set, each | 228 | * allowing all the CPUs to proceed with the work. As each CPU disables |
229 | * interrupts, it'll decrement data.count once. We wait until it hits 0 and | ||
230 | * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they | ||
231 | * are waiting for that flag to be cleared. Once it's cleared, each | ||
221 | * CPU goes through the transition of updating MTRRs. | 232 | * CPU goes through the transition of updating MTRRs. |
222 | * The CPU vendors may each do it differently, | 233 | * The CPU vendors may each do it differently, |
223 | * so we call mtrr_if->set() callback and let them take care of it. | 234 | * so we call mtrr_if->set() callback and let them take care of it. |
224 | * When they're done, they again decrement data->count and wait for data.gate | 235 | * When they're done, they again decrement data->count and wait for data.gate |
225 | * to be reset. | 236 | * to be set. |
226 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag | 237 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag |
227 | * Everyone then enables interrupts and we all continue on. | 238 | * Everyone then enables interrupts and we all continue on. |
228 | * | 239 | * |
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
234 | { | 245 | { |
235 | struct set_mtrr_data data; | 246 | struct set_mtrr_data data; |
236 | unsigned long flags; | 247 | unsigned long flags; |
248 | int cpu; | ||
249 | |||
250 | preempt_disable(); | ||
237 | 251 | ||
238 | data.smp_reg = reg; | 252 | data.smp_reg = reg; |
239 | data.smp_base = base; | 253 | data.smp_base = base; |
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
246 | atomic_set(&data.gate, 0); | 260 | atomic_set(&data.gate, 0); |
247 | 261 | ||
248 | /* Start the ball rolling on other CPUs */ | 262 | /* Start the ball rolling on other CPUs */ |
249 | if (smp_call_function(ipi_handler, &data, 0) != 0) | 263 | for_each_online_cpu(cpu) { |
250 | panic("mtrr: timed out waiting for other CPUs\n"); | 264 | struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); |
265 | |||
266 | if (cpu == smp_processor_id()) | ||
267 | continue; | ||
268 | |||
269 | stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); | ||
270 | } | ||
251 | 271 | ||
252 | local_irq_save(flags); | ||
253 | 272 | ||
254 | while (atomic_read(&data.count)) | 273 | while (atomic_read(&data.count)) |
255 | cpu_relax(); | 274 | cpu_relax(); |
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
259 | smp_wmb(); | 278 | smp_wmb(); |
260 | atomic_set(&data.gate, 1); | 279 | atomic_set(&data.gate, 1); |
261 | 280 | ||
281 | local_irq_save(flags); | ||
282 | |||
283 | while (atomic_read(&data.count)) | ||
284 | cpu_relax(); | ||
285 | |||
286 | /* Ok, reset count and toggle gate */ | ||
287 | atomic_set(&data.count, num_booting_cpus() - 1); | ||
288 | smp_wmb(); | ||
289 | atomic_set(&data.gate, 0); | ||
290 | |||
262 | /* Do our MTRR business */ | 291 | /* Do our MTRR business */ |
263 | 292 | ||
264 | /* | 293 | /* |
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
279 | 308 | ||
280 | atomic_set(&data.count, num_booting_cpus() - 1); | 309 | atomic_set(&data.count, num_booting_cpus() - 1); |
281 | smp_wmb(); | 310 | smp_wmb(); |
282 | atomic_set(&data.gate, 0); | 311 | atomic_set(&data.gate, 1); |
283 | 312 | ||
284 | /* | 313 | /* |
285 | * Wait here for everyone to have seen the gate change | 314 | * Wait here for everyone to have seen the gate change |
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
289 | cpu_relax(); | 318 | cpu_relax(); |
290 | 319 | ||
291 | local_irq_restore(flags); | 320 | local_irq_restore(flags); |
321 | preempt_enable(); | ||
292 | } | 322 | } |
293 | 323 | ||
294 | /** | 324 | /** |
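The reworked set_mtrr() replaces the smp_call_function() IPI with queued stop-machine work and drives all CPUs through repeated count/gate rendezvous phases, exactly as the updated comment describes. A compact sketch of one phase using C11 atomics (the kernel uses atomic_t and cpu_relax(); this is a model, not the implementation):

```c
#include <stdatomic.h>

/* One rendezvous phase of the set_mtrr() protocol.  nr_cpus counts the
 * *other* CPUs; the master runs master_phase(), everyone else
 * worker_phase(). */
struct rendezvous {
	atomic_int count;
	atomic_int gate;
};

static void worker_phase(struct rendezvous *r, int expected_gate)
{
	atomic_fetch_sub(&r->count, 1);		/* announce arrival */
	while (atomic_load(&r->gate) != expected_gate)
		;				/* spin until released */
}

static void master_phase(struct rendezvous *r, int nr_cpus, int next_gate)
{
	while (atomic_load(&r->count) != 0)
		;				/* wait for all workers */
	atomic_store(&r->count, nr_cpus);	/* re-arm for next phase */
	atomic_store(&r->gate, next_gate);	/* release everyone */
}
```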
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a18..03a5b0385ad6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -102,6 +102,7 @@ struct cpu_hw_events { | |||
102 | */ | 102 | */ |
103 | struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ | 103 | struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ |
104 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | 104 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; |
105 | unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | ||
105 | int enabled; | 106 | int enabled; |
106 | 107 | ||
107 | int n_events; | 108 | int n_events; |
@@ -220,6 +221,7 @@ struct x86_pmu { | |||
220 | struct perf_event *event); | 221 | struct perf_event *event); |
221 | struct event_constraint *event_constraints; | 222 | struct event_constraint *event_constraints; |
222 | void (*quirks)(void); | 223 | void (*quirks)(void); |
224 | int perfctr_second_write; | ||
223 | 225 | ||
224 | int (*cpu_prepare)(int cpu); | 226 | int (*cpu_prepare)(int cpu); |
225 | void (*cpu_starting)(int cpu); | 227 | void (*cpu_starting)(int cpu); |
@@ -295,10 +297,10 @@ x86_perf_event_update(struct perf_event *event) | |||
295 | * count to the generic event atomically: | 297 | * count to the generic event atomically: |
296 | */ | 298 | */ |
297 | again: | 299 | again: |
298 | prev_raw_count = atomic64_read(&hwc->prev_count); | 300 | prev_raw_count = local64_read(&hwc->prev_count); |
299 | rdmsrl(hwc->event_base + idx, new_raw_count); | 301 | rdmsrl(hwc->event_base + idx, new_raw_count); |
300 | 302 | ||
301 | if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, | 303 | if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, |
302 | new_raw_count) != prev_raw_count) | 304 | new_raw_count) != prev_raw_count) |
303 | goto again; | 305 | goto again; |
304 | 306 | ||
@@ -313,8 +315,8 @@ again: | |||
313 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | 315 | delta = (new_raw_count << shift) - (prev_raw_count << shift); |
314 | delta >>= shift; | 316 | delta >>= shift; |
315 | 317 | ||
316 | atomic64_add(delta, &event->count); | 318 | local64_add(delta, &event->count); |
317 | atomic64_sub(delta, &hwc->period_left); | 319 | local64_sub(delta, &hwc->period_left); |
318 | 320 | ||
319 | return new_raw_count; | 321 | return new_raw_count; |
320 | } | 322 | } |
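The atomic64-to-local64 conversion above keeps the same update shape: snapshot the previous count, read the live counter, and publish with a compare-and-swap, retrying if an NMI slipped in between. A sketch of that loop with C11 atomics and a stubbed counter read:

```c
#include <stdatomic.h>
#include <stdint.h>

/* Retry loop of x86_perf_event_update(), with C11 atomics in place of
 * local64_* and read_hw_counter() standing in for rdmsrl() of the live
 * counter. */
extern uint64_t read_hw_counter(void);

static uint64_t update_count(_Atomic uint64_t *prev_count,
			     _Atomic uint64_t *total, int shift)
{
	uint64_t prev, now;
	int64_t delta;

	do {
		prev = atomic_load(prev_count);
		now  = read_hw_counter();
		/* retry if an NMI updated prev_count under us */
	} while (!atomic_compare_exchange_strong(prev_count, &prev, now));

	/* sign-extend the counter width before taking the difference */
	delta = ((int64_t)(now << shift) - (int64_t)(prev << shift)) >> shift;
	atomic_fetch_add(total, (uint64_t)delta);
	return now;
}
```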
@@ -438,7 +440,7 @@ static int x86_setup_perfctr(struct perf_event *event) | |||
438 | if (!hwc->sample_period) { | 440 | if (!hwc->sample_period) { |
439 | hwc->sample_period = x86_pmu.max_period; | 441 | hwc->sample_period = x86_pmu.max_period; |
440 | hwc->last_period = hwc->sample_period; | 442 | hwc->last_period = hwc->sample_period; |
441 | atomic64_set(&hwc->period_left, hwc->sample_period); | 443 | local64_set(&hwc->period_left, hwc->sample_period); |
442 | } else { | 444 | } else { |
443 | /* | 445 | /* |
444 | * If we have a PMU initialized but no APIC | 446 | * If we have a PMU initialized but no APIC |
@@ -885,7 +887,7 @@ static int | |||
885 | x86_perf_event_set_period(struct perf_event *event) | 887 | x86_perf_event_set_period(struct perf_event *event) |
886 | { | 888 | { |
887 | struct hw_perf_event *hwc = &event->hw; | 889 | struct hw_perf_event *hwc = &event->hw; |
888 | s64 left = atomic64_read(&hwc->period_left); | 890 | s64 left = local64_read(&hwc->period_left); |
889 | s64 period = hwc->sample_period; | 891 | s64 period = hwc->sample_period; |
890 | int ret = 0, idx = hwc->idx; | 892 | int ret = 0, idx = hwc->idx; |
891 | 893 | ||
@@ -897,14 +899,14 @@ x86_perf_event_set_period(struct perf_event *event) | |||
897 | */ | 899 | */ |
898 | if (unlikely(left <= -period)) { | 900 | if (unlikely(left <= -period)) { |
899 | left = period; | 901 | left = period; |
900 | atomic64_set(&hwc->period_left, left); | 902 | local64_set(&hwc->period_left, left); |
901 | hwc->last_period = period; | 903 | hwc->last_period = period; |
902 | ret = 1; | 904 | ret = 1; |
903 | } | 905 | } |
904 | 906 | ||
905 | if (unlikely(left <= 0)) { | 907 | if (unlikely(left <= 0)) { |
906 | left += period; | 908 | left += period; |
907 | atomic64_set(&hwc->period_left, left); | 909 | local64_set(&hwc->period_left, left); |
908 | hwc->last_period = period; | 910 | hwc->last_period = period; |
909 | ret = 1; | 911 | ret = 1; |
910 | } | 912 | } |
@@ -923,10 +925,19 @@ x86_perf_event_set_period(struct perf_event *event) | |||
923 | * The hw event starts counting from this event offset, | 925 | * The hw event starts counting from this event offset, |
924 | * mark it to be able to extra future deltas: | 926 | * mark it to be able to extra future deltas: |
925 | */ | 927 | */ |
926 | atomic64_set(&hwc->prev_count, (u64)-left); | 928 | local64_set(&hwc->prev_count, (u64)-left); |
927 | 929 | ||
928 | wrmsrl(hwc->event_base + idx, | 930 | wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); |
931 | |||
932 | /* | ||
933 | * Due to an erratum on certain CPUs we need | ||
934 | * a second write to be sure the register | ||
935 | * is updated properly | ||
936 | */ | ||
937 | if (x86_pmu.perfctr_second_write) { | ||
938 | wrmsrl(hwc->event_base + idx, | ||
929 | (u64)(-left) & x86_pmu.cntval_mask); | 939 | (u64)(-left) & x86_pmu.cntval_mask); |
940 | } | ||
930 | 941 | ||
931 | perf_event_update_userpage(event); | 942 | perf_event_update_userpage(event); |
932 | 943 | ||
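x86_perf_event_set_period() programs the counter with the negated remaining period, masked to the counter width, so the hardware overflows (and raises the PMI) after exactly `left` increments; the second write only exists for the erratum noted above. A worked example of the arithmetic:

```c
#include <stdint.h>
#include <stdio.h>

/* Why (u64)(-left) & cntval_mask: with a 48-bit counter, programming
 * the negated period makes the hardware wrap to zero -- and fire the
 * overflow interrupt -- after exactly `left` increments. */
int main(void)
{
	const uint64_t cntval_mask = (1ULL << 48) - 1;
	int64_t left = 1000;			/* events until overflow */
	uint64_t programmed = (uint64_t)(-left) & cntval_mask;

	printf("programmed = %#llx\n", (unsigned long long)programmed);
	/* 0xfffffffffc18: counts up 1000 times, then wraps past 2^48 */
	return 0;
}
```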
@@ -969,7 +980,7 @@ static int x86_pmu_enable(struct perf_event *event) | |||
969 | * skip the schedulability test here, it will be performed | 980 | * skip the schedulability test here, it will be performed |
970 | * at commit time(->commit_txn) as a whole | 981 | * at commit time(->commit_txn) as a whole |
971 | */ | 982 | */ |
972 | if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) | 983 | if (cpuc->group_flag & PERF_EVENT_TXN) |
973 | goto out; | 984 | goto out; |
974 | 985 | ||
975 | ret = x86_pmu.schedule_events(cpuc, n, assign); | 986 | ret = x86_pmu.schedule_events(cpuc, n, assign); |
@@ -1000,6 +1011,7 @@ static int x86_pmu_start(struct perf_event *event) | |||
1000 | x86_perf_event_set_period(event); | 1011 | x86_perf_event_set_period(event); |
1001 | cpuc->events[idx] = event; | 1012 | cpuc->events[idx] = event; |
1002 | __set_bit(idx, cpuc->active_mask); | 1013 | __set_bit(idx, cpuc->active_mask); |
1014 | __set_bit(idx, cpuc->running); | ||
1003 | x86_pmu.enable(event); | 1015 | x86_pmu.enable(event); |
1004 | perf_event_update_userpage(event); | 1016 | perf_event_update_userpage(event); |
1005 | 1017 | ||
@@ -1096,7 +1108,7 @@ static void x86_pmu_disable(struct perf_event *event) | |||
1096 | * The events never got scheduled and ->cancel_txn will truncate | 1108 | * The events never got scheduled and ->cancel_txn will truncate |
1097 | * the event_list. | 1109 | * the event_list. |
1098 | */ | 1110 | */ |
1099 | if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) | 1111 | if (cpuc->group_flag & PERF_EVENT_TXN) |
1100 | return; | 1112 | return; |
1101 | 1113 | ||
1102 | x86_pmu_stop(event); | 1114 | x86_pmu_stop(event); |
@@ -1131,8 +1143,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1131 | cpuc = &__get_cpu_var(cpu_hw_events); | 1143 | cpuc = &__get_cpu_var(cpu_hw_events); |
1132 | 1144 | ||
1133 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1145 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
1134 | if (!test_bit(idx, cpuc->active_mask)) | 1146 | if (!test_bit(idx, cpuc->active_mask)) { |
1147 | /* | ||
1148 | * Though we deactivated the counter some CPUs | ||
1149 | * might still deliver spurious interrupts that | ||
1150 | * were in flight. Catch them: | ||
1151 | */ | ||
1152 | if (__test_and_clear_bit(idx, cpuc->running)) | ||
1153 | handled++; | ||
1135 | continue; | 1154 | continue; |
1155 | } | ||
1136 | 1156 | ||
1137 | event = cpuc->events[idx]; | 1157 | event = cpuc->events[idx]; |
1138 | hwc = &event->hw; | 1158 | hwc = &event->hw; |
@@ -1144,7 +1164,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1144 | /* | 1164 | /* |
1145 | * event overflow | 1165 | * event overflow |
1146 | */ | 1166 | */ |
1147 | handled = 1; | 1167 | handled++; |
1148 | data.period = event->hw.last_period; | 1168 | data.period = event->hw.last_period; |
1149 | 1169 | ||
1150 | if (!x86_perf_event_set_period(event)) | 1170 | if (!x86_perf_event_set_period(event)) |
@@ -1190,12 +1210,20 @@ void perf_events_lapic_init(void) | |||
1190 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1210 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1191 | } | 1211 | } |
1192 | 1212 | ||
1213 | struct pmu_nmi_state { | ||
1214 | unsigned int marked; | ||
1215 | int handled; | ||
1216 | }; | ||
1217 | |||
1218 | static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); | ||
1219 | |||
1193 | static int __kprobes | 1220 | static int __kprobes |
1194 | perf_event_nmi_handler(struct notifier_block *self, | 1221 | perf_event_nmi_handler(struct notifier_block *self, |
1195 | unsigned long cmd, void *__args) | 1222 | unsigned long cmd, void *__args) |
1196 | { | 1223 | { |
1197 | struct die_args *args = __args; | 1224 | struct die_args *args = __args; |
1198 | struct pt_regs *regs; | 1225 | unsigned int this_nmi; |
1226 | int handled; | ||
1199 | 1227 | ||
1200 | if (!atomic_read(&active_events)) | 1228 | if (!atomic_read(&active_events)) |
1201 | return NOTIFY_DONE; | 1229 | return NOTIFY_DONE; |
@@ -1204,22 +1232,47 @@ perf_event_nmi_handler(struct notifier_block *self, | |||
1204 | case DIE_NMI: | 1232 | case DIE_NMI: |
1205 | case DIE_NMI_IPI: | 1233 | case DIE_NMI_IPI: |
1206 | break; | 1234 | break; |
1207 | 1235 | case DIE_NMIUNKNOWN: | |
1236 | this_nmi = percpu_read(irq_stat.__nmi_count); | ||
1237 | if (this_nmi != __get_cpu_var(pmu_nmi).marked) | ||
1238 | /* let the kernel handle the unknown nmi */ | ||
1239 | return NOTIFY_DONE; | ||
1240 | /* | ||
1241 | * This one is a PMU back-to-back nmi. Two events | ||
1242 | * trigger 'simultaneously' raising two back-to-back | ||
1243 | * NMIs. If the first NMI handles both, the latter | ||
1244 | * will be empty and daze the CPU. So, we drop it to | ||
1245 | * avoid false-positive 'unknown nmi' messages. | ||
1246 | */ | ||
1247 | return NOTIFY_STOP; | ||
1208 | default: | 1248 | default: |
1209 | return NOTIFY_DONE; | 1249 | return NOTIFY_DONE; |
1210 | } | 1250 | } |
1211 | 1251 | ||
1212 | regs = args->regs; | ||
1213 | |||
1214 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1252 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1215 | /* | 1253 | |
1216 | * Can't rely on the handled return value to say it was our NMI, two | 1254 | handled = x86_pmu.handle_irq(args->regs); |
1217 | * events could trigger 'simultaneously' raising two back-to-back NMIs. | 1255 | if (!handled) |
1218 | * | 1256 | return NOTIFY_DONE; |
1219 | * If the first NMI handles both, the latter will be empty and daze | 1257 | |
1220 | * the CPU. | 1258 | this_nmi = percpu_read(irq_stat.__nmi_count); |
1221 | */ | 1259 | if ((handled > 1) || |
1222 | x86_pmu.handle_irq(regs); | 1260 | /* the next nmi could be a back-to-back nmi */ |
1261 | ((__get_cpu_var(pmu_nmi).marked == this_nmi) && | ||
1262 | (__get_cpu_var(pmu_nmi).handled > 1))) { | ||
1263 | /* | ||
1264 | * We could have two subsequent back-to-back nmis: The | ||
1265 | * first handles more than one counter, the 2nd | ||
1266 | * handles only one counter and the 3rd handles no | ||
1267 | * counter. | ||
1268 | * | ||
1269 | * This is the 2nd nmi because the previous was | ||
1270 | * handling more than one counter. We will mark the | ||
1271 | * next (3rd) and then drop it if unhandled. | ||
1272 | */ | ||
1273 | __get_cpu_var(pmu_nmi).marked = this_nmi + 1; | ||
1274 | __get_cpu_var(pmu_nmi).handled = handled; | ||
1275 | } | ||
1223 | 1276 | ||
1224 | return NOTIFY_STOP; | 1277 | return NOTIFY_STOP; |
1225 | } | 1278 | } |
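The rewritten NMI notifier keeps per-CPU state so that when one NMI services several counters, the possibly empty NMI right behind it is predicted and swallowed rather than reported as unknown. The decision logic, distilled into a sketch (field names follow the hunk):

```c
#include <stdbool.h>

/* Distilled back-to-back NMI logic of perf_event_nmi_handler();
 * this_nmi is the per-CPU NMI sequence number. */
struct pmu_nmi_state { unsigned int marked; int handled; };

/* Called for an unknown NMI: swallow it only if we predicted it. */
static bool swallow_unknown_nmi(struct pmu_nmi_state *s, unsigned int this_nmi)
{
	return s->marked == this_nmi;
}

/* Called after handling a PMU NMI that serviced `handled` counters. */
static void note_handled_nmi(struct pmu_nmi_state *s, unsigned int this_nmi,
			     int handled)
{
	if (handled > 1 || (s->marked == this_nmi && s->handled > 1)) {
		/* the next NMI may be the empty half of a back-to-back
		 * pair -- mark it so we don't report it as unknown */
		s->marked = this_nmi + 1;
		s->handled = handled;
	}
}
```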
@@ -1388,7 +1441,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) | |||
1388 | { | 1441 | { |
1389 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1442 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1390 | 1443 | ||
1391 | cpuc->group_flag |= PERF_EVENT_TXN_STARTED; | 1444 | cpuc->group_flag |= PERF_EVENT_TXN; |
1392 | cpuc->n_txn = 0; | 1445 | cpuc->n_txn = 0; |
1393 | } | 1446 | } |
1394 | 1447 | ||
@@ -1401,7 +1454,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) | |||
1401 | { | 1454 | { |
1402 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1455 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1403 | 1456 | ||
1404 | cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; | 1457 | cpuc->group_flag &= ~PERF_EVENT_TXN; |
1405 | /* | 1458 | /* |
1406 | * Truncate the collected events. | 1459 | * Truncate the collected events. |
1407 | */ | 1460 | */ |
@@ -1435,11 +1488,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) | |||
1435 | */ | 1488 | */ |
1436 | memcpy(cpuc->assign, assign, n*sizeof(int)); | 1489 | memcpy(cpuc->assign, assign, n*sizeof(int)); |
1437 | 1490 | ||
1438 | /* | 1491 | cpuc->group_flag &= ~PERF_EVENT_TXN; |
1439 | * Clear out the txn count so that ->cancel_txn() which gets | ||
1440 | * run after ->commit_txn() doesn't undo things. | ||
1441 | */ | ||
1442 | cpuc->n_txn = 0; | ||
1443 | 1492 | ||
1444 | return 0; | 1493 | return 0; |
1445 | } | 1494 | } |
@@ -1607,8 +1656,6 @@ static const struct stacktrace_ops backtrace_ops = { | |||
1607 | .walk_stack = print_context_stack_bp, | 1656 | .walk_stack = print_context_stack_bp, |
1608 | }; | 1657 | }; |
1609 | 1658 | ||
1610 | #include "../dumpstack.h" | ||
1611 | |||
1612 | static void | 1659 | static void |
1613 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | 1660 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) |
1614 | { | 1661 | { |
@@ -1730,22 +1777,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
1730 | return entry; | 1777 | return entry; |
1731 | } | 1778 | } |
1732 | 1779 | ||
1733 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) | ||
1734 | { | ||
1735 | regs->ip = ip; | ||
1736 | /* | ||
1737 | * perf_arch_fetch_caller_regs adds another call, we need to increment | ||
1738 | * the skip level | ||
1739 | */ | ||
1740 | regs->bp = rewind_frame_pointer(skip + 1); | ||
1741 | regs->cs = __KERNEL_CS; | ||
1742 | /* | ||
1743 | * We abuse bit 3 to pass exact information, see perf_misc_flags | ||
1744 | * and the comment with PERF_EFLAGS_EXACT. | ||
1745 | */ | ||
1746 | regs->flags = 0; | ||
1747 | } | ||
1748 | |||
1749 | unsigned long perf_instruction_pointer(struct pt_regs *regs) | 1780 | unsigned long perf_instruction_pointer(struct pt_regs *regs) |
1750 | { | 1781 | { |
1751 | unsigned long ip; | 1782 | unsigned long ip; |
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 214ac860ebe0..ee05c90012d2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
@@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added) | |||
491 | * Intel Errata AAP53 (model 30) | 491 | * Intel Errata AAP53 (model 30) |
492 | * Intel Errata BD53 (model 44) | 492 | * Intel Errata BD53 (model 44) |
493 | * | 493 | * |
494 | * These chips need to be 'reset' when adding counters by programming | 494 | * The official story: |
495 | * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5 | 495 | * These chips need to be 'reset' when adding counters by programming the |
496 | * either in sequence on the same PMC or on different PMCs. | 496 | * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either |
497 | * in sequence on the same PMC or on different PMCs. | ||
498 | * | ||
499 | * In practice it appears some of these events do in fact count, and | ||
500 | * we need to program all 4 events. | ||
497 | */ | 501 | */ |
498 | static void intel_pmu_nhm_enable_all(int added) | 502 | static void intel_pmu_nhm_workaround(void) |
499 | { | 503 | { |
500 | if (added) { | 504 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
501 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 505 | static const unsigned long nhm_magic[4] = { |
502 | int i; | 506 | 0x4300B5, |
507 | 0x4300D2, | ||
508 | 0x4300B1, | ||
509 | 0x4300B1 | ||
510 | }; | ||
511 | struct perf_event *event; | ||
512 | int i; | ||
513 | |||
514 | /* | ||
515 | * The erratum requires the steps below: | ||
516 | * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; | ||
517 | * 2) Configure 4 PERFEVTSELx with the magic events and clear | ||
518 | * the corresponding PMCx; | ||
519 | * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; | ||
520 | * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; | ||
521 | * 5) Clear 4 pairs of PERFEVTSELx and PMCx; | ||
522 | */ | ||
503 | 523 | ||
504 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); | 524 | /* |
505 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); | 525 | * The real steps we choose are a little different from above. |
506 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); | 526 | * A) To reduce MSR operations, we don't run step 1) as they |
527 | * are already cleared before this function is called; | ||
528 | * B) Call x86_perf_event_update to save PMCx before configuring | ||
529 | * PERFEVTSELx with magic number; | ||
530 | * C) With step 5), we only clear a PERFEVTSELx when it is | ||
531 | * not currently in use. | ||
532 | * D) Call x86_perf_event_set_period to restore PMCx; | ||
533 | */ | ||
534 | |||
535 | /* We always operate 4 pairs of PERF Counters */ | ||
536 | for (i = 0; i < 4; i++) { | ||
537 | event = cpuc->events[i]; | ||
538 | if (event) | ||
539 | x86_perf_event_update(event); | ||
540 | } | ||
507 | 541 | ||
508 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); | 542 | for (i = 0; i < 4; i++) { |
509 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); | 543 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); |
544 | wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); | ||
545 | } | ||
510 | 546 | ||
511 | for (i = 0; i < 3; i++) { | 547 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); |
512 | struct perf_event *event = cpuc->events[i]; | 548 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); |
513 | 549 | ||
514 | if (!event) | 550 | for (i = 0; i < 4; i++) { |
515 | continue; | 551 | event = cpuc->events[i]; |
516 | 552 | ||
553 | if (event) { | ||
554 | x86_perf_event_set_period(event); | ||
517 | __x86_pmu_enable_event(&event->hw, | 555 | __x86_pmu_enable_event(&event->hw, |
518 | ARCH_PERFMON_EVENTSEL_ENABLE); | 556 | ARCH_PERFMON_EVENTSEL_ENABLE); |
519 | } | 557 | } else |
558 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); | ||
520 | } | 559 | } |
560 | } | ||
561 | |||
562 | static void intel_pmu_nhm_enable_all(int added) | ||
563 | { | ||
564 | if (added) | ||
565 | intel_pmu_nhm_workaround(); | ||
521 | intel_pmu_enable_all(added); | 566 | intel_pmu_enable_all(added); |
522 | } | 567 | } |
523 | 568 | ||
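The expanded workaround saves any live counters, programs all four magic events with their counters cleared, pulses the global enable for PMC0-3, and then restores or clears each slot. A condensed sketch of the MSR sequence, with the write helpers stubbed out as assumptions:

```c
#include <stdint.h>

/* Condensed order of operations in intel_pmu_nhm_workaround(); the MSR
 * helpers stand in for wrmsrl() on the event-select, counter, and
 * global-control registers. */
extern void wr_evtsel(int idx, uint64_t val);
extern void wr_pmc(int idx, uint64_t val);
extern void wr_global_ctrl(uint64_t val);

static const uint64_t nhm_magic[4] = {
	0x4300B5, 0x4300D2, 0x4300B1, 0x4300B1
};

static void nhm_reset_pmu(void)
{
	int i;

	/* 1) saving of live counters happens before this point */
	for (i = 0; i < 4; i++) {	/* 2) program magic events */
		wr_evtsel(i, nhm_magic[i]);
		wr_pmc(i, 0);
	}
	wr_global_ctrl(0xf);		/* 3) enable PMC0-3 ...     */
	wr_global_ctrl(0x0);		/* 4) ... and disable again */
	/* 5) restore or clear each event select afterwards */
}
```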
@@ -667,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
667 | struct perf_sample_data data; | 712 | struct perf_sample_data data; |
668 | struct cpu_hw_events *cpuc; | 713 | struct cpu_hw_events *cpuc; |
669 | int bit, loops; | 714 | int bit, loops; |
670 | u64 ack, status; | 715 | u64 status; |
716 | int handled = 0; | ||
671 | 717 | ||
672 | perf_sample_data_init(&data, 0); | 718 | perf_sample_data_init(&data, 0); |
673 | 719 | ||
@@ -683,6 +729,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
683 | 729 | ||
684 | loops = 0; | 730 | loops = 0; |
685 | again: | 731 | again: |
732 | intel_pmu_ack_status(status); | ||
686 | if (++loops > 100) { | 733 | if (++loops > 100) { |
687 | WARN_ONCE(1, "perfevents: irq loop stuck!\n"); | 734 | WARN_ONCE(1, "perfevents: irq loop stuck!\n"); |
688 | perf_event_print_debug(); | 735 | perf_event_print_debug(); |
@@ -691,19 +738,22 @@ again: | |||
691 | } | 738 | } |
692 | 739 | ||
693 | inc_irq_stat(apic_perf_irqs); | 740 | inc_irq_stat(apic_perf_irqs); |
694 | ack = status; | ||
695 | 741 | ||
696 | intel_pmu_lbr_read(); | 742 | intel_pmu_lbr_read(); |
697 | 743 | ||
698 | /* | 744 | /* |
699 | * PEBS overflow sets bit 62 in the global status register | 745 | * PEBS overflow sets bit 62 in the global status register |
700 | */ | 746 | */ |
701 | if (__test_and_clear_bit(62, (unsigned long *)&status)) | 747 | if (__test_and_clear_bit(62, (unsigned long *)&status)) { |
748 | handled++; | ||
702 | x86_pmu.drain_pebs(regs); | 749 | x86_pmu.drain_pebs(regs); |
750 | } | ||
703 | 751 | ||
704 | for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { | 752 | for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { |
705 | struct perf_event *event = cpuc->events[bit]; | 753 | struct perf_event *event = cpuc->events[bit]; |
706 | 754 | ||
755 | handled++; | ||
756 | |||
707 | if (!test_bit(bit, cpuc->active_mask)) | 757 | if (!test_bit(bit, cpuc->active_mask)) |
708 | continue; | 758 | continue; |
709 | 759 | ||
@@ -716,8 +766,6 @@ again: | |||
716 | x86_pmu_stop(event); | 766 | x86_pmu_stop(event); |
717 | } | 767 | } |
718 | 768 | ||
719 | intel_pmu_ack_status(ack); | ||
720 | |||
721 | /* | 769 | /* |
722 | * Repeat if there is more work to be done: | 770 | * Repeat if there is more work to be done: |
723 | */ | 771 | */ |
@@ -727,7 +775,7 @@ again: | |||
727 | 775 | ||
728 | done: | 776 | done: |
729 | intel_pmu_enable_all(0); | 777 | intel_pmu_enable_all(0); |
730 | return 1; | 778 | return handled; |
731 | } | 779 | } |
732 | 780 | ||
733 | static struct event_constraint * | 781 | static struct event_constraint * |
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae85d69644d1..249015173992 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c | |||
@@ -21,22 +21,36 @@ struct p4_event_bind { | |||
21 | char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ | 21 | char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ |
22 | }; | 22 | }; |
23 | 23 | ||
24 | struct p4_cache_event_bind { | 24 | struct p4_pebs_bind { |
25 | unsigned int metric_pebs; | 25 | unsigned int metric_pebs; |
26 | unsigned int metric_vert; | 26 | unsigned int metric_vert; |
27 | }; | 27 | }; |
28 | 28 | ||
29 | #define P4_GEN_CACHE_EVENT_BIND(name) \ | 29 | /* it sets P4_PEBS_ENABLE_UOP_TAG as well */ |
30 | [P4_CACHE__##name] = { \ | 30 | #define P4_GEN_PEBS_BIND(name, pebs, vert) \ |
31 | .metric_pebs = P4_PEBS__##name, \ | 31 | [P4_PEBS_METRIC__##name] = { \ |
32 | .metric_vert = P4_VERT__##name, \ | 32 | .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ |
33 | .metric_vert = vert, \ | ||
33 | } | 34 | } |
34 | 35 | ||
35 | static struct p4_cache_event_bind p4_cache_event_bind_map[] = { | 36 | /* |
36 | P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), | 37 | * note we have P4_PEBS_ENABLE_UOP_TAG always set here |
37 | P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), | 38 | * |
38 | P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), | 39 | * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of |
39 | P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), | 40 | * event configuration to find out which values are to be |
41 | * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT | ||
42 | * resgisters | ||
43 | */ | ||
44 | static struct p4_pebs_bind p4_pebs_bind_map[] = { | ||
45 | P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), | ||
46 | P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), | ||
47 | P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), | ||
48 | P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), | ||
49 | P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), | ||
50 | P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), | ||
51 | P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), | ||
52 | P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), | ||
53 | P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), | ||
40 | }; | 54 | }; |
41 | 55 | ||
42 | /* | 56 | /* |
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = { | |||
281 | }, | 295 | }, |
282 | }; | 296 | }; |
283 | 297 | ||
284 | #define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ | 298 | #define P4_GEN_CACHE_EVENT(event, bit, metric) \ |
285 | p4_config_pack_escr(P4_ESCR_EVENT(event) | \ | 299 | p4_config_pack_escr(P4_ESCR_EVENT(event) | \ |
286 | P4_ESCR_EMASK_BIT(event, bit)) | \ | 300 | P4_ESCR_EMASK_BIT(event, bit)) | \ |
287 | p4_config_pack_cccr(cache_event | \ | 301 | p4_config_pack_cccr(metric | \ |
288 | P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) | 302 | P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) |
289 | 303 | ||
290 | static __initconst const u64 p4_hw_cache_event_ids | 304 | static __initconst const u64 p4_hw_cache_event_ids |
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids | |||
296 | [ C(OP_READ) ] = { | 310 | [ C(OP_READ) ] = { |
297 | [ C(RESULT_ACCESS) ] = 0x0, | 311 | [ C(RESULT_ACCESS) ] = 0x0, |
298 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 312 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
299 | P4_CACHE__1stl_cache_load_miss_retired), | 313 | P4_PEBS_METRIC__1stl_cache_load_miss_retired), |
300 | }, | 314 | }, |
301 | }, | 315 | }, |
302 | [ C(LL ) ] = { | 316 | [ C(LL ) ] = { |
303 | [ C(OP_READ) ] = { | 317 | [ C(OP_READ) ] = { |
304 | [ C(RESULT_ACCESS) ] = 0x0, | 318 | [ C(RESULT_ACCESS) ] = 0x0, |
305 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 319 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
306 | P4_CACHE__2ndl_cache_load_miss_retired), | 320 | P4_PEBS_METRIC__2ndl_cache_load_miss_retired), |
307 | }, | 321 | }, |
308 | }, | 322 | }, |
309 | [ C(DTLB) ] = { | 323 | [ C(DTLB) ] = { |
310 | [ C(OP_READ) ] = { | 324 | [ C(OP_READ) ] = { |
311 | [ C(RESULT_ACCESS) ] = 0x0, | 325 | [ C(RESULT_ACCESS) ] = 0x0, |
312 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 326 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
313 | P4_CACHE__dtlb_load_miss_retired), | 327 | P4_PEBS_METRIC__dtlb_load_miss_retired), |
314 | }, | 328 | }, |
315 | [ C(OP_WRITE) ] = { | 329 | [ C(OP_WRITE) ] = { |
316 | [ C(RESULT_ACCESS) ] = 0x0, | 330 | [ C(RESULT_ACCESS) ] = 0x0, |
317 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | 331 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, |
318 | P4_CACHE__dtlb_store_miss_retired), | 332 | P4_PEBS_METRIC__dtlb_store_miss_retired), |
319 | }, | 333 | }, |
320 | }, | 334 | }, |
321 | [ C(ITLB) ] = { | 335 | [ C(ITLB) ] = { |
322 | [ C(OP_READ) ] = { | 336 | [ C(OP_READ) ] = { |
323 | [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, | 337 | [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, |
324 | P4_CACHE__itlb_reference_hit), | 338 | P4_PEBS_METRIC__none), |
325 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, | 339 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, |
326 | P4_CACHE__itlb_reference_miss), | 340 | P4_PEBS_METRIC__none), |
327 | }, | 341 | }, |
328 | [ C(OP_WRITE) ] = { | 342 | [ C(OP_WRITE) ] = { |
329 | [ C(RESULT_ACCESS) ] = -1, | 343 | [ C(RESULT_ACCESS) ] = -1, |
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event) | |||
414 | return config; | 428 | return config; |
415 | } | 429 | } |
416 | 430 | ||
431 | static int p4_validate_raw_event(struct perf_event *event) | ||
432 | { | ||
433 | unsigned int v; | ||
434 | |||
435 | /* user data may have an out-of-bounds event index */ | ||
436 | v = p4_config_unpack_event(event->attr.config); | ||
437 | if (v >= ARRAY_SIZE(p4_event_bind_map)) { | ||
438 | pr_warning("P4 PMU: Unknown event code: %d\n", v); | ||
439 | return -EINVAL; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * it may have some bogus PEBS bits | ||
444 | */ | ||
445 | if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { | ||
446 | pr_warning("P4 PMU: PEBS is not supported yet\n"); | ||
447 | return -EINVAL; | ||
448 | } | ||
449 | v = p4_config_unpack_metric(event->attr.config); | ||
450 | if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { | ||
451 | pr_warning("P4 PMU: Unknown metric code: %d\n", v); | ||
452 | return -EINVAL; | ||
453 | } | ||
454 | |||
455 | return 0; | ||
456 | } | ||
457 | |||
417 | static int p4_hw_config(struct perf_event *event) | 458 | static int p4_hw_config(struct perf_event *event) |
418 | { | 459 | { |
419 | int cpu = get_cpu(); | 460 | int cpu = get_cpu(); |
420 | int rc = 0; | 461 | int rc = 0; |
421 | unsigned int evnt; | ||
422 | u32 escr, cccr; | 462 | u32 escr, cccr; |
423 | 463 | ||
424 | /* | 464 | /* |
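p4_validate_raw_event() above centralizes the sanity checks that previously lived inline in p4_hw_config(). Schematically, a PERF_TYPE_RAW config is rejected in exactly three cases:

/*
 * rejected with -EINVAL when any of these hold:
 *
 *	p4_config_unpack_event(config)  >= ARRAY_SIZE(p4_event_bind_map)
 *	p4_config_pebs_has(config, P4_PEBS_CONFIG_ENABLE)
 *	p4_config_unpack_metric(config) >= ARRAY_SIZE(p4_pebs_bind_map)
 *
 * i.e. the event index must name a known event, architectural PEBS
 * must stay off, and the metric index must name a known PEBS metric
 * (P4_PEBS_METRIC__none included).
 */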
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event) | |||
438 | 478 | ||
439 | if (event->attr.type == PERF_TYPE_RAW) { | 479 | if (event->attr.type == PERF_TYPE_RAW) { |
440 | 480 | ||
441 | /* user data may have out-of-bound event index */ | 481 | rc = p4_validate_raw_event(event); |
442 | evnt = p4_config_unpack_event(event->attr.config); | 482 | if (rc) |
443 | if (evnt >= ARRAY_SIZE(p4_event_bind_map)) { | ||
444 | rc = -EINVAL; | ||
445 | goto out; | 483 | goto out; |
446 | } | ||
447 | 484 | ||
448 | /* | 485 | /* |
449 | * We don't control raw events so it's up to the caller | 486 | * We don't control raw events so it's up to the caller |
@@ -451,12 +488,17 @@ static int p4_hw_config(struct perf_event *event) | |||
451 | * on HT machine but allow HT-compatible specifics to be | 488 | * on HT machine but allow HT-compatible specifics to be |
452 | * passed on) | 489 | * passed on) |
453 | * | 490 | * |
491 | * Note that for RAW events we allow the user to use P4_CCCR_RESERVED | ||
492 | * bits, since we keep additional info there (for cache events etc.) | ||
493 | * | ||
454 | * XXX: HT wide things should check perf_paranoid_cpu() && | 494 | * XXX: HT wide things should check perf_paranoid_cpu() && |
455 | * CAP_SYS_ADMIN | 495 | * CAP_SYS_ADMIN |
456 | */ | 496 | */ |
457 | event->hw.config |= event->attr.config & | 497 | event->hw.config |= event->attr.config & |
458 | (p4_config_pack_escr(P4_ESCR_MASK_HT) | | 498 | (p4_config_pack_escr(P4_ESCR_MASK_HT) | |
459 | p4_config_pack_cccr(P4_CCCR_MASK_HT)); | 499 | p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); |
500 | |||
501 | event->hw.config &= ~P4_CCCR_FORCE_OVF; | ||
460 | } | 502 | } |
461 | 503 | ||
462 | rc = x86_setup_perfctr(event); | 504 | rc = x86_setup_perfctr(event); |
@@ -482,6 +524,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) | |||
482 | return overflow; | 524 | return overflow; |
483 | } | 525 | } |
484 | 526 | ||
527 | static void p4_pmu_disable_pebs(void) | ||
528 | { | ||
529 | /* | ||
530 | * FIXME | ||
531 | * | ||
532 | * It's still allowed that two threads setup same cache | ||
533 | * events so we can't simply clear metrics until we knew | ||
534 | * noone is depending on us, so we need kind of counter | ||
535 | * for "ReplayEvent" users. | ||
536 | * | ||
537 | * What is more complex -- RAW events, if user (for some | ||
538 | * reason) will pass some cache event metric with improper | ||
539 | * event opcode -- it's fine from hardware point of view | ||
540 | * but completely nonsence from "meaning" of such action. | ||
541 | * | ||
542 | * So at moment let leave metrics turned on forever -- it's | ||
543 | * ok for now but need to be revisited! | ||
544 | * | ||
545 | * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); | ||
546 | * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); | ||
547 | */ | ||
548 | } | ||
549 | |||
485 | static inline void p4_pmu_disable_event(struct perf_event *event) | 550 | static inline void p4_pmu_disable_event(struct perf_event *event) |
486 | { | 551 | { |
487 | struct hw_perf_event *hwc = &event->hw; | 552 | struct hw_perf_event *hwc = &event->hw; |
@@ -507,6 +572,26 @@ static void p4_pmu_disable_all(void) | |||
507 | continue; | 572 | continue; |
508 | p4_pmu_disable_event(event); | 573 | p4_pmu_disable_event(event); |
509 | } | 574 | } |
575 | |||
576 | p4_pmu_disable_pebs(); | ||
577 | } | ||
578 | |||
579 | /* configuration must be valid */ | ||
580 | static void p4_pmu_enable_pebs(u64 config) | ||
581 | { | ||
582 | struct p4_pebs_bind *bind; | ||
583 | unsigned int idx; | ||
584 | |||
585 | BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); | ||
586 | |||
587 | idx = p4_config_unpack_metric(config); | ||
588 | if (idx == P4_PEBS_METRIC__none) | ||
589 | return; | ||
590 | |||
591 | bind = &p4_pebs_bind_map[idx]; | ||
592 | |||
593 | (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); | ||
594 | (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); | ||
510 | } | 595 | } |
511 | 596 | ||
512 | static void p4_pmu_enable_event(struct perf_event *event) | 597 | static void p4_pmu_enable_event(struct perf_event *event) |
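Tracing p4_pmu_enable_pebs() for a single metric makes the MSR plumbing concrete; a worked example using the bind-map values introduced earlier in this patch:

/*
 * suppose hwc->config carries P4_PEBS_METRIC__dtlb_all_miss_retired:
 *
 *	idx  = p4_config_unpack_metric(config);
 *	bind = &p4_pebs_bind_map[idx];
 *
 * so the two writes become
 *
 *	MSR_IA32_PEBS_ENABLE    <- 0x0000004 | P4_PEBS_ENABLE_UOP_TAG
 *	MSR_P4_PEBS_MATRIX_VERT <- 0x0000003
 *
 * while P4_PEBS_METRIC__none returns early and writes neither MSR.
 */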
@@ -515,9 +600,7 @@ static void p4_pmu_enable_event(struct perf_event *event) | |||
515 | int thread = p4_ht_config_thread(hwc->config); | 600 | int thread = p4_ht_config_thread(hwc->config); |
516 | u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); | 601 | u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); |
517 | unsigned int idx = p4_config_unpack_event(hwc->config); | 602 | unsigned int idx = p4_config_unpack_event(hwc->config); |
518 | unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config); | ||
519 | struct p4_event_bind *bind; | 603 | struct p4_event_bind *bind; |
520 | struct p4_cache_event_bind *bind_cache; | ||
521 | u64 escr_addr, cccr; | 604 | u64 escr_addr, cccr; |
522 | 605 | ||
523 | bind = &p4_event_bind_map[idx]; | 606 | bind = &p4_event_bind_map[idx]; |
@@ -537,16 +620,10 @@ static void p4_pmu_enable_event(struct perf_event *event) | |||
537 | cccr = p4_config_unpack_cccr(hwc->config); | 620 | cccr = p4_config_unpack_cccr(hwc->config); |
538 | 621 | ||
539 | /* | 622 | /* |
540 | * it could be Cache event so that we need to | 623 | * it could be a cache event, so we need to write the |
541 | * set metrics into additional MSRs | 624 | * metrics into additional MSRs |
542 | */ | 625 | */ |
543 | BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); | 626 | p4_pmu_enable_pebs(hwc->config); |
544 | if (idx_cache > P4_CACHE__NONE && | ||
545 | idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) { | ||
546 | bind_cache = &p4_cache_event_bind_map[idx_cache]; | ||
547 | (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs); | ||
548 | (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert); | ||
549 | } | ||
550 | 627 | ||
551 | (void)checking_wrmsrl(escr_addr, escr_conf); | 628 | (void)checking_wrmsrl(escr_addr, escr_conf); |
552 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, | 629 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, |
@@ -581,9 +658,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) | |||
581 | cpuc = &__get_cpu_var(cpu_hw_events); | 658 | cpuc = &__get_cpu_var(cpu_hw_events); |
582 | 659 | ||
583 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 660 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
661 | int overflow; | ||
584 | 662 | ||
585 | if (!test_bit(idx, cpuc->active_mask)) | 663 | if (!test_bit(idx, cpuc->active_mask)) { |
664 | /* catch in-flight IRQs */ | ||
665 | if (__test_and_clear_bit(idx, cpuc->running)) | ||
666 | handled++; | ||
586 | continue; | 667 | continue; |
668 | } | ||
587 | 669 | ||
588 | event = cpuc->events[idx]; | 670 | event = cpuc->events[idx]; |
589 | hwc = &event->hw; | 671 | hwc = &event->hw; |
@@ -591,12 +673,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) | |||
591 | WARN_ON_ONCE(hwc->idx != idx); | 673 | WARN_ON_ONCE(hwc->idx != idx); |
592 | 674 | ||
593 | /* it might be unflagged overflow */ | 675 | /* it might be unflagged overflow */ |
594 | handled = p4_pmu_clear_cccr_ovf(hwc); | 676 | overflow = p4_pmu_clear_cccr_ovf(hwc); |
595 | 677 | ||
596 | val = x86_perf_event_update(event); | 678 | val = x86_perf_event_update(event); |
597 | if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) | 679 | if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) |
598 | continue; | 680 | continue; |
599 | 681 | ||
682 | handled += overflow; | ||
683 | |||
600 | /* event overflow for sure */ | 684 | /* event overflow for sure */ |
601 | data.period = event->hw.last_period; | 685 | data.period = event->hw.last_period; |
602 | 686 | ||
@@ -829,6 +913,15 @@ static __initconst const struct x86_pmu p4_pmu = { | |||
829 | .max_period = (1ULL << 39) - 1, | 913 | .max_period = (1ULL << 39) - 1, |
830 | .hw_config = p4_hw_config, | 914 | .hw_config = p4_hw_config, |
831 | .schedule_events = p4_pmu_schedule_events, | 915 | .schedule_events = p4_pmu_schedule_events, |
916 | /* | ||
917 | * This handles erratum N15 in Intel doc 249199-029: | ||
918 | * the counter may not be updated correctly on a write, | ||
919 | * so we need a second write operation to do the trick | ||
920 | * (the official workaround didn't work). | ||
921 | * | ||
922 | * This idea is taken from the OProfile code. | ||
923 | */ | ||
924 | .perfctr_second_write = 1, | ||
832 | }; | 925 | }; |
833 | 926 | ||
834 | static __init int p4_pmu_init(void) | 927 | static __init int p4_pmu_init(void) |
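The perfctr_second_write flag is consumed by the generic x86 perf code when a counter is (re)programmed; that consumer is outside this file, but a sketch of the idea, assuming the usual shape of x86_perf_event_set_period(), looks like this:

/* sketch of the generic consumer (not part of this diff) */
wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);

/*
 * on P4 the write above may be lost (erratum N15), so the flag
 * requests that the value simply be written a second time:
 */
if (x86_pmu.perfctr_second_write) {
	wrmsrl(hwc->event_base + idx,
	       (u64)(-left) & x86_pmu.cntval_mask);
}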
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000000..d49079515122 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c | |||
@@ -0,0 +1,64 @@ | |||
1 | /* | ||
2 | * Routines to identify additional cpu features that are scattered in | ||
3 | * cpuid space. | ||
4 | */ | ||
5 | #include <linux/cpu.h> | ||
6 | |||
7 | #include <asm/pat.h> | ||
8 | #include <asm/processor.h> | ||
9 | |||
10 | #include <asm/apic.h> | ||
11 | |||
12 | struct cpuid_bit { | ||
13 | u16 feature; | ||
14 | u8 reg; | ||
15 | u8 bit; | ||
16 | u32 level; | ||
17 | u32 sub_leaf; | ||
18 | }; | ||
19 | |||
20 | enum cpuid_regs { | ||
21 | CR_EAX = 0, | ||
22 | CR_ECX, | ||
23 | CR_EDX, | ||
24 | CR_EBX | ||
25 | }; | ||
26 | |||
27 | void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | ||
28 | { | ||
29 | u32 max_level; | ||
30 | u32 regs[4]; | ||
31 | const struct cpuid_bit *cb; | ||
32 | |||
33 | static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { | ||
34 | { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, | ||
35 | { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, | ||
36 | { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, | ||
37 | { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, | ||
38 | { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, | ||
39 | { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, | ||
40 | { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, | ||
41 | { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, | ||
42 | { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, | ||
43 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, | ||
44 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, | ||
45 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, | ||
46 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, | ||
47 | { 0, 0, 0, 0, 0 } | ||
48 | }; | ||
49 | |||
50 | for (cb = cpuid_bits; cb->feature; cb++) { | ||
51 | |||
52 | /* Verify that the level is valid */ | ||
53 | max_level = cpuid_eax(cb->level & 0xffff0000); | ||
54 | if (max_level < cb->level || | ||
55 | max_level > (cb->level | 0xffff)) | ||
56 | continue; | ||
57 | |||
58 | cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], | ||
59 | ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); | ||
60 | |||
61 | if (regs[cb->reg] & (1 << cb->bit)) | ||
62 | set_cpu_cap(c, cb->feature); | ||
63 | } | ||
64 | } | ||
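The level check deserves a worked example, since it has to cope with both the standard and the extended cpuid ranges. Taking the X86_FEATURE_CPB entry from the table above:

/*
 * for { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }:
 *
 *	cb->level & 0xffff0000 == 0x80000000
 *
 * so max_level is the highest extended leaf the CPU advertises.
 * The entry is skipped unless
 *
 *	0x80000007 <= max_level <= 0x8000ffff
 *
 * otherwise cpuid_count(0x80000007, 0, ...) runs and EDX bit 9
 * sets X86_FEATURE_CPB.  The new sub_leaf field is what lets the
 * XSAVEOPT entry query leaf 0xd, sub-leaf 1, which the old plain
 * cpuid() call could not express.
 */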
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c index 10fa5684a662..4397e987a1cf 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/topology.c | |||
@@ -1,62 +1,14 @@ | |||
1 | /* | 1 | /* |
2 | * Routines to identify additional cpu features that are scattered in | 2 | * Check for extended topology enumeration cpuid leaf 0xb and if it |
3 | * cpuid space. | 3 | * exists, use it for populating initial_apicid and cpu topology |
4 | * detection. | ||
4 | */ | 5 | */ |
5 | #include <linux/cpu.h> | ||
6 | 6 | ||
7 | #include <linux/cpu.h> | ||
8 | #include <asm/apic.h> | ||
7 | #include <asm/pat.h> | 9 | #include <asm/pat.h> |
8 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
9 | 11 | ||
10 | #include <asm/apic.h> | ||
11 | |||
12 | struct cpuid_bit { | ||
13 | u16 feature; | ||
14 | u8 reg; | ||
15 | u8 bit; | ||
16 | u32 level; | ||
17 | }; | ||
18 | |||
19 | enum cpuid_regs { | ||
20 | CR_EAX = 0, | ||
21 | CR_ECX, | ||
22 | CR_EDX, | ||
23 | CR_EBX | ||
24 | }; | ||
25 | |||
26 | void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | ||
27 | { | ||
28 | u32 max_level; | ||
29 | u32 regs[4]; | ||
30 | const struct cpuid_bit *cb; | ||
31 | |||
32 | static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { | ||
33 | { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, | ||
34 | { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, | ||
35 | { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, | ||
36 | { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, | ||
37 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, | ||
38 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, | ||
39 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, | ||
40 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, | ||
41 | { 0, 0, 0, 0 } | ||
42 | }; | ||
43 | |||
44 | for (cb = cpuid_bits; cb->feature; cb++) { | ||
45 | |||
46 | /* Verify that the level is valid */ | ||
47 | max_level = cpuid_eax(cb->level & 0xffff0000); | ||
48 | if (max_level < cb->level || | ||
49 | max_level > (cb->level | 0xffff)) | ||
50 | continue; | ||
51 | |||
52 | cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], | ||
53 | ®s[CR_ECX], ®s[CR_EDX]); | ||
54 | |||
55 | if (regs[cb->reg] & (1 << cb->bit)) | ||
56 | set_cpu_cap(c, cb->feature); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | /* leaf 0xb SMT level */ | 12 | /* leaf 0xb SMT level */ |
61 | #define SMT_LEVEL 0 | 13 | #define SMT_LEVEL 0 |
62 | 14 | ||
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff588445..227b0448960d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c | |||
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void) | |||
51 | 51 | ||
52 | static unsigned long vmware_get_tsc_khz(void) | 52 | static unsigned long vmware_get_tsc_khz(void) |
53 | { | 53 | { |
54 | uint64_t tsc_hz; | 54 | uint64_t tsc_hz, lpj; |
55 | uint32_t eax, ebx, ecx, edx; | 55 | uint32_t eax, ebx, ecx, edx; |
56 | 56 | ||
57 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); | 57 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); |
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void) | |||
62 | printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", | 62 | printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", |
63 | (unsigned long) tsc_hz / 1000, | 63 | (unsigned long) tsc_hz / 1000, |
64 | (unsigned long) tsc_hz % 1000); | 64 | (unsigned long) tsc_hz % 1000); |
65 | |||
66 | if (!preset_lpj) { | ||
67 | lpj = ((u64)tsc_hz * 1000); | ||
68 | do_div(lpj, HZ); | ||
69 | preset_lpj = lpj; | ||
70 | } | ||
71 | |||
65 | return tsc_hz; | 72 | return tsc_hz; |
66 | } | 73 | } |
67 | 74 | ||
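The preset_lpj seeding is straightforward proportional arithmetic: tsc_hz here is in kHz, so multiplying by 1000 gives cycles per second and dividing by HZ gives loops per jiffy. A worked example with illustrative numbers:

/*
 * with the hypervisor reporting 2494200 kHz and HZ == 1000:
 *
 *	lpj = (u64)2494200 * 1000;	/* 2494200000 cycles/sec */
 *	do_div(lpj, 1000);		/* lpj == 2494200 */
 *	preset_lpj = lpj;
 *
 * so calibrate_delay() can skip the delay-loop measurement,
 * which is unreliable when running under a hypervisor.
 */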
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ebd4c51d096a..764c7c2b1811 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -28,6 +28,8 @@ | |||
28 | #include <asm/reboot.h> | 28 | #include <asm/reboot.h> |
29 | #include <asm/virtext.h> | 29 | #include <asm/virtext.h> |
30 | 30 | ||
31 | int in_crash_kexec; | ||
32 | |||
31 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) | 33 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) |
32 | 34 | ||
33 | static void kdump_nmi_callback(int cpu, struct die_args *args) | 35 | static void kdump_nmi_callback(int cpu, struct die_args *args) |
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) | |||
61 | 63 | ||
62 | static void kdump_nmi_shootdown_cpus(void) | 64 | static void kdump_nmi_shootdown_cpus(void) |
63 | { | 65 | { |
66 | in_crash_kexec = 1; | ||
64 | nmi_shootdown_cpus(kdump_nmi_callback); | 67 | nmi_shootdown_cpus(kdump_nmi_callback); |
65 | 68 | ||
66 | disable_local_APIC(); | 69 | disable_local_APIC(); |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c89a386930b7..6e8752c1bd52 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -18,7 +18,6 @@ | |||
18 | 18 | ||
19 | #include <asm/stacktrace.h> | 19 | #include <asm/stacktrace.h> |
20 | 20 | ||
21 | #include "dumpstack.h" | ||
22 | 21 | ||
23 | int panic_on_unrecovered_nmi; | 22 | int panic_on_unrecovered_nmi; |
24 | int panic_on_io_nmi; | 23 | int panic_on_io_nmi; |
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd44..000000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null | |||
@@ -1,56 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | |||
6 | #ifndef DUMPSTACK_H | ||
7 | #define DUMPSTACK_H | ||
8 | |||
9 | #ifdef CONFIG_X86_32 | ||
10 | #define STACKSLOTS_PER_LINE 8 | ||
11 | #define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) | ||
12 | #else | ||
13 | #define STACKSLOTS_PER_LINE 4 | ||
14 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) | ||
15 | #endif | ||
16 | |||
17 | #include <linux/uaccess.h> | ||
18 | |||
19 | extern void | ||
20 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
21 | unsigned long *stack, unsigned long bp, char *log_lvl); | ||
22 | |||
23 | extern void | ||
24 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
25 | unsigned long *sp, unsigned long bp, char *log_lvl); | ||
26 | |||
27 | extern unsigned int code_bytes; | ||
28 | |||
29 | /* The form of the top of the frame on the stack */ | ||
30 | struct stack_frame { | ||
31 | struct stack_frame *next_frame; | ||
32 | unsigned long return_address; | ||
33 | }; | ||
34 | |||
35 | struct stack_frame_ia32 { | ||
36 | u32 next_frame; | ||
37 | u32 return_address; | ||
38 | }; | ||
39 | |||
40 | static inline unsigned long rewind_frame_pointer(int n) | ||
41 | { | ||
42 | struct stack_frame *frame; | ||
43 | |||
44 | get_bp(frame); | ||
45 | |||
46 | #ifdef CONFIG_FRAME_POINTER | ||
47 | while (n--) { | ||
48 | if (probe_kernel_address(&frame->next_frame, frame)) | ||
49 | break; | ||
50 | } | ||
51 | #endif | ||
52 | |||
53 | return (unsigned long)frame; | ||
54 | } | ||
55 | |||
56 | #endif /* DUMPSTACK_H */ | ||
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d93..0f6376ffa2d9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -16,8 +16,6 @@ | |||
16 | 16 | ||
17 | #include <asm/stacktrace.h> | 17 | #include <asm/stacktrace.h> |
18 | 18 | ||
19 | #include "dumpstack.h" | ||
20 | |||
21 | 19 | ||
22 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | 20 | void dump_trace(struct task_struct *task, struct pt_regs *regs, |
23 | unsigned long *stack, unsigned long bp, | 21 | unsigned long *stack, unsigned long bp, |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f3..57a21f11c791 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -16,7 +16,6 @@ | |||
16 | 16 | ||
17 | #include <asm/stacktrace.h> | 17 | #include <asm/stacktrace.h> |
18 | 18 | ||
19 | #include "dumpstack.h" | ||
20 | 19 | ||
21 | #define N_EXCEPTION_STACKS_END \ | 20 | #define N_EXCEPTION_STACKS_END \ |
22 | (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) | 21 | (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) |
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index e5cc7e82e60d..ebdb85cf2686 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <asm/apic.h> | 18 | #include <asm/apic.h> |
19 | #include <asm/iommu.h> | 19 | #include <asm/iommu.h> |
20 | #include <asm/gart.h> | 20 | #include <asm/gart.h> |
21 | #include <asm/hpet.h> | ||
22 | 21 | ||
23 | static void __init fix_hypertransport_config(int num, int slot, int func) | 22 | static void __init fix_hypertransport_config(int num, int slot, int func) |
24 | { | 23 | { |
@@ -192,21 +191,6 @@ static void __init ati_bugs_contd(int num, int slot, int func) | |||
192 | } | 191 | } |
193 | #endif | 192 | #endif |
194 | 193 | ||
195 | /* | ||
196 | * Force the read back of the CMP register in hpet_next_event() | ||
197 | * to work around the problem that the CMP register write seems to be | ||
198 | * delayed. See hpet_next_event() for details. | ||
199 | * | ||
200 | * We do this on all SMBUS incarnations for now until we have more | ||
201 | * information about the affected chipsets. | ||
202 | */ | ||
203 | static void __init ati_hpet_bugs(int num, int slot, int func) | ||
204 | { | ||
205 | #ifdef CONFIG_HPET_TIMER | ||
206 | hpet_readback_cmp = 1; | ||
207 | #endif | ||
208 | } | ||
209 | |||
210 | #define QFLAG_APPLY_ONCE 0x1 | 194 | #define QFLAG_APPLY_ONCE 0x1 |
211 | #define QFLAG_APPLIED 0x2 | 195 | #define QFLAG_APPLIED 0x2 |
212 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) | 196 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) |
@@ -236,8 +220,6 @@ static struct chipset early_qrk[] __initdata = { | |||
236 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, | 220 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, |
237 | { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, | 221 | { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, |
238 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, | 222 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, |
239 | { PCI_VENDOR_ID_ATI, PCI_ANY_ID, | ||
240 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs }, | ||
241 | {} | 223 | {} |
242 | }; | 224 | }; |
243 | 225 | ||
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..227d00920d2f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -611,14 +611,14 @@ ldt_ss: | |||
611 | * compensating for the offset by changing to the ESPFIX segment with | 611 | * compensating for the offset by changing to the ESPFIX segment with |
612 | * a base address that matches for the difference. | 612 | * a base address that matches for the difference. |
613 | */ | 613 | */ |
614 | #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) | ||
614 | mov %esp, %edx /* load kernel esp */ | 615 | mov %esp, %edx /* load kernel esp */ |
615 | mov PT_OLDESP(%esp), %eax /* load userspace esp */ | 616 | mov PT_OLDESP(%esp), %eax /* load userspace esp */ |
616 | mov %dx, %ax /* eax: new kernel esp */ | 617 | mov %dx, %ax /* eax: new kernel esp */ |
617 | sub %eax, %edx /* offset (low word is 0) */ | 618 | sub %eax, %edx /* offset (low word is 0) */ |
618 | PER_CPU(gdt_page, %ebx) | ||
619 | shr $16, %edx | 619 | shr $16, %edx |
620 | mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ | 620 | mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ |
621 | mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ | 621 | mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ |
622 | pushl $__ESPFIX_SS | 622 | pushl $__ESPFIX_SS |
623 | CFI_ADJUST_CFA_OFFSET 4 | 623 | CFI_ADJUST_CFA_OFFSET 4 |
624 | push %eax /* new kernel esp */ | 624 | push %eax /* new kernel esp */ |
@@ -791,9 +791,8 @@ ptregs_clone: | |||
791 | * normal stack and adjusts ESP with the matching offset. | 791 | * normal stack and adjusts ESP with the matching offset. |
792 | */ | 792 | */ |
793 | /* fixup the stack */ | 793 | /* fixup the stack */ |
794 | PER_CPU(gdt_page, %ebx) | 794 | mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ |
795 | mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ | 795 | mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ |
796 | mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ | ||
797 | shl $16, %eax | 796 | shl $16, %eax |
798 | addl %esp, %eax /* the adjusted stack pointer */ | 797 | addl %esp, %eax /* the adjusted stack pointer */ |
799 | pushl $__KERNEL_DS | 798 | pushl $__KERNEL_DS |
@@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error) | |||
914 | .balign 4 | 913 | .balign 4 |
915 | .long 661b | 914 | .long 661b |
916 | .long 663f | 915 | .long 663f |
917 | .byte X86_FEATURE_XMM | 916 | .word X86_FEATURE_XMM |
918 | .byte 662b-661b | 917 | .byte 662b-661b |
919 | .byte 664f-663f | 918 | .byte 664f-663f |
920 | .previous | 919 | .previous |
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback) | |||
1166 | .previous | 1165 | .previous |
1167 | ENDPROC(xen_failsafe_callback) | 1166 | ENDPROC(xen_failsafe_callback) |
1168 | 1167 | ||
1168 | BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, | ||
1169 | xen_evtchn_do_upcall) | ||
1170 | |||
1169 | #endif /* CONFIG_XEN */ | 1171 | #endif /* CONFIG_XEN */ |
1170 | 1172 | ||
1171 | #ifdef CONFIG_FUNCTION_TRACER | 1173 | #ifdef CONFIG_FUNCTION_TRACER |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d12ffa..17be5ec7cbba 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -1065,6 +1065,7 @@ ENTRY(\sym) | |||
1065 | END(\sym) | 1065 | END(\sym) |
1066 | .endm | 1066 | .endm |
1067 | 1067 | ||
1068 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) | ||
1068 | .macro paranoidzeroentry_ist sym do_sym ist | 1069 | .macro paranoidzeroentry_ist sym do_sym ist |
1069 | ENTRY(\sym) | 1070 | ENTRY(\sym) |
1070 | INTR_FRAME | 1071 | INTR_FRAME |
@@ -1076,10 +1077,9 @@ ENTRY(\sym) | |||
1076 | TRACE_IRQS_OFF | 1077 | TRACE_IRQS_OFF |
1077 | movq %rsp,%rdi /* pt_regs pointer */ | 1078 | movq %rsp,%rdi /* pt_regs pointer */ |
1078 | xorl %esi,%esi /* no error code */ | 1079 | xorl %esi,%esi /* no error code */ |
1079 | PER_CPU(init_tss, %r12) | 1080 | subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) |
1080 | subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) | ||
1081 | call \do_sym | 1081 | call \do_sym |
1082 | addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) | 1082 | addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) |
1083 | jmp paranoid_exit /* %ebx: no swapgs flag */ | 1083 | jmp paranoid_exit /* %ebx: no swapgs flag */ |
1084 | CFI_ENDPROC | 1084 | CFI_ENDPROC |
1085 | END(\sym) | 1085 | END(\sym) |
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper) | |||
1185 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. | 1185 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. |
1186 | * | 1186 | * |
1187 | * C extern interface: | 1187 | * C extern interface: |
1188 | * extern long execve(char *name, char **argv, char **envp) | 1188 | * extern long execve(const char *name, char **argv, char **envp) |
1189 | * | 1189 | * |
1190 | * asm input arguments: | 1190 | * asm input arguments: |
1191 | * rdi: name, rsi: argv, rdx: envp | 1191 | * rdi: name, rsi: argv, rdx: envp |
1192 | * | 1192 | * |
1193 | * We want to fall back into: | 1193 | * We want to fall back into: |
1194 | * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) | 1194 | * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) |
1195 | * | 1195 | * |
1196 | * do_sys_execve asm fallback arguments: | 1196 | * do_sys_execve asm fallback arguments: |
1197 | * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack | 1197 | * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack |
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) | |||
1329 | CFI_ENDPROC | 1329 | CFI_ENDPROC |
1330 | END(xen_failsafe_callback) | 1330 | END(xen_failsafe_callback) |
1331 | 1331 | ||
1332 | apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ | ||
1333 | xen_hvm_callback_vector xen_evtchn_do_upcall | ||
1334 | |||
1332 | #endif /* CONFIG_XEN */ | 1335 | #endif /* CONFIG_XEN */ |
1333 | 1336 | ||
1334 | /* | 1337 | /* |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d85..fa8c1b8e09fb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -131,6 +131,12 @@ ENTRY(startup_32) | |||
131 | movsl | 131 | movsl |
132 | 1: | 132 | 1: |
133 | 133 | ||
134 | #ifdef CONFIG_OLPC_OPENFIRMWARE | ||
135 | /* save OFW's pgdir table for later use when calling into OFW */ | ||
136 | movl %cr3, %eax | ||
137 | movl %eax, pa(olpc_ofw_pgd) | ||
138 | #endif | ||
139 | |||
134 | #ifdef CONFIG_PARAVIRT | 140 | #ifdef CONFIG_PARAVIRT |
135 | /* This can only trip for a broken bootloader... */ | 141 |
136 | cmpw $0x207, pa(boot_params + BP_version) | 142 | cmpw $0x207, pa(boot_params + BP_version) |
@@ -328,7 +334,7 @@ ENTRY(startup_32_smp) | |||
328 | /* | 334 | /* |
329 | * Enable paging | 335 | * Enable paging |
330 | */ | 336 | */ |
331 | movl $pa(swapper_pg_dir),%eax | 337 | movl pa(initial_page_table), %eax |
332 | movl %eax,%cr3 /* set the page table pointer.. */ | 338 | movl %eax,%cr3 /* set the page table pointer.. */ |
333 | movl %cr0,%eax | 339 | movl %cr0,%eax |
334 | orl $X86_CR0_PG,%eax | 340 | orl $X86_CR0_PG,%eax |
@@ -608,6 +614,8 @@ ignore_int: | |||
608 | .align 4 | 614 | .align 4 |
609 | ENTRY(initial_code) | 615 | ENTRY(initial_code) |
610 | .long i386_start_kernel | 616 | .long i386_start_kernel |
617 | ENTRY(initial_page_table) | ||
618 | .long pa(swapper_pg_dir) | ||
611 | 619 | ||
612 | /* | 620 | /* |
613 | * BSS section | 621 | * BSS section |
@@ -623,6 +631,10 @@ ENTRY(swapper_pg_dir) | |||
623 | #endif | 631 | #endif |
624 | swapper_pg_fixmap: | 632 | swapper_pg_fixmap: |
625 | .fill 1024,4,0 | 633 | .fill 1024,4,0 |
634 | #ifdef CONFIG_X86_TRAMPOLINE | ||
635 | ENTRY(trampoline_pg_dir) | ||
636 | .fill 1024,4,0 | ||
637 | #endif | ||
626 | ENTRY(empty_zero_page) | 638 | ENTRY(empty_zero_page) |
627 | .fill 4096,1,0 | 639 | .fill 4096,1,0 |
628 | 640 | ||
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a6..239046bd447f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) | |||
234 | * init data section till per cpu areas are set up. | 234 | * init data section till per cpu areas are set up. |
235 | */ | 235 | */ |
236 | movl $MSR_GS_BASE,%ecx | 236 | movl $MSR_GS_BASE,%ecx |
237 | movq initial_gs(%rip),%rax | 237 | movl initial_gs(%rip),%eax |
238 | movq %rax,%rdx | 238 | movl initial_gs+4(%rip),%edx |
239 | shrq $32,%rdx | ||
240 | wrmsr | 239 | wrmsr |
241 | 240 | ||
242 | /* esi is pointer to real mode structure with interesting info. | 241 | /* esi is pointer to real mode structure with interesting info. |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d731175..7494999141b3 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <asm/hpet.h> | 16 | #include <asm/hpet.h> |
17 | 17 | ||
18 | #define HPET_MASK CLOCKSOURCE_MASK(32) | 18 | #define HPET_MASK CLOCKSOURCE_MASK(32) |
19 | #define HPET_SHIFT 22 | ||
20 | 19 | ||
21 | /* FSEC = 10^-15 | 20 | /* FSEC = 10^-15 |
22 | NSEC = 10^-9 */ | 21 | NSEC = 10^-9 */ |
@@ -36,7 +35,6 @@ | |||
36 | unsigned long hpet_address; | 35 | unsigned long hpet_address; |
37 | u8 hpet_blockid; /* OS timer block num */ | 36 | u8 hpet_blockid; /* OS timer block num */ |
38 | u8 hpet_msi_disable; | 37 | u8 hpet_msi_disable; |
39 | u8 hpet_readback_cmp; | ||
40 | 38 | ||
41 | #ifdef CONFIG_PCI_MSI | 39 | #ifdef CONFIG_PCI_MSI |
42 | static unsigned long hpet_num_timers; | 40 | static unsigned long hpet_num_timers; |
@@ -396,23 +394,27 @@ static int hpet_next_event(unsigned long delta, | |||
396 | * at that point and we would wait for the next hpet interrupt | 394 | * at that point and we would wait for the next hpet interrupt |
397 | * forever. We found out that reading the CMP register back | 395 | * forever. We found out that reading the CMP register back |
398 | * forces the transfer so we can rely on the comparison with | 396 | * forces the transfer so we can rely on the comparison with |
399 | * the counter register below. | 397 | * the counter register below. If the read back from the |
398 | * compare register does not match the value we programmed, | ||
399 | * then we might have a real hardware problem. We cannot do | ||
400 | * much about it here, but at least alert the user/admin with | ||
401 | * a prominent warning. | ||
400 | * | 402 | * |
401 | * That works fine on those ATI chipsets, but on newer Intel | 403 | * An erratum on some chipsets (ICH9, ...) causes a comparator |
402 | * chipsets (ICH9...) this triggers due to an erratum: Reading | 404 | * read immediately following a write to return the old value. |
403 | * the comparator immediately following a write is returning | 405 | * The workaround is to read the value a second time when the |
404 | * the old value. | 406 | * first read returns the old value. |
405 | * | 407 | * |
406 | * We restrict the read back to the affected ATI chipsets (set | 408 | * In fact the write to the comparator register is delayed by |
407 | * by quirks) and also run it with hpet=verbose for debugging | 409 | * up to two HPET cycles, so our attempt to restrict the |
408 | * purposes. | 410 | * readback to the ATI chipsets known to be borked failed |
411 | * miserably. So we give up on optimizations forever and | ||
412 | * penalize all HPET incarnations unconditionally. | ||
409 | */ | 413 | */ |
410 | if (hpet_readback_cmp || hpet_verbose) { | 414 | if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { |
411 | u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); | 415 | if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) |
412 | |||
413 | if (cmp != cnt) | ||
414 | printk_once(KERN_WARNING | 416 | printk_once(KERN_WARNING |
415 | "hpet: compare register read back failed.\n"); | 417 | "hpet: compare register read back failed.\n"); |
416 | } | 418 | } |
417 | 419 | ||
418 | return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; | 420 | return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; |
@@ -504,7 +506,7 @@ static int hpet_assign_irq(struct hpet_dev *dev) | |||
504 | { | 506 | { |
505 | unsigned int irq; | 507 | unsigned int irq; |
506 | 508 | ||
507 | irq = create_irq(); | 509 | irq = create_irq_nr(0, -1); |
508 | if (!irq) | 510 | if (!irq) |
509 | return -EINVAL; | 511 | return -EINVAL; |
510 | 512 | ||
@@ -583,7 +585,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) | |||
583 | * scaled math multiplication factor for nanosecond to hpet tick | 585 | * scaled math multiplication factor for nanosecond to hpet tick |
584 | * conversion. | 586 | * conversion. |
585 | */ | 587 | */ |
586 | hpet_freq = 1000000000000000ULL; | 588 | hpet_freq = FSEC_PER_SEC; |
587 | do_div(hpet_freq, hpet_period); | 589 | do_div(hpet_freq, hpet_period); |
588 | evt->mult = div_sc((unsigned long) hpet_freq, | 590 | evt->mult = div_sc((unsigned long) hpet_freq, |
589 | NSEC_PER_SEC, evt->shift); | 591 | NSEC_PER_SEC, evt->shift); |
@@ -787,7 +789,6 @@ static struct clocksource clocksource_hpet = { | |||
787 | .rating = 250, | 789 | .rating = 250, |
788 | .read = read_hpet, | 790 | .read = read_hpet, |
789 | .mask = HPET_MASK, | 791 | .mask = HPET_MASK, |
790 | .shift = HPET_SHIFT, | ||
791 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 792 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
792 | .resume = hpet_resume_counter, | 793 | .resume = hpet_resume_counter, |
793 | #ifdef CONFIG_X86_64 | 794 | #ifdef CONFIG_X86_64 |
@@ -798,6 +799,7 @@ static struct clocksource clocksource_hpet = { | |||
798 | static int hpet_clocksource_register(void) | 799 | static int hpet_clocksource_register(void) |
799 | { | 800 | { |
800 | u64 start, now; | 801 | u64 start, now; |
802 | u64 hpet_freq; | ||
801 | cycle_t t1; | 803 | cycle_t t1; |
802 | 804 | ||
803 | /* Start the counter */ | 805 | /* Start the counter */ |
@@ -832,9 +834,15 @@ static int hpet_clocksource_register(void) | |||
832 | * mult = (hpet_period * 2^shift)/10^6 | 834 | * mult = (hpet_period * 2^shift)/10^6 |
833 | * mult = (hpet_period << shift)/FSEC_PER_NSEC | 835 | * mult = (hpet_period << shift)/FSEC_PER_NSEC |
834 | */ | 836 | */ |
835 | clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); | ||
836 | 837 | ||
837 | clocksource_register(&clocksource_hpet); | 838 | /* Need to convert hpet_period (fsec/cyc) to cyc/sec: |
839 | * | ||
840 | * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) | ||
841 | * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period | ||
842 | */ | ||
843 | hpet_freq = FSEC_PER_SEC; | ||
844 | do_div(hpet_freq, hpet_period); | ||
845 | clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); | ||
838 | 846 | ||
839 | return 0; | 847 | return 0; |
840 | } | 848 | } |
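A worked example of the new frequency computation, using the common 14.31818 MHz HPET:

/*
 * a typical hpet_period is 69841279 fsec per cycle, so
 *
 *	hpet_freq = FSEC_PER_SEC / hpet_period
 *	          = 10^15 / 69841279
 *	         ~= 14318180 Hz
 *
 * and clocksource_register_hz() derives mult/shift from the
 * frequency itself, which is why the fixed HPET_SHIFT and the
 * hand-rolled div_sc() computation could be dropped.
 */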
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a8f1b803d2fd..ff15c9dcc25d 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c | |||
@@ -206,6 +206,25 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) | |||
206 | int arch_bp_generic_fields(int x86_len, int x86_type, | 206 | int arch_bp_generic_fields(int x86_len, int x86_type, |
207 | int *gen_len, int *gen_type) | 207 | int *gen_len, int *gen_type) |
208 | { | 208 | { |
209 | /* Type */ | ||
210 | switch (x86_type) { | ||
211 | case X86_BREAKPOINT_EXECUTE: | ||
212 | if (x86_len != X86_BREAKPOINT_LEN_X) | ||
213 | return -EINVAL; | ||
214 | |||
215 | *gen_type = HW_BREAKPOINT_X; | ||
216 | *gen_len = sizeof(long); | ||
217 | return 0; | ||
218 | case X86_BREAKPOINT_WRITE: | ||
219 | *gen_type = HW_BREAKPOINT_W; | ||
220 | break; | ||
221 | case X86_BREAKPOINT_RW: | ||
222 | *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; | ||
223 | break; | ||
224 | default: | ||
225 | return -EINVAL; | ||
226 | } | ||
227 | |||
209 | /* Len */ | 228 | /* Len */ |
210 | switch (x86_len) { | 229 | switch (x86_len) { |
211 | case X86_BREAKPOINT_LEN_1: | 230 | case X86_BREAKPOINT_LEN_1: |
@@ -226,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type, | |||
226 | return -EINVAL; | 245 | return -EINVAL; |
227 | } | 246 | } |
228 | 247 | ||
229 | /* Type */ | ||
230 | switch (x86_type) { | ||
231 | case X86_BREAKPOINT_EXECUTE: | ||
232 | *gen_type = HW_BREAKPOINT_X; | ||
233 | break; | ||
234 | case X86_BREAKPOINT_WRITE: | ||
235 | *gen_type = HW_BREAKPOINT_W; | ||
236 | break; | ||
237 | case X86_BREAKPOINT_RW: | ||
238 | *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; | ||
239 | break; | ||
240 | default: | ||
241 | return -EINVAL; | ||
242 | } | ||
243 | |||
244 | return 0; | 248 | return 0; |
245 | } | 249 | } |
246 | 250 | ||
@@ -251,6 +255,29 @@ static int arch_build_bp_info(struct perf_event *bp) | |||
251 | 255 | ||
252 | info->address = bp->attr.bp_addr; | 256 | info->address = bp->attr.bp_addr; |
253 | 257 | ||
258 | /* Type */ | ||
259 | switch (bp->attr.bp_type) { | ||
260 | case HW_BREAKPOINT_W: | ||
261 | info->type = X86_BREAKPOINT_WRITE; | ||
262 | break; | ||
263 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
264 | info->type = X86_BREAKPOINT_RW; | ||
265 | break; | ||
266 | case HW_BREAKPOINT_X: | ||
267 | info->type = X86_BREAKPOINT_EXECUTE; | ||
268 | /* | ||
269 | * x86 instruction breakpoints need a specific, pseudo length. | ||
270 | * But we still need to check that userspace is not trying to | ||
271 | * set up an unsupported length, e.g. to get a range breakpoint. | ||
272 | */ | ||
273 | if (bp->attr.bp_len == sizeof(long)) { | ||
274 | info->len = X86_BREAKPOINT_LEN_X; | ||
275 | return 0; | ||
276 | } | ||
277 | default: | ||
278 | return -EINVAL; | ||
279 | } | ||
280 | |||
254 | /* Len */ | 281 | /* Len */ |
255 | switch (bp->attr.bp_len) { | 282 | switch (bp->attr.bp_len) { |
256 | case HW_BREAKPOINT_LEN_1: | 283 | case HW_BREAKPOINT_LEN_1: |
@@ -271,21 +298,6 @@ static int arch_build_bp_info(struct perf_event *bp) | |||
271 | return -EINVAL; | 298 | return -EINVAL; |
272 | } | 299 | } |
273 | 300 | ||
274 | /* Type */ | ||
275 | switch (bp->attr.bp_type) { | ||
276 | case HW_BREAKPOINT_W: | ||
277 | info->type = X86_BREAKPOINT_WRITE; | ||
278 | break; | ||
279 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
280 | info->type = X86_BREAKPOINT_RW; | ||
281 | break; | ||
282 | case HW_BREAKPOINT_X: | ||
283 | info->type = X86_BREAKPOINT_EXECUTE; | ||
284 | break; | ||
285 | default: | ||
286 | return -EINVAL; | ||
287 | } | ||
288 | |||
289 | return 0; | 301 | return 0; |
290 | } | 302 | } |
291 | /* | 303 | /* |
@@ -466,6 +478,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) | |||
466 | 478 | ||
467 | perf_bp_event(bp, args->regs); | 479 | perf_bp_event(bp, args->regs); |
468 | 480 | ||
481 | /* | ||
482 | * Set up the resume flag to avoid breakpoint recursion when | ||
483 | * returning to the origin. | ||
484 | */ | ||
485 | if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) | ||
486 | args->regs->flags |= X86_EFLAGS_RF; | ||
487 | |||
469 | rcu_read_unlock(); | 488 | rcu_read_unlock(); |
470 | } | 489 | } |
471 | /* | 490 | /* |
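For execute breakpoints the net effect of the reordered checks is that exactly one length is accepted. A sketch of a conforming setup, assuming the hw_breakpoint_init() helper from <linux/hw_breakpoint.h>; the address is a hypothetical placeholder:

struct perf_event_attr attr;

hw_breakpoint_init(&attr);
attr.bp_addr = (unsigned long)some_text_address;	/* hypothetical */
attr.bp_type = HW_BREAKPOINT_X;
attr.bp_len  = sizeof(long);	/* mapped to X86_BREAKPOINT_LEN_X */

/*
 * any other bp_len now fails arch_build_bp_info() with -EINVAL,
 * so a "range" execute breakpoint can no longer sneak through
 */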
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index c4444bce8469..a46cb3522c0c 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -40,6 +40,7 @@ | |||
40 | 40 | ||
41 | static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; | 41 | static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; |
42 | unsigned int xstate_size; | 42 | unsigned int xstate_size; |
43 | EXPORT_SYMBOL_GPL(xstate_size); | ||
43 | unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); | 44 | unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); |
44 | static struct i387_fxsave_struct fx_scratch __cpuinitdata; | 45 | static struct i387_fxsave_struct fx_scratch __cpuinitdata; |
45 | 46 | ||
@@ -59,18 +60,18 @@ void __cpuinit mxcsr_feature_mask_init(void) | |||
59 | stts(); | 60 | stts(); |
60 | } | 61 | } |
61 | 62 | ||
62 | void __cpuinit init_thread_xstate(void) | 63 | static void __cpuinit init_thread_xstate(void) |
63 | { | 64 | { |
65 | /* | ||
66 | * Note that xstate_size might be overwritten later during | ||
67 | * xsave_init(). | ||
68 | */ | ||
69 | |||
64 | if (!HAVE_HWFP) { | 70 | if (!HAVE_HWFP) { |
65 | xstate_size = sizeof(struct i387_soft_struct); | 71 | xstate_size = sizeof(struct i387_soft_struct); |
66 | return; | 72 | return; |
67 | } | 73 | } |
68 | 74 | ||
69 | if (cpu_has_xsave) { | ||
70 | xsave_cntxt_init(); | ||
71 | return; | ||
72 | } | ||
73 | |||
74 | if (cpu_has_fxsr) | 75 | if (cpu_has_fxsr) |
75 | xstate_size = sizeof(struct i387_fxsave_struct); | 76 | xstate_size = sizeof(struct i387_fxsave_struct); |
76 | #ifdef CONFIG_X86_32 | 77 | #ifdef CONFIG_X86_32 |
@@ -84,6 +85,7 @@ void __cpuinit init_thread_xstate(void) | |||
84 | * Called at bootup to set up the initial FPU state that is later cloned | 85 | * Called at bootup to set up the initial FPU state that is later cloned |
85 | * into all processes. | 86 | * into all processes. |
86 | */ | 87 | */ |
88 | |||
87 | void __cpuinit fpu_init(void) | 89 | void __cpuinit fpu_init(void) |
88 | { | 90 | { |
89 | unsigned long oldcr0 = read_cr0(); | 91 | unsigned long oldcr0 = read_cr0(); |
@@ -93,19 +95,24 @@ void __cpuinit fpu_init(void) | |||
93 | 95 | ||
94 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ | 96 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ |
95 | 97 | ||
96 | /* | ||
97 | * Boot processor to setup the FP and extended state context info. | ||
98 | */ | ||
99 | if (!smp_processor_id()) | 98 | if (!smp_processor_id()) |
100 | init_thread_xstate(); | 99 | init_thread_xstate(); |
101 | xsave_init(); | ||
102 | 100 | ||
103 | mxcsr_feature_mask_init(); | 101 | mxcsr_feature_mask_init(); |
104 | /* clean state in init */ | 102 | /* clean state in init */ |
105 | current_thread_info()->status = 0; | 103 | current_thread_info()->status = 0; |
106 | clear_used_math(); | 104 | clear_used_math(); |
107 | } | 105 | } |
108 | #endif /* CONFIG_X86_64 */ | 106 | |
107 | #else /* CONFIG_X86_64 */ | ||
108 | |||
109 | void __cpuinit fpu_init(void) | ||
110 | { | ||
111 | if (!smp_processor_id()) | ||
112 | init_thread_xstate(); | ||
113 | } | ||
114 | |||
115 | #endif /* CONFIG_X86_32 */ | ||
109 | 116 | ||
110 | void fpu_finit(struct fpu *fpu) | 117 | void fpu_finit(struct fpu *fpu) |
111 | { | 118 | { |
@@ -191,6 +198,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, | |||
191 | if (ret) | 198 | if (ret) |
192 | return ret; | 199 | return ret; |
193 | 200 | ||
201 | sanitize_i387_state(target); | ||
202 | |||
194 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, | 203 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, |
195 | &target->thread.fpu.state->fxsave, 0, -1); | 204 | &target->thread.fpu.state->fxsave, 0, -1); |
196 | } | 205 | } |
@@ -208,6 +217,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
208 | if (ret) | 217 | if (ret) |
209 | return ret; | 218 | return ret; |
210 | 219 | ||
220 | sanitize_i387_state(target); | ||
221 | |||
211 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, | 222 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, |
212 | &target->thread.fpu.state->fxsave, 0, -1); | 223 | &target->thread.fpu.state->fxsave, 0, -1); |
213 | 224 | ||
@@ -447,6 +458,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, | |||
447 | -1); | 458 | -1); |
448 | } | 459 | } |
449 | 460 | ||
461 | sanitize_i387_state(target); | ||
462 | |||
450 | if (kbuf && pos == 0 && count == sizeof(env)) { | 463 | if (kbuf && pos == 0 && count == sizeof(env)) { |
451 | convert_from_fxsr(kbuf, target); | 464 | convert_from_fxsr(kbuf, target); |
452 | return 0; | 465 | return 0; |
@@ -468,6 +481,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
468 | if (ret) | 481 | if (ret) |
469 | return ret; | 482 | return ret; |
470 | 483 | ||
484 | sanitize_i387_state(target); | ||
485 | |||
471 | if (!HAVE_HWFP) | 486 | if (!HAVE_HWFP) |
472 | return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); | 487 | return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); |
473 | 488 | ||
@@ -534,6 +549,9 @@ static int save_i387_xsave(void __user *buf) | |||
534 | struct _fpstate_ia32 __user *fx = buf; | 549 | struct _fpstate_ia32 __user *fx = buf; |
535 | int err = 0; | 550 | int err = 0; |
536 | 551 | ||
552 | |||
553 | sanitize_i387_state(tsk); | ||
554 | |||
537 | /* | 555 | /* |
538 | * For legacy compatibility, we always set FP/SSE bits in the bit | 556 |
539 | * vector while saving the state to the user context. | 557 | * vector while saving the state to the user context. |
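The sanitize_i387_state() calls added throughout this file share one purpose: with the init optimization of xsave, the CPU may skip writing state components that are still in their init configuration, so the in-memory image has to be fixed up before it is copied to or from user-visible buffers. Schematically, under the xstate_bv semantics of XSAVE:

/*
 * if a component's bit is clear in xsave_hdr.xstate_bv, the
 * memory image for that component may be stale; the sanitize
 * step rewrites those areas with the init values so regset and
 * signal code can copy the buffer verbatim.
 */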
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 01ab17ae2ae7..852b81967a37 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -49,55 +49,94 @@ | |||
49 | #include <asm/system.h> | 49 | #include <asm/system.h> |
50 | #include <asm/apic.h> | 50 | #include <asm/apic.h> |
51 | 51 | ||
52 | /** | 52 | struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = |
53 | * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs | ||
54 | * @gdb_regs: A pointer to hold the registers in the order GDB wants. | ||
55 | * @regs: The &struct pt_regs of the current process. | ||
56 | * | ||
57 | * Convert the pt_regs in @regs into the format for registers that | ||
58 | * GDB expects, stored in @gdb_regs. | ||
59 | */ | ||
60 | void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
61 | { | 53 | { |
62 | #ifndef CONFIG_X86_32 | 54 | #ifdef CONFIG_X86_32 |
63 | u32 *gdb_regs32 = (u32 *)gdb_regs; | 55 | { "ax", 4, offsetof(struct pt_regs, ax) }, |
56 | { "cx", 4, offsetof(struct pt_regs, cx) }, | ||
57 | { "dx", 4, offsetof(struct pt_regs, dx) }, | ||
58 | { "bx", 4, offsetof(struct pt_regs, bx) }, | ||
59 | { "sp", 4, offsetof(struct pt_regs, sp) }, | ||
60 | { "bp", 4, offsetof(struct pt_regs, bp) }, | ||
61 | { "si", 4, offsetof(struct pt_regs, si) }, | ||
62 | { "di", 4, offsetof(struct pt_regs, di) }, | ||
63 | { "ip", 4, offsetof(struct pt_regs, ip) }, | ||
64 | { "flags", 4, offsetof(struct pt_regs, flags) }, | ||
65 | { "cs", 4, offsetof(struct pt_regs, cs) }, | ||
66 | { "ss", 4, offsetof(struct pt_regs, ss) }, | ||
67 | { "ds", 4, offsetof(struct pt_regs, ds) }, | ||
68 | { "es", 4, offsetof(struct pt_regs, es) }, | ||
69 | { "fs", 4, -1 }, | ||
70 | { "gs", 4, -1 }, | ||
71 | #else | ||
72 | { "ax", 8, offsetof(struct pt_regs, ax) }, | ||
73 | { "bx", 8, offsetof(struct pt_regs, bx) }, | ||
74 | { "cx", 8, offsetof(struct pt_regs, cx) }, | ||
75 | { "dx", 8, offsetof(struct pt_regs, dx) }, | ||
76 | { "si", 8, offsetof(struct pt_regs, dx) }, | ||
77 | { "di", 8, offsetof(struct pt_regs, di) }, | ||
78 | { "bp", 8, offsetof(struct pt_regs, bp) }, | ||
79 | { "sp", 8, offsetof(struct pt_regs, sp) }, | ||
80 | { "r8", 8, offsetof(struct pt_regs, r8) }, | ||
81 | { "r9", 8, offsetof(struct pt_regs, r9) }, | ||
82 | { "r10", 8, offsetof(struct pt_regs, r10) }, | ||
83 | { "r11", 8, offsetof(struct pt_regs, r11) }, | ||
84 | { "r12", 8, offsetof(struct pt_regs, r12) }, | ||
85 | { "r13", 8, offsetof(struct pt_regs, r13) }, | ||
86 | { "r14", 8, offsetof(struct pt_regs, r14) }, | ||
87 | { "r15", 8, offsetof(struct pt_regs, r15) }, | ||
88 | { "ip", 8, offsetof(struct pt_regs, ip) }, | ||
89 | { "flags", 4, offsetof(struct pt_regs, flags) }, | ||
90 | { "cs", 4, offsetof(struct pt_regs, cs) }, | ||
91 | { "ss", 4, offsetof(struct pt_regs, ss) }, | ||
64 | #endif | 92 | #endif |
65 | gdb_regs[GDB_AX] = regs->ax; | 93 | }; |
66 | gdb_regs[GDB_BX] = regs->bx; | 94 | |
67 | gdb_regs[GDB_CX] = regs->cx; | 95 | int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) |
68 | gdb_regs[GDB_DX] = regs->dx; | 96 | { |
69 | gdb_regs[GDB_SI] = regs->si; | 97 | if ( |
70 | gdb_regs[GDB_DI] = regs->di; | ||
71 | gdb_regs[GDB_BP] = regs->bp; | ||
72 | gdb_regs[GDB_PC] = regs->ip; | ||
73 | #ifdef CONFIG_X86_32 | 98 | #ifdef CONFIG_X86_32 |
74 | gdb_regs[GDB_PS] = regs->flags; | 99 | regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || |
75 | gdb_regs[GDB_DS] = regs->ds; | 100 | #endif |
76 | gdb_regs[GDB_ES] = regs->es; | 101 | regno == GDB_SP || regno == GDB_ORIG_AX) |
77 | gdb_regs[GDB_CS] = regs->cs; | 102 | return 0; |
78 | gdb_regs[GDB_FS] = 0xFFFF; | 103 | |
79 | gdb_regs[GDB_GS] = 0xFFFF; | 104 | if (dbg_reg_def[regno].offset != -1) |
80 | if (user_mode_vm(regs)) { | 105 | memcpy((void *)regs + dbg_reg_def[regno].offset, mem, |
81 | gdb_regs[GDB_SS] = regs->ss; | 106 | dbg_reg_def[regno].size); |
82 | gdb_regs[GDB_SP] = regs->sp; | 107 | return 0; |
83 | } else { | 108 | } |
84 | gdb_regs[GDB_SS] = __KERNEL_DS; | 109 | |
85 | gdb_regs[GDB_SP] = kernel_stack_pointer(regs); | 110 | char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) |
111 | { | ||
112 | if (regno == GDB_ORIG_AX) { | ||
113 | memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); | ||
114 | return "orig_ax"; | ||
86 | } | 115 | } |
87 | #else | 116 | if (regno >= DBG_MAX_REG_NUM || regno < 0) |
88 | gdb_regs[GDB_R8] = regs->r8; | 117 | return NULL; |
89 | gdb_regs[GDB_R9] = regs->r9; | 118 | |
90 | gdb_regs[GDB_R10] = regs->r10; | 119 | if (dbg_reg_def[regno].offset != -1) |
91 | gdb_regs[GDB_R11] = regs->r11; | 120 | memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, |
92 | gdb_regs[GDB_R12] = regs->r12; | 121 | dbg_reg_def[regno].size); |
93 | gdb_regs[GDB_R13] = regs->r13; | 122 | |
94 | gdb_regs[GDB_R14] = regs->r14; | 123 | switch (regno) { |
95 | gdb_regs[GDB_R15] = regs->r15; | 124 | #ifdef CONFIG_X86_32 |
96 | gdb_regs32[GDB_PS] = regs->flags; | 125 | case GDB_SS: |
97 | gdb_regs32[GDB_CS] = regs->cs; | 126 | if (!user_mode_vm(regs)) |
98 | gdb_regs32[GDB_SS] = regs->ss; | 127 | *(unsigned long *)mem = __KERNEL_DS; |
99 | gdb_regs[GDB_SP] = kernel_stack_pointer(regs); | 128 | break; |
129 | case GDB_SP: | ||
130 | if (!user_mode_vm(regs)) | ||
131 | *(unsigned long *)mem = kernel_stack_pointer(regs); | ||
132 | break; | ||
133 | case GDB_GS: | ||
134 | case GDB_FS: | ||
135 | *(unsigned long *)mem = 0xFFFF; | ||
136 | break; | ||
100 | #endif | 137 | #endif |
138 | } | ||
139 | return dbg_reg_def[regno].name; | ||
101 | } | 140 | } |
102 | 141 | ||
103 | /** | 142 | /** |
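With the hand-written pt_regs_to_gdb_regs()/gdb_regs_to_pt_regs() converters replaced by the dbg_reg_def[] table, per-register access becomes a memcpy() driven by size and offset. An illustrative caller, not taken from the patch:

/* fetch register 0 ("ax" on both x86-32 and x86-64) for gdb */
unsigned long val;
char *name = dbg_get_reg(0, &val, regs);	/* copies regs->ax */

/* write it back, e.g. after gdb modified the value */
dbg_set_reg(0, &val, regs);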
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) | |||
150 | gdb_regs[GDB_SP] = p->thread.sp; | 189 | gdb_regs[GDB_SP] = p->thread.sp; |
151 | } | 190 | } |
152 | 191 | ||
153 | /** | ||
154 | * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. | ||
155 | * @gdb_regs: A pointer to hold the registers we've received from GDB. | ||
156 | * @regs: A pointer to a &struct pt_regs to hold these values in. | ||
157 | * | ||
158 | * Convert the GDB regs in @gdb_regs into the pt_regs, and store them | ||
159 | * in @regs. | ||
160 | */ | ||
161 | void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
162 | { | ||
163 | #ifndef CONFIG_X86_32 | ||
164 | u32 *gdb_regs32 = (u32 *)gdb_regs; | ||
165 | #endif | ||
166 | regs->ax = gdb_regs[GDB_AX]; | ||
167 | regs->bx = gdb_regs[GDB_BX]; | ||
168 | regs->cx = gdb_regs[GDB_CX]; | ||
169 | regs->dx = gdb_regs[GDB_DX]; | ||
170 | regs->si = gdb_regs[GDB_SI]; | ||
171 | regs->di = gdb_regs[GDB_DI]; | ||
172 | regs->bp = gdb_regs[GDB_BP]; | ||
173 | regs->ip = gdb_regs[GDB_PC]; | ||
174 | #ifdef CONFIG_X86_32 | ||
175 | regs->flags = gdb_regs[GDB_PS]; | ||
176 | regs->ds = gdb_regs[GDB_DS]; | ||
177 | regs->es = gdb_regs[GDB_ES]; | ||
178 | regs->cs = gdb_regs[GDB_CS]; | ||
179 | #else | ||
180 | regs->r8 = gdb_regs[GDB_R8]; | ||
181 | regs->r9 = gdb_regs[GDB_R9]; | ||
182 | regs->r10 = gdb_regs[GDB_R10]; | ||
183 | regs->r11 = gdb_regs[GDB_R11]; | ||
184 | regs->r12 = gdb_regs[GDB_R12]; | ||
185 | regs->r13 = gdb_regs[GDB_R13]; | ||
186 | regs->r14 = gdb_regs[GDB_R14]; | ||
187 | regs->r15 = gdb_regs[GDB_R15]; | ||
188 | regs->flags = gdb_regs32[GDB_PS]; | ||
189 | regs->cs = gdb_regs32[GDB_CS]; | ||
190 | regs->ss = gdb_regs32[GDB_SS]; | ||
191 | #endif | ||
192 | } | ||
193 | |||
194 | static struct hw_breakpoint { | 192 | static struct hw_breakpoint { |
195 | unsigned enabled; | 193 | unsigned enabled; |
196 | unsigned long addr; | 194 | unsigned long addr; |
197 | int len; | 195 | int len; |
198 | int type; | 196 | int type; |
199 | struct perf_event **pev; | 197 | struct perf_event * __percpu *pev; |
200 | } breakinfo[4]; | 198 | } breakinfo[HBP_NUM]; |
201 | 199 | ||
202 | static unsigned long early_dr7; | 200 | static unsigned long early_dr7; |
203 | 201 | ||
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void) | |||
205 | { | 203 | { |
206 | int breakno; | 204 | int breakno; |
207 | 205 | ||
208 | for (breakno = 0; breakno < 4; breakno++) { | 206 | for (breakno = 0; breakno < HBP_NUM; breakno++) { |
209 | struct perf_event *bp; | 207 | struct perf_event *bp; |
210 | struct arch_hw_breakpoint *info; | 208 | struct arch_hw_breakpoint *info; |
211 | int val; | 209 | int val; |
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) | |||
292 | { | 290 | { |
293 | int i; | 291 | int i; |
294 | 292 | ||
295 | for (i = 0; i < 4; i++) | 293 | for (i = 0; i < HBP_NUM; i++) |
296 | if (breakinfo[i].addr == addr && breakinfo[i].enabled) | 294 | if (breakinfo[i].addr == addr && breakinfo[i].enabled) |
297 | break; | 295 | break; |
298 | if (i == 4) | 296 | if (i == HBP_NUM) |
299 | return -1; | 297 | return -1; |
300 | 298 | ||
301 | if (hw_break_release_slot(i)) { | 299 | if (hw_break_release_slot(i)) { |
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void) | |||
313 | int cpu = raw_smp_processor_id(); | 311 | int cpu = raw_smp_processor_id(); |
314 | struct perf_event *bp; | 312 | struct perf_event *bp; |
315 | 313 | ||
316 | for (i = 0; i < 4; i++) { | 314 | for (i = 0; i < HBP_NUM; i++) { |
317 | if (!breakinfo[i].enabled) | 315 | if (!breakinfo[i].enabled) |
318 | continue; | 316 | continue; |
319 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); | 317 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); |
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) | |||
333 | { | 331 | { |
334 | int i; | 332 | int i; |
335 | 333 | ||
336 | for (i = 0; i < 4; i++) | 334 | for (i = 0; i < HBP_NUM; i++) |
337 | if (!breakinfo[i].enabled) | 335 | if (!breakinfo[i].enabled) |
338 | break; | 336 | break; |
339 | if (i == 4) | 337 | if (i == HBP_NUM) |
340 | return -1; | 338 | return -1; |
341 | 339 | ||
342 | switch (bptype) { | 340 | switch (bptype) { |
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) | |||
397 | 395 | ||
398 | /* Disable hardware debugging while we are in kgdb: */ | 396 | /* Disable hardware debugging while we are in kgdb: */ |
399 | set_debugreg(0UL, 7); | 397 | set_debugreg(0UL, 7); |
400 | for (i = 0; i < 4; i++) { | 398 | for (i = 0; i < HBP_NUM; i++) { |
401 | if (!breakinfo[i].enabled) | 399 | if (!breakinfo[i].enabled) |
402 | continue; | 400 | continue; |
403 | if (dbg_is_early) { | 401 | if (dbg_is_early) { |
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
458 | { | 456 | { |
459 | unsigned long addr; | 457 | unsigned long addr; |
460 | char *ptr; | 458 | char *ptr; |
461 | int newPC; | ||
462 | 459 | ||
463 | switch (remcomInBuffer[0]) { | 460 | switch (remcomInBuffer[0]) { |
464 | case 'c': | 461 | case 'c': |
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
469 | linux_regs->ip = addr; | 466 | linux_regs->ip = addr; |
470 | case 'D': | 467 | case 'D': |
471 | case 'k': | 468 | case 'k': |
472 | newPC = linux_regs->ip; | ||
473 | |||
474 | /* clear the trace bit */ | 469 | /* clear the trace bit */ |
475 | linux_regs->flags &= ~X86_EFLAGS_TF; | 470 | linux_regs->flags &= ~X86_EFLAGS_TF; |
476 | atomic_set(&kgdb_cpu_doing_single_step, -1); | 471 | atomic_set(&kgdb_cpu_doing_single_step, -1); |
@@ -645,7 +640,7 @@ void kgdb_arch_late(void) | |||
645 | attr.bp_len = HW_BREAKPOINT_LEN_1; | 640 | attr.bp_len = HW_BREAKPOINT_LEN_1; |
646 | attr.bp_type = HW_BREAKPOINT_W; | 641 | attr.bp_type = HW_BREAKPOINT_W; |
647 | attr.disabled = 1; | 642 | attr.disabled = 1; |
648 | for (i = 0; i < 4; i++) { | 643 | for (i = 0; i < HBP_NUM; i++) { |
649 | if (breakinfo[i].pev) | 644 | if (breakinfo[i].pev) |
650 | continue; | 645 | continue; |
651 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); | 646 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); |
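
The kgdb rewrite above replaces the open-coded per-register copies with the generic dbg_reg_def[] table (one name/offset/size entry per register) and swaps the hard-coded '4' for HBP_NUM throughout the breakpoint code. A stand-alone model of that table-driven register access, with an invented three-field pt_regs (the real table and struct live in the kernel headers):

#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct fake_pt_regs {
	unsigned long ax, bx, ip;
};

struct dbg_reg_def_t {
	const char *name;
	int offset;	/* -1 would mean "not backed by pt_regs" */
	int size;
};

static const struct dbg_reg_def_t dbg_reg_def[] = {
	{ "ax", offsetof(struct fake_pt_regs, ax), sizeof(unsigned long) },
	{ "bx", offsetof(struct fake_pt_regs, bx), sizeof(unsigned long) },
	{ "ip", offsetof(struct fake_pt_regs, ip), sizeof(unsigned long) },
};

#define NREGS ((int)(sizeof(dbg_reg_def) / sizeof(dbg_reg_def[0])))

static const char *dbg_get_reg(int regno, void *mem, struct fake_pt_regs *regs)
{
	if (regno < 0 || regno >= NREGS)
		return NULL;
	if (dbg_reg_def[regno].offset != -1)
		memcpy(mem, (char *)regs + dbg_reg_def[regno].offset,
		       dbg_reg_def[regno].size);
	return dbg_reg_def[regno].name;
}

int main(void)
{
	struct fake_pt_regs regs = { .ax = 1, .bx = 2, .ip = 0xc0ffee };
	unsigned long val = 0;
	const char *name = dbg_get_reg(2, &val, &regs);

	printf("%s = %#lx\n", name, val);	/* ip = 0xc0ffee */
	return 0;
}
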
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 675879b65ce6..770ebfb349e9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) | |||
126 | } | 126 | } |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * Check for the REX prefix which can only exist on X86_64 | 129 | * Skip the prefixes of the instruction. |
130 | * X86_32 always returns 0 | ||
131 | */ | 130 | */ |
132 | static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) | 131 | static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) |
133 | { | 132 | { |
133 | insn_attr_t attr; | ||
134 | |||
135 | attr = inat_get_opcode_attribute((insn_byte_t)*insn); | ||
136 | while (inat_is_legacy_prefix(attr)) { | ||
137 | insn++; | ||
138 | attr = inat_get_opcode_attribute((insn_byte_t)*insn); | ||
139 | } | ||
134 | #ifdef CONFIG_X86_64 | 140 | #ifdef CONFIG_X86_64 |
135 | if ((*insn & 0xf0) == 0x40) | 141 | if (inat_is_rex_prefix(attr)) |
136 | return 1; | 142 | insn++; |
137 | #endif | 143 | #endif |
138 | return 0; | 144 | return insn; |
139 | } | 145 | } |
140 | 146 | ||
141 | /* | 147 | /* |
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) | |||
272 | */ | 278 | */ |
273 | static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | 279 | static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) |
274 | { | 280 | { |
281 | /* Skip prefixes */ | ||
282 | insn = skip_prefixes(insn); | ||
283 | |||
275 | switch (*insn) { | 284 | switch (*insn) { |
276 | case 0xfa: /* cli */ | 285 | case 0xfa: /* cli */ |
277 | case 0xfb: /* sti */ | 286 | case 0xfb: /* sti */ |
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | |||
280 | return 1; | 289 | return 1; |
281 | } | 290 | } |
282 | 291 | ||
283 | /* | ||
284 | * on X86_64, 0x40-0x4f are REX prefixes so we need to look | ||
285 | * at the next byte instead.. but of course not recurse infinitely | ||
286 | */ | ||
287 | if (is_REX_prefix(insn)) | ||
288 | return is_IF_modifier(++insn); | ||
289 | |||
290 | return 0; | 292 | return 0; |
291 | } | 293 | } |
292 | 294 | ||
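
skip_prefixes() above walks past instruction prefixes using the kernel's instruction-attribute tables (inat_*), so callers such as is_IF_modifier() examine the real opcode byte. A self-contained approximation, with a hand-rolled legacy-prefix test standing in for inat_is_legacy_prefix() (the kernel guards the REX step with CONFIG_X86_64):

#include <stdio.h>

static int is_legacy_prefix(unsigned char b)
{
	switch (b) {
	case 0x26: case 0x2e: case 0x36: case 0x3e:	/* segment overrides */
	case 0x64: case 0x65:
	case 0x66: case 0x67:				/* operand/address size */
	case 0xf0: case 0xf2: case 0xf3:		/* lock/rep */
		return 1;
	}
	return 0;
}

static const unsigned char *skip_prefixes(const unsigned char *insn)
{
	while (is_legacy_prefix(*insn))
		insn++;
	if ((*insn & 0xf0) == 0x40)	/* REX prefix, x86-64 only */
		insn++;
	return insn;
}

int main(void)
{
	/* lock rex.w cli -- contrived, but shows the walk */
	const unsigned char insn[] = { 0xf0, 0x48, 0xfa };

	printf("opcode byte: 0x%02x\n", *skip_prefixes(insn));	/* 0xfa */
	return 0;
}
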
@@ -707,6 +709,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
707 | struct hlist_node *node, *tmp; | 709 | struct hlist_node *node, *tmp; |
708 | unsigned long flags, orig_ret_address = 0; | 710 | unsigned long flags, orig_ret_address = 0; |
709 | unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; | 711 | unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; |
712 | kprobe_opcode_t *correct_ret_addr = NULL; | ||
710 | 713 | ||
711 | INIT_HLIST_HEAD(&empty_rp); | 714 | INIT_HLIST_HEAD(&empty_rp); |
712 | kretprobe_hash_lock(current, &head, &flags); | 715 | kretprobe_hash_lock(current, &head, &flags); |
@@ -738,14 +741,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
738 | /* another task is sharing our hash bucket */ | 741 | /* another task is sharing our hash bucket */ |
739 | continue; | 742 | continue; |
740 | 743 | ||
744 | orig_ret_address = (unsigned long)ri->ret_addr; | ||
745 | |||
746 | if (orig_ret_address != trampoline_address) | ||
747 | /* | ||
748 | * This is the real return address. Any other | ||
749 | * instances associated with this task are for | ||
750 | * other calls deeper on the call stack | ||
751 | */ | ||
752 | break; | ||
753 | } | ||
754 | |||
755 | kretprobe_assert(ri, orig_ret_address, trampoline_address); | ||
756 | |||
757 | correct_ret_addr = ri->ret_addr; | ||
758 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | ||
759 | if (ri->task != current) | ||
760 | /* another task is sharing our hash bucket */ | ||
761 | continue; | ||
762 | |||
763 | orig_ret_address = (unsigned long)ri->ret_addr; | ||
741 | if (ri->rp && ri->rp->handler) { | 764 | if (ri->rp && ri->rp->handler) { |
742 | __get_cpu_var(current_kprobe) = &ri->rp->kp; | 765 | __get_cpu_var(current_kprobe) = &ri->rp->kp; |
743 | get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; | 766 | get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; |
767 | ri->ret_addr = correct_ret_addr; | ||
744 | ri->rp->handler(ri, regs); | 768 | ri->rp->handler(ri, regs); |
745 | __get_cpu_var(current_kprobe) = NULL; | 769 | __get_cpu_var(current_kprobe) = NULL; |
746 | } | 770 | } |
747 | 771 | ||
748 | orig_ret_address = (unsigned long)ri->ret_addr; | ||
749 | recycle_rp_inst(ri, &empty_rp); | 772 | recycle_rp_inst(ri, &empty_rp); |
750 | 773 | ||
751 | if (orig_ret_address != trampoline_address) | 774 | if (orig_ret_address != trampoline_address) |
@@ -757,8 +780,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
757 | break; | 780 | break; |
758 | } | 781 | } |
759 | 782 | ||
760 | kretprobe_assert(ri, orig_ret_address, trampoline_address); | ||
761 | |||
762 | kretprobe_hash_unlock(current, &flags); | 783 | kretprobe_hash_unlock(current, &flags); |
763 | 784 | ||
764 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { | 785 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { |
@@ -803,9 +824,8 @@ static void __kprobes resume_execution(struct kprobe *p, | |||
803 | unsigned long orig_ip = (unsigned long)p->addr; | 824 | unsigned long orig_ip = (unsigned long)p->addr; |
804 | kprobe_opcode_t *insn = p->ainsn.insn; | 825 | kprobe_opcode_t *insn = p->ainsn.insn; |
805 | 826 | ||
806 | /*skip the REX prefix*/ | 827 | /* Skip prefixes */ |
807 | if (is_REX_prefix(insn)) | 828 | insn = skip_prefixes(insn); |
808 | insn++; | ||
809 | 829 | ||
810 | regs->flags &= ~X86_EFLAGS_TF; | 830 | regs->flags &= ~X86_EFLAGS_TF; |
811 | switch (*insn) { | 831 | switch (*insn) { |
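
The trampoline_handler() hunk above splits the walk over pending kretprobe instances into two passes: the first finds the correct (non-trampoline) return address, the second runs the handlers with ri->ret_addr pointing at that address rather than at kretprobe_trampoline. A minimal model of the ordering, with a plain array standing in for the per-task hash chain:

#include <stdio.h>

#define TRAMPOLINE 0xdeadbeefUL

struct instance { unsigned long ret_addr; };

int main(void)
{
	/* innermost instance first, as on the hash chain */
	struct instance ri[] = { { TRAMPOLINE }, { TRAMPOLINE }, { 0x401234 } };
	int i, n = 3;
	unsigned long correct = 0;

	/* pass 1: find the first real (non-trampoline) return address */
	for (i = 0; i < n; i++) {
		correct = ri[i].ret_addr;
		if (correct != TRAMPOLINE)
			break;
	}

	/* pass 2: run the handlers, each seeing the corrected address */
	for (i = 0; i < n; i++) {
		unsigned long orig = ri[i].ret_addr;

		ri[i].ret_addr = correct;
		printf("handler sees ret_addr=%#lx\n", ri[i].ret_addr);
		if (orig != TRAMPOLINE)
			break;	/* deeper instances belong to other calls */
	}
	return 0;
}
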
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e0bc186d7501..1c355c550960 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c | |||
@@ -239,11 +239,10 @@ int module_finalize(const Elf_Ehdr *hdr, | |||
239 | apply_paravirt(pseg, pseg + para->sh_size); | 239 | apply_paravirt(pseg, pseg + para->sh_size); |
240 | } | 240 | } |
241 | 241 | ||
242 | return module_bug_finalize(hdr, sechdrs, me); | 242 | return 0; |
243 | } | 243 | } |
244 | 244 | ||
245 | void module_arch_cleanup(struct module *mod) | 245 | void module_arch_cleanup(struct module *mod) |
246 | { | 246 | { |
247 | alternatives_smp_module_del(mod); | 247 | alternatives_smp_module_del(mod); |
248 | module_bug_cleanup(mod); | ||
249 | } | 248 | } |
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d86dbf7e54be..d7b6f7fb4fec 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) | |||
274 | 274 | ||
275 | void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } | 275 | void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } |
276 | 276 | ||
277 | static void __init smp_register_lapic_address(unsigned long address) | ||
278 | { | ||
279 | mp_lapic_addr = address; | ||
280 | |||
281 | set_fixmap_nocache(FIX_APIC_BASE, address); | ||
282 | if (boot_cpu_physical_apicid == -1U) { | ||
283 | boot_cpu_physical_apicid = read_apic_id(); | ||
284 | apic_version[boot_cpu_physical_apicid] = | ||
285 | GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
286 | } | ||
287 | } | ||
288 | |||
277 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | 289 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) |
278 | { | 290 | { |
279 | char str[16]; | 291 | char str[16]; |
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | |||
295 | if (early) | 307 | if (early) |
296 | return 1; | 308 | return 1; |
297 | 309 | ||
310 | /* Initialize the lapic mapping */ | ||
311 | if (!acpi_lapic) | ||
312 | smp_register_lapic_address(mpc->lapic); | ||
313 | |||
298 | if (mpc->oemptr) | 314 | if (mpc->oemptr) |
299 | x86_init.mpparse.smp_read_mpc_oem(mpc); | 315 | x86_init.mpparse.smp_read_mpc_oem(mpc); |
300 | 316 | ||
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 5915e0b33303..79ae68154e87 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c | |||
@@ -25,8 +25,34 @@ | |||
25 | #include <asm/i8259.h> | 25 | #include <asm/i8259.h> |
26 | #include <asm/apb_timer.h> | 26 | #include <asm/apb_timer.h> |
27 | 27 | ||
28 | /* | ||
29 | * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, | ||
30 | * cmdline option x86_mrst_timer can be used to override the configuration | ||
31 | * to prefer one or the other. | ||
32 | * at runtime, there are basically three timer configurations: | ||
33 | * 1. per cpu apbt clock only | ||
34 | * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only | ||
35 | * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. | ||
36 | * | ||
37 | * by default (without cmdline option), platform code first detects cpu type | ||
38 | * to see if we are on lincroft or penwell, then sets up the lapic or apbt | ||
39 | * clocks accordingly. | ||
40 | * i.e. by default, medfield uses configuration #2, moorestown uses #1. | ||
41 | * config #3 is supported but not recommended on medfield. | ||
42 | * | ||
43 | * rating and feature summary: | ||
44 | * lapic (with C3STOP) --------- 100 | ||
45 | * apbt (always-on) ------------ 110 | ||
46 | * lapic (always-on,ARAT) ------ 150 | ||
47 | */ | ||
48 | |||
49 | __cpuinitdata enum mrst_timer_options mrst_timer_options; | ||
50 | |||
28 | static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; | 51 | static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; |
29 | static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; | 52 | static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; |
53 | enum mrst_cpu_type __mrst_cpu_chip; | ||
54 | EXPORT_SYMBOL_GPL(__mrst_cpu_chip); | ||
55 | |||
30 | int sfi_mtimer_num; | 56 | int sfi_mtimer_num; |
31 | 57 | ||
32 | struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; | 58 | struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; |
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) | |||
167 | return 0; | 193 | return 0; |
168 | } | 194 | } |
169 | 195 | ||
170 | /* | ||
171 | * the secondary clock in Moorestown can be APBT or LAPIC clock, default to | ||
172 | * APBT but cmdline option can also override it. | ||
173 | */ | ||
174 | static void __cpuinit mrst_setup_secondary_clock(void) | ||
175 | { | ||
176 | /* restore default lapic clock if disabled by cmdline */ | ||
177 | if (disable_apbt_percpu) | ||
178 | return setup_secondary_APIC_clock(); | ||
179 | apbt_setup_secondary_clock(); | ||
180 | } | ||
181 | |||
182 | static unsigned long __init mrst_calibrate_tsc(void) | 196 | static unsigned long __init mrst_calibrate_tsc(void) |
183 | { | 197 | { |
184 | unsigned long flags, fast_calibrate; | 198 | unsigned long flags, fast_calibrate; |
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void) | |||
195 | 209 | ||
196 | void __init mrst_time_init(void) | 210 | void __init mrst_time_init(void) |
197 | { | 211 | { |
212 | switch (mrst_timer_options) { | ||
213 | case MRST_TIMER_APBT_ONLY: | ||
214 | break; | ||
215 | case MRST_TIMER_LAPIC_APBT: | ||
216 | x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; | ||
217 | x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; | ||
218 | break; | ||
219 | default: | ||
220 | if (!boot_cpu_has(X86_FEATURE_ARAT)) | ||
221 | break; | ||
222 | x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; | ||
223 | x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; | ||
224 | return; | ||
225 | } | ||
226 | /* we need at least one APB timer */ | ||
198 | sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); | 227 | sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); |
199 | pre_init_apic_IRQ0(); | 228 | pre_init_apic_IRQ0(); |
200 | apbt_time_init(); | 229 | apbt_time_init(); |
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void) | |||
205 | sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); | 234 | sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); |
206 | } | 235 | } |
207 | 236 | ||
208 | /* | 237 | void __cpuinit mrst_arch_setup(void) |
209 | * if we use per cpu apb timer, the bootclock already setup. if we use lapic | ||
210 | * timer and one apbt timer for broadcast, we need to set up lapic boot clock. | ||
211 | */ | ||
212 | static void __init mrst_setup_boot_clock(void) | ||
213 | { | 238 | { |
214 | pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); | 239 | if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) |
215 | if (disable_apbt_percpu) | 240 | __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; |
216 | setup_boot_APIC_clock(); | 241 | else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) |
217 | }; | 242 | __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; |
243 | else { | ||
244 | pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", | ||
245 | boot_cpu_data.x86, boot_cpu_data.x86_model); | ||
246 | __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; | ||
247 | } | ||
248 | pr_debug("Moorestown CPU %s identified\n", | ||
249 | (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? | ||
250 | "Lincroft" : "Penwell"); | ||
251 | } | ||
218 | 252 | ||
219 | /* MID systems don't have i8042 controller */ | 253 | /* MID systems don't have i8042 controller */ |
220 | static int mrst_i8042_detect(void) | 254 | static int mrst_i8042_detect(void) |
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void) | |||
232 | x86_init.resources.reserve_resources = x86_init_noop; | 266 | x86_init.resources.reserve_resources = x86_init_noop; |
233 | 267 | ||
234 | x86_init.timers.timer_init = mrst_time_init; | 268 | x86_init.timers.timer_init = mrst_time_init; |
235 | x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; | 269 | x86_init.timers.setup_percpu_clockev = x86_init_noop; |
236 | 270 | ||
237 | x86_init.irqs.pre_vector_init = x86_init_noop; | 271 | x86_init.irqs.pre_vector_init = x86_init_noop; |
238 | 272 | ||
239 | x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; | 273 | x86_init.oem.arch_setup = mrst_arch_setup; |
274 | |||
275 | x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; | ||
240 | 276 | ||
241 | x86_platform.calibrate_tsc = mrst_calibrate_tsc; | 277 | x86_platform.calibrate_tsc = mrst_calibrate_tsc; |
242 | x86_platform.i8042_detect = mrst_i8042_detect; | 278 | x86_platform.i8042_detect = mrst_i8042_detect; |
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void) | |||
250 | x86_init.mpparse.get_smp_config = x86_init_uint_noop; | 286 | x86_init.mpparse.get_smp_config = x86_init_uint_noop; |
251 | 287 | ||
252 | } | 288 | } |
289 | |||
290 | /* | ||
291 | * if user does not want to use per CPU apb timer, just give it a lower rating | ||
292 | * than local apic timer and skip the late per cpu timer init. | ||
293 | */ | ||
294 | static inline int __init setup_x86_mrst_timer(char *arg) | ||
295 | { | ||
296 | if (!arg) | ||
297 | return -EINVAL; | ||
298 | |||
299 | if (strcmp("apbt_only", arg) == 0) | ||
300 | mrst_timer_options = MRST_TIMER_APBT_ONLY; | ||
301 | else if (strcmp("lapic_and_apbt", arg) == 0) | ||
302 | mrst_timer_options = MRST_TIMER_LAPIC_APBT; | ||
303 | else { | ||
304 | pr_warning("X86 MRST timer option %s not recognised;" | ||
305 | " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", | ||
306 | arg); | ||
307 | return -EINVAL; | ||
308 | } | ||
309 | return 0; | ||
310 | } | ||
311 | __setup("x86_mrst_timer=", setup_x86_mrst_timer); | ||
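
The __setup() handler above replaces the old disable_apbt_percpu logic with an explicit boot parameter; per its strcmp() checks, the only accepted forms on the kernel command line are:

    x86_mrst_timer=apbt_only        (configuration #1: per-cpu APB timers)
    x86_mrst_timer=lapic_and_apbt   (configuration #3: lapic clocks plus one
                                     apbt for broadcast)

Any other value is rejected with -EINVAL, leaving mrst_timer_options at its default so the CPU-type autodetect in mrst_time_init() decides.
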
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b3..0e0cdde519be 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c | |||
@@ -21,10 +21,7 @@ | |||
21 | #include <asm/geode.h> | 21 | #include <asm/geode.h> |
22 | #include <asm/setup.h> | 22 | #include <asm/setup.h> |
23 | #include <asm/olpc.h> | 23 | #include <asm/olpc.h> |
24 | 24 | #include <asm/olpc_ofw.h> | |
25 | #ifdef CONFIG_OPEN_FIRMWARE | ||
26 | #include <asm/ofw.h> | ||
27 | #endif | ||
28 | 25 | ||
29 | struct olpc_platform_t olpc_platform_info; | 26 | struct olpc_platform_t olpc_platform_info; |
30 | EXPORT_SYMBOL_GPL(olpc_platform_info); | 27 | EXPORT_SYMBOL_GPL(olpc_platform_info); |
@@ -145,7 +142,7 @@ restart: | |||
145 | * The OBF flag will sometimes misbehave due to what we believe | 142 | * The OBF flag will sometimes misbehave due to what we believe |
146 | * is a hardware quirk.. | 143 | * is a hardware quirk.. |
147 | */ | 144 | */ |
148 | printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); | 145 | pr_devel("olpc-ec: running cmd 0x%x\n", cmd); |
149 | outb(cmd, 0x6c); | 146 | outb(cmd, 0x6c); |
150 | 147 | ||
151 | if (wait_on_ibf(0x6c, 0)) { | 148 | if (wait_on_ibf(0x6c, 0)) { |
@@ -162,8 +159,7 @@ restart: | |||
162 | " EC accept data!\n"); | 159 | " EC accept data!\n"); |
163 | goto err; | 160 | goto err; |
164 | } | 161 | } |
165 | printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", | 162 | pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); |
166 | inbuf[i]); | ||
167 | outb(inbuf[i], 0x68); | 163 | outb(inbuf[i], 0x68); |
168 | } | 164 | } |
169 | } | 165 | } |
@@ -176,8 +172,7 @@ restart: | |||
176 | goto restart; | 172 | goto restart; |
177 | } | 173 | } |
178 | outbuf[i] = inb(0x68); | 174 | outbuf[i] = inb(0x68); |
179 | printk(KERN_DEBUG "olpc-ec: received 0x%x\n", | 175 | pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); |
180 | outbuf[i]); | ||
181 | } | 176 | } |
182 | } | 177 | } |
183 | 178 | ||
@@ -188,14 +183,15 @@ err: | |||
188 | } | 183 | } |
189 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); | 184 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); |
190 | 185 | ||
191 | #ifdef CONFIG_OPEN_FIRMWARE | 186 | #ifdef CONFIG_OLPC_OPENFIRMWARE |
192 | static void __init platform_detect(void) | 187 | static void __init platform_detect(void) |
193 | { | 188 | { |
194 | size_t propsize; | 189 | size_t propsize; |
195 | __be32 rev; | 190 | __be32 rev; |
191 | const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; | ||
192 | void *res[] = { &propsize }; | ||
196 | 193 | ||
197 | if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, | 194 | if (olpc_ofw("getprop", args, res) || propsize != 4) { |
198 | &propsize) || propsize != 4) { | ||
199 | printk(KERN_ERR "ofw: getprop call failed!\n"); | 195 | printk(KERN_ERR "ofw: getprop call failed!\n"); |
200 | rev = cpu_to_be32(0); | 196 | rev = cpu_to_be32(0); |
201 | } | 197 | } |
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 000000000000..3218aa71ab5e --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c | |||
@@ -0,0 +1,106 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <asm/page.h> | ||
5 | #include <asm/setup.h> | ||
6 | #include <asm/io.h> | ||
7 | #include <asm/pgtable.h> | ||
8 | #include <asm/olpc_ofw.h> | ||
9 | |||
10 | /* address of OFW callback interface; will be NULL if OFW isn't found */ | ||
11 | static int (*olpc_ofw_cif)(int *); | ||
12 | |||
13 | /* page dir entry containing OFW's pgdir table; filled in by head_32.S */ | ||
14 | u32 olpc_ofw_pgd __initdata; | ||
15 | |||
16 | static DEFINE_SPINLOCK(ofw_lock); | ||
17 | |||
18 | #define MAXARGS 10 | ||
19 | |||
20 | void __init setup_olpc_ofw_pgd(void) | ||
21 | { | ||
22 | pgd_t *base, *ofw_pde; | ||
23 | |||
24 | if (!olpc_ofw_cif) | ||
25 | return; | ||
26 | |||
27 | /* fetch OFW's PDE */ | ||
28 | base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); | ||
29 | if (!base) { | ||
30 | printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); | ||
31 | olpc_ofw_cif = NULL; | ||
32 | return; | ||
33 | } | ||
34 | ofw_pde = &base[OLPC_OFW_PDE_NR]; | ||
35 | |||
36 | /* install OFW's PDE permanently into the kernel's pgtable */ | ||
37 | set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); | ||
38 | /* implicit optimization barrier here due to the non-inline function return */ | ||
39 | |||
40 | early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); | ||
41 | } | ||
42 | |||
43 | int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, | ||
44 | void **res) | ||
45 | { | ||
46 | int ofw_args[MAXARGS + 3]; | ||
47 | unsigned long flags; | ||
48 | int ret, i, *p; | ||
49 | |||
50 | BUG_ON(nr_args + nr_res > MAXARGS); | ||
51 | |||
52 | if (!olpc_ofw_cif) | ||
53 | return -EIO; | ||
54 | |||
55 | ofw_args[0] = (int)name; | ||
56 | ofw_args[1] = nr_args; | ||
57 | ofw_args[2] = nr_res; | ||
58 | |||
59 | p = &ofw_args[3]; | ||
60 | for (i = 0; i < nr_args; i++, p++) | ||
61 | *p = (int)args[i]; | ||
62 | |||
63 | /* call into ofw */ | ||
64 | spin_lock_irqsave(&ofw_lock, flags); | ||
65 | ret = olpc_ofw_cif(ofw_args); | ||
66 | spin_unlock_irqrestore(&ofw_lock, flags); | ||
67 | |||
68 | if (!ret) { | ||
69 | for (i = 0; i < nr_res; i++, p++) | ||
70 | *((int *)res[i]) = *p; | ||
71 | } | ||
72 | |||
73 | return ret; | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(__olpc_ofw); | ||
76 | |||
77 | /* OFW cif _should_ be above this address */ | ||
78 | #define OFW_MIN 0xff000000 | ||
79 | |||
80 | /* OFW starts on a 1MB boundary */ | ||
81 | #define OFW_BOUND (1<<20) | ||
82 | |||
83 | void __init olpc_ofw_detect(void) | ||
84 | { | ||
85 | struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; | ||
86 | unsigned long start; | ||
87 | |||
88 | /* ensure OFW booted us by checking for "OFW " string */ | ||
89 | if (hdr->ofw_magic != OLPC_OFW_SIG) | ||
90 | return; | ||
91 | |||
92 | olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; | ||
93 | |||
94 | if ((unsigned long)olpc_ofw_cif < OFW_MIN) { | ||
95 | printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", | ||
96 | (unsigned long)olpc_ofw_cif); | ||
97 | olpc_ofw_cif = NULL; | ||
98 | return; | ||
99 | } | ||
100 | |||
101 | /* determine where OFW starts in memory */ | ||
102 | start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); | ||
103 | printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", | ||
104 | (unsigned long)olpc_ofw_cif, (-start) >> 20); | ||
105 | reserve_top_address(-start); | ||
106 | } | ||
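
__olpc_ofw() above marshals a firmware call into one flat int array: the service name, the argument and result counts, the arguments themselves, then slots the CIF fills in with results. A user-space model of the same marshalling, where long replaces the 32-bit kernel's int-sized pointer casts and fake_cif() stands in for the firmware entry point:

#include <stdio.h>

#define MAXARGS 10

static int fake_cif(long *argarray)
{
	/* pretend service: return nr_args in the first result slot */
	long nr_args = argarray[1];

	argarray[3 + nr_args] = nr_args;
	return 0;
}

static int model_ofw(const char *name, int nr_args, const long *args,
		     int nr_res, long *res)
{
	long buf[MAXARGS + 3];
	int i, ret;

	buf[0] = (long)name;
	buf[1] = nr_args;
	buf[2] = nr_res;
	for (i = 0; i < nr_args; i++)
		buf[3 + i] = args[i];

	ret = fake_cif(buf);
	if (!ret)	/* results follow the arguments in the same array */
		for (i = 0; i < nr_res; i++)
			res[i] = buf[3 + nr_args + i];
	return ret;
}

int main(void)
{
	long args[] = { 1, 2, 3 };
	long res[1];

	if (!model_ofw("count-args", 3, args, 1, res))
		printf("result: %ld\n", res[0]);	/* 3 */
	return 0;
}
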
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4b7e3d8b01dd..9f07cfcbd3a5 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <asm/calgary.h> | 13 | #include <asm/calgary.h> |
14 | #include <asm/amd_iommu.h> | 14 | #include <asm/amd_iommu.h> |
15 | #include <asm/x86_init.h> | 15 | #include <asm/x86_init.h> |
16 | #include <asm/xen/swiotlb-xen.h> | ||
16 | 17 | ||
17 | static int forbid_dac __read_mostly; | 18 | static int forbid_dac __read_mostly; |
18 | 19 | ||
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void) | |||
132 | /* free the range so iommu could get some range less than 4G */ | 133 | /* free the range so iommu could get some range less than 4G */ |
133 | dma32_free_bootmem(); | 134 | dma32_free_bootmem(); |
134 | 135 | ||
135 | if (pci_swiotlb_detect()) | 136 | if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) |
136 | goto out; | 137 | goto out; |
137 | 138 | ||
138 | gart_iommu_hole_init(); | 139 | gart_iommu_hole_init(); |
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void) | |||
144 | /* needs to be called after gart_iommu_hole_init */ | 145 | /* needs to be called after gart_iommu_hole_init */ |
145 | amd_iommu_detect(); | 146 | amd_iommu_detect(); |
146 | out: | 147 | out: |
148 | pci_xen_swiotlb_init(); | ||
149 | |||
147 | pci_swiotlb_init(); | 150 | pci_swiotlb_init(); |
148 | } | 151 | } |
149 | 152 | ||
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void) | |||
296 | #endif | 299 | #endif |
297 | x86_init.iommu.iommu_init(); | 300 | x86_init.iommu.iommu_init(); |
298 | 301 | ||
299 | if (swiotlb) { | 302 | if (swiotlb || xen_swiotlb) { |
300 | printk(KERN_INFO "PCI-DMA: " | 303 | printk(KERN_INFO "PCI-DMA: " |
301 | "Using software bounce buffering for IO (SWIOTLB)\n"); | 304 | "Using software bounce buffering for IO (SWIOTLB)\n"); |
302 | swiotlb_print_info(); | 305 | swiotlb_print_info(); |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cbcf013a0ec6..57d1868a86aa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -301,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread); | |||
301 | /* | 301 | /* |
302 | * sys_execve() executes a new program. | 302 | * sys_execve() executes a new program. |
303 | */ | 303 | */ |
304 | long sys_execve(char __user *name, char __user * __user *argv, | 304 | long sys_execve(const char __user *name, |
305 | char __user * __user *envp, struct pt_regs *regs) | 305 | const char __user *const __user *argv, |
306 | const char __user *const __user *envp, struct pt_regs *regs) | ||
306 | { | 307 | { |
307 | long error; | 308 | long error; |
308 | char *filename; | 309 | char *filename; |
@@ -526,44 +527,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) | |||
526 | return (edx & MWAIT_EDX_C1); | 527 | return (edx & MWAIT_EDX_C1); |
527 | } | 528 | } |
528 | 529 | ||
529 | /* | 530 | bool c1e_detected; |
530 | * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. | 531 | EXPORT_SYMBOL(c1e_detected); |
531 | * For more information see | ||
532 | * - Erratum #400 for NPT family 0xf and family 0x10 CPUs | ||
533 | * - Erratum #365 for family 0x11 (not affected because C1e not in use) | ||
534 | */ | ||
535 | static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) | ||
536 | { | ||
537 | u64 val; | ||
538 | if (c->x86_vendor != X86_VENDOR_AMD) | ||
539 | goto no_c1e_idle; | ||
540 | |||
541 | /* Family 0x0f models < rev F do not have C1E */ | ||
542 | if (c->x86 == 0x0F && c->x86_model >= 0x40) | ||
543 | return 1; | ||
544 | |||
545 | if (c->x86 == 0x10) { | ||
546 | /* | ||
547 | * check OSVW bit for CPUs that are not affected | ||
548 | * by erratum #400 | ||
549 | */ | ||
550 | if (cpu_has(c, X86_FEATURE_OSVW)) { | ||
551 | rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); | ||
552 | if (val >= 2) { | ||
553 | rdmsrl(MSR_AMD64_OSVW_STATUS, val); | ||
554 | if (!(val & BIT(1))) | ||
555 | goto no_c1e_idle; | ||
556 | } | ||
557 | } | ||
558 | return 1; | ||
559 | } | ||
560 | |||
561 | no_c1e_idle: | ||
562 | return 0; | ||
563 | } | ||
564 | 532 | ||
565 | static cpumask_var_t c1e_mask; | 533 | static cpumask_var_t c1e_mask; |
566 | static int c1e_detected; | ||
567 | 534 | ||
568 | void c1e_remove_cpu(int cpu) | 535 | void c1e_remove_cpu(int cpu) |
569 | { | 536 | { |
@@ -585,12 +552,12 @@ static void c1e_idle(void) | |||
585 | u32 lo, hi; | 552 | u32 lo, hi; |
586 | 553 | ||
587 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); | 554 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); |
555 | |||
588 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { | 556 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { |
589 | c1e_detected = 1; | 557 | c1e_detected = true; |
590 | if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) | 558 | if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) |
591 | mark_tsc_unstable("TSC halt in AMD C1E"); | 559 | mark_tsc_unstable("TSC halt in AMD C1E"); |
592 | printk(KERN_INFO "System has AMD C1E enabled\n"); | 560 | printk(KERN_INFO "System has AMD C1E enabled\n"); |
593 | set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); | ||
594 | } | 561 | } |
595 | } | 562 | } |
596 | 563 | ||
@@ -639,7 +606,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | |||
639 | */ | 606 | */ |
640 | printk(KERN_INFO "using mwait in idle threads.\n"); | 607 | printk(KERN_INFO "using mwait in idle threads.\n"); |
641 | pm_idle = mwait_idle; | 608 | pm_idle = mwait_idle; |
642 | } else if (check_c1e_idle(c)) { | 609 | } else if (cpu_has_amd_erratum(amd_erratum_400)) { |
610 | /* E400: APIC timer interrupt does not wake up CPU from C1e */ | ||
643 | printk(KERN_INFO "using C1E aware idle routine\n"); | 611 | printk(KERN_INFO "using C1E aware idle routine\n"); |
644 | pm_idle = c1e_idle; | 612 | pm_idle = c1e_idle; |
645 | } else | 613 | } else |
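
check_c1e_idle() is gone in favor of the table-driven cpu_has_amd_erratum(amd_erratum_400) test. Roughly, an erratum becomes a list of (family, model/stepping range) entries matched against the running CPU; the encoding below is invented for illustration (the kernel packs each range into a single int, and the OSVW MSR check the old code did by hand is folded into the shared helper):

#include <stdio.h>
#include <stdbool.h>

struct model_range {
	int family;
	int model_first, model_last;
};

/* erratum #400: rough stand-in for the real amd_erratum_400 table */
static const struct model_range erratum_400[] = {
	{ 0x0f, 0x41, 0xff },	/* K8 rev F and later */
	{ 0x10, 0x00, 0xff },	/* all of family 0x10 */
};

static bool cpu_has_erratum(int family, int model,
			    const struct model_range *r, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		if (family == r[i].family &&
		    model >= r[i].model_first && model <= r[i].model_last)
			return true;
	return false;
}

int main(void)
{
	printf("fam 0x10 model 0x02: %d\n",
	       cpu_has_erratum(0x10, 0x02, erratum_400, 2));	/* 1 */
	printf("fam 0x0f model 0x20: %d\n",
	       cpu_has_erratum(0x0f, 0x20, erratum_400, 2));	/* 0 */
	return 0;
}
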
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af47..96586c3cbbbf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -57,6 +57,8 @@ | |||
57 | #include <asm/syscalls.h> | 57 | #include <asm/syscalls.h> |
58 | #include <asm/debugreg.h> | 58 | #include <asm/debugreg.h> |
59 | 59 | ||
60 | #include <trace/events/power.h> | ||
61 | |||
60 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 62 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
61 | 63 | ||
62 | /* | 64 | /* |
@@ -111,6 +113,8 @@ void cpu_idle(void) | |||
111 | stop_critical_timings(); | 113 | stop_critical_timings(); |
112 | pm_idle(); | 114 | pm_idle(); |
113 | start_critical_timings(); | 115 | start_critical_timings(); |
116 | |||
117 | trace_power_end(smp_processor_id()); | ||
114 | } | 118 | } |
115 | tick_nohz_restart_sched_tick(); | 119 | tick_nohz_restart_sched_tick(); |
116 | preempt_enable_no_resched(); | 120 | preempt_enable_no_resched(); |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c2422a99f1f..3d9ea531ddd1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -51,6 +51,8 @@ | |||
51 | #include <asm/syscalls.h> | 51 | #include <asm/syscalls.h> |
52 | #include <asm/debugreg.h> | 52 | #include <asm/debugreg.h> |
53 | 53 | ||
54 | #include <trace/events/power.h> | ||
55 | |||
54 | asmlinkage extern void ret_from_fork(void); | 56 | asmlinkage extern void ret_from_fork(void); |
55 | 57 | ||
56 | DEFINE_PER_CPU(unsigned long, old_rsp); | 58 | DEFINE_PER_CPU(unsigned long, old_rsp); |
@@ -138,6 +140,9 @@ void cpu_idle(void) | |||
138 | stop_critical_timings(); | 140 | stop_critical_timings(); |
139 | pm_idle(); | 141 | pm_idle(); |
140 | start_critical_timings(); | 142 | start_critical_timings(); |
143 | |||
144 | trace_power_end(smp_processor_id()); | ||
145 | |||
141 | /* In many cases the interrupt that ended idle | 146 | /* In many cases the interrupt that ended idle |
142 | has already called exit_idle. But some idle | 147 | has already called exit_idle. But some idle |
143 | loops can be woken up without interrupt. */ | 148 | loops can be woken up without interrupt. */ |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd031..c3a4fbb2b996 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -102,6 +102,7 @@ | |||
102 | 102 | ||
103 | #include <asm/paravirt.h> | 103 | #include <asm/paravirt.h> |
104 | #include <asm/hypervisor.h> | 104 | #include <asm/hypervisor.h> |
105 | #include <asm/olpc_ofw.h> | ||
105 | 106 | ||
106 | #include <asm/percpu.h> | 107 | #include <asm/percpu.h> |
107 | #include <asm/topology.h> | 108 | #include <asm/topology.h> |
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p) | |||
736 | /* VMI may relocate the fixmap; do this before touching ioremap area */ | 737 | /* VMI may relocate the fixmap; do this before touching ioremap area */ |
737 | vmi_init(); | 738 | vmi_init(); |
738 | 739 | ||
740 | /* OFW also may relocate the fixmap */ | ||
741 | olpc_ofw_detect(); | ||
742 | |||
739 | early_trap_init(); | 743 | early_trap_init(); |
740 | early_cpu_init(); | 744 | early_cpu_init(); |
741 | early_ioremap_init(); | 745 | early_ioremap_init(); |
742 | 746 | ||
747 | setup_olpc_ofw_pgd(); | ||
748 | |||
743 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); | 749 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); |
744 | screen_info = boot_params.screen_info; | 750 | screen_info = boot_params.screen_info; |
745 | edid_info = boot_params.edid_info; | 751 | edid_info = boot_params.edid_info; |
@@ -1008,6 +1014,8 @@ void __init setup_arch(char **cmdline_p) | |||
1008 | paging_init(); | 1014 | paging_init(); |
1009 | x86_init.paging.pagetable_setup_done(swapper_pg_dir); | 1015 | x86_init.paging.pagetable_setup_done(swapper_pg_dir); |
1010 | 1016 | ||
1017 | setup_trampoline_page_table(); | ||
1018 | |||
1011 | tboot_probe(); | 1019 | tboot_probe(); |
1012 | 1020 | ||
1013 | #ifdef CONFIG_X86_64 | 1021 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d6..8b3bfc4dd708 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -73,7 +73,6 @@ | |||
73 | 73 | ||
74 | #ifdef CONFIG_X86_32 | 74 | #ifdef CONFIG_X86_32 |
75 | u8 apicid_2_node[MAX_APICID]; | 75 | u8 apicid_2_node[MAX_APICID]; |
76 | static int low_mappings; | ||
77 | #endif | 76 | #endif |
78 | 77 | ||
79 | /* State of each CPU */ | 78 | /* State of each CPU */ |
@@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; | |||
91 | static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); | 90 | static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); |
92 | #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) | 91 | #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) |
93 | #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) | 92 | #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) |
93 | |||
94 | /* | ||
95 | * We need this for trampoline_base protection from concurrent accesses when | ||
96 | * off- and onlining cores wildly. | ||
97 | */ | ||
98 | static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); | ||
99 | |||
100 | void cpu_hotplug_driver_lock(void) | ||
101 | { | ||
102 | mutex_lock(&x86_cpu_hotplug_driver_mutex); | ||
103 | } | ||
104 | |||
105 | void cpu_hotplug_driver_unlock() | ||
106 | { | ||
107 | mutex_unlock(&x86_cpu_hotplug_driver_mutex); | ||
108 | } | ||
109 | |||
110 | ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } | ||
111 | ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } | ||
94 | #else | 112 | #else |
95 | static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; | 113 | static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; |
96 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) | 114 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) |
@@ -281,6 +299,18 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
281 | * fragile that we want to limit the things done here to the | 299 | * fragile that we want to limit the things done here to the |
282 | * most necessary things. | 300 | * most necessary things. |
283 | */ | 301 | */ |
302 | |||
303 | #ifdef CONFIG_X86_32 | ||
304 | /* | ||
305 | * Switch away from the trampoline page-table | ||
306 | * | ||
307 | * Do this before cpu_init() because it needs to access per-cpu | ||
308 | * data which may not be mapped in the trampoline page-table. | ||
309 | */ | ||
310 | load_cr3(swapper_pg_dir); | ||
311 | __flush_tlb_all(); | ||
312 | #endif | ||
313 | |||
284 | vmi_bringup(); | 314 | vmi_bringup(); |
285 | cpu_init(); | 315 | cpu_init(); |
286 | preempt_disable(); | 316 | preempt_disable(); |
@@ -299,12 +329,6 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
299 | legacy_pic->chip->unmask(0); | 329 | legacy_pic->chip->unmask(0); |
300 | } | 330 | } |
301 | 331 | ||
302 | #ifdef CONFIG_X86_32 | ||
303 | while (low_mappings) | ||
304 | cpu_relax(); | ||
305 | __flush_tlb_all(); | ||
306 | #endif | ||
307 | |||
308 | /* This must be done before setting cpu_online_mask */ | 332 | /* This must be done before setting cpu_online_mask */ |
309 | set_cpu_sibling_map(raw_smp_processor_id()); | 333 | set_cpu_sibling_map(raw_smp_processor_id()); |
310 | wmb(); | 334 | wmb(); |
@@ -735,12 +759,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) | |||
735 | goto do_rest; | 759 | goto do_rest; |
736 | } | 760 | } |
737 | 761 | ||
738 | if (!keventd_up() || current_is_keventd()) | 762 | schedule_work(&c_idle.work); |
739 | c_idle.work.func(&c_idle.work); | 763 | wait_for_completion(&c_idle.done); |
740 | else { | ||
741 | schedule_work(&c_idle.work); | ||
742 | wait_for_completion(&c_idle.done); | ||
743 | } | ||
744 | 764 | ||
745 | if (IS_ERR(c_idle.idle)) { | 765 | if (IS_ERR(c_idle.idle)) { |
746 | printk("failed fork for CPU %d\n", cpu); | 766 | printk("failed fork for CPU %d\n", cpu); |
@@ -754,6 +774,7 @@ do_rest: | |||
754 | #ifdef CONFIG_X86_32 | 774 | #ifdef CONFIG_X86_32 |
755 | /* Stack for startup_32 can be just as for start_secondary onwards */ | 775 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
756 | irq_ctx_init(cpu); | 776 | irq_ctx_init(cpu); |
777 | initial_page_table = __pa(&trampoline_pg_dir); | ||
757 | #else | 778 | #else |
758 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); | 779 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); |
759 | initial_gs = per_cpu_offset(cpu); | 780 | initial_gs = per_cpu_offset(cpu); |
@@ -816,6 +837,13 @@ do_rest: | |||
816 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) | 837 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) |
817 | break; /* It has booted */ | 838 | break; /* It has booted */ |
818 | udelay(100); | 839 | udelay(100); |
840 | /* | ||
841 | * Allow other tasks to run while we wait for the | ||
842 | * AP to come online. This also gives a chance | ||
843 | * for the MTRR work (triggered by the AP coming online) | ||
844 | * to be completed in the stop machine context. | ||
845 | */ | ||
846 | schedule(); | ||
819 | } | 847 | } |
820 | 848 | ||
821 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) | 849 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) |
@@ -894,20 +922,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
894 | 922 | ||
895 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | 923 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; |
896 | 924 | ||
897 | #ifdef CONFIG_X86_32 | ||
898 | /* init low mem mapping */ | ||
899 | clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
900 | min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); | ||
901 | flush_tlb_all(); | ||
902 | low_mappings = 1; | ||
903 | |||
904 | err = do_boot_cpu(apicid, cpu); | 925 | err = do_boot_cpu(apicid, cpu); |
905 | 926 | ||
906 | zap_low_mappings(false); | ||
907 | low_mappings = 0; | ||
908 | #else | ||
909 | err = do_boot_cpu(apicid, cpu); | ||
910 | #endif | ||
911 | if (err) { | 927 | if (err) { |
912 | pr_debug("do_boot_cpu failed %d\n", err); | 928 | pr_debug("do_boot_cpu failed %d\n", err); |
913 | return -EIO; | 929 | return -EIO; |
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6c..b53c525368a7 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name) | |||
23 | return 0; | 23 | return 0; |
24 | } | 24 | } |
25 | 25 | ||
26 | static void save_stack_address(void *data, unsigned long addr, int reliable) | 26 | static void |
27 | __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) | ||
27 | { | 28 | { |
28 | struct stack_trace *trace = data; | 29 | struct stack_trace *trace = data; |
30 | #ifdef CONFIG_FRAME_POINTER | ||
29 | if (!reliable) | 31 | if (!reliable) |
30 | return; | 32 | return; |
33 | #endif | ||
34 | if (nosched && in_sched_functions(addr)) | ||
35 | return; | ||
31 | if (trace->skip > 0) { | 36 | if (trace->skip > 0) { |
32 | trace->skip--; | 37 | trace->skip--; |
33 | return; | 38 | return; |
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) | |||
36 | trace->entries[trace->nr_entries++] = addr; | 41 | trace->entries[trace->nr_entries++] = addr; |
37 | } | 42 | } |
38 | 43 | ||
44 | static void save_stack_address(void *data, unsigned long addr, int reliable) | ||
45 | { | ||
46 | return __save_stack_address(data, addr, reliable, false); | ||
47 | } | ||
48 | |||
39 | static void | 49 | static void |
40 | save_stack_address_nosched(void *data, unsigned long addr, int reliable) | 50 | save_stack_address_nosched(void *data, unsigned long addr, int reliable) |
41 | { | 51 | { |
42 | struct stack_trace *trace = (struct stack_trace *)data; | 52 | return __save_stack_address(data, addr, reliable, true); |
43 | if (!reliable) | ||
44 | return; | ||
45 | if (in_sched_functions(addr)) | ||
46 | return; | ||
47 | if (trace->skip > 0) { | ||
48 | trace->skip--; | ||
49 | return; | ||
50 | } | ||
51 | if (trace->nr_entries < trace->max_entries) | ||
52 | trace->entries[trace->nr_entries++] = addr; | ||
53 | } | 53 | } |
54 | 54 | ||
55 | static const struct stacktrace_ops save_stack_ops = { | 55 | static const struct stacktrace_ops save_stack_ops = { |
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); | |||
96 | 96 | ||
97 | /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ | 97 | /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ |
98 | 98 | ||
99 | struct stack_frame { | 99 | struct stack_frame_user { |
100 | const void __user *next_fp; | 100 | const void __user *next_fp; |
101 | unsigned long ret_addr; | 101 | unsigned long ret_addr; |
102 | }; | 102 | }; |
103 | 103 | ||
104 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | 104 | static int |
105 | copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) | ||
105 | { | 106 | { |
106 | int ret; | 107 | int ret; |
107 | 108 | ||
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) | |||
126 | trace->entries[trace->nr_entries++] = regs->ip; | 127 | trace->entries[trace->nr_entries++] = regs->ip; |
127 | 128 | ||
128 | while (trace->nr_entries < trace->max_entries) { | 129 | while (trace->nr_entries < trace->max_entries) { |
129 | struct stack_frame frame; | 130 | struct stack_frame_user frame; |
130 | 131 | ||
131 | frame.next_fp = NULL; | 132 | frame.next_fp = NULL; |
132 | frame.ret_addr = 0; | 133 | frame.ret_addr = 0; |
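
The stacktrace hunks fold two nearly identical callbacks into one __save_stack_address() helper selected by a bool. The same deduplication pattern in plain C, with a trivial predicate standing in for in_sched_functions():

#include <stdbool.h>
#include <stdio.h>

struct trace {
	unsigned long entries[8];
	int nr, max, skip;
};

/* in_sched_functions() stand-in: pretend odd addresses are scheduler code */
static bool fake_in_sched_functions(unsigned long addr)
{
	return addr & 1;
}

static void __save_address(struct trace *t, unsigned long addr, bool nosched)
{
	if (nosched && fake_in_sched_functions(addr))
		return;
	if (t->skip > 0) {
		t->skip--;
		return;
	}
	if (t->nr < t->max)
		t->entries[t->nr++] = addr;
}

static void save_address(struct trace *t, unsigned long addr)
{
	__save_address(t, addr, false);
}

static void save_address_nosched(struct trace *t, unsigned long addr)
{
	__save_address(t, addr, true);
}

int main(void)
{
	struct trace t = { .max = 8 };

	save_address_nosched(&t, 0x1001);	/* dropped: "scheduler" frame */
	save_address(&t, 0x2000);		/* kept */
	printf("%d entry, first %#lx\n", t.nr, t.entries[0]);
	return 0;
}
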
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 196552bb412c..d5e06624e34a 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c | |||
@@ -28,7 +28,9 @@ | |||
28 | * Do a system call from kernel instead of calling sys_execve so we | 28 | * Do a system call from kernel instead of calling sys_execve so we |
29 | * end up with proper pt_regs. | 29 | * end up with proper pt_regs. |
30 | */ | 30 | */ |
31 | int kernel_execve(const char *filename, char *const argv[], char *const envp[]) | 31 | int kernel_execve(const char *filename, |
32 | const char *const argv[], | ||
33 | const char *const envp[]) | ||
32 | { | 34 | { |
33 | long __res; | 35 | long __res; |
34 | asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" | 36 | asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" |
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..b35786dc9b8f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -337,3 +337,6 @@ ENTRY(sys_call_table) | |||
337 | .long sys_rt_tgsigqueueinfo /* 335 */ | 337 | .long sys_rt_tgsigqueueinfo /* 335 */ |
338 | .long sys_perf_event_open | 338 | .long sys_perf_event_open |
339 | .long sys_recvmmsg | 339 | .long sys_recvmmsg |
340 | .long sys_fanotify_init | ||
341 | .long sys_fanotify_mark | ||
342 | .long sys_prlimit64 /* 340 */ | ||
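
The three new entries extend the 32-bit syscall table through number 340. prlimit64, for example, is reachable from user space through glibc's prlimit(2) wrapper (glibc 2.13 and later); a quick read-only query:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit old;

	/* pid 0 == the calling process; NULL new limit == query only */
	if (prlimit(0, RLIMIT_NOFILE, NULL, &old) == 0)
		printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
		       (unsigned long long)old.rlim_cur,
		       (unsigned long long)old.rlim_max);
	return 0;
}
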
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 7fea555929e2..312ef0292815 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/seq_file.h> | 9 | #include <linux/seq_file.h> |
10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
11 | #include <linux/debugfs.h> | ||
11 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
12 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
13 | 14 | ||
@@ -22,19 +23,37 @@ | |||
22 | #include <asm/irq_vectors.h> | 23 | #include <asm/irq_vectors.h> |
23 | #include <asm/timer.h> | 24 | #include <asm/timer.h> |
24 | 25 | ||
25 | struct msg_desc { | 26 | /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ |
26 | struct bau_payload_queue_entry *msg; | 27 | static int timeout_base_ns[] = { |
27 | int msg_slot; | 28 | 20, |
28 | int sw_ack_slot; | 29 | 160, |
29 | struct bau_payload_queue_entry *va_queue_first; | 30 | 1280, |
30 | struct bau_payload_queue_entry *va_queue_last; | 31 | 10240, |
32 | 81920, | ||
33 | 655360, | ||
34 | 5242880, | ||
35 | 167772160 | ||
31 | }; | 36 | }; |
32 | 37 | static int timeout_us; | |
33 | #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL | ||
34 | |||
35 | static int uv_bau_max_concurrent __read_mostly; | ||
36 | |||
37 | static int nobau; | 38 | static int nobau; |
39 | static int baudisabled; | ||
40 | static spinlock_t disable_lock; | ||
41 | static cycles_t congested_cycles; | ||
42 | |||
43 | /* tunables: */ | ||
44 | static int max_bau_concurrent = MAX_BAU_CONCURRENT; | ||
45 | static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; | ||
46 | static int plugged_delay = PLUGGED_DELAY; | ||
47 | static int plugsb4reset = PLUGSB4RESET; | ||
48 | static int timeoutsb4reset = TIMEOUTSB4RESET; | ||
49 | static int ipi_reset_limit = IPI_RESET_LIMIT; | ||
50 | static int complete_threshold = COMPLETE_THRESHOLD; | ||
51 | static int congested_response_us = CONGESTED_RESPONSE_US; | ||
52 | static int congested_reps = CONGESTED_REPS; | ||
53 | static int congested_period = CONGESTED_PERIOD; | ||
54 | static struct dentry *tunables_dir; | ||
55 | static struct dentry *tunables_file; | ||
56 | |||
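
timeout_us above replaces the hard-coded BIOS_TO: it is derived from timeout_base_ns[], indexed by the 3-bit urgency field of UVH_AGING_PRESCALE_SEL. A sketch of just the table lookup, with the MMR read replaced by a constant (the real calculation in this patch also folds in a software-ack multiplier read from the hardware, omitted here):

#include <stdio.h>

static const long timeout_base_ns[] = {
	20, 160, 1280, 10240, 81920, 655360, 5242880, 167772160
};

int main(void)
{
	unsigned long mmr = 0x3UL << 28;	/* pretend MMR contents */
	int index = (mmr >> 28) & 0x7;
	long timeout_us = timeout_base_ns[index] / 1000;

	printf("index %d -> %ld ns (~%ld us)\n",
	       index, timeout_base_ns[index], timeout_us);
	return 0;
}
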
38 | static int __init setup_nobau(char *arg) | 57 | static int __init setup_nobau(char *arg) |
39 | { | 58 | { |
40 | nobau = 1; | 59 | nobau = 1; |
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); | |||
52 | static DEFINE_PER_CPU(struct bau_control, bau_control); | 71 | static DEFINE_PER_CPU(struct bau_control, bau_control); |
53 | static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); | 72 | static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); |
54 | 73 | ||
55 | struct reset_args { | ||
56 | int sender; | ||
57 | }; | ||
58 | |||
59 | /* | 74 | /* |
60 | * Determine the first node on a uvhub. 'Nodes' are used for kernel | 75 | * Determine the first node on a uvhub. 'Nodes' are used for kernel |
61 | * memory allocation. | 76 | * memory allocation. |
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, | |||
126 | struct ptc_stats *stat; | 141 | struct ptc_stats *stat; |
127 | 142 | ||
128 | msg = mdp->msg; | 143 | msg = mdp->msg; |
129 | stat = &per_cpu(ptcstats, bcp->cpu); | 144 | stat = bcp->statp; |
130 | stat->d_retries++; | 145 | stat->d_retries++; |
131 | /* | 146 | /* |
132 | * cancel any message from msg+1 to the retry itself | 147 | * cancel any message from msg+1 to the retry itself |
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, | |||
146 | slot2 = msg2 - mdp->va_queue_first; | 161 | slot2 = msg2 - mdp->va_queue_first; |
147 | mmr = uv_read_local_mmr | 162 | mmr = uv_read_local_mmr |
148 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); | 163 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); |
149 | msg_res = ((msg2->sw_ack_vector << 8) | | 164 | msg_res = msg2->sw_ack_vector; |
150 | msg2->sw_ack_vector); | ||
151 | /* | 165 | /* |
152 | * This is a message retry; clear the resources held | 166 | * This is a message retry; clear the resources held |
153 | * by the previous message only if they timed out. | 167 | * by the previous message only if they timed out. |
154 | * If it has not timed out we have an unexpected | 168 | * If it has not timed out we have an unexpected |
155 | * situation to report. | 169 | * situation to report. |
156 | */ | 170 | */ |
157 | if (mmr & (msg_res << 8)) { | 171 | if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { |
158 | /* | 172 | /* |
159 | * is the resource timed out? | 173 | * is the resource timed out? |
160 | * make everyone ignore the cancelled message. | 174 | * make everyone ignore the cancelled message. |
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, | |||
164 | cancel_count++; | 178 | cancel_count++; |
165 | uv_write_local_mmr( | 179 | uv_write_local_mmr( |
166 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, | 180 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, |
167 | (msg_res << 8) | msg_res); | 181 | (msg_res << UV_SW_ACK_NPENDING) | |
168 | } else | 182 | msg_res); |
169 | printk(KERN_INFO "note bau retry: no effect\n"); | 183 | } |
170 | } | 184 | } |
171 | } | 185 | } |
172 | if (!cancel_count) | 186 | if (!cancel_count) |
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp, | |||
190 | * This must be a normal message, or retry of a normal message | 204 | * This must be a normal message, or retry of a normal message |
191 | */ | 205 | */ |
192 | msg = mdp->msg; | 206 | msg = mdp->msg; |
193 | stat = &per_cpu(ptcstats, bcp->cpu); | 207 | stat = bcp->statp; |
194 | if (msg->address == TLB_FLUSH_ALL) { | 208 | if (msg->address == TLB_FLUSH_ALL) { |
195 | local_flush_tlb(); | 209 | local_flush_tlb(); |
196 | stat->d_alltlb++; | 210 | stat->d_alltlb++; |
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr) | |||
274 | 288 | ||
275 | bcp = &per_cpu(bau_control, smp_processor_id()); | 289 | bcp = &per_cpu(bau_control, smp_processor_id()); |
276 | rap = (struct reset_args *)ptr; | 290 | rap = (struct reset_args *)ptr; |
277 | stat = &per_cpu(ptcstats, bcp->cpu); | 291 | stat = bcp->statp; |
278 | stat->d_resets++; | 292 | stat->d_resets++; |
279 | 293 | ||
280 | /* | 294 | /* |
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr) | |||
302 | */ | 316 | */ |
303 | mmr = uv_read_local_mmr | 317 | mmr = uv_read_local_mmr |
304 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); | 318 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); |
305 | msg_res = ((msg->sw_ack_vector << 8) | | 319 | msg_res = msg->sw_ack_vector; |
306 | msg->sw_ack_vector); | ||
307 | if (mmr & msg_res) { | 320 | if (mmr & msg_res) { |
308 | stat->d_rcanceled++; | 321 | stat->d_rcanceled++; |
309 | uv_write_local_mmr( | 322 | uv_write_local_mmr( |
310 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, | 323 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, |
311 | msg_res); | 324 | (msg_res << UV_SW_ACK_NPENDING) | |
325 | msg_res); | ||
312 | } | 326 | } |
313 | } | 327 | } |
314 | } | 328 | } |
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc, | |||
386 | unsigned long mmr_offset, int right_shift, int this_cpu, | 400 | unsigned long mmr_offset, int right_shift, int this_cpu, |
387 | struct bau_control *bcp, struct bau_control *smaster, long try) | 401 | struct bau_control *bcp, struct bau_control *smaster, long try) |
388 | { | 402 | { |
389 | int relaxes = 0; | ||
390 | unsigned long descriptor_status; | 403 | unsigned long descriptor_status; |
391 | unsigned long mmr; | ||
392 | unsigned long mask; | ||
393 | cycles_t ttime; | 404 | cycles_t ttime; |
394 | cycles_t timeout_time; | 405 | struct ptc_stats *stat = bcp->statp; |
395 | struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); | ||
396 | struct bau_control *hmaster; | 406 | struct bau_control *hmaster; |
397 | 407 | ||
398 | hmaster = bcp->uvhub_master; | 408 | hmaster = bcp->uvhub_master; |
399 | timeout_time = get_cycles() + bcp->timeout_interval; | ||
400 | 409 | ||
401 | /* spin on the status MMR, waiting for it to go idle */ | 410 | /* spin on the status MMR, waiting for it to go idle */ |
402 | while ((descriptor_status = (((unsigned long) | 411 | while ((descriptor_status = (((unsigned long) |
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc, | |||
423 | * pending. In that case hardware returns the | 432 | * pending. In that case hardware returns the |
424 | * ERROR that looks like a destination timeout. | 433 | * ERROR that looks like a destination timeout. |
425 | */ | 434 | */ |
426 | if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { | 435 | if (cycles_2_us(ttime - bcp->send_message) < |
436 | timeout_us) { | ||
427 | bcp->conseccompletes = 0; | 437 | bcp->conseccompletes = 0; |
428 | return FLUSH_RETRY_PLUGGED; | 438 | return FLUSH_RETRY_PLUGGED; |
429 | } | 439 | } |
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc, | |||
435 | * descriptor_status is still BUSY | 445 | * descriptor_status is still BUSY |
436 | */ | 446 | */ |
437 | cpu_relax(); | 447 | cpu_relax(); |
438 | relaxes++; | ||
439 | if (relaxes >= 10000) { | ||
440 | relaxes = 0; | ||
441 | if (get_cycles() > timeout_time) { | ||
442 | quiesce_local_uvhub(hmaster); | ||
443 | |||
444 | /* single-thread the register change */ | ||
445 | spin_lock(&hmaster->masks_lock); | ||
446 | mmr = uv_read_local_mmr(mmr_offset); | ||
447 | mask = 0UL; | ||
448 | mask |= (3UL < right_shift); | ||
449 | mask = ~mask; | ||
450 | mmr &= mask; | ||
451 | uv_write_local_mmr(mmr_offset, mmr); | ||
452 | spin_unlock(&hmaster->masks_lock); | ||
453 | end_uvhub_quiesce(hmaster); | ||
454 | stat->s_busy++; | ||
455 | return FLUSH_GIVEUP; | ||
456 | } | ||
457 | } | ||
458 | } | 448 | } |
459 | } | 449 | } |
460 | bcp->conseccompletes++; | 450 | bcp->conseccompletes++; |
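[Editor's note] The hunk above also replaces the hard-coded BIOS_TO with the timeout_us value computed at init time: a destination ERROR that arrives before the destination's own timeout period could possibly have elapsed cannot be a genuine timeout, so it is treated as a plugged destination. A sketch of that discrimination, using only names visible in this diff:

	/* sketch: classify a destination ERROR reply; timeout_us is the
	 * destination timeout period derived from the BIOS MMRs below */
	static int classify_dest_error(struct bau_control *bcp)
	{
		cycles_t ttime = get_cycles();

		if (cycles_2_us(ttime - bcp->send_message) < timeout_us)
			return FLUSH_RETRY_PLUGGED;	/* resources busy */
		return FLUSH_RETRY_TIMEOUT;		/* real timeout */
	}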
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) | |||
494 | return 1; | 484 | return 1; |
495 | } | 485 | } |
496 | 486 | ||
487 | /* | ||
488 | * Our retries are blocked by all destination swack resources being | ||
489 | * in use, and a timeout is pending. In that case hardware immediately | ||
490 | * returns the ERROR that looks like a destination timeout. | ||
491 | */ | ||
492 | static void | ||
493 | destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, | ||
494 | struct bau_control *hmaster, struct ptc_stats *stat) | ||
495 | { | ||
496 | udelay(bcp->plugged_delay); | ||
497 | bcp->plugged_tries++; | ||
498 | if (bcp->plugged_tries >= bcp->plugsb4reset) { | ||
499 | bcp->plugged_tries = 0; | ||
500 | quiesce_local_uvhub(hmaster); | ||
501 | spin_lock(&hmaster->queue_lock); | ||
502 | uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); | ||
503 | spin_unlock(&hmaster->queue_lock); | ||
504 | end_uvhub_quiesce(hmaster); | ||
505 | bcp->ipi_attempts++; | ||
506 | stat->s_resets_plug++; | ||
507 | } | ||
508 | } | ||
509 | |||
510 | static void | ||
511 | destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, | ||
512 | struct bau_control *hmaster, struct ptc_stats *stat) | ||
513 | { | ||
514 | hmaster->max_bau_concurrent = 1; | ||
515 | bcp->timeout_tries++; | ||
516 | if (bcp->timeout_tries >= bcp->timeoutsb4reset) { | ||
517 | bcp->timeout_tries = 0; | ||
518 | quiesce_local_uvhub(hmaster); | ||
519 | spin_lock(&hmaster->queue_lock); | ||
520 | uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); | ||
521 | spin_unlock(&hmaster->queue_lock); | ||
522 | end_uvhub_quiesce(hmaster); | ||
523 | bcp->ipi_attempts++; | ||
524 | stat->s_resets_timeout++; | ||
525 | } | ||
526 | } | ||
527 | |||
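[Editor's note] destination_plugged() and destination_timeout() escalate identically once their retry budget is spent; only the trigger and the statistic differ. The shared sequence, factored into a hypothetical helper for illustration (the kernel code keeps it inline in each function):

	static void reset_with_ipi_quiesced(struct bau_desc *bau_desc,
			struct bau_control *bcp, struct bau_control *hmaster)
	{
		quiesce_local_uvhub(hmaster);	/* hold off other senders */
		spin_lock(&hmaster->queue_lock);
		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
		spin_unlock(&hmaster->queue_lock);
		end_uvhub_quiesce(hmaster);
		bcp->ipi_attempts++;
	}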
528 | /* | ||
529 | * Completions are taking a very long time due to a congested numalink | ||
530 | * network. | ||
531 | */ | ||
532 | static void | ||
533 | disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) | ||
534 | { | ||
535 | int tcpu; | ||
536 | struct bau_control *tbcp; | ||
537 | |||
538 | /* let only one cpu do this disabling */ | ||
539 | spin_lock(&disable_lock); | ||
540 | if (!baudisabled && bcp->period_requests && | ||
541 | ((bcp->period_time / bcp->period_requests) > congested_cycles)) { | ||
542 | /* it becomes this cpu's job to turn on the use of the | ||
543 | BAU again */ | ||
544 | baudisabled = 1; | ||
545 | bcp->set_bau_off = 1; | ||
546 | bcp->set_bau_on_time = get_cycles() + | ||
547 | sec_2_cycles(bcp->congested_period); | ||
548 | stat->s_bau_disabled++; | ||
549 | for_each_present_cpu(tcpu) { | ||
550 | tbcp = &per_cpu(bau_control, tcpu); | ||
551 | tbcp->baudisabled = 1; | ||
552 | } | ||
553 | } | ||
554 | spin_unlock(&disable_lock); | ||
555 | } | ||
556 | |||
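[Editor's note] The disabling predicate is an average, not one slow request: the BAU is shut off only when the mean cycles per broadcast over the current measurement period exceeds congested_cycles. A sketch of the test, showing the zero-divide guard that the request count provides:

	/* sketch: true when the running average response time over this
	 * period says the numalink network is congested */
	static int bau_period_congested(struct bau_control *bcp)
	{
		return bcp->period_requests &&
			(bcp->period_time / bcp->period_requests) >
				congested_cycles;
	}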
497 | /** | 557 | /** |
498 | * uv_flush_send_and_wait | 558 | * uv_flush_send_and_wait |
499 | * | 559 | * |
500 | * Send a broadcast and wait for it to complete. | 560 | * Send a broadcast and wait for it to complete. |
501 | * | 561 | * |
502 | * The flush_mask contains the cpus the broadcast is to be sent to, plus | 562 | * The flush_mask contains the cpus the broadcast is to be sent to including |
503 | * cpus that are on the local uvhub. | 563 | * cpus that are on the local uvhub. |
504 | * | 564 | * |
505 | * Returns NULL if all flushing represented in the mask was done. The mask | 565 | * Returns 0 if all flushing represented in the mask was done. |
506 | * is zeroed. | 566 | * Returns 1 if it gives up entirely and the original cpu mask is to be |
507 | * Returns @flush_mask if some remote flushing remains to be done. The | 567 | * returned to the kernel. |
508 | * mask will have some bits still set, representing any cpus on the local | ||
509 | * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. | ||
510 | */ | 568 | */ |
511 | const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | 569 | int uv_flush_send_and_wait(struct bau_desc *bau_desc, |
512 | struct cpumask *flush_mask, | 570 | struct cpumask *flush_mask, struct bau_control *bcp) |
513 | struct bau_control *bcp) | ||
514 | { | 571 | { |
515 | int right_shift; | 572 | int right_shift; |
516 | int uvhub; | ||
517 | int bit; | ||
518 | int completion_status = 0; | 573 | int completion_status = 0; |
519 | int seq_number = 0; | 574 | int seq_number = 0; |
520 | long try = 0; | 575 | long try = 0; |
521 | int cpu = bcp->uvhub_cpu; | 576 | int cpu = bcp->uvhub_cpu; |
522 | int this_cpu = bcp->cpu; | 577 | int this_cpu = bcp->cpu; |
523 | int this_uvhub = bcp->uvhub; | ||
524 | unsigned long mmr_offset; | 578 | unsigned long mmr_offset; |
525 | unsigned long index; | 579 | unsigned long index; |
526 | cycles_t time1; | 580 | cycles_t time1; |
527 | cycles_t time2; | 581 | cycles_t time2; |
528 | struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); | 582 | cycles_t elapsed; |
583 | struct ptc_stats *stat = bcp->statp; | ||
529 | struct bau_control *smaster = bcp->socket_master; | 584 | struct bau_control *smaster = bcp->socket_master; |
530 | struct bau_control *hmaster = bcp->uvhub_master; | 585 | struct bau_control *hmaster = bcp->uvhub_master; |
531 | 586 | ||
532 | /* | ||
533 | * Spin here while there are hmaster->max_concurrent or more active | ||
534 | * descriptors. This is the per-uvhub 'throttle'. | ||
535 | */ | ||
536 | if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, | 587 | if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, |
537 | &hmaster->active_descriptor_count, | 588 | &hmaster->active_descriptor_count, |
538 | hmaster->max_concurrent)) { | 589 | hmaster->max_bau_concurrent)) { |
539 | stat->s_throttles++; | 590 | stat->s_throttles++; |
540 | do { | 591 | do { |
541 | cpu_relax(); | 592 | cpu_relax(); |
542 | } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, | 593 | } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, |
543 | &hmaster->active_descriptor_count, | 594 | &hmaster->active_descriptor_count, |
544 | hmaster->max_concurrent)); | 595 | hmaster->max_bau_concurrent)); |
545 | } | 596 | } |
546 | |||
547 | while (hmaster->uvhub_quiesce) | 597 | while (hmaster->uvhub_quiesce) |
548 | cpu_relax(); | 598 | cpu_relax(); |
549 | 599 | ||
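[Editor's note] The entry throttle above admits a sender only while fewer than max_bau_concurrent descriptors are active on the uvhub; atomic_inc_unless_ge() takes the hub lock, increments unless the count has reached the limit, and reports whether it succeeded. Stripped of the statistics bookkeeping, the admission loop reduces to this sketch:

	/* sketch: spin until this cpu claims one of the
	 * max_bau_concurrent descriptor slots on its uvhub */
	static void bau_throttle_enter(struct bau_control *hmaster)
	{
		while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
				&hmaster->active_descriptor_count,
				hmaster->max_bau_concurrent))
			cpu_relax();
	}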
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | |||
557 | } | 607 | } |
558 | time1 = get_cycles(); | 608 | time1 = get_cycles(); |
559 | do { | 609 | do { |
560 | /* | ||
561 | * Every message from any given cpu gets a unique message | ||
562 | * sequence number. But retries use that same number. | ||
563 | * Our message may have timed out at the destination because | ||
564 | * all sw-ack resources are in use and there is a timeout | ||
565 | * pending there. In that case, our last send never got | ||
566 | * placed into the queue and we need to persist until it | ||
567 | * does. | ||
568 | * | ||
569 | * Make any retry a type MSG_RETRY so that the destination will | ||
570 | * free any resource held by a previous message from this cpu. | ||
571 | */ | ||
572 | if (try == 0) { | 610 | if (try == 0) { |
573 | /* use message type set by the caller the first time */ | 611 | bau_desc->header.msg_type = MSG_REGULAR; |
574 | seq_number = bcp->message_number++; | 612 | seq_number = bcp->message_number++; |
575 | } else { | 613 | } else { |
576 | /* use RETRY type on all the rest; same sequence */ | ||
577 | bau_desc->header.msg_type = MSG_RETRY; | 614 | bau_desc->header.msg_type = MSG_RETRY; |
578 | stat->s_retry_messages++; | 615 | stat->s_retry_messages++; |
579 | } | 616 | } |
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | |||
581 | index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | | 618 | index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | |
582 | bcp->uvhub_cpu; | 619 | bcp->uvhub_cpu; |
583 | bcp->send_message = get_cycles(); | 620 | bcp->send_message = get_cycles(); |
584 | |||
585 | uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); | 621 | uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); |
586 | |||
587 | try++; | 622 | try++; |
588 | completion_status = uv_wait_completion(bau_desc, mmr_offset, | 623 | completion_status = uv_wait_completion(bau_desc, mmr_offset, |
589 | right_shift, this_cpu, bcp, smaster, try); | 624 | right_shift, this_cpu, bcp, smaster, try); |
590 | 625 | ||
591 | if (completion_status == FLUSH_RETRY_PLUGGED) { | 626 | if (completion_status == FLUSH_RETRY_PLUGGED) { |
592 | /* | 627 | destination_plugged(bau_desc, bcp, hmaster, stat); |
593 | * Our retries may be blocked by all destination swack | ||
594 | * resources being consumed, and a timeout pending. In | ||
595 | * that case hardware immediately returns the ERROR | ||
596 | * that looks like a destination timeout. | ||
597 | */ | ||
598 | udelay(TIMEOUT_DELAY); | ||
599 | bcp->plugged_tries++; | ||
600 | if (bcp->plugged_tries >= PLUGSB4RESET) { | ||
601 | bcp->plugged_tries = 0; | ||
602 | quiesce_local_uvhub(hmaster); | ||
603 | spin_lock(&hmaster->queue_lock); | ||
604 | uv_reset_with_ipi(&bau_desc->distribution, | ||
605 | this_cpu); | ||
606 | spin_unlock(&hmaster->queue_lock); | ||
607 | end_uvhub_quiesce(hmaster); | ||
608 | bcp->ipi_attempts++; | ||
609 | stat->s_resets_plug++; | ||
610 | } | ||
611 | } else if (completion_status == FLUSH_RETRY_TIMEOUT) { | 628 | } else if (completion_status == FLUSH_RETRY_TIMEOUT) { |
612 | hmaster->max_concurrent = 1; | 629 | destination_timeout(bau_desc, bcp, hmaster, stat); |
613 | bcp->timeout_tries++; | ||
614 | udelay(TIMEOUT_DELAY); | ||
615 | if (bcp->timeout_tries >= TIMEOUTSB4RESET) { | ||
616 | bcp->timeout_tries = 0; | ||
617 | quiesce_local_uvhub(hmaster); | ||
618 | spin_lock(&hmaster->queue_lock); | ||
619 | uv_reset_with_ipi(&bau_desc->distribution, | ||
620 | this_cpu); | ||
621 | spin_unlock(&hmaster->queue_lock); | ||
622 | end_uvhub_quiesce(hmaster); | ||
623 | bcp->ipi_attempts++; | ||
624 | stat->s_resets_timeout++; | ||
625 | } | ||
626 | } | 630 | } |
627 | if (bcp->ipi_attempts >= 3) { | 631 | if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { |
628 | bcp->ipi_attempts = 0; | 632 | bcp->ipi_attempts = 0; |
629 | completion_status = FLUSH_GIVEUP; | 633 | completion_status = FLUSH_GIVEUP; |
630 | break; | 634 | break; |
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, | |||
633 | } while ((completion_status == FLUSH_RETRY_PLUGGED) || | 637 | } while ((completion_status == FLUSH_RETRY_PLUGGED) || |
634 | (completion_status == FLUSH_RETRY_TIMEOUT)); | 638 | (completion_status == FLUSH_RETRY_TIMEOUT)); |
635 | time2 = get_cycles(); | 639 | time2 = get_cycles(); |
636 | 640 | bcp->plugged_tries = 0; | |
637 | if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) | 641 | bcp->timeout_tries = 0; |
638 | && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) | 642 | if ((completion_status == FLUSH_COMPLETE) && |
639 | hmaster->max_concurrent++; | 643 | (bcp->conseccompletes > bcp->complete_threshold) && |
640 | 644 | (hmaster->max_bau_concurrent < | |
641 | /* | 645 | hmaster->max_bau_concurrent_constant)) |
642 | * hold any cpu not timing out here; no other cpu currently held by | 646 | hmaster->max_bau_concurrent++; |
643 | * the 'throttle' should enter the activation code | ||
644 | */ | ||
645 | while (hmaster->uvhub_quiesce) | 647 | while (hmaster->uvhub_quiesce) |
646 | cpu_relax(); | 648 | cpu_relax(); |
647 | atomic_dec(&hmaster->active_descriptor_count); | 649 | atomic_dec(&hmaster->active_descriptor_count); |
648 | 650 | if (time2 > time1) { | |
649 | /* guard against cycles wrap */ | 651 | elapsed = time2 - time1; |
650 | if (time2 > time1) | 652 | stat->s_time += elapsed; |
651 | stat->s_time += (time2 - time1); | 653 | if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { |
652 | else | 654 | bcp->period_requests++; |
653 | stat->s_requestor--; /* don't count this one */ | 655 | bcp->period_time += elapsed; |
656 | if ((elapsed > congested_cycles) && | ||
657 | (bcp->period_requests > bcp->congested_reps)) { | ||
658 | disable_for_congestion(bcp, stat); | ||
659 | } | ||
660 | } | ||
661 | } else | ||
662 | stat->s_requestor--; | ||
654 | if (completion_status == FLUSH_COMPLETE && try > 1) | 663 | if (completion_status == FLUSH_COMPLETE && try > 1) |
655 | stat->s_retriesok++; | 664 | stat->s_retriesok++; |
656 | else if (completion_status == FLUSH_GIVEUP) { | 665 | else if (completion_status == FLUSH_GIVEUP) { |
657 | /* | ||
658 | * Cause the caller to do an IPI-style TLB shootdown on | ||
659 | * the target cpu's, all of which are still in the mask. | ||
660 | */ | ||
661 | stat->s_giveup++; | 666 | stat->s_giveup++; |
662 | return flush_mask; | 667 | return 1; |
663 | } | ||
664 | |||
665 | /* | ||
666 | * Success, so clear the remote cpu's from the mask so we don't | ||
667 | * use the IPI method of shootdown on them. | ||
668 | */ | ||
669 | for_each_cpu(bit, flush_mask) { | ||
670 | uvhub = uv_cpu_to_blade_id(bit); | ||
671 | if (uvhub == this_uvhub) | ||
672 | continue; | ||
673 | cpumask_clear_cpu(bit, flush_mask); | ||
674 | } | 668 | } |
675 | if (!cpumask_empty(flush_mask)) | 669 | return 0; |
676 | return flush_mask; | ||
677 | |||
678 | return NULL; | ||
679 | } | 670 | } |
680 | 671 | ||
681 | /** | 672 | /** |
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
707 | struct mm_struct *mm, | 698 | struct mm_struct *mm, |
708 | unsigned long va, unsigned int cpu) | 699 | unsigned long va, unsigned int cpu) |
709 | { | 700 | { |
710 | int remotes; | ||
711 | int tcpu; | 701 | int tcpu; |
712 | int uvhub; | 702 | int uvhub; |
713 | int locals = 0; | 703 | int locals = 0; |
704 | int remotes = 0; | ||
705 | int hubs = 0; | ||
714 | struct bau_desc *bau_desc; | 706 | struct bau_desc *bau_desc; |
715 | struct cpumask *flush_mask; | 707 | struct cpumask *flush_mask; |
716 | struct ptc_stats *stat; | 708 | struct ptc_stats *stat; |
717 | struct bau_control *bcp; | 709 | struct bau_control *bcp; |
710 | struct bau_control *tbcp; | ||
718 | 711 | ||
712 | /* kernel was booted 'nobau' */ | ||
719 | if (nobau) | 713 | if (nobau) |
720 | return cpumask; | 714 | return cpumask; |
721 | 715 | ||
722 | bcp = &per_cpu(bau_control, cpu); | 716 | bcp = &per_cpu(bau_control, cpu); |
717 | stat = bcp->statp; | ||
718 | |||
719 | /* bau was disabled due to slow response */ | ||
720 | if (bcp->baudisabled) { | ||
721 | /* the cpu that disabled it must re-enable it */ | ||
722 | if (bcp->set_bau_off) { | ||
723 | if (get_cycles() >= bcp->set_bau_on_time) { | ||
724 | stat->s_bau_reenabled++; | ||
725 | baudisabled = 0; | ||
726 | for_each_present_cpu(tcpu) { | ||
727 | tbcp = &per_cpu(bau_control, tcpu); | ||
728 | tbcp->baudisabled = 0; | ||
729 | tbcp->period_requests = 0; | ||
730 | tbcp->period_time = 0; | ||
731 | } | ||
732 | } | ||
733 | } | ||
734 | return cpumask; | ||
735 | } | ||
736 | |||
723 | /* | 737 | /* |
724 | * Each sending cpu has a per-cpu mask which it fills from the caller's | 738 | * Each sending cpu has a per-cpu mask which it fills from the caller's |
725 | * cpu mask. Only remote cpus are converted to uvhubs and copied. | 739 | * cpu mask. All cpus are converted to uvhubs and copied to the |
740 | * activation descriptor. | ||
726 | */ | 741 | */ |
727 | flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); | 742 | flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); |
728 | /* | 743 | /* don't actually do a shootdown of the local cpu */ |
729 | * copy cpumask to flush_mask, removing current cpu | ||
730 | * (current cpu should already have been flushed by the caller and | ||
731 | * should never be returned if we return flush_mask) | ||
732 | */ | ||
733 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); | 744 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); |
734 | if (cpu_isset(cpu, *cpumask)) | 745 | if (cpu_isset(cpu, *cpumask)) |
735 | locals++; /* current cpu was targeted */ | 746 | stat->s_ntargself++; |
736 | 747 | ||
737 | bau_desc = bcp->descriptor_base; | 748 | bau_desc = bcp->descriptor_base; |
738 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; | 749 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; |
739 | |||
740 | bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); | 750 | bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); |
741 | remotes = 0; | 751 | |
752 | /* cpu statistics */ | ||
742 | for_each_cpu(tcpu, flush_mask) { | 753 | for_each_cpu(tcpu, flush_mask) { |
743 | uvhub = uv_cpu_to_blade_id(tcpu); | 754 | uvhub = uv_cpu_to_blade_id(tcpu); |
744 | if (uvhub == bcp->uvhub) { | ||
745 | locals++; | ||
746 | continue; | ||
747 | } | ||
748 | bau_uvhub_set(uvhub, &bau_desc->distribution); | 755 | bau_uvhub_set(uvhub, &bau_desc->distribution); |
749 | remotes++; | 756 | if (uvhub == bcp->uvhub) |
750 | } | 757 | locals++; |
751 | if (remotes == 0) { | ||
752 | /* | ||
753 | * No off_hub flushing; return status for local hub. | ||
754 | * Return the caller's mask if all were local (the current | ||
755 | * cpu may be in that mask). | ||
756 | */ | ||
757 | if (locals) | ||
758 | return cpumask; | ||
759 | else | 758 | else |
760 | return NULL; | 759 | remotes++; |
761 | } | 760 | } |
762 | stat = &per_cpu(ptcstats, cpu); | 761 | if ((locals + remotes) == 0) |
762 | return NULL; | ||
763 | stat->s_requestor++; | 763 | stat->s_requestor++; |
764 | stat->s_ntargcpu += remotes; | 764 | stat->s_ntargcpu += remotes + locals; |
765 | stat->s_ntargremotes += remotes; | ||
766 | stat->s_ntarglocals += locals; | ||
765 | remotes = bau_uvhub_weight(&bau_desc->distribution); | 767 | remotes = bau_uvhub_weight(&bau_desc->distribution); |
766 | stat->s_ntarguvhub += remotes; | 768 | |
767 | if (remotes >= 16) | 769 | /* uvhub statistics */ |
770 | hubs = bau_uvhub_weight(&bau_desc->distribution); | ||
771 | if (locals) { | ||
772 | stat->s_ntarglocaluvhub++; | ||
773 | stat->s_ntargremoteuvhub += (hubs - 1); | ||
774 | } else | ||
775 | stat->s_ntargremoteuvhub += hubs; | ||
776 | stat->s_ntarguvhub += hubs; | ||
777 | if (hubs >= 16) | ||
768 | stat->s_ntarguvhub16++; | 778 | stat->s_ntarguvhub16++; |
769 | else if (remotes >= 8) | 779 | else if (hubs >= 8) |
770 | stat->s_ntarguvhub8++; | 780 | stat->s_ntarguvhub8++; |
771 | else if (remotes >= 4) | 781 | else if (hubs >= 4) |
772 | stat->s_ntarguvhub4++; | 782 | stat->s_ntarguvhub4++; |
773 | else if (remotes >= 2) | 783 | else if (hubs >= 2) |
774 | stat->s_ntarguvhub2++; | 784 | stat->s_ntarguvhub2++; |
775 | else | 785 | else |
776 | stat->s_ntarguvhub1++; | 786 | stat->s_ntarguvhub1++; |
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
779 | bau_desc->payload.sending_cpu = cpu; | 789 | bau_desc->payload.sending_cpu = cpu; |
780 | 790 | ||
781 | /* | 791 | /* |
782 | * uv_flush_send_and_wait returns null if all cpu's were messaged, or | 792 | * uv_flush_send_and_wait returns 0 if all cpu's were messaged, |
783 | * the adjusted flush_mask if any cpu's were not messaged. | 793 | * or 1 if it gave up and the original cpumask should be returned. |
784 | */ | 794 | */ |
785 | return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); | 795 | if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) |
796 | return NULL; | ||
797 | else | ||
798 | return cpumask; | ||
786 | } | 799 | } |
787 | 800 | ||
788 | /* | 801 | /* |
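[Editor's note] With the new contract, uv_flush_tlb_others() hands back either NULL (every requested cpu was flushed by broadcast) or the caller's original mask (fall back to IPIs). A sketch of the caller side; flush_tlb_others_ipi() is named here only as the conventional IPI fallback, not taken from this diff:

	const struct cpumask *remaining;

	remaining = uv_flush_tlb_others(cpumask, mm, va, cpu);
	if (remaining)
		/* BAU disabled or gave up: shoot down via IPIs */
		flush_tlb_others_ipi(remaining, mm, va);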
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) | |||
810 | 823 | ||
811 | time_start = get_cycles(); | 824 | time_start = get_cycles(); |
812 | bcp = &per_cpu(bau_control, smp_processor_id()); | 825 | bcp = &per_cpu(bau_control, smp_processor_id()); |
813 | stat = &per_cpu(ptcstats, smp_processor_id()); | 826 | stat = bcp->statp; |
814 | msgdesc.va_queue_first = bcp->va_queue_first; | 827 | msgdesc.va_queue_first = bcp->va_queue_first; |
815 | msgdesc.va_queue_last = bcp->va_queue_last; | 828 | msgdesc.va_queue_last = bcp->va_queue_last; |
816 | msg = bcp->bau_msg_head; | 829 | msg = bcp->bau_msg_head; |
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) | |||
908 | } | 921 | } |
909 | 922 | ||
910 | static inline unsigned long long | 923 | static inline unsigned long long |
911 | millisec_2_cycles(unsigned long millisec) | 924 | microsec_2_cycles(unsigned long microsec) |
912 | { | 925 | { |
913 | unsigned long ns; | 926 | unsigned long ns; |
914 | unsigned long long cyc; | 927 | unsigned long long cyc; |
915 | 928 | ||
916 | ns = millisec * 1000; | 929 | ns = microsec * 1000; |
917 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); | 930 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); |
918 | return cyc; | 931 | return cyc; |
919 | } | 932 | } |
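[Editor's note] The renamed microsec_2_cycles() converts through nanoseconds using the per-cpu cyc2ns fixed-point scale. A worked example under the assumption that CYC2NS_SCALE_FACTOR is 10, as on x86: a 2 GHz cpu encodes 0.5 ns/cycle as cyc2ns = 512, so

	/* 2 us -> 2000 ns -> (2000 << 10) / 512 = 4000 cycles,
	 * i.e. 2 us worth of cycles at 2 GHz, as expected */
	unsigned long ns = 2 * 1000;
	unsigned long long cyc = (ns << 10) / 512;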
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
931 | 944 | ||
932 | if (!cpu) { | 945 | if (!cpu) { |
933 | seq_printf(file, | 946 | seq_printf(file, |
934 | "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); | 947 | "# cpu sent stime self locals remotes ncpus localhub "); |
948 | seq_printf(file, | ||
949 | "remotehub numuvhubs numuvhubs16 numuvhubs8 "); | ||
935 | seq_printf(file, | 950 | seq_printf(file, |
936 | "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); | 951 | "numuvhubs4 numuvhubs2 numuvhubs1 dto "); |
937 | seq_printf(file, | 952 | seq_printf(file, |
938 | "retries rok resetp resett giveup sto bz throt "); | 953 | "retries rok resetp resett giveup sto bz throt "); |
939 | seq_printf(file, | 954 | seq_printf(file, |
940 | "sw_ack recv rtime all "); | 955 | "sw_ack recv rtime all "); |
941 | seq_printf(file, | 956 | seq_printf(file, |
942 | "one mult none retry canc nocan reset rcan\n"); | 957 | "one mult none retry canc nocan reset rcan "); |
958 | seq_printf(file, | ||
959 | "disable enable\n"); | ||
943 | } | 960 | } |
944 | if (cpu < num_possible_cpus() && cpu_online(cpu)) { | 961 | if (cpu < num_possible_cpus() && cpu_online(cpu)) { |
945 | stat = &per_cpu(ptcstats, cpu); | 962 | stat = &per_cpu(ptcstats, cpu); |
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
947 | seq_printf(file, | 964 | seq_printf(file, |
948 | "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", | 965 | "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", |
949 | cpu, stat->s_requestor, cycles_2_us(stat->s_time), | 966 | cpu, stat->s_requestor, cycles_2_us(stat->s_time), |
950 | stat->s_ntarguvhub, stat->s_ntarguvhub16, | 967 | stat->s_ntargself, stat->s_ntarglocals, |
968 | stat->s_ntargremotes, stat->s_ntargcpu, | ||
969 | stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, | ||
970 | stat->s_ntarguvhub, stat->s_ntarguvhub16); | ||
971 | seq_printf(file, "%ld %ld %ld %ld %ld ", | ||
951 | stat->s_ntarguvhub8, stat->s_ntarguvhub4, | 972 | stat->s_ntarguvhub8, stat->s_ntarguvhub4, |
952 | stat->s_ntarguvhub2, stat->s_ntarguvhub1, | 973 | stat->s_ntarguvhub2, stat->s_ntarguvhub1, |
953 | stat->s_ntargcpu, stat->s_dtimeout); | 974 | stat->s_dtimeout); |
954 | seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", | 975 | seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", |
955 | stat->s_retry_messages, stat->s_retriesok, | 976 | stat->s_retry_messages, stat->s_retriesok, |
956 | stat->s_resets_plug, stat->s_resets_timeout, | 977 | stat->s_resets_plug, stat->s_resets_timeout, |
957 | stat->s_giveup, stat->s_stimeout, | 978 | stat->s_giveup, stat->s_stimeout, |
958 | stat->s_busy, stat->s_throttles); | 979 | stat->s_busy, stat->s_throttles); |
980 | |||
959 | /* destination side statistics */ | 981 | /* destination side statistics */ |
960 | seq_printf(file, | 982 | seq_printf(file, |
961 | "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", | 983 | "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", |
962 | uv_read_global_mmr64(uv_cpu_to_pnode(cpu), | 984 | uv_read_global_mmr64(uv_cpu_to_pnode(cpu), |
963 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), | 985 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), |
964 | stat->d_requestee, cycles_2_us(stat->d_time), | 986 | stat->d_requestee, cycles_2_us(stat->d_time), |
@@ -966,15 +988,36 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
966 | stat->d_nomsg, stat->d_retries, stat->d_canceled, | 988 | stat->d_nomsg, stat->d_retries, stat->d_canceled, |
967 | stat->d_nocanceled, stat->d_resets, | 989 | stat->d_nocanceled, stat->d_resets, |
968 | stat->d_rcanceled); | 990 | stat->d_rcanceled); |
991 | seq_printf(file, "%ld %ld\n", | ||
992 | stat->s_bau_disabled, stat->s_bau_reenabled); | ||
969 | } | 993 | } |
970 | 994 | ||
971 | return 0; | 995 | return 0; |
972 | } | 996 | } |
973 | 997 | ||
974 | /* | 998 | /* |
999 | * Display the tunables thru debugfs | ||
1000 | */ | ||
1001 | static ssize_t tunables_read(struct file *file, char __user *userbuf, | ||
1002 | size_t count, loff_t *ppos) | ||
1003 | { | ||
1004 | char buf[300]; | ||
1005 | int ret; | ||
1006 | |||
1007 | ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", | ||
1008 | "max_bau_concurrent plugged_delay plugsb4reset", | ||
1009 | "timeoutsb4reset ipi_reset_limit complete_threshold", | ||
1010 | "congested_response_us congested_reps congested_period", | ||
1011 | max_bau_concurrent, plugged_delay, plugsb4reset, | ||
1012 | timeoutsb4reset, ipi_reset_limit, complete_threshold, | ||
1013 | congested_response_us, congested_reps, congested_period); | ||
1014 | |||
1015 | return simple_read_from_buffer(userbuf, count, ppos, buf, ret); | ||
1016 | } | ||
1017 | |||
1018 | /* | ||
975 | * -1: reset the statistics | 1019 |
976 | * 0: display meaning of the statistics | 1020 | * 0: display meaning of the statistics |
977 | * >0: maximum concurrent active descriptors per uvhub (throttle) | ||
978 | */ | 1021 | */ |
979 | static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | 1022 | static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, |
980 | size_t count, loff_t *data) | 1023 | size_t count, loff_t *data) |
@@ -983,7 +1026,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | |||
983 | long input_arg; | 1026 | long input_arg; |
984 | char optstr[64]; | 1027 | char optstr[64]; |
985 | struct ptc_stats *stat; | 1028 | struct ptc_stats *stat; |
986 | struct bau_control *bcp; | ||
987 | 1029 | ||
988 | if (count == 0 || count > sizeof(optstr)) | 1030 | if (count == 0 || count > sizeof(optstr)) |
989 | return -EINVAL; | 1031 | return -EINVAL; |
@@ -1059,29 +1101,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | |||
1059 | "reset: number of ipi-style reset requests processed\n"); | 1101 | "reset: number of ipi-style reset requests processed\n"); |
1060 | printk(KERN_DEBUG | 1102 | printk(KERN_DEBUG |
1061 | "rcan: number messages canceled by reset requests\n"); | 1103 | "rcan: number messages canceled by reset requests\n"); |
1104 | printk(KERN_DEBUG | ||
1105 | "disable: number times use of the BAU was disabled\n"); | ||
1106 | printk(KERN_DEBUG | ||
1107 | "enable: number times use of the BAU was re-enabled\n"); | ||
1062 | } else if (input_arg == -1) { | 1108 | } else if (input_arg == -1) { |
1063 | for_each_present_cpu(cpu) { | 1109 | for_each_present_cpu(cpu) { |
1064 | stat = &per_cpu(ptcstats, cpu); | 1110 | stat = &per_cpu(ptcstats, cpu); |
1065 | memset(stat, 0, sizeof(struct ptc_stats)); | 1111 | memset(stat, 0, sizeof(struct ptc_stats)); |
1066 | } | 1112 | } |
1067 | } else { | 1113 | } |
1068 | uv_bau_max_concurrent = input_arg; | 1114 | |
1069 | bcp = &per_cpu(bau_control, smp_processor_id()); | 1115 | return count; |
1070 | if (uv_bau_max_concurrent < 1 || | 1116 | } |
1071 | uv_bau_max_concurrent > bcp->cpus_in_uvhub) { | 1117 | |
1072 | printk(KERN_DEBUG | 1118 | static int local_atoi(const char *name) |
1073 | "Error: BAU max concurrent %d; %d is invalid\n", | 1119 | { |
1074 | bcp->max_concurrent, uv_bau_max_concurrent); | 1120 | int val = 0; |
1075 | return -EINVAL; | 1121 | |
1076 | } | 1122 | for (;; name++) { |
1077 | printk(KERN_DEBUG "Set BAU max concurrent:%d\n", | 1123 | switch (*name) { |
1078 | uv_bau_max_concurrent); | 1124 | case '0' ... '9': |
1079 | for_each_present_cpu(cpu) { | 1125 | val = 10*val+(*name-'0'); |
1080 | bcp = &per_cpu(bau_control, cpu); | 1126 | break; |
1081 | bcp->max_concurrent = uv_bau_max_concurrent; | 1127 | default: |
1128 | return val; | ||
1082 | } | 1129 | } |
1083 | } | 1130 | } |
1131 | } | ||
1132 | |||
1133 | /* | ||
1134 | * set the tunables | ||
1135 | * 0 values reset them to defaults | ||
1136 | */ | ||
1137 | static ssize_t tunables_write(struct file *file, const char __user *user, | ||
1138 | size_t count, loff_t *data) | ||
1139 | { | ||
1140 | int cpu; | ||
1141 | int cnt = 0; | ||
1142 | int val; | ||
1143 | char *p; | ||
1144 | char *q; | ||
1145 | char instr[64]; | ||
1146 | struct bau_control *bcp; | ||
1147 | |||
1148 | if (count == 0 || count > sizeof(instr)-1) | ||
1149 | return -EINVAL; | ||
1150 | if (copy_from_user(instr, user, count)) | ||
1151 | return -EFAULT; | ||
1084 | 1152 | ||
1153 | instr[count] = '\0'; | ||
1154 | /* count the fields */ | ||
1155 | p = instr + strspn(instr, WHITESPACE); | ||
1156 | q = p; | ||
1157 | for (; *p; p = q + strspn(q, WHITESPACE)) { | ||
1158 | q = p + strcspn(p, WHITESPACE); | ||
1159 | cnt++; | ||
1160 | if (q == p) | ||
1161 | break; | ||
1162 | } | ||
1163 | if (cnt != 9) { | ||
1164 | printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); | ||
1165 | return -EINVAL; | ||
1166 | } | ||
1167 | |||
1168 | p = instr + strspn(instr, WHITESPACE); | ||
1169 | q = p; | ||
1170 | for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { | ||
1171 | q = p + strcspn(p, WHITESPACE); | ||
1172 | val = local_atoi(p); | ||
1173 | switch (cnt) { | ||
1174 | case 0: | ||
1175 | if (val == 0) { | ||
1176 | max_bau_concurrent = MAX_BAU_CONCURRENT; | ||
1177 | max_bau_concurrent_constant = | ||
1178 | MAX_BAU_CONCURRENT; | ||
1179 | continue; | ||
1180 | } | ||
1181 | bcp = &per_cpu(bau_control, smp_processor_id()); | ||
1182 | if (val < 1 || val > bcp->cpus_in_uvhub) { | ||
1183 | printk(KERN_DEBUG | ||
1184 | "Error: BAU max concurrent %d is invalid\n", | ||
1185 | val); | ||
1186 | return -EINVAL; | ||
1187 | } | ||
1188 | max_bau_concurrent = val; | ||
1189 | max_bau_concurrent_constant = val; | ||
1190 | continue; | ||
1191 | case 1: | ||
1192 | if (val == 0) | ||
1193 | plugged_delay = PLUGGED_DELAY; | ||
1194 | else | ||
1195 | plugged_delay = val; | ||
1196 | continue; | ||
1197 | case 2: | ||
1198 | if (val == 0) | ||
1199 | plugsb4reset = PLUGSB4RESET; | ||
1200 | else | ||
1201 | plugsb4reset = val; | ||
1202 | continue; | ||
1203 | case 3: | ||
1204 | if (val == 0) | ||
1205 | timeoutsb4reset = TIMEOUTSB4RESET; | ||
1206 | else | ||
1207 | timeoutsb4reset = val; | ||
1208 | continue; | ||
1209 | case 4: | ||
1210 | if (val == 0) | ||
1211 | ipi_reset_limit = IPI_RESET_LIMIT; | ||
1212 | else | ||
1213 | ipi_reset_limit = val; | ||
1214 | continue; | ||
1215 | case 5: | ||
1216 | if (val == 0) | ||
1217 | complete_threshold = COMPLETE_THRESHOLD; | ||
1218 | else | ||
1219 | complete_threshold = val; | ||
1220 | continue; | ||
1221 | case 6: | ||
1222 | if (val == 0) | ||
1223 | congested_response_us = CONGESTED_RESPONSE_US; | ||
1224 | else | ||
1225 | congested_response_us = val; | ||
1226 | continue; | ||
1227 | case 7: | ||
1228 | if (val == 0) | ||
1229 | congested_reps = CONGESTED_REPS; | ||
1230 | else | ||
1231 | congested_reps = val; | ||
1232 | continue; | ||
1233 | case 8: | ||
1234 | if (val == 0) | ||
1235 | congested_period = CONGESTED_PERIOD; | ||
1236 | else | ||
1237 | congested_period = val; | ||
1238 | continue; | ||
1239 | } | ||
1240 | if (q == p) | ||
1241 | break; | ||
1242 | } | ||
1243 | for_each_present_cpu(cpu) { | ||
1244 | bcp = &per_cpu(bau_control, cpu); | ||
1245 | bcp->max_bau_concurrent = max_bau_concurrent; | ||
1246 | bcp->max_bau_concurrent_constant = max_bau_concurrent; | ||
1247 | bcp->plugged_delay = plugged_delay; | ||
1248 | bcp->plugsb4reset = plugsb4reset; | ||
1249 | bcp->timeoutsb4reset = timeoutsb4reset; | ||
1250 | bcp->ipi_reset_limit = ipi_reset_limit; | ||
1251 | bcp->complete_threshold = complete_threshold; | ||
1252 | bcp->congested_response_us = congested_response_us; | ||
1253 | bcp->congested_reps = congested_reps; | ||
1254 | bcp->congested_period = congested_period; | ||
1255 | } | ||
1085 | return count; | 1256 | return count; |
1086 | } | 1257 | } |
1087 | 1258 | ||
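[Editor's note] tunables_write() demands exactly nine whitespace-separated integers, in the order that tunables_read() prints them; a zero resets that field to its default. A runnable sketch of driving the interface from user space, assuming debugfs is mounted at /sys/kernel/debug and that UV_BAU_TUNABLES_DIR/UV_BAU_TUNABLES_FILE expand to sgi_uv/bau_tunables:

	#include <stdio.h>

	int main(void)
	{
		/* order: max_bau_concurrent plugged_delay plugsb4reset
		 * timeoutsb4reset ipi_reset_limit complete_threshold
		 * congested_response_us congested_reps congested_period */
		FILE *f = fopen("/sys/kernel/debug/sgi_uv/bau_tunables", "w");

		if (!f)
			return 1;
		fprintf(f, "2 10 0 0 3 5 400 10 0\n");	/* 0 = default */
		return fclose(f) ? 1 : 0;
	}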
@@ -1097,6 +1268,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file) | |||
1097 | return seq_open(file, &uv_ptc_seq_ops); | 1268 | return seq_open(file, &uv_ptc_seq_ops); |
1098 | } | 1269 | } |
1099 | 1270 | ||
1271 | static int tunables_open(struct inode *inode, struct file *file) | ||
1272 | { | ||
1273 | return 0; | ||
1274 | } | ||
1275 | |||
1100 | static const struct file_operations proc_uv_ptc_operations = { | 1276 | static const struct file_operations proc_uv_ptc_operations = { |
1101 | .open = uv_ptc_proc_open, | 1277 | .open = uv_ptc_proc_open, |
1102 | .read = seq_read, | 1278 | .read = seq_read, |
@@ -1105,6 +1281,12 @@ static const struct file_operations proc_uv_ptc_operations = { | |||
1105 | .release = seq_release, | 1281 | .release = seq_release, |
1106 | }; | 1282 | }; |
1107 | 1283 | ||
1284 | static const struct file_operations tunables_fops = { | ||
1285 | .open = tunables_open, | ||
1286 | .read = tunables_read, | ||
1287 | .write = tunables_write, | ||
1288 | }; | ||
1289 | |||
1108 | static int __init uv_ptc_init(void) | 1290 | static int __init uv_ptc_init(void) |
1109 | { | 1291 | { |
1110 | struct proc_dir_entry *proc_uv_ptc; | 1292 | struct proc_dir_entry *proc_uv_ptc; |
@@ -1119,6 +1301,20 @@ static int __init uv_ptc_init(void) | |||
1119 | UV_PTC_BASENAME); | 1301 | UV_PTC_BASENAME); |
1120 | return -EINVAL; | 1302 | return -EINVAL; |
1121 | } | 1303 | } |
1304 | |||
1305 | tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); | ||
1306 | if (!tunables_dir) { | ||
1307 | printk(KERN_ERR "unable to create debugfs directory %s\n", | ||
1308 | UV_BAU_TUNABLES_DIR); | ||
1309 | return -EINVAL; | ||
1310 | } | ||
1311 | tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, | ||
1312 | tunables_dir, NULL, &tunables_fops); | ||
1313 | if (!tunables_file) { | ||
1314 | printk(KERN_ERR "unable to create debugfs file %s\n", | ||
1315 | UV_BAU_TUNABLES_FILE); | ||
1316 | return -EINVAL; | ||
1317 | } | ||
1122 | return 0; | 1318 | return 0; |
1123 | } | 1319 | } |
1124 | 1320 | ||
@@ -1259,15 +1455,45 @@ static void __init uv_init_uvhub(int uvhub, int vector) | |||
1259 | } | 1455 | } |
1260 | 1456 | ||
1261 | /* | 1457 | /* |
1458 | * We will set BAU_MISC_CONTROL with a timeout period. | ||
1459 | * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. | ||
1460 | * So the destination timeout period has to be calculated from them. | ||
1461 | */ | ||
1462 | static int | ||
1463 | calculate_destination_timeout(void) | ||
1464 | { | ||
1465 | unsigned long mmr_image; | ||
1466 | int mult1; | ||
1467 | int mult2; | ||
1468 | int index; | ||
1469 | int base; | ||
1470 | int ret; | ||
1471 | unsigned long ts_ns; | ||
1472 | |||
1473 | mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; | ||
1474 | mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); | ||
1475 | index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; | ||
1476 | mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); | ||
1477 | mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; | ||
1478 | base = timeout_base_ns[index]; | ||
1479 | ts_ns = base * mult1 * mult2; | ||
1480 | ret = ts_ns / 1000; | ||
1481 | return ret; | ||
1482 | } | ||
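[Editor's note] The destination timeout is therefore the BIOS-selected base period scaled by two multipliers and converted to microseconds. A hedged numeric example; the table and register values below are illustrative, not read from real hardware:

	int base = 800;			/* timeout_base_ns[index], assumed */
	int mult1 = 3, mult2 = 10;	/* from the MMR fields, assumed */
	unsigned long ts_ns = base * mult1 * mult2;	/* 24000 ns */
	int timeout = ts_ns / 1000;			/* 24 us */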
1483 | |||
1484 | /* | ||
1262 | * initialize the bau_control structure for each cpu | 1485 | * initialize the bau_control structure for each cpu |
1263 | */ | 1486 | */ |
1264 | static void uv_init_per_cpu(int nuvhubs) | 1487 | static void __init uv_init_per_cpu(int nuvhubs) |
1265 | { | 1488 | { |
1266 | int i, j, k; | 1489 | int i; |
1267 | int cpu; | 1490 | int cpu; |
1268 | int pnode; | 1491 | int pnode; |
1269 | int uvhub; | 1492 | int uvhub; |
1493 | int have_hmaster; | ||
1270 | short socket = 0; | 1494 | short socket = 0; |
1495 | unsigned short socket_mask; | ||
1496 | unsigned char *uvhub_mask; | ||
1271 | struct bau_control *bcp; | 1497 | struct bau_control *bcp; |
1272 | struct uvhub_desc *bdp; | 1498 | struct uvhub_desc *bdp; |
1273 | struct socket_desc *sdp; | 1499 | struct socket_desc *sdp; |
@@ -1278,7 +1504,7 @@ static void uv_init_per_cpu(int nuvhubs) | |||
1278 | short cpu_number[16]; | 1504 | short cpu_number[16]; |
1279 | }; | 1505 | }; |
1280 | struct uvhub_desc { | 1506 | struct uvhub_desc { |
1281 | short num_sockets; | 1507 | unsigned short socket_mask; |
1282 | short num_cpus; | 1508 | short num_cpus; |
1283 | short uvhub; | 1509 | short uvhub; |
1284 | short pnode; | 1510 | short pnode; |
@@ -1286,57 +1512,84 @@ static void uv_init_per_cpu(int nuvhubs) | |||
1286 | }; | 1512 | }; |
1287 | struct uvhub_desc *uvhub_descs; | 1513 | struct uvhub_desc *uvhub_descs; |
1288 | 1514 | ||
1515 | timeout_us = calculate_destination_timeout(); | ||
1516 | |||
1289 | uvhub_descs = (struct uvhub_desc *) | 1517 | uvhub_descs = (struct uvhub_desc *) |
1290 | kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); | 1518 | kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); |
1291 | memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); | 1519 | memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); |
1520 | uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); | ||
1292 | for_each_present_cpu(cpu) { | 1521 | for_each_present_cpu(cpu) { |
1293 | bcp = &per_cpu(bau_control, cpu); | 1522 | bcp = &per_cpu(bau_control, cpu); |
1294 | memset(bcp, 0, sizeof(struct bau_control)); | 1523 | memset(bcp, 0, sizeof(struct bau_control)); |
1295 | spin_lock_init(&bcp->masks_lock); | ||
1296 | bcp->max_concurrent = uv_bau_max_concurrent; | ||
1297 | pnode = uv_cpu_hub_info(cpu)->pnode; | 1524 | pnode = uv_cpu_hub_info(cpu)->pnode; |
1298 | uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; | 1525 | uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; |
1526 | *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); | ||
1299 | bdp = &uvhub_descs[uvhub]; | 1527 | bdp = &uvhub_descs[uvhub]; |
1300 | bdp->num_cpus++; | 1528 | bdp->num_cpus++; |
1301 | bdp->uvhub = uvhub; | 1529 | bdp->uvhub = uvhub; |
1302 | bdp->pnode = pnode; | 1530 | bdp->pnode = pnode; |
1303 | /* time interval to catch a hardware stay-busy bug */ | 1531 | /* kludge: 'assuming' one node per socket, and assuming that |
1304 | bcp->timeout_interval = millisec_2_cycles(3); | 1532 | disabling a socket just leaves a gap in node numbers */ |
1305 | /* kludge: assume uv_hub.h is constant */ | 1533 | socket = (cpu_to_node(cpu) & 1); |
1306 | socket = (cpu_physical_id(cpu)>>5)&1; | 1534 | bdp->socket_mask |= (1 << socket); |
1307 | if (socket >= bdp->num_sockets) | ||
1308 | bdp->num_sockets = socket+1; | ||
1309 | sdp = &bdp->socket[socket]; | 1535 | sdp = &bdp->socket[socket]; |
1310 | sdp->cpu_number[sdp->num_cpus] = cpu; | 1536 | sdp->cpu_number[sdp->num_cpus] = cpu; |
1311 | sdp->num_cpus++; | 1537 | sdp->num_cpus++; |
1312 | } | 1538 | } |
1313 | socket = 0; | 1539 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) { |
1314 | for_each_possible_blade(uvhub) { | 1540 | if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) |
1541 | continue; | ||
1542 | have_hmaster = 0; | ||
1315 | bdp = &uvhub_descs[uvhub]; | 1543 | bdp = &uvhub_descs[uvhub]; |
1316 | for (i = 0; i < bdp->num_sockets; i++) { | 1544 | socket_mask = bdp->socket_mask; |
1317 | sdp = &bdp->socket[i]; | 1545 | socket = 0; |
1318 | for (j = 0; j < sdp->num_cpus; j++) { | 1546 | while (socket_mask) { |
1319 | cpu = sdp->cpu_number[j]; | 1547 | if (!(socket_mask & 1)) |
1548 | goto nextsocket; | ||
1549 | sdp = &bdp->socket[socket]; | ||
1550 | for (i = 0; i < sdp->num_cpus; i++) { | ||
1551 | cpu = sdp->cpu_number[i]; | ||
1320 | bcp = &per_cpu(bau_control, cpu); | 1552 | bcp = &per_cpu(bau_control, cpu); |
1321 | bcp->cpu = cpu; | 1553 | bcp->cpu = cpu; |
1322 | if (j == 0) { | 1554 | if (i == 0) { |
1323 | smaster = bcp; | 1555 | smaster = bcp; |
1324 | if (i == 0) | 1556 | if (!have_hmaster) { |
1557 | have_hmaster++; | ||
1325 | hmaster = bcp; | 1558 | hmaster = bcp; |
1559 | } | ||
1326 | } | 1560 | } |
1327 | bcp->cpus_in_uvhub = bdp->num_cpus; | 1561 | bcp->cpus_in_uvhub = bdp->num_cpus; |
1328 | bcp->cpus_in_socket = sdp->num_cpus; | 1562 | bcp->cpus_in_socket = sdp->num_cpus; |
1329 | bcp->socket_master = smaster; | 1563 | bcp->socket_master = smaster; |
1564 | bcp->uvhub = bdp->uvhub; | ||
1330 | bcp->uvhub_master = hmaster; | 1565 | bcp->uvhub_master = hmaster; |
1331 | for (k = 0; k < DEST_Q_SIZE; k++) | 1566 | bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> |
1332 | bcp->socket_acknowledge_count[k] = 0; | 1567 | blade_processor_id; |
1333 | bcp->uvhub_cpu = | ||
1334 | uv_cpu_hub_info(cpu)->blade_processor_id; | ||
1335 | } | 1568 | } |
1569 | nextsocket: | ||
1336 | socket++; | 1570 | socket++; |
1571 | socket_mask = (socket_mask >> 1); | ||
1337 | } | 1572 | } |
1338 | } | 1573 | } |
1339 | kfree(uvhub_descs); | 1574 | kfree(uvhub_descs); |
1575 | kfree(uvhub_mask); | ||
1576 | for_each_present_cpu(cpu) { | ||
1577 | bcp = &per_cpu(bau_control, cpu); | ||
1578 | bcp->baudisabled = 0; | ||
1579 | bcp->statp = &per_cpu(ptcstats, cpu); | ||
1580 | /* time interval to catch a hardware stay-busy bug */ | ||
1581 | bcp->timeout_interval = microsec_2_cycles(2*timeout_us); | ||
1582 | bcp->max_bau_concurrent = max_bau_concurrent; | ||
1583 | bcp->max_bau_concurrent_constant = max_bau_concurrent; | ||
1584 | bcp->plugged_delay = plugged_delay; | ||
1585 | bcp->plugsb4reset = plugsb4reset; | ||
1586 | bcp->timeoutsb4reset = timeoutsb4reset; | ||
1587 | bcp->ipi_reset_limit = ipi_reset_limit; | ||
1588 | bcp->complete_threshold = complete_threshold; | ||
1589 | bcp->congested_response_us = congested_response_us; | ||
1590 | bcp->congested_reps = congested_reps; | ||
1591 | bcp->congested_period = congested_period; | ||
1592 | } | ||
1340 | } | 1593 | } |
1341 | 1594 | ||
1342 | /* | 1595 | /* |
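[Editor's note] The new uvhub_mask is a plain byte-array bitmap, one bit per possible uvhub, so hubs with no present cpus can be skipped. The open-coded expressions above amount to these two helpers (the helper names are mine, for illustration only):

	static inline void uvhub_mask_set(unsigned char *mask, int uvhub)
	{
		mask[uvhub / 8] |= 1 << (uvhub % 8);
	}

	static inline int uvhub_mask_test(unsigned char *mask, int uvhub)
	{
		return mask[uvhub / 8] & (1 << (uvhub % 8));
	}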
@@ -1361,10 +1614,11 @@ static int __init uv_bau_init(void) | |||
1361 | zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), | 1614 | zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), |
1362 | GFP_KERNEL, cpu_to_node(cur_cpu)); | 1615 | GFP_KERNEL, cpu_to_node(cur_cpu)); |
1363 | 1616 | ||
1364 | uv_bau_max_concurrent = MAX_BAU_CONCURRENT; | ||
1365 | uv_nshift = uv_hub_info->m_val; | 1617 | uv_nshift = uv_hub_info->m_val; |
1366 | uv_mmask = (1UL << uv_hub_info->m_val) - 1; | 1618 | uv_mmask = (1UL << uv_hub_info->m_val) - 1; |
1367 | nuvhubs = uv_num_possible_blades(); | 1619 | nuvhubs = uv_num_possible_blades(); |
1620 | spin_lock_init(&disable_lock); | ||
1621 | congested_cycles = microsec_2_cycles(congested_response_us); | ||
1368 | 1622 | ||
1369 | uv_init_per_cpu(nuvhubs); | 1623 | uv_init_per_cpu(nuvhubs); |
1370 | 1624 | ||
@@ -1383,15 +1637,19 @@ static int __init uv_bau_init(void) | |||
1383 | alloc_intr_gate(vector, uv_bau_message_intr1); | 1637 | alloc_intr_gate(vector, uv_bau_message_intr1); |
1384 | 1638 | ||
1385 | for_each_possible_blade(uvhub) { | 1639 | for_each_possible_blade(uvhub) { |
1386 | pnode = uv_blade_to_pnode(uvhub); | 1640 | if (uv_blade_nr_possible_cpus(uvhub)) { |
1387 | /* INIT the bau */ | 1641 | pnode = uv_blade_to_pnode(uvhub); |
1388 | uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, | 1642 | /* INIT the bau */ |
1389 | ((unsigned long)1 << 63)); | 1643 | uv_write_global_mmr64(pnode, |
1390 | mmr = 1; /* should be 1 to broadcast to both sockets */ | 1644 | UVH_LB_BAU_SB_ACTIVATION_CONTROL, |
1391 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); | 1645 | ((unsigned long)1 << 63)); |
1646 | mmr = 1; /* should be 1 to broadcast to both sockets */ | ||
1647 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, | ||
1648 | mmr); | ||
1649 | } | ||
1392 | } | 1650 | } |
1393 | 1651 | ||
1394 | return 0; | 1652 | return 0; |
1395 | } | 1653 | } |
1396 | core_initcall(uv_bau_init); | 1654 | core_initcall(uv_bau_init); |
1397 | core_initcall(uv_ptc_init); | 1655 | fs_initcall(uv_ptc_init); |
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index c652ef62742d..e2a595257390 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c | |||
@@ -1,6 +1,7 @@ | |||
1 | #include <linux/io.h> | 1 | #include <linux/io.h> |
2 | 2 | ||
3 | #include <asm/trampoline.h> | 3 | #include <asm/trampoline.h> |
4 | #include <asm/pgtable.h> | ||
4 | #include <asm/e820.h> | 5 | #include <asm/e820.h> |
5 | 6 | ||
6 | #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) | 7 | #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) |
@@ -37,3 +38,19 @@ unsigned long __trampinit setup_trampoline(void) | |||
37 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); | 38 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); |
38 | return virt_to_phys(trampoline_base); | 39 | return virt_to_phys(trampoline_base); |
39 | } | 40 | } |
41 | |||
42 | void __init setup_trampoline_page_table(void) | ||
43 | { | ||
44 | #ifdef CONFIG_X86_32 | ||
45 | /* Copy kernel address range */ | ||
46 | clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, | ||
47 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
48 | KERNEL_PGD_PTRS); | ||
49 | |||
50 | /* Initialize low mappings */ | ||
51 | clone_pgd_range(trampoline_pg_dir, | ||
52 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
53 | min_t(unsigned long, KERNEL_PGD_PTRS, | ||
54 | KERNEL_PGD_BOUNDARY)); | ||
55 | #endif | ||
56 | } | ||
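[Editor's note] setup_trampoline_page_table() gives the trampoline its own PGD: the kernel half is copied from swapper_pg_dir, and the low range is filled with the same kernel mappings so the trampoline can run with paging enabled at low addresses. clone_pgd_range() is, as far as I recall its pgtable.h definition, just a typed copy of top-level entries; treat this sketch of its effect as an assumption:

	/* sketch: after the copy both page tables share the same
	 * lower-level tables for the copied range */
	static inline void clone_pgd_range_sketch(pgd_t *dst, pgd_t *src,
						  int count)
	{
		memcpy(dst, src, count * sizeof(pgd_t));
	}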
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 725ef4d17cd5..60788dee0f8a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
392 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | 392 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) |
393 | == NOTIFY_STOP) | 393 | == NOTIFY_STOP) |
394 | return; | 394 | return; |
395 | |||
395 | #ifdef CONFIG_X86_LOCAL_APIC | 396 | #ifdef CONFIG_X86_LOCAL_APIC |
397 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
398 | == NOTIFY_STOP) | ||
399 | return; | ||
400 | |||
401 | #ifndef CONFIG_LOCKUP_DETECTOR | ||
396 | /* | 402 | /* |
397 | * Ok, so this is none of the documented NMI sources, | 403 | * Ok, so this is none of the documented NMI sources, |
398 | * so it must be the NMI watchdog. | 404 | * so it must be the NMI watchdog. |
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
400 | if (nmi_watchdog_tick(regs, reason)) | 406 | if (nmi_watchdog_tick(regs, reason)) |
401 | return; | 407 | return; |
402 | if (!do_nmi_callback(regs, cpu)) | 408 | if (!do_nmi_callback(regs, cpu)) |
409 | #endif /* !CONFIG_LOCKUP_DETECTOR */ | ||
403 | unknown_nmi_error(reason, regs); | 410 | unknown_nmi_error(reason, regs); |
404 | #else | 411 | #else |
405 | unknown_nmi_error(reason, regs); | 412 | unknown_nmi_error(reason, regs); |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae1841..26a863a9c2a8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -626,6 +626,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | |||
626 | local_irq_restore(flags); | 626 | local_irq_restore(flags); |
627 | } | 627 | } |
628 | 628 | ||
629 | static unsigned long long cyc2ns_suspend; | ||
630 | |||
631 | void save_sched_clock_state(void) | ||
632 | { | ||
633 | if (!sched_clock_stable) | ||
634 | return; | ||
635 | |||
636 | cyc2ns_suspend = sched_clock(); | ||
637 | } | ||
638 | |||
639 | /* | ||
640 | * Even on processors with an invariant TSC, the TSC gets reset in some of | ||
641 | * the ACPI system sleep states. And on some systems the BIOS seems to reinit | ||
642 | * the TSC to an arbitrary value (still sync'd across cpus) during resume | ||
643 | * from such sleep states. To cope with this, recompute the cyc2ns_offset | ||
644 | * for each cpu so that sched_clock() continues from the point where it was | ||
645 | * left off during suspend. | ||
646 | */ | ||
647 | void restore_sched_clock_state(void) | ||
648 | { | ||
649 | unsigned long long offset; | ||
650 | unsigned long flags; | ||
651 | int cpu; | ||
652 | |||
653 | if (!sched_clock_stable) | ||
654 | return; | ||
655 | |||
656 | local_irq_save(flags); | ||
657 | |||
658 | __get_cpu_var(cyc2ns_offset) = 0; | ||
659 | offset = cyc2ns_suspend - sched_clock(); | ||
660 | |||
661 | for_each_possible_cpu(cpu) | ||
662 | per_cpu(cyc2ns_offset, cpu) = offset; | ||
663 | |||
664 | local_irq_restore(flags); | ||
665 | } | ||
666 | |||
629 | #ifdef CONFIG_CPU_FREQ | 667 | #ifdef CONFIG_CPU_FREQ |
630 | 668 | ||
631 | /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency | 669 | /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency |
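[Editor's note] The restore path works because, with cyc2ns_offset zeroed, sched_clock() returns the raw post-resume reading; the difference between the saved pre-suspend value and that raw reading then becomes the offset installed on every cpu. A sketch of the invariant this establishes:

	/* sketch: at the restore instant,
	 *   sched_clock() == raw + (cyc2ns_suspend - raw) == cyc2ns_suspend
	 * so the clock continues exactly where suspend left it */
	u64 resumed_clock(u64 raw_now, u64 raw_at_restore, u64 saved)
	{
		return raw_now + (saved - raw_at_restore);
	}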
@@ -751,7 +789,6 @@ static struct clocksource clocksource_tsc = { | |||
751 | .read = read_tsc, | 789 | .read = read_tsc, |
752 | .resume = resume_tsc, | 790 | .resume = resume_tsc, |
753 | .mask = CLOCKSOURCE_MASK(64), | 791 | .mask = CLOCKSOURCE_MASK(64), |
754 | .shift = 22, | ||
755 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | 792 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | |
756 | CLOCK_SOURCE_MUST_VERIFY, | 793 | CLOCK_SOURCE_MUST_VERIFY, |
757 | #ifdef CONFIG_X86_64 | 794 | #ifdef CONFIG_X86_64 |
@@ -845,8 +882,6 @@ __cpuinit int unsynchronized_tsc(void) | |||
845 | 882 | ||
846 | static void __init init_tsc_clocksource(void) | 883 | static void __init init_tsc_clocksource(void) |
847 | { | 884 | { |
848 | clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, | ||
849 | clocksource_tsc.shift); | ||
850 | if (tsc_clocksource_reliable) | 885 | if (tsc_clocksource_reliable) |
851 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; | 886 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; |
852 | /* lower the rating if we already know its unstable: */ | 887 | /* lower the rating if we already know its unstable: */ |
@@ -854,7 +889,7 @@ static void __init init_tsc_clocksource(void) | |||
854 | clocksource_tsc.rating = 0; | 889 | clocksource_tsc.rating = 0; |
855 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; | 890 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; |
856 | } | 891 | } |
857 | clocksource_register(&clocksource_tsc); | 892 | clocksource_register_khz(&clocksource_tsc, tsc_khz); |
858 | } | 893 | } |
859 | 894 | ||
860 | #ifdef CONFIG_X86_64 | 895 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a1..56a8c2a867d9 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S | |||
@@ -31,6 +31,7 @@ | |||
31 | */ | 31 | */ |
32 | 32 | ||
33 | #include <asm/cpufeature.h> | 33 | #include <asm/cpufeature.h> |
34 | #include <asm/msr-index.h> | ||
34 | 35 | ||
35 | verify_cpu: | 36 | verify_cpu: |
36 | pushfl # Save caller passed flags | 37 | pushfl # Save caller passed flags |
@@ -88,7 +89,7 @@ verify_cpu_sse_test: | |||
88 | je verify_cpu_sse_ok | 89 | je verify_cpu_sse_ok |
89 | test %di,%di | 90 | test %di,%di |
90 | jz verify_cpu_no_longmode # only try to force SSE on AMD | 91 | jz verify_cpu_no_longmode # only try to force SSE on AMD |
91 | movl $0xc0010015,%ecx # HWCR | 92 | movl $MSR_K7_HWCR,%ecx |
92 | rdmsr | 93 | rdmsr |
93 | btr $15,%eax # enable SSE | 94 | btr $15,%eax # enable SSE |
94 | wrmsr | 95 | wrmsr |
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60f..dcbb28c4b694 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void) | |||
73 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | 73 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
74 | } | 74 | } |
75 | 75 | ||
76 | void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, | 76 | void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, |
77 | u32 mult) | 77 | struct clocksource *clock, u32 mult) |
78 | { | 78 | { |
79 | unsigned long flags; | 79 | unsigned long flags; |
80 | 80 | ||
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, | |||
87 | vsyscall_gtod_data.clock.shift = clock->shift; | 87 | vsyscall_gtod_data.clock.shift = clock->shift; |
88 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; | 88 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; |
89 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | 89 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; |
90 | vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; | 90 | vsyscall_gtod_data.wall_to_monotonic = *wtm; |
91 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); | 91 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); |
92 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | 92 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
93 | } | 93 | } |
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | |||
169 | * unlikely */ | 169 | * unlikely */ |
170 | time_t __vsyscall(1) vtime(time_t *t) | 170 | time_t __vsyscall(1) vtime(time_t *t) |
171 | { | 171 | { |
172 | struct timeval tv; | 172 | unsigned seq; |
173 | time_t result; | 173 | time_t result; |
174 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) | 174 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) |
175 | return time_syscall(t); | 175 | return time_syscall(t); |
176 | 176 | ||
177 | vgettimeofday(&tv, NULL); | 177 | do { |
178 | result = tv.tv_sec; | 178 | seq = read_seqbegin(&__vsyscall_gtod_data.lock); |
179 | |||
180 | result = __vsyscall_gtod_data.wall_time_sec; | ||
181 | |||
182 | } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); | ||
183 | |||
179 | if (t) | 184 | if (t) |
180 | *t = result; | 185 | *t = result; |
181 | return result; | 186 | return result; |
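[Editor's note] vtime() now uses the standard seqlock read side: sample the sequence, read, and retry if a writer raced. The loop is also what makes multi-field reads consistent; a sketch of reading seconds and nanoseconds as one snapshot under the same discipline:

	struct timespec ts;
	unsigned seq;

	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
		ts.tv_sec  = __vsyscall_gtod_data.wall_time_sec;
		ts.tv_nsec = __vsyscall_gtod_data.wall_time_nsec;
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));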
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24a..9c253bd65e24 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -16,11 +16,88 @@ | |||
16 | */ | 16 | */ |
17 | u64 pcntxt_mask; | 17 | u64 pcntxt_mask; |
18 | 18 | ||
19 | /* | ||
20 | * Represents init state for the supported extended state. | ||
21 | */ | ||
22 | static struct xsave_struct *init_xstate_buf; | ||
23 | |||
19 | struct _fpx_sw_bytes fx_sw_reserved; | 24 | struct _fpx_sw_bytes fx_sw_reserved; |
20 | #ifdef CONFIG_IA32_EMULATION | 25 | #ifdef CONFIG_IA32_EMULATION |
21 | struct _fpx_sw_bytes fx_sw_reserved_ia32; | 26 | struct _fpx_sw_bytes fx_sw_reserved_ia32; |
22 | #endif | 27 | #endif |
23 | 28 | ||
29 | static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; | ||
30 | |||
31 | /* | ||
32 | * If a processor implementation discerns that a processor state component is | ||
33 | * in its initialized state, it may set the corresponding bit in | ||
34 | * xsave_hdr.xstate_bv to '0' without modifying the corresponding memory | ||
35 | * layout (in the case of xsaveopt). While presenting the xstate information | ||
36 | * to the user, we always ensure that the memory layout of a feature is in | ||
37 | * the init state if the corresponding header bit is zero. This ensures that | ||
38 | * the user doesn't see stale state in the memory layout during signal | ||
39 | * handling, debugging, etc. | ||
40 | */ | ||
41 | void __sanitize_i387_state(struct task_struct *tsk) | ||
42 | { | ||
43 | u64 xstate_bv; | ||
44 | int feature_bit = 0x2; | ||
45 | struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; | ||
46 | |||
47 | if (!fx) | ||
48 | return; | ||
49 | |||
50 | BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); | ||
51 | |||
52 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; | ||
53 | |||
54 | /* | ||
55 | * None of the feature bits is in the init state, so there is nothing | ||
56 | * for us to do: the memory layout is up to date. | ||
57 | */ | ||
58 | if ((xstate_bv & pcntxt_mask) == pcntxt_mask) | ||
59 | return; | ||
60 | |||
61 | /* | ||
62 | * FP is in init state | ||
63 | */ | ||
64 | if (!(xstate_bv & XSTATE_FP)) { | ||
65 | fx->cwd = 0x37f; | ||
66 | fx->swd = 0; | ||
67 | fx->twd = 0; | ||
68 | fx->fop = 0; | ||
69 | fx->rip = 0; | ||
70 | fx->rdp = 0; | ||
71 | memset(&fx->st_space[0], 0, 128); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * SSE is in init state | ||
76 | */ | ||
77 | if (!(xstate_bv & XSTATE_SSE)) | ||
78 | memset(&fx->xmm_space[0], 0, 256); | ||
79 | |||
80 | xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; | ||
81 | |||
82 | /* | ||
83 | * Update the memory layout of each remaining feature whose | ||
84 | * corresponding header bit indicates the init state. | ||
85 | */ | ||
86 | while (xstate_bv) { | ||
87 | if (xstate_bv & 0x1) { | ||
88 | int offset = xstate_offsets[feature_bit]; | ||
89 | int size = xstate_sizes[feature_bit]; | ||
90 | |||
91 | memcpy(((void *) fx) + offset, | ||
92 | ((void *) init_xstate_buf) + offset, | ||
93 | size); | ||
94 | } | ||
95 | |||
96 | xstate_bv >>= 1; | ||
97 | feature_bit++; | ||
98 | } | ||
99 | } | ||
100 | |||
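The loop at the end of __sanitize_i387_state() visits only the features whose header bit is clear; the shift by 2 and the starting feature_bit of 0x2 account for FP and SSE having been handled explicitly above. A standalone illustration of that bit walk, with made-up mask values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pcntxt_mask = 0x7;	/* FP | SSE | YMM, for illustration */
	uint64_t xstate_bv   = 0x3;	/* FP, SSE written; YMM left in init */
	uint64_t todo = (pcntxt_mask & ~xstate_bv) >> 2;
	int feature_bit = 0x2;		/* bits 0 and 1 already handled */

	while (todo) {
		if (todo & 0x1)
			printf("copy init image of feature %d\n",
			       feature_bit);
		todo >>= 1;
		feature_bit++;
	}
	return 0;	/* prints: copy init image of feature 2 */
}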
24 | /* | 101 | /* |
25 | * Check for the presence of extended state information in the | 102 | * Check for the presence of extended state information in the |
26 | * user fpstate pointer in the sigcontext. | 103 | * user fpstate pointer in the sigcontext. |
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, | |||
36 | 113 | ||
37 | err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], | 114 | err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], |
38 | sizeof(struct _fpx_sw_bytes)); | 115 | sizeof(struct _fpx_sw_bytes)); |
39 | |||
40 | if (err) | 116 | if (err) |
41 | return err; | 117 | return -EFAULT; |
42 | 118 | ||
43 | /* | 119 | /* |
44 | * First Magic check failed. | 120 | * First Magic check failed. |
45 | */ | 121 | */ |
46 | if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) | 122 | if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) |
47 | return -1; | 123 | return -EINVAL; |
48 | 124 | ||
49 | /* | 125 | /* |
50 | * Check for error scenarios. | 126 | * Check for error scenarios. |
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, | |||
52 | if (fx_sw_user->xstate_size < min_xstate_size || | 128 | if (fx_sw_user->xstate_size < min_xstate_size || |
53 | fx_sw_user->xstate_size > xstate_size || | 129 | fx_sw_user->xstate_size > xstate_size || |
54 | fx_sw_user->xstate_size > fx_sw_user->extended_size) | 130 | fx_sw_user->xstate_size > fx_sw_user->extended_size) |
55 | return -1; | 131 | return -EINVAL; |
56 | 132 | ||
57 | err = __get_user(magic2, (__u32 *) (((void *)fpstate) + | 133 | err = __get_user(magic2, (__u32 *) (((void *)fpstate) + |
58 | fx_sw_user->extended_size - | 134 | fx_sw_user->extended_size - |
59 | FP_XSTATE_MAGIC2_SIZE)); | 135 | FP_XSTATE_MAGIC2_SIZE)); |
136 | if (err) | ||
137 | return err; | ||
60 | /* | 138 | /* |
61 | * Check for the presence of the second magic word at the end of the | 139 |
62 | * memory layout. This detects the case where the user just copied the | 140 |
63 | * legacy fpstate layout without copying the extended state information | 141 |
64 | * in the memory layout. | 142 |
65 | */ | 143 | */ |
66 | if (err || magic2 != FP_XSTATE_MAGIC2) | 144 | if (magic2 != FP_XSTATE_MAGIC2) |
67 | return -1; | 145 | return -EFAULT; |
68 | 146 | ||
69 | return 0; | 147 | return 0; |
70 | } | 148 | } |
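A valid extended signal frame is framed by magic1 in the sw_reserved bytes and magic2 at the very end of the extended layout; with this change a failed user-space read reports -EFAULT while an inconsistent frame reports -EINVAL. A userspace sketch of the same framing checks, with an illustrative struct layout and magic values (not the kernel's):

#include <errno.h>
#include <stdint.h>
#include <string.h>

#define MAGIC1	0x31584653u	/* illustrative, not FP_XSTATE_MAGIC1 */
#define MAGIC2	0x32584653u	/* illustrative, not FP_XSTATE_MAGIC2 */

struct sw_bytes {
	uint32_t magic1;	/* marks an extended frame */
	uint32_t extended_size;	/* total frame size, incl. magic2 */
	uint64_t xstate_bv;
	uint32_t xstate_size;	/* state actually saved */
};

static int check_frame(const unsigned char *frame,
		       const struct sw_bytes *sw,
		       uint32_t min_size, uint32_t max_size)
{
	uint32_t magic2;

	if (sw->magic1 != MAGIC1)
		return -EINVAL;		/* no extended frame present */

	if (sw->xstate_size < min_size ||
	    sw->xstate_size > max_size ||
	    sw->xstate_size > sw->extended_size)
		return -EINVAL;		/* inconsistent sizes */

	/* magic2 sits at the very end of the extended layout; its
	 * absence means only the legacy fpstate bytes were copied. */
	memcpy(&magic2, frame + sw->extended_size - sizeof(magic2),
	       sizeof(magic2));
	if (magic2 != MAGIC2)
		return -EFAULT;

	return 0;
}

int main(void)
{
	unsigned char frame[256] = { 0 };
	struct sw_bytes sw = {
		.magic1 = MAGIC1, .extended_size = 256,
		.xstate_bv = 0x3, .xstate_size = 200,
	};
	uint32_t magic2 = MAGIC2;

	memcpy(frame + sw.extended_size - sizeof(magic2), &magic2,
	       sizeof(magic2));
	return check_frame(frame, &sw, 64, 512) ? 1 : 0;
}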
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf) | |||
91 | return 0; | 169 | return 0; |
92 | 170 | ||
93 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | 171 | if (task_thread_info(tsk)->status & TS_USEDFPU) { |
94 | /* | ||
95 | * Start with clearing the user buffer. This will present a | ||
96 | * clean context for the bytes not touched by the fxsave/xsave. | ||
97 | */ | ||
98 | err = __clear_user(buf, sig_xstate_size); | ||
99 | if (err) | ||
100 | return err; | ||
101 | |||
102 | if (use_xsave()) | 172 | if (use_xsave()) |
103 | err = xsave_user(buf); | 173 | err = xsave_user(buf); |
104 | else | 174 | else |
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf) | |||
109 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | 179 | task_thread_info(tsk)->status &= ~TS_USEDFPU; |
110 | stts(); | 180 | stts(); |
111 | } else { | 181 | } else { |
182 | sanitize_i387_state(tsk); | ||
112 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, | 183 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, |
113 | xstate_size)) | 184 | xstate_size)) |
114 | return -1; | 185 | return -1; |
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf) | |||
184 | * init the state skipped by the user. | 255 | * init the state skipped by the user. |
185 | */ | 256 | */ |
186 | mask = pcntxt_mask & ~mask; | 257 | mask = pcntxt_mask & ~mask; |
187 | 258 | if (unlikely(mask)) | |
188 | xrstor_state(init_xstate_buf, mask); | 259 | xrstor_state(init_xstate_buf, mask); |
189 | 260 | ||
190 | return 0; | 261 | return 0; |
191 | 262 | ||
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void) | |||
274 | #endif | 345 | #endif |
275 | } | 346 | } |
276 | 347 | ||
277 | /* | ||
278 | * Represents init state for the supported extended state. | ||
279 | */ | ||
280 | struct xsave_struct *init_xstate_buf; | ||
281 | |||
282 | #ifdef CONFIG_X86_64 | 348 | #ifdef CONFIG_X86_64 |
283 | unsigned int sig_xstate_size = sizeof(struct _fpstate); | 349 | unsigned int sig_xstate_size = sizeof(struct _fpstate); |
284 | #endif | 350 | #endif |
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); | |||
286 | /* | 352 | /* |
287 | * Enable the extended processor state save/restore feature | 353 | * Enable the extended processor state save/restore feature |
288 | */ | 354 | */ |
289 | void __cpuinit xsave_init(void) | 355 | static inline void xstate_enable(void) |
290 | { | 356 | { |
291 | if (!cpu_has_xsave) | ||
292 | return; | ||
293 | |||
294 | set_in_cr4(X86_CR4_OSXSAVE); | 357 | set_in_cr4(X86_CR4_OSXSAVE); |
295 | |||
296 | /* | ||
297 | * Enable all the features that the HW is capable of | ||
298 | * and the Linux kernel is aware of. | ||
299 | */ | ||
300 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); | 358 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); |
301 | } | 359 | } |
302 | 360 | ||
303 | /* | 361 | /* |
362 | * Record the offsets and sizes of the various states managed by the | ||
363 | * xsave memory layout. | ||
364 | */ | ||
365 | static void __init setup_xstate_features(void) | ||
366 | { | ||
367 | int eax, ebx, ecx, edx, leaf = 0x2; | ||
368 | |||
369 | xstate_features = fls64(pcntxt_mask); | ||
370 | xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); | ||
371 | xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); | ||
372 | |||
373 | do { | ||
374 | cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); | ||
375 | |||
376 | if (eax == 0) | ||
377 | break; | ||
378 | |||
379 | xstate_offsets[leaf] = ebx; | ||
380 | xstate_sizes[leaf] = eax; | ||
381 | |||
382 | leaf++; | ||
383 | } while (1); | ||
384 | } | ||
385 | |||
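setup_xstate_features() sizes its offset/size tables from fls64(pcntxt_mask) and then queries CPUID leaf 0xD, one subleaf per feature, until a subleaf reports a zero size. The same enumeration can be reproduced from userspace with the compiler's cpuid helper; a sketch assuming an XSAVE-capable x86 CPU and a GCC/Clang toolchain:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, subleaf;

	/* Subleaves 0 and 1 of leaf 0xD are special; the per-feature
	 * size (EAX) / offset (EBX) pairs start at subleaf 2 and end
	 * at the first subleaf that reports a zero size. */
	for (subleaf = 2; ; subleaf++) {
		__cpuid_count(0xd, subleaf, eax, ebx, ecx, edx);
		if (eax == 0)
			break;
		printf("xstate component %u: %u bytes at offset %u\n",
		       subleaf, eax, ebx);
	}
	return 0;
}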
386 | /* | ||
304 | * setup the xstate image representing the init state | 387 | * setup the xstate image representing the init state |
305 | */ | 388 | */ |
306 | static void __init setup_xstate_init(void) | 389 | static void __init setup_xstate_init(void) |
307 | { | 390 | { |
391 | setup_xstate_features(); | ||
392 | |||
393 | /* | ||
394 | * Set up init_xstate_buf to represent the init state of | ||
395 | * all the features managed by xsave. | ||
396 | */ | ||
308 | init_xstate_buf = alloc_bootmem(xstate_size); | 397 | init_xstate_buf = alloc_bootmem(xstate_size); |
309 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; | 398 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; |
399 | |||
400 | clts(); | ||
401 | /* | ||
402 | * Initialize all the feature states, with the header's xstate_bv being 0x0. | ||
403 | */ | ||
404 | xrstor_state(init_xstate_buf, -1); | ||
405 | /* | ||
406 | * Dump the init state again. This captures the init state of | ||
407 | * any feature whose init state is not represented by all zeros. | ||
408 | */ | ||
409 | xsave_state(init_xstate_buf, -1); | ||
410 | stts(); | ||
310 | } | 411 | } |
311 | 412 | ||
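setup_xstate_init() captures the hardware's init images: an xrstor from a buffer whose header xstate_bv is zero forces every component to its init value, and the following xsave records what those values look like (mxcsr is seeded first because xrstor loads MXCSR from memory). A userspace sketch of the same trick using the XSAVE intrinsics, assuming a toolchain with -mxsave support and an OS with OSXSAVE enabled:

#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* 512-byte legacy area + 64-byte header + component space;
	 * XSAVE buffers must be 64-byte aligned. */
	unsigned char *buf = aligned_alloc(64, 4096);
	unsigned int mxcsr_default = 0x1f80;
	unsigned short fcw;

	if (!buf)
		return 1;
	memset(buf, 0, 4096);			/* header xstate_bv == 0 */
	/* seed MXCSR, as the kernel does, since xrstor loads it */
	memcpy(buf + 24, &mxcsr_default, sizeof(mxcsr_default));

	_xrstor(buf, 0x3);	/* x87/SSE back to their init state */
	_xsave(buf, 0x3);	/* dump the resulting init images */

	/* The x87 control word comes back as 0x037f, the same value
	 * __sanitize_i387_state() writes for an init-state FP. */
	memcpy(&fcw, buf, sizeof(fcw));
	printf("init FCW: 0x%x\n", fcw);

	free(buf);
	return 0;
}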
312 | /* | 413 | /* |
313 | * Enable and initialize the xsave feature. | 414 | * Enable and initialize the xsave feature. |
314 | */ | 415 | */ |
315 | void __ref xsave_cntxt_init(void) | 416 | static void __init xstate_enable_boot_cpu(void) |
316 | { | 417 | { |
317 | unsigned int eax, ebx, ecx, edx; | 418 | unsigned int eax, ebx, ecx, edx; |
318 | 419 | ||
319 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | 420 | if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { |
421 | WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); | ||
320 | pcntxt_mask = eax + ((u64)edx << 32); | 426 | pcntxt_mask = eax + ((u64)edx << 32); |
321 | 427 | ||
322 | if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { | 428 | if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { |
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void) | |||
329 | * Support only the state known to the OS. | 435 |
330 | */ | 436 | */ |
331 | pcntxt_mask = pcntxt_mask & XCNTXT_MASK; | 437 | pcntxt_mask = pcntxt_mask & XCNTXT_MASK; |
332 | xsave_init(); | 438 | |
439 | xstate_enable(); | ||
333 | 440 | ||
334 | /* | 441 | /* |
335 | * Recompute the context size for enabled features | 442 | * Recompute the context size for enabled features |
336 | */ | 443 | */ |
337 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | 444 | cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); |
338 | xstate_size = ebx; | 445 | xstate_size = ebx; |
339 | 446 | ||
340 | update_regset_xstate_info(xstate_size, pcntxt_mask); | 447 | update_regset_xstate_info(xstate_size, pcntxt_mask); |
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void) | |||
346 | "cntxt size 0x%x\n", | 453 | "cntxt size 0x%x\n", |
347 | pcntxt_mask, xstate_size); | 454 | pcntxt_mask, xstate_size); |
348 | } | 455 | } |
456 | |||
457 | /* | ||
458 | * For the very first instance, this calls xstate_enable_boot_cpu(); | ||
459 | * for all subsequent instances, this calls xstate_enable(). | ||
460 | * | ||
461 | * This is somewhat obfuscated due to the lack of powerful enough | ||
462 | * overrides for the section checks. | ||
463 | */ | ||
464 | void __cpuinit xsave_init(void) | ||
465 | { | ||
466 | static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; | ||
467 | void (*this_func)(void); | ||
468 | |||
469 | if (!cpu_has_xsave) | ||
470 | return; | ||
471 | |||
472 | this_func = next_func; | ||
473 | next_func = xstate_enable; | ||
474 | this_func(); | ||
475 | } | ||
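xsave_init() resolves the boot-cpu/secondary-cpu split with a static function pointer that is repointed on first use. A generic standalone sketch of that call-once trampoline (no locking is shown, as the kernel relies on CPU bringup being serialized):

#include <stdio.h>

static void enable(void)
{
	puts("steady state: just enable the feature on this cpu");
}

static void enable_boot_cpu(void)
{
	puts("first call: probe features, then enable");
}

/* Starts at the one-time path; repointed before the first call runs,
 * so every later caller takes the cheap path. */
static void (*next_func)(void) = enable_boot_cpu;

static void feature_init(void)
{
	void (*this_func)(void) = next_func;

	next_func = enable;
	this_func();
}

int main(void)
{
	feature_init();		/* boot cpu */
	feature_init();		/* every other cpu */
	return 0;
}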