Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 5
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 23
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/apb_timer.c | 8
-rw-r--r--  arch/x86/kernel/apic/apic.c | 455
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 13
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 19
-rw-r--r--  arch/x86/kernel/cpu/common.c | 38
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 10
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 145
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 8
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 2
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 5
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel_early.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 76
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_rapl.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c | 9
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.h | 18
-rw-r--r--  arch/x86/kernel/dumpstack.c | 5
-rw-r--r--  arch/x86/kernel/e820.c | 26
-rw-r--r--  arch/x86/kernel/early_printk.c | 187
-rw-r--r--  arch/x86/kernel/entry_32.S | 3
-rw-r--r--  arch/x86/kernel/entry_64.S | 320
-rw-r--r--  arch/x86/kernel/head32.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 11
-rw-r--r--  arch/x86/kernel/head_64.S | 30
-rw-r--r--  arch/x86/kernel/hpet.c | 2
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 45
-rw-r--r--  arch/x86/kernel/i387.c | 42
-rw-r--r--  arch/x86/kernel/irq.c | 3
-rw-r--r--  arch/x86/kernel/irq_32.c | 13
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 56
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 5
-rw-r--r--  arch/x86/kernel/kvm.c | 13
-rw-r--r--  arch/x86/kernel/livepatch.c | 90
-rw-r--r--  arch/x86/kernel/module.c | 24
-rw-r--r--  arch/x86/kernel/pmc_atom.c | 81
-rw-r--r--  arch/x86/kernel/process.c | 5
-rw-r--r--  arch/x86/kernel/process_32.c | 2
-rw-r--r--  arch/x86/kernel/process_64.c | 2
-rw-r--r--  arch/x86/kernel/rtc.c | 6
-rw-r--r--  arch/x86/kernel/setup.c | 35
-rw-r--r--  arch/x86/kernel/signal.c | 8
-rw-r--r--  arch/x86/kernel/smpboot.c | 113
-rw-r--r--  arch/x86/kernel/traps.c | 131
-rw-r--r--  arch/x86/kernel/uprobes.c | 153
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 10
-rw-r--r--  arch/x86/kernel/xsave.c | 3
54 files changed, 1440 insertions, 879 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5d4502c8b983..cdb1b70ddad0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
 endif
 
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
@@ -63,6 +67,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
 obj-y += apic/
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
 obj-$(CONFIG_X86_TSC) += trace_clock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b9e30daa0881..3d525c6124f6 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -613,6 +613,11 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
 {
 	int rc, irq, trigger, polarity;
 
+	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
+		*irqp = gsi;
+		return 0;
+	}
+
 	rc = acpi_get_override_irq(gsi, &trigger, &polarity);
 	if (rc == 0) {
 		trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
@@ -653,6 +658,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
 	return gsi;
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
 static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 				    int trigger, int polarity)
 {
@@ -675,6 +681,7 @@ static void acpi_unregister_gsi_ioapic(u32 gsi)
 	mutex_unlock(&acpi_ioapic_lock);
 #endif
 }
+#endif
 
 int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
 			   int trigger, int polarity) = acpi_register_gsi_pic;
@@ -843,13 +850,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
 
 static int __init acpi_parse_sbf(struct acpi_table_header *table)
 {
-	struct acpi_table_boot *sb;
-
-	sb = (struct acpi_table_boot *)table;
-	if (!sb) {
-		printk(KERN_WARNING PREFIX "Unable to map SBF\n");
-		return -ENODEV;
-	}
+	struct acpi_table_boot *sb = (struct acpi_table_boot *)table;
 
 	sbf_port = sb->cmos_index;	/* Save CMOS port */
 
@@ -863,13 +864,7 @@ static struct resource *hpet_res __initdata;
 
 static int __init acpi_parse_hpet(struct acpi_table_header *table)
 {
-	struct acpi_table_hpet *hpet_tbl;
-
-	hpet_tbl = (struct acpi_table_hpet *)table;
-	if (!hpet_tbl) {
-		printk(KERN_WARNING PREFIX "Unable to map HPET\n");
-		return -ENODEV;
-	}
+	struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
 
 	if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
 		printk(KERN_WARNING PREFIX "HPET timers must be located in "
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 31368207837c..d1daead5fcdd 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void)
 
 	header->pmode_cr0 = read_cr0();
 	if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
-		header->pmode_cr4 = read_cr4();
+		header->pmode_cr4 = __read_cr4();
 		header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
 	}
 	if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index b708738d016e..6a7c23ff21d3 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -135,14 +135,6 @@ static inline void apbt_clear_mapping(void)
 	apbt_virt_address = NULL;
 }
 
-/*
- * APBT timer interrupt enable / disable
- */
-static inline int is_apbt_capable(void)
-{
-	return apbt_virt_address ? 1 : 0;
-}
-
 static int __init apbt_clockevent_register(void)
 {
 	struct sfi_timer_table_entry *mtmr;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 29b5b18afa27..ad3639ae1b9b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -134,9 +134,6 @@ static inline void imcr_apic_to_pic(void)
  */
 static int force_enable_local_apic __initdata;
 
-/* Control whether x2APIC mode is enabled or not */
-static bool nox2apic __initdata;
-
 /*
  * APIC command line parameters
  */
@@ -161,33 +158,6 @@ static __init int setup_apicpmtimer(char *s)
 __setup("apicpmtimer", setup_apicpmtimer);
 #endif
 
-int x2apic_mode;
-#ifdef CONFIG_X86_X2APIC
-/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-static int x2apic_disabled;
-static int __init setup_nox2apic(char *str)
-{
-	if (x2apic_enabled()) {
-		int apicid = native_apic_msr_read(APIC_ID);
-
-		if (apicid >= 255) {
-			pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
-				   apicid);
-			return 0;
-		}
-
-		pr_warning("x2apic already enabled. will disable it\n");
-	} else
-		setup_clear_cpu_cap(X86_FEATURE_X2APIC);
-
-	nox2apic = true;
-
-	return 0;
-}
-early_param("nox2apic", setup_nox2apic);
-#endif
-
 unsigned long mp_lapic_addr;
 int disable_apic;
 /* Disable local APIC timer from the kernel commandline or via dmi quirk */
@@ -1475,7 +1445,7 @@ void setup_local_APIC(void)
 #endif
 }
 
-void end_local_APIC_setup(void)
+static void end_local_APIC_setup(void)
 {
 	lapic_setup_esr();
 
@@ -1492,116 +1462,183 @@ void end_local_APIC_setup(void)
1492 apic_pm_activate(); 1462 apic_pm_activate();
1493} 1463}
1494 1464
1495void __init bsp_end_local_APIC_setup(void) 1465/*
1466 * APIC setup function for application processors. Called from smpboot.c
1467 */
1468void apic_ap_setup(void)
1496{ 1469{
1470 setup_local_APIC();
1497 end_local_APIC_setup(); 1471 end_local_APIC_setup();
1498
1499 /*
1500 * Now that local APIC setup is completed for BP, configure the fault
1501 * handling for interrupt remapping.
1502 */
1503 irq_remap_enable_fault_handling();
1504
1505} 1472}
1506 1473
1507#ifdef CONFIG_X86_X2APIC 1474#ifdef CONFIG_X86_X2APIC
1508/* 1475int x2apic_mode;
1509 * Need to disable xapic and x2apic at the same time and then enable xapic mode
1510 */
1511static inline void __disable_x2apic(u64 msr)
1512{
1513 wrmsrl(MSR_IA32_APICBASE,
1514 msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
1515 wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
1516}
1517 1476
1518static __init void disable_x2apic(void) 1477enum {
1478 X2APIC_OFF,
1479 X2APIC_ON,
1480 X2APIC_DISABLED,
1481};
1482static int x2apic_state;
1483
1484static inline void __x2apic_disable(void)
1519{ 1485{
1520 u64 msr; 1486 u64 msr;
1521 1487
1522 if (!cpu_has_x2apic) 1488 if (cpu_has_apic)
1523 return; 1489 return;
1524 1490
1525 rdmsrl(MSR_IA32_APICBASE, msr); 1491 rdmsrl(MSR_IA32_APICBASE, msr);
1526 if (msr & X2APIC_ENABLE) { 1492 if (!(msr & X2APIC_ENABLE))
1527 u32 x2apic_id = read_apic_id(); 1493 return;
1528 1494 /* Disable xapic and x2apic first and then reenable xapic mode */
1529 if (x2apic_id >= 255) 1495 wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
1530 panic("Cannot disable x2apic, id: %08x\n", x2apic_id); 1496 wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
1497 printk_once(KERN_INFO "x2apic disabled\n");
1498}
1531 1499
1532 pr_info("Disabling x2apic\n"); 1500static inline void __x2apic_enable(void)
1533 __disable_x2apic(msr); 1501{
1502 u64 msr;
1534 1503
1535 if (nox2apic) { 1504 rdmsrl(MSR_IA32_APICBASE, msr);
1536 clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); 1505 if (msr & X2APIC_ENABLE)
1537 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 1506 return;
1538 } 1507 wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
1508 printk_once(KERN_INFO "x2apic enabled\n");
1509}
1539 1510
1540 x2apic_disabled = 1; 1511static int __init setup_nox2apic(char *str)
1541 x2apic_mode = 0; 1512{
1513 if (x2apic_enabled()) {
1514 int apicid = native_apic_msr_read(APIC_ID);
1542 1515
1543 register_lapic_address(mp_lapic_addr); 1516 if (apicid >= 255) {
1517 pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
1518 apicid);
1519 return 0;
1520 }
1521 pr_warning("x2apic already enabled.\n");
1522 __x2apic_disable();
1544 } 1523 }
1524 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
1525 x2apic_state = X2APIC_DISABLED;
1526 x2apic_mode = 0;
1527 return 0;
1545} 1528}
1529early_param("nox2apic", setup_nox2apic);
1546 1530
1547void check_x2apic(void) 1531/* Called from cpu_init() to enable x2apic on (secondary) cpus */
1532void x2apic_setup(void)
1548{ 1533{
1549 if (x2apic_enabled()) { 1534 /*
1550 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); 1535 * If x2apic is not in ON state, disable it if already enabled
1551 x2apic_preenabled = x2apic_mode = 1; 1536 * from BIOS.
1537 */
1538 if (x2apic_state != X2APIC_ON) {
1539 __x2apic_disable();
1540 return;
1552 } 1541 }
1542 __x2apic_enable();
1553} 1543}
1554 1544
1555void enable_x2apic(void) 1545static __init void x2apic_disable(void)
1556{ 1546{
1557 u64 msr; 1547 u32 x2apic_id;
1558 1548
1559 rdmsrl(MSR_IA32_APICBASE, msr); 1549 if (x2apic_state != X2APIC_ON)
1560 if (x2apic_disabled) { 1550 goto out;
1561 __disable_x2apic(msr); 1551
1552 x2apic_id = read_apic_id();
1553 if (x2apic_id >= 255)
1554 panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
1555
1556 __x2apic_disable();
1557 register_lapic_address(mp_lapic_addr);
1558out:
1559 x2apic_state = X2APIC_DISABLED;
1560 x2apic_mode = 0;
1561}
1562
1563static __init void x2apic_enable(void)
1564{
1565 if (x2apic_state != X2APIC_OFF)
1562 return; 1566 return;
1563 }
1564 1567
1565 if (!x2apic_mode) 1568 x2apic_mode = 1;
1569 x2apic_state = X2APIC_ON;
1570 __x2apic_enable();
1571}
1572
1573static __init void try_to_enable_x2apic(int remap_mode)
1574{
1575 if (x2apic_state == X2APIC_DISABLED)
1566 return; 1576 return;
1567 1577
1568 if (!(msr & X2APIC_ENABLE)) { 1578 if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
1569 printk_once(KERN_INFO "Enabling x2apic\n"); 1579 /* IR is required if there is APIC ID > 255 even when running
1570 wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); 1580 * under KVM
1581 */
1582 if (max_physical_apicid > 255 ||
1583 !hypervisor_x2apic_available()) {
1584 pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
1585 x2apic_disable();
1586 return;
1587 }
1588
1589 /*
1590 * without IR all CPUs can be addressed by IOAPIC/MSI
1591 * only in physical mode
1592 */
1593 x2apic_phys = 1;
1571 } 1594 }
1595 x2apic_enable();
1572} 1596}
1573#endif /* CONFIG_X86_X2APIC */
1574 1597
1575int __init enable_IR(void) 1598void __init check_x2apic(void)
1576{ 1599{
1577#ifdef CONFIG_IRQ_REMAP 1600 if (x2apic_enabled()) {
1578 if (!irq_remapping_supported()) { 1601 pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
1579 pr_debug("intr-remapping not supported\n"); 1602 x2apic_mode = 1;
1580 return -1; 1603 x2apic_state = X2APIC_ON;
1604 } else if (!cpu_has_x2apic) {
1605 x2apic_state = X2APIC_DISABLED;
1581 } 1606 }
1607}
1608#else /* CONFIG_X86_X2APIC */
1609static int __init validate_x2apic(void)
1610{
1611 if (!apic_is_x2apic_enabled())
1612 return 0;
1613 /*
1614 * Checkme: Can we simply turn off x2apic here instead of panic?
1615 */
1616 panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
1617}
1618early_initcall(validate_x2apic);
1582 1619
1583 if (!x2apic_preenabled && skip_ioapic_setup) { 1620static inline void try_to_enable_x2apic(int remap_mode) { }
1584 pr_info("Skipped enabling intr-remap because of skipping " 1621static inline void __x2apic_enable(void) { }
1585 "io-apic setup\n"); 1622#endif /* !CONFIG_X86_X2APIC */
1623
1624static int __init try_to_enable_IR(void)
1625{
1626#ifdef CONFIG_X86_IO_APIC
1627 if (!x2apic_enabled() && skip_ioapic_setup) {
1628 pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
1586 return -1; 1629 return -1;
1587 } 1630 }
1588
1589 return irq_remapping_enable();
1590#endif 1631#endif
1591 return -1; 1632 return irq_remapping_enable();
1592} 1633}
1593 1634
1594void __init enable_IR_x2apic(void) 1635void __init enable_IR_x2apic(void)
1595{ 1636{
1596 unsigned long flags; 1637 unsigned long flags;
1597 int ret, x2apic_enabled = 0; 1638 int ret, ir_stat;
1598 int hardware_init_ret;
1599
1600 /* Make sure irq_remap_ops are initialized */
1601 setup_irq_remapping_ops();
1602 1639
1603 hardware_init_ret = irq_remapping_prepare(); 1640 ir_stat = irq_remapping_prepare();
1604 if (hardware_init_ret && !x2apic_supported()) 1641 if (ir_stat < 0 && !x2apic_supported())
1605 return; 1642 return;
1606 1643
1607 ret = save_ioapic_entries(); 1644 ret = save_ioapic_entries();
@@ -1614,49 +1651,13 @@ void __init enable_IR_x2apic(void)
1614 legacy_pic->mask_all(); 1651 legacy_pic->mask_all();
1615 mask_ioapic_entries(); 1652 mask_ioapic_entries();
1616 1653
1617 if (x2apic_preenabled && nox2apic) 1654 /* If irq_remapping_prepare() succeded, try to enable it */
1618 disable_x2apic(); 1655 if (ir_stat >= 0)
1619 1656 ir_stat = try_to_enable_IR();
1620 if (hardware_init_ret) 1657 /* ir_stat contains the remap mode or an error code */
1621 ret = -1; 1658 try_to_enable_x2apic(ir_stat);
1622 else
1623 ret = enable_IR();
1624
1625 if (!x2apic_supported())
1626 goto skip_x2apic;
1627
1628 if (ret < 0) {
1629 /* IR is required if there is APIC ID > 255 even when running
1630 * under KVM
1631 */
1632 if (max_physical_apicid > 255 ||
1633 !hypervisor_x2apic_available()) {
1634 if (x2apic_preenabled)
1635 disable_x2apic();
1636 goto skip_x2apic;
1637 }
1638 /*
1639 * without IR all CPUs can be addressed by IOAPIC/MSI
1640 * only in physical mode
1641 */
1642 x2apic_force_phys();
1643 }
1644 1659
1645 if (ret == IRQ_REMAP_XAPIC_MODE) { 1660 if (ir_stat < 0)
1646 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1647 goto skip_x2apic;
1648 }
1649
1650 x2apic_enabled = 1;
1651
1652 if (x2apic_supported() && !x2apic_mode) {
1653 x2apic_mode = 1;
1654 enable_x2apic();
1655 pr_info("Enabled x2apic\n");
1656 }
1657
1658skip_x2apic:
1659 if (ret < 0) /* IR enabling failed */
1660 restore_ioapic_entries(); 1661 restore_ioapic_entries();
1661 legacy_pic->restore_mask(); 1662 legacy_pic->restore_mask();
1662 local_irq_restore(flags); 1663 local_irq_restore(flags);
@@ -1847,82 +1848,8 @@ void __init register_lapic_address(unsigned long address)
1847 } 1848 }
1848} 1849}
1849 1850
1850/*
1851 * This initializes the IO-APIC and APIC hardware if this is
1852 * a UP kernel.
1853 */
1854int apic_version[MAX_LOCAL_APIC]; 1851int apic_version[MAX_LOCAL_APIC];
1855 1852
1856int __init APIC_init_uniprocessor(void)
1857{
1858 if (disable_apic) {
1859 pr_info("Apic disabled\n");
1860 return -1;
1861 }
1862#ifdef CONFIG_X86_64
1863 if (!cpu_has_apic) {
1864 disable_apic = 1;
1865 pr_info("Apic disabled by BIOS\n");
1866 return -1;
1867 }
1868#else
1869 if (!smp_found_config && !cpu_has_apic)
1870 return -1;
1871
1872 /*
1873 * Complain if the BIOS pretends there is one.
1874 */
1875 if (!cpu_has_apic &&
1876 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1877 pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
1878 boot_cpu_physical_apicid);
1879 return -1;
1880 }
1881#endif
1882
1883 default_setup_apic_routing();
1884
1885 verify_local_APIC();
1886 connect_bsp_APIC();
1887
1888#ifdef CONFIG_X86_64
1889 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
1890#else
1891 /*
1892 * Hack: In case of kdump, after a crash, kernel might be booting
1893 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1894 * might be zero if read from MP tables. Get it from LAPIC.
1895 */
1896# ifdef CONFIG_CRASH_DUMP
1897 boot_cpu_physical_apicid = read_apic_id();
1898# endif
1899#endif
1900 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1901 setup_local_APIC();
1902
1903#ifdef CONFIG_X86_IO_APIC
1904 /*
1905 * Now enable IO-APICs, actually call clear_IO_APIC
1906 * We need clear_IO_APIC before enabling error vector
1907 */
1908 if (!skip_ioapic_setup && nr_ioapics)
1909 enable_IO_APIC();
1910#endif
1911
1912 bsp_end_local_APIC_setup();
1913
1914#ifdef CONFIG_X86_IO_APIC
1915 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1916 setup_IO_APIC();
1917 else {
1918 nr_ioapics = 0;
1919 }
1920#endif
1921
1922 x86_init.timers.setup_percpu_clockev();
1923 return 0;
1924}
1925
1926/* 1853/*
1927 * Local APIC interrupts 1854 * Local APIC interrupts
1928 */ 1855 */
@@ -2027,7 +1954,7 @@ __visible void smp_trace_error_interrupt(struct pt_regs *regs)
 /**
  * connect_bsp_APIC - attach the APIC to the interrupt system
  */
-void __init connect_bsp_APIC(void)
+static void __init connect_bsp_APIC(void)
 {
 #ifdef CONFIG_X86_32
 	if (pic_mode) {
@@ -2274,6 +2201,100 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
2274 } 2201 }
2275} 2202}
2276 2203
2204static void __init apic_bsp_up_setup(void)
2205{
2206#ifdef CONFIG_X86_64
2207 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
2208#else
2209 /*
2210 * Hack: In case of kdump, after a crash, kernel might be booting
2211 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
2212 * might be zero if read from MP tables. Get it from LAPIC.
2213 */
2214# ifdef CONFIG_CRASH_DUMP
2215 boot_cpu_physical_apicid = read_apic_id();
2216# endif
2217#endif
2218 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
2219}
2220
2221/**
2222 * apic_bsp_setup - Setup function for local apic and io-apic
2223 * @upmode: Force UP mode (for APIC_init_uniprocessor)
2224 *
2225 * Returns:
2226 * apic_id of BSP APIC
2227 */
2228int __init apic_bsp_setup(bool upmode)
2229{
2230 int id;
2231
2232 connect_bsp_APIC();
2233 if (upmode)
2234 apic_bsp_up_setup();
2235 setup_local_APIC();
2236
2237 if (x2apic_mode)
2238 id = apic_read(APIC_LDR);
2239 else
2240 id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
2241
2242 enable_IO_APIC();
2243 end_local_APIC_setup();
2244 irq_remap_enable_fault_handling();
2245 setup_IO_APIC();
2246 /* Setup local timer */
2247 x86_init.timers.setup_percpu_clockev();
2248 return id;
2249}
2250
2251/*
2252 * This initializes the IO-APIC and APIC hardware if this is
2253 * a UP kernel.
2254 */
2255int __init APIC_init_uniprocessor(void)
2256{
2257 if (disable_apic) {
2258 pr_info("Apic disabled\n");
2259 return -1;
2260 }
2261#ifdef CONFIG_X86_64
2262 if (!cpu_has_apic) {
2263 disable_apic = 1;
2264 pr_info("Apic disabled by BIOS\n");
2265 return -1;
2266 }
2267#else
2268 if (!smp_found_config && !cpu_has_apic)
2269 return -1;
2270
2271 /*
2272 * Complain if the BIOS pretends there is one.
2273 */
2274 if (!cpu_has_apic &&
2275 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
2276 pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
2277 boot_cpu_physical_apicid);
2278 return -1;
2279 }
2280#endif
2281
2282 if (!smp_found_config)
2283 disable_ioapic_support();
2284
2285 default_setup_apic_routing();
2286 verify_local_APIC();
2287 apic_bsp_setup(true);
2288 return 0;
2289}
2290
2291#ifdef CONFIG_UP_LATE_INIT
2292void __init up_late_init(void)
2293{
2294 APIC_init_uniprocessor();
2295}
2296#endif
2297
2277/* 2298/*
2278 * Power management 2299 * Power management
2279 */ 2300 */
@@ -2359,9 +2380,9 @@ static void lapic_resume(void)
 	mask_ioapic_entries();
 	legacy_pic->mask_all();
 
-	if (x2apic_mode)
-		enable_x2apic();
-	else {
+	if (x2apic_mode) {
+		__x2apic_enable();
+	} else {
 		/*
 		 * Make sure the APICBASE points to the right address
 		 *
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3f5f60406ab1..f4dc2462a1ac 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1507,7 +1507,10 @@ void __init enable_IO_APIC(void)
 	int i8259_apic, i8259_pin;
 	int apic, pin;
 
-	if (!nr_legacy_irqs())
+	if (skip_ioapic_setup)
+		nr_ioapics = 0;
+
+	if (!nr_legacy_irqs() || !nr_ioapics)
 		return;
 
 	for_each_ioapic_pin(apic, pin) {
@@ -2295,7 +2298,7 @@ static inline void __init check_timer(void)
 	}
 	local_irq_disable();
 	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
-	if (x2apic_preenabled)
+	if (apic_is_x2apic_enabled())
 		apic_printk(APIC_QUIET, KERN_INFO
 			"Perhaps problem with the pre-enabled x2apic mode\n"
 			"Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
@@ -2373,9 +2376,9 @@ void __init setup_IO_APIC(void)
 {
 	int ioapic;
 
-	/*
-	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
-	 */
+	if (skip_ioapic_setup || !nr_ioapics)
+		return;
+
 	io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15c5df92f74e..a220239cea65 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -869,3 +869,22 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
 
 	return false;
 }
+
+void set_dr_addr_mask(unsigned long mask, int dr)
+{
+	if (!cpu_has_bpext)
+		return;
+
+	switch (dr) {
+	case 0:
+		wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0);
+		break;
+	case 1:
+	case 2:
+	case 3:
+		wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0);
+		break;
+	default:
+		break;
+	}
+}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c6049650c093..2346c95c6ab1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
+#include <asm/tlbflush.h>
 #include <asm/debugreg.h>
 #include <asm/sections.h>
 #include <asm/vsyscall.h>
@@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep);
 static __always_inline void setup_smep(struct cpuinfo_x86 *c)
 {
 	if (cpu_has(c, X86_FEATURE_SMEP))
-		set_in_cr4(X86_CR4_SMEP);
+		cr4_set_bits(X86_CR4_SMEP);
 }
 
 static __init int setup_disable_smap(char *arg)
@@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 
 	if (cpu_has(c, X86_FEATURE_SMAP)) {
 #ifdef CONFIG_X86_SMAP
-		set_in_cr4(X86_CR4_SMAP);
+		cr4_set_bits(X86_CR4_SMAP);
 #else
-		clear_in_cr4(X86_CR4_SMAP);
+		cr4_clear_bits(X86_CR4_SMAP);
 #endif
 	}
 }
@@ -491,17 +492,18 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
 u16 __read_mostly tlb_lld_4m[NR_INFO];
 u16 __read_mostly tlb_lld_1g[NR_INFO];
 
-void cpu_detect_tlb(struct cpuinfo_x86 *c)
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
 {
 	if (this_cpu->c_detect_tlb)
 		this_cpu->c_detect_tlb(c);
 
-	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
-		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
-		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
-		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
-		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
-		tlb_lld_1g[ENTRIES]);
+	pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
+		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
+		tlb_lli_4m[ENTRIES]);
+
+	pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+		tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
+		tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
 }
 
 void detect_ht(struct cpuinfo_x86 *c)
@@ -1294,6 +1296,12 @@ void cpu_init(void)
 	wait_for_master_cpu(cpu);
 
 	/*
+	 * Initialize the CR4 shadow before doing anything that could
+	 * try to read it.
+	 */
+	cr4_init_shadow();
+
+	/*
 	 * Load microcode on this cpu if a valid microcode is available.
 	 * This is early microcode loading procedure.
 	 */
@@ -1312,7 +1320,7 @@ void cpu_init(void)
 
 	pr_debug("Initializing CPU#%d\n", cpu);
 
-	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+	cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
@@ -1332,7 +1340,7 @@ void cpu_init(void)
 	barrier();
 
 	x86_configure_nx();
-	enable_x2apic();
+	x2apic_setup();
 
 	/*
 	 * set up and load the per-CPU TSS
@@ -1388,12 +1396,18 @@ void cpu_init(void)
 
 	wait_for_master_cpu(cpu);
 
+	/*
+	 * Initialize the CR4 shadow before doing anything that could
+	 * try to read it.
+	 */
+	cr4_init_shadow();
+
 	show_ucode_info_early();
 
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
 
 	if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
-		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+		cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
 	load_current_idt();
 	switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9cc6b6f25f42..50163fa9034f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -487,10 +487,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 
 	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
 	if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
-		printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
-			" Set to 'normal', was 'performance'\n"
-			"ENERGY_PERF_BIAS: View and update with"
-			" x86_energy_perf_policy(8)\n");
+		pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+		pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
 		epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
 		wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
 	}
@@ -567,8 +565,8 @@ static const struct _tlb_table intel_tlb_table[] = {
 	{ 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
 	{ 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
 	{ 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
-	{ 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" },
-	{ 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" },
+	{ 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
 	{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
 	{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
 	{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c7035073dfc1..659643376dbf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
 static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
 					int type, char *buf)
 {
-	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
-	int n = 0;
+	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+	int ret;
 
-	if (len > 1) {
-		const struct cpumask *mask;
-
-		mask = to_cpumask(this_leaf->shared_cpu_map);
-		n = type ?
-			cpulist_scnprintf(buf, len-2, mask) :
-			cpumask_scnprintf(buf, len-2, mask);
-		buf[n++] = '\n';
-		buf[n] = '\0';
-	}
-	return n;
+	if (type)
+		ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
+				cpumask_pr_args(mask));
+	else
+		ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
+				cpumask_pr_args(mask));
+	buf[ret++] = '\n';
+	buf[ret] = '\0';
+	return ret;
 }
 
 static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..3c036cb4a370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,8 @@
 #include <linux/export.h>
 
 #include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
@@ -115,7 +117,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
  */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
@@ -150,14 +152,11 @@ static struct mce_log mcelog = {
 void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
-	int ret = 0;
 
 	/* Emit the trace record: */
 	trace_mce_record(mce);
 
-	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
-	if (ret == NOTIFY_STOP)
-		return;
+	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 
 	mce->finished = 0;
 	wmb();
@@ -311,7 +310,7 @@ static void wait_for_panic(void)
 		panic("Panicing machine check CPU died");
 }
 
-static void mce_panic(char *msg, struct mce *final, char *exp)
+static void mce_panic(const char *msg, struct mce *final, char *exp)
 {
 	int i, apei_err = 0;
 
@@ -529,7 +528,7 @@ static void mce_schedule_work(void)
 		schedule_work(this_cpu_ptr(&mce_work));
 }
 
-DEFINE_PER_CPU(struct irq_work, mce_irq_work);
+static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 
 static void mce_irq_work_cb(struct irq_work *entry)
 {
@@ -735,7 +734,7 @@ static atomic_t mce_callin;
 /*
  * Check if a timeout waiting for other CPUs happened.
  */
-static int mce_timed_out(u64 *t)
+static int mce_timed_out(u64 *t, const char *msg)
 {
 	/*
 	 * The others already did panic for some reason.
@@ -750,8 +749,7 @@ static int mce_timed_out(u64 *t)
 		goto out;
 	if ((s64)*t < SPINUNIT) {
 		if (mca_cfg.tolerant <= 1)
-			mce_panic("Timeout synchronizing machine check over CPUs",
-				  NULL, NULL);
+			mce_panic(msg, NULL, NULL);
 		cpu_missing = 1;
 		return 1;
 	}
@@ -867,7 +865,8 @@ static int mce_start(int *no_way_out)
 	 * Wait for everyone.
 	 */
 	while (atomic_read(&mce_callin) != cpus) {
-		if (mce_timed_out(&timeout)) {
+		if (mce_timed_out(&timeout,
+				  "Timeout: Not all CPUs entered broadcast exception handler")) {
 			atomic_set(&global_nwo, 0);
 			return -1;
 		}
@@ -892,7 +891,8 @@ static int mce_start(int *no_way_out)
 	 * only seen by one CPU before cleared, avoiding duplicates.
 	 */
 	while (atomic_read(&mce_executing) < order) {
-		if (mce_timed_out(&timeout)) {
+		if (mce_timed_out(&timeout,
+				  "Timeout: Subject CPUs unable to finish machine check processing")) {
 			atomic_set(&global_nwo, 0);
 			return -1;
 		}
@@ -936,7 +936,8 @@ static int mce_end(int order)
 	 * loops.
 	 */
 	while (atomic_read(&mce_executing) <= cpus) {
-		if (mce_timed_out(&timeout))
+		if (mce_timed_out(&timeout,
+				  "Timeout: Monarch CPU unable to finish machine check processing"))
 			goto reset;
 		ndelay(SPINUNIT);
 	}
@@ -949,7 +950,8 @@ static int mce_end(int order)
 	 * Subject: Wait for Monarch to finish.
 	 */
 	while (atomic_read(&mce_executing) != 0) {
-		if (mce_timed_out(&timeout))
+		if (mce_timed_out(&timeout,
+				  "Timeout: Monarch CPU did not finish machine check processing"))
 			goto reset;
 		ndelay(SPINUNIT);
 	}
@@ -1003,51 +1005,6 @@ static void mce_clear_state(unsigned long *toclear)
1003} 1005}
1004 1006
1005/* 1007/*
1006 * Need to save faulting physical address associated with a process
1007 * in the machine check handler some place where we can grab it back
1008 * later in mce_notify_process()
1009 */
1010#define MCE_INFO_MAX 16
1011
1012struct mce_info {
1013 atomic_t inuse;
1014 struct task_struct *t;
1015 __u64 paddr;
1016 int restartable;
1017} mce_info[MCE_INFO_MAX];
1018
1019static void mce_save_info(__u64 addr, int c)
1020{
1021 struct mce_info *mi;
1022
1023 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
1024 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
1025 mi->t = current;
1026 mi->paddr = addr;
1027 mi->restartable = c;
1028 return;
1029 }
1030 }
1031
1032 mce_panic("Too many concurrent recoverable errors", NULL, NULL);
1033}
1034
1035static struct mce_info *mce_find_info(void)
1036{
1037 struct mce_info *mi;
1038
1039 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
1040 if (atomic_read(&mi->inuse) && mi->t == current)
1041 return mi;
1042 return NULL;
1043}
1044
1045static void mce_clear_info(struct mce_info *mi)
1046{
1047 atomic_set(&mi->inuse, 0);
1048}
1049
1050/*
1051 * The actual machine check handler. This only handles real 1008 * The actual machine check handler. This only handles real
1052 * exceptions when something got corrupted coming in through int 18. 1009 * exceptions when something got corrupted coming in through int 18.
1053 * 1010 *
@@ -1063,6 +1020,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
+	enum ctx_state prev_state;
 	int i;
 	int worst = 0;
 	int severity;
@@ -1084,6 +1042,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
+	u64 recover_paddr = ~0ull;
+	int flags = MF_ACTION_REQUIRED;
+
+	prev_state = ist_enter(regs);
 
 	this_cpu_inc(mce_exception_count);
 
@@ -1203,9 +1165,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (no_way_out)
 		mce_panic("Fatal machine check on current CPU", &m, msg);
 	if (worst == MCE_AR_SEVERITY) {
-		/* schedule action before return to userland */
-		mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
-		set_thread_flag(TIF_MCE_NOTIFY);
+		recover_paddr = m.addr;
+		if (!(m.mcgstatus & MCG_STATUS_RIPV))
+			flags |= MF_MUST_KILL;
 	} else if (kill_it) {
 		force_sig(SIGBUS, current);
 	}
@@ -1216,6 +1178,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
 	sync_core();
+
+	if (recover_paddr == ~0ull)
+		goto done;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		recover_paddr);
+	/*
+	 * We must call memory_failure() here even if the current process is
+	 * doomed. We still need to mark the page as poisoned and alert any
+	 * other users of the page.
+	 */
+	ist_begin_non_atomic(regs);
+	local_irq_enable();
+	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
+	local_irq_disable();
+	ist_end_non_atomic();
+done:
+	ist_exit(regs, prev_state);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
@@ -1233,42 +1216,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
1233#endif 1216#endif
1234 1217
1235/* 1218/*
1236 * Called in process context that interrupted by MCE and marked with
1237 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
1238 * This code is allowed to sleep.
1239 * Attempt possible recovery such as calling the high level VM handler to
1240 * process any corrupted pages, and kill/signal current process if required.
1241 * Action required errors are handled here.
1242 */
1243void mce_notify_process(void)
1244{
1245 unsigned long pfn;
1246 struct mce_info *mi = mce_find_info();
1247 int flags = MF_ACTION_REQUIRED;
1248
1249 if (!mi)
1250 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1251 pfn = mi->paddr >> PAGE_SHIFT;
1252
1253 clear_thread_flag(TIF_MCE_NOTIFY);
1254
1255 pr_err("Uncorrected hardware memory error in user-access at %llx",
1256 mi->paddr);
1257 /*
1258 * We must call memory_failure() here even if the current process is
1259 * doomed. We still need to mark the page as poisoned and alert any
1260 * other users of the page.
1261 */
1262 if (!mi->restartable)
1263 flags |= MF_MUST_KILL;
1264 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1265 pr_err("Memory error not recovered");
1266 force_sig(SIGBUS, current);
1267 }
1268 mce_clear_info(mi);
1269}
1270
1271/*
1272 * Action optional processing happens here (picking up 1219 * Action optional processing happens here (picking up
1273 * from the list of faulting pages that do_machine_check() 1220 * from the list of faulting pages that do_machine_check()
1274 * placed into the "ring"). 1221 * placed into the "ring").
@@ -1503,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void)
 	bitmap_fill(all_banks, MAX_NR_BANKS);
 	machine_check_poll(MCP_UC | m_fl, &all_banks);
 
-	set_in_cr4(X86_CR4_MCE);
+	cr4_set_bits(X86_CR4_MCE);
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..737b0ad4e61a 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,8 @@
 #include <linux/smp.h>
 
 #include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
@@ -17,8 +19,11 @@ int mce_p5_enabled __read_mostly;
 /* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
+	enum ctx_state prev_state;
 	u32 loaddr, hi, lotype;
 
+	prev_state = ist_enter(regs);
+
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
 
@@ -33,6 +38,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
 	}
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+	ist_exit(regs, prev_state);
 }
 
 /* Set up machine check reporting for processors with Intel style MCE: */
@@ -59,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
59 "Intel old style machine check architecture supported.\n"); 66 "Intel old style machine check architecture supported.\n");
60 67
61 /* Enable MCE: */ 68 /* Enable MCE: */
62 set_in_cr4(X86_CR4_MCE); 69 cr4_set_bits(X86_CR4_MCE);
63 printk(KERN_INFO 70 printk(KERN_INFO
64 "Intel old style machine check reporting enabled on CPU#%d.\n", 71 "Intel old style machine check reporting enabled on CPU#%d.\n",
65 smp_processor_id()); 72 smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..44f138296fbe 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,14 +7,20 @@
 #include <linux/types.h>
 
 #include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
 /* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
+	enum ctx_state prev_state = ist_enter(regs);
+
 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+	ist_exit(regs, prev_state);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
@@ -31,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
 	lo &= ~(1<<4);	/* Enable MCE */
 	wrmsr(MSR_IDT_FCR1, lo, hi);
 
-	set_in_cr4(X86_CR4_MCE);
+	cr4_set_bits(X86_CR4_MCE);
 
 	printk(KERN_INFO
 	       "Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c29096136b..36a83617eb21 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -552,7 +552,7 @@ static int __init microcode_init(void)
 	int error;
 
 	if (paravirt_enabled() || dis_ucode_ldr)
-		return 0;
+		return -EINVAL;
 
 	if (c->x86_vendor == X86_VENDOR_INTEL)
 		microcode_ops = init_intel_microcode();
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index c6826d1e8082..746e7fd08aad 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -196,6 +196,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 		struct microcode_header_intel mc_header;
 		unsigned int mc_size;
 
+		if (leftover < sizeof(mc_header)) {
+			pr_err("error! Truncated header in microcode data file\n");
+			break;
+		}
+
 		if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
 			break;
 
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index ec9df6f9cd47..420eb933189c 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -321,7 +321,11 @@ get_matching_model_microcode(int cpu, unsigned long start,
 	unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
 	int i;
 
-	while (leftover) {
+	while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
+
+		if (leftover < sizeof(mc_header))
+			break;
+
 		mc_header = (struct microcode_header_intel *)ucode_ptr;
 
 		mc_size = get_totalsize(mc_header);
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0876b5..f8c81ba0b465 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -138,8 +138,8 @@ static void prepare_set(void)
 
 	/* Save value of CR4 and clear Page Global Enable (bit 7) */
 	if (cpu_has_pge) {
-		cr4 = read_cr4();
-		write_cr4(cr4 & ~X86_CR4_PGE);
+		cr4 = __read_cr4();
+		__write_cr4(cr4 & ~X86_CR4_PGE);
 	}
 
 	/*
@@ -171,7 +171,7 @@ static void post_set(void)
 
 	/* Restore value of CR4 */
 	if (cpu_has_pge)
-		write_cr4(cr4);
+		__write_cr4(cr4);
 }
 
 static void cyrix_set_arr(unsigned int reg, unsigned long base,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0e25a1bc5ab5..7d74f7b3c6ba 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 
 	/* Save value of CR4 and clear Page Global Enable (bit 7) */
 	if (cpu_has_pge) {
-		cr4 = read_cr4();
-		write_cr4(cr4 & ~X86_CR4_PGE);
+		cr4 = __read_cr4();
+		__write_cr4(cr4 & ~X86_CR4_PGE);
 	}
 
 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
@@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock)
 
 	/* Restore value of CR4 */
 	if (cpu_has_pge)
-		write_cr4(cr4);
+		__write_cr4(cr4);
 	raw_spin_unlock(&set_atomicity_lock);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5dc855..b71a7f86d68a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,8 @@
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
 #include <asm/timer.h>
 #include <asm/desc.h>
 #include <asm/ldt.h>
@@ -43,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 	.enabled = 1,
 };
 
+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
 u64 __read_mostly hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1327,8 +1331,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_STARTING:
-		if (x86_pmu.attr_rdpmc)
-			set_in_cr4(X86_CR4_PCE);
 		if (x86_pmu.cpu_starting)
 			x86_pmu.cpu_starting(cpu);
 		break;
@@ -1804,14 +1806,44 @@ static int x86_pmu_event_init(struct perf_event *event)
 		event->destroy(event);
 	}
 
+	if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
 	return err;
 }
 
+static void refresh_pce(void *ignored)
+{
+	if (current->mm)
+		load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+		return;
+
+	if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
+		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+	if (!current->mm)
+		return;
+
+	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+		return;
+
+	if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
+		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
 static int x86_pmu_event_idx(struct perf_event *event)
 {
 	int idx = event->hw.idx;
 
-	if (!x86_pmu.attr_rdpmc)
+	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return 0;
 
 	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
@@ -1829,16 +1861,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev,
1829 return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); 1861 return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
1830} 1862}
1831 1863
1832static void change_rdpmc(void *info)
1833{
1834 bool enable = !!(unsigned long)info;
1835
1836 if (enable)
1837 set_in_cr4(X86_CR4_PCE);
1838 else
1839 clear_in_cr4(X86_CR4_PCE);
1840}
1841
1842static ssize_t set_attr_rdpmc(struct device *cdev, 1864static ssize_t set_attr_rdpmc(struct device *cdev,
1843 struct device_attribute *attr, 1865 struct device_attribute *attr,
1844 const char *buf, size_t count) 1866 const char *buf, size_t count)
@@ -1850,14 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
1850 if (ret) 1872 if (ret)
1851 return ret; 1873 return ret;
1852 1874
1875 if (val > 2)
1876 return -EINVAL;
1877
1853 if (x86_pmu.attr_rdpmc_broken) 1878 if (x86_pmu.attr_rdpmc_broken)
1854 return -ENOTSUPP; 1879 return -ENOTSUPP;
1855 1880
1856 if (!!val != !!x86_pmu.attr_rdpmc) { 1881 if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
1857 x86_pmu.attr_rdpmc = !!val; 1882 /*
1858 on_each_cpu(change_rdpmc, (void *)val, 1); 1883 * Changing into or out of always available, aka
1884 * perf-event-bypassing mode. This path is extremely slow,
1885 * but only root can trigger it, so it's okay.
1886 */
1887 if (val == 2)
1888 static_key_slow_inc(&rdpmc_always_available);
1889 else
1890 static_key_slow_dec(&rdpmc_always_available);
1891 on_each_cpu(refresh_pce, NULL, 1);
1859 } 1892 }
1860 1893
1894 x86_pmu.attr_rdpmc = val;
1895
1861 return count; 1896 return count;
1862} 1897}
1863 1898
@@ -1900,6 +1935,9 @@ static struct pmu pmu = {
1900 1935
1901 .event_init = x86_pmu_event_init, 1936 .event_init = x86_pmu_event_init,
1902 1937
1938 .event_mapped = x86_pmu_event_mapped,
1939 .event_unmapped = x86_pmu_event_unmapped,
1940
1903 .add = x86_pmu_add, 1941 .add = x86_pmu_add,
1904 .del = x86_pmu_del, 1942 .del = x86_pmu_del,
1905 .start = x86_pmu_start, 1943 .start = x86_pmu_start,
@@ -1914,13 +1952,15 @@ static struct pmu pmu = {
1914 .flush_branch_stack = x86_pmu_flush_branch_stack, 1952 .flush_branch_stack = x86_pmu_flush_branch_stack,
1915}; 1953};
1916 1954
1917void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 1955void arch_perf_update_userpage(struct perf_event *event,
1956 struct perf_event_mmap_page *userpg, u64 now)
1918{ 1957{
1919 struct cyc2ns_data *data; 1958 struct cyc2ns_data *data;
1920 1959
1921 userpg->cap_user_time = 0; 1960 userpg->cap_user_time = 0;
1922 userpg->cap_user_time_zero = 0; 1961 userpg->cap_user_time_zero = 0;
1923 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; 1962 userpg->cap_user_rdpmc =
1963 !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
1924 userpg->pmc_width = x86_pmu.cntval_bits; 1964 userpg->pmc_width = x86_pmu.cntval_bits;
1925 1965
1926 if (!sched_clock_stable()) 1966 if (!sched_clock_stable())
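
The hunks above move user-space RDPMC permission from a global CR4.PCE toggle to a per-mm decision: an event gets PERF_X86_EVENT_RDPMC_ALLOWED at init time, and CR4.PCE is enabled on the CPUs running that mm only while the event is actually mmap'ed. A minimal user-space sketch of the self-monitoring pattern this supports is below; it relies only on the documented perf_event_open()/mmap interface, and the hardware event chosen is just an example.

/* Sketch: read a counter directly with RDPMC after mmap'ing the event. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct perf_event_attr attr;
	struct perf_event_mmap_page *pc;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	/* Mapping the event is what marks it RDPMC_ALLOWED per the hunk above. */
	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
	if (pc == MAP_FAILED)
		return 1;

	if (pc->cap_user_rdpmc && pc->index)
		printf("raw counter value: %llu\n",
		       (unsigned long long)rdpmc(pc->index - 1));
	return 0;
}

A complete reader would also loop on the userpage seqlock (pc->lock) and apply pc->offset; that is left out here to keep the sketch short.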
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0ddc70..df525d2be1e8 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,8 @@ struct event_constraint {
71#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ 71#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
72#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ 72#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
73#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ 73#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
74#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */
75
74 76
75struct amd_nb { 77struct amd_nb {
76 int nb_id; /* NorthBridge id */ 78 int nb_id; /* NorthBridge id */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 944bf019b74f..498b6d967138 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2431,6 +2431,7 @@ __init int intel_pmu_init(void)
2431 break; 2431 break;
2432 2432
2433 case 55: /* 22nm Atom "Silvermont" */ 2433 case 55: /* 22nm Atom "Silvermont" */
2434 case 76: /* 14nm Atom "Airmont" */
2434 case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ 2435 case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
2435 memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, 2436 memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
2436 sizeof(hw_cache_event_ids)); 2437 sizeof(hw_cache_event_ids));
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 6e434f8e5fc8..c4bb8b8e5017 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -142,7 +142,7 @@ static inline u64 rapl_scale(u64 v)
142 * or use ldexp(count, -32). 142 * or use ldexp(count, -32).
143 * Watts = Joules/Time delta 143 * Watts = Joules/Time delta
144 */ 144 */
145 return v << (32 - __this_cpu_read(rapl_pmu->hw_unit)); 145 return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
146} 146}
147 147
148static u64 rapl_event_update(struct perf_event *event) 148static u64 rapl_event_update(struct perf_event *event)
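
The one-line fix above is about where the per-CPU access happens: rapl_pmu is a per-CPU pointer, so the accessor must be applied to the pointer variable itself and the field dereferenced afterwards; applying it to the already-dereferenced expression reads through the raw, unadjusted per-CPU address. A kernel-flavoured sketch of the pattern, with a hypothetical structure name, just to illustrate:

/* Illustration only; 'example_pmu' is a made-up per-CPU pointer. */
struct example_pmu {
	int hw_unit;
};

static DEFINE_PER_CPU(struct example_pmu *, example_pmu);

static int example_read_hw_unit(void)
{
	/*
	 * Wrong:  __this_cpu_read(example_pmu->hw_unit);
	 *         dereferences the pointer before the per-CPU offset is applied.
	 * Right:  fetch this CPU's pointer, then dereference it normally.
	 */
	return __this_cpu_read(example_pmu)->hw_unit;
}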
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 10b8d3eaaf15..c635b8b49e93 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -840,7 +840,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
840 box->phys_id = phys_id; 840 box->phys_id = phys_id;
841 box->pci_dev = pdev; 841 box->pci_dev = pdev;
842 box->pmu = pmu; 842 box->pmu = pmu;
843 uncore_box_init(box);
844 pci_set_drvdata(pdev, box); 843 pci_set_drvdata(pdev, box);
845 844
846 raw_spin_lock(&uncore_box_lock); 845 raw_spin_lock(&uncore_box_lock);
@@ -1004,10 +1003,8 @@ static int uncore_cpu_starting(int cpu)
1004 pmu = &type->pmus[j]; 1003 pmu = &type->pmus[j];
1005 box = *per_cpu_ptr(pmu->box, cpu); 1004 box = *per_cpu_ptr(pmu->box, cpu);
1006 /* called by uncore_cpu_init? */ 1005 /* called by uncore_cpu_init? */
1007 if (box && box->phys_id >= 0) { 1006 if (box && box->phys_id >= 0)
1008 uncore_box_init(box);
1009 continue; 1007 continue;
1010 }
1011 1008
1012 for_each_online_cpu(k) { 1009 for_each_online_cpu(k) {
1013 exist = *per_cpu_ptr(pmu->box, k); 1010 exist = *per_cpu_ptr(pmu->box, k);
@@ -1023,10 +1020,8 @@ static int uncore_cpu_starting(int cpu)
1023 } 1020 }
1024 } 1021 }
1025 1022
1026 if (box) { 1023 if (box)
1027 box->phys_id = phys_id; 1024 box->phys_id = phys_id;
1028 uncore_box_init(box);
1029 }
1030 } 1025 }
1031 } 1026 }
1032 return 0; 1027 return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 863d9b02563e..6c8c1e7e69d8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -257,6 +257,14 @@ static inline int uncore_num_counters(struct intel_uncore_box *box)
257 return box->pmu->type->num_counters; 257 return box->pmu->type->num_counters;
258} 258}
259 259
260static inline void uncore_box_init(struct intel_uncore_box *box)
261{
262 if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
263 if (box->pmu->type->ops->init_box)
264 box->pmu->type->ops->init_box(box);
265 }
266}
267
260static inline void uncore_disable_box(struct intel_uncore_box *box) 268static inline void uncore_disable_box(struct intel_uncore_box *box)
261{ 269{
262 if (box->pmu->type->ops->disable_box) 270 if (box->pmu->type->ops->disable_box)
@@ -265,6 +273,8 @@ static inline void uncore_disable_box(struct intel_uncore_box *box)
265 273
266static inline void uncore_enable_box(struct intel_uncore_box *box) 274static inline void uncore_enable_box(struct intel_uncore_box *box)
267{ 275{
276 uncore_box_init(box);
277
268 if (box->pmu->type->ops->enable_box) 278 if (box->pmu->type->ops->enable_box)
269 box->pmu->type->ops->enable_box(box); 279 box->pmu->type->ops->enable_box(box);
270} 280}
@@ -287,14 +297,6 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box,
287 return box->pmu->type->ops->read_counter(box, event); 297 return box->pmu->type->ops->read_counter(box, event);
288} 298}
289 299
290static inline void uncore_box_init(struct intel_uncore_box *box)
291{
292 if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
293 if (box->pmu->type->ops->init_box)
294 box->pmu->type->ops->init_box(box);
295 }
296}
297
298static inline bool uncore_box_is_fake(struct intel_uncore_box *box) 300static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
299{ 301{
300 return (box->phys_id < 0); 302 return (box->phys_id < 0);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index b74ebc7c4402..cf3df1d8d039 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err)
265 printk("SMP "); 265 printk("SMP ");
266#endif 266#endif
267#ifdef CONFIG_DEBUG_PAGEALLOC 267#ifdef CONFIG_DEBUG_PAGEALLOC
268 printk("DEBUG_PAGEALLOC"); 268 printk("DEBUG_PAGEALLOC ");
269#endif
270#ifdef CONFIG_KASAN
271 printk("KASAN");
269#endif 272#endif
270 printk("\n"); 273 printk("\n");
271 if (notify_die(DIE_OOPS, str, regs, err, 274 if (notify_die(DIE_OOPS, str, regs, err,
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index dd2f07ae9d0c..46201deee923 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -184,9 +184,9 @@ void __init e820_print_map(char *who)
184 * overwritten in the same location, starting at biosmap. 184 * overwritten in the same location, starting at biosmap.
185 * 185 *
186 * The integer pointed to by pnr_map must be valid on entry (the 186 * The integer pointed to by pnr_map must be valid on entry (the
187 * current number of valid entries located at biosmap) and will 187 * current number of valid entries located at biosmap). If the
 188 * be updated on return, with the new number of valid entries 188 * sanitizing succeeds, *pnr_map will be updated with the new
189 * (something no more than max_nr_map.) 189 * number of valid entries (something no more than max_nr_map).
190 * 190 *
191 * The return value from sanitize_e820_map() is zero if it 191 * The return value from sanitize_e820_map() is zero if it
192 * successfully 'sanitized' the map entries passed in, and is -1 192 * successfully 'sanitized' the map entries passed in, and is -1
@@ -561,23 +561,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
561 561
562void __init update_e820(void) 562void __init update_e820(void)
563{ 563{
564 u32 nr_map; 564 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map))
565
566 nr_map = e820.nr_map;
567 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
568 return; 565 return;
569 e820.nr_map = nr_map;
570 printk(KERN_INFO "e820: modified physical RAM map:\n"); 566 printk(KERN_INFO "e820: modified physical RAM map:\n");
571 e820_print_map("modified"); 567 e820_print_map("modified");
572} 568}
573static void __init update_e820_saved(void) 569static void __init update_e820_saved(void)
574{ 570{
575 u32 nr_map; 571 sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
576 572 &e820_saved.nr_map);
577 nr_map = e820_saved.nr_map;
578 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
579 return;
580 e820_saved.nr_map = nr_map;
581} 573}
582#define MAX_GAP_END 0x100000000ull 574#define MAX_GAP_END 0x100000000ull
583/* 575/*
@@ -898,11 +890,9 @@ early_param("memmap", parse_memmap_opt);
898void __init finish_e820_parsing(void) 890void __init finish_e820_parsing(void)
899{ 891{
900 if (userdef) { 892 if (userdef) {
901 u32 nr = e820.nr_map; 893 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map),
902 894 &e820.nr_map) < 0)
903 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
904 early_panic("Invalid user supplied memory map"); 895 early_panic("Invalid user supplied memory map");
905 e820.nr_map = nr;
906 896
907 printk(KERN_INFO "e820: user-defined physical RAM map:\n"); 897 printk(KERN_INFO "e820: user-defined physical RAM map:\n");
908 e820_print_map("user"); 898 e820_print_map("user");
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 01d1c187c9f9..a62536a1be88 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -19,6 +19,7 @@
19#include <linux/usb/ehci_def.h> 19#include <linux/usb/ehci_def.h>
20#include <linux/efi.h> 20#include <linux/efi.h>
21#include <asm/efi.h> 21#include <asm/efi.h>
22#include <asm/pci_x86.h>
22 23
23/* Simple VGA output */ 24/* Simple VGA output */
24#define VGABASE (__ISA_IO_base + 0xb8000) 25#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -76,7 +77,7 @@ static struct console early_vga_console = {
76 77
77/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ 78/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
78 79
79static int early_serial_base = 0x3f8; /* ttyS0 */ 80static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
80 81
81#define XMTRDY 0x20 82#define XMTRDY 0x20
82 83
@@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
94#define DLL 0 /* Divisor Latch Low */ 95#define DLL 0 /* Divisor Latch Low */
95#define DLH 1 /* Divisor latch High */ 96#define DLH 1 /* Divisor latch High */
96 97
98static void mem32_serial_out(unsigned long addr, int offset, int value)
99{
100 uint32_t *vaddr = (uint32_t *)addr;
101 /* shift implied by pointer type */
102 writel(value, vaddr + offset);
103}
104
105static unsigned int mem32_serial_in(unsigned long addr, int offset)
106{
107 uint32_t *vaddr = (uint32_t *)addr;
108 /* shift implied by pointer type */
109 return readl(vaddr + offset);
110}
111
112static unsigned int io_serial_in(unsigned long addr, int offset)
113{
114 return inb(addr + offset);
115}
116
117static void io_serial_out(unsigned long addr, int offset, int value)
118{
119 outb(value, addr + offset);
120}
121
122static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in;
123static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out;
124
97static int early_serial_putc(unsigned char ch) 125static int early_serial_putc(unsigned char ch)
98{ 126{
99 unsigned timeout = 0xffff; 127 unsigned timeout = 0xffff;
100 128
101 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 129 while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
102 cpu_relax(); 130 cpu_relax();
103 outb(ch, early_serial_base + TXR); 131 serial_out(early_serial_base, TXR, ch);
104 return timeout ? 0 : -1; 132 return timeout ? 0 : -1;
105} 133}
106 134
@@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n)
114 } 142 }
115} 143}
116 144
145static __init void early_serial_hw_init(unsigned divisor)
146{
147 unsigned char c;
148
149 serial_out(early_serial_base, LCR, 0x3); /* 8n1 */
150 serial_out(early_serial_base, IER, 0); /* no interrupt */
151 serial_out(early_serial_base, FCR, 0); /* no fifo */
152 serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */
153
154 c = serial_in(early_serial_base, LCR);
155 serial_out(early_serial_base, LCR, c | DLAB);
156 serial_out(early_serial_base, DLL, divisor & 0xff);
157 serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
158 serial_out(early_serial_base, LCR, c & ~DLAB);
159}
160
117#define DEFAULT_BAUD 9600 161#define DEFAULT_BAUD 9600
118 162
119static __init void early_serial_init(char *s) 163static __init void early_serial_init(char *s)
120{ 164{
121 unsigned char c;
122 unsigned divisor; 165 unsigned divisor;
123 unsigned baud = DEFAULT_BAUD; 166 unsigned long baud = DEFAULT_BAUD;
124 char *e; 167 char *e;
125 168
126 if (*s == ',') 169 if (*s == ',')
@@ -145,24 +188,124 @@ static __init void early_serial_init(char *s)
145 s++; 188 s++;
146 } 189 }
147 190
148 outb(0x3, early_serial_base + LCR); /* 8n1 */ 191 if (*s) {
149 outb(0, early_serial_base + IER); /* no interrupt */ 192 if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
150 outb(0, early_serial_base + FCR); /* no fifo */ 193 baud = DEFAULT_BAUD;
151 outb(0x3, early_serial_base + MCR); /* DTR + RTS */ 194 }
195
196 /* Convert from baud to divisor value */
197 divisor = 115200 / baud;
198
199 /* These will always be IO based ports */
200 serial_in = io_serial_in;
201 serial_out = io_serial_out;
202
203 /* Set up the HW */
204 early_serial_hw_init(divisor);
205}
206
207#ifdef CONFIG_PCI
208/*
209 * early_pci_serial_init()
210 *
211 * This function is invoked when the early_printk param starts with "pciserial"
212 * The rest of the param should be ",B:D.F,baud" where B, D & F describe the
213 * location of a PCI device that must be a UART device.
214 */
215static __init void early_pci_serial_init(char *s)
216{
217 unsigned divisor;
218 unsigned long baud = DEFAULT_BAUD;
219 u8 bus, slot, func;
220 uint32_t classcode, bar0;
221 uint16_t cmdreg;
222 char *e;
223
224
225 /*
 226 * First, parse the param to get the BDF values
227 */
228 if (*s == ',')
229 ++s;
230
231 if (*s == 0)
232 return;
233
234 bus = (u8)simple_strtoul(s, &e, 16);
235 s = e;
236 if (*s != ':')
237 return;
238 ++s;
239 slot = (u8)simple_strtoul(s, &e, 16);
240 s = e;
241 if (*s != '.')
242 return;
243 ++s;
244 func = (u8)simple_strtoul(s, &e, 16);
245 s = e;
246
247 /* A baud might be following */
248 if (*s == ',')
249 s++;
250
251 /*
252 * Second, find the device from the BDF
253 */
254 cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
255 classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
256 bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
257
258 /*
259 * Verify it is a UART type device
260 */
261 if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
262 (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
263 (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */
264 return;
265
266 /*
267 * Determine if it is IO or memory mapped
268 */
269 if (bar0 & 0x01) {
270 /* it is IO mapped */
271 serial_in = io_serial_in;
272 serial_out = io_serial_out;
273 early_serial_base = bar0&0xfffffffc;
274 write_pci_config(bus, slot, func, PCI_COMMAND,
275 cmdreg|PCI_COMMAND_IO);
276 } else {
277 /* It is memory mapped - assume 32-bit alignment */
278 serial_in = mem32_serial_in;
279 serial_out = mem32_serial_out;
280 /* WARNING! assuming the address is always in the first 4G */
281 early_serial_base =
282 (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
283 write_pci_config(bus, slot, func, PCI_COMMAND,
284 cmdreg|PCI_COMMAND_MEMORY);
285 }
152 286
287 /*
 288 * Lastly, initialize the hardware
289 */
153 if (*s) { 290 if (*s) {
154 baud = simple_strtoul(s, &e, 0); 291 if (strcmp(s, "nocfg") == 0)
155 if (baud == 0 || s == e) 292 /* Sometimes, we want to leave the UART alone
293 * and assume the BIOS has set it up correctly.
294 * "nocfg" tells us this is the case, and we
295 * should do no more setup.
296 */
297 return;
298 if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
156 baud = DEFAULT_BAUD; 299 baud = DEFAULT_BAUD;
157 } 300 }
158 301
302 /* Convert from baud to divisor value */
159 divisor = 115200 / baud; 303 divisor = 115200 / baud;
160 c = inb(early_serial_base + LCR); 304
161 outb(c | DLAB, early_serial_base + LCR); 305 /* Set up the HW */
162 outb(divisor & 0xff, early_serial_base + DLL); 306 early_serial_hw_init(divisor);
163 outb((divisor >> 8) & 0xff, early_serial_base + DLH);
164 outb(c & ~DLAB, early_serial_base + LCR);
165} 307}
308#endif
166 309
167static struct console early_serial_console = { 310static struct console early_serial_console = {
168 .name = "earlyser", 311 .name = "earlyser",
@@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf)
210 early_serial_init(buf + 4); 353 early_serial_init(buf + 4);
211 early_console_register(&early_serial_console, keep); 354 early_console_register(&early_serial_console, keep);
212 } 355 }
356#ifdef CONFIG_PCI
357 if (!strncmp(buf, "pciserial", 9)) {
358 early_pci_serial_init(buf + 9);
359 early_console_register(&early_serial_console, keep);
 360 buf += 9; /* Keep from matching the above "serial" */
361 }
362#endif
213 if (!strncmp(buf, "vga", 3) && 363 if (!strncmp(buf, "vga", 3) &&
214 boot_params.screen_info.orig_video_isVGA == 1) { 364 boot_params.screen_info.orig_video_isVGA == 1) {
215 max_xpos = boot_params.screen_info.orig_video_cols; 365 max_xpos = boot_params.screen_info.orig_video_cols;
@@ -226,11 +376,6 @@ static int __init setup_early_printk(char *buf)
226 early_console_register(&xenboot_console, keep); 376 early_console_register(&xenboot_console, keep);
227#endif 377#endif
228#ifdef CONFIG_EARLY_PRINTK_INTEL_MID 378#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
229 if (!strncmp(buf, "mrst", 4)) {
230 mrst_early_console_init();
231 early_console_register(&early_mrst_console, keep);
232 }
233
234 if (!strncmp(buf, "hsu", 3)) { 379 if (!strncmp(buf, "hsu", 3)) {
235 hsu_early_console_init(buf + 3); 380 hsu_early_console_init(buf + 3);
236 early_console_register(&early_hsu_console, keep); 381 early_console_register(&early_hsu_console, keep);
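
With the pciserial support added above, early_printk can target a PCI-enumerated 16550-class UART as well as the legacy I/O ports. The bus, device and function numbers are parsed as hex and a baud rate (or the literal "nocfg") may follow; a command line of the following shape would be accepted (00:18.1 is only an illustrative device address):

    earlyprintk=pciserial,00:18.1,115200
    earlyprintk=pciserial,00:18.1,nocfg

In the "nocfg" form the console is still registered, but early_serial_hw_init() is skipped and whatever divisor and line settings the firmware programmed are left untouched.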
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 000d4199b03e..31e2d5bf3e38 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -982,6 +982,9 @@ ENTRY(xen_hypervisor_callback)
982ENTRY(xen_do_upcall) 982ENTRY(xen_do_upcall)
9831: mov %esp, %eax 9831: mov %esp, %eax
984 call xen_evtchn_do_upcall 984 call xen_evtchn_do_upcall
985#ifndef CONFIG_PREEMPT
986 call xen_maybe_preempt_hcall
987#endif
985 jmp ret_from_intr 988 jmp ret_from_intr
986 CFI_ENDPROC 989 CFI_ENDPROC
987ENDPROC(xen_hypervisor_callback) 990ENDPROC(xen_hypervisor_callback)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9ebaf63ba182..10074ad9ebf8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64)
143 movq \tmp,RSP+\offset(%rsp) 143 movq \tmp,RSP+\offset(%rsp)
144 movq $__USER_DS,SS+\offset(%rsp) 144 movq $__USER_DS,SS+\offset(%rsp)
145 movq $__USER_CS,CS+\offset(%rsp) 145 movq $__USER_CS,CS+\offset(%rsp)
146 movq $-1,RCX+\offset(%rsp) 146 movq RIP+\offset(%rsp),\tmp /* get rip */
147 movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
147 movq R11+\offset(%rsp),\tmp /* get eflags */ 148 movq R11+\offset(%rsp),\tmp /* get eflags */
148 movq \tmp,EFLAGS+\offset(%rsp) 149 movq \tmp,EFLAGS+\offset(%rsp)
149 .endm 150 .endm
@@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64)
155 movq \tmp,R11+\offset(%rsp) 156 movq \tmp,R11+\offset(%rsp)
156 .endm 157 .endm
157 158
158 .macro FAKE_STACK_FRAME child_rip
159 /* push in order ss, rsp, eflags, cs, rip */
160 xorl %eax, %eax
161 pushq_cfi $__KERNEL_DS /* ss */
162 /*CFI_REL_OFFSET ss,0*/
163 pushq_cfi %rax /* rsp */
164 CFI_REL_OFFSET rsp,0
165 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
166 /*CFI_REL_OFFSET rflags,0*/
167 pushq_cfi $__KERNEL_CS /* cs */
168 /*CFI_REL_OFFSET cs,0*/
169 pushq_cfi \child_rip /* rip */
170 CFI_REL_OFFSET rip,0
171 pushq_cfi %rax /* orig rax */
172 .endm
173
174 .macro UNFAKE_STACK_FRAME
175 addq $8*6, %rsp
176 CFI_ADJUST_CFA_OFFSET -(6*8)
177 .endm
178
179/* 159/*
180 * initial frame state for interrupts (and exceptions without error code) 160 * initial frame state for interrupts (and exceptions without error code)
181 */ 161 */
@@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
238 CFI_REL_OFFSET r15, R15+\offset 218 CFI_REL_OFFSET r15, R15+\offset
239 .endm 219 .endm
240 220
241/* save partial stack frame */
242 .macro SAVE_ARGS_IRQ
243 cld
244 /* start from rbp in pt_regs and jump over */
245 movq_cfi rdi, (RDI-RBP)
246 movq_cfi rsi, (RSI-RBP)
247 movq_cfi rdx, (RDX-RBP)
248 movq_cfi rcx, (RCX-RBP)
249 movq_cfi rax, (RAX-RBP)
250 movq_cfi r8, (R8-RBP)
251 movq_cfi r9, (R9-RBP)
252 movq_cfi r10, (R10-RBP)
253 movq_cfi r11, (R11-RBP)
254
255 /* Save rbp so that we can unwind from get_irq_regs() */
256 movq_cfi rbp, 0
257
258 /* Save previous stack value */
259 movq %rsp, %rsi
260
261 leaq -RBP(%rsp),%rdi /* arg1 for handler */
262 testl $3, CS-RBP(%rsi)
263 je 1f
264 SWAPGS
265 /*
266 * irq_count is used to check if a CPU is already on an interrupt stack
267 * or not. While this is essentially redundant with preempt_count it is
268 * a little cheaper to use a separate counter in the PDA (short of
269 * moving irq_enter into assembly, which would be too much work)
270 */
2711: incl PER_CPU_VAR(irq_count)
272 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
273 CFI_DEF_CFA_REGISTER rsi
274
275 /* Store previous stack value */
276 pushq %rsi
277 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
278 0x77 /* DW_OP_breg7 */, 0, \
279 0x06 /* DW_OP_deref */, \
280 0x08 /* DW_OP_const1u */, SS+8-RBP, \
281 0x22 /* DW_OP_plus */
282 /* We entered an interrupt context - irqs are off: */
283 TRACE_IRQS_OFF
284 .endm
285
286ENTRY(save_paranoid) 221ENTRY(save_paranoid)
287 XCPT_FRAME 1 RDI+8 222 XCPT_FRAME 1 RDI+8
288 cld 223 cld
@@ -426,15 +361,12 @@ system_call_fastpath:
426 * Has incomplete stack frame and undefined top of stack. 361 * Has incomplete stack frame and undefined top of stack.
427 */ 362 */
428ret_from_sys_call: 363ret_from_sys_call:
429 movl $_TIF_ALLWORK_MASK,%edi 364 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
430 /* edi: flagmask */ 365 jnz int_ret_from_sys_call_fixup /* Go to the slow path */
431sysret_check: 366
432 LOCKDEP_SYS_EXIT 367 LOCKDEP_SYS_EXIT
433 DISABLE_INTERRUPTS(CLBR_NONE) 368 DISABLE_INTERRUPTS(CLBR_NONE)
434 TRACE_IRQS_OFF 369 TRACE_IRQS_OFF
435 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
436 andl %edi,%edx
437 jnz sysret_careful
438 CFI_REMEMBER_STATE 370 CFI_REMEMBER_STATE
439 /* 371 /*
440 * sysretq will re-enable interrupts: 372 * sysretq will re-enable interrupts:
@@ -448,49 +380,10 @@ sysret_check:
448 USERGS_SYSRET64 380 USERGS_SYSRET64
449 381
450 CFI_RESTORE_STATE 382 CFI_RESTORE_STATE
451 /* Handle reschedules */
452 /* edx: work, edi: workmask */
453sysret_careful:
454 bt $TIF_NEED_RESCHED,%edx
455 jnc sysret_signal
456 TRACE_IRQS_ON
457 ENABLE_INTERRUPTS(CLBR_NONE)
458 pushq_cfi %rdi
459 SCHEDULE_USER
460 popq_cfi %rdi
461 jmp sysret_check
462 383
463 /* Handle a signal */ 384int_ret_from_sys_call_fixup:
464sysret_signal:
465 TRACE_IRQS_ON
466 ENABLE_INTERRUPTS(CLBR_NONE)
467#ifdef CONFIG_AUDITSYSCALL
468 bt $TIF_SYSCALL_AUDIT,%edx
469 jc sysret_audit
470#endif
471 /*
472 * We have a signal, or exit tracing or single-step.
473 * These all wind up with the iret return path anyway,
474 * so just join that path right now.
475 */
476 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET 385 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
477 jmp int_check_syscall_exit_work 386 jmp int_ret_from_sys_call
478
479#ifdef CONFIG_AUDITSYSCALL
480 /*
481 * Return fast path for syscall audit. Call __audit_syscall_exit()
482 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
483 * masked off.
484 */
485sysret_audit:
486 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
487 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
488 setbe %al /* 1 if so, 0 if not */
489 movzbl %al,%edi /* zero-extend that into %edi */
490 call __audit_syscall_exit
491 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
492 jmp sysret_check
493#endif /* CONFIG_AUDITSYSCALL */
494 387
495 /* Do syscall tracing */ 388 /* Do syscall tracing */
496tracesys: 389tracesys:
@@ -626,19 +519,6 @@ END(\label)
626 FORK_LIKE vfork 519 FORK_LIKE vfork
627 FIXED_FRAME stub_iopl, sys_iopl 520 FIXED_FRAME stub_iopl, sys_iopl
628 521
629ENTRY(ptregscall_common)
630 DEFAULT_FRAME 1 8 /* offset 8: return address */
631 RESTORE_TOP_OF_STACK %r11, 8
632 movq_cfi_restore R15+8, r15
633 movq_cfi_restore R14+8, r14
634 movq_cfi_restore R13+8, r13
635 movq_cfi_restore R12+8, r12
636 movq_cfi_restore RBP+8, rbp
637 movq_cfi_restore RBX+8, rbx
638 ret $REST_SKIP /* pop extended registers */
639 CFI_ENDPROC
640END(ptregscall_common)
641
642ENTRY(stub_execve) 522ENTRY(stub_execve)
643 CFI_STARTPROC 523 CFI_STARTPROC
644 addq $8, %rsp 524 addq $8, %rsp
@@ -779,7 +659,48 @@ END(interrupt)
779 /* reserve pt_regs for scratch regs and rbp */ 659 /* reserve pt_regs for scratch regs and rbp */
780 subq $ORIG_RAX-RBP, %rsp 660 subq $ORIG_RAX-RBP, %rsp
781 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP 661 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
782 SAVE_ARGS_IRQ 662 cld
663 /* start from rbp in pt_regs and jump over */
664 movq_cfi rdi, (RDI-RBP)
665 movq_cfi rsi, (RSI-RBP)
666 movq_cfi rdx, (RDX-RBP)
667 movq_cfi rcx, (RCX-RBP)
668 movq_cfi rax, (RAX-RBP)
669 movq_cfi r8, (R8-RBP)
670 movq_cfi r9, (R9-RBP)
671 movq_cfi r10, (R10-RBP)
672 movq_cfi r11, (R11-RBP)
673
674 /* Save rbp so that we can unwind from get_irq_regs() */
675 movq_cfi rbp, 0
676
677 /* Save previous stack value */
678 movq %rsp, %rsi
679
680 leaq -RBP(%rsp),%rdi /* arg1 for handler */
681 testl $3, CS-RBP(%rsi)
682 je 1f
683 SWAPGS
684 /*
685 * irq_count is used to check if a CPU is already on an interrupt stack
686 * or not. While this is essentially redundant with preempt_count it is
687 * a little cheaper to use a separate counter in the PDA (short of
688 * moving irq_enter into assembly, which would be too much work)
689 */
6901: incl PER_CPU_VAR(irq_count)
691 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
692 CFI_DEF_CFA_REGISTER rsi
693
694 /* Store previous stack value */
695 pushq %rsi
696 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
697 0x77 /* DW_OP_breg7 */, 0, \
698 0x06 /* DW_OP_deref */, \
699 0x08 /* DW_OP_const1u */, SS+8-RBP, \
700 0x22 /* DW_OP_plus */
701 /* We entered an interrupt context - irqs are off: */
702 TRACE_IRQS_OFF
703
783 call \func 704 call \func
784 .endm 705 .endm
785 706
@@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */
831 */ 752 */
832 DISABLE_INTERRUPTS(CLBR_ANY) 753 DISABLE_INTERRUPTS(CLBR_ANY)
833 TRACE_IRQS_IRETQ 754 TRACE_IRQS_IRETQ
755
756 /*
757 * Try to use SYSRET instead of IRET if we're returning to
758 * a completely clean 64-bit userspace context.
759 */
760 movq (RCX-R11)(%rsp), %rcx
761 cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
762 jne opportunistic_sysret_failed
763
764 /*
765 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
766 * in kernel space. This essentially lets the user take over
767 * the kernel, since userspace controls RSP. It's not worth
768 * testing for canonicalness exactly -- this check detects any
769 * of the 17 high bits set, which is true for non-canonical
770 * or kernel addresses. (This will pessimize vsyscall=native.
771 * Big deal.)
772 *
773 * If virtual addresses ever become wider, this will need
774 * to be updated to remain correct on both old and new CPUs.
775 */
776 .ifne __VIRTUAL_MASK_SHIFT - 47
777 .error "virtual address width changed -- sysret checks need update"
778 .endif
779 shr $__VIRTUAL_MASK_SHIFT, %rcx
780 jnz opportunistic_sysret_failed
781
782 cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
783 jne opportunistic_sysret_failed
784
785 movq (R11-ARGOFFSET)(%rsp), %r11
786 cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
787 jne opportunistic_sysret_failed
788
789 testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
790 jnz opportunistic_sysret_failed
791
792 /* nothing to check for RSP */
793
794 cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
795 jne opportunistic_sysret_failed
796
797 /*
798 * We win! This label is here just for ease of understanding
799 * perf profiles. Nothing jumps here.
800 */
801irq_return_via_sysret:
802 CFI_REMEMBER_STATE
803 RESTORE_ARGS 1,8,1
804 movq (RSP-RIP)(%rsp),%rsp
805 USERGS_SYSRET64
806 CFI_RESTORE_STATE
807
808opportunistic_sysret_failed:
834 SWAPGS 809 SWAPGS
835 jmp restore_args 810 jmp restore_args
836 811
@@ -1048,6 +1023,11 @@ ENTRY(\sym)
1048 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1023 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1049 1024
1050 .if \paranoid 1025 .if \paranoid
1026 .if \paranoid == 1
1027 CFI_REMEMBER_STATE
1028 testl $3, CS(%rsp) /* If coming from userspace, switch */
1029 jnz 1f /* stacks. */
1030 .endif
1051 call save_paranoid 1031 call save_paranoid
1052 .else 1032 .else
1053 call error_entry 1033 call error_entry
@@ -1088,6 +1068,36 @@ ENTRY(\sym)
1088 jmp error_exit /* %ebx: no swapgs flag */ 1068 jmp error_exit /* %ebx: no swapgs flag */
1089 .endif 1069 .endif
1090 1070
1071 .if \paranoid == 1
1072 CFI_RESTORE_STATE
1073 /*
1074 * Paranoid entry from userspace. Switch stacks and treat it
1075 * as a normal entry. This means that paranoid handlers
1076 * run in real process context if user_mode(regs).
1077 */
10781:
1079 call error_entry
1080
1081 DEFAULT_FRAME 0
1082
1083 movq %rsp,%rdi /* pt_regs pointer */
1084 call sync_regs
1085 movq %rax,%rsp /* switch stack */
1086
1087 movq %rsp,%rdi /* pt_regs pointer */
1088
1089 .if \has_error_code
1090 movq ORIG_RAX(%rsp),%rsi /* get error code */
1091 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1092 .else
1093 xorl %esi,%esi /* no error code */
1094 .endif
1095
1096 call \do_sym
1097
1098 jmp error_exit /* %ebx: no swapgs flag */
1099 .endif
1100
1091 CFI_ENDPROC 1101 CFI_ENDPROC
1092END(\sym) 1102END(\sym)
1093.endm 1103.endm
@@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0
1108idtentry bounds do_bounds has_error_code=0 1118idtentry bounds do_bounds has_error_code=0
1109idtentry invalid_op do_invalid_op has_error_code=0 1119idtentry invalid_op do_invalid_op has_error_code=0
1110idtentry device_not_available do_device_not_available has_error_code=0 1120idtentry device_not_available do_device_not_available has_error_code=0
1111idtentry double_fault do_double_fault has_error_code=1 paranoid=1 1121idtentry double_fault do_double_fault has_error_code=1 paranoid=2
1112idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 1122idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
1113idtentry invalid_TSS do_invalid_TSS has_error_code=1 1123idtentry invalid_TSS do_invalid_TSS has_error_code=1
1114idtentry segment_not_present do_segment_not_present has_error_code=1 1124idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1198,6 +1208,9 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1198 popq %rsp 1208 popq %rsp
1199 CFI_DEF_CFA_REGISTER rsp 1209 CFI_DEF_CFA_REGISTER rsp
1200 decl PER_CPU_VAR(irq_count) 1210 decl PER_CPU_VAR(irq_count)
1211#ifndef CONFIG_PREEMPT
1212 call xen_maybe_preempt_hcall
1213#endif
1201 jmp error_exit 1214 jmp error_exit
1202 CFI_ENDPROC 1215 CFI_ENDPROC
1203END(xen_do_hypervisor_callback) 1216END(xen_do_hypervisor_callback)
@@ -1289,16 +1302,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
1289#endif 1302#endif
1290 1303
1291 /* 1304 /*
1292 * "Paranoid" exit path from exception stack. 1305 * "Paranoid" exit path from exception stack. This is invoked
1293 * Paranoid because this is used by NMIs and cannot take 1306 * only on return from non-NMI IST interrupts that came
1294 * any kernel state for granted. 1307 * from kernel space.
1295 * We don't do kernel preemption checks here, because only
1296 * NMI should be common and it does not enable IRQs and
1297 * cannot get reschedule ticks.
1298 * 1308 *
1299 * "trace" is 0 for the NMI handler only, because irq-tracing 1309 * We may be returning to very strange contexts (e.g. very early
1300 * is fundamentally NMI-unsafe. (we cannot change the soft and 1310 * in syscall entry), so checking for preemption here would
1301 * hard flags at once, atomically) 1311 * be complicated. Fortunately, there's no good reason
1312 * to try to handle preemption here.
1302 */ 1313 */
1303 1314
1304 /* ebx: no swapgs flag */ 1315 /* ebx: no swapgs flag */
@@ -1308,43 +1319,14 @@ ENTRY(paranoid_exit)
1308 TRACE_IRQS_OFF_DEBUG 1319 TRACE_IRQS_OFF_DEBUG
1309 testl %ebx,%ebx /* swapgs needed? */ 1320 testl %ebx,%ebx /* swapgs needed? */
1310 jnz paranoid_restore 1321 jnz paranoid_restore
1311 testl $3,CS(%rsp)
1312 jnz paranoid_userspace
1313paranoid_swapgs:
1314 TRACE_IRQS_IRETQ 0 1322 TRACE_IRQS_IRETQ 0
1315 SWAPGS_UNSAFE_STACK 1323 SWAPGS_UNSAFE_STACK
1316 RESTORE_ALL 8 1324 RESTORE_ALL 8
1317 jmp irq_return 1325 INTERRUPT_RETURN
1318paranoid_restore: 1326paranoid_restore:
1319 TRACE_IRQS_IRETQ_DEBUG 0 1327 TRACE_IRQS_IRETQ_DEBUG 0
1320 RESTORE_ALL 8 1328 RESTORE_ALL 8
1321 jmp irq_return 1329 INTERRUPT_RETURN
1322paranoid_userspace:
1323 GET_THREAD_INFO(%rcx)
1324 movl TI_flags(%rcx),%ebx
1325 andl $_TIF_WORK_MASK,%ebx
1326 jz paranoid_swapgs
1327 movq %rsp,%rdi /* &pt_regs */
1328 call sync_regs
1329 movq %rax,%rsp /* switch stack for scheduling */
1330 testl $_TIF_NEED_RESCHED,%ebx
1331 jnz paranoid_schedule
1332 movl %ebx,%edx /* arg3: thread flags */
1333 TRACE_IRQS_ON
1334 ENABLE_INTERRUPTS(CLBR_NONE)
1335 xorl %esi,%esi /* arg2: oldset */
1336 movq %rsp,%rdi /* arg1: &pt_regs */
1337 call do_notify_resume
1338 DISABLE_INTERRUPTS(CLBR_NONE)
1339 TRACE_IRQS_OFF
1340 jmp paranoid_userspace
1341paranoid_schedule:
1342 TRACE_IRQS_ON
1343 ENABLE_INTERRUPTS(CLBR_ANY)
1344 SCHEDULE_USER
1345 DISABLE_INTERRUPTS(CLBR_ANY)
1346 TRACE_IRQS_OFF
1347 jmp paranoid_userspace
1348 CFI_ENDPROC 1330 CFI_ENDPROC
1349END(paranoid_exit) 1331END(paranoid_exit)
1350 1332
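
The "opportunistic SYSRET" block added to the interrupt-return path above only takes the fast exit when the saved frame is something SYSRET can reproduce exactly. Rewritten as C-style pseudocode over a made-up frame structure (the real code works on pt_regs offsets; the selector and flag constants below are the usual x86-64 values, stated here only for illustration):

#include <stdbool.h>

/* Sketch of the checks guarding irq_return_via_sysret above. */
struct saved_frame {
	unsigned long rip, rcx, r11, rflags, cs, ss;
};

static bool can_return_via_sysret(const struct saved_frame *f)
{
	if (f->rcx != f->rip)		/* SYSRET reloads RIP from RCX */
		return false;
	if (f->rip >> 47)		/* any of the 17 high bits set: kernel or
					   non-canonical address, SYSRET would #GP */
		return false;
	if (f->cs != 0x33)		/* must be the 64-bit user code segment */
		return false;
	if (f->r11 != f->rflags)	/* SYSRET reloads RFLAGS from R11 */
		return false;
	if (f->rflags & 0x10000)	/* RF cannot be restored by SYSRET */
		return false;
	if (f->ss != 0x2b)		/* SS must match what SYSRET installs */
		return false;
	return true;			/* safe to exit via SYSRET; otherwise IRET */
}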
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)
31 31
32asmlinkage __visible void __init i386_start_kernel(void) 32asmlinkage __visible void __init i386_start_kernel(void)
33{ 33{
34 cr4_init_shadow();
34 sanitize_boot_params(&boot_params); 35 sanitize_boot_params(&boot_params);
35 36
36 /* Call the subarch specific early setup function */ 37 /* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..c4f8d4659070 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/bootparam_utils.h> 28#include <asm/bootparam_utils.h>
29#include <asm/microcode.h> 29#include <asm/microcode.h>
30#include <asm/kasan.h>
30 31
31/* 32/*
32 * Manage page tables very early on. 33 * Manage page tables very early on.
@@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void)
46 47
47 next_early_pgt = 0; 48 next_early_pgt = 0;
48 49
49 write_cr3(__pa(early_level4_pgt)); 50 write_cr3(__pa_nodebug(early_level4_pgt));
50} 51}
51 52
52/* Create a new PMD entry */ 53/* Create a new PMD entry */
@@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address)
59 pmdval_t pmd, *pmd_p; 60 pmdval_t pmd, *pmd_p;
60 61
61 /* Invalid address or early pgt is done ? */ 62 /* Invalid address or early pgt is done ? */
62 if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) 63 if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
63 return -1; 64 return -1;
64 65
65again: 66again:
@@ -155,9 +156,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
155 (__START_KERNEL & PGDIR_MASK))); 156 (__START_KERNEL & PGDIR_MASK)));
156 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); 157 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
157 158
159 cr4_init_shadow();
160
158 /* Kill off the identity-map trampoline */ 161 /* Kill off the identity-map trampoline */
159 reset_early_page_tables(); 162 reset_early_page_tables();
160 163
164 kasan_map_early_shadow(early_level4_pgt);
165
161 /* clear bss before set_intr_gate with early_idt_handler */ 166 /* clear bss before set_intr_gate with early_idt_handler */
162 clear_bss(); 167 clear_bss();
163 168
@@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
179 /* set init_level4_pgt kernel high mapping*/ 184 /* set init_level4_pgt kernel high mapping*/
180 init_level4_pgt[511] = early_level4_pgt[511]; 185 init_level4_pgt[511] = early_level4_pgt[511];
181 186
187 kasan_map_early_shadow(init_level4_pgt);
188
182 x86_64_start_reservations(real_mode_data); 189 x86_64_start_reservations(real_mode_data);
183} 190}
184 191
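
cr4_init_shadow() shows up on both the 32-bit and 64-bit boot paths because CR4 updates elsewhere in this series (cr4_set_bits() in fpu_init(), load_mm_cr4() in the perf rdpmc code) go through a per-CPU software copy instead of reading the register back every time. A hedged sketch of that pattern, with illustrative names and a plain variable standing in for %cr4:

/* Sketch of a CR4 shadow: cache the value per CPU, touch the register
 * only when a bit actually changes. All names here are illustrative. */
static unsigned long fake_cr4_reg;	/* stands in for the real %cr4 */
static unsigned long cr4_shadow;	/* a per-CPU variable in the real kernel */

static void shadow_cr4_init(void)
{
	cr4_shadow = fake_cr4_reg;	/* one real read, done very early per CPU */
}

static void shadow_cr4_set_bits(unsigned long mask)
{
	unsigned long cr4 = cr4_shadow;

	if ((cr4 | mask) != cr4) {	/* skip the expensive write when nothing changes */
		cr4 |= mask;
		cr4_shadow = cr4;
		fake_cr4_reg = cr4;	/* mov to %cr4 in the real code */
	}
}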
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a468c0a65c42..6fd514d9f69a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -514,8 +514,38 @@ ENTRY(phys_base)
514 /* This must match the first entry in level2_kernel_pgt */ 514 /* This must match the first entry in level2_kernel_pgt */
515 .quad 0x0000000000000000 515 .quad 0x0000000000000000
516 516
517#ifdef CONFIG_KASAN
518#define FILL(VAL, COUNT) \
519 .rept (COUNT) ; \
520 .quad (VAL) ; \
521 .endr
522
523NEXT_PAGE(kasan_zero_pte)
524 FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
525NEXT_PAGE(kasan_zero_pmd)
526 FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
527NEXT_PAGE(kasan_zero_pud)
528 FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
529
530#undef FILL
531#endif
532
533
517#include "../../x86/xen/xen-head.S" 534#include "../../x86/xen/xen-head.S"
518 535
519 __PAGE_ALIGNED_BSS 536 __PAGE_ALIGNED_BSS
520NEXT_PAGE(empty_zero_page) 537NEXT_PAGE(empty_zero_page)
521 .skip PAGE_SIZE 538 .skip PAGE_SIZE
539
540#ifdef CONFIG_KASAN
541/*
542 * This page used as early shadow. We don't use empty_zero_page
543 * at early stages, stack instrumentation could write some garbage
544 * to this page.
545 * Latter we reuse it as zero shadow for large ranges of memory
546 * that allowed to access, but not instrumented by kasan
547 * (vmalloc/vmemmap ...).
548 */
549NEXT_PAGE(kasan_zero_page)
550 .skip PAGE_SIZE
551#endif
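
The kasan_zero_pud/pmd/pte tables above point every entry at the single kasan_zero_page, so one physical page of zero shadow can back enormous address ranges. KASAN maps each 8-byte granule of address space to one shadow byte; a hedged sketch of the usual address-to-shadow translation is below (the offset constant is a placeholder, the real value comes from the shadow layout chosen at build time):

#include <stdint.h>

/* Illustration of the generic KASAN shadow mapping: 1 shadow byte per
 * 8 bytes of memory. The offset below is a placeholder, not the real one. */
#define KASAN_SHADOW_SCALE_SHIFT	3
#define EXAMPLE_KASAN_SHADOW_OFFSET	0xdffffc0000000000UL

static inline uint8_t *example_mem_to_shadow(const void *addr)
{
	return (uint8_t *)(((uintptr_t)addr >> KASAN_SHADOW_SCALE_SHIFT)
			   + EXAMPLE_KASAN_SHADOW_OFFSET);
}

A shadow byte of zero means the whole 8-byte granule may be accessed, which is why a shared zero page is enough for regions such as vmalloc/vmemmap that are legal to touch but not instrumented.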
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 319bcb9372fe..3acbff4716b0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line)
168#define hpet_print_config() \ 168#define hpet_print_config() \
169do { \ 169do { \
170 if (hpet_verbose) \ 170 if (hpet_verbose) \
171 _hpet_print_config(__FUNCTION__, __LINE__); \ 171 _hpet_print_config(__func__, __LINE__); \
172} while (0) 172} while (0)
173 173
174/* 174/*
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 3d5fb509bdeb..7114ba220fd4 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -126,6 +126,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
126 *dr7 |= encode_dr7(i, info->len, info->type); 126 *dr7 |= encode_dr7(i, info->len, info->type);
127 127
128 set_debugreg(*dr7, 7); 128 set_debugreg(*dr7, 7);
129 if (info->mask)
130 set_dr_addr_mask(info->mask, i);
129 131
130 return 0; 132 return 0;
131} 133}
@@ -161,29 +163,8 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
161 *dr7 &= ~__encode_dr7(i, info->len, info->type); 163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
162 164
163 set_debugreg(*dr7, 7); 165 set_debugreg(*dr7, 7);
164} 166 if (info->mask)
165 167 set_dr_addr_mask(0, i);
166static int get_hbp_len(u8 hbp_len)
167{
168 unsigned int len_in_bytes = 0;
169
170 switch (hbp_len) {
171 case X86_BREAKPOINT_LEN_1:
172 len_in_bytes = 1;
173 break;
174 case X86_BREAKPOINT_LEN_2:
175 len_in_bytes = 2;
176 break;
177 case X86_BREAKPOINT_LEN_4:
178 len_in_bytes = 4;
179 break;
180#ifdef CONFIG_X86_64
181 case X86_BREAKPOINT_LEN_8:
182 len_in_bytes = 8;
183 break;
184#endif
185 }
186 return len_in_bytes;
187} 168}
188 169
189/* 170/*
@@ -196,7 +177,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
196 struct arch_hw_breakpoint *info = counter_arch_bp(bp); 177 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
197 178
198 va = info->address; 179 va = info->address;
199 len = get_hbp_len(info->len); 180 len = bp->attr.bp_len;
200 181
201 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 182 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
202} 183}
@@ -277,6 +258,8 @@ static int arch_build_bp_info(struct perf_event *bp)
277 } 258 }
278 259
279 /* Len */ 260 /* Len */
261 info->mask = 0;
262
280 switch (bp->attr.bp_len) { 263 switch (bp->attr.bp_len) {
281 case HW_BREAKPOINT_LEN_1: 264 case HW_BREAKPOINT_LEN_1:
282 info->len = X86_BREAKPOINT_LEN_1; 265 info->len = X86_BREAKPOINT_LEN_1;
@@ -293,11 +276,17 @@ static int arch_build_bp_info(struct perf_event *bp)
293 break; 276 break;
294#endif 277#endif
295 default: 278 default:
296 return -EINVAL; 279 if (!is_power_of_2(bp->attr.bp_len))
280 return -EINVAL;
281 if (!cpu_has_bpext)
282 return -EOPNOTSUPP;
283 info->mask = bp->attr.bp_len - 1;
284 info->len = X86_BREAKPOINT_LEN_1;
297 } 285 }
298 286
299 return 0; 287 return 0;
300} 288}
289
301/* 290/*
302 * Validate the arch-specific HW Breakpoint register settings 291 * Validate the arch-specific HW Breakpoint register settings
303 */ 292 */
@@ -312,11 +301,11 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
312 if (ret) 301 if (ret)
313 return ret; 302 return ret;
314 303
315 ret = -EINVAL;
316
317 switch (info->len) { 304 switch (info->len) {
318 case X86_BREAKPOINT_LEN_1: 305 case X86_BREAKPOINT_LEN_1:
319 align = 0; 306 align = 0;
307 if (info->mask)
308 align = info->mask;
320 break; 309 break;
321 case X86_BREAKPOINT_LEN_2: 310 case X86_BREAKPOINT_LEN_2:
322 align = 1; 311 align = 1;
@@ -330,7 +319,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
330 break; 319 break;
331#endif 320#endif
332 default: 321 default:
333 return ret; 322 WARN_ON_ONCE(1);
334 } 323 }
335 324
336 /* 325 /*
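
With the bpext handling above, a data breakpoint is no longer limited to 1/2/4/8 bytes: any power-of-two bp_len is accepted and translated into an address mask, and rejected with EOPNOTSUPP when the CPU lacks the extension. A hedged user-space sketch of requesting such a breakpoint through perf follows; the watched buffer and its 32-byte, 32-byte-aligned size are only an example:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

/* 32 bytes, aligned to 32 so the address is compatible with the mask. */
static char watched[32] __attribute__((aligned(32)));

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = (unsigned long)watched;
	attr.bp_len = sizeof(watched);	/* 32: power of two, needs bpext */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");	/* EOPNOTSUPP without bpext */
		return 1;
	}

	watched[0] = 1;		/* a write anywhere in the region counts */
	close(fd);
	return 0;
}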
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229f6161..d5651fce0b71 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,12 +13,26 @@
13#include <asm/sigcontext.h> 13#include <asm/sigcontext.h>
14#include <asm/processor.h> 14#include <asm/processor.h>
15#include <asm/math_emu.h> 15#include <asm/math_emu.h>
16#include <asm/tlbflush.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/ptrace.h> 18#include <asm/ptrace.h>
18#include <asm/i387.h> 19#include <asm/i387.h>
19#include <asm/fpu-internal.h> 20#include <asm/fpu-internal.h>
20#include <asm/user.h> 21#include <asm/user.h>
21 22
23static DEFINE_PER_CPU(bool, in_kernel_fpu);
24
25void kernel_fpu_disable(void)
26{
27 WARN_ON(this_cpu_read(in_kernel_fpu));
28 this_cpu_write(in_kernel_fpu, true);
29}
30
31void kernel_fpu_enable(void)
32{
33 this_cpu_write(in_kernel_fpu, false);
34}
35
22/* 36/*
23 * Were we in an interrupt that interrupted kernel mode? 37 * Were we in an interrupt that interrupted kernel mode?
24 * 38 *
@@ -33,6 +47,9 @@
33 */ 47 */
34static inline bool interrupted_kernel_fpu_idle(void) 48static inline bool interrupted_kernel_fpu_idle(void)
35{ 49{
50 if (this_cpu_read(in_kernel_fpu))
51 return false;
52
36 if (use_eager_fpu()) 53 if (use_eager_fpu())
37 return __thread_has_fpu(current); 54 return __thread_has_fpu(current);
38 55
@@ -73,10 +90,10 @@ void __kernel_fpu_begin(void)
73{ 90{
74 struct task_struct *me = current; 91 struct task_struct *me = current;
75 92
93 this_cpu_write(in_kernel_fpu, true);
94
76 if (__thread_has_fpu(me)) { 95 if (__thread_has_fpu(me)) {
77 __thread_clear_has_fpu(me);
78 __save_init_fpu(me); 96 __save_init_fpu(me);
79 /* We do 'stts()' in __kernel_fpu_end() */
80 } else if (!use_eager_fpu()) { 97 } else if (!use_eager_fpu()) {
81 this_cpu_write(fpu_owner_task, NULL); 98 this_cpu_write(fpu_owner_task, NULL);
82 clts(); 99 clts();
@@ -86,19 +103,16 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
86 103
87void __kernel_fpu_end(void) 104void __kernel_fpu_end(void)
88{ 105{
89 if (use_eager_fpu()) { 106 struct task_struct *me = current;
90 /* 107
91 * For eager fpu, most the time, tsk_used_math() is true. 108 if (__thread_has_fpu(me)) {
92 * Restore the user math as we are done with the kernel usage. 109 if (WARN_ON(restore_fpu_checking(me)))
93 * At few instances during thread exit, signal handling etc, 110 drop_init_fpu(me);
94 * tsk_used_math() is false. Those few places will take proper 111 } else if (!use_eager_fpu()) {
95 * actions, so we don't need to restore the math here.
96 */
97 if (likely(tsk_used_math(current)))
98 math_state_restore();
99 } else {
100 stts(); 112 stts();
101 } 113 }
114
115 this_cpu_write(in_kernel_fpu, false);
102} 116}
103EXPORT_SYMBOL(__kernel_fpu_end); 117EXPORT_SYMBOL(__kernel_fpu_end);
104 118
@@ -180,7 +194,7 @@ void fpu_init(void)
180 if (cpu_has_xmm) 194 if (cpu_has_xmm)
181 cr4_mask |= X86_CR4_OSXMMEXCPT; 195 cr4_mask |= X86_CR4_OSXMMEXCPT;
182 if (cr4_mask) 196 if (cr4_mask)
183 set_in_cr4(cr4_mask); 197 cr4_set_bits(cr4_mask);
184 198
185 cr0 = read_cr0(); 199 cr0 = read_cr0();
186 cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ 200 cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
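
The in_kernel_fpu flag introduced above closes a window where an interrupt arriving between __kernel_fpu_begin() and __kernel_fpu_end() could itself conclude the FPU was idle and clobber the state in use. Callers are unaffected and keep the usual bracketing; a sketch of that caller-side pattern, where the xor loop merely stands in for real SSE/AVX work:

/* Kernel-context sketch; kernel_fpu_begin()/kernel_fpu_end() are the real
 * APIs, the loop body is an arbitrary stand-in for SIMD code. */
static void example_simd_memxor(unsigned char *dst, const unsigned char *src,
				size_t len)
{
	size_t i;

	kernel_fpu_begin();		/* preemption disabled, live FPU state saved */

	for (i = 0; i < len; i++)	/* real users would run SSE/AVX here */
		dst[i] ^= src[i];

	kernel_fpu_end();		/* FPU state handed back / TS restored */
}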
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 705ef8d48e2d..67b1cbe0093a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void)
302 irq = __this_cpu_read(vector_irq[vector]); 302 irq = __this_cpu_read(vector_irq[vector]);
303 if (irq >= 0) { 303 if (irq >= 0) {
304 desc = irq_to_desc(irq); 304 desc = irq_to_desc(irq);
305 if (!desc)
306 continue;
307
305 data = irq_desc_get_irq_data(desc); 308 data = irq_desc_get_irq_data(desc);
306 cpumask_copy(&affinity_new, data->affinity); 309 cpumask_copy(&affinity_new, data->affinity);
307 cpu_clear(this_cpu, affinity_new); 310 cpu_clear(this_cpu, affinity_new);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 63ce838e5a54..28d28f5eb8f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
69 : "memory", "cc", "edx", "ecx", "eax"); 69 : "memory", "cc", "edx", "ecx", "eax");
70} 70}
71 71
72/* how to get the current stack pointer from C */
73#define current_stack_pointer ({ \
74 unsigned long sp; \
75 asm("mov %%esp,%0" : "=g" (sp)); \
76 sp; \
77})
78
79static inline void *current_stack(void) 72static inline void *current_stack(void)
80{ 73{
81 return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); 74 return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
82} 75}
83 76
84static inline int 77static inline int
@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
103 96
104 /* Save the next esp at the bottom of the stack */ 97 /* Save the next esp at the bottom of the stack */
105 prev_esp = (u32 *)irqstk; 98 prev_esp = (u32 *)irqstk;
106 *prev_esp = current_stack_pointer; 99 *prev_esp = current_stack_pointer();
107 100
108 if (unlikely(overflow)) 101 if (unlikely(overflow))
109 call_on_stack(print_stack_overflow, isp); 102 call_on_stack(print_stack_overflow, isp);
@@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
156 149
157 /* Push the previous esp onto the stack */ 150 /* Push the previous esp onto the stack */
158 prev_esp = (u32 *)irqstk; 151 prev_esp = (u32 *)irqstk;
159 *prev_esp = current_stack_pointer; 152 *prev_esp = current_stack_pointer();
160 153
161 call_on_stack(__do_softirq, isp); 154 call_on_stack(__do_softirq, isp);
162} 155}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 98f654d466e5..4e3d5a9621fe 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -84,7 +84,7 @@ static volatile u32 twobyte_is_boostable[256 / 32] = {
84 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 84 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
85 /* ---------------------------------------------- */ 85 /* ---------------------------------------------- */
86 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ 86 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
87 W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ 87 W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
88 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ 88 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
89 W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ 89 W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
90 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ 90 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
@@ -223,27 +223,48 @@ static unsigned long
223__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) 223__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
224{ 224{
225 struct kprobe *kp; 225 struct kprobe *kp;
226 unsigned long faddr;
226 227
227 kp = get_kprobe((void *)addr); 228 kp = get_kprobe((void *)addr);
228 /* There is no probe, return original address */ 229 faddr = ftrace_location(addr);
229 if (!kp) 230 /*
231 * Addresses inside the ftrace location are refused by
232 * arch_check_ftrace_location(). Something went terribly wrong
233 * if such an address is checked here.
234 */
235 if (WARN_ON(faddr && faddr != addr))
236 return 0UL;
237 /*
238 * Use the current code if it is not modified by Kprobe
239 * and it cannot be modified by ftrace.
240 */
241 if (!kp && !faddr)
230 return addr; 242 return addr;
231 243
232 /* 244 /*
233 * Basically, kp->ainsn.insn has an original instruction. 245 * Basically, kp->ainsn.insn has an original instruction.
234 * However, RIP-relative instruction can not do single-stepping 246 * However, RIP-relative instruction can not do single-stepping
235 * at different place, __copy_instruction() tweaks the displacement of 247 * at different place, __copy_instruction() tweaks the displacement of
236 * that instruction. In that case, we can't recover the instruction 248 * that instruction. In that case, we can't recover the instruction
237 * from the kp->ainsn.insn. 249 * from the kp->ainsn.insn.
238 * 250 *
 239 * On the other hand, kp->opcode has a copy of the first byte of 251 * On the other hand, in the case of a normal Kprobe, kp->opcode has a copy
240 * the probed instruction, which is overwritten by int3. And 252 * of the first byte of the probed instruction, which is overwritten
241 * the instruction at kp->addr is not modified by kprobes except 253 * by int3. And the instruction at kp->addr is not modified by kprobes
242 * for the first byte, we can recover the original instruction 254 * except for the first byte, we can recover the original instruction
243 * from it and kp->opcode. 255 * from it and kp->opcode.
256 *
257 * In case of Kprobes using ftrace, we do not have a copy of
258 * the original instruction. In fact, the ftrace location might
 259 * be modified at any time and could even be in an inconsistent state.
260 * Fortunately, we know that the original code is the ideal 5-byte
261 * long NOP.
244 */ 262 */
245 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 263 memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
246 buf[0] = kp->opcode; 264 if (faddr)
265 memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
266 else
267 buf[0] = kp->opcode;
247 return (unsigned long)buf; 268 return (unsigned long)buf;
248} 269}
249 270
@@ -251,6 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
251 * Recover the probed instruction at addr for further analysis. 272 * Recover the probed instruction at addr for further analysis.
252 * Caller must lock kprobes by kprobe_mutex, or disable preemption 273 * Caller must lock kprobes by kprobe_mutex, or disable preemption
 253 * to prevent the referenced kprobes from being released. 274 * to prevent the referenced kprobes from being released.
 275 * Returns zero if the instruction cannot be recovered.
254 */ 276 */
255unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) 277unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
256{ 278{
@@ -285,6 +307,8 @@ static int can_probe(unsigned long paddr)
285 * normally used, we just go through if there is no kprobe. 307 * normally used, we just go through if there is no kprobe.
286 */ 308 */
287 __addr = recover_probed_instruction(buf, addr); 309 __addr = recover_probed_instruction(buf, addr);
310 if (!__addr)
311 return 0;
288 kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); 312 kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
289 insn_get_length(&insn); 313 insn_get_length(&insn);
290 314
@@ -333,6 +357,8 @@ int __copy_instruction(u8 *dest, u8 *src)
333 unsigned long recovered_insn = 357 unsigned long recovered_insn =
334 recover_probed_instruction(buf, (unsigned long)src); 358 recover_probed_instruction(buf, (unsigned long)src);
335 359
360 if (!recovered_insn)
361 return 0;
336 kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); 362 kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
337 insn_get_length(&insn); 363 insn_get_length(&insn);
338 /* Another subsystem puts a breakpoint, failed to recover */ 364 /* Another subsystem puts a breakpoint, failed to recover */
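Note on the recovery logic above: it reduces to copying MAX_INSN_SIZE bytes from the probed address and then either restoring the saved first byte (normal kprobe) or rewriting the first five bytes with the ideal NOP (kprobe on an ftrace site). A minimal user-space sketch of that byte handling follows; the constants and helper names are invented for illustration and are not part of the patch.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAX_INSN_SIZE 16                 /* stand-in for the kernel constant */
static const uint8_t ideal_nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

/* Recover the original bytes of a probed instruction into buf. */
static void recover_insn(uint8_t *buf, const uint8_t *probed_text,
                         uint8_t saved_opcode, int is_ftrace_site)
{
	memcpy(buf, probed_text, MAX_INSN_SIZE);
	if (is_ftrace_site)
		memcpy(buf, ideal_nop5, 5);  /* ftrace site: original code was the 5-byte NOP */
	else
		buf[0] = saved_opcode;       /* normal kprobe: only byte 0 was replaced by int3 */
}

int main(void)
{
	uint8_t text[MAX_INSN_SIZE] = { 0xcc, 0x89, 0xe5 };  /* int3 overwrote 0x55 (push %rbp) */
	uint8_t buf[MAX_INSN_SIZE];

	recover_insn(buf, text, 0x55, 0);
	printf("recovered first byte: %#x\n", buf[0]);        /* prints 0x55 */
	return 0;
}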
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7c523bbf3dc8..7b3b9d15c47a 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -259,6 +259,8 @@ static int can_optimize(unsigned long paddr)
259 */ 259 */
260 return 0; 260 return 0;
261 recovered_insn = recover_probed_instruction(buf, addr); 261 recovered_insn = recover_probed_instruction(buf, addr);
262 if (!recovered_insn)
263 return 0;
262 kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); 264 kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
263 insn_get_length(&insn); 265 insn_get_length(&insn);
264 /* Another subsystem puts a breakpoint */ 266 /* Another subsystem puts a breakpoint */
@@ -322,7 +324,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
322 * Target instructions MUST be relocatable (checked inside) 324 * Target instructions MUST be relocatable (checked inside)
323 * This is called when new aggr(opt)probe is allocated or reused. 325 * This is called when new aggr(opt)probe is allocated or reused.
324 */ 326 */
325int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) 327int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
328 struct kprobe *__unused)
326{ 329{
327 u8 *buf; 330 u8 *buf;
328 int ret; 331 int ret;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 94f643484300..e354cc6446ab 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -609,7 +609,7 @@ static inline void check_zero(void)
609 u8 ret; 609 u8 ret;
610 u8 old; 610 u8 old;
611 611
612 old = ACCESS_ONCE(zero_stats); 612 old = READ_ONCE(zero_stats);
613 if (unlikely(old)) { 613 if (unlikely(old)) {
614 ret = cmpxchg(&zero_stats, old, 0); 614 ret = cmpxchg(&zero_stats, old, 0);
615 /* This ensures only one fellow resets the stat */ 615 /* This ensures only one fellow resets the stat */
@@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
727 int cpu; 727 int cpu;
728 u64 start; 728 u64 start;
729 unsigned long flags; 729 unsigned long flags;
730 __ticket_t head;
730 731
731 if (in_nmi()) 732 if (in_nmi())
732 return; 733 return;
@@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
768 */ 769 */
769 __ticket_enter_slowpath(lock); 770 __ticket_enter_slowpath(lock);
770 771
772 /* make sure enter_slowpath, which is atomic, does not cross the read */
773 smp_mb__after_atomic();
774
771 /* 775 /*
772 * check again make sure it didn't become free while 776 * check again make sure it didn't become free while
773 * we weren't looking. 777 * we weren't looking.
774 */ 778 */
775 if (ACCESS_ONCE(lock->tickets.head) == want) { 779 head = READ_ONCE(lock->tickets.head);
780 if (__tickets_equal(head, want)) {
776 add_stats(TAKEN_SLOW_PICKUP, 1); 781 add_stats(TAKEN_SLOW_PICKUP, 1);
777 goto out; 782 goto out;
778 } 783 }
@@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
803 add_stats(RELEASED_SLOW, 1); 808 add_stats(RELEASED_SLOW, 1);
804 for_each_cpu(cpu, &waiting_cpus) { 809 for_each_cpu(cpu, &waiting_cpus) {
805 const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); 810 const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
806 if (ACCESS_ONCE(w->lock) == lock && 811 if (READ_ONCE(w->lock) == lock &&
807 ACCESS_ONCE(w->want) == ticket) { 812 READ_ONCE(w->want) == ticket) {
808 add_stats(RELEASED_SLOW_KICKED, 1); 813 add_stats(RELEASED_SLOW_KICKED, 1);
809 kvm_kick_cpu(cpu); 814 kvm_kick_cpu(cpu);
810 break; 815 break;
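The added smp_mb__after_atomic() plus READ_ONCE() pair above follows a publish-then-recheck idiom: atomically set the slowpath flag, fence, then re-read the ticket head before blocking. A rough C11 user-space analogue of the ordering idea (variable names invented; this is an illustration, not the kernel's implementation):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint lock_flags;   /* invented: stands in for the slowpath bit */
static atomic_uint lock_head;    /* invented: stands in for lock->tickets.head */

static int wait_is_still_needed(unsigned int want)
{
	/* publish the slowpath flag ... */
	atomic_fetch_or_explicit(&lock_flags, 1u, memory_order_relaxed);
	/* ... and make sure the re-read below cannot be ordered before it */
	atomic_thread_fence(memory_order_seq_cst);
	/* re-check: the lock may have become ours while we set the flag */
	return atomic_load_explicit(&lock_head, memory_order_relaxed) != want;
}

int main(void)
{
	atomic_store(&lock_head, 3);
	printf("still need to wait for ticket 3? %d\n", wait_is_still_needed(3));
	return 0;
}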
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000000..ff3c3101d003
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,90 @@
1/*
2 * livepatch.c - x86-specific Kernel Live Patching Core
3 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <asm/cacheflush.h>
24#include <asm/page_types.h>
25#include <asm/elf.h>
26#include <asm/livepatch.h>
27
28/**
29 * klp_write_module_reloc() - write a relocation in a module
30 * @mod: module in which the section to be modified is found
31 * @type: ELF relocation type (see asm/elf.h)
32 * @loc: address that the relocation should be written to
33 * @value: relocation value (sym address + addend)
34 *
35 * This function writes a relocation to the specified location for
36 * a particular module.
37 */
38int klp_write_module_reloc(struct module *mod, unsigned long type,
39 unsigned long loc, unsigned long value)
40{
41 int ret, numpages, size = 4;
42 bool readonly;
43 unsigned long val;
44 unsigned long core = (unsigned long)mod->module_core;
45 unsigned long core_ro_size = mod->core_ro_size;
46 unsigned long core_size = mod->core_size;
47
48 switch (type) {
49 case R_X86_64_NONE:
50 return 0;
51 case R_X86_64_64:
52 val = value;
53 size = 8;
54 break;
55 case R_X86_64_32:
56 val = (u32)value;
57 break;
58 case R_X86_64_32S:
59 val = (s32)value;
60 break;
61 case R_X86_64_PC32:
62 val = (u32)(value - loc);
63 break;
64 default:
65 /* unsupported relocation type */
66 return -EINVAL;
67 }
68
69 if (loc < core || loc >= core + core_size)
70 /* loc does not point to any symbol inside the module */
71 return -EINVAL;
72
73 if (loc < core + core_ro_size)
74 readonly = true;
75 else
76 readonly = false;
77
78 /* determine if the relocation spans a page boundary */
79 numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
80
81 if (readonly)
82 set_memory_rw(loc & PAGE_MASK, numpages);
83
84 ret = probe_kernel_write((void *)loc, &val, size);
85
86 if (readonly)
87 set_memory_ro(loc & PAGE_MASK, numpages);
88
89 return ret;
90}
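klp_write_module_reloc() above only has to compute the value for each supported relocation type and decide how many pages must be made writable for the store. Both calculations can be tried in isolation; the sketch below uses arbitrary example addresses and is not kernel code:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	/* example numbers only: a fixup site near the end of a page */
	uint64_t loc   = 0xffffffffa0000ffcULL;   /* where the bytes are patched */
	uint64_t value = 0xffffffffa0002000ULL;   /* symbol address plus addend  */
	int size = 4;

	/* R_X86_64_PC32 stores the 32-bit distance from the fixup site */
	uint32_t pc32 = (uint32_t)(value - loc);

	/* same page-span test as in the patch: one page or two to re-protect? */
	int numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;

	printf("PC32 value %#x, write spans %d page(s)\n", pc32, numpages);
	return 0;
}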
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e69f9882bf95..9bbb9b35c144 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kasan.h>
27#include <linux/bug.h> 28#include <linux/bug.h>
28#include <linux/mm.h> 29#include <linux/mm.h>
29#include <linux/gfp.h> 30#include <linux/gfp.h>
@@ -46,21 +47,13 @@ do { \
46 47
47#ifdef CONFIG_RANDOMIZE_BASE 48#ifdef CONFIG_RANDOMIZE_BASE
48static unsigned long module_load_offset; 49static unsigned long module_load_offset;
49static int randomize_modules = 1;
50 50
51/* Mutex protects the module_load_offset. */ 51/* Mutex protects the module_load_offset. */
52static DEFINE_MUTEX(module_kaslr_mutex); 52static DEFINE_MUTEX(module_kaslr_mutex);
53 53
54static int __init parse_nokaslr(char *p)
55{
56 randomize_modules = 0;
57 return 0;
58}
59early_param("nokaslr", parse_nokaslr);
60
61static unsigned long int get_module_load_offset(void) 54static unsigned long int get_module_load_offset(void)
62{ 55{
63 if (randomize_modules) { 56 if (kaslr_enabled) {
64 mutex_lock(&module_kaslr_mutex); 57 mutex_lock(&module_kaslr_mutex);
65 /* 58 /*
66 * Calculate the module_load_offset the first time this 59 * Calculate the module_load_offset the first time this
@@ -83,13 +76,22 @@ static unsigned long int get_module_load_offset(void)
83 76
84void *module_alloc(unsigned long size) 77void *module_alloc(unsigned long size)
85{ 78{
79 void *p;
80
86 if (PAGE_ALIGN(size) > MODULES_LEN) 81 if (PAGE_ALIGN(size) > MODULES_LEN)
87 return NULL; 82 return NULL;
88 return __vmalloc_node_range(size, 1, 83
84 p = __vmalloc_node_range(size, MODULE_ALIGN,
89 MODULES_VADDR + get_module_load_offset(), 85 MODULES_VADDR + get_module_load_offset(),
90 MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, 86 MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
91 PAGE_KERNEL_EXEC, NUMA_NO_NODE, 87 PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
92 __builtin_return_address(0)); 88 __builtin_return_address(0));
89 if (p && (kasan_module_alloc(p, size) < 0)) {
90 vfree(p);
91 return NULL;
92 }
93
94 return p;
93} 95}
94 96
95#ifdef CONFIG_X86_32 97#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index 0ee5025e0fa4..d66a4fe6caee 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -25,8 +25,6 @@
25 25
26#include <asm/pmc_atom.h> 26#include <asm/pmc_atom.h>
27 27
28#define DRIVER_NAME KBUILD_MODNAME
29
30struct pmc_dev { 28struct pmc_dev {
31 u32 base_addr; 29 u32 base_addr;
32 void __iomem *regmap; 30 void __iomem *regmap;
@@ -38,12 +36,12 @@ struct pmc_dev {
38static struct pmc_dev pmc_device; 36static struct pmc_dev pmc_device;
39static u32 acpi_base_addr; 37static u32 acpi_base_addr;
40 38
41struct pmc_dev_map { 39struct pmc_bit_map {
42 const char *name; 40 const char *name;
43 u32 bit_mask; 41 u32 bit_mask;
44}; 42};
45 43
46static const struct pmc_dev_map dev_map[] = { 44static const struct pmc_bit_map dev_map[] = {
47 {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, 45 {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
48 {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, 46 {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
49 {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, 47 {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
@@ -82,6 +80,27 @@ static const struct pmc_dev_map dev_map[] = {
82 {"35 - DFX", BIT_DFX}, 80 {"35 - DFX", BIT_DFX},
83}; 81};
84 82
83static const struct pmc_bit_map pss_map[] = {
84 {"0 - GBE", PMC_PSS_BIT_GBE},
85 {"1 - SATA", PMC_PSS_BIT_SATA},
86 {"2 - HDA", PMC_PSS_BIT_HDA},
87 {"3 - SEC", PMC_PSS_BIT_SEC},
88 {"4 - PCIE", PMC_PSS_BIT_PCIE},
89 {"5 - LPSS", PMC_PSS_BIT_LPSS},
90 {"6 - LPE", PMC_PSS_BIT_LPE},
91 {"7 - DFX", PMC_PSS_BIT_DFX},
92 {"8 - USH_CTRL", PMC_PSS_BIT_USH_CTRL},
93 {"9 - USH_SUS", PMC_PSS_BIT_USH_SUS},
94 {"10 - USH_VCCS", PMC_PSS_BIT_USH_VCCS},
95 {"11 - USH_VCCA", PMC_PSS_BIT_USH_VCCA},
96 {"12 - OTG_CTRL", PMC_PSS_BIT_OTG_CTRL},
97 {"13 - OTG_VCCS", PMC_PSS_BIT_OTG_VCCS},
98 {"14 - OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK},
99 {"15 - OTG_VCCA", PMC_PSS_BIT_OTG_VCCA},
100 {"16 - USB", PMC_PSS_BIT_USB},
101 {"17 - USB_SUS", PMC_PSS_BIT_USB_SUS},
102};
103
85static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) 104static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
86{ 105{
87 return readl(pmc->regmap + reg_offset); 106 return readl(pmc->regmap + reg_offset);
@@ -169,6 +188,32 @@ static const struct file_operations pmc_dev_state_ops = {
169 .release = single_release, 188 .release = single_release,
170}; 189};
171 190
191static int pmc_pss_state_show(struct seq_file *s, void *unused)
192{
193 struct pmc_dev *pmc = s->private;
194 u32 pss = pmc_reg_read(pmc, PMC_PSS);
195 int pss_index;
196
197 for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) {
198 seq_printf(s, "Island: %-32s\tState: %s\n",
199 pss_map[pss_index].name,
200 pss_map[pss_index].bit_mask & pss ? "Off" : "On");
201 }
202 return 0;
203}
204
205static int pmc_pss_state_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, pmc_pss_state_show, inode->i_private);
208}
209
210static const struct file_operations pmc_pss_state_ops = {
211 .open = pmc_pss_state_open,
212 .read = seq_read,
213 .llseek = seq_lseek,
214 .release = single_release,
215};
216
172static int pmc_sleep_tmr_show(struct seq_file *s, void *unused) 217static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
173{ 218{
174 struct pmc_dev *pmc = s->private; 219 struct pmc_dev *pmc = s->private;
@@ -202,11 +247,7 @@ static const struct file_operations pmc_sleep_tmr_ops = {
202 247
203static void pmc_dbgfs_unregister(struct pmc_dev *pmc) 248static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
204{ 249{
205 if (!pmc->dbgfs_dir)
206 return;
207
208 debugfs_remove_recursive(pmc->dbgfs_dir); 250 debugfs_remove_recursive(pmc->dbgfs_dir);
209 pmc->dbgfs_dir = NULL;
210} 251}
211 252
212static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) 253static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
@@ -217,19 +258,29 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
217 if (!dir) 258 if (!dir)
218 return -ENOMEM; 259 return -ENOMEM;
219 260
261 pmc->dbgfs_dir = dir;
262
220 f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, 263 f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
221 dir, pmc, &pmc_dev_state_ops); 264 dir, pmc, &pmc_dev_state_ops);
222 if (!f) { 265 if (!f) {
223 dev_err(&pdev->dev, "dev_states register failed\n"); 266 dev_err(&pdev->dev, "dev_state register failed\n");
224 goto err; 267 goto err;
225 } 268 }
269
270 f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
271 dir, pmc, &pmc_pss_state_ops);
272 if (!f) {
273 dev_err(&pdev->dev, "pss_state register failed\n");
274 goto err;
275 }
276
226 f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, 277 f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
227 dir, pmc, &pmc_sleep_tmr_ops); 278 dir, pmc, &pmc_sleep_tmr_ops);
228 if (!f) { 279 if (!f) {
229 dev_err(&pdev->dev, "sleep_state register failed\n"); 280 dev_err(&pdev->dev, "sleep_state register failed\n");
230 goto err; 281 goto err;
231 } 282 }
232 pmc->dbgfs_dir = dir; 283
233 return 0; 284 return 0;
234err: 285err:
235 pmc_dbgfs_unregister(pmc); 286 pmc_dbgfs_unregister(pmc);
@@ -292,7 +343,6 @@ MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
292 343
293static int __init pmc_atom_init(void) 344static int __init pmc_atom_init(void)
294{ 345{
295 int err = -ENODEV;
296 struct pci_dev *pdev = NULL; 346 struct pci_dev *pdev = NULL;
297 const struct pci_device_id *ent; 347 const struct pci_device_id *ent;
298 348
@@ -306,14 +356,11 @@ static int __init pmc_atom_init(void)
306 */ 356 */
307 for_each_pci_dev(pdev) { 357 for_each_pci_dev(pdev) {
308 ent = pci_match_id(pmc_pci_ids, pdev); 358 ent = pci_match_id(pmc_pci_ids, pdev);
309 if (ent) { 359 if (ent)
310 err = pmc_setup_dev(pdev); 360 return pmc_setup_dev(pdev);
311 goto out;
312 }
313 } 361 }
314 /* Device not found. */ 362 /* Device not found. */
315out: 363 return -ENODEV;
316 return err;
317} 364}
318 365
319module_init(pmc_atom_init); 366module_init(pmc_atom_init);
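pmc_pss_state_show() above is a straightforward table-driven decode: read the PSS register once, then report each power island as On or Off from its bit mask. The same pattern in plain user-space C (the register value and bit assignments below are made up for the example):

#include <stdio.h>
#include <stdint.h>

struct bit_map { const char *name; uint32_t bit_mask; };

/* invented two-entry table; the driver's pss_map has one entry per island */
static const struct bit_map pss_map[] = {
	{ "0 - GBE",  1u << 0 },
	{ "1 - SATA", 1u << 1 },
};

int main(void)
{
	uint32_t pss = 0x1;     /* pretend register value: a set bit means "Off" */
	for (unsigned int i = 0; i < sizeof(pss_map) / sizeof(pss_map[0]); i++)
		printf("Island: %-32s\tState: %s\n", pss_map[i].name,
		       (pss_map[i].bit_mask & pss) ? "Off" : "On");
	return 0;
}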
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127ddaa2d5a..046e2d620bbe 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
28#include <asm/fpu-internal.h> 28#include <asm/fpu-internal.h>
29#include <asm/debugreg.h> 29#include <asm/debugreg.h>
30#include <asm/nmi.h> 30#include <asm/nmi.h>
31#include <asm/tlbflush.h>
31 32
32/* 33/*
33 * per-CPU TSS segments. Threads are completely 'soft' on Linux, 34 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -141,7 +142,7 @@ void flush_thread(void)
141 142
142static void hard_disable_TSC(void) 143static void hard_disable_TSC(void)
143{ 144{
144 write_cr4(read_cr4() | X86_CR4_TSD); 145 cr4_set_bits(X86_CR4_TSD);
145} 146}
146 147
147void disable_TSC(void) 148void disable_TSC(void)
@@ -158,7 +159,7 @@ void disable_TSC(void)
158 159
159static void hard_enable_TSC(void) 160static void hard_enable_TSC(void)
160{ 161{
161 write_cr4(read_cr4() & ~X86_CR4_TSD); 162 cr4_clear_bits(X86_CR4_TSD);
162} 163}
163 164
164static void enable_TSC(void) 165static void enable_TSC(void)
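The cr4_set_bits()/cr4_clear_bits() helpers used above replace open-coded read_cr4()/write_cr4() pairs. A deliberately simplified, single-threaded user-space sketch of the read-modify-write idea follows; the in-kernel helpers additionally track a per-CPU shadow of CR4, which this sketch only hints at:

#include <stdio.h>

static unsigned long cr4_shadow;   /* stand-in for the real control register */

static void cr4_set_bits(unsigned long mask)
{
	unsigned long cr4 = cr4_shadow;        /* read */
	if ((cr4 | mask) != cr4)
		cr4_shadow = cr4 | mask;       /* write only when something changes */
}

static void cr4_clear_bits(unsigned long mask)
{
	unsigned long cr4 = cr4_shadow;
	if ((cr4 & ~mask) != cr4)
		cr4_shadow = cr4 & ~mask;
}

int main(void)
{
	cr4_set_bits(1ul << 2);    /* e.g. a TSD-like feature bit */
	cr4_clear_bits(1ul << 2);
	printf("cr4 shadow: %#lx\n", cr4_shadow);
	return 0;
}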
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8f3ebfe710d0..603c4f99cb5a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all)
101 cr0 = read_cr0(); 101 cr0 = read_cr0();
102 cr2 = read_cr2(); 102 cr2 = read_cr2();
103 cr3 = read_cr3(); 103 cr3 = read_cr3();
104 cr4 = read_cr4_safe(); 104 cr4 = __read_cr4_safe();
105 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 105 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
106 cr0, cr2, cr3, cr4); 106 cr0, cr2, cr3, cr4);
107 107
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c02913af3..67fcc43577d2 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all)
93 cr0 = read_cr0(); 93 cr0 = read_cr0();
94 cr2 = read_cr2(); 94 cr2 = read_cr2();
95 cr3 = read_cr3(); 95 cr3 = read_cr3();
96 cr4 = read_cr4(); 96 cr4 = __read_cr4();
97 97
98 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 98 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
99 fs, fsindex, gs, gsindex, shadowgs); 99 fs, fsindex, gs, gsindex, shadowgs);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index ca9622a25e95..cd9685235df9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now)
49 retval = set_rtc_time(&tm); 49 retval = set_rtc_time(&tm);
50 if (retval) 50 if (retval)
51 printk(KERN_ERR "%s: RTC write failed with error %d\n", 51 printk(KERN_ERR "%s: RTC write failed with error %d\n",
52 __FUNCTION__, retval); 52 __func__, retval);
53 } else { 53 } else {
54 printk(KERN_ERR 54 printk(KERN_ERR
55 "%s: Invalid RTC value: write of %lx to RTC failed\n", 55 "%s: Invalid RTC value: write of %lx to RTC failed\n",
56 __FUNCTION__, nowtime); 56 __func__, nowtime);
57 retval = -EINVAL; 57 retval = -EINVAL;
58 } 58 }
59 return retval; 59 return retval;
@@ -170,7 +170,7 @@ static struct platform_device rtc_device = {
170static __init int add_rtc_cmos(void) 170static __init int add_rtc_cmos(void)
171{ 171{
172#ifdef CONFIG_PNP 172#ifdef CONFIG_PNP
173 static const char * const const ids[] __initconst = 173 static const char * const ids[] __initconst =
174 { "PNP0b00", "PNP0b01", "PNP0b02", }; 174 { "PNP0b00", "PNP0b01", "PNP0b02", };
175 struct pnp_dev *dev; 175 struct pnp_dev *dev;
176 struct pnp_id *id; 176 struct pnp_id *id;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ab4734e5411d..98dc9317286e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -89,6 +89,7 @@
89#include <asm/cacheflush.h> 89#include <asm/cacheflush.h>
90#include <asm/processor.h> 90#include <asm/processor.h>
91#include <asm/bugs.h> 91#include <asm/bugs.h>
92#include <asm/kasan.h>
92 93
93#include <asm/vsyscall.h> 94#include <asm/vsyscall.h>
94#include <asm/cpu.h> 95#include <asm/cpu.h>
@@ -121,6 +122,8 @@
121unsigned long max_low_pfn_mapped; 122unsigned long max_low_pfn_mapped;
122unsigned long max_pfn_mapped; 123unsigned long max_pfn_mapped;
123 124
125bool __read_mostly kaslr_enabled = false;
126
124#ifdef CONFIG_DMI 127#ifdef CONFIG_DMI
125RESERVE_BRK(dmi_alloc, 65536); 128RESERVE_BRK(dmi_alloc, 65536);
126#endif 129#endif
@@ -424,6 +427,11 @@ static void __init reserve_initrd(void)
424} 427}
425#endif /* CONFIG_BLK_DEV_INITRD */ 428#endif /* CONFIG_BLK_DEV_INITRD */
426 429
430static void __init parse_kaslr_setup(u64 pa_data, u32 data_len)
431{
432 kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data));
433}
434
427static void __init parse_setup_data(void) 435static void __init parse_setup_data(void)
428{ 436{
429 struct setup_data *data; 437 struct setup_data *data;
@@ -431,15 +439,13 @@ static void __init parse_setup_data(void)
431 439
432 pa_data = boot_params.hdr.setup_data; 440 pa_data = boot_params.hdr.setup_data;
433 while (pa_data) { 441 while (pa_data) {
434 u32 data_len, map_len, data_type; 442 u32 data_len, data_type;
435 443
436 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), 444 data = early_memremap(pa_data, sizeof(*data));
437 (u64)sizeof(struct setup_data));
438 data = early_memremap(pa_data, map_len);
439 data_len = data->len + sizeof(struct setup_data); 445 data_len = data->len + sizeof(struct setup_data);
440 data_type = data->type; 446 data_type = data->type;
441 pa_next = data->next; 447 pa_next = data->next;
442 early_iounmap(data, map_len); 448 early_iounmap(data, sizeof(*data));
443 449
444 switch (data_type) { 450 switch (data_type) {
445 case SETUP_E820_EXT: 451 case SETUP_E820_EXT:
@@ -451,6 +457,9 @@ static void __init parse_setup_data(void)
451 case SETUP_EFI: 457 case SETUP_EFI:
452 parse_efi_setup(pa_data, data_len); 458 parse_efi_setup(pa_data, data_len);
453 break; 459 break;
460 case SETUP_KASLR:
461 parse_kaslr_setup(pa_data, data_len);
462 break;
454 default: 463 default:
455 break; 464 break;
456 } 465 }
@@ -833,10 +842,14 @@ static void __init trim_low_memory_range(void)
833static int 842static int
834dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) 843dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
835{ 844{
836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx " 845 if (kaslr_enabled)
837 "(relocation range: 0x%lx-0x%lx)\n", 846 pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
838 (unsigned long)&_text - __START_KERNEL, __START_KERNEL, 847 (unsigned long)&_text - __START_KERNEL,
839 __START_KERNEL_map, MODULES_VADDR-1); 848 __START_KERNEL,
849 __START_KERNEL_map,
850 MODULES_VADDR-1);
851 else
852 pr_emerg("Kernel Offset: disabled\n");
840 853
841 return 0; 854 return 0;
842} 855}
@@ -1176,9 +1189,11 @@ void __init setup_arch(char **cmdline_p)
1176 1189
1177 x86_init.paging.pagetable_init(); 1190 x86_init.paging.pagetable_init();
1178 1191
1192 kasan_init();
1193
1179 if (boot_cpu_data.cpuid_level >= 0) { 1194 if (boot_cpu_data.cpuid_level >= 0) {
1180 /* A CPU has %cr4 if and only if it has CPUID */ 1195 /* A CPU has %cr4 if and only if it has CPUID */
1181 mmu_cr4_features = read_cr4(); 1196 mmu_cr4_features = __read_cr4();
1182 if (trampoline_cr4_features) 1197 if (trampoline_cr4_features)
1183 *trampoline_cr4_features = mmu_cr4_features; 1198 *trampoline_cr4_features = mmu_cr4_features;
1184 } 1199 }
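The reworked parse_setup_data() above maps only sizeof(struct setup_data) per node while walking the boot loader's singly linked setup_data list. A tiny user-space walk over a mock list shows the shape of the loop; in the kernel each node's next field is a physical address that has to be early_memremap()'d before use, which is exactly what the hunk keeps small:

#include <stdio.h>
#include <stdint.h>

/* simplified stand-in: field layout and types differ from the real struct */
struct setup_data { struct setup_data *next; uint32_t type; uint32_t len; };

static struct setup_data n3 = { NULL, 0x30, 0 };    /* pretend SETUP_KASLR    */
static struct setup_data n2 = { &n3,  0x20, 4 };    /* pretend SETUP_EFI      */
static struct setup_data n1 = { &n2,  0x10, 8 };    /* pretend SETUP_E820_EXT */

int main(void)
{
	for (struct setup_data *data = &n1; data; data = data->next)
		printf("setup_data type %#x, total length %zu\n",
		       data->type, data->len + sizeof(*data));
	return 0;
}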
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..e5042463c1bc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
69 unsigned int err = 0; 69 unsigned int err = 0;
70 70
71 /* Always make any pending restarted system calls return -EINTR */ 71 /* Always make any pending restarted system calls return -EINTR */
72 current_thread_info()->restart_block.fn = do_no_restart_syscall; 72 current->restart_block.fn = do_no_restart_syscall;
73 73
74 get_user_try { 74 get_user_try {
75 75
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
740{ 740{
741 user_exit(); 741 user_exit();
742 742
743#ifdef CONFIG_X86_MCE
744 /* notify userspace of pending MCEs */
745 if (thread_info_flags & _TIF_MCE_NOTIFY)
746 mce_notify_process();
747#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
748
749 if (thread_info_flags & _TIF_UPROBE) 743 if (thread_info_flags & _TIF_UPROBE)
750 uprobe_notify_resume(regs); 744 uprobe_notify_resume(regs);
751 745
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6d7022c683e3..febc6aabc72e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,7 +73,6 @@
73#include <asm/setup.h> 73#include <asm/setup.h>
74#include <asm/uv/uv.h> 74#include <asm/uv/uv.h>
75#include <linux/mc146818rtc.h> 75#include <linux/mc146818rtc.h>
76#include <asm/smpboot_hooks.h>
77#include <asm/i8259.h> 76#include <asm/i8259.h>
78#include <asm/realmode.h> 77#include <asm/realmode.h>
79#include <asm/misc.h> 78#include <asm/misc.h>
@@ -104,6 +103,43 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
104 103
105atomic_t init_deasserted; 104atomic_t init_deasserted;
106 105
106static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
107{
108 unsigned long flags;
109
110 spin_lock_irqsave(&rtc_lock, flags);
111 CMOS_WRITE(0xa, 0xf);
112 spin_unlock_irqrestore(&rtc_lock, flags);
113 local_flush_tlb();
114 pr_debug("1.\n");
115 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
116 start_eip >> 4;
117 pr_debug("2.\n");
118 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
119 start_eip & 0xf;
120 pr_debug("3.\n");
121}
122
123static inline void smpboot_restore_warm_reset_vector(void)
124{
125 unsigned long flags;
126
127 /*
128 * Install writable page 0 entry to set BIOS data area.
129 */
130 local_flush_tlb();
131
132 /*
133 * Paranoid: Set warm reset code and vector here back
134 * to default values.
135 */
136 spin_lock_irqsave(&rtc_lock, flags);
137 CMOS_WRITE(0, 0xf);
138 spin_unlock_irqrestore(&rtc_lock, flags);
139
140 *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
141}
142
107/* 143/*
108 * Report back to the Boot Processor during boot time or to the caller processor 144 * Report back to the Boot Processor during boot time or to the caller processor
109 * during CPU online. 145 * during CPU online.
@@ -136,8 +172,7 @@ static void smp_callin(void)
136 * CPU, first the APIC. (this is probably redundant on most 172 * CPU, first the APIC. (this is probably redundant on most
137 * boards) 173 * boards)
138 */ 174 */
139 setup_local_APIC(); 175 apic_ap_setup();
140 end_local_APIC_setup();
141 176
142 /* 177 /*
143 * Need to setup vector mappings before we enable interrupts. 178 * Need to setup vector mappings before we enable interrupts.
@@ -955,9 +990,12 @@ void arch_disable_smp_support(void)
955 */ 990 */
956static __init void disable_smp(void) 991static __init void disable_smp(void)
957{ 992{
993 pr_info("SMP disabled\n");
994
995 disable_ioapic_support();
996
958 init_cpu_present(cpumask_of(0)); 997 init_cpu_present(cpumask_of(0));
959 init_cpu_possible(cpumask_of(0)); 998 init_cpu_possible(cpumask_of(0));
960 smpboot_clear_io_apic_irqs();
961 999
962 if (smp_found_config) 1000 if (smp_found_config)
963 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1001 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
@@ -967,6 +1005,13 @@ static __init void disable_smp(void)
967 cpumask_set_cpu(0, cpu_core_mask(0)); 1005 cpumask_set_cpu(0, cpu_core_mask(0));
968} 1006}
969 1007
1008enum {
1009 SMP_OK,
1010 SMP_NO_CONFIG,
1011 SMP_NO_APIC,
1012 SMP_FORCE_UP,
1013};
1014
970/* 1015/*
971 * Various sanity checks. 1016 * Various sanity checks.
972 */ 1017 */
@@ -1014,10 +1059,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1014 if (!smp_found_config && !acpi_lapic) { 1059 if (!smp_found_config && !acpi_lapic) {
1015 preempt_enable(); 1060 preempt_enable();
1016 pr_notice("SMP motherboard not detected\n"); 1061 pr_notice("SMP motherboard not detected\n");
1017 disable_smp(); 1062 return SMP_NO_CONFIG;
1018 if (APIC_init_uniprocessor())
1019 pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
1020 return -1;
1021 } 1063 }
1022 1064
1023 /* 1065 /*
@@ -1041,9 +1083,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1041 boot_cpu_physical_apicid); 1083 boot_cpu_physical_apicid);
1042 pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); 1084 pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
1043 } 1085 }
1044 smpboot_clear_io_apic(); 1086 return SMP_NO_APIC;
1045 disable_ioapic_support();
1046 return -1;
1047 } 1087 }
1048 1088
1049 verify_local_APIC(); 1089 verify_local_APIC();
@@ -1053,15 +1093,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
1053 */ 1093 */
1054 if (!max_cpus) { 1094 if (!max_cpus) {
1055 pr_info("SMP mode deactivated\n"); 1095 pr_info("SMP mode deactivated\n");
1056 smpboot_clear_io_apic(); 1096 return SMP_FORCE_UP;
1057
1058 connect_bsp_APIC();
1059 setup_local_APIC();
1060 bsp_end_local_APIC_setup();
1061 return -1;
1062 } 1097 }
1063 1098
1064 return 0; 1099 return SMP_OK;
1065} 1100}
1066 1101
1067static void __init smp_cpu_index_default(void) 1102static void __init smp_cpu_index_default(void)
@@ -1101,10 +1136,21 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1101 } 1136 }
1102 set_cpu_sibling_map(0); 1137 set_cpu_sibling_map(0);
1103 1138
1104 if (smp_sanity_check(max_cpus) < 0) { 1139 switch (smp_sanity_check(max_cpus)) {
1105 pr_info("SMP disabled\n"); 1140 case SMP_NO_CONFIG:
1106 disable_smp(); 1141 disable_smp();
1142 if (APIC_init_uniprocessor())
1143 pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
1107 return; 1144 return;
1145 case SMP_NO_APIC:
1146 disable_smp();
1147 return;
1148 case SMP_FORCE_UP:
1149 disable_smp();
1150 apic_bsp_setup(false);
1151 return;
1152 case SMP_OK:
1153 break;
1108 } 1154 }
1109 1155
1110 default_setup_apic_routing(); 1156 default_setup_apic_routing();
@@ -1115,33 +1161,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1115 /* Or can we switch back to PIC here? */ 1161 /* Or can we switch back to PIC here? */
1116 } 1162 }
1117 1163
1118 connect_bsp_APIC(); 1164 cpu0_logical_apicid = apic_bsp_setup(false);
1119
1120 /*
1121 * Switch from PIC to APIC mode.
1122 */
1123 setup_local_APIC();
1124
1125 if (x2apic_mode)
1126 cpu0_logical_apicid = apic_read(APIC_LDR);
1127 else
1128 cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
1129
1130 /*
1131 * Enable IO APIC before setting up error vector
1132 */
1133 if (!skip_ioapic_setup && nr_ioapics)
1134 enable_IO_APIC();
1135
1136 bsp_end_local_APIC_setup();
1137 smpboot_setup_io_apic();
1138 /*
1139 * Set up local APIC timer on boot CPU.
1140 */
1141 1165
1142 pr_info("CPU%d: ", 0); 1166 pr_info("CPU%d: ", 0);
1143 print_cpu_info(&cpu_data(0)); 1167 print_cpu_info(&cpu_data(0));
1144 x86_init.timers.setup_percpu_clockev();
1145 1168
1146 if (is_uv_system()) 1169 if (is_uv_system())
1147 uv_system_init(); 1170 uv_system_init();
@@ -1177,9 +1200,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1177 1200
1178 nmi_selftest(); 1201 nmi_selftest();
1179 impress_friends(); 1202 impress_friends();
1180#ifdef CONFIG_X86_IO_APIC
1181 setup_ioapic_dest(); 1203 setup_ioapic_dest();
1182#endif
1183 mtrr_aps_init(); 1204 mtrr_aps_init();
1184} 1205}
1185 1206
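The smp_sanity_check() rework above stops returning a bare -1 and instead reports why SMP cannot be used, so native_smp_prepare_cpus() can pick the right amount of fallback setup per case. A compact sketch of that status-enum pattern (all names below are stand-ins, not kernel code):

#include <stdio.h>

enum { SMP_OK, SMP_NO_CONFIG, SMP_NO_APIC, SMP_FORCE_UP };

/* invented stand-in: report *why* SMP is not usable instead of just failing */
static int sanity_check(int have_config, int have_apic, unsigned int max_cpus)
{
	if (!have_config)
		return SMP_NO_CONFIG;
	if (!have_apic)
		return SMP_NO_APIC;
	if (!max_cpus)
		return SMP_FORCE_UP;
	return SMP_OK;
}

int main(void)
{
	switch (sanity_check(1, 1, 0)) {
	case SMP_NO_CONFIG:
	case SMP_NO_APIC:
		printf("disable SMP\n");
		break;
	case SMP_FORCE_UP:
		printf("disable SMP, but still set up the boot CPU APIC\n");
		break;
	case SMP_OK:
		printf("continue bringing up secondary CPUs\n");
		break;
	}
	return 0;
}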
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 88900e288021..9d2073e2ecc9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 preempt_count_dec(); 108 preempt_count_dec();
109} 109}
110 110
111enum ctx_state ist_enter(struct pt_regs *regs)
112{
113 enum ctx_state prev_state;
114
115 if (user_mode_vm(regs)) {
116 /* Other than that, we're just an exception. */
117 prev_state = exception_enter();
118 } else {
119 /*
120 * We might have interrupted pretty much anything. In
121 * fact, if we're a machine check, we can even interrupt
122 * NMI processing. We don't want in_nmi() to return true,
123 * but we need to notify RCU.
124 */
125 rcu_nmi_enter();
126 prev_state = IN_KERNEL; /* the value is irrelevant. */
127 }
128
129 /*
130 * We are atomic because we're on the IST stack (or we're on x86_32,
131 * in which case we still shouldn't schedule).
132 *
133 * This must be after exception_enter(), because exception_enter()
134 * won't do anything if in_interrupt() returns true.
135 */
136 preempt_count_add(HARDIRQ_OFFSET);
137
138 /* This code is a bit fragile. Test it. */
139 rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
140
141 return prev_state;
142}
143
144void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
145{
146 /* Must be before exception_exit. */
147 preempt_count_sub(HARDIRQ_OFFSET);
148
149 if (user_mode_vm(regs))
150 return exception_exit(prev_state);
151 else
152 rcu_nmi_exit();
153}
154
155/**
156 * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
157 * @regs: regs passed to the IST exception handler
158 *
159 * IST exception handlers normally cannot schedule. As a special
160 * exception, if the exception interrupted userspace code (i.e.
161 * user_mode_vm(regs) would return true) and the exception was not
162 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
163 * begins a non-atomic section within an ist_enter()/ist_exit() region.
164 * Callers are responsible for enabling interrupts themselves inside
165 * the non-atomic section, and callers must call is_end_non_atomic()
166 * before ist_exit().
167 */
168void ist_begin_non_atomic(struct pt_regs *regs)
169{
170 BUG_ON(!user_mode_vm(regs));
171
172 /*
173 * Sanity check: we need to be on the normal thread stack. This
174 * will catch asm bugs and any attempt to use ist_begin_non_atomic()
175 * from double_fault.
176 */
177 BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
178 & ~(THREAD_SIZE - 1)) != 0);
179
180 preempt_count_sub(HARDIRQ_OFFSET);
181}
182
183/**
184 * ist_end_non_atomic() - begin a non-atomic section in an IST exception
185 *
186 * Ends a non-atomic section started with ist_begin_non_atomic().
187 */
188void ist_end_non_atomic(void)
189{
190 preempt_count_add(HARDIRQ_OFFSET);
191}
192
111static nokprobe_inline int 193static nokprobe_inline int
112do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
113 struct pt_regs *regs, long error_code) 195 struct pt_regs *regs, long error_code)
@@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
251 * end up promoting it to a doublefault. In that case, modify 333 * end up promoting it to a doublefault. In that case, modify
252 * the stack to make it look like we just entered the #GP 334 * the stack to make it look like we just entered the #GP
253 * handler from user space, similar to bad_iret. 335 * handler from user space, similar to bad_iret.
336 *
337 * No need for ist_enter here because we don't use RCU.
254 */ 338 */
255 if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && 339 if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
256 regs->cs == __KERNEL_CS && 340 regs->cs == __KERNEL_CS &&
@@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
263 normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ 347 normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
264 regs->ip = (unsigned long)general_protection; 348 regs->ip = (unsigned long)general_protection;
265 regs->sp = (unsigned long)&normal_regs->orig_ax; 349 regs->sp = (unsigned long)&normal_regs->orig_ax;
350
266 return; 351 return;
267 } 352 }
268#endif 353#endif
269 354
270 exception_enter(); 355 ist_enter(regs); /* Discard prev_state because we won't return. */
271 /* Return not checked because double check cannot be ignored */
272 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 356 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
273 357
274 tsk->thread.error_code = error_code; 358 tsk->thread.error_code = error_code;
@@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
434 if (poke_int3_handler(regs)) 518 if (poke_int3_handler(regs))
435 return; 519 return;
436 520
437 prev_state = exception_enter(); 521 prev_state = ist_enter(regs);
438#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 522#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
439 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 523 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
440 SIGTRAP) == NOTIFY_STOP) 524 SIGTRAP) == NOTIFY_STOP)
@@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
460 preempt_conditional_cli(regs); 544 preempt_conditional_cli(regs);
461 debug_stack_usage_dec(); 545 debug_stack_usage_dec();
462exit: 546exit:
463 exception_exit(prev_state); 547 ist_exit(regs, prev_state);
464} 548}
465NOKPROBE_SYMBOL(do_int3); 549NOKPROBE_SYMBOL(do_int3);
466 550
467#ifdef CONFIG_X86_64 551#ifdef CONFIG_X86_64
468/* 552/*
469 * Help handler running on IST stack to switch back to user stack 553 * Help handler running on IST stack to switch off the IST stack if the
470 * for scheduling or signal handling. The actual stack switch is done in 554 * interrupted code was in user mode. The actual stack switch is done in
471 * entry.S 555 * entry_64.S
472 */ 556 */
473asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) 557asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
474{ 558{
475 struct pt_regs *regs = eregs; 559 struct pt_regs *regs = task_pt_regs(current);
476 /* Did already sync */ 560 *regs = *eregs;
477 if (eregs == (struct pt_regs *)eregs->sp)
478 ;
479 /* Exception from user space */
480 else if (user_mode(eregs))
481 regs = task_pt_regs(current);
482 /*
483 * Exception from kernel and interrupts are enabled. Move to
484 * kernel process stack.
485 */
486 else if (eregs->flags & X86_EFLAGS_IF)
487 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
488 if (eregs != regs)
489 *regs = *eregs;
490 return regs; 561 return regs;
491} 562}
492NOKPROBE_SYMBOL(sync_regs); 563NOKPROBE_SYMBOL(sync_regs);
@@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
554 unsigned long dr6; 625 unsigned long dr6;
555 int si_code; 626 int si_code;
556 627
557 prev_state = exception_enter(); 628 prev_state = ist_enter(regs);
558 629
559 get_debugreg(dr6, 6); 630 get_debugreg(dr6, 6);
560 631
@@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
629 debug_stack_usage_dec(); 700 debug_stack_usage_dec();
630 701
631exit: 702exit:
632 exception_exit(prev_state); 703 ist_exit(regs, prev_state);
633} 704}
634NOKPROBE_SYMBOL(do_debug); 705NOKPROBE_SYMBOL(do_debug);
635 706
@@ -788,18 +859,16 @@ void math_state_restore(void)
788 local_irq_disable(); 859 local_irq_disable();
789 } 860 }
790 861
862 /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
863 kernel_fpu_disable();
791 __thread_fpu_begin(tsk); 864 __thread_fpu_begin(tsk);
792
793 /*
794 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
795 */
796 if (unlikely(restore_fpu_checking(tsk))) { 865 if (unlikely(restore_fpu_checking(tsk))) {
797 drop_init_fpu(tsk); 866 drop_init_fpu(tsk);
798 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); 867 force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
799 return; 868 } else {
869 tsk->thread.fpu_counter++;
800 } 870 }
801 871 kernel_fpu_enable();
802 tsk->thread.fpu_counter++;
803} 872}
804EXPORT_SYMBOL_GPL(math_state_restore); 873EXPORT_SYMBOL_GPL(math_state_restore);
805 874
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 8b96a947021f..81f8adb0679e 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -66,27 +66,54 @@
66 * Good-instruction tables for 32-bit apps. This is non-const and volatile 66 * Good-instruction tables for 32-bit apps. This is non-const and volatile
67 * to keep gcc from statically optimizing it out, as variable_test_bit makes 67 * to keep gcc from statically optimizing it out, as variable_test_bit makes
68 * some versions of gcc to think only *(unsigned long*) is used. 68 * some versions of gcc to think only *(unsigned long*) is used.
69 *
70 * Opcodes we'll probably never support:
71 * 6c-6f - ins,outs. SEGVs if used in userspace
72 * e4-e7 - in,out imm. SEGVs if used in userspace
73 * ec-ef - in,out acc. SEGVs if used in userspace
74 * cc - int3. SIGTRAP if used in userspace
75 * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
76 * (why we support bound (62) then? it's similar, and similarly unused...)
77 * f1 - int1. SIGTRAP if used in userspace
78 * f4 - hlt. SEGVs if used in userspace
79 * fa - cli. SEGVs if used in userspace
80 * fb - sti. SEGVs if used in userspace
81 *
82 * Opcodes which need some work to be supported:
83 * 07,17,1f - pop es/ss/ds
84 * Normally not used in userspace, but would execute if used.
85 * Can cause GP or stack exception if tries to load wrong segment descriptor.
86 * We hesitate to run them under single step since kernel's handling
87 * of userspace single-stepping (TF flag) is fragile.
88 * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
89 * on the same grounds that they are never used.
90 * cd - int N.
91 * Used by userspace for "int 80" syscall entry. (Other "int N"
92 * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
93 * Not supported since kernel's handling of userspace single-stepping
94 * (TF flag) is fragile.
95 * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
69 */ 96 */
70#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 97#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
71static volatile u32 good_insns_32[256 / 32] = { 98static volatile u32 good_insns_32[256 / 32] = {
72 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 99 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
73 /* ---------------------------------------------- */ 100 /* ---------------------------------------------- */
74 W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ 101 W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
75 W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ 102 W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
76 W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ 103 W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
77 W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ 104 W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
78 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ 105 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
79 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ 106 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
80 W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ 107 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
81 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ 108 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
82 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ 109 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
83 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ 110 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
84 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ 111 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
85 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ 112 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
86 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ 113 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
87 W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ 114 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
88 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ 115 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
89 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ 116 W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
90 /* ---------------------------------------------- */ 117 /* ---------------------------------------------- */
91 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 118 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
92}; 119};
@@ -94,27 +121,61 @@ static volatile u32 good_insns_32[256 / 32] = {
94#define good_insns_32 NULL 121#define good_insns_32 NULL
95#endif 122#endif
96 123
97/* Good-instruction tables for 64-bit apps */ 124/* Good-instruction tables for 64-bit apps.
125 *
126 * Genuinely invalid opcodes:
127 * 06,07 - formerly push/pop es
128 * 0e - formerly push cs
129 * 16,17 - formerly push/pop ss
130 * 1e,1f - formerly push/pop ds
131 * 27,2f,37,3f - formerly daa/das/aaa/aas
132 * 60,61 - formerly pusha/popa
133 * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
134 * 82 - formerly redundant encoding of Group1
135 * 9a - formerly call seg:ofs
136 * ce - formerly into
137 * d4,d5 - formerly aam/aad
138 * d6 - formerly undocumented salc
139 * ea - formerly jmp seg:ofs
140 *
141 * Opcodes we'll probably never support:
142 * 6c-6f - ins,outs. SEGVs if used in userspace
143 * e4-e7 - in,out imm. SEGVs if used in userspace
144 * ec-ef - in,out acc. SEGVs if used in userspace
145 * cc - int3. SIGTRAP if used in userspace
146 * f1 - int1. SIGTRAP if used in userspace
147 * f4 - hlt. SEGVs if used in userspace
148 * fa - cli. SEGVs if used in userspace
149 * fb - sti. SEGVs if used in userspace
150 *
151 * Opcodes which need some work to be supported:
152 * cd - int N.
153 * Used by userspace for "int 80" syscall entry. (Other "int N"
154 * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
155 * Not supported since kernel's handling of userspace single-stepping
156 * (TF flag) is fragile.
157 * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
158 */
98#if defined(CONFIG_X86_64) 159#if defined(CONFIG_X86_64)
99static volatile u32 good_insns_64[256 / 32] = { 160static volatile u32 good_insns_64[256 / 32] = {
100 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 161 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
101 /* ---------------------------------------------- */ 162 /* ---------------------------------------------- */
102 W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ 163 W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
103 W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ 164 W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
104 W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ 165 W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
105 W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ 166 W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
106 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ 167 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
107 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ 168 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
108 W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ 169 W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
109 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ 170 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
110 W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ 171 W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
111 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ 172 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
112 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ 173 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
113 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ 174 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
114 W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ 175 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
115 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ 176 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
116 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ 177 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
117 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ 178 W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
118 /* ---------------------------------------------- */ 179 /* ---------------------------------------------- */
119 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 180 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
120}; 181};
@@ -122,49 +183,55 @@ static volatile u32 good_insns_64[256 / 32] = {
122#define good_insns_64 NULL 183#define good_insns_64 NULL
123#endif 184#endif
124 185
125/* Using this for both 64-bit and 32-bit apps */ 186/* Using this for both 64-bit and 32-bit apps.
187 * Opcodes we don't support:
188 * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
189 * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
190 * Also encodes tons of other system insns if mod=11.
191 * Some are in fact non-system: xend, xtest, rdtscp, maybe more
192 * 0f 05 - syscall
193 * 0f 06 - clts (CPL0 insn)
194 * 0f 07 - sysret
195 * 0f 08 - invd (CPL0 insn)
196 * 0f 09 - wbinvd (CPL0 insn)
197 * 0f 0b - ud2
198 * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?)
199 * 0f 34 - sysenter
200 * 0f 35 - sysexit
201 * 0f 37 - getsec
202 * 0f 78 - vmread (Intel VMX. CPL0 insn)
203 * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
204 * Note: with prefixes, these two opcodes are
205 * extrq/insertq/AVX512 convert vector ops.
206 * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
207 * {rd,wr}{fs,gs}base,{s,l,m}fence.
208 * Why? They are all user-executable.
209 */
126static volatile u32 good_2byte_insns[256 / 32] = { 210static volatile u32 good_2byte_insns[256 / 32] = {
127 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 211 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
128 /* ---------------------------------------------- */ 212 /* ---------------------------------------------- */
129 W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ 213 W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
130 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ 214 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
131 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ 215 W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
132 W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ 216 W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
133 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ 217 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
134 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ 218 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
135 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ 219 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
136 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ 220 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
137 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ 221 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
138 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ 222 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
139 W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ 223 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
140 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ 224 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
141 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ 225 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
142 W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ 226 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
143 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ 227 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
144 W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ 228 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */
145 /* ---------------------------------------------- */ 229 /* ---------------------------------------------- */
146 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 230 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
147}; 231};
148#undef W 232#undef W
149 233
150/* 234/*
151 * opcodes we'll probably never support:
152 *
153 * 6c-6d, e4-e5, ec-ed - in
154 * 6e-6f, e6-e7, ee-ef - out
155 * cc, cd - int3, int
156 * cf - iret
157 * d6 - illegal instruction
158 * f1 - int1/icebp
159 * f4 - hlt
160 * fa, fb - cli, sti
161 * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
162 *
163 * invalid opcodes in 64-bit mode:
164 *
165 * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
166 * 63 - we support this opcode in x86_64 but not in i386.
167 *
168 * opcodes we may need to refine support for: 235 * opcodes we may need to refine support for:
169 * 236 *
170 * 0f - 2-byte instructions: For many of these instructions, the validity 237 * 0f - 2-byte instructions: For many of these instructions, the validity
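The good_insns_* tables above pack one permission bit per opcode, 32 opcodes per u32 word, and the probe validator simply tests the bit for the instruction's first opcode byte. A self-contained sketch of the same bookkeeping (helper names invented for the example):

#include <stdio.h>
#include <stdint.h>

/* 256 opcodes -> 8 x 32-bit words, one bit per opcode (1 = may be probed) */
static uint32_t good_insns[256 / 32];

static void mark_good(uint8_t opcode)
{
	good_insns[opcode / 32] |= 1u << (opcode % 32);
}

static int is_good(uint8_t opcode)
{
	return (good_insns[opcode / 32] >> (opcode % 32)) & 1;
}

int main(void)
{
	mark_good(0x55);                            /* push %rbp: fine to probe */
	printf("0x55 good? %d\n", is_good(0x55));
	printf("0xcc good? %d\n", is_good(0xcc));   /* int3 stays refused */
	return 0;
}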
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 040681928e9d..37d8fa4438f0 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial);
50#undef memset 50#undef memset
51#undef memmove 51#undef memmove
52 52
53extern void *__memset(void *, int, __kernel_size_t);
54extern void *__memcpy(void *, const void *, __kernel_size_t);
55extern void *__memmove(void *, const void *, __kernel_size_t);
53extern void *memset(void *, int, __kernel_size_t); 56extern void *memset(void *, int, __kernel_size_t);
54extern void *memcpy(void *, const void *, __kernel_size_t); 57extern void *memcpy(void *, const void *, __kernel_size_t);
55extern void *__memcpy(void *, const void *, __kernel_size_t); 58extern void *memmove(void *, const void *, __kernel_size_t);
59
60EXPORT_SYMBOL(__memset);
61EXPORT_SYMBOL(__memcpy);
62EXPORT_SYMBOL(__memmove);
56 63
57EXPORT_SYMBOL(memset); 64EXPORT_SYMBOL(memset);
58EXPORT_SYMBOL(memcpy); 65EXPORT_SYMBOL(memcpy);
59EXPORT_SYMBOL(__memcpy);
60EXPORT_SYMBOL(memmove); 66EXPORT_SYMBOL(memmove);
61 67
62#ifndef CONFIG_DEBUG_VIRTUAL 68#ifndef CONFIG_DEBUG_VIRTUAL
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0de1fae2bdf0..34f66e58a896 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
12#include <asm/i387.h> 12#include <asm/i387.h>
13#include <asm/fpu-internal.h> 13#include <asm/fpu-internal.h>
14#include <asm/sigframe.h> 14#include <asm/sigframe.h>
15#include <asm/tlbflush.h>
15#include <asm/xcr.h> 16#include <asm/xcr.h>
16 17
17/* 18/*
@@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void)
453 */ 454 */
454static inline void xstate_enable(void) 455static inline void xstate_enable(void)
455{ 456{
456 set_in_cr4(X86_CR4_OSXSAVE); 457 cr4_set_bits(X86_CR4_OSXSAVE);
457 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); 458 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
458} 459}
459 460