Diffstat (limited to 'arch/x86/kernel')
120 files changed, 6334 insertions, 4895 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..0925676266bd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
-obj-$(CONFIG_X86_DS)		+= ds.o
-obj-$(CONFIG_X86_DS_SELFTEST)	+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
@@ -106,6 +104,7 @@ obj-$(CONFIG_SCx200)		+= scx200.o
 scx200-y			+= scx200_32.o
 
 obj-$(CONFIG_OLPC)		+= olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE)	+= olpc_ofw.o
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 
 microcode-y			:= microcode_core.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..c05872aa3ce0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled);
 int acpi_noirq;				/* skip ACPI IRQ initialization */
 int acpi_pci_disabled;		/* skip ACPI PCI scan and IRQ initialization */
 EXPORT_SYMBOL(acpi_pci_disabled);
-int acpi_ht __initdata = 1;	/* enable HT */
 
 int acpi_lapic;
 int acpi_ioapic;
@@ -94,6 +93,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
 
 /*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+	unsigned int irq = gsi + NR_IRQS_LEGACY;
+	unsigned int i;
+
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		if (isa_irq_to_gsi[i] == gsi) {
+			return i;
+		}
+	}
+
+	/* Provide an identity mapping of gsi == irq
+	 * except on truly weird platforms that have
+	 * non isa irqs in the first 16 gsis.
+	 */
+	if (gsi >= NR_IRQS_LEGACY)
+		irq = gsi;
+	else
+		irq = gsi_top + gsi;
+
+	return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+	unsigned int gsi;
+
+	if (irq < NR_IRQS_LEGACY)
+		gsi = isa_irq_to_gsi[irq];
+	else if (irq < gsi_top)
+		gsi = irq;
+	else if (irq < (gsi_top + NR_IRQS_LEGACY))
+		gsi = irq - gsi_top;
+	else
+		gsi = 0xffffffff;
+
+	return gsi;
+}
+
+/*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
  * to map the target physical address. The problem is that set_fixmap()
  * provides a single page, and it is possible that the page is not
@@ -313,7 +359,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 /*
  * Parse Interrupt Source Override for the ACPI SCI
  */
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
 {
 	if (trigger == 0)	/* compatible SCI trigger is level */
 		trigger = 3;
@@ -333,7 +379,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 	 * If GSI is < 16, this will update its flags,
 	 * else it will create a new mp_irqs[] entry.
 	 */
-	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
 	/*
 	 * stash over-ride to indicate we've been here
@@ -357,9 +403,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 	acpi_table_print_madt_entry(header);
 
 	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
-		acpi_sci_ioapic_setup(intsrc->global_irq,
+		acpi_sci_ioapic_setup(intsrc->source_irq,
 				      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
-				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				      intsrc->global_irq);
 		return 0;
 	}
 
@@ -448,7 +495,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	*irq = gsi;
+	*irq = gsi_to_irq(gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -458,6 +505,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = irq_to_gsi(isa_irq);
+	return 0;
+}
+
 /*
  * success: return IRQ number (>=0)
  * failure: return < 0
@@ -482,7 +537,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
-	irq = plat_gsi;
+	irq = gsi_to_irq(plat_gsi);
 
 	return irq;
 }
@@ -867,29 +922,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-int __init acpi_probe_gsi(void)
-{
-	int idx;
-	int gsi;
-	int max_gsi = 0;
-
-	if (acpi_disabled)
-		return 0;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	max_gsi = 0;
-	for (idx = 0; idx < nr_ioapics; idx++) {
-		gsi = mp_gsi_routing[idx].gsi_end;
-
-		if (gsi > max_gsi)
-			max_gsi = gsi;
-	}
-
-	return max_gsi + 1;
-}
-
 static void assign_to_mp_irq(struct mpc_intsrc *m,
 			     struct mpc_intsrc *mp_irq)
 {
@@ -947,13 +979,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.dstirq = pin;	/* INTIN# */
 
 	save_mp_irq(&mp_irq);
+
+	isa_irq_to_gsi[bus_irq] = gsi;
 }
 
 void __init mp_config_acpi_legacy_irqs(void)
 {
 	int i;
-	int ioapic;
-	unsigned int dstapic;
 	struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -974,19 +1006,27 @@ void __init mp_config_acpi_legacy_irqs(void)
 #endif
 
 	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
-	dstapic = mp_ioapics[ioapic].apicid;
-
-	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
 	 */
 	for (i = 0; i < 16; i++) {
+		int ioapic, pin;
+		unsigned int dstapic;
 		int idx;
+		u32 gsi;
+
+		/* Locate the gsi that irq i maps to. */
+		if (acpi_isa_irq_to_gsi(i, &gsi))
+			continue;
+
+		/*
+		 * Locate the IOAPIC that manages the ISA IRQ.
+		 */
+		ioapic = mp_find_ioapic(gsi);
+		if (ioapic < 0)
+			continue;
+		pin = mp_find_ioapic_pin(ioapic, gsi);
+		dstapic = mp_ioapics[ioapic].apicid;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -996,7 +1036,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if (irq->dstapic == dstapic && irq->dstirq == i)
+			if (irq->dstapic == dstapic && irq->dstirq == pin)
 				break;
 		}
 
@@ -1011,7 +1051,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		mp_irq.dstapic = dstapic;
 		mp_irq.irqtype = mp_INT;
 		mp_irq.srcbusirq = i; /* Identity mapped */
-		mp_irq.dstirq = i;
+		mp_irq.dstirq = pin;
 
 		save_mp_irq(&mp_irq);
 	}
@@ -1076,11 +1116,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapics[ioapic].apicid,
@@ -1094,7 +1129,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
 			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	io_apic_set_pci_routing(dev, gsi, &irq_attr);
+	io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
 
 	return gsi;
 }
@@ -1154,7 +1189,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+				      acpi_gbl_FADT.sci_interrupt);
 
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
@@ -1464,9 +1500,8 @@ void __init acpi_boot_table_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return;
 
 	/*
@@ -1497,9 +1532,8 @@ int __init early_acpi_boot_init(void)
 {
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	/*
@@ -1517,9 +1551,8 @@ int __init acpi_boot_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1554,21 +1587,12 @@ static int __init parse_acpi(char *arg)
 	/* acpi=force to over-ride black-list */
 	else if (strcmp(arg, "force") == 0) {
 		acpi_force = 1;
-		acpi_ht = 1;
 		acpi_disabled = 0;
 	}
 	/* acpi=strict disables out-of-spec workarounds */
 	else if (strcmp(arg, "strict") == 0) {
 		acpi_strict = 1;
 	}
-	/* Limit ACPI just to boot-time to enable HT */
-	else if (strcmp(arg, "ht") == 0) {
-		if (!acpi_force) {
-			printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
-			disable_acpi();
-		}
-		acpi_ht = 1;
-	}
 	/* acpi=rsdt use RSDT instead of XSDT */
 	else if (strcmp(arg, "rsdt") == 0) {
 		acpi_rsdt_forced = 1;
@@ -1576,6 +1600,10 @@ static int __init parse_acpi(char *arg)
 	/* "acpi=noirq" disables ACPI interrupt routing */
 	else if (strcmp(arg, "noirq") == 0) {
 		acpi_noirq_set();
+	}
+	/* "acpi=copy_dsdt" copys DSDT */
+	else if (strcmp(arg, "copy_dsdt") == 0) {
+		acpi_gbl_copy_dsdt_locally = 1;
 	} else {
 		/* Core will printk when we return error. */
 		return -EINVAL;
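The heart of the boot.c change above is that ISA IRQ numbers and GSI numbers are no longer assumed identical: an MADT interrupt source override can route an ISA IRQ to an arbitrary GSI, and a low GSI hidden behind an override is then reachable only via a remapped IRQ number above gsi_top. A minimal standalone sketch of that lookup logic (userspace C; NR_IRQS_LEGACY and the gsi_top value are stubbed here for illustration, not taken from a real IO-APIC):

#include <stdio.h>

#define NR_IRQS_LEGACY 16
static unsigned int gsi_top = 40;	/* assumed: one IO-APIC with 40 pins */

/* identity by default; an override such as "IRQ 0 -> GSI 2" rewrites a slot */
static unsigned int isa_irq_to_gsi[NR_IRQS_LEGACY] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};

static unsigned int gsi_to_irq(unsigned int gsi)
{
	unsigned int i;

	/* an ISA IRQ overridden to this gsi takes priority */
	for (i = 0; i < NR_IRQS_LEGACY; i++)
		if (isa_irq_to_gsi[i] == gsi)
			return i;

	/* otherwise identity, except low GSIs displaced by overrides are
	 * pushed above gsi_top so they cannot collide with IRQs 0-15 */
	return gsi >= NR_IRQS_LEGACY ? gsi : gsi_top + gsi;
}

int main(void)
{
	isa_irq_to_gsi[0] = 2;				/* typical PIT override */
	printf("gsi 2 -> irq %u\n", gsi_to_irq(2));	/* 0 */
	printf("gsi 0 -> irq %u\n", gsi_to_irq(0));	/* 40, i.e. gsi_top + 0 */
	printf("gsi 20 -> irq %u\n", gsi_to_irq(20));	/* 20, identity */
	return 0;
}

This is why acpi_gsi_to_irq() and acpi_register_gsi() now pass their result through gsi_to_irq() instead of returning the GSI directly.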
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 2e837f5080fe..fb7a5f052e2b 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -145,6 +145,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 		percpu_entry->states[cx->index].eax = cx->address;
 		percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
 	}
+
+	/*
+	 * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
+	 * then we should skip checking BM_STS for this C-state.
+	 * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
+	 */
+	if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
+		cx->bm_sts_skip = 1;
+
 	return retval;
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
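The cstate.c hunk keys off bit 1 of the _CST register's GAS access_size field: per the referenced Intel vendor-specific ACPI spec, a cleared bit means the BM_STS (bus-master status) check can be skipped before entering that C-state. The test itself is a one-line mask; a trivial sketch (the access_size value below is an assumed example, not read from real firmware):

#include <stdio.h>

int main(void)
{
	/* assumed example value from a _CST GAS access_size field */
	unsigned char access_size = 0x1;

	/* bit 1 cleared -> BM_STS check may be skipped for this C-state */
	int bm_sts_skip = !(access_size & 0x2);

	printf("bm_sts_skip = %d\n", bm_sts_skip);	/* prints 1 */
	return 0;
}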
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
 	movl	%eax, %ecx
 	orl	%edx, %ecx
 	jz	1f
-	movl	$0xc0000080, %ecx
+	movl	$MSR_EFER, %ecx
 	wrmsr
 1:
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e557..33cec152070d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
  * sleep.c - x86-specific ACPI sleep support.
  *
  *  Copyright (C) 2001-2003 Patrick Mochel
- *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
  */
 
 #include <linux/acpi.h>
@@ -157,13 +157,16 @@ static int __init acpi_sleep_setup(char *str)
 #ifdef CONFIG_HIBERNATION
 		if (strncmp(str, "s4_nohwsig", 10) == 0)
 			acpi_no_s4_hw_signature();
-		if (strncmp(str, "s4_nonvs", 8) == 0)
-			acpi_s4_no_nvs();
+		if (strncmp(str, "s4_nonvs", 8) == 0) {
+			pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
+					"please use acpi_sleep=nonvs instead");
+			acpi_nvs_nosave();
+		}
 #endif
+		if (strncmp(str, "nonvs", 5) == 0)
+			acpi_nvs_nosave();
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
-		if (strncmp(str, "sci_force_enable", 16) == 0)
-			acpi_set_sci_en_on_resume();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 8ded418b0593..13ab720573e3 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
-	.section .text.page_aligned
+	.section .text..page_aligned
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
+		BUG_ON(a->cpuid >= NCAPINTS*32);
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 #ifdef CONFIG_X86_64
@@ -235,37 +236,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+				  u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		if (*ptr == 0x3e)
+			text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
 
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+				    u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	if (noreplace_smp)
 		return;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		if (*ptr == 0xf0)
+			text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -276,8 +281,8 @@ struct smp_alt_module {
 	char		*name;
 
 	/* ptrs to lock prefixes */
-	u8		**locks;
-	u8		**locks_end;
+	const s32	*locks;
+	const s32	*locks_end;
 
 	/* .text segment, needed to avoid patching init code ;) */
 	u8		*text;
@@ -398,16 +403,19 @@ void alternatives_smp_switch(int smp)
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
-	u8 **ptr;
+	const s32 *poff;
 	u8 *text_start = start;
 	u8 *text_end = end;
 
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
-		for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
-			if (text_start <= *ptr && text_end >= *ptr)
+		for (poff = mod->locks; poff < mod->locks_end; poff++) {
+			const u8 *ptr = (const u8 *)poff + *poff;
+
+			if (text_start <= ptr && text_end > ptr)
 				return 1;
+		}
 	}
 
 	return 0;
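The alternative.c conversion shrinks each .smp_locks entry from an absolute pointer (8 bytes on x86-64) to a 4-byte relative offset: the s32 holds the distance from the entry's own address to the lock-prefix byte it describes, so decoding is one addition, "entry address + stored offset". A self-contained sketch of that encoding (a plain array stands in for the linker-built section; in the kernel the entries are emitted by assembler macros):

#include <stdint.h>
#include <stdio.h>

/* a fake "text" region; byte 5 plays the role of a 0xf0 lock prefix */
static uint8_t text[16] = { [5] = 0xf0 };

/* one relative entry, computed by hand below instead of by the linker */
static int32_t smp_locks[1];

int main(void)
{
	/* store "target minus entry address", as the linker would */
	smp_locks[0] = (int32_t)((intptr_t)&text[5] - (intptr_t)&smp_locks[0]);

	/* decode: entry address plus stored offset recovers the pointer */
	uint8_t *ptr = (uint8_t *)&smp_locks[0] + smp_locks[0];

	printf("decoded byte: 0x%02x\n", *ptr);	/* 0xf0 */
	return 0;
}

The extra `if (*ptr == 0x3e)` / `if (*ptr == 0xf0)` guards in the patched loops then verify the byte really is the expected prefix before poking it.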
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f854d89b7edf..fa044e1e30a2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain,
 
 static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address,
-		      int end_lvl,
+		      unsigned long page_size,
 		      u64 **pte_page,
 		      gfp_t gfp)
 {
+	int level, end_lvl;
 	u64 *pte, *page;
-	int level;
+
+	BUG_ON(!is_power_of_2(page_size));
 
 	while (address > PM_LEVEL_SIZE(domain->mode))
 		increase_address_space(domain, gfp);
 
 	level = domain->mode - 1;
 	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	address = PAGE_SIZE_ALIGN(address, page_size);
+	end_lvl = PAGE_SIZE_LEVEL(page_size);
 
 	while (level > end_lvl) {
 		if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
 			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
 		}
 
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
 		pte = IOMMU_PTE_PAGE(*pte);
@@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
  * This function checks if there is a PTE for a given dma address. If
  * there is one, it returns the pointer to it.
  */
-static u64 *fetch_pte(struct protection_domain *domain,
-		      unsigned long address, int map_size)
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
 {
 	int level;
 	u64 *pte;
 
-	level =  domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	if (address > PM_LEVEL_SIZE(domain->mode))
+		return NULL;
 
-	while (level > map_size) {
+	level =  domain->mode - 1;
+	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+
+	while (level > 0) {
+
+		/* Not Present */
 		if (!IOMMU_PTE_PRESENT(*pte))
 			return NULL;
 
+		/* Large PTE */
+		if (PM_PTE_LEVEL(*pte) == 0x07) {
+			unsigned long pte_mask, __pte;
+
+			/*
+			 * If we have a series of large PTEs, make
+			 * sure to return a pointer to the first one.
+			 */
+			pte_mask = PTE_PAGE_SIZE(*pte);
+			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+			__pte    = ((unsigned long)pte) & pte_mask;
+
+			return (u64 *)__pte;
+		}
+
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
+		/* Walk to the next level */
 		pte = IOMMU_PTE_PAGE(*pte);
 		pte = &pte[PM_LEVEL_INDEX(level, address)];
-
-		if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
-			pte = NULL;
-			break;
-		}
 	}
 
 	return pte;
@@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long bus_addr,
 			  unsigned long phys_addr,
 			  int prot,
-			  int map_size)
+			  unsigned long page_size)
 {
 	u64 __pte, *pte;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-
-	BUG_ON(!PM_ALIGNED(map_size, bus_addr));
-	BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+	int i, count;
 
 	if (!(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+	bus_addr  = PAGE_ALIGN(bus_addr);
+	phys_addr = PAGE_ALIGN(phys_addr);
+	count     = PAGE_SIZE_PTE_COUNT(page_size);
+	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+	for (i = 0; i < count; ++i)
+		if (IOMMU_PTE_PRESENT(pte[i]))
+			return -EBUSY;
 
-	if (IOMMU_PTE_PRESENT(*pte))
-		return -EBUSY;
+	if (page_size > PAGE_SIZE) {
+		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
+		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+	} else
+		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
-	__pte = phys_addr | IOMMU_PTE_P;
 	if (prot & IOMMU_PROT_IR)
 		__pte |= IOMMU_PTE_IR;
 	if (prot & IOMMU_PROT_IW)
 		__pte |= IOMMU_PTE_IW;
 
-	*pte = __pte;
+	for (i = 0; i < count; ++i)
+		pte[i] = __pte;
 
 	update_domain(dom);
 
 	return 0;
 }
 
-static void iommu_unmap_page(struct protection_domain *dom,
-			     unsigned long bus_addr, int map_size)
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+				      unsigned long bus_addr,
+				      unsigned long page_size)
 {
-	u64 *pte = fetch_pte(dom, bus_addr, map_size);
+	unsigned long long unmap_size, unmapped;
+	u64 *pte;
+
+	BUG_ON(!is_power_of_2(page_size));
+
+	unmapped = 0;
+
+	while (unmapped < page_size) {
+
+		pte = fetch_pte(dom, bus_addr);
+
+		if (!pte) {
+			/*
+			 * No PTE for this address
+			 * move forward in 4kb steps
+			 */
+			unmap_size = PAGE_SIZE;
+		} else if (PM_PTE_LEVEL(*pte) == 0) {
+			/* 4kb PTE found for this address */
+			unmap_size = PAGE_SIZE;
+			*pte       = 0ULL;
+		} else {
+			int count, i;
+
+			/* Large PTE found which maps this address */
+			unmap_size = PTE_PAGE_SIZE(*pte);
+			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
+			for (i = 0; i < count; i++)
+				pte[i] = 0ULL;
+		}
 
-	if (pte)
-		*pte = 0;
+		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+		unmapped += unmap_size;
+	}
+
+	BUG_ON(!is_power_of_2(unmapped));
+
+	return unmapped;
 }
 
 /*
@@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	for (addr = e->address_start; addr < e->address_end;
 	     addr += PAGE_SIZE) {
 		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PM_MAP_4k);
+				     PAGE_SIZE);
 		if (ret)
 			return ret;
 		/*
@@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 		u64 *pte, *pte_page;
 
 		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
 					&pte_page, gfp);
 			if (!pte)
 				goto out_free;
@@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 	for (i = dma_dom->aperture[index]->offset;
 	     i < dma_dom->aperture_size;
 	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+		u64 *pte = fetch_pte(&dma_dom->domain, i);
 		if (!pte || !IOMMU_PTE_PRESENT(*pte))
 			continue;
 
@@ -1420,6 +1487,7 @@ static int __attach_device(struct device *dev,
 			   struct protection_domain *domain)
 {
 	struct iommu_dev_data *dev_data, *alias_data;
+	int ret;
 
 	dev_data   = get_dev_data(dev);
 	alias_data = get_dev_data(dev_data->alias);
@@ -1431,13 +1499,14 @@ static int __attach_device(struct device *dev,
 	spin_lock(&domain->lock);
 
 	/* Some sanity checks */
+	ret = -EBUSY;
 	if (alias_data->domain != NULL &&
 	    alias_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	if (dev_data->domain != NULL &&
 	    dev_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	/* Do real assignment */
 	if (dev_data->alias != dev) {
@@ -1453,10 +1522,14 @@ static int __attach_device(struct device *dev,
 
 	atomic_inc(&dev_data->bind);
 
+	ret = 0;
+
+out_unlock:
+
 	/* ready */
 	spin_unlock(&domain->lock);
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -1712,7 +1785,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 
 	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
 				GFP_ATOMIC);
 		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
 	} else
@@ -2257,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void)
 
 	iommu_detected = 1;
 	swiotlb = 0;
-#ifdef CONFIG_GART_IOMMU
-	gart_iommu_aperture_disabled = 1;
-	gart_iommu_aperture = 0;
-#endif
 
 	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
@@ -2439,12 +2508,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	return ret;
 }
 
-static int amd_iommu_map_range(struct iommu_domain *dom,
-			       unsigned long iova, phys_addr_t paddr,
-			       size_t size, int iommu_prot)
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+			 phys_addr_t paddr, int gfp_order, int iommu_prot)
 {
+	unsigned long page_size = 0x1000UL << gfp_order;
 	struct protection_domain *domain = dom->priv;
-	unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
 	int prot = 0;
 	int ret;
 
@@ -2453,61 +2521,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= IOMMU_PROT_IW;
 
-	iova  &= PAGE_MASK;
-	paddr &= PAGE_MASK;
-
 	mutex_lock(&domain->api_lock);
-
-	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
-		if (ret)
-			return ret;
-
-		iova  += PAGE_SIZE;
-		paddr += PAGE_SIZE;
-	}
-
+	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
 	mutex_unlock(&domain->api_lock);
 
-	return 0;
+	return ret;
 }
 
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
-				  unsigned long iova, size_t size)
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+			   int gfp_order)
 {
-
 	struct protection_domain *domain = dom->priv;
-	unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
+	unsigned long page_size, unmap_size;
 
-	iova  &= PAGE_MASK;
+	page_size  = 0x1000UL << gfp_order;
 
 	mutex_lock(&domain->api_lock);
-
-	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PM_MAP_4k);
-		iova  += PAGE_SIZE;
-	}
+	unmap_size = iommu_unmap_page(domain, iova, page_size);
+	mutex_unlock(&domain->api_lock);
 
 	iommu_flush_tlb_pde(domain);
 
-	mutex_unlock(&domain->api_lock);
+	return get_order(unmap_size);
 }
 
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 					  unsigned long iova)
 {
 	struct protection_domain *domain = dom->priv;
-	unsigned long offset = iova & ~PAGE_MASK;
+	unsigned long offset_mask;
 	phys_addr_t paddr;
-	u64 *pte;
+	u64 *pte, __pte;
 
-	pte = fetch_pte(domain, iova, PM_MAP_4k);
+	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
 
-	paddr  = *pte & IOMMU_PAGE_MASK;
-	paddr |= offset;
+	if (PM_PTE_LEVEL(*pte) == 0)
+		offset_mask = PAGE_SIZE - 1;
+	else
+		offset_mask = PTE_PAGE_SIZE(*pte) - 1;
+
+	__pte = *pte & PM_ADDR_MASK;
+	paddr = (__pte & ~offset_mask) | (iova & offset_mask);
 
 	return paddr;
 }
@@ -2515,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
 				    unsigned long cap)
 {
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return 1;
+	}
+
 	return 0;
 }
 
@@ -2523,8 +2585,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
-	.map = amd_iommu_map_range,
-	.unmap = amd_iommu_unmap_range,
+	.map = amd_iommu_map,
+	.unmap = amd_iommu_unmap,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
@@ -2552,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void)
 
 	pt_domain->mode |= PAGE_MODE_NONE;
 
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+	for_each_pci_dev(dev) {
 		if (!check_device(&dev->dev))
 			continue;
 
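Much of the amd_iommu.c rework hinges on the arithmetic behind the PAGE_SIZE_LEVEL and PAGE_SIZE_PTE_COUNT helpers: a power-of-two page size decomposes into a page-table level (each level covers 9 more address bits above the 12-bit page offset) plus a count of consecutive PTEs at that level for sizes that fall between level boundaries. A rough standalone model of that math (my reconstruction for illustration, not the kernel's macros):

#include <stdio.h>

/* level: how many full 9-bit translation steps the page size spans */
static unsigned long page_size_level(unsigned long pagesize)
{
	return (__builtin_ctzl(pagesize) - 12) / 9;
}

/* count: how many consecutive PTEs at that level describe one mapping */
static unsigned long page_size_pte_count(unsigned long pagesize)
{
	return 1UL << ((__builtin_ctzl(pagesize) - 12) % 9);
}

int main(void)
{
	unsigned long sizes[] = { 1UL << 12, 1UL << 21, 1UL << 25, 1UL << 30 };
	int i;

	for (i = 0; i < 4; i++)
		printf("size 2^%d -> level %lu, %lu PTEs\n",
		       __builtin_ctzl(sizes[i]),
		       page_size_level(sizes[i]),
		       page_size_pte_count(sizes[i]));
	/* 4k -> level 0, 1 PTE;  2M -> level 1, 1 PTE;
	 * 32M -> level 1, 16 PTEs;  1G -> level 2, 1 PTE */
	return 0;
}

This is why iommu_map_page() and iommu_unmap_page() now loop over `count` PTEs, and why fetch_pte() masks a large-PTE pointer back to the first entry of its series.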
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d4..3cc63e2b8dd4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
 bool amd_iommu_dump;
 
 static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
 
 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
@@ -286,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address)
 {
 	u8 *ret;
 
-	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
+		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
+			address);
+		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
 		return NULL;
+	}
 
 	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
 	if (ret != NULL)
@@ -1313,7 +1318,7 @@ static int __init amd_iommu_init(void)
 	ret = amd_iommu_init_dma_ops();
 
 	if (ret)
-		goto free;
+		goto free_disable;
 
 	amd_iommu_init_api();
 
@@ -1331,9 +1336,10 @@ static int __init amd_iommu_init(void)
 out:
 	return ret;
 
-free:
+free_disable:
 	disable_iommus();
 
+free:
 	amd_iommu_uninit_devices();
 
 	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
@@ -1352,6 +1358,15 @@ free:
 
 	free_unity_maps();
 
+#ifdef CONFIG_GART_IOMMU
+	/*
+	 * We failed to initialize the AMD IOMMU - try fallback to GART
+	 * if possible.
+	 */
+	gart_iommu_init();
+
+#endif
+
 	goto out;
 }
 
@@ -1372,6 +1387,9 @@ void __init amd_iommu_detect(void)
 	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 
+	if (amd_iommu_disabled)
+		return;
+
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
@@ -1401,6 +1419,8 @@ static int __init parse_amd_iommu_options(char *str)
 	for (; *str; ++str) {
 		if (strncmp(str, "fullflush", 9) == 0)
 			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
 	}
 
 	return 1;
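Note how the new "off" option is recognized: the parser advances through the amd_iommu= argument one character at a time and tests strncmp() at every offset, so both "off" and a combined "fullflush,off" match. A minimal sketch of the same substring-scan pattern (the command-line value is an assumed example):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *str = "fullflush,off";	/* assumed cmdline value */
	int unmap_flush = 0, disabled = 0;

	/* same pattern as the kernel parser: test every suffix of str */
	for (; *str; ++str) {
		if (strncmp(str, "fullflush", 9) == 0)
			unmap_flush = 1;
		if (strncmp(str, "off", 3) == 0)
			disabled = 1;
	}

	printf("fullflush=%d off=%d\n", unmap_flush, disabled);	/* 1 1 */
	return 0;
}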
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
 
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
+#include <asm/mrst.h>
 
 #define APBT_MASK			CLOCKSOURCE_MASK(32)
 #define APBT_SHIFT			22
-#define APBT_CLOCKEVENT_RATING		150
+#define APBT_CLOCKEVENT_RATING		110
 #define APBT_CLOCKSOURCE_RATING		250
 #define APBT_MIN_DELTA_USEC		200
 
@@ -83,8 +84,6 @@ struct apbt_dev {
 	char name[10];
 };
 
-int disable_apbt_percpu __cpuinitdata;
-
 static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
 #ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
 };
 
 /*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp("apbt_only", arg) == 0)
-		disable_apbt_percpu = 0;
-	else if (strcmp("lapic_and_apbt", arg) == 0)
-		disable_apbt_percpu = 1;
-	else {
-		pr_warning("X86 MRST timer option %s not recognised"
-			   " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
-			   arg);
-		return -EINVAL;
-	}
-	return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
  * start count down from 0xffff_ffff. this is done by toggling the enable bit
  * then load initial load count to ~0.
  */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
 	adev->num = smp_processor_id();
 	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
 
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
 		global_clock_event = &adev->evt;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-	if (disable_apbt_percpu || !apb_timer_block_enabled)
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+		!apb_timer_block_enabled)
 		return 0;
 	/* This notifier should be called after workqueue is ready */
 	hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
 	int timer_num;
 	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
 
+	BUG_ON(!apbt_virt_address);
+
 	timer_num = adev->num;
 	pr_debug("%s CPU %d timer %d mode=%d\n",
		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
 	}
 #ifdef CONFIG_SMP
 	/* kernel cmdline disable apb timer, so we will use lapic timers */
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		printk(KERN_INFO "apbt: disabled per cpu timer\n");
 		return;
 	}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
 	 * or BIOS forget to put that in reserved.
 	 * try to update e820 to make that region as reserved.
 	 */
-	u32 agp_aper_base = 0, agp_aper_order = 0;
+	u32 agp_aper_order = 0;
 	int i, fix, slot, valid_agp = 0;
 	u32 ctl;
 	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
 		return;
 
 	/* This is mostly duplicate of iommu_hole_init */
-	agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+	search_agp_bridge(&agp_aper_order, &valid_agp);
 
 	fix = 0;
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR)	+= hw_nmi.o
+
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e5a4a1e01618..980508c79082 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <asm/smp.h> | 51 | #include <asm/smp.h> |
52 | #include <asm/mce.h> | 52 | #include <asm/mce.h> |
53 | #include <asm/kvm_para.h> | 53 | #include <asm/kvm_para.h> |
54 | #include <asm/tsc.h> | ||
54 | 55 | ||
55 | unsigned int num_processors; | 56 | unsigned int num_processors; |
56 | 57 | ||
@@ -459,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) | |||
459 | } | 460 | } |
460 | 461 | ||
461 | /* | 462 | /* |
462 | * Setup the local APIC timer for this CPU. Copy the initilized values | 463 | * Setup the local APIC timer for this CPU. Copy the initialized values |
463 | * of the boot CPU and register the clock event in the framework. | 464 | * of the boot CPU and register the clock event in the framework. |
464 | */ | 465 | */ |
465 | static void __cpuinit setup_APIC_timer(void) | 466 | static void __cpuinit setup_APIC_timer(void) |
@@ -920,7 +921,7 @@ void disable_local_APIC(void) | |||
920 | unsigned int value; | 921 | unsigned int value; |
921 | 922 | ||
922 | /* APIC hasn't been mapped yet */ | 923 | /* APIC hasn't been mapped yet */ |
923 | if (!apic_phys) | 924 | if (!x2apic_mode && !apic_phys) |
924 | return; | 925 | return; |
925 | 926 | ||
926 | clear_local_APIC(); | 927 | clear_local_APIC(); |
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void) | |||
1151 | */ | 1152 | */ |
1152 | void __cpuinit setup_local_APIC(void) | 1153 | void __cpuinit setup_local_APIC(void) |
1153 | { | 1154 | { |
1154 | unsigned int value; | 1155 | unsigned int value, queued; |
1155 | int i, j; | 1156 | int i, j, acked = 0; |
1157 | unsigned long long tsc = 0, ntsc; | ||
1158 | long long max_loops = cpu_khz; | ||
1159 | |||
1160 | if (cpu_has_tsc) | ||
1161 | rdtscll(tsc); | ||
1156 | 1162 | ||
1157 | if (disable_apic) { | 1163 | if (disable_apic) { |
1158 | arch_disable_smp_support(); | 1164 | arch_disable_smp_support(); |
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void) | |||
1204 | * the interrupt. Hence a vector might get locked. It was noticed | 1210 | * the interrupt. Hence a vector might get locked. It was noticed |
1205 | * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. | 1211 | * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. |
1206 | */ | 1212 | */ |
1207 | for (i = APIC_ISR_NR - 1; i >= 0; i--) { | 1213 | do { |
1208 | value = apic_read(APIC_ISR + i*0x10); | 1214 | queued = 0; |
1209 | for (j = 31; j >= 0; j--) { | 1215 | for (i = APIC_ISR_NR - 1; i >= 0; i--) |
1210 | if (value & (1<<j)) | 1216 | queued |= apic_read(APIC_IRR + i*0x10); |
1211 | ack_APIC_irq(); | 1217 | |
1218 | for (i = APIC_ISR_NR - 1; i >= 0; i--) { | ||
1219 | value = apic_read(APIC_ISR + i*0x10); | ||
1220 | for (j = 31; j >= 0; j--) { | ||
1221 | if (value & (1<<j)) { | ||
1222 | ack_APIC_irq(); | ||
1223 | acked++; | ||
1224 | } | ||
1225 | } | ||
1212 | } | 1226 | } |
1213 | } | 1227 | if (acked > 256) { |
1228 | printk(KERN_ERR "LAPIC pending interrupts after %d EOIs\n", | ||
1229 | acked); | ||
1230 | break; | ||
1231 | } | ||
1232 | if (cpu_has_tsc) { | ||
1233 | rdtscll(ntsc); | ||
1234 | max_loops = (cpu_khz << 10) - (ntsc - tsc); | ||
1235 | } else | ||
1236 | max_loops--; | ||
1237 | } while (queued && max_loops > 0); | ||
1238 | WARN_ON(max_loops <= 0); | ||
1214 | 1239 | ||
1215 | /* | 1240 | /* |
1216 | * Now that we are all set up, enable the APIC | 1241 | * Now that we are all set up, enable the APIC |
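The reworked loop above is worth a note: the old code made a single pass of EOIs over the ISR, while the new code repeats until the IRR stays clear, bounding the work with a hard cap of 256 acks and a time budget of cpu_khz << 10 TSC cycles (about 1.02 seconds at a clock of cpu_khz kHz; without a TSC it falls back to decrementing a plain counter). A minimal stand-alone model of that pattern, with the APIC registers stubbed as plain arrays in place of apic_read()/ack_APIC_irq():

    #include <stdio.h>

    #define APIC_ISR_NR 8

    static unsigned irr[APIC_ISR_NR];                 /* pending, not yet in service */
    static unsigned isr[APIC_ISR_NR] = { 0x5, 0x80 }; /* stale in-service bits */

    int main(void)
    {
        long long budget = 1000;   /* the kernel derives this from the TSC */
        unsigned queued;
        int acked = 0, i, j;

        do {
            queued = 0;
            for (i = APIC_ISR_NR - 1; i >= 0; i--)
                queued |= irr[i];                 /* anything still arriving? */

            for (i = APIC_ISR_NR - 1; i >= 0; i--) {
                unsigned value = isr[i];
                for (j = 31; j >= 0; j--)
                    if (value & (1u << j)) {
                        isr[i] &= ~(1u << j);     /* stands in for ack_APIC_irq() */
                        acked++;
                    }
            }
            if (acked > 256)                      /* same hard cap as the hunk above */
                break;
            budget--;
        } while (queued && budget > 0);

        printf("acked %d stale interrupt(s)\n", acked);
        return 0;
    }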
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 03ba1b895f5e..8593582d8022 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c | |||
@@ -129,25 +129,6 @@ int es7000_plat; | |||
129 | * GSI override for ES7000 platforms. | 129 | * GSI override for ES7000 platforms. |
130 | */ | 130 | */ |
131 | 131 | ||
132 | static unsigned int base; | ||
133 | |||
134 | static int | ||
135 | es7000_rename_gsi(int ioapic, int gsi) | ||
136 | { | ||
137 | if (es7000_plat == ES7000_ZORRO) | ||
138 | return gsi; | ||
139 | |||
140 | if (!base) { | ||
141 | int i; | ||
142 | for (i = 0; i < nr_ioapics; i++) | ||
143 | base += nr_ioapic_registers[i]; | ||
144 | } | ||
145 | |||
146 | if (!ioapic && (gsi < 16)) | ||
147 | gsi += base; | ||
148 | |||
149 | return gsi; | ||
150 | } | ||
151 | 132 | ||
152 | static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) | 133 | static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) |
153 | { | 134 | { |
@@ -190,7 +171,6 @@ static void setup_unisys(void) | |||
190 | es7000_plat = ES7000_ZORRO; | 171 | es7000_plat = ES7000_ZORRO; |
191 | else | 172 | else |
192 | es7000_plat = ES7000_CLASSIC; | 173 | es7000_plat = ES7000_CLASSIC; |
193 | ioapic_renumber_irq = es7000_rename_gsi; | ||
194 | } | 174 | } |
195 | 175 | ||
196 | /* | 176 | /* |
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 000000000000..cefd6942f0e9 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c | |||
@@ -0,0 +1,107 @@ | |||
1 | /* | ||
2 | * HW NMI watchdog support | ||
3 | * | ||
4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * Arch specific calls to support NMI watchdog | ||
7 | * | ||
8 | * Bits copied from original nmi.c file | ||
9 | * | ||
10 | */ | ||
11 | #include <asm/apic.h> | ||
12 | |||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/kdebug.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/kprobes.h> | ||
17 | #include <linux/nmi.h> | ||
18 | #include <linux/module.h> | ||
19 | |||
20 | /* For reliability, we're prepared to waste bits here. */ | ||
21 | static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; | ||
22 | |||
23 | u64 hw_nmi_get_sample_period(void) | ||
24 | { | ||
25 | return (u64)(cpu_khz) * 1000 * 60; | ||
26 | } | ||
27 | |||
28 | #ifdef ARCH_HAS_NMI_WATCHDOG | ||
29 | void arch_trigger_all_cpu_backtrace(void) | ||
30 | { | ||
31 | int i; | ||
32 | |||
33 | cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); | ||
34 | |||
35 | printk(KERN_INFO "sending NMI to all CPUs:\n"); | ||
36 | apic->send_IPI_all(NMI_VECTOR); | ||
37 | |||
38 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | ||
39 | for (i = 0; i < 10 * 1000; i++) { | ||
40 | if (cpumask_empty(to_cpumask(backtrace_mask))) | ||
41 | break; | ||
42 | mdelay(1); | ||
43 | } | ||
44 | } | ||
45 | |||
46 | static int __kprobes | ||
47 | arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, | ||
48 | unsigned long cmd, void *__args) | ||
49 | { | ||
50 | struct die_args *args = __args; | ||
51 | struct pt_regs *regs; | ||
52 | int cpu = smp_processor_id(); | ||
53 | |||
54 | switch (cmd) { | ||
55 | case DIE_NMI: | ||
56 | case DIE_NMI_IPI: | ||
57 | break; | ||
58 | |||
59 | default: | ||
60 | return NOTIFY_DONE; | ||
61 | } | ||
62 | |||
63 | regs = args->regs; | ||
64 | |||
65 | if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { | ||
66 | static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; | ||
67 | |||
68 | arch_spin_lock(&lock); | ||
69 | printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); | ||
70 | show_regs(regs); | ||
71 | dump_stack(); | ||
72 | arch_spin_unlock(&lock); | ||
73 | cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); | ||
74 | return NOTIFY_STOP; | ||
75 | } | ||
76 | |||
77 | return NOTIFY_DONE; | ||
78 | } | ||
79 | |||
80 | static __read_mostly struct notifier_block backtrace_notifier = { | ||
81 | .notifier_call = arch_trigger_all_cpu_backtrace_handler, | ||
82 | .next = NULL, | ||
83 | .priority = 1 | ||
84 | }; | ||
85 | |||
86 | static int __init register_trigger_all_cpu_backtrace(void) | ||
87 | { | ||
88 | register_die_notifier(&backtrace_notifier); | ||
89 | return 0; | ||
90 | } | ||
91 | early_initcall(register_trigger_all_cpu_backtrace); | ||
92 | #endif | ||
93 | |||
94 | /* STUB calls to mimic old nmi_watchdog behaviour */ | ||
95 | #if defined(CONFIG_X86_LOCAL_APIC) | ||
96 | unsigned int nmi_watchdog = NMI_NONE; | ||
97 | EXPORT_SYMBOL(nmi_watchdog); | ||
98 | void acpi_nmi_enable(void) { return; } | ||
99 | void acpi_nmi_disable(void) { return; } | ||
100 | #endif | ||
101 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
102 | EXPORT_SYMBOL(nmi_active); | ||
103 | int unknown_nmi_panic; | ||
104 | void cpu_nmi_set_wd_enabled(void) { return; } | ||
105 | void stop_apic_nmi_watchdog(void *unused) { return; } | ||
106 | void setup_apic_nmi_watchdog(void *unused) { return; } | ||
107 | int __init check_nmi_watchdog(void) { return 0; } | ||
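Taken together, the new file splits the backtrace path out of the old nmi.c: the initiator copies the online mask into backtrace_mask, sends an NMI IPI to every CPU, and polls for up to ten seconds (10 * 1000 rounds of mdelay(1)) while each CPU's die-notifier dumps its registers under an arch spinlock and clears its own bit; the stub block at the bottom keeps the legacy nmi_watchdog symbols alive for code that still links against them. A hypothetical single-threaded model of the handshake (IPI delivery and the notifier chain are stubbed):

    #include <stdio.h>

    #define NCPUS 4
    static int backtrace_pending[NCPUS];     /* plays the role of backtrace_mask */

    static void nmi_handler(int cpu)         /* what the die-notifier path does */
    {
        printf("NMI backtrace for cpu %d\n", cpu);
        backtrace_pending[cpu] = 0;          /* cpumask_clear_cpu() */
    }

    int main(void)
    {
        int c, i, remaining;

        for (c = 0; c < NCPUS; c++)
            backtrace_pending[c] = 1;        /* cpumask_copy(..., online mask) */

        /* apic->send_IPI_all(NMI_VECTOR) would trigger these asynchronously. */
        for (c = 0; c < NCPUS; c++)
            nmi_handler(c);

        for (i = 0; i < 10 * 1000; i++) {    /* the kernel waits with mdelay(1) */
            remaining = 0;
            for (c = 0; c < NCPUS; c++)
                remaining |= backtrace_pending[c];
            if (!remaining)
                break;
        }
        return 0;
    }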
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index eb2789c3f721..4dc0084ec1b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -89,6 +89,9 @@ int nr_ioapics; | |||
89 | /* IO APIC gsi routing info */ | 89 | /* IO APIC gsi routing info */ |
90 | struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; | 90 | struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; |
91 | 91 | ||
92 | /* One past the highest gsi number used */ | ||
93 | u32 gsi_top; | ||
94 | |||
92 | /* MP IRQ source entries */ | 95 | /* MP IRQ source entries */ |
93 | struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; | 96 | struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; |
94 | 97 | ||
@@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx) | |||
1013 | return MPBIOS_trigger(idx); | 1016 | return MPBIOS_trigger(idx); |
1014 | } | 1017 | } |
1015 | 1018 | ||
1016 | int (*ioapic_renumber_irq)(int ioapic, int irq); | ||
1017 | static int pin_2_irq(int idx, int apic, int pin) | 1019 | static int pin_2_irq(int idx, int apic, int pin) |
1018 | { | 1020 | { |
1019 | int irq, i; | 1021 | int irq; |
1020 | int bus = mp_irqs[idx].srcbus; | 1022 | int bus = mp_irqs[idx].srcbus; |
1021 | 1023 | ||
1022 | /* | 1024 | /* |
@@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin) | |||
1028 | if (test_bit(bus, mp_bus_not_pci)) { | 1030 | if (test_bit(bus, mp_bus_not_pci)) { |
1029 | irq = mp_irqs[idx].srcbusirq; | 1031 | irq = mp_irqs[idx].srcbusirq; |
1030 | } else { | 1032 | } else { |
1031 | /* | 1033 | u32 gsi = mp_gsi_routing[apic].gsi_base + pin; |
1032 | * PCI IRQs are mapped in order | 1034 | |
1033 | */ | 1035 | if (gsi >= NR_IRQS_LEGACY) |
1034 | i = irq = 0; | 1036 | irq = gsi; |
1035 | while (i < apic) | 1037 | else |
1036 | irq += nr_ioapic_registers[i++]; | 1038 | irq = gsi_top + gsi; |
1037 | irq += pin; | ||
1038 | /* | ||
1039 | * For MPS mode, so far only needed by ES7000 platform | ||
1040 | */ | ||
1041 | if (ioapic_renumber_irq) | ||
1042 | irq = ioapic_renumber_irq(apic, irq); | ||
1043 | } | 1039 | } |
1044 | 1040 | ||
1045 | #ifdef CONFIG_X86_32 | 1041 | #ifdef CONFIG_X86_32 |
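With the per-IOAPIC register walk removed, a PCI pin now maps straight to its GSI (gsi_base + pin), and the few GSIs that fall below NR_IRQS_LEGACY are relocated above gsi_top so they cannot collide with the ISA IRQ numbers. A stand-alone illustration, assuming NR_IRQS_LEGACY == 16 and a single 24-pin IO-APIC at gsi_base 0, so gsi_top == 24:

    #include <stdio.h>

    #define NR_IRQS_LEGACY 16
    static unsigned gsi_top = 24;            /* one past the highest GSI */

    static unsigned pin_to_irq(unsigned gsi_base, unsigned pin)
    {
        unsigned gsi = gsi_base + pin;

        /* Legacy GSIs keep their numbers for ISA; PCI pins landing
         * there are shifted above every known GSI instead. */
        return gsi >= NR_IRQS_LEGACY ? gsi : gsi_top + gsi;
    }

    int main(void)
    {
        printf("pin 2  -> irq %u\n", pin_to_irq(0, 2));   /* 24 + 2 = 26 */
        printf("pin 19 -> irq %u\n", pin_to_irq(0, 19));  /* stays 19 */
        return 0;
    }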
@@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | |||
1950 | 1946 | ||
1951 | void __init enable_IO_APIC(void) | 1947 | void __init enable_IO_APIC(void) |
1952 | { | 1948 | { |
1953 | union IO_APIC_reg_01 reg_01; | ||
1954 | int i8259_apic, i8259_pin; | 1949 | int i8259_apic, i8259_pin; |
1955 | int apic; | 1950 | int apic; |
1956 | unsigned long flags; | ||
1957 | |||
1958 | /* | ||
1959 | * The number of IO-APIC IRQ registers (== #pins): | ||
1960 | */ | ||
1961 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1962 | raw_spin_lock_irqsave(&ioapic_lock, flags); | ||
1963 | reg_01.raw = io_apic_read(apic, 1); | ||
1964 | raw_spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1965 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; | ||
1966 | } | ||
1967 | 1951 | ||
1968 | if (!legacy_pic->nr_legacy_irqs) | 1952 | if (!legacy_pic->nr_legacy_irqs) |
1969 | return; | 1953 | return; |
@@ -3413,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
3413 | 3397 | ||
3414 | cfg = desc->chip_data; | 3398 | cfg = desc->chip_data; |
3415 | 3399 | ||
3416 | read_msi_msg_desc(desc, &msg); | 3400 | get_cached_msi_msg_desc(desc, &msg); |
3417 | 3401 | ||
3418 | msg.data &= ~MSI_DATA_VECTOR_MASK; | 3402 | msg.data &= ~MSI_DATA_VECTOR_MASK; |
3419 | msg.data |= MSI_DATA_VECTOR(cfg->vector); | 3403 | msg.data |= MSI_DATA_VECTOR(cfg->vector); |
@@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic) | |||
3858 | reg_01.raw = io_apic_read(ioapic, 1); | 3842 | reg_01.raw = io_apic_read(ioapic, 1); |
3859 | raw_spin_unlock_irqrestore(&ioapic_lock, flags); | 3843 | raw_spin_unlock_irqrestore(&ioapic_lock, flags); |
3860 | 3844 | ||
3861 | return reg_01.bits.entries; | 3845 | /* The register returns the maximum redirection entry index |
3846 | * supported, which is one less than the total number of redir | ||
3847 | * entries. | ||
3848 | */ | ||
3849 | return reg_01.bits.entries + 1; | ||
3862 | } | 3850 | } |
3863 | 3851 | ||
3864 | void __init probe_nr_irqs_gsi(void) | 3852 | void __init probe_nr_irqs_gsi(void) |
3865 | { | 3853 | { |
3866 | int nr = 0; | 3854 | int nr; |
3867 | 3855 | ||
3868 | nr = acpi_probe_gsi(); | 3856 | nr = gsi_top + NR_IRQS_LEGACY; |
3869 | if (nr > nr_irqs_gsi) { | 3857 | if (nr > nr_irqs_gsi) |
3870 | nr_irqs_gsi = nr; | 3858 | nr_irqs_gsi = nr; |
3871 | } else { | ||
3872 | /* for acpi=off or acpi is not compiled in */ | ||
3873 | int idx; | ||
3874 | |||
3875 | nr = 0; | ||
3876 | for (idx = 0; idx < nr_ioapics; idx++) | ||
3877 | nr += io_apic_get_redir_entries(idx) + 1; | ||
3878 | |||
3879 | if (nr > nr_irqs_gsi) | ||
3880 | nr_irqs_gsi = nr; | ||
3881 | } | ||
3882 | 3859 | ||
3883 | printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); | 3860 | printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); |
3884 | } | 3861 | } |
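Note the paired change here: io_apic_get_redir_entries() now returns the entry count itself instead of the hardware's maximum-index field, and probe_nr_irqs_gsi() collapses to gsi_top plus the legacy block that may be remapped above it. Worked through for one hypothetical 24-pin IO-APIC (assuming NR_IRQS_LEGACY == 16):

    #include <stdio.h>

    int main(void)
    {
        unsigned entries_field = 23;                 /* reg_01.bits.entries, 24-pin IO-APIC */
        unsigned redir_entries = entries_field + 1;  /* what the helper now returns */
        unsigned gsi_top = 0 + redir_entries;        /* single IO-APIC at gsi_base 0 */
        unsigned nr_irqs_gsi = gsi_top + 16;         /* + NR_IRQS_LEGACY */

        printf("nr_irqs_gsi: %u\n", nr_irqs_gsi);    /* 40 */
        return 0;
    }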
@@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic) | |||
4085 | return reg_01.bits.version; | 4062 | return reg_01.bits.version; |
4086 | } | 4063 | } |
4087 | 4064 | ||
4088 | int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | 4065 | int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) |
4089 | { | 4066 | { |
4090 | int i; | 4067 | int ioapic, pin, idx; |
4091 | 4068 | ||
4092 | if (skip_ioapic_setup) | 4069 | if (skip_ioapic_setup) |
4093 | return -1; | 4070 | return -1; |
4094 | 4071 | ||
4095 | for (i = 0; i < mp_irq_entries; i++) | 4072 | ioapic = mp_find_ioapic(gsi); |
4096 | if (mp_irqs[i].irqtype == mp_INT && | 4073 | if (ioapic < 0) |
4097 | mp_irqs[i].srcbusirq == bus_irq) | ||
4098 | break; | ||
4099 | if (i >= mp_irq_entries) | ||
4100 | return -1; | 4074 | return -1; |
4101 | 4075 | ||
4102 | *trigger = irq_trigger(i); | 4076 | pin = mp_find_ioapic_pin(ioapic, gsi); |
4103 | *polarity = irq_polarity(i); | 4077 | if (pin < 0) |
4078 | return -1; | ||
4079 | |||
4080 | idx = find_irq_entry(ioapic, pin, mp_INT); | ||
4081 | if (idx < 0) | ||
4082 | return -1; | ||
4083 | |||
4084 | *trigger = irq_trigger(idx); | ||
4085 | *polarity = irq_polarity(idx); | ||
4104 | return 0; | 4086 | return 0; |
4105 | } | 4087 | } |
4106 | 4088 | ||
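acpi_get_override_irq() now takes the GSI itself and resolves it in three steps — GSI to IO-APIC, GSI to pin within that IO-APIC, pin to MP interrupt entry — rather than scanning mp_irqs for a matching bus IRQ. A hypothetical sketch of the first two steps against a stubbed routing table:

    #include <stdio.h>

    struct gsi_range { unsigned base, end; };            /* inclusive, per IO-APIC */
    static struct gsi_range ioapics[] = { { 0, 23 }, { 24, 47 } };

    static int find_ioapic(unsigned gsi)                 /* ~ mp_find_ioapic() */
    {
        for (unsigned i = 0; i < sizeof(ioapics) / sizeof(ioapics[0]); i++)
            if (gsi >= ioapics[i].base && gsi <= ioapics[i].end)
                return (int)i;
        return -1;
    }

    int main(void)
    {
        unsigned gsi = 30;
        int apic = find_ioapic(gsi);

        if (apic >= 0)                                   /* ~ mp_find_ioapic_pin() */
            printf("gsi %u -> ioapic %d, pin %u\n",
                   gsi, apic, gsi - ioapics[apic].base);
        return 0;
    }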
@@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void) | |||
4241 | } | 4223 | } |
4242 | } | 4224 | } |
4243 | 4225 | ||
4244 | int mp_find_ioapic(int gsi) | 4226 | int mp_find_ioapic(u32 gsi) |
4245 | { | 4227 | { |
4246 | int i = 0; | 4228 | int i = 0; |
4247 | 4229 | ||
@@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi) | |||
4256 | return -1; | 4238 | return -1; |
4257 | } | 4239 | } |
4258 | 4240 | ||
4259 | int mp_find_ioapic_pin(int ioapic, int gsi) | 4241 | int mp_find_ioapic_pin(int ioapic, u32 gsi) |
4260 | { | 4242 | { |
4261 | if (WARN_ON(ioapic == -1)) | 4243 | if (WARN_ON(ioapic == -1)) |
4262 | return -1; | 4244 | return -1; |
@@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address) | |||
4284 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | 4266 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) |
4285 | { | 4267 | { |
4286 | int idx = 0; | 4268 | int idx = 0; |
4269 | int entries; | ||
4287 | 4270 | ||
4288 | if (bad_ioapic(address)) | 4271 | if (bad_ioapic(address)) |
4289 | return; | 4272 | return; |
@@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | |||
4302 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups | 4285 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups |
4303 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). | 4286 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). |
4304 | */ | 4287 | */ |
4288 | entries = io_apic_get_redir_entries(idx); | ||
4305 | mp_gsi_routing[idx].gsi_base = gsi_base; | 4289 | mp_gsi_routing[idx].gsi_base = gsi_base; |
4306 | mp_gsi_routing[idx].gsi_end = gsi_base + | 4290 | mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; |
4307 | io_apic_get_redir_entries(idx); | 4291 | |
4292 | /* | ||
4293 | * The number of IO-APIC IRQ registers (== #pins): | ||
4294 | */ | ||
4295 | nr_ioapic_registers[idx] = entries; | ||
4296 | |||
4297 | if (mp_gsi_routing[idx].gsi_end >= gsi_top) | ||
4298 | gsi_top = mp_gsi_routing[idx].gsi_end + 1; | ||
4308 | 4299 | ||
4309 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | 4300 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " |
4310 | "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, | 4301 | "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, |
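mp_register_ioapic() now reads the pin count once, records an inclusive [gsi_base, gsi_end] range, and advances gsi_top to one past the highest GSI seen — the value the lookup and remapping helpers above depend on. Worked through for two hypothetical 24-pin IO-APICs:

    #include <stdio.h>

    int main(void)
    {
        unsigned gsi_top = 0;
        unsigned bases[] = { 0, 24 };
        unsigned entries = 24;                /* pins per IO-APIC */

        for (int idx = 0; idx < 2; idx++) {
            unsigned gsi_end = bases[idx] + entries - 1;

            if (gsi_end >= gsi_top)
                gsi_top = gsi_end + 1;
            printf("IOAPIC[%d]: GSI %u-%u\n", idx, bases[idx], gsi_end);
        }
        printf("gsi_top = %u\n", gsi_top);    /* 48 */
        return 0;
    }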
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 1edaf15c0b8e..a43f71cb30f8 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c | |||
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | |||
401 | int cpu = smp_processor_id(); | 401 | int cpu = smp_processor_id(); |
402 | int rc = 0; | 402 | int rc = 0; |
403 | 403 | ||
404 | /* check for other users first */ | ||
405 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
406 | == NOTIFY_STOP) { | ||
407 | rc = 1; | ||
408 | touched = 1; | ||
409 | } | ||
410 | |||
411 | sum = get_timer_irqs(cpu); | 404 | sum = get_timer_irqs(cpu); |
412 | 405 | ||
413 | if (__get_cpu_var(nmi_touch)) { | 406 | if (__get_cpu_var(nmi_touch)) { |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c085d52dbaf2..e46f98f36e31 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -735,9 +735,6 @@ void __init uv_system_init(void) | |||
735 | uv_node_to_blade[nid] = blade; | 735 | uv_node_to_blade[nid] = blade; |
736 | uv_cpu_to_blade[cpu] = blade; | 736 | uv_cpu_to_blade[cpu] = blade; |
737 | max_pnode = max(pnode, max_pnode); | 737 | max_pnode = max(pnode, max_pnode); |
738 | |||
739 | printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", | ||
740 | cpu, apicid, pnode, nid, lcpu, blade); | ||
741 | } | 738 | } |
742 | 739 | ||
743 | /* Add blade/pnode info for nodes without cpus */ | 740 | /* Add blade/pnode info for nodes without cpus */ |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 031aa887b0eb..4c9c67bf09b7 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -140,7 +140,7 @@ | |||
140 | * is now the way life works). | 140 | * is now the way life works). |
141 | * Fix thinko in suspend() (wrong return). | 141 | * Fix thinko in suspend() (wrong return). |
142 | * Notify drivers on critical suspend. | 142 | * Notify drivers on critical suspend. |
143 | * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> | 143 | * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz> |
144 | * modified by sfr). | 144 | * modified by sfr). |
145 | * Disable interrupts while we are suspended (Andy Henroid | 145 | * Disable interrupts while we are suspended (Andy Henroid |
146 | * <andy_henroid@yahoo.com> fixed by sfr). | 146 | * <andy_henroid@yahoo.com> fixed by sfr). |
@@ -1224,7 +1224,7 @@ static void reinit_timer(void) | |||
1224 | #ifdef INIT_TIMER_AFTER_SUSPEND | 1224 | #ifdef INIT_TIMER_AFTER_SUSPEND |
1225 | unsigned long flags; | 1225 | unsigned long flags; |
1226 | 1226 | ||
1227 | spin_lock_irqsave(&i8253_lock, flags); | 1227 | raw_spin_lock_irqsave(&i8253_lock, flags); |
1228 | /* set the clock to HZ */ | 1228 | /* set the clock to HZ */ |
1229 | outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ | 1229 | outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ |
1230 | udelay(10); | 1230 | udelay(10); |
@@ -1232,7 +1232,7 @@ static void reinit_timer(void) | |||
1232 | udelay(10); | 1232 | udelay(10); |
1233 | outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ | 1233 | outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ |
1234 | udelay(10); | 1234 | udelay(10); |
1235 | spin_unlock_irqrestore(&i8253_lock, flags); | 1235 | raw_spin_unlock_irqrestore(&i8253_lock, flags); |
1236 | #endif | 1236 | #endif |
1237 | } | 1237 | } |
1238 | 1238 | ||
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c202b62f3671..3f0ebe429a01 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -12,11 +12,11 @@ endif | |||
12 | nostackp := $(call cc-option, -fno-stack-protector) | 12 | nostackp := $(call cc-option, -fno-stack-protector) |
13 | CFLAGS_common.o := $(nostackp) | 13 | CFLAGS_common.o := $(nostackp) |
14 | 14 | ||
15 | obj-y := intel_cacheinfo.o addon_cpuid_features.o | 15 | obj-y := intel_cacheinfo.o scattered.o topology.o |
16 | obj-y += proc.o capflags.o powerflags.o common.o | 16 | obj-y += proc.o capflags.o powerflags.o common.o |
17 | obj-y += vmware.o hypervisor.o sched.o | 17 | obj-y += vmware.o hypervisor.o sched.o mshyperv.o |
18 | 18 | ||
19 | obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o | 19 | obj-$(CONFIG_X86_32) += bugs.o |
20 | obj-$(CONFIG_X86_64) += bugs_64.o | 20 | obj-$(CONFIG_X86_64) += bugs_64.o |
21 | 21 | ||
22 | obj-$(CONFIG_CPU_SUP_INTEL) += intel.o | 22 | obj-$(CONFIG_CPU_SUP_INTEL) += intel.o |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d2..60a57b13082d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
466 | } | 466 | } |
467 | 467 | ||
468 | } | 468 | } |
469 | if (c->x86 == 0x10 || c->x86 == 0x11) | 469 | if (c->x86 >= 0x10) |
470 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 470 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
471 | 471 | ||
472 | /* get apicid instead of initial apic id from cpuid */ | 472 | /* get apicid instead of initial apic id from cpuid */ |
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
529 | num_cache_leaves = 3; | 529 | num_cache_leaves = 3; |
530 | } | 530 | } |
531 | 531 | ||
532 | if (c->x86 >= 0xf && c->x86 <= 0x11) | 532 | if (c->x86 >= 0xf) |
533 | set_cpu_cap(c, X86_FEATURE_K8); | 533 | set_cpu_cap(c, X86_FEATURE_K8); |
534 | 534 | ||
535 | if (cpu_has_xmm2) { | 535 | if (cpu_has_xmm2) { |
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
546 | fam10h_check_enable_mmcfg(); | 546 | fam10h_check_enable_mmcfg(); |
547 | } | 547 | } |
548 | 548 | ||
549 | if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { | 549 | if (c == &boot_cpu_data && c->x86 >= 0xf) { |
550 | unsigned long long tseg; | 550 | unsigned long long tseg; |
551 | 551 | ||
552 | /* | 552 | /* |
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { | |||
609 | }; | 609 | }; |
610 | 610 | ||
611 | cpu_dev_register(amd_cpu_dev); | 611 | cpu_dev_register(amd_cpu_dev); |
612 | |||
613 | /* | ||
614 | * AMD errata checking | ||
615 | * | ||
616 | * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or | ||
617 | * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that | ||
618 | * have an OSVW id assigned, which it takes as its first argument. Both take a | ||
619 | * variable number of family-specific model-stepping ranges created by | ||
620 | * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const | ||
621 | * int[] in arch/x86/include/asm/processor.h. | ||
622 | * | ||
623 | * Example: | ||
624 | * | ||
625 | * const int amd_erratum_319[] = | ||
626 | * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), | ||
627 | * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), | ||
628 | * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); | ||
629 | */ | ||
630 | |||
631 | const int amd_erratum_400[] = | ||
632 | AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), | ||
633 | AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); | ||
634 | EXPORT_SYMBOL_GPL(amd_erratum_400); | ||
635 | |||
636 | const int amd_erratum_383[] = | ||
637 | AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); | ||
638 | EXPORT_SYMBOL_GPL(amd_erratum_383); | ||
639 | |||
640 | bool cpu_has_amd_erratum(const int *erratum) | ||
641 | { | ||
642 | struct cpuinfo_x86 *cpu = ¤t_cpu_data; | ||
643 | int osvw_id = *erratum++; | ||
644 | u32 range; | ||
645 | u32 ms; | ||
646 | |||
647 | /* | ||
648 | * If called early enough that current_cpu_data hasn't been initialized | ||
649 | * yet, fall back to boot_cpu_data. | ||
650 | */ | ||
651 | if (cpu->x86 == 0) | ||
652 | cpu = &boot_cpu_data; | ||
653 | |||
654 | if (cpu->x86_vendor != X86_VENDOR_AMD) | ||
655 | return false; | ||
656 | |||
657 | if (osvw_id >= 0 && osvw_id < 65536 && | ||
658 | cpu_has(cpu, X86_FEATURE_OSVW)) { | ||
659 | u64 osvw_len; | ||
660 | |||
661 | rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); | ||
662 | if (osvw_id < osvw_len) { | ||
663 | u64 osvw_bits; | ||
664 | |||
665 | rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), | ||
666 | osvw_bits); | ||
667 | return osvw_bits & (1ULL << (osvw_id & 0x3f)); | ||
668 | } | ||
669 | } | ||
670 | |||
671 | /* OSVW unavailable or ID unknown, match family-model-stepping range */ | ||
672 | ms = (cpu->x86_model << 8) | cpu->x86_mask; | ||
673 | while ((range = *erratum++)) | ||
674 | if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && | ||
675 | (ms >= AMD_MODEL_RANGE_START(range)) && | ||
676 | (ms <= AMD_MODEL_RANGE_END(range))) | ||
677 | return true; | ||
678 | |||
679 | return false; | ||
680 | } | ||
681 | |||
682 | EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); | ||
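When OSVW is unavailable or the ID is out of range, cpu_has_amd_erratum() falls back to matching packed family/model/stepping ranges, with ms = (model << 8) | stepping compared against each range's bounds. The sketch below mirrors that fallback using an explicit struct rather than the kernel's packed-int AMD_MODEL_RANGE() encoding (the OSVW MSR path is omitted; the layout here is illustrative only):

    #include <stdio.h>
    #include <stdbool.h>

    struct model_range {                 /* stands in for AMD_MODEL_RANGE() */
        unsigned family;
        unsigned start_ms, end_ms;       /* (model << 8) | stepping, as above */
    };

    static bool has_erratum(unsigned family, unsigned model, unsigned stepping,
                            const struct model_range *r, int nr)
    {
        unsigned ms = (model << 8) | stepping;

        for (int i = 0; i < nr; i++)
            if (family == r[i].family &&
                ms >= r[i].start_ms && ms <= r[i].end_ms)
                return true;
        return false;
    }

    int main(void)
    {
        /* Roughly the second range of amd_erratum_400: family 0x10,
         * model 0x2 stepping 0x1 through model 0xff stepping 0xf. */
        struct model_range e400[] = {
            { 0x10, (0x02 << 8) | 0x1, (0xff << 8) | 0xf },
        };

        printf("fam 0x10, model 0x4, step 0 affected: %d\n",
               has_erratum(0x10, 0x04, 0x0, e400, 1));   /* 1 = affected */
        return 0;
    }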
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 01a265212395..c39576cb3018 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c | |||
@@ -86,7 +86,7 @@ static void __init check_fpu(void) | |||
86 | 86 | ||
87 | static void __init check_hlt(void) | 87 | static void __init check_hlt(void) |
88 | { | 88 | { |
89 | if (paravirt_enabled()) | 89 | if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) |
90 | return; | 90 | return; |
91 | 91 | ||
92 | printk(KERN_INFO "Checking 'hlt' instruction... "); | 92 | printk(KERN_INFO "Checking 'hlt' instruction... "); |
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c deleted file mode 100644 index 2056ccf572cc..000000000000 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ /dev/null | |||
@@ -1,72 +0,0 @@ | |||
1 | /* | ||
2 | * cmpxchg*() fallbacks for CPU not supporting these instructions | ||
3 | */ | ||
4 | |||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/smp.h> | ||
7 | #include <linux/module.h> | ||
8 | |||
9 | #ifndef CONFIG_X86_CMPXCHG | ||
10 | unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) | ||
11 | { | ||
12 | u8 prev; | ||
13 | unsigned long flags; | ||
14 | |||
15 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
16 | local_irq_save(flags); | ||
17 | prev = *(u8 *)ptr; | ||
18 | if (prev == old) | ||
19 | *(u8 *)ptr = new; | ||
20 | local_irq_restore(flags); | ||
21 | return prev; | ||
22 | } | ||
23 | EXPORT_SYMBOL(cmpxchg_386_u8); | ||
24 | |||
25 | unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) | ||
26 | { | ||
27 | u16 prev; | ||
28 | unsigned long flags; | ||
29 | |||
30 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
31 | local_irq_save(flags); | ||
32 | prev = *(u16 *)ptr; | ||
33 | if (prev == old) | ||
34 | *(u16 *)ptr = new; | ||
35 | local_irq_restore(flags); | ||
36 | return prev; | ||
37 | } | ||
38 | EXPORT_SYMBOL(cmpxchg_386_u16); | ||
39 | |||
40 | unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) | ||
41 | { | ||
42 | u32 prev; | ||
43 | unsigned long flags; | ||
44 | |||
45 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
46 | local_irq_save(flags); | ||
47 | prev = *(u32 *)ptr; | ||
48 | if (prev == old) | ||
49 | *(u32 *)ptr = new; | ||
50 | local_irq_restore(flags); | ||
51 | return prev; | ||
52 | } | ||
53 | EXPORT_SYMBOL(cmpxchg_386_u32); | ||
54 | #endif | ||
55 | |||
56 | #ifndef CONFIG_X86_CMPXCHG64 | ||
57 | unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) | ||
58 | { | ||
59 | u64 prev; | ||
60 | unsigned long flags; | ||
61 | |||
62 | /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ | ||
63 | local_irq_save(flags); | ||
64 | prev = *(u64 *)ptr; | ||
65 | if (prev == old) | ||
66 | *(u64 *)ptr = new; | ||
67 | local_irq_restore(flags); | ||
68 | return prev; | ||
69 | } | ||
70 | EXPORT_SYMBOL(cmpxchg_486_u64); | ||
71 | #endif | ||
72 | |||
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4868e4a951ee..490dac63c2d2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); | |||
140 | static int __init x86_xsave_setup(char *s) | 140 | static int __init x86_xsave_setup(char *s) |
141 | { | 141 | { |
142 | setup_clear_cpu_cap(X86_FEATURE_XSAVE); | 142 | setup_clear_cpu_cap(X86_FEATURE_XSAVE); |
143 | setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); | ||
143 | return 1; | 144 | return 1; |
144 | } | 145 | } |
145 | __setup("noxsave", x86_xsave_setup); | 146 | __setup("noxsave", x86_xsave_setup); |
146 | 147 | ||
148 | static int __init x86_xsaveopt_setup(char *s) | ||
149 | { | ||
150 | setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); | ||
151 | return 1; | ||
152 | } | ||
153 | __setup("noxsaveopt", x86_xsaveopt_setup); | ||
154 | |||
147 | #ifdef CONFIG_X86_32 | 155 | #ifdef CONFIG_X86_32 |
148 | static int cachesize_override __cpuinitdata = -1; | 156 | static int cachesize_override __cpuinitdata = -1; |
149 | static int disable_x86_serial_nr __cpuinitdata = 1; | 157 | static int disable_x86_serial_nr __cpuinitdata = 1; |
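The net effect of the two handlers: "noxsave" now masks XSAVEOPT along with XSAVE (XSAVEOPT is meaningless without it), while the new "noxsaveopt" parameter disables only the optimized save path. On the kernel command line that would look like this (illustrative entries; the image path and root= are placeholders):

    vmlinuz root=/dev/sda1 noxsave        # clears XSAVE and XSAVEOPT
    vmlinuz root=/dev/sda1 noxsaveopt     # clears XSAVEOPT only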
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) | |||
551 | c->x86_capability[4] = excap; | 559 | c->x86_capability[4] = excap; |
552 | } | 560 | } |
553 | 561 | ||
562 | /* Additional Intel-defined flags: level 0x00000007 */ | ||
563 | if (c->cpuid_level >= 0x00000007) { | ||
564 | u32 eax, ebx, ecx, edx; | ||
565 | |||
566 | cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); | ||
567 | |||
568 | if (eax > 0) | ||
569 | c->x86_capability[9] = ebx; | ||
570 | } | ||
571 | |||
554 | /* AMD-defined flags: level 0x80000001 */ | 572 | /* AMD-defined flags: level 0x80000001 */ |
555 | xlvl = cpuid_eax(0x80000000); | 573 | xlvl = cpuid_eax(0x80000000); |
556 | c->extended_cpuid_level = xlvl; | 574 | c->extended_cpuid_level = xlvl; |
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) | |||
576 | if (c->extended_cpuid_level >= 0x80000007) | 594 | if (c->extended_cpuid_level >= 0x80000007) |
577 | c->x86_power = cpuid_edx(0x80000007); | 595 | c->x86_power = cpuid_edx(0x80000007); |
578 | 596 | ||
597 | init_scattered_cpuid_features(c); | ||
579 | } | 598 | } |
580 | 599 | ||
581 | static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) | 600 | static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) |
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) | |||
731 | 750 | ||
732 | get_model_name(c); /* Default name */ | 751 | get_model_name(c); /* Default name */ |
733 | 752 | ||
734 | init_scattered_cpuid_features(c); | ||
735 | detect_nopl(c); | 753 | detect_nopl(c); |
736 | } | 754 | } |
737 | 755 | ||
@@ -1084,6 +1102,20 @@ static void clear_all_debug_regs(void) | |||
1084 | } | 1102 | } |
1085 | } | 1103 | } |
1086 | 1104 | ||
1105 | #ifdef CONFIG_KGDB | ||
1106 | /* | ||
1107 | * Restore debug registers if kgdbwait was used and a kernel debugger | ||
1108 | * connection has been established. | ||
1109 | */ | ||
1110 | static void dbg_restore_debug_regs(void) | ||
1111 | { | ||
1112 | if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) | ||
1113 | arch_kgdb_ops.correct_hw_break(); | ||
1114 | } | ||
1115 | #else /* ! CONFIG_KGDB */ | ||
1116 | #define dbg_restore_debug_regs() | ||
1117 | #endif /* ! CONFIG_KGDB */ | ||
1118 | |||
1087 | /* | 1119 | /* |
1088 | * cpu_init() initializes state that is per-CPU. Some data is already | 1120 | * cpu_init() initializes state that is per-CPU. Some data is already |
1089 | * initialized (naturally) in the bootstrap process, such as the GDT | 1121 | * initialized (naturally) in the bootstrap process, such as the GDT |
@@ -1107,9 +1139,9 @@ void __cpuinit cpu_init(void) | |||
1107 | oist = &per_cpu(orig_ist, cpu); | 1139 | oist = &per_cpu(orig_ist, cpu); |
1108 | 1140 | ||
1109 | #ifdef CONFIG_NUMA | 1141 | #ifdef CONFIG_NUMA |
1110 | if (cpu != 0 && percpu_read(node_number) == 0 && | 1142 | if (cpu != 0 && percpu_read(numa_node) == 0 && |
1111 | cpu_to_node(cpu) != NUMA_NO_NODE) | 1143 | early_cpu_to_node(cpu) != NUMA_NO_NODE) |
1112 | percpu_write(node_number, cpu_to_node(cpu)); | 1144 | set_numa_node(early_cpu_to_node(cpu)); |
1113 | #endif | 1145 | #endif |
1114 | 1146 | ||
1115 | me = current; | 1147 | me = current; |
@@ -1174,20 +1206,11 @@ void __cpuinit cpu_init(void) | |||
1174 | load_TR_desc(); | 1206 | load_TR_desc(); |
1175 | load_LDT(&init_mm.context); | 1207 | load_LDT(&init_mm.context); |
1176 | 1208 | ||
1177 | #ifdef CONFIG_KGDB | 1209 | clear_all_debug_regs(); |
1178 | /* | 1210 | dbg_restore_debug_regs(); |
1179 | * If the kgdb is connected no debug regs should be altered. This | ||
1180 | * is only applicable when KGDB and a KGDB I/O module are built | ||
1181 | * into the kernel and you are using early debugging with | ||
1182 | * kgdbwait. KGDB will control the kernel HW breakpoint registers. | ||
1183 | */ | ||
1184 | if (kgdb_connected && arch_kgdb_ops.correct_hw_break) | ||
1185 | arch_kgdb_ops.correct_hw_break(); | ||
1186 | else | ||
1187 | #endif | ||
1188 | clear_all_debug_regs(); | ||
1189 | 1211 | ||
1190 | fpu_init(); | 1212 | fpu_init(); |
1213 | xsave_init(); | ||
1191 | 1214 | ||
1192 | raw_local_save_flags(kernel_eflags); | 1215 | raw_local_save_flags(kernel_eflags); |
1193 | 1216 | ||
@@ -1239,23 +1262,16 @@ void __cpuinit cpu_init(void) | |||
1239 | #endif | 1262 | #endif |
1240 | 1263 | ||
1241 | clear_all_debug_regs(); | 1264 | clear_all_debug_regs(); |
1265 | dbg_restore_debug_regs(); | ||
1242 | 1266 | ||
1243 | /* | 1267 | /* |
1244 | * Force FPU initialization: | 1268 | * Force FPU initialization: |
1245 | */ | 1269 | */ |
1246 | if (cpu_has_xsave) | 1270 | current_thread_info()->status = 0; |
1247 | current_thread_info()->status = TS_XSAVE; | ||
1248 | else | ||
1249 | current_thread_info()->status = 0; | ||
1250 | clear_used_math(); | 1271 | clear_used_math(); |
1251 | mxcsr_feature_mask_init(); | 1272 | mxcsr_feature_mask_init(); |
1252 | 1273 | ||
1253 | /* | 1274 | fpu_init(); |
1254 | * Boot processor to setup the FP and extended state context info. | ||
1255 | */ | ||
1256 | if (smp_processor_id() == boot_cpu_id) | ||
1257 | init_thread_xstate(); | ||
1258 | |||
1259 | xsave_init(); | 1275 | xsave_init(); |
1260 | } | 1276 | } |
1261 | #endif | 1277 | #endif |
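get_cpu_cap() now also captures the structured extended feature flags, CPUID.(EAX=7,ECX=0):EBX, as capability word 9, and init_scattered_cpuid_features() moves here so every path through get_cpu_cap() picks up the scattered bits. A user-space sketch of the same leaf-7 query (GCC-style inline asm, x86 only; the maximum-leaf guard mirrors the cpuid_level check above):

    #include <stdio.h>

    static void cpuid_count(unsigned leaf, unsigned sub,
                            unsigned *a, unsigned *b, unsigned *c, unsigned *d)
    {
        __asm__ volatile("cpuid"
                         : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                         : "a"(leaf), "c"(sub));
    }

    int main(void)
    {
        unsigned a, b, c, d;

        cpuid_count(0, 0, &a, &b, &c, &d);     /* EAX = maximum standard leaf */
        if (a >= 7) {
            cpuid_count(7, 0, &a, &b, &c, &d);
            printf("feature word 9 (leaf 7 ebx) = %#x\n", b);
        }
        return 0;
    }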
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 1840c0a5170b..bd54bf67e6fb 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile | |||
@@ -2,8 +2,8 @@ | |||
2 | # K8 systems. ACPI is preferred to all other hardware-specific drivers. | 2 | # K8 systems. ACPI is preferred to all other hardware-specific drivers. |
3 | # speedstep-* is preferred over p4-clockmod. | 3 | # speedstep-* is preferred over p4-clockmod. |
4 | 4 | ||
5 | obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o | 5 | obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o |
6 | obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o | 6 | obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o |
7 | obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o | 7 | obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o |
8 | obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o | 8 | obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o |
9 | obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o | 9 | obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o |
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 459168083b77..246cd3afbb5f 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -34,7 +34,6 @@ | |||
34 | #include <linux/compiler.h> | 34 | #include <linux/compiler.h> |
35 | #include <linux/dmi.h> | 35 | #include <linux/dmi.h> |
36 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
37 | #include <trace/events/power.h> | ||
38 | 37 | ||
39 | #include <linux/acpi.h> | 38 | #include <linux/acpi.h> |
40 | #include <linux/io.h> | 39 | #include <linux/io.h> |
@@ -46,6 +45,7 @@ | |||
46 | #include <asm/msr.h> | 45 | #include <asm/msr.h> |
47 | #include <asm/processor.h> | 46 | #include <asm/processor.h> |
48 | #include <asm/cpufeature.h> | 47 | #include <asm/cpufeature.h> |
48 | #include "mperf.h" | ||
49 | 49 | ||
50 | #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ | 50 | #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ |
51 | "acpi-cpufreq", msg) | 51 | "acpi-cpufreq", msg) |
@@ -71,8 +71,6 @@ struct acpi_cpufreq_data { | |||
71 | 71 | ||
72 | static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); | 72 | static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); |
73 | 73 | ||
74 | static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); | ||
75 | |||
76 | /* acpi_perf_data is a pointer to percpu data. */ | 74 | /* acpi_perf_data is a pointer to percpu data. */ |
77 | static struct acpi_processor_performance *acpi_perf_data; | 75 | static struct acpi_processor_performance *acpi_perf_data; |
78 | 76 | ||
@@ -240,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask) | |||
240 | return cmd.val; | 238 | return cmd.val; |
241 | } | 239 | } |
242 | 240 | ||
243 | /* Called via smp_call_function_single(), on the target CPU */ | ||
244 | static void read_measured_perf_ctrs(void *_cur) | ||
245 | { | ||
246 | struct aperfmperf *am = _cur; | ||
247 | |||
248 | get_aperfmperf(am); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Return the measured active (C0) frequency on this CPU since last call | ||
253 | * to this function. | ||
254 | * Input: cpu number | ||
255 | * Return: Average CPU frequency in terms of max frequency (zero on error) | ||
256 | * | ||
257 | * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance | ||
258 | * over a period of time, while CPU is in C0 state. | ||
259 | * IA32_MPERF counts at the rate of max advertised frequency | ||
260 | * IA32_APERF counts at the rate of actual CPU frequency | ||
261 | * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and | ||
262 | * no meaning should be associated with absolute values of these MSRs. | ||
263 | */ | ||
264 | static unsigned int get_measured_perf(struct cpufreq_policy *policy, | ||
265 | unsigned int cpu) | ||
266 | { | ||
267 | struct aperfmperf perf; | ||
268 | unsigned long ratio; | ||
269 | unsigned int retval; | ||
270 | |||
271 | if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) | ||
272 | return 0; | ||
273 | |||
274 | ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); | ||
275 | per_cpu(acfreq_old_perf, cpu) = perf; | ||
276 | |||
277 | retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; | ||
278 | |||
279 | return retval; | ||
280 | } | ||
281 | |||
282 | static unsigned int get_cur_freq_on_cpu(unsigned int cpu) | 241 | static unsigned int get_cur_freq_on_cpu(unsigned int cpu) |
283 | { | 242 | { |
284 | struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); | 243 | struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); |
@@ -364,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
364 | } | 323 | } |
365 | } | 324 | } |
366 | 325 | ||
367 | trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); | ||
368 | |||
369 | switch (data->cpu_feature) { | 326 | switch (data->cpu_feature) { |
370 | case SYSTEM_INTEL_MSR_CAPABLE: | 327 | case SYSTEM_INTEL_MSR_CAPABLE: |
371 | cmd.type = SYSTEM_INTEL_MSR_CAPABLE; | 328 | cmd.type = SYSTEM_INTEL_MSR_CAPABLE; |
@@ -391,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
391 | 348 | ||
392 | freqs.old = perf->states[perf->state].core_frequency * 1000; | 349 | freqs.old = perf->states[perf->state].core_frequency * 1000; |
393 | freqs.new = data->freq_table[next_state].frequency; | 350 | freqs.new = data->freq_table[next_state].frequency; |
394 | for_each_cpu(i, cmd.mask) { | 351 | for_each_cpu(i, policy->cpus) { |
395 | freqs.cpu = i; | 352 | freqs.cpu = i; |
396 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 353 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
397 | } | 354 | } |
@@ -407,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
407 | } | 364 | } |
408 | } | 365 | } |
409 | 366 | ||
410 | for_each_cpu(i, cmd.mask) { | 367 | for_each_cpu(i, policy->cpus) { |
411 | freqs.cpu = i; | 368 | freqs.cpu = i; |
412 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 369 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
413 | } | 370 | } |
@@ -702,7 +659,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) | |||
702 | 659 | ||
703 | /* Check for APERF/MPERF support in hardware */ | 660 | /* Check for APERF/MPERF support in hardware */ |
704 | if (cpu_has(c, X86_FEATURE_APERFMPERF)) | 661 | if (cpu_has(c, X86_FEATURE_APERFMPERF)) |
705 | acpi_cpufreq_driver.getavg = get_measured_perf; | 662 | acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; |
706 | 663 | ||
707 | dprintk("CPU%u - ACPI performance management activated.\n", cpu); | 664 | dprintk("CPU%u - ACPI performance management activated.\n", cpu); |
708 | for (i = 0; i < perf->state_count; i++) | 665 | for (i = 0; i < perf->state_count; i++) |
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483be9e3..32974cf84232 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | |||
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { | |||
169 | * Low Level chipset interface * | 169 | * Low Level chipset interface * |
170 | ****************************************************************/ | 170 | ****************************************************************/ |
171 | static struct pci_device_id gx_chipset_tbl[] __initdata = { | 171 | static struct pci_device_id gx_chipset_tbl[] __initdata = { |
172 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, | 172 | { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, |
173 | PCI_ANY_ID, PCI_ANY_ID }, | 173 | { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, |
174 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, | 174 | { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, |
175 | PCI_ANY_ID, PCI_ANY_ID }, | ||
176 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, | ||
177 | PCI_ANY_ID, PCI_ANY_ID }, | ||
178 | { 0, }, | 175 | { 0, }, |
179 | }; | 176 | }; |
180 | 177 | ||
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void) | |||
199 | } | 196 | } |
200 | 197 | ||
201 | /* detect which companion chip is used */ | 198 | /* detect which companion chip is used */ |
202 | while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { | 199 | for_each_pci_dev(gx_pci) { |
203 | if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) | 200 | if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) |
204 | return gx_pci; | 201 | return gx_pci; |
205 | } | 202 | } |
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4f8261..03162dac6271 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c | |||
@@ -426,7 +426,7 @@ static int guess_fsb(int mult) | |||
426 | } | 426 | } |
427 | 427 | ||
428 | 428 | ||
429 | static int __init longhaul_get_ranges(void) | 429 | static int __cpuinit longhaul_get_ranges(void) |
430 | { | 430 | { |
431 | unsigned int i, j, k = 0; | 431 | unsigned int i, j, k = 0; |
432 | unsigned int ratio; | 432 | unsigned int ratio; |
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) | |||
530 | } | 530 | } |
531 | 531 | ||
532 | 532 | ||
533 | static void __init longhaul_setup_voltagescaling(void) | 533 | static void __cpuinit longhaul_setup_voltagescaling(void) |
534 | { | 534 | { |
535 | union msr_longhaul longhaul; | 535 | union msr_longhaul longhaul; |
536 | struct mV_pos minvid, maxvid, vid; | 536 | struct mV_pos minvid, maxvid, vid; |
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) | |||
784 | return 0; | 784 | return 0; |
785 | } | 785 | } |
786 | 786 | ||
787 | static int __init longhaul_cpu_init(struct cpufreq_policy *policy) | 787 | static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) |
788 | { | 788 | { |
789 | struct cpuinfo_x86 *c = &cpu_data(0); | 789 | struct cpuinfo_x86 *c = &cpu_data(0); |
790 | char *cpuname = NULL; | 790 | char *cpuname = NULL; |
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a469f79..cbf48fbca881 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h | |||
@@ -56,7 +56,7 @@ union msr_longhaul { | |||
56 | /* | 56 | /* |
57 | * VIA C3 Samuel 1 & Samuel 2 (stepping 0) | 57 | * VIA C3 Samuel 1 & Samuel 2 (stepping 0) |
58 | */ | 58 | */ |
59 | static const int __initdata samuel1_mults[16] = { | 59 | static const int __cpuinitdata samuel1_mults[16] = { |
60 | -1, /* 0000 -> RESERVED */ | 60 | -1, /* 0000 -> RESERVED */ |
61 | 30, /* 0001 -> 3.0x */ | 61 | 30, /* 0001 -> 3.0x */ |
62 | 40, /* 0010 -> 4.0x */ | 62 | 40, /* 0010 -> 4.0x */ |
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { | |||
75 | -1, /* 1111 -> RESERVED */ | 75 | -1, /* 1111 -> RESERVED */ |
76 | }; | 76 | }; |
77 | 77 | ||
78 | static const int __initdata samuel1_eblcr[16] = { | 78 | static const int __cpuinitdata samuel1_eblcr[16] = { |
79 | 50, /* 0000 -> RESERVED */ | 79 | 50, /* 0000 -> RESERVED */ |
80 | 30, /* 0001 -> 3.0x */ | 80 | 30, /* 0001 -> 3.0x */ |
81 | 40, /* 0010 -> 4.0x */ | 81 | 40, /* 0010 -> 4.0x */ |
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { | |||
97 | /* | 97 | /* |
98 | * VIA C3 Samuel2 Stepping 1->15 | 98 | * VIA C3 Samuel2 Stepping 1->15 |
99 | */ | 99 | */ |
100 | static const int __initdata samuel2_eblcr[16] = { | 100 | static const int __cpuinitdata samuel2_eblcr[16] = { |
101 | 50, /* 0000 -> 5.0x */ | 101 | 50, /* 0000 -> 5.0x */ |
102 | 30, /* 0001 -> 3.0x */ | 102 | 30, /* 0001 -> 3.0x */ |
103 | 40, /* 0010 -> 4.0x */ | 103 | 40, /* 0010 -> 4.0x */ |
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { | |||
119 | /* | 119 | /* |
120 | * VIA C3 Ezra | 120 | * VIA C3 Ezra |
121 | */ | 121 | */ |
122 | static const int __initdata ezra_mults[16] = { | 122 | static const int __cpuinitdata ezra_mults[16] = { |
123 | 100, /* 0000 -> 10.0x */ | 123 | 100, /* 0000 -> 10.0x */ |
124 | 30, /* 0001 -> 3.0x */ | 124 | 30, /* 0001 -> 3.0x */ |
125 | 40, /* 0010 -> 4.0x */ | 125 | 40, /* 0010 -> 4.0x */ |
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { | |||
138 | 120, /* 1111 -> 12.0x */ | 138 | 120, /* 1111 -> 12.0x */ |
139 | }; | 139 | }; |
140 | 140 | ||
141 | static const int __initdata ezra_eblcr[16] = { | 141 | static const int __cpuinitdata ezra_eblcr[16] = { |
142 | 50, /* 0000 -> 5.0x */ | 142 | 50, /* 0000 -> 5.0x */ |
143 | 30, /* 0001 -> 3.0x */ | 143 | 30, /* 0001 -> 3.0x */ |
144 | 40, /* 0010 -> 4.0x */ | 144 | 40, /* 0010 -> 4.0x */ |
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { | |||
160 | /* | 160 | /* |
161 | * VIA C3 (Ezra-T) [C5M]. | 161 | * VIA C3 (Ezra-T) [C5M]. |
162 | */ | 162 | */ |
163 | static const int __initdata ezrat_mults[32] = { | 163 | static const int __cpuinitdata ezrat_mults[32] = { |
164 | 100, /* 0000 -> 10.0x */ | 164 | 100, /* 0000 -> 10.0x */ |
165 | 30, /* 0001 -> 3.0x */ | 165 | 30, /* 0001 -> 3.0x */ |
166 | 40, /* 0010 -> 4.0x */ | 166 | 40, /* 0010 -> 4.0x */ |
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { | |||
196 | -1, /* 1111 -> RESERVED (12.0x) */ | 196 | -1, /* 1111 -> RESERVED (12.0x) */ |
197 | }; | 197 | }; |
198 | 198 | ||
199 | static const int __initdata ezrat_eblcr[32] = { | 199 | static const int __cpuinitdata ezrat_eblcr[32] = { |
200 | 50, /* 0000 -> 5.0x */ | 200 | 50, /* 0000 -> 5.0x */ |
201 | 30, /* 0001 -> 3.0x */ | 201 | 30, /* 0001 -> 3.0x */ |
202 | 40, /* 0010 -> 4.0x */ | 202 | 40, /* 0010 -> 4.0x */ |
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { | |||
235 | /* | 235 | /* |
236 | * VIA C3 Nehemiah */ | 236 | * VIA C3 Nehemiah */ |
237 | 237 | ||
238 | static const int __initdata nehemiah_mults[32] = { | 238 | static const int __cpuinitdata nehemiah_mults[32] = { |
239 | 100, /* 0000 -> 10.0x */ | 239 | 100, /* 0000 -> 10.0x */ |
240 | -1, /* 0001 -> 16.0x */ | 240 | -1, /* 0001 -> 16.0x */ |
241 | 40, /* 0010 -> 4.0x */ | 241 | 40, /* 0010 -> 4.0x */ |
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { | |||
270 | -1, /* 1111 -> 12.0x */ | 270 | -1, /* 1111 -> 12.0x */ |
271 | }; | 271 | }; |
272 | 272 | ||
273 | static const int __initdata nehemiah_eblcr[32] = { | 273 | static const int __cpuinitdata nehemiah_eblcr[32] = { |
274 | 50, /* 0000 -> 5.0x */ | 274 | 50, /* 0000 -> 5.0x */ |
275 | 160, /* 0001 -> 16.0x */ | 275 | 160, /* 0001 -> 16.0x */ |
276 | 40, /* 0010 -> 4.0x */ | 276 | 40, /* 0010 -> 4.0x */ |
@@ -315,7 +315,7 @@ struct mV_pos { | |||
315 | unsigned short pos; | 315 | unsigned short pos; |
316 | }; | 316 | }; |
317 | 317 | ||
318 | static const struct mV_pos __initdata vrm85_mV[32] = { | 318 | static const struct mV_pos __cpuinitdata vrm85_mV[32] = { |
319 | {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, | 319 | {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, |
320 | {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, | 320 | {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, |
321 | {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, | 321 | {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, |
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { | |||
326 | {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} | 326 | {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} |
327 | }; | 327 | }; |
328 | 328 | ||
329 | static const unsigned char __initdata mV_vrm85[32] = { | 329 | static const unsigned char __cpuinitdata mV_vrm85[32] = { |
330 | 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, | 330 | 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, |
331 | 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, | 331 | 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, |
332 | 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, | 332 | 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, |
333 | 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 | 333 | 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 |
334 | }; | 334 | }; |
335 | 335 | ||
336 | static const struct mV_pos __initdata mobilevrm_mV[32] = { | 336 | static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { |
337 | {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, | 337 | {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, |
338 | {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, | 338 | {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, |
339 | {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, | 339 | {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, |
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { | |||
344 | {675, 3}, {650, 2}, {625, 1}, {600, 0} | 344 | {675, 3}, {650, 2}, {625, 1}, {600, 0} |
345 | }; | 345 | }; |
346 | 346 | ||
347 | static const unsigned char __initdata mV_mobilevrm[32] = { | 347 | static const unsigned char __cpuinitdata mV_mobilevrm[32] = { |
348 | 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, | 348 | 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, |
349 | 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, | 349 | 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, |
350 | 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, | 350 | 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, |
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d74c52..fc09f142d94d 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c | |||
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) | |||
165 | * TMTA rules: | 165 | * TMTA rules: |
166 | * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) | 166 | * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) |
167 | */ | 167 | */ |
168 | static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, | 168 | static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, |
169 | unsigned int *high_freq) | 169 | unsigned int *high_freq) |
170 | { | 170 | { |
171 | u32 msr_lo, msr_hi; | 171 | u32 msr_lo, msr_hi; |
172 | u32 save_lo, save_hi; | 172 | u32 save_lo, save_hi; |
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, | |||
258 | } | 258 | } |
259 | 259 | ||
260 | 260 | ||
261 | static int __init longrun_cpu_init(struct cpufreq_policy *policy) | 261 | static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) |
262 | { | 262 | { |
263 | int result = 0; | 263 | int result = 0; |
264 | 264 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c new file mode 100644 index 000000000000..911e193018ae --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/mperf.c | |||
@@ -0,0 +1,51 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/smp.h> | ||
3 | #include <linux/module.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/cpufreq.h> | ||
6 | #include <linux/slab.h> | ||
7 | |||
8 | #include "mperf.h" | ||
9 | |||
10 | static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); | ||
11 | |||
12 | /* Called via smp_call_function_single(), on the target CPU */ | ||
13 | static void read_measured_perf_ctrs(void *_cur) | ||
14 | { | ||
15 | struct aperfmperf *am = _cur; | ||
16 | |||
17 | get_aperfmperf(am); | ||
18 | } | ||
19 | |||
20 | /* | ||
21 | * Return the measured active (C0) frequency on this CPU since the last | ||
22 | * call to this function. | ||
23 | * Input: cpu number | ||
24 | * Return: Average CPU frequency in terms of max frequency (zero on error) | ||
25 | * | ||
26 | * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance | ||
27 | * over a period of time, while CPU is in C0 state. | ||
28 | * IA32_MPERF counts at the rate of max advertised frequency | ||
29 | * IA32_APERF counts at the rate of actual CPU frequency | ||
30 | * Only the IA32_APERF/IA32_MPERF ratio is architecturally defined, and | ||
31 | * no meaning should be associated with the absolute values of these MSRs. | ||
32 | */ | ||
33 | unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, | ||
34 | unsigned int cpu) | ||
35 | { | ||
36 | struct aperfmperf perf; | ||
37 | unsigned long ratio; | ||
38 | unsigned int retval; | ||
39 | |||
40 | if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) | ||
41 | return 0; | ||
42 | |||
43 | ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); | ||
44 | per_cpu(acfreq_old_perf, cpu) = perf; | ||
45 | |||
46 | retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; | ||
47 | |||
48 | return retval; | ||
49 | } | ||
50 | EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); | ||
51 | MODULE_LICENSE("GPL"); | ||
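The helper moved into this file reports an average C0 frequency as max_freq scaled by the APERF/MPERF delta ratio; in the kernel the ratio is kept in fixed point and shifted down by APERFMPERF_SHIFT after the multiply. With illustrative counter deltas, done in plain 64-bit integer math for clarity:

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative deltas of the two MSRs over one sampling window. */
        unsigned long long d_aperf = 1200000ULL;   /* actual-frequency ticks   */
        unsigned long long d_mperf = 2000000ULL;   /* max-frequency ticks      */
        unsigned max_freq_khz = 2600000;           /* policy->cpuinfo.max_freq */

        /* average kHz = max * (APERF delta / MPERF delta) */
        unsigned cur = (unsigned)((max_freq_khz * d_aperf) / d_mperf);
        printf("average C0 frequency: %u kHz\n", cur);   /* 1560000 */
        return 0;
    }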
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h new file mode 100644 index 000000000000..5dbf2950dc22 --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/mperf.h | |||
@@ -0,0 +1,9 @@ | |||
1 | /* | ||
2 | * (c) 2010 Advanced Micro Devices, Inc. | ||
3 | * Your use of this code is subject to the terms and conditions of the | ||
4 | * GNU general public license version 2. See "COPYING" or | ||
5 | * http://www.gnu.org/licenses/gpl.html | ||
6 | */ | ||
7 | |||
8 | unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, | ||
9 | unsigned int cpu); | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba67b07..bd1cac747f67 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | |||
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) | |||
178 | } | 178 | } |
179 | } | 179 | } |
180 | 180 | ||
181 | if (c->x86 != 0xF) { | 181 | if (c->x86 != 0xF) |
182 | if (!cpu_has(c, X86_FEATURE_EST)) | ||
183 | printk(KERN_WARNING PFX "Unknown CPU. " | ||
184 | "Please send an e-mail to " | ||
185 | "<cpufreq@vger.kernel.org>\n"); | ||
186 | return 0; | 182 | return 0; |
187 | } | ||
188 | 183 | ||
189 | /* on P-4s, the TSC runs with constant frequency independent whether | 184 | /* on P-4s, the TSC runs with constant frequency independent whether |
190 | * throttling is active or not. */ | 185 | * throttling is active or not. */ |
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ce7cde713e71..a36de5bbb622 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | |||
@@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) | |||
368 | return -ENODEV; | 368 | return -ENODEV; |
369 | 369 | ||
370 | out_obj = output.pointer; | 370 | out_obj = output.pointer; |
371 | if (out_obj->type != ACPI_TYPE_BUFFER) { | 371 | if (out_obj->type != ACPI_TYPE_BUFFER) |
372 | ret = -ENODEV; | 372 | return -ENODEV; |
373 | goto out_free; | ||
374 | } | ||
375 | 373 | ||
376 | errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); | 374 | errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); |
377 | if (errors) { | 375 | if (errors) |
378 | ret = -ENODEV; | 376 | return -ENODEV; |
379 | goto out_free; | ||
380 | } | ||
381 | 377 | ||
382 | supported = *((u32 *)(out_obj->buffer.pointer + 4)); | 378 | supported = *((u32 *)(out_obj->buffer.pointer + 4)); |
383 | if (!(supported & 0x1)) { | 379 | if (!(supported & 0x1)) |
384 | ret = -ENODEV; | 380 | return -ENODEV; |
385 | goto out_free; | ||
386 | } | ||
387 | 381 | ||
388 | out_free: | 382 | out_free: |
389 | kfree(output.pointer); | 383 | kfree(output.pointer); |
@@ -397,13 +391,17 @@ static int __init pcc_cpufreq_probe(void) | |||
397 | struct pcc_memory_resource *mem_resource; | 391 | struct pcc_memory_resource *mem_resource; |
398 | struct pcc_register_resource *reg_resource; | 392 | struct pcc_register_resource *reg_resource; |
399 | union acpi_object *out_obj, *member; | 393 | union acpi_object *out_obj, *member; |
400 | acpi_handle handle, osc_handle; | 394 | acpi_handle handle, osc_handle, pcch_handle; |
401 | int ret = 0; | 395 | int ret = 0; |
402 | 396 | ||
403 | status = acpi_get_handle(NULL, "\\_SB", &handle); | 397 | status = acpi_get_handle(NULL, "\\_SB", &handle); |
404 | if (ACPI_FAILURE(status)) | 398 | if (ACPI_FAILURE(status)) |
405 | return -ENODEV; | 399 | return -ENODEV; |
406 | 400 | ||
401 | status = acpi_get_handle(handle, "PCCH", &pcch_handle); | ||
402 | if (ACPI_FAILURE(status)) | ||
403 | return -ENODEV; | ||
404 | |||
407 | status = acpi_get_handle(handle, "_OSC", &osc_handle); | 405 | status = acpi_get_handle(handle, "_OSC", &osc_handle); |
408 | if (ACPI_SUCCESS(status)) { | 406 | if (ACPI_SUCCESS(status)) { |
409 | ret = pcc_cpufreq_do_osc(&osc_handle); | 407 | ret = pcc_cpufreq_do_osc(&osc_handle); |
@@ -543,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) | |||
543 | 541 | ||
544 | if (!pcch_virt_addr) { | 542 | if (!pcch_virt_addr) { |
545 | result = -1; | 543 | result = -1; |
546 | goto pcch_null; | 544 | goto out; |
547 | } | 545 | } |
548 | 546 | ||
549 | result = pcc_get_offset(cpu); | 547 | result = pcc_get_offset(cpu); |
550 | if (result) { | 548 | if (result) { |
551 | dprintk("init: PCCP evaluation failed\n"); | 549 | dprintk("init: PCCP evaluation failed\n"); |
552 | goto free; | 550 | goto out; |
553 | } | 551 | } |
554 | 552 | ||
555 | policy->max = policy->cpuinfo.max_freq = | 553 | policy->max = policy->cpuinfo.max_freq = |
@@ -558,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) | |||
558 | ioread32(&pcch_hdr->minimum_frequency) * 1000; | 556 | ioread32(&pcch_hdr->minimum_frequency) * 1000; |
559 | policy->cur = pcc_get_freq(cpu); | 557 | policy->cur = pcc_get_freq(cpu); |
560 | 558 | ||
559 | if (!policy->cur) { | ||
560 | dprintk("init: Unable to get current CPU frequency\n"); | ||
561 | result = -EINVAL; | ||
562 | goto out; | ||
563 | } | ||
564 | |||
561 | dprintk("init: policy->max is %d, policy->min is %d\n", | 565 | dprintk("init: policy->max is %d, policy->min is %d\n", |
562 | policy->max, policy->min); | 566 | policy->max, policy->min); |
563 | 567 | out: | |
564 | return 0; | ||
565 | free: | ||
566 | pcc_clear_mapping(); | ||
567 | free_percpu(pcc_cpu_info); | ||
568 | pcch_null: | ||
569 | return result; | 568 | return result; |
570 | } | 569 | } |
571 | 570 | ||
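The new PCCH check above is a plain ACPI namespace probe: look up \_SB, then look up PCCH beneath it, and bail out with -ENODEV if either is missing, so the driver refuses to load on platforms without a PCC mailbox. A condensed sketch of that pattern (function name hypothetical; acpi_get_handle() is the real ACPICA call the hunk uses):

#include <linux/acpi.h>
#include <linux/errno.h>

static int __init pcc_detect_sketch(void)
{
	acpi_handle sb, pcch;
	acpi_status status;

	status = acpi_get_handle(NULL, "\\_SB", &sb);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	/* Without a PCCH method there is no PCC interface to drive. */
	status = acpi_get_handle(sb, "PCCH", &pcch);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	return 0;
}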
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116f89e5..4a45fd6e41ba 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c | |||
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) | |||
569 | * We will then get the same kind of behaviour already tested under | 569 | * We will then get the same kind of behaviour already tested under |
570 | * the "well-known" other OS. | 570 | * the "well-known" other OS. |
571 | */ | 571 | */ |
572 | static int __init fixup_sgtc(void) | 572 | static int __cpuinit fixup_sgtc(void) |
573 | { | 573 | { |
574 | unsigned int sgtc; | 574 | unsigned int sgtc; |
575 | unsigned int m; | 575 | unsigned int m; |
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) | |||
603 | } | 603 | } |
604 | 604 | ||
605 | 605 | ||
606 | static int __init acer_cpufreq_pst(const struct dmi_system_id *d) | 606 | static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) |
607 | { | 607 | { |
608 | printk(KERN_WARNING PFX | 608 | printk(KERN_WARNING PFX |
609 | "%s laptop with broken PST tables in BIOS detected.\n", | 609 | "%s laptop with broken PST tables in BIOS detected.\n", |
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) | |||
621 | * A BIOS update is all that can save them. | 621 | * A BIOS update is all that can save them. |
622 | * Mention this, and disable cpufreq. | 622 | * Mention this, and disable cpufreq. |
623 | */ | 623 | */ |
624 | static struct dmi_system_id __initdata powernow_dmi_table[] = { | 624 | static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { |
625 | { | 625 | { |
626 | .callback = acer_cpufreq_pst, | 626 | .callback = acer_cpufreq_pst, |
627 | .ident = "Acer Aspire", | 627 | .ident = "Acer Aspire", |
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { | |||
633 | { } | 633 | { } |
634 | }; | 634 | }; |
635 | 635 | ||
636 | static int __init powernow_cpu_init(struct cpufreq_policy *policy) | 636 | static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) |
637 | { | 637 | { |
638 | union msr_fidvidstatus fidvidstatus; | 638 | union msr_fidvidstatus fidvidstatus; |
639 | int result; | 639 | int result; |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index b6215b9798e2..491977baf6c0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -1,6 +1,5 @@ | |||
1 | |||
2 | /* | 1 | /* |
3 | * (c) 2003-2006 Advanced Micro Devices, Inc. | 2 | * (c) 2003-2010 Advanced Micro Devices, Inc. |
4 | * Your use of this code is subject to the terms and conditions of the | 3 | * Your use of this code is subject to the terms and conditions of the |
5 | * GNU general public license version 2. See "COPYING" or | 4 | * GNU general public license version 2. See "COPYING" or |
6 | * http://www.gnu.org/licenses/gpl.html | 5 | * http://www.gnu.org/licenses/gpl.html |
@@ -10,7 +9,7 @@ | |||
10 | * Based on the powernow-k7.c module written by Dave Jones. | 9 | * Based on the powernow-k7.c module written by Dave Jones. |
11 | * (C) 2003 Dave Jones on behalf of SuSE Labs | 10 | * (C) 2003 Dave Jones on behalf of SuSE Labs |
12 | * (C) 2004 Dominik Brodowski <linux@brodo.de> | 11 | * (C) 2004 Dominik Brodowski <linux@brodo.de> |
13 | * (C) 2004 Pavel Machek <pavel@suse.cz> | 12 | * (C) 2004 Pavel Machek <pavel@ucw.cz> |
14 | * Licensed under the terms of the GNU GPL License version 2. | 13 | * Licensed under the terms of the GNU GPL License version 2. |
15 | * Based upon datasheets & sample CPUs kindly provided by AMD. | 14 | * Based upon datasheets & sample CPUs kindly provided by AMD. |
16 | * | 15 | * |
@@ -46,6 +45,7 @@ | |||
46 | #define PFX "powernow-k8: " | 45 | #define PFX "powernow-k8: " |
47 | #define VERSION "version 2.20.00" | 46 | #define VERSION "version 2.20.00" |
48 | #include "powernow-k8.h" | 47 | #include "powernow-k8.h" |
48 | #include "mperf.h" | ||
49 | 49 | ||
50 | /* serialize freq changes */ | 50 | /* serialize freq changes */ |
51 | static DEFINE_MUTEX(fidvid_mutex); | 51 | static DEFINE_MUTEX(fidvid_mutex); |
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); | |||
54 | 54 | ||
55 | static int cpu_family = CPU_OPTERON; | 55 | static int cpu_family = CPU_OPTERON; |
56 | 56 | ||
57 | /* core performance boost */ | ||
58 | static bool cpb_capable, cpb_enabled; | ||
59 | static struct msr __percpu *msrs; | ||
60 | |||
61 | static struct cpufreq_driver cpufreq_amd64_driver; | ||
62 | |||
57 | #ifndef CONFIG_SMP | 63 | #ifndef CONFIG_SMP |
58 | static inline const struct cpumask *cpu_core_mask(int cpu) | 64 | static inline const struct cpumask *cpu_core_mask(int cpu) |
59 | { | 65 | { |
@@ -800,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) | |||
800 | * www.amd.com | 806 | * www.amd.com |
801 | */ | 807 | */ |
802 | printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); | 808 | printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); |
809 | printk(KERN_ERR PFX "Make sure that your BIOS is up to date" | ||
810 | " and Cool'N'Quiet support is enabled in BIOS setup\n"); | ||
803 | return -ENODEV; | 811 | return -ENODEV; |
804 | } | 812 | } |
805 | 813 | ||
@@ -904,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, | |||
904 | { | 912 | { |
905 | int i; | 913 | int i; |
906 | u32 hi = 0, lo = 0; | 914 | u32 hi = 0, lo = 0; |
907 | rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); | 915 | rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); |
908 | data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; | 916 | data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; |
909 | 917 | ||
910 | for (i = 0; i < data->acpi_data.state_count; i++) { | 918 | for (i = 0; i < data->acpi_data.state_count; i++) { |
911 | u32 index; | 919 | u32 index; |
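The lo/hi swap above is the whole fix: Linux's rdmsr(msr, lo, hi) macro returns EAX in its second argument and EDX in its third, and the current P-state limit lives in the low word of MSR_PSTATE_CUR_LIMIT, so reading the field from "hi" extracted garbage. A sketch of the corrected read (helper name hypothetical; the masks come from powernow-k8.h in this same patch):

#include <asm/msr.h>
#include "powernow-k8.h"	/* HW_PSTATE_MAX_MASK / HW_PSTATE_MAX_SHIFT */

static u32 read_max_hw_pstate(void)
{
	u32 lo, hi;

	rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);	/* lo = EAX (bits 31:0), hi = EDX */
	return (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
}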
@@ -1017,13 +1025,12 @@ static int get_transition_latency(struct powernow_k8_data *data) | |||
1017 | } | 1025 | } |
1018 | if (max_latency == 0) { | 1026 | if (max_latency == 0) { |
1019 | /* | 1027 | /* |
1020 | * Fam 11h always returns 0 as transition latency. | 1028 | * Fam 11h and later may return 0 as transition latency. This |
1021 | * This is intended and means "very fast". While cpufreq core | 1029 | * is intended and means "very fast". While cpufreq core and |
1022 | * and governors currently can handle that gracefully, better | 1030 | * governors currently can handle that gracefully, better set it |
1023 | * set it to 1 to avoid problems in the future. | 1031 | * to 1 to avoid problems in the future. |
1024 | * For all others it's a BIOS bug. | ||
1025 | */ | 1032 | */ |
1026 | if (boot_cpu_data.x86 != 0x11) | 1033 | if (boot_cpu_data.x86 < 0x11) |
1027 | printk(KERN_ERR FW_WARN PFX "Invalid zero transition " | 1034 | printk(KERN_ERR FW_WARN PFX "Invalid zero transition " |
1028 | "latency\n"); | 1035 | "latency\n"); |
1029 | max_latency = 1; | 1036 | max_latency = 1; |
@@ -1249,6 +1256,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1249 | struct powernow_k8_data *data; | 1256 | struct powernow_k8_data *data; |
1250 | struct init_on_cpu init_on_cpu; | 1257 | struct init_on_cpu init_on_cpu; |
1251 | int rc; | 1258 | int rc; |
1259 | struct cpuinfo_x86 *c = &cpu_data(pol->cpu); | ||
1252 | 1260 | ||
1253 | if (!cpu_online(pol->cpu)) | 1261 | if (!cpu_online(pol->cpu)) |
1254 | return -ENODEV; | 1262 | return -ENODEV; |
@@ -1323,6 +1331,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1323 | return -EINVAL; | 1331 | return -EINVAL; |
1324 | } | 1332 | } |
1325 | 1333 | ||
1334 | /* Check for APERF/MPERF support in hardware */ | ||
1335 | if (cpu_has(c, X86_FEATURE_APERFMPERF)) | ||
1336 | cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; | ||
1337 | |||
1326 | cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); | 1338 | cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); |
1327 | 1339 | ||
1328 | if (cpu_family == CPU_HW_PSTATE) | 1340 | if (cpu_family == CPU_HW_PSTATE) |
@@ -1394,8 +1406,77 @@ out: | |||
1394 | return khz; | 1406 | return khz; |
1395 | } | 1407 | } |
1396 | 1408 | ||
1409 | static void _cpb_toggle_msrs(bool t) | ||
1410 | { | ||
1411 | int cpu; | ||
1412 | |||
1413 | get_online_cpus(); | ||
1414 | |||
1415 | rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); | ||
1416 | |||
1417 | for_each_cpu(cpu, cpu_online_mask) { | ||
1418 | struct msr *reg = per_cpu_ptr(msrs, cpu); | ||
1419 | if (t) | ||
1420 | reg->l &= ~BIT(25); | ||
1421 | else | ||
1422 | reg->l |= BIT(25); | ||
1423 | } | ||
1424 | wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); | ||
1425 | |||
1426 | put_online_cpus(); | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * Switch on/off core performance boosting. | ||
1431 | * | ||
1432 | * 0 = disable | ||
1433 | * 1 = enable | ||
1434 | */ | ||
1435 | static void cpb_toggle(bool t) | ||
1436 | { | ||
1437 | if (!cpb_capable) | ||
1438 | return; | ||
1439 | |||
1440 | if (t && !cpb_enabled) { | ||
1441 | cpb_enabled = true; | ||
1442 | _cpb_toggle_msrs(t); | ||
1443 | printk(KERN_INFO PFX "Core Boosting enabled.\n"); | ||
1444 | } else if (!t && cpb_enabled) { | ||
1445 | cpb_enabled = false; | ||
1446 | _cpb_toggle_msrs(t); | ||
1447 | printk(KERN_INFO PFX "Core Boosting disabled.\n"); | ||
1448 | } | ||
1449 | } | ||
1450 | |||
1451 | static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, | ||
1452 | size_t count) | ||
1453 | { | ||
1454 | int ret = -EINVAL; | ||
1455 | unsigned long val = 0; | ||
1456 | |||
1457 | ret = strict_strtoul(buf, 10, &val); | ||
1458 | if (!ret && (val == 0 || val == 1) && cpb_capable) | ||
1459 | cpb_toggle(val); | ||
1460 | else | ||
1461 | return -EINVAL; | ||
1462 | |||
1463 | return count; | ||
1464 | } | ||
1465 | |||
1466 | static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) | ||
1467 | { | ||
1468 | return sprintf(buf, "%u\n", cpb_enabled); | ||
1469 | } | ||
1470 | |||
1471 | #define define_one_rw(_name) \ | ||
1472 | static struct freq_attr _name = \ | ||
1473 | __ATTR(_name, 0644, show_##_name, store_##_name) | ||
1474 | |||
1475 | define_one_rw(cpb); | ||
1476 | |||
1397 | static struct freq_attr *powernow_k8_attr[] = { | 1477 | static struct freq_attr *powernow_k8_attr[] = { |
1398 | &cpufreq_freq_attr_scaling_available_freqs, | 1478 | &cpufreq_freq_attr_scaling_available_freqs, |
1479 | &cpb, | ||
1399 | NULL, | 1480 | NULL, |
1400 | }; | 1481 | }; |
1401 | 1482 | ||
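The new cpb freq_attr defined above is an ordinary per-policy sysfs file, so user space can flip boosting with a one-line write; store_cpb() accepts only "0" and "1". A hypothetical user-space consumer (the path assumes the standard cpufreq sysfs layout):

#include <stdio.h>

/* Per-policy attribute; adjust the CPU number as needed. */
#define CPB_ATTR "/sys/devices/system/cpu/cpu0/cpufreq/cpb"

int main(void)
{
	FILE *f = fopen(CPB_ATTR, "r+");
	int enabled;

	if (!f)
		return 1;		/* attribute absent: no CPB support */

	if (fscanf(f, "%d", &enabled) == 1)
		printf("boost is %s\n", enabled ? "on" : "off");

	rewind(f);
	fputs("0\n", f);		/* "0" disables, "1" enables boosting */
	fclose(f);
	return 0;
}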
@@ -1411,10 +1492,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = { | |||
1411 | .attr = powernow_k8_attr, | 1492 | .attr = powernow_k8_attr, |
1412 | }; | 1493 | }; |
1413 | 1494 | ||
1495 | /* | ||
1496 | * Clear the boost-disable flag on the CPU_DOWN path so that this CPU | ||
1497 | * cannot block the remaining ones from boosting. On the CPU_UP path we | ||
1498 | * simply keep the boost-disable flag in sync with the current global | ||
1499 | * state. | ||
1500 | */ | ||
1501 | static int cpb_notify(struct notifier_block *nb, unsigned long action, | ||
1502 | void *hcpu) | ||
1503 | { | ||
1504 | unsigned cpu = (long)hcpu; | ||
1505 | u32 lo, hi; | ||
1506 | |||
1507 | switch (action) { | ||
1508 | case CPU_UP_PREPARE: | ||
1509 | case CPU_UP_PREPARE_FROZEN: | ||
1510 | |||
1511 | if (!cpb_enabled) { | ||
1512 | rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); | ||
1513 | lo |= BIT(25); | ||
1514 | wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); | ||
1515 | } | ||
1516 | break; | ||
1517 | |||
1518 | case CPU_DOWN_PREPARE: | ||
1519 | case CPU_DOWN_PREPARE_FROZEN: | ||
1520 | rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); | ||
1521 | lo &= ~BIT(25); | ||
1522 | wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); | ||
1523 | break; | ||
1524 | |||
1525 | default: | ||
1526 | break; | ||
1527 | } | ||
1528 | |||
1529 | return NOTIFY_OK; | ||
1530 | } | ||
1531 | |||
1532 | static struct notifier_block cpb_nb = { | ||
1533 | .notifier_call = cpb_notify, | ||
1534 | }; | ||
1535 | |||
1414 | /* driver entry point for init */ | 1536 | /* driver entry point for init */ |
1415 | static int __cpuinit powernowk8_init(void) | 1537 | static int __cpuinit powernowk8_init(void) |
1416 | { | 1538 | { |
1417 | unsigned int i, supported_cpus = 0; | 1539 | unsigned int i, supported_cpus = 0, cpu; |
1418 | 1540 | ||
1419 | for_each_online_cpu(i) { | 1541 | for_each_online_cpu(i) { |
1420 | int rc; | 1542 | int rc; |
@@ -1423,15 +1545,36 @@ static int __cpuinit powernowk8_init(void) | |||
1423 | supported_cpus++; | 1545 | supported_cpus++; |
1424 | } | 1546 | } |
1425 | 1547 | ||
1426 | if (supported_cpus == num_online_cpus()) { | 1548 | if (supported_cpus != num_online_cpus()) |
1427 | printk(KERN_INFO PFX "Found %d %s " | 1549 | return -ENODEV; |
1428 | "processors (%d cpu cores) (" VERSION ")\n", | 1550 | |
1429 | num_online_nodes(), | 1551 | printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", |
1430 | boot_cpu_data.x86_model_id, supported_cpus); | 1552 | num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); |
1431 | return cpufreq_register_driver(&cpufreq_amd64_driver); | 1553 | |
1554 | if (boot_cpu_has(X86_FEATURE_CPB)) { | ||
1555 | |||
1556 | cpb_capable = true; | ||
1557 | |||
1558 | register_cpu_notifier(&cpb_nb); | ||
1559 | |||
1560 | msrs = msrs_alloc(); | ||
1561 | if (!msrs) { | ||
1562 | printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); | ||
1563 | return -ENOMEM; | ||
1564 | } | ||
1565 | |||
1566 | rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); | ||
1567 | |||
1568 | for_each_cpu(cpu, cpu_online_mask) { | ||
1569 | struct msr *reg = per_cpu_ptr(msrs, cpu); | ||
1570 | cpb_enabled |= !(!!(reg->l & BIT(25))); | ||
1571 | } | ||
1572 | |||
1573 | printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", | ||
1574 | (cpb_enabled ? "on" : "off")); | ||
1432 | } | 1575 | } |
1433 | 1576 | ||
1434 | return -ENODEV; | 1577 | return cpufreq_register_driver(&cpufreq_amd64_driver); |
1435 | } | 1578 | } |
1436 | 1579 | ||
1437 | /* driver entry point for term */ | 1580 | /* driver entry point for term */ |
@@ -1439,6 +1582,13 @@ static void __exit powernowk8_exit(void) | |||
1439 | { | 1582 | { |
1440 | dprintk("exit\n"); | 1583 | dprintk("exit\n"); |
1441 | 1584 | ||
1585 | if (boot_cpu_has(X86_FEATURE_CPB)) { | ||
1586 | msrs_free(msrs); | ||
1587 | msrs = NULL; | ||
1588 | |||
1589 | unregister_cpu_notifier(&cpb_nb); | ||
1590 | } | ||
1591 | |||
1442 | cpufreq_unregister_driver(&cpufreq_amd64_driver); | 1592 | cpufreq_unregister_driver(&cpufreq_amd64_driver); |
1443 | } | 1593 | } |
1444 | 1594 | ||
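At init time the driver batch-reads MSR_K7_HWCR on every online core with rdmsr_on_cpus() and reports boosting as enabled if any core has the disable bit clear; note that the `!(!!(...))` in the loop above reduces to a single negation. A simplified sketch of that OR-reduction (plain array instead of per_cpu_ptr(), an assumption for illustration, not the driver code):

#include <linux/types.h>
#include <linux/bitops.h>
#include <asm/msr.h>

static bool any_core_boosts(const struct msr *hwcr, int nr_cpus)
{
	bool enabled = false;
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++)
		enabled |= !(hwcr[cpu].l & BIT(25));	/* BIT(25): boost-disable */

	return enabled;
}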
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 02ce824073cb..df3529b1c02d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h | |||
@@ -5,7 +5,6 @@ | |||
5 | * http://www.gnu.org/licenses/gpl.html | 5 | * http://www.gnu.org/licenses/gpl.html |
6 | */ | 6 | */ |
7 | 7 | ||
8 | |||
9 | enum pstate { | 8 | enum pstate { |
10 | HW_PSTATE_INVALID = 0xff, | 9 | HW_PSTATE_INVALID = 0xff, |
11 | HW_PSTATE_0 = 0, | 10 | HW_PSTATE_0 = 0, |
@@ -55,7 +54,6 @@ struct powernow_k8_data { | |||
55 | struct cpumask *available_cores; | 54 | struct cpumask *available_cores; |
56 | }; | 55 | }; |
57 | 56 | ||
58 | |||
59 | /* processor's cpuid instruction support */ | 57 | /* processor's cpuid instruction support */ |
60 | #define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ | 58 | #define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ |
61 | #define CPUID_XFAM 0x0ff00000 /* extended family */ | 59 | #define CPUID_XFAM 0x0ff00000 /* extended family */ |
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 08be922de33a..8095f8611f8a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c | |||
@@ -21,37 +21,58 @@ | |||
21 | * | 21 | * |
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include <linux/module.h> | ||
24 | #include <asm/processor.h> | 25 | #include <asm/processor.h> |
25 | #include <asm/vmware.h> | ||
26 | #include <asm/hypervisor.h> | 26 | #include <asm/hypervisor.h> |
27 | 27 | ||
28 | static inline void __cpuinit | 28 | /* |
29 | detect_hypervisor_vendor(struct cpuinfo_x86 *c) | 29 | * Hypervisor detection order. This is specified explicitly here because |
30 | * some hypervisors might implement compatibility modes for other | ||
31 | * hypervisors and therefore need to be detected in a specific sequence. | ||
32 | */ | ||
33 | static const __initconst struct hypervisor_x86 * const hypervisors[] = | ||
30 | { | 34 | { |
31 | if (vmware_platform()) | 35 | &x86_hyper_vmware, |
32 | c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; | 36 | &x86_hyper_ms_hyperv, |
33 | else | 37 | #ifdef CONFIG_XEN_PVHVM |
34 | c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; | 38 | &x86_hyper_xen_hvm, |
35 | } | 39 | #endif |
40 | }; | ||
36 | 41 | ||
37 | static inline void __cpuinit | 42 | const struct hypervisor_x86 *x86_hyper; |
38 | hypervisor_set_feature_bits(struct cpuinfo_x86 *c) | 43 | EXPORT_SYMBOL(x86_hyper); |
44 | |||
45 | static inline void __init | ||
46 | detect_hypervisor_vendor(void) | ||
39 | { | 47 | { |
40 | if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { | 48 | const struct hypervisor_x86 *h, * const *p; |
41 | vmware_set_feature_bits(c); | 49 | |
42 | return; | 50 | for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { |
51 | h = *p; | ||
52 | if (h->detect()) { | ||
53 | x86_hyper = h; | ||
54 | printk(KERN_INFO "Hypervisor detected: %s\n", h->name); | ||
55 | break; | ||
56 | } | ||
43 | } | 57 | } |
44 | } | 58 | } |
45 | 59 | ||
46 | void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) | 60 | void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) |
47 | { | 61 | { |
48 | detect_hypervisor_vendor(c); | 62 | if (x86_hyper && x86_hyper->set_cpu_features) |
49 | hypervisor_set_feature_bits(c); | 63 | x86_hyper->set_cpu_features(c); |
50 | } | 64 | } |
51 | 65 | ||
52 | void __init init_hypervisor_platform(void) | 66 | void __init init_hypervisor_platform(void) |
53 | { | 67 | { |
68 | |||
69 | detect_hypervisor_vendor(); | ||
70 | |||
71 | if (!x86_hyper) | ||
72 | return; | ||
73 | |||
54 | init_hypervisor(&boot_cpu_data); | 74 | init_hypervisor(&boot_cpu_data); |
55 | if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) | 75 | |
56 | vmware_platform_setup(); | 76 | if (x86_hyper->init_platform) |
77 | x86_hyper->init_platform(); | ||
57 | } | 78 | } |
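The rework above replaces the open-coded VMware checks with an ordered table of detectors: the first detect() callback that fires wins, so the explicit ordering matters when one hypervisor emulates another's CPUID interface. An illustrative reduction of the loop (struct and function names hypothetical, loosely mirroring struct hypervisor_x86):

#include <stddef.h>
#include <stdbool.h>

struct hv_desc {
	const char *name;
	bool (*detect)(void);	/* typically probes CPUID leaf 0x40000000 */
};

/* First match wins, so compatibility shims must be ordered after the real thing. */
static const struct hv_desc *find_hypervisor(const struct hv_desc *tbl, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tbl[i].detect && tbl[i].detect())
			return &tbl[i];

	return NULL;
}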
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1366c7cfd483..85f69cdeae10 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -12,7 +12,6 @@ | |||
12 | #include <asm/processor.h> | 12 | #include <asm/processor.h> |
13 | #include <asm/pgtable.h> | 13 | #include <asm/pgtable.h> |
14 | #include <asm/msr.h> | 14 | #include <asm/msr.h> |
15 | #include <asm/ds.h> | ||
16 | #include <asm/bugs.h> | 15 | #include <asm/bugs.h> |
17 | #include <asm/cpu.h> | 16 | #include <asm/cpu.h> |
18 | 17 | ||
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |||
373 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | 372 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); |
374 | } | 373 | } |
375 | 374 | ||
376 | if (c->cpuid_level > 6) { | ||
377 | unsigned ecx = cpuid_ecx(6); | ||
378 | if (ecx & 0x01) | ||
379 | set_cpu_cap(c, X86_FEATURE_APERFMPERF); | ||
380 | } | ||
381 | |||
382 | if (cpu_has_xmm2) | 375 | if (cpu_has_xmm2) |
383 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | 376 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); |
384 | if (cpu_has_ds) { | 377 | if (cpu_has_ds) { |
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |||
388 | set_cpu_cap(c, X86_FEATURE_BTS); | 381 | set_cpu_cap(c, X86_FEATURE_BTS); |
389 | if (!(l1 & (1<<12))) | 382 | if (!(l1 & (1<<12))) |
390 | set_cpu_cap(c, X86_FEATURE_PEBS); | 383 | set_cpu_cap(c, X86_FEATURE_PEBS); |
391 | ds_init_intel(c); | ||
392 | } | 384 | } |
393 | 385 | ||
394 | if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) | 386 | if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 95962a93f99a..898c2f4eab88 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx { | |||
148 | u32 full; | 148 | u32 full; |
149 | }; | 149 | }; |
150 | 150 | ||
151 | struct amd_l3_cache { | ||
152 | struct pci_dev *dev; | ||
153 | bool can_disable; | ||
154 | unsigned indices; | ||
155 | u8 subcaches[4]; | ||
156 | }; | ||
157 | |||
151 | struct _cpuid4_info { | 158 | struct _cpuid4_info { |
152 | union _cpuid4_leaf_eax eax; | 159 | union _cpuid4_leaf_eax eax; |
153 | union _cpuid4_leaf_ebx ebx; | 160 | union _cpuid4_leaf_ebx ebx; |
154 | union _cpuid4_leaf_ecx ecx; | 161 | union _cpuid4_leaf_ecx ecx; |
155 | unsigned long size; | 162 | unsigned long size; |
156 | bool can_disable; | 163 | struct amd_l3_cache *l3; |
157 | unsigned int l3_indices; | ||
158 | DECLARE_BITMAP(shared_cpu_map, NR_CPUS); | 164 | DECLARE_BITMAP(shared_cpu_map, NR_CPUS); |
159 | }; | 165 | }; |
160 | 166 | ||
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs { | |||
164 | union _cpuid4_leaf_ebx ebx; | 170 | union _cpuid4_leaf_ebx ebx; |
165 | union _cpuid4_leaf_ecx ecx; | 171 | union _cpuid4_leaf_ecx ecx; |
166 | unsigned long size; | 172 | unsigned long size; |
167 | bool can_disable; | 173 | struct amd_l3_cache *l3; |
168 | unsigned int l3_indices; | ||
169 | }; | 174 | }; |
170 | 175 | ||
171 | unsigned short num_cache_leaves; | 176 | unsigned short num_cache_leaves; |
@@ -302,124 +307,246 @@ struct _cache_attr { | |||
302 | }; | 307 | }; |
303 | 308 | ||
304 | #ifdef CONFIG_CPU_SUP_AMD | 309 | #ifdef CONFIG_CPU_SUP_AMD |
305 | static unsigned int __cpuinit amd_calc_l3_indices(void) | 310 | |
311 | /* | ||
312 | * L3 cache descriptors | ||
313 | */ | ||
314 | static struct amd_l3_cache **__cpuinitdata l3_caches; | ||
315 | |||
316 | static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) | ||
306 | { | 317 | { |
307 | /* | ||
308 | * We're called over smp_call_function_single() and therefore | ||
309 | * are on the correct cpu. | ||
310 | */ | ||
311 | int cpu = smp_processor_id(); | ||
312 | int node = cpu_to_node(cpu); | ||
313 | struct pci_dev *dev = node_to_k8_nb_misc(node); | ||
314 | unsigned int sc0, sc1, sc2, sc3; | 318 | unsigned int sc0, sc1, sc2, sc3; |
315 | u32 val = 0; | 319 | u32 val = 0; |
316 | 320 | ||
317 | pci_read_config_dword(dev, 0x1C4, &val); | 321 | pci_read_config_dword(l3->dev, 0x1C4, &val); |
318 | 322 | ||
319 | /* calculate subcache sizes */ | 323 | /* calculate subcache sizes */ |
320 | sc0 = !(val & BIT(0)); | 324 | l3->subcaches[0] = sc0 = !(val & BIT(0)); |
321 | sc1 = !(val & BIT(4)); | 325 | l3->subcaches[1] = sc1 = !(val & BIT(4)); |
322 | sc2 = !(val & BIT(8)) + !(val & BIT(9)); | 326 | l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); |
323 | sc3 = !(val & BIT(12)) + !(val & BIT(13)); | 327 | l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); |
324 | 328 | ||
325 | return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; | 329 | l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; |
326 | } | 330 | } |
327 | 331 | ||
328 | static void __cpuinit | 332 | static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) |
329 | amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) | ||
330 | { | 333 | { |
331 | if (index < 3) | 334 | struct amd_l3_cache *l3; |
335 | struct pci_dev *dev = node_to_k8_nb_misc(node); | ||
336 | |||
337 | l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); | ||
338 | if (!l3) { | ||
339 | printk(KERN_WARNING "Error allocating L3 struct\n"); | ||
340 | return NULL; | ||
341 | } | ||
342 | |||
343 | l3->dev = dev; | ||
344 | |||
345 | amd_calc_l3_indices(l3); | ||
346 | |||
347 | return l3; | ||
348 | } | ||
349 | |||
350 | static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, | ||
351 | int index) | ||
352 | { | ||
353 | int node; | ||
354 | |||
355 | if (boot_cpu_data.x86 != 0x10) | ||
332 | return; | 356 | return; |
333 | 357 | ||
334 | if (boot_cpu_data.x86 == 0x11) | 358 | if (index < 3) |
335 | return; | 359 | return; |
336 | 360 | ||
337 | /* see errata #382 and #388 */ | 361 | /* see errata #382 and #388 */ |
338 | if ((boot_cpu_data.x86 == 0x10) && | 362 | if (boot_cpu_data.x86_model < 0x8) |
339 | ((boot_cpu_data.x86_model < 0x8) || | ||
340 | (boot_cpu_data.x86_mask < 0x1))) | ||
341 | return; | 363 | return; |
342 | 364 | ||
365 | if ((boot_cpu_data.x86_model == 0x8 || | ||
366 | boot_cpu_data.x86_model == 0x9) | ||
367 | && | ||
368 | boot_cpu_data.x86_mask < 0x1) | ||
369 | return; | ||
370 | |||
343 | /* not in virtualized environments */ | 371 | /* not in virtualized environments */ |
344 | if (num_k8_northbridges == 0) | 372 | if (num_k8_northbridges == 0) |
345 | return; | 373 | return; |
346 | 374 | ||
347 | this_leaf->can_disable = true; | 375 | /* |
348 | this_leaf->l3_indices = amd_calc_l3_indices(); | 376 | * Strictly speaking, the amount in @size below is leaked since it is |
377 | * never freed, but since this is torn down only at shutdown it doesn't matter. | ||
378 | */ | ||
379 | if (!l3_caches) { | ||
380 | int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); | ||
381 | |||
382 | l3_caches = kzalloc(size, GFP_ATOMIC); | ||
383 | if (!l3_caches) | ||
384 | return; | ||
385 | } | ||
386 | |||
387 | node = amd_get_nb_id(smp_processor_id()); | ||
388 | |||
389 | if (!l3_caches[node]) { | ||
390 | l3_caches[node] = amd_init_l3_cache(node); | ||
391 | if (l3_caches[node]) | ||
392 | l3_caches[node]->can_disable = true; | ||
393 | } | ||
394 | |||
395 | WARN_ON(!l3_caches[node]); | ||
396 | this_leaf->l3 = l3_caches[node]; | ||
349 | } | 397 | } |
350 | 398 | ||
351 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, | 399 | /* |
352 | unsigned int index) | 400 | * check whether a slot used for disabling an L3 index is occupied. |
401 | * @l3: L3 cache descriptor | ||
402 | * @slot: slot number (0..1) | ||
403 | * | ||
404 | * @returns: the disabled index if used or negative value if slot free. | ||
405 | */ | ||
406 | int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) | ||
353 | { | 407 | { |
354 | int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); | ||
355 | int node = amd_get_nb_id(cpu); | ||
356 | struct pci_dev *dev = node_to_k8_nb_misc(node); | ||
357 | unsigned int reg = 0; | 408 | unsigned int reg = 0; |
358 | 409 | ||
359 | if (!this_leaf->can_disable) | 410 | pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); |
360 | return -EINVAL; | 411 | |
412 | /* check whether this slot is activated already */ | ||
413 | if (reg & (3UL << 30)) | ||
414 | return reg & 0xfff; | ||
361 | 415 | ||
362 | if (!dev) | 416 | return -1; |
417 | } | ||
418 | |||
419 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, | ||
420 | unsigned int slot) | ||
421 | { | ||
422 | int index; | ||
423 | |||
424 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) | ||
363 | return -EINVAL; | 425 | return -EINVAL; |
364 | 426 | ||
365 | pci_read_config_dword(dev, 0x1BC + index * 4, ®); | 427 | index = amd_get_l3_disable_slot(this_leaf->l3, slot); |
366 | return sprintf(buf, "0x%08x\n", reg); | 428 | if (index >= 0) |
429 | return sprintf(buf, "%d\n", index); | ||
430 | |||
431 | return sprintf(buf, "FREE\n"); | ||
367 | } | 432 | } |
368 | 433 | ||
369 | #define SHOW_CACHE_DISABLE(index) \ | 434 | #define SHOW_CACHE_DISABLE(slot) \ |
370 | static ssize_t \ | 435 | static ssize_t \ |
371 | show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ | 436 | show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ |
372 | { \ | 437 | { \ |
373 | return show_cache_disable(this_leaf, buf, index); \ | 438 | return show_cache_disable(this_leaf, buf, slot); \ |
374 | } | 439 | } |
375 | SHOW_CACHE_DISABLE(0) | 440 | SHOW_CACHE_DISABLE(0) |
376 | SHOW_CACHE_DISABLE(1) | 441 | SHOW_CACHE_DISABLE(1) |
377 | 442 | ||
378 | static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | 443 | static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, |
379 | const char *buf, size_t count, unsigned int index) | 444 | unsigned slot, unsigned long idx) |
380 | { | 445 | { |
381 | int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); | 446 | int i; |
382 | int node = amd_get_nb_id(cpu); | 447 | |
383 | struct pci_dev *dev = node_to_k8_nb_misc(node); | 448 | idx |= BIT(30); |
384 | unsigned long val = 0; | 449 | |
450 | /* | ||
451 | * disable index in all 4 subcaches | ||
452 | */ | ||
453 | for (i = 0; i < 4; i++) { | ||
454 | u32 reg = idx | (i << 20); | ||
455 | |||
456 | if (!l3->subcaches[i]) | ||
457 | continue; | ||
458 | |||
459 | pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); | ||
460 | |||
461 | /* | ||
462 | * We need to WBINVD on a core on the node containing the L3 | ||
463 | * cache whose indices we disable; therefore a simple wbinvd() | ||
464 | * is not sufficient. | ||
465 | */ | ||
466 | wbinvd_on_cpu(cpu); | ||
467 | |||
468 | reg |= BIT(31); | ||
469 | pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); | ||
470 | } | ||
471 | } | ||
472 | |||
473 | /* | ||
474 | * Disable an L3 cache index by using a disable-slot. | ||
475 | * | ||
476 | * @l3: L3 cache descriptor | ||
477 | * @cpu: A CPU on the node containing the L3 cache | ||
478 | * @slot: slot number (0..1) | ||
479 | * @index: index to disable | ||
480 | * | ||
481 | * @return: 0 on success, error status on failure | ||
482 | */ | ||
483 | int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, | ||
484 | unsigned long index) | ||
485 | { | ||
486 | int ret = 0; | ||
385 | 487 | ||
386 | #define SUBCACHE_MASK (3UL << 20) | 488 | #define SUBCACHE_MASK (3UL << 20) |
387 | #define SUBCACHE_INDEX 0xfff | 489 | #define SUBCACHE_INDEX 0xfff |
388 | 490 | ||
389 | if (!this_leaf->can_disable) | 491 | /* |
492 | * check whether this slot is already used or | ||
493 | * the index is already disabled | ||
494 | */ | ||
495 | ret = amd_get_l3_disable_slot(l3, slot); | ||
496 | if (ret >= 0) | ||
390 | return -EINVAL; | 497 | return -EINVAL; |
391 | 498 | ||
499 | /* | ||
500 | * check whether the other slot has disabled the | ||
501 | * same index already | ||
502 | */ | ||
503 | if (index == amd_get_l3_disable_slot(l3, !slot)) | ||
504 | return -EINVAL; | ||
505 | |||
506 | /* do not allow writes outside of allowed bits */ | ||
507 | if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || | ||
508 | ((index & SUBCACHE_INDEX) > l3->indices)) | ||
509 | return -EINVAL; | ||
510 | |||
511 | amd_l3_disable_index(l3, cpu, slot, index); | ||
512 | |||
513 | return 0; | ||
514 | } | ||
515 | |||
516 | static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | ||
517 | const char *buf, size_t count, | ||
518 | unsigned int slot) | ||
519 | { | ||
520 | unsigned long val = 0; | ||
521 | int cpu, err = 0; | ||
522 | |||
392 | if (!capable(CAP_SYS_ADMIN)) | 523 | if (!capable(CAP_SYS_ADMIN)) |
393 | return -EPERM; | 524 | return -EPERM; |
394 | 525 | ||
395 | if (!dev) | 526 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) |
396 | return -EINVAL; | 527 | return -EINVAL; |
397 | 528 | ||
398 | if (strict_strtoul(buf, 10, &val) < 0) | 529 | cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); |
399 | return -EINVAL; | ||
400 | 530 | ||
401 | /* do not allow writes outside of allowed bits */ | 531 | if (strict_strtoul(buf, 10, &val) < 0) |
402 | if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || | ||
403 | ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) | ||
404 | return -EINVAL; | 532 | return -EINVAL; |
405 | 533 | ||
406 | val |= BIT(30); | 534 | err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); |
407 | pci_write_config_dword(dev, 0x1BC + index * 4, val); | 535 | if (err) { |
408 | /* | 536 | if (err == -EEXIST) |
409 | * We need to WBINVD on a core on the node containing the L3 cache which | 537 | printk(KERN_WARNING "L3 disable slot %d in use!\n", |
410 | * indices we disable therefore a simple wbinvd() is not sufficient. | 538 | slot); |
411 | */ | 539 | return err; |
412 | wbinvd_on_cpu(cpu); | 540 | } |
413 | pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); | ||
414 | return count; | 541 | return count; |
415 | } | 542 | } |
416 | 543 | ||
417 | #define STORE_CACHE_DISABLE(index) \ | 544 | #define STORE_CACHE_DISABLE(slot) \ |
418 | static ssize_t \ | 545 | static ssize_t \ |
419 | store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ | 546 | store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ |
420 | const char *buf, size_t count) \ | 547 | const char *buf, size_t count) \ |
421 | { \ | 548 | { \ |
422 | return store_cache_disable(this_leaf, buf, count, index); \ | 549 | return store_cache_disable(this_leaf, buf, count, slot); \ |
423 | } | 550 | } |
424 | STORE_CACHE_DISABLE(0) | 551 | STORE_CACHE_DISABLE(0) |
425 | STORE_CACHE_DISABLE(1) | 552 | STORE_CACHE_DISABLE(1) |
@@ -431,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | |||
431 | 558 | ||
432 | #else /* CONFIG_CPU_SUP_AMD */ | 559 | #else /* CONFIG_CPU_SUP_AMD */ |
433 | static void __cpuinit | 560 | static void __cpuinit |
434 | amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) | 561 | amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) |
435 | { | 562 | { |
436 | }; | 563 | }; |
437 | #endif /* CONFIG_CPU_SUP_AMD */ | 564 | #endif /* CONFIG_CPU_SUP_AMD */ |
@@ -447,8 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, | |||
447 | 574 | ||
448 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | 575 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { |
449 | amd_cpuid4(index, &eax, &ebx, &ecx); | 576 | amd_cpuid4(index, &eax, &ebx, &ecx); |
450 | if (boot_cpu_data.x86 >= 0x10) | 577 | amd_check_l3_disable(this_leaf, index); |
451 | amd_check_l3_disable(index, this_leaf); | ||
452 | } else { | 578 | } else { |
453 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); | 579 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); |
454 | } | 580 | } |
@@ -705,6 +831,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) | |||
705 | for (i = 0; i < num_cache_leaves; i++) | 831 | for (i = 0; i < num_cache_leaves; i++) |
706 | cache_remove_shared_cpu_map(cpu, i); | 832 | cache_remove_shared_cpu_map(cpu, i); |
707 | 833 | ||
834 | kfree(per_cpu(ici_cpuid4_info, cpu)->l3); | ||
708 | kfree(per_cpu(ici_cpuid4_info, cpu)); | 835 | kfree(per_cpu(ici_cpuid4_info, cpu)); |
709 | per_cpu(ici_cpuid4_info, cpu) = NULL; | 836 | per_cpu(ici_cpuid4_info, cpu) = NULL; |
710 | } | 837 | } |
@@ -989,7 +1116,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
989 | 1116 | ||
990 | this_leaf = CPUID4_INFO_IDX(cpu, i); | 1117 | this_leaf = CPUID4_INFO_IDX(cpu, i); |
991 | 1118 | ||
992 | if (this_leaf->can_disable) | 1119 | if (this_leaf->l3 && this_leaf->l3->can_disable) |
993 | ktype_cache.default_attrs = default_l3_attrs; | 1120 | ktype_cache.default_attrs = default_l3_attrs; |
994 | else | 1121 | else |
995 | ktype_cache.default_attrs = default_attrs; | 1122 | ktype_cache.default_attrs = default_attrs; |
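The disable slots manipulated above live in the northbridge's PCI config space: slot 0 at offset 0x1BC and slot 1 at 0x1C0 on the node's "misc" function, with bits 31:30 marking an occupied slot and bits 11:0 holding the disabled cache index. A sketch of the slot query (helper name hypothetical; offsets and masks per the hunk):

#include <linux/pci.h>

static int l3_slot_index(struct pci_dev *nb_misc, unsigned int slot)
{
	u32 reg = 0;

	pci_read_config_dword(nb_misc, 0x1BC + slot * 4, &reg);

	if (reg & (3UL << 30))
		return reg & 0xfff;	/* slot in use: return the disabled index */

	return -1;			/* slot free */
}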
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 4ac6d48fe11b..bb34b03af252 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile | |||
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o | |||
7 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o | 7 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o |
8 | 8 | ||
9 | obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o | 9 | obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o |
10 | |||
11 | obj-$(CONFIG_ACPI_APEI) += mce-apei.o | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c new file mode 100644 index 000000000000..745b54f9be89 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c | |||
@@ -0,0 +1,138 @@ | |||
1 | /* | ||
2 | * Bridge between MCE and APEI | ||
3 | * | ||
4 | * On some machines, corrected memory errors are reported via the APEI | ||
5 | * generic hardware error source (GHES) instead of via corrected Machine | ||
6 | * Checks. These corrected memory errors can be reported to user space | ||
7 | * through /dev/mcelog by faking a corrected Machine Check, so that | ||
8 | * the faulty memory page can be offlined by /sbin/mcelog once the | ||
9 | * error count for a page exceeds the threshold. | ||
10 | * | ||
11 | * For a fatal MCE, save the MCE record into persistent storage via | ||
12 | * ERST, so that it can be logged again after reboot. | ||
13 | * | ||
14 | * Copyright 2010 Intel Corp. | ||
15 | * Author: Huang Ying <ying.huang@intel.com> | ||
16 | * | ||
17 | * This program is free software; you can redistribute it and/or | ||
18 | * modify it under the terms of the GNU General Public License version | ||
19 | * 2 as published by the Free Software Foundation. | ||
20 | * | ||
21 | * This program is distributed in the hope that it will be useful, | ||
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
24 | * GNU General Public License for more details. | ||
25 | * | ||
26 | * You should have received a copy of the GNU General Public License | ||
27 | * along with this program; if not, write to the Free Software | ||
28 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
29 | */ | ||
30 | |||
31 | #include <linux/kernel.h> | ||
32 | #include <linux/acpi.h> | ||
33 | #include <linux/cper.h> | ||
34 | #include <acpi/apei.h> | ||
35 | #include <asm/mce.h> | ||
36 | |||
37 | #include "mce-internal.h" | ||
38 | |||
39 | void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) | ||
40 | { | ||
41 | struct mce m; | ||
42 | |||
43 | /* Only corrected MC is reported */ | ||
44 | if (!corrected) | ||
45 | return; | ||
46 | |||
47 | mce_setup(&m); | ||
48 | m.bank = 1; | ||
49 | /* Fake a memory read corrected error with unknown channel */ | ||
50 | m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; | ||
51 | m.addr = mem_err->physical_addr; | ||
52 | mce_log(&m); | ||
53 | mce_notify_irq(); | ||
54 | } | ||
55 | EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); | ||
56 | |||
57 | #define CPER_CREATOR_MCE \ | ||
58 | UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ | ||
59 | 0x64, 0x90, 0xb8, 0x9d) | ||
60 | #define CPER_SECTION_TYPE_MCE \ | ||
61 | UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ | ||
62 | 0x04, 0x4a, 0x38, 0xfc) | ||
63 | |||
64 | /* | ||
65 | * The CPER specification (UEFI specification 2.3, appendix N) requires | ||
66 | * byte-packed records. | ||
67 | */ | ||
68 | struct cper_mce_record { | ||
69 | struct cper_record_header hdr; | ||
70 | struct cper_section_descriptor sec_hdr; | ||
71 | struct mce mce; | ||
72 | } __packed; | ||
73 | |||
74 | int apei_write_mce(struct mce *m) | ||
75 | { | ||
76 | struct cper_mce_record rcd; | ||
77 | |||
78 | memset(&rcd, 0, sizeof(rcd)); | ||
79 | memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); | ||
80 | rcd.hdr.revision = CPER_RECORD_REV; | ||
81 | rcd.hdr.signature_end = CPER_SIG_END; | ||
82 | rcd.hdr.section_count = 1; | ||
83 | rcd.hdr.error_severity = CPER_SER_FATAL; | ||
84 | /* timestamp, platform_id, partition_id are all invalid */ | ||
85 | rcd.hdr.validation_bits = 0; | ||
86 | rcd.hdr.record_length = sizeof(rcd); | ||
87 | rcd.hdr.creator_id = CPER_CREATOR_MCE; | ||
88 | rcd.hdr.notification_type = CPER_NOTIFY_MCE; | ||
89 | rcd.hdr.record_id = cper_next_record_id(); | ||
90 | rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; | ||
91 | |||
92 | rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; | ||
93 | rcd.sec_hdr.section_length = sizeof(rcd.mce); | ||
94 | rcd.sec_hdr.revision = CPER_SEC_REV; | ||
96 | /* fru_id and fru_text are invalid */ | ||
96 | rcd.sec_hdr.validation_bits = 0; | ||
97 | rcd.sec_hdr.flags = CPER_SEC_PRIMARY; | ||
98 | rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; | ||
99 | rcd.sec_hdr.section_severity = CPER_SER_FATAL; | ||
100 | |||
101 | memcpy(&rcd.mce, m, sizeof(*m)); | ||
102 | |||
103 | return erst_write(&rcd.hdr); | ||
104 | } | ||
105 | |||
106 | ssize_t apei_read_mce(struct mce *m, u64 *record_id) | ||
107 | { | ||
108 | struct cper_mce_record rcd; | ||
109 | ssize_t len; | ||
110 | |||
111 | len = erst_read_next(&rcd.hdr, sizeof(rcd)); | ||
112 | if (len <= 0) | ||
113 | return len; | ||
114 | /* Cannot skip other records in ERST storage without clearing them */ | ||
115 | else if (len != sizeof(rcd) || | ||
116 | uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { | ||
117 | if (printk_ratelimit()) | ||
118 | pr_warning( | ||
119 | "MCE-APEI: Can not skip the unknown record in ERST"); | ||
120 | return -EIO; | ||
121 | } | ||
122 | |||
123 | memcpy(m, &rcd.mce, sizeof(*m)); | ||
124 | *record_id = rcd.hdr.record_id; | ||
125 | |||
126 | return sizeof(*m); | ||
127 | } | ||
128 | |||
129 | /* Check whether there is a record in ERST */ | ||
130 | int apei_check_mce(void) | ||
131 | { | ||
132 | return erst_get_record_count(); | ||
133 | } | ||
134 | |||
135 | int apei_clear_mce(u64 record_id) | ||
136 | { | ||
137 | return erst_clear(record_id); | ||
138 | } | ||
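Together these helpers give mce.c a persistent path: apei_write_mce() wraps a struct mce in a CPER record and stores it via ERST at panic time, and after reboot the record can be read back and cleared. A sketch of the drain loop under that assumption (hypothetical helper; mce_read() in mce.c below does this for /dev/mcelog):

#include <linux/types.h>
#include <asm/mce.h>

#include "mce-internal.h"	/* apei_read_mce() / apei_clear_mce() */

static void drain_apei_mce_records(void)
{
	struct mce m;
	u64 record_id;

	while (apei_read_mce(&m, &record_id) > 0) {
		/* hand "m" to the normal MCE logging path here ... */
		apei_clear_mce(record_id);	/* so it is not read twice */
	}
}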
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 32996f9fab67..fefcc69ee8b5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
@@ -28,3 +28,26 @@ extern int mce_ser; | |||
28 | 28 | ||
29 | extern struct mce_bank *mce_banks; | 29 | extern struct mce_bank *mce_banks; |
30 | 30 | ||
31 | #ifdef CONFIG_ACPI_APEI | ||
32 | int apei_write_mce(struct mce *m); | ||
33 | ssize_t apei_read_mce(struct mce *m, u64 *record_id); | ||
34 | int apei_check_mce(void); | ||
35 | int apei_clear_mce(u64 record_id); | ||
36 | #else | ||
37 | static inline int apei_write_mce(struct mce *m) | ||
38 | { | ||
39 | return -EINVAL; | ||
40 | } | ||
41 | static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) | ||
42 | { | ||
43 | return 0; | ||
44 | } | ||
45 | static inline int apei_check_mce(void) | ||
46 | { | ||
47 | return 0; | ||
48 | } | ||
49 | static inline int apei_clear_mce(u64 record_id) | ||
50 | { | ||
51 | return -EINVAL; | ||
52 | } | ||
53 | #endif | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8a6f0afa767e..ed41562909fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/fs.h> | 36 | #include <linux/fs.h> |
37 | #include <linux/mm.h> | 37 | #include <linux/mm.h> |
38 | #include <linux/debugfs.h> | 38 | #include <linux/debugfs.h> |
39 | #include <linux/edac_mce.h> | ||
39 | 40 | ||
40 | #include <asm/processor.h> | 41 | #include <asm/processor.h> |
41 | #include <asm/hw_irq.h> | 42 | #include <asm/hw_irq.h> |
@@ -50,7 +51,7 @@ | |||
50 | static DEFINE_MUTEX(mce_read_mutex); | 51 | static DEFINE_MUTEX(mce_read_mutex); |
51 | 52 | ||
52 | #define rcu_dereference_check_mce(p) \ | 53 | #define rcu_dereference_check_mce(p) \ |
53 | rcu_dereference_check((p), \ | 54 | rcu_dereference_index_check((p), \ |
54 | rcu_read_lock_sched_held() || \ | 55 | rcu_read_lock_sched_held() || \ |
55 | lockdep_is_held(&mce_read_mutex)) | 56 | lockdep_is_held(&mce_read_mutex)) |
56 | 57 | ||
@@ -106,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); | |||
106 | static int default_decode_mce(struct notifier_block *nb, unsigned long val, | 107 | static int default_decode_mce(struct notifier_block *nb, unsigned long val, |
107 | void *data) | 108 | void *data) |
108 | { | 109 | { |
109 | pr_emerg("No human readable MCE decoding support on this CPU type.\n"); | 110 | pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); |
110 | pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); | 111 | pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); |
111 | 112 | ||
112 | return NOTIFY_STOP; | 113 | return NOTIFY_STOP; |
113 | } | 114 | } |
@@ -169,6 +170,15 @@ void mce_log(struct mce *mce) | |||
169 | entry = rcu_dereference_check_mce(mcelog.next); | 170 | entry = rcu_dereference_check_mce(mcelog.next); |
170 | for (;;) { | 171 | for (;;) { |
171 | /* | 172 | /* |
173 | * If edac_mce is enabled, it will check the error type | ||
174 | * and process it if it is a known error. | ||
175 | * Otherwise, the error will be sent through the mcelog | ||
176 | * interface. | ||
177 | */ | ||
178 | if (edac_mce_parse(mce)) | ||
179 | return; | ||
180 | |||
181 | /* | ||
172 | * When the buffer fills up discard new entries. | 182 | * When the buffer fills up discard new entries. |
173 | * Assume that the earlier errors are the more | 183 | * Assume that the earlier errors are the more |
174 | * interesting ones: | 184 | * interesting ones: |
@@ -201,11 +211,11 @@ void mce_log(struct mce *mce) | |||
201 | 211 | ||
202 | static void print_mce(struct mce *m) | 212 | static void print_mce(struct mce *m) |
203 | { | 213 | { |
204 | pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | 214 | pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", |
205 | m->extcpu, m->mcgstatus, m->bank, m->status); | 215 | m->extcpu, m->mcgstatus, m->bank, m->status); |
206 | 216 | ||
207 | if (m->ip) { | 217 | if (m->ip) { |
208 | pr_emerg("RIP%s %02x:<%016Lx> ", | 218 | pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", |
209 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | 219 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", |
210 | m->cs, m->ip); | 220 | m->cs, m->ip); |
211 | 221 | ||
@@ -214,14 +224,14 @@ static void print_mce(struct mce *m) | |||
214 | pr_cont("\n"); | 224 | pr_cont("\n"); |
215 | } | 225 | } |
216 | 226 | ||
217 | pr_emerg("TSC %llx ", m->tsc); | 227 | pr_emerg(HW_ERR "TSC %llx ", m->tsc); |
218 | if (m->addr) | 228 | if (m->addr) |
219 | pr_cont("ADDR %llx ", m->addr); | 229 | pr_cont("ADDR %llx ", m->addr); |
220 | if (m->misc) | 230 | if (m->misc) |
221 | pr_cont("MISC %llx ", m->misc); | 231 | pr_cont("MISC %llx ", m->misc); |
222 | 232 | ||
223 | pr_cont("\n"); | 233 | pr_cont("\n"); |
224 | pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | 234 | pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", |
225 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); | 235 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); |
226 | 236 | ||
227 | /* | 237 | /* |
@@ -231,16 +241,6 @@ static void print_mce(struct mce *m) | |||
231 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); | 241 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); |
232 | } | 242 | } |
233 | 243 | ||
234 | static void print_mce_head(void) | ||
235 | { | ||
236 | pr_emerg("\nHARDWARE ERROR\n"); | ||
237 | } | ||
238 | |||
239 | static void print_mce_tail(void) | ||
240 | { | ||
241 | pr_emerg("This is not a software problem!\n"); | ||
242 | } | ||
243 | |||
244 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | 244 | #define PANIC_TIMEOUT 5 /* 5 seconds */ |
245 | 245 | ||
246 | static atomic_t mce_paniced; | 246 | static atomic_t mce_paniced; |
@@ -264,7 +264,7 @@ static void wait_for_panic(void) | |||
264 | 264 | ||
265 | static void mce_panic(char *msg, struct mce *final, char *exp) | 265 | static void mce_panic(char *msg, struct mce *final, char *exp) |
266 | { | 266 | { |
267 | int i; | 267 | int i, apei_err = 0; |
268 | 268 | ||
269 | if (!fake_panic) { | 269 | if (!fake_panic) { |
270 | /* | 270 | /* |
@@ -281,14 +281,16 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
281 | if (atomic_inc_return(&mce_fake_paniced) > 1) | 281 | if (atomic_inc_return(&mce_fake_paniced) > 1) |
282 | return; | 282 | return; |
283 | } | 283 | } |
284 | print_mce_head(); | ||
285 | /* First print corrected ones that are still unlogged */ | 284 | /* First print corrected ones that are still unlogged */ |
286 | for (i = 0; i < MCE_LOG_LEN; i++) { | 285 | for (i = 0; i < MCE_LOG_LEN; i++) { |
287 | struct mce *m = &mcelog.entry[i]; | 286 | struct mce *m = &mcelog.entry[i]; |
288 | if (!(m->status & MCI_STATUS_VAL)) | 287 | if (!(m->status & MCI_STATUS_VAL)) |
289 | continue; | 288 | continue; |
290 | if (!(m->status & MCI_STATUS_UC)) | 289 | if (!(m->status & MCI_STATUS_UC)) { |
291 | print_mce(m); | 290 | print_mce(m); |
291 | if (!apei_err) | ||
292 | apei_err = apei_write_mce(m); | ||
293 | } | ||
292 | } | 294 | } |
293 | /* Now print uncorrected but with the final one last */ | 295 | /* Now print uncorrected but with the final one last */ |
294 | for (i = 0; i < MCE_LOG_LEN; i++) { | 296 | for (i = 0; i < MCE_LOG_LEN; i++) { |
@@ -297,22 +299,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
297 | continue; | 299 | continue; |
298 | if (!(m->status & MCI_STATUS_UC)) | 300 | if (!(m->status & MCI_STATUS_UC)) |
299 | continue; | 301 | continue; |
300 | if (!final || memcmp(m, final, sizeof(struct mce))) | 302 | if (!final || memcmp(m, final, sizeof(struct mce))) { |
301 | print_mce(m); | 303 | print_mce(m); |
304 | if (!apei_err) | ||
305 | apei_err = apei_write_mce(m); | ||
306 | } | ||
302 | } | 307 | } |
303 | if (final) | 308 | if (final) { |
304 | print_mce(final); | 309 | print_mce(final); |
310 | if (!apei_err) | ||
311 | apei_err = apei_write_mce(final); | ||
312 | } | ||
305 | if (cpu_missing) | 313 | if (cpu_missing) |
306 | printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); | 314 | pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); |
307 | print_mce_tail(); | ||
308 | if (exp) | 315 | if (exp) |
309 | printk(KERN_EMERG "Machine check: %s\n", exp); | 316 | pr_emerg(HW_ERR "Machine check: %s\n", exp); |
310 | if (!fake_panic) { | 317 | if (!fake_panic) { |
311 | if (panic_timeout == 0) | 318 | if (panic_timeout == 0) |
312 | panic_timeout = mce_panic_timeout; | 319 | panic_timeout = mce_panic_timeout; |
313 | panic(msg); | 320 | panic(msg); |
314 | } else | 321 | } else |
315 | printk(KERN_EMERG "Fake kernel panic: %s\n", msg); | 322 | pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); |
316 | } | 323 | } |
317 | 324 | ||
318 | /* Support code for software error injection */ | 325 | /* Support code for software error injection */ |
@@ -539,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
539 | struct mce m; | 546 | struct mce m; |
540 | int i; | 547 | int i; |
541 | 548 | ||
542 | __get_cpu_var(mce_poll_count)++; | 549 | percpu_inc(mce_poll_count); |
543 | 550 | ||
544 | mce_setup(&m); | 551 | mce_setup(&m); |
545 | 552 | ||
@@ -581,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
581 | */ | 588 | */ |
582 | if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { | 589 | if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { |
583 | mce_log(&m); | 590 | mce_log(&m); |
591 | atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); | ||
584 | add_taint(TAINT_MACHINE_CHECK); | 592 | add_taint(TAINT_MACHINE_CHECK); |
585 | } | 593 | } |
586 | 594 | ||
@@ -934,7 +942,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
934 | 942 | ||
935 | atomic_inc(&mce_entry); | 943 | atomic_inc(&mce_entry); |
936 | 944 | ||
937 | __get_cpu_var(mce_exception_count)++; | 945 | percpu_inc(mce_exception_count); |
938 | 946 | ||
939 | if (notify_die(DIE_NMI, "machine check", regs, error_code, | 947 | if (notify_die(DIE_NMI, "machine check", regs, error_code, |
940 | 18, SIGKILL) == NOTIFY_STOP) | 948 | 18, SIGKILL) == NOTIFY_STOP) |
@@ -1201,7 +1209,7 @@ int mce_notify_irq(void) | |||
1201 | schedule_work(&mce_trigger_work); | 1209 | schedule_work(&mce_trigger_work); |
1202 | 1210 | ||
1203 | if (__ratelimit(&ratelimit)) | 1211 | if (__ratelimit(&ratelimit)) |
1204 | printk(KERN_INFO "Machine check events logged\n"); | 1212 | pr_info(HW_ERR "Machine check events logged\n"); |
1205 | 1213 | ||
1206 | return 1; | 1214 | return 1; |
1207 | } | 1215 | } |
@@ -1493,6 +1501,43 @@ static void collect_tscs(void *data) | |||
1493 | rdtscll(cpu_tsc[smp_processor_id()]); | 1501 | rdtscll(cpu_tsc[smp_processor_id()]); |
1494 | } | 1502 | } |
1495 | 1503 | ||
1504 | static int mce_apei_read_done; | ||
1505 | |||
1506 | /* Collect MCE record of previous boot in persistent storage via APEI ERST. */ | ||
1507 | static int __mce_read_apei(char __user **ubuf, size_t usize) | ||
1508 | { | ||
1509 | int rc; | ||
1510 | u64 record_id; | ||
1511 | struct mce m; | ||
1512 | |||
1513 | if (usize < sizeof(struct mce)) | ||
1514 | return -EINVAL; | ||
1515 | |||
1516 | rc = apei_read_mce(&m, &record_id); | ||
1517 | /* Error or no more MCE records */ | ||
1518 | if (rc <= 0) { | ||
1519 | mce_apei_read_done = 1; | ||
1520 | return rc; | ||
1521 | } | ||
1522 | rc = -EFAULT; | ||
1523 | if (copy_to_user(*ubuf, &m, sizeof(struct mce))) | ||
1524 | return rc; | ||
1525 | /* | ||
1526 | * Ideally we should have cleared the record only after it has | ||
1527 | * been flushed to disk or sent over the network by | ||
1528 | * /sbin/mcelog, but we have no interface to support that now, | ||
1529 | * so just clear it to avoid duplication. | ||
1530 | */ | ||
1531 | rc = apei_clear_mce(record_id); | ||
1532 | if (rc) { | ||
1533 | mce_apei_read_done = 1; | ||
1534 | return rc; | ||
1535 | } | ||
1536 | *ubuf += sizeof(struct mce); | ||
1537 | |||
1538 | return 0; | ||
1539 | } | ||
1540 | |||
1496 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | 1541 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, |
1497 | loff_t *off) | 1542 | loff_t *off) |
1498 | { | 1543 | { |
@@ -1506,15 +1551,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | |||
1506 | return -ENOMEM; | 1551 | return -ENOMEM; |
1507 | 1552 | ||
1508 | mutex_lock(&mce_read_mutex); | 1553 | mutex_lock(&mce_read_mutex); |
1554 | |||
1555 | if (!mce_apei_read_done) { | ||
1556 | err = __mce_read_apei(&buf, usize); | ||
1557 | if (err || buf != ubuf) | ||
1558 | goto out; | ||
1559 | } | ||
1560 | |||
1509 | next = rcu_dereference_check_mce(mcelog.next); | 1561 | next = rcu_dereference_check_mce(mcelog.next); |
1510 | 1562 | ||
1511 | /* Only supports full reads right now */ | 1563 | /* Only supports full reads right now */ |
1512 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | 1564 | err = -EINVAL; |
1513 | mutex_unlock(&mce_read_mutex); | 1565 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) |
1514 | kfree(cpu_tsc); | 1566 | goto out; |
1515 | |||
1516 | return -EINVAL; | ||
1517 | } | ||
1518 | 1567 | ||
1519 | err = 0; | 1568 | err = 0; |
1520 | prev = 0; | 1569 | prev = 0; |
@@ -1562,10 +1611,15 @@ timeout: | |||
1562 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | 1611 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); |
1563 | } | 1612 | } |
1564 | } | 1613 | } |
1614 | |||
1615 | if (err) | ||
1616 | err = -EFAULT; | ||
1617 | |||
1618 | out: | ||
1565 | mutex_unlock(&mce_read_mutex); | 1619 | mutex_unlock(&mce_read_mutex); |
1566 | kfree(cpu_tsc); | 1620 | kfree(cpu_tsc); |
1567 | 1621 | ||
1568 | return err ? -EFAULT : buf - ubuf; | 1622 | return err ? err : buf - ubuf; |
1569 | } | 1623 | } |
1570 | 1624 | ||
1571 | static unsigned int mce_poll(struct file *file, poll_table *wait) | 1625 | static unsigned int mce_poll(struct file *file, poll_table *wait) |
@@ -1573,6 +1627,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) | |||
1573 | poll_wait(file, &mce_wait, wait); | 1627 | poll_wait(file, &mce_wait, wait); |
1574 | if (rcu_dereference_check_mce(mcelog.next)) | 1628 | if (rcu_dereference_check_mce(mcelog.next)) |
1575 | return POLLIN | POLLRDNORM; | 1629 | return POLLIN | POLLRDNORM; |
1630 | if (!mce_apei_read_done && apei_check_mce()) | ||
1631 | return POLLIN | POLLRDNORM; | ||
1576 | return 0; | 1632 | return 0; |
1577 | } | 1633 | } |
1578 | 1634 | ||
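Taken together, the mce_read() and mce_poll() changes mean a /dev/mcelog reader now sees records recovered from APEI ERST ahead of the in-memory buffer, and poll() wakes up for them too. A hypothetical user-space sketch of the consuming side (the buffer must still be large enough for a full MCE_LOG_LEN read of the in-memory log):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

#define BUF_SZ  (32 * 1024)     /* comfortably above MCE_LOG_LEN records */

int drain_mcelog(void)
{
        static char buf[BUF_SZ];
        struct pollfd pfd;
        ssize_t n;

        pfd.fd = open("/dev/mcelog", O_RDONLY);
        if (pfd.fd < 0)
                return -1;
        pfd.events = POLLIN;

        /* APEI-backed records come back one per read() and are cleared
         * from persistent storage, so they are never returned twice. */
        while (poll(&pfd, 1, 0) > 0 && (n = read(pfd.fd, buf, BUF_SZ)) > 0)
                printf("read %zd bytes of MCE records\n", n);

        close(pfd.fd);
        return 0;
}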
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920a..6fcd0936194f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot) | |||
95 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 95 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
96 | 96 | ||
97 | /* Already owned by someone else? */ | 97 | /* Already owned by someone else? */ |
98 | if (val & CMCI_EN) { | 98 | if (val & MCI_CTL2_CMCI_EN) { |
99 | if (test_and_clear_bit(i, owned) && !boot) | 99 | if (test_and_clear_bit(i, owned) && !boot) |
100 | print_update("SHD", &hdr, i); | 100 | print_update("SHD", &hdr, i); |
101 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 101 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
102 | continue; | 102 | continue; |
103 | } | 103 | } |
104 | 104 | ||
105 | val |= CMCI_EN | CMCI_THRESHOLD; | 105 | val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; |
106 | val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; | ||
106 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 107 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
107 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 108 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
108 | 109 | ||
109 | /* Did the enable bit stick? -- the bank supports CMCI */ | 110 | /* Did the enable bit stick? -- the bank supports CMCI */ |
110 | if (val & CMCI_EN) { | 111 | if (val & MCI_CTL2_CMCI_EN) { |
111 | if (!test_and_set_bit(i, owned) && !boot) | 112 | if (!test_and_set_bit(i, owned) && !boot) |
112 | print_update("CMCI", &hdr, i); | 113 | print_update("CMCI", &hdr, i); |
113 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | 114 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); |
@@ -155,7 +156,7 @@ void cmci_clear(void) | |||
155 | continue; | 156 | continue; |
156 | /* Disable CMCI */ | 157 | /* Disable CMCI */ |
157 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); | 158 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
158 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | 159 | val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); |
159 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); | 160 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
160 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | 161 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); |
161 | } | 162 | } |
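The key change in cmci_discover() is the new masking line: without it, OR-ing in CMCI_THRESHOLD could merge with a threshold value the BIOS already left in the register. The read-modify-write pattern, shown standalone (bank is a hypothetical bank index):

        u64 val;

        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
        val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;   /* drop any stale threshold */
        val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);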
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 81c499eceb21..c2a8b26d4fea 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -34,15 +34,25 @@ | |||
34 | /* How long to wait between reporting thermal events */ | 34 | /* How long to wait between reporting thermal events */ |
35 | #define CHECK_INTERVAL (300 * HZ) | 35 | #define CHECK_INTERVAL (300 * HZ) |
36 | 36 | ||
37 | #define THERMAL_THROTTLING_EVENT 0 | ||
38 | #define POWER_LIMIT_EVENT 1 | ||
39 | |||
37 | /* | 40 | /* |
38 | * Current thermal throttling state: | 41 | * Current thermal event state: |
39 | */ | 42 | */ |
40 | struct thermal_state { | 43 | struct _thermal_state { |
41 | bool is_throttled; | 44 | bool new_event; |
42 | 45 | int event; | |
43 | u64 next_check; | 46 | u64 next_check; |
44 | unsigned long throttle_count; | 47 | unsigned long count; |
45 | unsigned long last_throttle_count; | 48 | unsigned long last_count; |
49 | }; | ||
50 | |||
51 | struct thermal_state { | ||
52 | struct _thermal_state core_throttle; | ||
53 | struct _thermal_state core_power_limit; | ||
54 | struct _thermal_state package_throttle; | ||
55 | struct _thermal_state package_power_limit; | ||
46 | }; | 56 | }; |
47 | 57 | ||
48 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); | 58 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); |
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly; | |||
53 | 63 | ||
54 | #ifdef CONFIG_SYSFS | 64 | #ifdef CONFIG_SYSFS |
55 | #define define_therm_throt_sysdev_one_ro(_name) \ | 65 | #define define_therm_throt_sysdev_one_ro(_name) \ |
56 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) | 66 | static SYSDEV_ATTR(_name, 0444, \ |
67 | therm_throt_sysdev_show_##_name, \ | ||
68 | NULL) \ | ||
57 | 69 | ||
58 | #define define_therm_throt_sysdev_show_func(name) \ | 70 | #define define_therm_throt_sysdev_show_func(event, name) \ |
59 | \ | 71 | \ |
60 | static ssize_t therm_throt_sysdev_show_##name( \ | 72 | static ssize_t therm_throt_sysdev_show_##event##_##name( \ |
61 | struct sys_device *dev, \ | 73 | struct sys_device *dev, \ |
62 | struct sysdev_attribute *attr, \ | 74 | struct sysdev_attribute *attr, \ |
63 | char *buf) \ | 75 | char *buf) \ |
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \ | |||
66 | ssize_t ret; \ | 78 | ssize_t ret; \ |
67 | \ | 79 | \ |
68 | preempt_disable(); /* CPU hotplug */ \ | 80 | preempt_disable(); /* CPU hotplug */ \ |
69 | if (cpu_online(cpu)) \ | 81 | if (cpu_online(cpu)) { \ |
70 | ret = sprintf(buf, "%lu\n", \ | 82 | ret = sprintf(buf, "%lu\n", \ |
71 | per_cpu(thermal_state, cpu).name); \ | 83 | per_cpu(thermal_state, cpu).event.name); \ |
72 | else \ | 84 | } else \ |
73 | ret = 0; \ | 85 | ret = 0; \ |
74 | preempt_enable(); \ | 86 | preempt_enable(); \ |
75 | \ | 87 | \ |
76 | return ret; \ | 88 | return ret; \ |
77 | } | 89 | } |
78 | 90 | ||
79 | define_therm_throt_sysdev_show_func(throttle_count); | 91 | define_therm_throt_sysdev_show_func(core_throttle, count); |
80 | define_therm_throt_sysdev_one_ro(throttle_count); | 92 | define_therm_throt_sysdev_one_ro(core_throttle_count); |
93 | |||
94 | define_therm_throt_sysdev_show_func(core_power_limit, count); | ||
95 | define_therm_throt_sysdev_one_ro(core_power_limit_count); | ||
96 | |||
97 | define_therm_throt_sysdev_show_func(package_throttle, count); | ||
98 | define_therm_throt_sysdev_one_ro(package_throttle_count); | ||
99 | |||
100 | define_therm_throt_sysdev_show_func(package_power_limit, count); | ||
101 | define_therm_throt_sysdev_one_ro(package_power_limit_count); | ||
81 | 102 | ||
82 | static struct attribute *thermal_throttle_attrs[] = { | 103 | static struct attribute *thermal_throttle_attrs[] = { |
83 | &attr_throttle_count.attr, | 104 | &attr_core_throttle_count.attr, |
84 | NULL | 105 | NULL |
85 | }; | 106 | }; |
86 | 107 | ||
87 | static struct attribute_group thermal_throttle_attr_group = { | 108 | static struct attribute_group thermal_attr_group = { |
88 | .attrs = thermal_throttle_attrs, | 109 | .attrs = thermal_throttle_attrs, |
89 | .name = "thermal_throttle" | 110 | .name = "thermal_throttle" |
90 | }; | 111 | }; |
91 | #endif /* CONFIG_SYSFS */ | 112 | #endif /* CONFIG_SYSFS */ |
92 | 113 | ||
114 | #define CORE_LEVEL 0 | ||
115 | #define PACKAGE_LEVEL 1 | ||
116 | |||
93 | /*** | 117 | /*** |
94 | * therm_throt_process - Process thermal throttling event from interrupt | 118 | * therm_throt_process - Process thermal throttling event from interrupt |
95 | * @curr: Whether the condition is current or not (boolean), since the | 119 | * @curr: Whether the condition is current or not (boolean), since the |
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { | |||
106 | * 1 : Event should be logged further, and a message has been | 130 | * 1 : Event should be logged further, and a message has been |
107 | * printed to the syslog. | 131 | * printed to the syslog. |
108 | */ | 132 | */ |
109 | static int therm_throt_process(bool is_throttled) | 133 | static int therm_throt_process(bool new_event, int event, int level) |
110 | { | 134 | { |
111 | struct thermal_state *state; | 135 | struct _thermal_state *state; |
112 | unsigned int this_cpu; | 136 | unsigned int this_cpu = smp_processor_id(); |
113 | bool was_throttled; | 137 | bool old_event; |
114 | u64 now; | 138 | u64 now; |
139 | struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); | ||
115 | 140 | ||
116 | this_cpu = smp_processor_id(); | ||
117 | now = get_jiffies_64(); | 141 | now = get_jiffies_64(); |
118 | state = &per_cpu(thermal_state, this_cpu); | 142 | if (level == CORE_LEVEL) { |
143 | if (event == THERMAL_THROTTLING_EVENT) | ||
144 | state = &pstate->core_throttle; | ||
145 | else if (event == POWER_LIMIT_EVENT) | ||
146 | state = &pstate->core_power_limit; | ||
147 | else | ||
148 | return 0; | ||
149 | } else if (level == PACKAGE_LEVEL) { | ||
150 | if (event == THERMAL_THROTTLING_EVENT) | ||
151 | state = &pstate->package_throttle; | ||
152 | else if (event == POWER_LIMIT_EVENT) | ||
153 | state = &pstate->package_power_limit; | ||
154 | else | ||
155 | return 0; | ||
156 | } else | ||
157 | return 0; | ||
119 | 158 | ||
120 | was_throttled = state->is_throttled; | 159 | old_event = state->new_event; |
121 | state->is_throttled = is_throttled; | 160 | state->new_event = new_event; |
122 | 161 | ||
123 | if (is_throttled) | 162 | if (new_event) |
124 | state->throttle_count++; | 163 | state->count++; |
125 | 164 | ||
126 | if (time_before64(now, state->next_check) && | 165 | if (time_before64(now, state->next_check) && |
127 | state->throttle_count != state->last_throttle_count) | 166 | state->count != state->last_count) |
128 | return 0; | 167 | return 0; |
129 | 168 | ||
130 | state->next_check = now + CHECK_INTERVAL; | 169 | state->next_check = now + CHECK_INTERVAL; |
131 | state->last_throttle_count = state->throttle_count; | 170 | state->last_count = state->count; |
132 | 171 | ||
133 | /* if we just entered the thermal event */ | 172 | /* if we just entered the thermal event */ |
134 | if (is_throttled) { | 173 | if (new_event) { |
135 | printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); | 174 | if (event == THERMAL_THROTTLING_EVENT) |
175 | printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", | ||
176 | this_cpu, | ||
177 | level == CORE_LEVEL ? "Core" : "Package", | ||
178 | state->count); | ||
179 | else | ||
180 | printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", | ||
181 | this_cpu, | ||
182 | level == CORE_LEVEL ? "Core" : "Package", | ||
183 | state->count); | ||
136 | 184 | ||
137 | add_taint(TAINT_MACHINE_CHECK); | 185 | add_taint(TAINT_MACHINE_CHECK); |
138 | return 1; | 186 | return 1; |
139 | } | 187 | } |
140 | if (was_throttled) { | 188 | if (old_event) { |
141 | printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); | 189 | if (event == THERMAL_THROTTLING_EVENT) |
190 | printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", | ||
191 | this_cpu, | ||
192 | level == CORE_LEVEL ? "Core" : "Package"); | ||
193 | else | ||
194 | printk(KERN_INFO "CPU%d: %s power limit normal\n", | ||
195 | this_cpu, | ||
196 | level == CORE_LEVEL ? "Core" : "Package"); | ||
142 | return 1; | 197 | return 1; |
143 | } | 198 | } |
144 | 199 | ||
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled) | |||
149 | /* Add/Remove thermal_throttle interface for CPU device: */ | 204 | /* Add/Remove thermal_throttle interface for CPU device: */ |
150 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) | 205 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) |
151 | { | 206 | { |
152 | return sysfs_create_group(&sys_dev->kobj, | 207 | int err; |
153 | &thermal_throttle_attr_group); | 208 | struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); |
209 | |||
210 | err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); | ||
211 | if (err) | ||
212 | return err; | ||
213 | |||
214 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
215 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
216 | &attr_core_power_limit_count.attr, | ||
217 | thermal_attr_group.name); | ||
218 | if (cpu_has(c, X86_FEATURE_PTS)) | ||
219 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
220 | &attr_package_throttle_count.attr, | ||
221 | thermal_attr_group.name); | ||
222 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
223 | err = sysfs_add_file_to_group(&sys_dev->kobj, | ||
224 | &attr_package_power_limit_count.attr, | ||
225 | thermal_attr_group.name); | ||
226 | |||
227 | return err; | ||
154 | } | 228 | } |
155 | 229 | ||
156 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) | 230 | static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) |
157 | { | 231 | { |
158 | sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); | 232 | sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); |
159 | } | 233 | } |
160 | 234 | ||
161 | /* Mutex protecting device creation against CPU hotplug: */ | 235 | /* Mutex protecting device creation against CPU hotplug: */ |
@@ -190,7 +264,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, | |||
190 | mutex_unlock(&therm_cpu_lock); | 264 | mutex_unlock(&therm_cpu_lock); |
191 | break; | 265 | break; |
192 | } | 266 | } |
193 | return err ? NOTIFY_BAD : NOTIFY_OK; | 267 | return notifier_from_errno(err); |
194 | } | 268 | } |
195 | 269 | ||
196 | static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = | 270 | static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = |
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device); | |||
226 | 300 | ||
227 | #endif /* CONFIG_SYSFS */ | 301 | #endif /* CONFIG_SYSFS */ |
228 | 302 | ||
303 | /* | ||
304 | * Set up the two most significant bits to tell the mce log which thermal | ||
305 | * event type this is. | ||
306 | * This is a temporary solution. It may be changed in the future with the | ||
307 | * mce log infrastructure. | ||
308 | */ | ||
309 | #define CORE_THROTTLED (0) | ||
310 | #define CORE_POWER_LIMIT ((__u64)1 << 62) | ||
311 | #define PACKAGE_THROTTLED ((__u64)2 << 62) | ||
312 | #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) | ||
313 | |||
229 | /* Thermal transition interrupt handler */ | 314 | /* Thermal transition interrupt handler */ |
230 | static void intel_thermal_interrupt(void) | 315 | static void intel_thermal_interrupt(void) |
231 | { | 316 | { |
232 | __u64 msr_val; | 317 | __u64 msr_val; |
318 | struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); | ||
233 | 319 | ||
234 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 320 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
235 | if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) | 321 | |
236 | mce_log_therm_throt_event(msr_val); | 322 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, |
323 | THERMAL_THROTTLING_EVENT, | ||
324 | CORE_LEVEL) != 0) | ||
325 | mce_log_therm_throt_event(CORE_THROTTLED | msr_val); | ||
326 | |||
327 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
328 | if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, | ||
329 | POWER_LIMIT_EVENT, | ||
330 | CORE_LEVEL) != 0) | ||
331 | mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); | ||
332 | |||
333 | if (cpu_has(c, X86_FEATURE_PTS)) { | ||
334 | rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); | ||
335 | if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, | ||
336 | THERMAL_THROTTLING_EVENT, | ||
337 | PACKAGE_LEVEL) != 0) | ||
338 | mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); | ||
339 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
340 | if (therm_throt_process(msr_val & | ||
341 | PACKAGE_THERM_STATUS_POWER_LIMIT, | ||
342 | POWER_LIMIT_EVENT, | ||
343 | PACKAGE_LEVEL) != 0) | ||
344 | mce_log_therm_throt_event(PACKAGE_POWER_LIMIT | ||
345 | | msr_val); | ||
346 | } | ||
237 | } | 347 | } |
238 | 348 | ||
239 | static void unexpected_thermal_interrupt(void) | 349 | static void unexpected_thermal_interrupt(void) |
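Since the two event-type bits defined above are OR-ed into the value passed to mce_log_therm_throt_event(), a consumer of the log can recover the event type from bits 63:62, assuming those status bits are otherwise unused. A small sketch under that assumption; the helper is illustrative:

static const char *therm_event_name(__u64 logged)
{
        switch (logged >> 62) {
        case 0: return "core throttled";
        case 1: return "core power limit";
        case 2: return "package throttled";
        case 3: return "package power limit";
        }
        return "unknown";       /* unreachable: only 4 two-bit values */
}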
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c) | |||
335 | apic_write(APIC_LVTTHMR, h); | 445 | apic_write(APIC_LVTTHMR, h); |
336 | 446 | ||
337 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | 447 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); |
338 | wrmsr(MSR_IA32_THERM_INTERRUPT, | 448 | if (cpu_has(c, X86_FEATURE_PLN)) |
339 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | 449 | wrmsr(MSR_IA32_THERM_INTERRUPT, |
450 | l | (THERM_INT_LOW_ENABLE | ||
451 | | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); | ||
452 | else | ||
453 | wrmsr(MSR_IA32_THERM_INTERRUPT, | ||
454 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | ||
455 | |||
456 | if (cpu_has(c, X86_FEATURE_PTS)) { | ||
457 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); | ||
458 | if (cpu_has(c, X86_FEATURE_PLN)) | ||
459 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, | ||
460 | l | (PACKAGE_THERM_INT_LOW_ENABLE | ||
461 | | PACKAGE_THERM_INT_HIGH_ENABLE | ||
462 | | PACKAGE_THERM_INT_PLN_ENABLE), h); | ||
463 | else | ||
464 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, | ||
465 | l | (PACKAGE_THERM_INT_LOW_ENABLE | ||
466 | | PACKAGE_THERM_INT_HIGH_ENABLE), h); | ||
467 | } | ||
340 | 468 | ||
341 | smp_thermal_vector = intel_thermal_interrupt; | 469 | smp_thermal_vector = intel_thermal_interrupt; |
342 | 470 | ||
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c new file mode 100644 index 000000000000..d944bf6c50e9 --- /dev/null +++ b/arch/x86/kernel/cpu/mshyperv.c | |||
@@ -0,0 +1,56 @@ | |||
1 | /* | ||
2 | * HyperV Detection code. | ||
3 | * | ||
4 | * Copyright (C) 2010, Novell, Inc. | ||
5 | * Author : K. Y. Srinivasan <ksrinivasan@novell.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; version 2 of the License. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/types.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <asm/processor.h> | ||
16 | #include <asm/hypervisor.h> | ||
17 | #include <asm/hyperv.h> | ||
18 | #include <asm/mshyperv.h> | ||
19 | |||
20 | struct ms_hyperv_info ms_hyperv; | ||
21 | EXPORT_SYMBOL_GPL(ms_hyperv); | ||
22 | |||
23 | static bool __init ms_hyperv_platform(void) | ||
24 | { | ||
25 | u32 eax; | ||
26 | u32 hyp_signature[3]; | ||
27 | |||
28 | if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) | ||
29 | return false; | ||
30 | |||
31 | cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, | ||
32 | &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); | ||
33 | |||
34 | return eax >= HYPERV_CPUID_MIN && | ||
35 | eax <= HYPERV_CPUID_MAX && | ||
36 | !memcmp("Microsoft Hv", hyp_signature, 12); | ||
37 | } | ||
38 | |||
39 | static void __init ms_hyperv_init_platform(void) | ||
40 | { | ||
41 | /* | ||
42 | * Extract the features and hints | ||
43 | */ | ||
44 | ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); | ||
45 | ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); | ||
46 | |||
47 | printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", | ||
48 | ms_hyperv.features, ms_hyperv.hints); | ||
49 | } | ||
50 | |||
51 | const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { | ||
52 | .name = "Microsoft HyperV", | ||
53 | .detect = ms_hyperv_platform, | ||
54 | .init_platform = ms_hyperv_init_platform, | ||
55 | }; | ||
56 | EXPORT_SYMBOL(x86_hyper_ms_hyperv); | ||
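x86_hyper_ms_hyperv only supplies the hooks; the generic hypervisor framework is assumed to walk its table of known platforms and keep the first one whose detect() fires. A hedged sketch of that shape (names are illustrative, not the framework's actual code):

static const struct hypervisor_x86 *
probe_hypervisor(const struct hypervisor_x86 *hypers[], int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (hypers[i]->detect())
                        return hypers[i];       /* first match wins */
        return NULL;
}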
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f012..c5f59d071425 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c | |||
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) | |||
632 | unsigned long gran_base, chunk_base, lose_base; | 632 | unsigned long gran_base, chunk_base, lose_base; |
633 | char gran_factor, chunk_factor, lose_factor; | 633 | char gran_factor, chunk_factor, lose_factor; |
634 | 634 | ||
635 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), | 635 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); |
636 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), | 636 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); |
637 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), | 637 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); |
638 | 638 | ||
639 | pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", | 639 | pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", |
640 | result[i].bad ? "*BAD*" : " ", | 640 | result[i].bad ? "*BAD*" : " ", |
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61c..7d28d7d03885 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, | |||
433 | { | 433 | { |
434 | unsigned int mask_lo, mask_hi, base_lo, base_hi; | 434 | unsigned int mask_lo, mask_hi, base_lo, base_hi; |
435 | unsigned int tmp, hi; | 435 | unsigned int tmp, hi; |
436 | int cpu; | ||
437 | 436 | ||
438 | /* | 437 | /* |
439 | * get_mtrr doesn't need to update mtrr_state, also it could be called | 438 | * get_mtrr doesn't need to update mtrr_state, also it could be called |
440 | * from any cpu, so try to print it out directly. | 439 | * from any cpu, so try to print it out directly. |
441 | */ | 440 | */ |
442 | cpu = get_cpu(); | 441 | get_cpu(); |
443 | 442 | ||
444 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); | 443 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); |
445 | 444 | ||
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b602..01c0f3ee6cc3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -35,6 +35,7 @@ | |||
35 | 35 | ||
36 | #include <linux/types.h> /* FIXME: kvm_para.h needs this */ | 36 | #include <linux/types.h> /* FIXME: kvm_para.h needs this */ |
37 | 37 | ||
38 | #include <linux/stop_machine.h> | ||
38 | #include <linux/kvm_para.h> | 39 | #include <linux/kvm_para.h> |
39 | #include <linux/uaccess.h> | 40 | #include <linux/uaccess.h> |
40 | #include <linux/module.h> | 41 | #include <linux/module.h> |
@@ -143,22 +144,28 @@ struct set_mtrr_data { | |||
143 | mtrr_type smp_type; | 144 | mtrr_type smp_type; |
144 | }; | 145 | }; |
145 | 146 | ||
147 | static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); | ||
148 | |||
146 | /** | 149 | /** |
147 | * ipi_handler - Synchronisation handler. Executed by "other" CPUs. | 150 | * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. |
148 | * @info: pointer to mtrr configuration data | 151 | * @info: pointer to mtrr configuration data |
149 | * | 152 | * |
150 | * Returns nothing. | 153 | * Returns nothing. |
151 | */ | 154 | */ |
152 | static void ipi_handler(void *info) | 155 | static int mtrr_work_handler(void *info) |
153 | { | 156 | { |
154 | #ifdef CONFIG_SMP | 157 | #ifdef CONFIG_SMP |
155 | struct set_mtrr_data *data = info; | 158 | struct set_mtrr_data *data = info; |
156 | unsigned long flags; | 159 | unsigned long flags; |
157 | 160 | ||
161 | atomic_dec(&data->count); | ||
162 | while (!atomic_read(&data->gate)) | ||
163 | cpu_relax(); | ||
164 | |||
158 | local_irq_save(flags); | 165 | local_irq_save(flags); |
159 | 166 | ||
160 | atomic_dec(&data->count); | 167 | atomic_dec(&data->count); |
161 | while (!atomic_read(&data->gate)) | 168 | while (atomic_read(&data->gate)) |
162 | cpu_relax(); | 169 | cpu_relax(); |
163 | 170 | ||
164 | /* The master has cleared me to execute */ | 171 | /* The master has cleared me to execute */ |
@@ -173,12 +180,13 @@ static void ipi_handler(void *info) | |||
173 | } | 180 | } |
174 | 181 | ||
175 | atomic_dec(&data->count); | 182 | atomic_dec(&data->count); |
176 | while (atomic_read(&data->gate)) | 183 | while (!atomic_read(&data->gate)) |
177 | cpu_relax(); | 184 | cpu_relax(); |
178 | 185 | ||
179 | atomic_dec(&data->count); | 186 | atomic_dec(&data->count); |
180 | local_irq_restore(flags); | 187 | local_irq_restore(flags); |
181 | #endif | 188 | #endif |
189 | return 0; | ||
182 | } | 190 | } |
183 | 191 | ||
184 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) | 192 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) |
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) | |||
198 | * | 206 | * |
199 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: | 207 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: |
200 | * | 208 | * |
201 | * 1. Send IPI to do the following: | 209 | * 1. Queue work to do the following on all processors: |
202 | * 2. Disable Interrupts | 210 | * 2. Disable Interrupts |
203 | * 3. Wait for all procs to do so | 211 | * 3. Wait for all procs to do so |
204 | * 4. Enter no-fill cache mode | 212 | * 4. Enter no-fill cache mode |
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) | |||
215 | * 15. Enable interrupts. | 223 | * 15. Enable interrupts. |
216 | * | 224 | * |
217 | * What does that mean for us? Well, first we set data.count to the number | 225 | * What does that mean for us? Well, first we set data.count to the number |
218 | * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait | 226 | * of CPUs. As each CPU announces that it started the rendezvous handler by |
219 | * until it hits 0 and proceed. We set the data.gate flag and reset data.count. | 227 | * decrementing the count, We reset data.count and set the data.gate flag |
220 | * Meanwhile, they are waiting for that flag to be set. Once it's set, each | 228 | * allowing all the cpu's to proceed with the work. As each cpu disables |
229 | * interrupts, it'll decrement data.count once. We wait until it hits 0 and | ||
230 | * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they | ||
231 | * are waiting for that flag to be cleared. Once it's cleared, each | ||
221 | * CPU goes through the transition of updating MTRRs. | 232 | * CPU goes through the transition of updating MTRRs. |
222 | * The CPU vendors may each do it differently, | 233 | * The CPU vendors may each do it differently, |
223 | * so we call mtrr_if->set() callback and let them take care of it. | 234 | * so we call mtrr_if->set() callback and let them take care of it. |
224 | * When they're done, they again decrement data->count and wait for data.gate | 235 | * When they're done, they again decrement data->count and wait for data.gate |
225 | * to be reset. | 236 | * to be set. |
226 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag | 237 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag |
227 | * Everyone then enables interrupts and we all continue on. | 238 | * Everyone then enables interrupts and we all continue on. |
228 | * | 239 | * |
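The count/gate protocol described above is a reusable two-phase rendezvous: slaves announce arrival by decrementing a counter and spin on the gate, while the master waits for the counter to drain, re-arms it, then toggles the gate to release everyone into the next phase. A standalone sketch of the idea in C11 atomics (not the kernel's implementation):

#include <stdatomic.h>

struct rendezvous {
        atomic_int count;       /* CPUs yet to arrive at this phase */
        atomic_int gate;        /* toggled by the master to open a phase */
};

static void slave_arrive(struct rendezvous *r, int open)
{
        atomic_fetch_sub(&r->count, 1);         /* announce arrival */
        while (atomic_load(&r->gate) != open)   /* wait to be released */
                ;
}

static void master_release(struct rendezvous *r, int ncpus, int open)
{
        while (atomic_load(&r->count) != 0)     /* wait for all slaves */
                ;
        atomic_store(&r->count, ncpus - 1);     /* re-arm for next phase */
        atomic_store(&r->gate, open);           /* release the slaves */
}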
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
234 | { | 245 | { |
235 | struct set_mtrr_data data; | 246 | struct set_mtrr_data data; |
236 | unsigned long flags; | 247 | unsigned long flags; |
248 | int cpu; | ||
249 | |||
250 | preempt_disable(); | ||
237 | 251 | ||
238 | data.smp_reg = reg; | 252 | data.smp_reg = reg; |
239 | data.smp_base = base; | 253 | data.smp_base = base; |
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
246 | atomic_set(&data.gate, 0); | 260 | atomic_set(&data.gate, 0); |
247 | 261 | ||
248 | /* Start the ball rolling on other CPUs */ | 262 | /* Start the ball rolling on other CPUs */ |
249 | if (smp_call_function(ipi_handler, &data, 0) != 0) | 263 | for_each_online_cpu(cpu) { |
250 | panic("mtrr: timed out waiting for other CPUs\n"); | 264 | struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); |
265 | |||
266 | if (cpu == smp_processor_id()) | ||
267 | continue; | ||
268 | |||
269 | stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); | ||
270 | } | ||
251 | 271 | ||
252 | local_irq_save(flags); | ||
253 | 272 | ||
254 | while (atomic_read(&data.count)) | 273 | while (atomic_read(&data.count)) |
255 | cpu_relax(); | 274 | cpu_relax(); |
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
259 | smp_wmb(); | 278 | smp_wmb(); |
260 | atomic_set(&data.gate, 1); | 279 | atomic_set(&data.gate, 1); |
261 | 280 | ||
281 | local_irq_save(flags); | ||
282 | |||
283 | while (atomic_read(&data.count)) | ||
284 | cpu_relax(); | ||
285 | |||
286 | /* Ok, reset count and toggle gate */ | ||
287 | atomic_set(&data.count, num_booting_cpus() - 1); | ||
288 | smp_wmb(); | ||
289 | atomic_set(&data.gate, 0); | ||
290 | |||
262 | /* Do our MTRR business */ | 291 | /* Do our MTRR business */ |
263 | 292 | ||
264 | /* | 293 | /* |
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
279 | 308 | ||
280 | atomic_set(&data.count, num_booting_cpus() - 1); | 309 | atomic_set(&data.count, num_booting_cpus() - 1); |
281 | smp_wmb(); | 310 | smp_wmb(); |
282 | atomic_set(&data.gate, 0); | 311 | atomic_set(&data.gate, 1); |
283 | 312 | ||
284 | /* | 313 | /* |
285 | * Wait here for everyone to have seen the gate change | 314 | * Wait here for everyone to have seen the gate change |
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ | |||
289 | cpu_relax(); | 318 | cpu_relax(); |
290 | 319 | ||
291 | local_irq_restore(flags); | 320 | local_irq_restore(flags); |
321 | preempt_enable(); | ||
292 | } | 322 | } |
293 | 323 | ||
294 | /** | 324 | /** |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index db5bdc8addf8..f2da20fda02d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -31,46 +31,51 @@ | |||
31 | #include <asm/nmi.h> | 31 | #include <asm/nmi.h> |
32 | #include <asm/compat.h> | 32 | #include <asm/compat.h> |
33 | 33 | ||
34 | static u64 perf_event_mask __read_mostly; | 34 | #if 0 |
35 | #undef wrmsrl | ||
36 | #define wrmsrl(msr, val) \ | ||
37 | do { \ | ||
38 | trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ | ||
39 | (unsigned long)(val)); \ | ||
40 | native_write_msr((msr), (u32)((u64)(val)), \ | ||
41 | (u32)((u64)(val) >> 32)); \ | ||
42 | } while (0) | ||
43 | #endif | ||
35 | 44 | ||
36 | /* The maximal number of PEBS events: */ | 45 | /* |
37 | #define MAX_PEBS_EVENTS 4 | 46 | * best effort, GUP based copy_from_user() that assumes IRQ or NMI context |
47 | */ | ||
48 | static unsigned long | ||
49 | copy_from_user_nmi(void *to, const void __user *from, unsigned long n) | ||
50 | { | ||
51 | unsigned long offset, addr = (unsigned long)from; | ||
52 | int type = in_nmi() ? KM_NMI : KM_IRQ0; | ||
53 | unsigned long size, len = 0; | ||
54 | struct page *page; | ||
55 | void *map; | ||
56 | int ret; | ||
38 | 57 | ||
39 | /* The size of a BTS record in bytes: */ | 58 | do { |
40 | #define BTS_RECORD_SIZE 24 | 59 | ret = __get_user_pages_fast(addr, 1, 0, &page); |
60 | if (!ret) | ||
61 | break; | ||
41 | 62 | ||
42 | /* The size of a per-cpu BTS buffer in bytes: */ | 63 | offset = addr & (PAGE_SIZE - 1); |
43 | #define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) | 64 | size = min(PAGE_SIZE - offset, n - len); |
44 | 65 | ||
45 | /* The BTS overflow threshold in bytes from the end of the buffer: */ | 66 | map = kmap_atomic(page, type); |
46 | #define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) | 67 | memcpy(to, map+offset, size); |
68 | kunmap_atomic(map, type); | ||
69 | put_page(page); | ||
47 | 70 | ||
71 | len += size; | ||
72 | to += size; | ||
73 | addr += size; | ||
48 | 74 | ||
49 | /* | 75 | } while (len < n); |
50 | * Bits in the debugctlmsr controlling branch tracing. | ||
51 | */ | ||
52 | #define X86_DEBUGCTL_TR (1 << 6) | ||
53 | #define X86_DEBUGCTL_BTS (1 << 7) | ||
54 | #define X86_DEBUGCTL_BTINT (1 << 8) | ||
55 | #define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) | ||
56 | #define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) | ||
57 | 76 | ||
58 | /* | 77 | return len; |
59 | * A debug store configuration. | 78 | } |
60 | * | ||
61 | * We only support architectures that use 64bit fields. | ||
62 | */ | ||
63 | struct debug_store { | ||
64 | u64 bts_buffer_base; | ||
65 | u64 bts_index; | ||
66 | u64 bts_absolute_maximum; | ||
67 | u64 bts_interrupt_threshold; | ||
68 | u64 pebs_buffer_base; | ||
69 | u64 pebs_index; | ||
70 | u64 pebs_absolute_maximum; | ||
71 | u64 pebs_interrupt_threshold; | ||
72 | u64 pebs_event_reset[MAX_PEBS_EVENTS]; | ||
73 | }; | ||
74 | 79 | ||
75 | struct event_constraint { | 80 | struct event_constraint { |
76 | union { | 81 | union { |
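The copy_from_user_nmi() helper added above walks the source one page at a time so each chunk stays within a single kmap_atomic() mapping. The chunking rule, pulled out as a hypothetical helper: with a 4096-byte page, copying 6000 bytes from address 0x1000ff0 yields chunks of 16, 4096, and 1888 bytes.

static unsigned long chunk_size(unsigned long addr, unsigned long remaining)
{
        unsigned long offset = addr & (PAGE_SIZE - 1);
        unsigned long space = PAGE_SIZE - offset;       /* left in this page */

        return remaining < space ? remaining : space;
}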
@@ -89,18 +94,42 @@ struct amd_nb { | |||
89 | struct event_constraint event_constraints[X86_PMC_IDX_MAX]; | 94 | struct event_constraint event_constraints[X86_PMC_IDX_MAX]; |
90 | }; | 95 | }; |
91 | 96 | ||
97 | #define MAX_LBR_ENTRIES 16 | ||
98 | |||
92 | struct cpu_hw_events { | 99 | struct cpu_hw_events { |
100 | /* | ||
101 | * Generic x86 PMC bits | ||
102 | */ | ||
93 | struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ | 103 | struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ |
94 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | 104 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; |
95 | unsigned long interrupts; | ||
96 | int enabled; | 105 | int enabled; |
97 | struct debug_store *ds; | ||
98 | 106 | ||
99 | int n_events; | 107 | int n_events; |
100 | int n_added; | 108 | int n_added; |
109 | int n_txn; | ||
101 | int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ | 110 | int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ |
102 | u64 tags[X86_PMC_IDX_MAX]; | 111 | u64 tags[X86_PMC_IDX_MAX]; |
103 | struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ | 112 | struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ |
113 | |||
114 | unsigned int group_flag; | ||
115 | |||
116 | /* | ||
117 | * Intel DebugStore bits | ||
118 | */ | ||
119 | struct debug_store *ds; | ||
120 | u64 pebs_enabled; | ||
121 | |||
122 | /* | ||
123 | * Intel LBR bits | ||
124 | */ | ||
125 | int lbr_users; | ||
126 | void *lbr_context; | ||
127 | struct perf_branch_stack lbr_stack; | ||
128 | struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; | ||
129 | |||
130 | /* | ||
131 | * AMD specific bits | ||
132 | */ | ||
104 | struct amd_nb *amd_nb; | 133 | struct amd_nb *amd_nb; |
105 | }; | 134 | }; |
106 | 135 | ||
@@ -114,44 +143,75 @@ struct cpu_hw_events { | |||
114 | #define EVENT_CONSTRAINT(c, n, m) \ | 143 | #define EVENT_CONSTRAINT(c, n, m) \ |
115 | __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) | 144 | __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) |
116 | 145 | ||
146 | /* | ||
147 | * Constraint on the Event code. | ||
148 | */ | ||
117 | #define INTEL_EVENT_CONSTRAINT(c, n) \ | 149 | #define INTEL_EVENT_CONSTRAINT(c, n) \ |
118 | EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) | 150 | EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) |
119 | 151 | ||
152 | /* | ||
153 | * Constraint on the Event code + UMask + fixed-mask | ||
154 | * | ||
155 | * filter mask to validate fixed counter events. | ||
156 | * the following filters disqualify for fixed counters: | ||
157 | * - inv | ||
158 | * - edge | ||
159 | * - cnt-mask | ||
160 | * The other filters are supported by fixed counters. | ||
161 | * The any-thread option is supported starting with v3. | ||
162 | */ | ||
120 | #define FIXED_EVENT_CONSTRAINT(c, n) \ | 163 | #define FIXED_EVENT_CONSTRAINT(c, n) \ |
121 | EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) | 164 | EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) |
165 | |||
166 | /* | ||
167 | * Constraint on the Event code + UMask | ||
168 | */ | ||
169 | #define PEBS_EVENT_CONSTRAINT(c, n) \ | ||
170 | EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) | ||
122 | 171 | ||
123 | #define EVENT_CONSTRAINT_END \ | 172 | #define EVENT_CONSTRAINT_END \ |
124 | EVENT_CONSTRAINT(0, 0, 0) | 173 | EVENT_CONSTRAINT(0, 0, 0) |
125 | 174 | ||
126 | #define for_each_event_constraint(e, c) \ | 175 | #define for_each_event_constraint(e, c) \ |
127 | for ((e) = (c); (e)->cmask; (e)++) | 176 | for ((e) = (c); (e)->weight; (e)++) |
177 | |||
178 | union perf_capabilities { | ||
179 | struct { | ||
180 | u64 lbr_format : 6; | ||
181 | u64 pebs_trap : 1; | ||
182 | u64 pebs_arch_reg : 1; | ||
183 | u64 pebs_format : 4; | ||
184 | u64 smm_freeze : 1; | ||
185 | }; | ||
186 | u64 capabilities; | ||
187 | }; | ||
128 | 188 | ||
129 | /* | 189 | /* |
130 | * struct x86_pmu - generic x86 pmu | 190 | * struct x86_pmu - generic x86 pmu |
131 | */ | 191 | */ |
132 | struct x86_pmu { | 192 | struct x86_pmu { |
193 | /* | ||
194 | * Generic x86 PMC bits | ||
195 | */ | ||
133 | const char *name; | 196 | const char *name; |
134 | int version; | 197 | int version; |
135 | int (*handle_irq)(struct pt_regs *); | 198 | int (*handle_irq)(struct pt_regs *); |
136 | void (*disable_all)(void); | 199 | void (*disable_all)(void); |
137 | void (*enable_all)(void); | 200 | void (*enable_all)(int added); |
138 | void (*enable)(struct perf_event *); | 201 | void (*enable)(struct perf_event *); |
139 | void (*disable)(struct perf_event *); | 202 | void (*disable)(struct perf_event *); |
203 | int (*hw_config)(struct perf_event *event); | ||
204 | int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); | ||
140 | unsigned eventsel; | 205 | unsigned eventsel; |
141 | unsigned perfctr; | 206 | unsigned perfctr; |
142 | u64 (*event_map)(int); | 207 | u64 (*event_map)(int); |
143 | u64 (*raw_event)(u64); | ||
144 | int max_events; | 208 | int max_events; |
145 | int num_events; | 209 | int num_counters; |
146 | int num_events_fixed; | 210 | int num_counters_fixed; |
147 | int event_bits; | 211 | int cntval_bits; |
148 | u64 event_mask; | 212 | u64 cntval_mask; |
149 | int apic; | 213 | int apic; |
150 | u64 max_period; | 214 | u64 max_period; |
151 | u64 intel_ctrl; | ||
152 | void (*enable_bts)(u64 config); | ||
153 | void (*disable_bts)(void); | ||
154 | |||
155 | struct event_constraint * | 215 | struct event_constraint * |
156 | (*get_event_constraints)(struct cpu_hw_events *cpuc, | 216 | (*get_event_constraints)(struct cpu_hw_events *cpuc, |
157 | struct perf_event *event); | 217 | struct perf_event *event); |
@@ -159,11 +219,33 @@ struct x86_pmu { | |||
159 | void (*put_event_constraints)(struct cpu_hw_events *cpuc, | 219 | void (*put_event_constraints)(struct cpu_hw_events *cpuc, |
160 | struct perf_event *event); | 220 | struct perf_event *event); |
161 | struct event_constraint *event_constraints; | 221 | struct event_constraint *event_constraints; |
222 | void (*quirks)(void); | ||
223 | int perfctr_second_write; | ||
162 | 224 | ||
163 | int (*cpu_prepare)(int cpu); | 225 | int (*cpu_prepare)(int cpu); |
164 | void (*cpu_starting)(int cpu); | 226 | void (*cpu_starting)(int cpu); |
165 | void (*cpu_dying)(int cpu); | 227 | void (*cpu_dying)(int cpu); |
166 | void (*cpu_dead)(int cpu); | 228 | void (*cpu_dead)(int cpu); |
229 | |||
230 | /* | ||
231 | * Intel Arch Perfmon v2+ | ||
232 | */ | ||
233 | u64 intel_ctrl; | ||
234 | union perf_capabilities intel_cap; | ||
235 | |||
236 | /* | ||
237 | * Intel DebugStore bits | ||
238 | */ | ||
239 | int bts, pebs; | ||
240 | int pebs_record_size; | ||
241 | void (*drain_pebs)(struct pt_regs *regs); | ||
242 | struct event_constraint *pebs_constraints; | ||
243 | |||
244 | /* | ||
245 | * Intel LBR | ||
246 | */ | ||
247 | unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ | ||
248 | int lbr_nr; /* hardware stack size */ | ||
167 | }; | 249 | }; |
168 | 250 | ||
169 | static struct x86_pmu x86_pmu __read_mostly; | 251 | static struct x86_pmu x86_pmu __read_mostly; |
@@ -198,7 +280,7 @@ static u64 | |||
198 | x86_perf_event_update(struct perf_event *event) | 280 | x86_perf_event_update(struct perf_event *event) |
199 | { | 281 | { |
200 | struct hw_perf_event *hwc = &event->hw; | 282 | struct hw_perf_event *hwc = &event->hw; |
201 | int shift = 64 - x86_pmu.event_bits; | 283 | int shift = 64 - x86_pmu.cntval_bits; |
202 | u64 prev_raw_count, new_raw_count; | 284 | u64 prev_raw_count, new_raw_count; |
203 | int idx = hwc->idx; | 285 | int idx = hwc->idx; |
204 | s64 delta; | 286 | s64 delta; |
@@ -214,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) | |||
214 | * count to the generic event atomically: | 296 | * count to the generic event atomically: |
215 | */ | 297 | */ |
216 | again: | 298 | again: |
217 | prev_raw_count = atomic64_read(&hwc->prev_count); | 299 | prev_raw_count = local64_read(&hwc->prev_count); |
218 | rdmsrl(hwc->event_base + idx, new_raw_count); | 300 | rdmsrl(hwc->event_base + idx, new_raw_count); |
219 | 301 | ||
220 | if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, | 302 | if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, |
221 | new_raw_count) != prev_raw_count) | 303 | new_raw_count) != prev_raw_count) |
222 | goto again; | 304 | goto again; |
223 | 305 | ||
@@ -232,8 +314,8 @@ again: | |||
232 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | 314 | delta = (new_raw_count << shift) - (prev_raw_count << shift); |
233 | delta >>= shift; | 315 | delta >>= shift; |
234 | 316 | ||
235 | atomic64_add(delta, &event->count); | 317 | local64_add(delta, &event->count); |
236 | atomic64_sub(delta, &hwc->period_left); | 318 | local64_sub(delta, &hwc->period_left); |
237 | 319 | ||
238 | return new_raw_count; | 320 | return new_raw_count; |
239 | } | 321 | } |
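The shift dance in x86_perf_event_update() is what makes counter wrap-around safe: with 48-bit counters, shift = 64 - cntval_bits = 16, so shifting both raw values up discards stale high bits and the arithmetic right shift sign-extends the difference. A worked standalone version:

#include <stdint.h>

static int64_t counter_delta(uint64_t prev, uint64_t now, int bits)
{
        int shift = 64 - bits;

        /* e.g. bits = 48, prev = 0xffffffffffff, now = 0x5:
         * the counter wrapped, yet the result is the correct +6 */
        return ((int64_t)(now << shift) - (int64_t)(prev << shift)) >> shift;
}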
@@ -241,33 +323,32 @@ again: | |||
241 | static atomic_t active_events; | 323 | static atomic_t active_events; |
242 | static DEFINE_MUTEX(pmc_reserve_mutex); | 324 | static DEFINE_MUTEX(pmc_reserve_mutex); |
243 | 325 | ||
326 | #ifdef CONFIG_X86_LOCAL_APIC | ||
327 | |||
244 | static bool reserve_pmc_hardware(void) | 328 | static bool reserve_pmc_hardware(void) |
245 | { | 329 | { |
246 | #ifdef CONFIG_X86_LOCAL_APIC | ||
247 | int i; | 330 | int i; |
248 | 331 | ||
249 | if (nmi_watchdog == NMI_LOCAL_APIC) | 332 | if (nmi_watchdog == NMI_LOCAL_APIC) |
250 | disable_lapic_nmi_watchdog(); | 333 | disable_lapic_nmi_watchdog(); |
251 | 334 | ||
252 | for (i = 0; i < x86_pmu.num_events; i++) { | 335 | for (i = 0; i < x86_pmu.num_counters; i++) { |
253 | if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) | 336 | if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) |
254 | goto perfctr_fail; | 337 | goto perfctr_fail; |
255 | } | 338 | } |
256 | 339 | ||
257 | for (i = 0; i < x86_pmu.num_events; i++) { | 340 | for (i = 0; i < x86_pmu.num_counters; i++) { |
258 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) | 341 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) |
259 | goto eventsel_fail; | 342 | goto eventsel_fail; |
260 | } | 343 | } |
261 | #endif | ||
262 | 344 | ||
263 | return true; | 345 | return true; |
264 | 346 | ||
265 | #ifdef CONFIG_X86_LOCAL_APIC | ||
266 | eventsel_fail: | 347 | eventsel_fail: |
267 | for (i--; i >= 0; i--) | 348 | for (i--; i >= 0; i--) |
268 | release_evntsel_nmi(x86_pmu.eventsel + i); | 349 | release_evntsel_nmi(x86_pmu.eventsel + i); |
269 | 350 | ||
270 | i = x86_pmu.num_events; | 351 | i = x86_pmu.num_counters; |
271 | 352 | ||
272 | perfctr_fail: | 353 | perfctr_fail: |
273 | for (i--; i >= 0; i--) | 354 | for (i--; i >= 0; i--) |
@@ -277,128 +358,36 @@ perfctr_fail: | |||
277 | enable_lapic_nmi_watchdog(); | 358 | enable_lapic_nmi_watchdog(); |
278 | 359 | ||
279 | return false; | 360 | return false; |
280 | #endif | ||
281 | } | 361 | } |
282 | 362 | ||
283 | static void release_pmc_hardware(void) | 363 | static void release_pmc_hardware(void) |
284 | { | 364 | { |
285 | #ifdef CONFIG_X86_LOCAL_APIC | ||
286 | int i; | 365 | int i; |
287 | 366 | ||
288 | for (i = 0; i < x86_pmu.num_events; i++) { | 367 | for (i = 0; i < x86_pmu.num_counters; i++) { |
289 | release_perfctr_nmi(x86_pmu.perfctr + i); | 368 | release_perfctr_nmi(x86_pmu.perfctr + i); |
290 | release_evntsel_nmi(x86_pmu.eventsel + i); | 369 | release_evntsel_nmi(x86_pmu.eventsel + i); |
291 | } | 370 | } |
292 | 371 | ||
293 | if (nmi_watchdog == NMI_LOCAL_APIC) | 372 | if (nmi_watchdog == NMI_LOCAL_APIC) |
294 | enable_lapic_nmi_watchdog(); | 373 | enable_lapic_nmi_watchdog(); |
295 | #endif | ||
296 | } | ||
297 | |||
298 | static inline bool bts_available(void) | ||
299 | { | ||
300 | return x86_pmu.enable_bts != NULL; | ||
301 | } | ||
302 | |||
303 | static void init_debug_store_on_cpu(int cpu) | ||
304 | { | ||
305 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
306 | |||
307 | if (!ds) | ||
308 | return; | ||
309 | |||
310 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, | ||
311 | (u32)((u64)(unsigned long)ds), | ||
312 | (u32)((u64)(unsigned long)ds >> 32)); | ||
313 | } | ||
314 | |||
315 | static void fini_debug_store_on_cpu(int cpu) | ||
316 | { | ||
317 | if (!per_cpu(cpu_hw_events, cpu).ds) | ||
318 | return; | ||
319 | |||
320 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); | ||
321 | } | ||
322 | |||
323 | static void release_bts_hardware(void) | ||
324 | { | ||
325 | int cpu; | ||
326 | |||
327 | if (!bts_available()) | ||
328 | return; | ||
329 | |||
330 | get_online_cpus(); | ||
331 | |||
332 | for_each_online_cpu(cpu) | ||
333 | fini_debug_store_on_cpu(cpu); | ||
334 | |||
335 | for_each_possible_cpu(cpu) { | ||
336 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
337 | |||
338 | if (!ds) | ||
339 | continue; | ||
340 | |||
341 | per_cpu(cpu_hw_events, cpu).ds = NULL; | ||
342 | |||
343 | kfree((void *)(unsigned long)ds->bts_buffer_base); | ||
344 | kfree(ds); | ||
345 | } | ||
346 | |||
347 | put_online_cpus(); | ||
348 | } | 374 | } |
349 | 375 | ||
350 | static int reserve_bts_hardware(void) | 376 | #else |
351 | { | ||
352 | int cpu, err = 0; | ||
353 | |||
354 | if (!bts_available()) | ||
355 | return 0; | ||
356 | |||
357 | get_online_cpus(); | ||
358 | |||
359 | for_each_possible_cpu(cpu) { | ||
360 | struct debug_store *ds; | ||
361 | void *buffer; | ||
362 | |||
363 | err = -ENOMEM; | ||
364 | buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); | ||
365 | if (unlikely(!buffer)) | ||
366 | break; | ||
367 | |||
368 | ds = kzalloc(sizeof(*ds), GFP_KERNEL); | ||
369 | if (unlikely(!ds)) { | ||
370 | kfree(buffer); | ||
371 | break; | ||
372 | } | ||
373 | |||
374 | ds->bts_buffer_base = (u64)(unsigned long)buffer; | ||
375 | ds->bts_index = ds->bts_buffer_base; | ||
376 | ds->bts_absolute_maximum = | ||
377 | ds->bts_buffer_base + BTS_BUFFER_SIZE; | ||
378 | ds->bts_interrupt_threshold = | ||
379 | ds->bts_absolute_maximum - BTS_OVFL_TH; | ||
380 | 377 | ||
381 | per_cpu(cpu_hw_events, cpu).ds = ds; | 378 | static bool reserve_pmc_hardware(void) { return true; } |
382 | err = 0; | 379 | static void release_pmc_hardware(void) {} |
383 | } | ||
384 | 380 | ||
385 | if (err) | 381 | #endif |
386 | release_bts_hardware(); | ||
387 | else { | ||
388 | for_each_online_cpu(cpu) | ||
389 | init_debug_store_on_cpu(cpu); | ||
390 | } | ||
391 | |||
392 | put_online_cpus(); | ||
393 | 382 | ||
394 | return err; | 383 | static int reserve_ds_buffers(void); |
395 | } | 384 | static void release_ds_buffers(void); |
396 | 385 | ||
397 | static void hw_perf_event_destroy(struct perf_event *event) | 386 | static void hw_perf_event_destroy(struct perf_event *event) |
398 | { | 387 | { |
399 | if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { | 388 | if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { |
400 | release_pmc_hardware(); | 389 | release_pmc_hardware(); |
401 | release_bts_hardware(); | 390 | release_ds_buffers(); |
402 | mutex_unlock(&pmc_reserve_mutex); | 391 | mutex_unlock(&pmc_reserve_mutex); |
403 | } | 392 | } |
404 | } | 393 | } |
@@ -441,59 +430,16 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) | |||
441 | return 0; | 430 | return 0; |
442 | } | 431 | } |
443 | 432 | ||
444 | /* | 433 | static int x86_setup_perfctr(struct perf_event *event) |
445 | * Setup the hardware configuration for a given attr_type | ||
446 | */ | ||
447 | static int __hw_perf_event_init(struct perf_event *event) | ||
448 | { | 434 | { |
449 | struct perf_event_attr *attr = &event->attr; | 435 | struct perf_event_attr *attr = &event->attr; |
450 | struct hw_perf_event *hwc = &event->hw; | 436 | struct hw_perf_event *hwc = &event->hw; |
451 | u64 config; | 437 | u64 config; |
452 | int err; | ||
453 | |||
454 | if (!x86_pmu_initialized()) | ||
455 | return -ENODEV; | ||
456 | |||
457 | err = 0; | ||
458 | if (!atomic_inc_not_zero(&active_events)) { | ||
459 | mutex_lock(&pmc_reserve_mutex); | ||
460 | if (atomic_read(&active_events) == 0) { | ||
461 | if (!reserve_pmc_hardware()) | ||
462 | err = -EBUSY; | ||
463 | else | ||
464 | err = reserve_bts_hardware(); | ||
465 | } | ||
466 | if (!err) | ||
467 | atomic_inc(&active_events); | ||
468 | mutex_unlock(&pmc_reserve_mutex); | ||
469 | } | ||
470 | if (err) | ||
471 | return err; | ||
472 | |||
473 | event->destroy = hw_perf_event_destroy; | ||
474 | |||
475 | /* | ||
476 | * Generate PMC IRQs: | ||
477 | * (keep 'enabled' bit clear for now) | ||
478 | */ | ||
479 | hwc->config = ARCH_PERFMON_EVENTSEL_INT; | ||
480 | |||
481 | hwc->idx = -1; | ||
482 | hwc->last_cpu = -1; | ||
483 | hwc->last_tag = ~0ULL; | ||
484 | |||
485 | /* | ||
486 | * Count user and OS events unless requested not to. | ||
487 | */ | ||
488 | if (!attr->exclude_user) | ||
489 | hwc->config |= ARCH_PERFMON_EVENTSEL_USR; | ||
490 | if (!attr->exclude_kernel) | ||
491 | hwc->config |= ARCH_PERFMON_EVENTSEL_OS; | ||
492 | 438 | ||
493 | if (!hwc->sample_period) { | 439 | if (!hwc->sample_period) { |
494 | hwc->sample_period = x86_pmu.max_period; | 440 | hwc->sample_period = x86_pmu.max_period; |
495 | hwc->last_period = hwc->sample_period; | 441 | hwc->last_period = hwc->sample_period; |
496 | atomic64_set(&hwc->period_left, hwc->sample_period); | 442 | local64_set(&hwc->period_left, hwc->sample_period); |
497 | } else { | 443 | } else { |
498 | /* | 444 | /* |
499 | * If we have a PMU initialized but no APIC | 445 | * If we have a PMU initialized but no APIC |
@@ -505,16 +451,8 @@ static int __hw_perf_event_init(struct perf_event *event) | |||
505 | return -EOPNOTSUPP; | 451 | return -EOPNOTSUPP; |
506 | } | 452 | } |
507 | 453 | ||
508 | /* | 454 | if (attr->type == PERF_TYPE_RAW) |
509 | * Raw hw_event type provide the config in the hw_event structure | ||
510 | */ | ||
511 | if (attr->type == PERF_TYPE_RAW) { | ||
512 | hwc->config |= x86_pmu.raw_event(attr->config); | ||
513 | if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && | ||
514 | perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
515 | return -EACCES; | ||
516 | return 0; | 455 | return 0; |
517 | } | ||
518 | 456 | ||
519 | if (attr->type == PERF_TYPE_HW_CACHE) | 457 | if (attr->type == PERF_TYPE_HW_CACHE) |
520 | return set_ext_hw_attr(hwc, attr); | 458 | return set_ext_hw_attr(hwc, attr); |
@@ -539,11 +477,11 @@ static int __hw_perf_event_init(struct perf_event *event) | |||
539 | if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && | 477 | if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && |
540 | (hwc->sample_period == 1)) { | 478 | (hwc->sample_period == 1)) { |
541 | /* BTS is not supported by this architecture. */ | 479 | /* BTS is not supported by this architecture. */ |
542 | if (!bts_available()) | 480 | if (!x86_pmu.bts) |
543 | return -EOPNOTSUPP; | 481 | return -EOPNOTSUPP; |
544 | 482 | ||
545 | /* BTS is currently only allowed for user-mode. */ | 483 | /* BTS is currently only allowed for user-mode. */ |
546 | if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) | 484 | if (!attr->exclude_kernel) |
547 | return -EOPNOTSUPP; | 485 | return -EOPNOTSUPP; |
548 | } | 486 | } |
549 | 487 | ||
@@ -552,12 +490,87 @@ static int __hw_perf_event_init(struct perf_event *event) | |||
552 | return 0; | 490 | return 0; |
553 | } | 491 | } |
554 | 492 | ||
493 | static int x86_pmu_hw_config(struct perf_event *event) | ||
494 | { | ||
495 | if (event->attr.precise_ip) { | ||
496 | int precise = 0; | ||
497 | |||
498 | /* Support for constant skid */ | ||
499 | if (x86_pmu.pebs) | ||
500 | precise++; | ||
501 | |||
502 | /* Support for IP fixup */ | ||
503 | if (x86_pmu.lbr_nr) | ||
504 | precise++; | ||
505 | |||
506 | if (event->attr.precise_ip > precise) | ||
507 | return -EOPNOTSUPP; | ||
508 | } | ||
509 | |||
510 | /* | ||
511 | * Generate PMC IRQs: | ||
512 | * (keep 'enabled' bit clear for now) | ||
513 | */ | ||
514 | event->hw.config = ARCH_PERFMON_EVENTSEL_INT; | ||
515 | |||
516 | /* | ||
517 | * Count user and OS events unless requested not to | ||
518 | */ | ||
519 | if (!event->attr.exclude_user) | ||
520 | event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; | ||
521 | if (!event->attr.exclude_kernel) | ||
522 | event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; | ||
523 | |||
524 | if (event->attr.type == PERF_TYPE_RAW) | ||
525 | event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; | ||
526 | |||
527 | return x86_setup_perfctr(event); | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Setup the hardware configuration for a given attr_type | ||
532 | */ | ||
533 | static int __hw_perf_event_init(struct perf_event *event) | ||
534 | { | ||
535 | int err; | ||
536 | |||
537 | if (!x86_pmu_initialized()) | ||
538 | return -ENODEV; | ||
539 | |||
540 | err = 0; | ||
541 | if (!atomic_inc_not_zero(&active_events)) { | ||
542 | mutex_lock(&pmc_reserve_mutex); | ||
543 | if (atomic_read(&active_events) == 0) { | ||
544 | if (!reserve_pmc_hardware()) | ||
545 | err = -EBUSY; | ||
546 | else { | ||
547 | err = reserve_ds_buffers(); | ||
548 | if (err) | ||
549 | release_pmc_hardware(); | ||
550 | } | ||
551 | } | ||
552 | if (!err) | ||
553 | atomic_inc(&active_events); | ||
554 | mutex_unlock(&pmc_reserve_mutex); | ||
555 | } | ||
556 | if (err) | ||
557 | return err; | ||
558 | |||
559 | event->destroy = hw_perf_event_destroy; | ||
560 | |||
561 | event->hw.idx = -1; | ||
562 | event->hw.last_cpu = -1; | ||
563 | event->hw.last_tag = ~0ULL; | ||
564 | |||
565 | return x86_pmu.hw_config(event); | ||
566 | } | ||
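The atomic_inc_not_zero()/mutex dance in __hw_perf_event_init() is the usual reserve-on-first-use pattern: the fast path only bumps a live refcount, and the mutex serializes the 0 -> 1 transition so the hardware is reserved exactly once. A generic sketch of the same pattern, with do_reserve() as a hypothetical stand-in for reserve_pmc_hardware() and friends:

	static atomic_t users;
	static DEFINE_MUTEX(reserve_mutex);

	static int get_resource(void)
	{
		int err = 0;

		if (atomic_inc_not_zero(&users))	/* fast path: already reserved */
			return 0;

		mutex_lock(&reserve_mutex);
		if (atomic_read(&users) == 0)
			err = do_reserve();		/* hypothetical helper */
		if (!err)
			atomic_inc(&users);
		mutex_unlock(&reserve_mutex);

		return err;
	}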
567 | |||
555 | static void x86_pmu_disable_all(void) | 568 | static void x86_pmu_disable_all(void) |
556 | { | 569 | { |
557 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 570 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
558 | int idx; | 571 | int idx; |
559 | 572 | ||
560 | for (idx = 0; idx < x86_pmu.num_events; idx++) { | 573 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
561 | u64 val; | 574 | u64 val; |
562 | 575 | ||
563 | if (!test_bit(idx, cpuc->active_mask)) | 576 | if (!test_bit(idx, cpuc->active_mask)) |
@@ -587,12 +600,12 @@ void hw_perf_disable(void) | |||
587 | x86_pmu.disable_all(); | 600 | x86_pmu.disable_all(); |
588 | } | 601 | } |
589 | 602 | ||
590 | static void x86_pmu_enable_all(void) | 603 | static void x86_pmu_enable_all(int added) |
591 | { | 604 | { |
592 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 605 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
593 | int idx; | 606 | int idx; |
594 | 607 | ||
595 | for (idx = 0; idx < x86_pmu.num_events; idx++) { | 608 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
596 | struct perf_event *event = cpuc->events[idx]; | 609 | struct perf_event *event = cpuc->events[idx]; |
597 | u64 val; | 610 | u64 val; |
598 | 611 | ||
@@ -667,14 +680,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) | |||
667 | * assign events to counters starting with most | 680 | * assign events to counters starting with most |
668 | * constrained events. | 681 | * constrained events. |
669 | */ | 682 | */ |
670 | wmax = x86_pmu.num_events; | 683 | wmax = x86_pmu.num_counters; |
671 | 684 | ||
672 | /* | 685 | /* |
673 | * when fixed event counters are present, | 686 | * when fixed event counters are present, |
674 | * wmax is incremented by 1 to account | 687 | * wmax is incremented by 1 to account |
675 | * for one more choice | 688 | * for one more choice |
676 | */ | 689 | */ |
677 | if (x86_pmu.num_events_fixed) | 690 | if (x86_pmu.num_counters_fixed) |
678 | wmax++; | 691 | wmax++; |
679 | 692 | ||
680 | for (w = 1, num = n; num && w <= wmax; w++) { | 693 | for (w = 1, num = n; num && w <= wmax; w++) { |
@@ -724,7 +737,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, | |||
724 | struct perf_event *event; | 737 | struct perf_event *event; |
725 | int n, max_count; | 738 | int n, max_count; |
726 | 739 | ||
727 | max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; | 740 | max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; |
728 | 741 | ||
729 | /* current number of events already accepted */ | 742 | /* current number of events already accepted */ |
730 | n = cpuc->n_events; | 743 | n = cpuc->n_events; |
@@ -795,7 +808,7 @@ void hw_perf_enable(void) | |||
795 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 808 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
796 | struct perf_event *event; | 809 | struct perf_event *event; |
797 | struct hw_perf_event *hwc; | 810 | struct hw_perf_event *hwc; |
798 | int i; | 811 | int i, added = cpuc->n_added; |
799 | 812 | ||
800 | if (!x86_pmu_initialized()) | 813 | if (!x86_pmu_initialized()) |
801 | return; | 814 | return; |
@@ -847,19 +860,20 @@ void hw_perf_enable(void) | |||
847 | cpuc->enabled = 1; | 860 | cpuc->enabled = 1; |
848 | barrier(); | 861 | barrier(); |
849 | 862 | ||
850 | x86_pmu.enable_all(); | 863 | x86_pmu.enable_all(added); |
851 | } | 864 | } |
852 | 865 | ||
853 | static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) | 866 | static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, |
867 | u64 enable_mask) | ||
854 | { | 868 | { |
855 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, | 869 | wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); |
856 | hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); | ||
857 | } | 870 | } |
858 | 871 | ||
859 | static inline void x86_pmu_disable_event(struct perf_event *event) | 872 | static inline void x86_pmu_disable_event(struct perf_event *event) |
860 | { | 873 | { |
861 | struct hw_perf_event *hwc = &event->hw; | 874 | struct hw_perf_event *hwc = &event->hw; |
862 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); | 875 | |
876 | wrmsrl(hwc->config_base + hwc->idx, hwc->config); | ||
863 | } | 877 | } |
864 | 878 | ||
865 | static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); | 879 | static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); |
@@ -872,9 +886,9 @@ static int | |||
872 | x86_perf_event_set_period(struct perf_event *event) | 886 | x86_perf_event_set_period(struct perf_event *event) |
873 | { | 887 | { |
874 | struct hw_perf_event *hwc = &event->hw; | 888 | struct hw_perf_event *hwc = &event->hw; |
875 | s64 left = atomic64_read(&hwc->period_left); | 889 | s64 left = local64_read(&hwc->period_left); |
876 | s64 period = hwc->sample_period; | 890 | s64 period = hwc->sample_period; |
877 | int err, ret = 0, idx = hwc->idx; | 891 | int ret = 0, idx = hwc->idx; |
878 | 892 | ||
879 | if (idx == X86_PMC_IDX_FIXED_BTS) | 893 | if (idx == X86_PMC_IDX_FIXED_BTS) |
880 | return 0; | 894 | return 0; |
@@ -884,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) | |||
884 | */ | 898 | */ |
885 | if (unlikely(left <= -period)) { | 899 | if (unlikely(left <= -period)) { |
886 | left = period; | 900 | left = period; |
887 | atomic64_set(&hwc->period_left, left); | 901 | local64_set(&hwc->period_left, left); |
888 | hwc->last_period = period; | 902 | hwc->last_period = period; |
889 | ret = 1; | 903 | ret = 1; |
890 | } | 904 | } |
891 | 905 | ||
892 | if (unlikely(left <= 0)) { | 906 | if (unlikely(left <= 0)) { |
893 | left += period; | 907 | left += period; |
894 | atomic64_set(&hwc->period_left, left); | 908 | local64_set(&hwc->period_left, left); |
895 | hwc->last_period = period; | 909 | hwc->last_period = period; |
896 | ret = 1; | 910 | ret = 1; |
897 | } | 911 | } |
@@ -910,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event) | |||
910 | * The hw event starts counting from this event offset, | 924 | * The hw event starts counting from this event offset, |
911 | * mark it to be able to extract future deltas: | 925 |
912 | */ | 926 | */ |
913 | atomic64_set(&hwc->prev_count, (u64)-left); | 927 | local64_set(&hwc->prev_count, (u64)-left); |
928 | |||
929 | wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); | ||
914 | 930 | ||
915 | err = checking_wrmsrl(hwc->event_base + idx, | 931 | /* |
916 | (u64)(-left) & x86_pmu.event_mask); | 932 | * Due to erratum on certan cpu we need |
933 | * a second write to be sure the register | ||
934 | * is updated properly | ||
935 | */ | ||
936 | if (x86_pmu.perfctr_second_write) { | ||
937 | wrmsrl(hwc->event_base + idx, | ||
938 | (u64)(-left) & x86_pmu.cntval_mask); | ||
939 | } | ||
917 | 940 | ||
918 | perf_event_update_userpage(event); | 941 | perf_event_update_userpage(event); |
919 | 942 | ||
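A worked example of the (-left) & cntval_mask programming, assuming a 48-bit counter (cntval_mask = (1ULL << 48) - 1) and left = 100000 events until the next interrupt:

	(u64)(-left)                = 0xFFFFFFFFFFFE7960
	(u64)(-left) & cntval_mask  = 0x0000FFFFFFFE7960

The counter increments from that value and overflows, raising the PMI, after exactly 100000 events; the second write only matters on the errata-affected parts flagged by perfctr_second_write.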
@@ -924,7 +947,8 @@ static void x86_pmu_enable_event(struct perf_event *event) | |||
924 | { | 947 | { |
925 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 948 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
926 | if (cpuc->enabled) | 949 | if (cpuc->enabled) |
927 | __x86_pmu_enable_event(&event->hw); | 950 | __x86_pmu_enable_event(&event->hw, |
951 | ARCH_PERFMON_EVENTSEL_ENABLE); | ||
928 | } | 952 | } |
929 | 953 | ||
930 | /* | 954 | /* |
@@ -950,7 +974,15 @@ static int x86_pmu_enable(struct perf_event *event) | |||
950 | if (n < 0) | 974 | if (n < 0) |
951 | return n; | 975 | return n; |
952 | 976 | ||
953 | ret = x86_schedule_events(cpuc, n, assign); | 977 | /* |
978 | * If a group event scheduling transaction was started, | ||
979 | * skip the schedulability test here; it will be performed | ||
980 | * at commit time (->commit_txn) as a whole | ||
981 | */ | ||
982 | if (cpuc->group_flag & PERF_EVENT_TXN) | ||
983 | goto out; | ||
984 | |||
985 | ret = x86_pmu.schedule_events(cpuc, n, assign); | ||
954 | if (ret) | 986 | if (ret) |
955 | return ret; | 987 | return ret; |
956 | /* | 988 | /* |
@@ -959,8 +991,10 @@ static int x86_pmu_enable(struct perf_event *event) | |||
959 | */ | 991 | */ |
960 | memcpy(cpuc->assign, assign, n*sizeof(int)); | 992 | memcpy(cpuc->assign, assign, n*sizeof(int)); |
961 | 993 | ||
994 | out: | ||
962 | cpuc->n_events = n; | 995 | cpuc->n_events = n; |
963 | cpuc->n_added += n - n0; | 996 | cpuc->n_added += n - n0; |
997 | cpuc->n_txn += n - n0; | ||
964 | 998 | ||
965 | return 0; | 999 | return 0; |
966 | } | 1000 | } |
@@ -991,11 +1025,12 @@ static void x86_pmu_unthrottle(struct perf_event *event) | |||
991 | void perf_event_print_debug(void) | 1025 | void perf_event_print_debug(void) |
992 | { | 1026 | { |
993 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; | 1027 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; |
1028 | u64 pebs; | ||
994 | struct cpu_hw_events *cpuc; | 1029 | struct cpu_hw_events *cpuc; |
995 | unsigned long flags; | 1030 | unsigned long flags; |
996 | int cpu, idx; | 1031 | int cpu, idx; |
997 | 1032 | ||
998 | if (!x86_pmu.num_events) | 1033 | if (!x86_pmu.num_counters) |
999 | return; | 1034 | return; |
1000 | 1035 | ||
1001 | local_irq_save(flags); | 1036 | local_irq_save(flags); |
@@ -1008,16 +1043,18 @@ void perf_event_print_debug(void) | |||
1008 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); | 1043 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); |
1009 | rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); | 1044 | rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); |
1010 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); | 1045 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); |
1046 | rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); | ||
1011 | 1047 | ||
1012 | pr_info("\n"); | 1048 | pr_info("\n"); |
1013 | pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); | 1049 | pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); |
1014 | pr_info("CPU#%d: status: %016llx\n", cpu, status); | 1050 | pr_info("CPU#%d: status: %016llx\n", cpu, status); |
1015 | pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); | 1051 | pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); |
1016 | pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); | 1052 | pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); |
1053 | pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); | ||
1017 | } | 1054 | } |
1018 | pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); | 1055 | pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); |
1019 | 1056 | ||
1020 | for (idx = 0; idx < x86_pmu.num_events; idx++) { | 1057 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
1021 | rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); | 1058 | rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); |
1022 | rdmsrl(x86_pmu.perfctr + idx, pmc_count); | 1059 | rdmsrl(x86_pmu.perfctr + idx, pmc_count); |
1023 | 1060 | ||
@@ -1030,7 +1067,7 @@ void perf_event_print_debug(void) | |||
1030 | pr_info("CPU#%d: gen-PMC%d left: %016llx\n", | 1067 | pr_info("CPU#%d: gen-PMC%d left: %016llx\n", |
1031 | cpu, idx, prev_left); | 1068 | cpu, idx, prev_left); |
1032 | } | 1069 | } |
1033 | for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { | 1070 | for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { |
1034 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); | 1071 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); |
1035 | 1072 | ||
1036 | pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", | 1073 | pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", |
@@ -1064,6 +1101,14 @@ static void x86_pmu_disable(struct perf_event *event) | |||
1064 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1101 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1065 | int i; | 1102 | int i; |
1066 | 1103 | ||
1104 | /* | ||
1105 | * If we're called during a txn, we don't need to do anything. | ||
1106 | * The events never got scheduled and ->cancel_txn will truncate | ||
1107 | * the event_list. | ||
1108 | */ | ||
1109 | if (cpuc->group_flag & PERF_EVENT_TXN) | ||
1110 | return; | ||
1111 | |||
1067 | x86_pmu_stop(event); | 1112 | x86_pmu_stop(event); |
1068 | 1113 | ||
1069 | for (i = 0; i < cpuc->n_events; i++) { | 1114 | for (i = 0; i < cpuc->n_events; i++) { |
@@ -1095,7 +1140,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1095 | 1140 | ||
1096 | cpuc = &__get_cpu_var(cpu_hw_events); | 1141 | cpuc = &__get_cpu_var(cpu_hw_events); |
1097 | 1142 | ||
1098 | for (idx = 0; idx < x86_pmu.num_events; idx++) { | 1143 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
1099 | if (!test_bit(idx, cpuc->active_mask)) | 1144 | if (!test_bit(idx, cpuc->active_mask)) |
1100 | continue; | 1145 | continue; |
1101 | 1146 | ||
@@ -1103,7 +1148,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1103 | hwc = &event->hw; | 1148 | hwc = &event->hw; |
1104 | 1149 | ||
1105 | val = x86_perf_event_update(event); | 1150 | val = x86_perf_event_update(event); |
1106 | if (val & (1ULL << (x86_pmu.event_bits - 1))) | 1151 | if (val & (1ULL << (x86_pmu.cntval_bits - 1))) |
1107 | continue; | 1152 | continue; |
1108 | 1153 | ||
1109 | /* | 1154 | /* |
@@ -1146,7 +1191,6 @@ void set_perf_event_pending(void) | |||
1146 | 1191 | ||
1147 | void perf_events_lapic_init(void) | 1192 | void perf_events_lapic_init(void) |
1148 | { | 1193 | { |
1149 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1150 | if (!x86_pmu.apic || !x86_pmu_initialized()) | 1194 | if (!x86_pmu.apic || !x86_pmu_initialized()) |
1151 | return; | 1195 | return; |
1152 | 1196 | ||
@@ -1154,7 +1198,6 @@ void perf_events_lapic_init(void) | |||
1154 | * Always use NMI for PMU | 1198 | * Always use NMI for PMU |
1155 | */ | 1199 | */ |
1156 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1200 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1157 | #endif | ||
1158 | } | 1201 | } |
1159 | 1202 | ||
1160 | static int __kprobes | 1203 | static int __kprobes |
@@ -1178,9 +1221,7 @@ perf_event_nmi_handler(struct notifier_block *self, | |||
1178 | 1221 | ||
1179 | regs = args->regs; | 1222 | regs = args->regs; |
1180 | 1223 | ||
1181 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1182 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1224 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1183 | #endif | ||
1184 | /* | 1225 | /* |
1185 | * Can't rely on the handled return value to say it was our NMI, two | 1226 | * Can't rely on the handled return value to say it was our NMI, two |
1186 | * events could trigger 'simultaneously' raising two back-to-back NMIs. | 1227 | * events could trigger 'simultaneously' raising two back-to-back NMIs. |
@@ -1217,118 +1258,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | |||
1217 | return &unconstrained; | 1258 | return &unconstrained; |
1218 | } | 1259 | } |
1219 | 1260 | ||
1220 | static int x86_event_sched_in(struct perf_event *event, | ||
1221 | struct perf_cpu_context *cpuctx) | ||
1222 | { | ||
1223 | int ret = 0; | ||
1224 | |||
1225 | event->state = PERF_EVENT_STATE_ACTIVE; | ||
1226 | event->oncpu = smp_processor_id(); | ||
1227 | event->tstamp_running += event->ctx->time - event->tstamp_stopped; | ||
1228 | |||
1229 | if (!is_x86_event(event)) | ||
1230 | ret = event->pmu->enable(event); | ||
1231 | |||
1232 | if (!ret && !is_software_event(event)) | ||
1233 | cpuctx->active_oncpu++; | ||
1234 | |||
1235 | if (!ret && event->attr.exclusive) | ||
1236 | cpuctx->exclusive = 1; | ||
1237 | |||
1238 | return ret; | ||
1239 | } | ||
1240 | |||
1241 | static void x86_event_sched_out(struct perf_event *event, | ||
1242 | struct perf_cpu_context *cpuctx) | ||
1243 | { | ||
1244 | event->state = PERF_EVENT_STATE_INACTIVE; | ||
1245 | event->oncpu = -1; | ||
1246 | |||
1247 | if (!is_x86_event(event)) | ||
1248 | event->pmu->disable(event); | ||
1249 | |||
1250 | event->tstamp_running -= event->ctx->time - event->tstamp_stopped; | ||
1251 | |||
1252 | if (!is_software_event(event)) | ||
1253 | cpuctx->active_oncpu--; | ||
1254 | |||
1255 | if (event->attr.exclusive || !cpuctx->active_oncpu) | ||
1256 | cpuctx->exclusive = 0; | ||
1257 | } | ||
1258 | |||
1259 | /* | ||
1260 | * Called to enable a whole group of events. | ||
1261 | * Returns 1 if the group was enabled, or -EAGAIN if it could not be. | ||
1262 | * Assumes the caller has disabled interrupts and has | ||
1263 | * frozen the PMU with hw_perf_save_disable. | ||
1264 | * | ||
1265 | * Called with the PMU disabled. If successful (return value 1), | ||
1266 | * the caller is guaranteed to call perf_enable() and hw_perf_enable() | ||
1267 | */ | ||
1268 | int hw_perf_group_sched_in(struct perf_event *leader, | ||
1269 | struct perf_cpu_context *cpuctx, | ||
1270 | struct perf_event_context *ctx) | ||
1271 | { | ||
1272 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1273 | struct perf_event *sub; | ||
1274 | int assign[X86_PMC_IDX_MAX]; | ||
1275 | int n0, n1, ret; | ||
1276 | |||
1277 | /* n0 = total number of events */ | ||
1278 | n0 = collect_events(cpuc, leader, true); | ||
1279 | if (n0 < 0) | ||
1280 | return n0; | ||
1281 | |||
1282 | ret = x86_schedule_events(cpuc, n0, assign); | ||
1283 | if (ret) | ||
1284 | return ret; | ||
1285 | |||
1286 | ret = x86_event_sched_in(leader, cpuctx); | ||
1287 | if (ret) | ||
1288 | return ret; | ||
1289 | |||
1290 | n1 = 1; | ||
1291 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | ||
1292 | if (sub->state > PERF_EVENT_STATE_OFF) { | ||
1293 | ret = x86_event_sched_in(sub, cpuctx); | ||
1294 | if (ret) | ||
1295 | goto undo; | ||
1296 | ++n1; | ||
1297 | } | ||
1298 | } | ||
1299 | /* | ||
1300 | * copy the new assignment now that we know it is possible; | ||
1301 | * it will be used by hw_perf_enable() | ||
1302 | */ | ||
1303 | memcpy(cpuc->assign, assign, n0*sizeof(int)); | ||
1304 | |||
1305 | cpuc->n_events = n0; | ||
1306 | cpuc->n_added += n1; | ||
1307 | ctx->nr_active += n1; | ||
1308 | |||
1309 | /* | ||
1310 | * 1 means successful and events are active | ||
1311 | * This is not quite true because we defer | ||
1312 | * actual activation until hw_perf_enable(), but | ||
1313 | * this way we ensure the caller won't try to enable | ||
1314 | * individual events | ||
1315 | */ | ||
1316 | return 1; | ||
1317 | undo: | ||
1318 | x86_event_sched_out(leader, cpuctx); | ||
1319 | n0 = 1; | ||
1320 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | ||
1321 | if (sub->state == PERF_EVENT_STATE_ACTIVE) { | ||
1322 | x86_event_sched_out(sub, cpuctx); | ||
1323 | if (++n0 == n1) | ||
1324 | break; | ||
1325 | } | ||
1326 | } | ||
1327 | return ret; | ||
1328 | } | ||
1329 | |||
1330 | #include "perf_event_amd.c" | 1261 | #include "perf_event_amd.c" |
1331 | #include "perf_event_p6.c" | 1262 | #include "perf_event_p6.c" |
1263 | #include "perf_event_p4.c" | ||
1264 | #include "perf_event_intel_lbr.c" | ||
1265 | #include "perf_event_intel_ds.c" | ||
1332 | #include "perf_event_intel.c" | 1266 | #include "perf_event_intel.c" |
1333 | 1267 | ||
1334 | static int __cpuinit | 1268 | static int __cpuinit |
@@ -1402,48 +1336,50 @@ void __init init_hw_perf_events(void) | |||
1402 | 1336 | ||
1403 | pr_cont("%s PMU driver.\n", x86_pmu.name); | 1337 | pr_cont("%s PMU driver.\n", x86_pmu.name); |
1404 | 1338 | ||
1405 | if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { | 1339 | if (x86_pmu.quirks) |
1340 | x86_pmu.quirks(); | ||
1341 | |||
1342 | if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { | ||
1406 | WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", | 1343 | WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", |
1407 | x86_pmu.num_events, X86_PMC_MAX_GENERIC); | 1344 | x86_pmu.num_counters, X86_PMC_MAX_GENERIC); |
1408 | x86_pmu.num_events = X86_PMC_MAX_GENERIC; | 1345 | x86_pmu.num_counters = X86_PMC_MAX_GENERIC; |
1409 | } | 1346 | } |
1410 | perf_event_mask = (1 << x86_pmu.num_events) - 1; | 1347 | x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; |
1411 | perf_max_events = x86_pmu.num_events; | 1348 | perf_max_events = x86_pmu.num_counters; |
1412 | 1349 | ||
1413 | if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { | 1350 | if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { |
1414 | WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", | 1351 | WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", |
1415 | x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); | 1352 | x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); |
1416 | x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; | 1353 | x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; |
1417 | } | 1354 | } |
1418 | 1355 | ||
1419 | perf_event_mask |= | 1356 | x86_pmu.intel_ctrl |= |
1420 | ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; | 1357 | ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; |
1421 | x86_pmu.intel_ctrl = perf_event_mask; | ||
1422 | 1358 | ||
1423 | perf_events_lapic_init(); | 1359 | perf_events_lapic_init(); |
1424 | register_die_notifier(&perf_event_nmi_notifier); | 1360 | register_die_notifier(&perf_event_nmi_notifier); |
1425 | 1361 | ||
1426 | unconstrained = (struct event_constraint) | 1362 | unconstrained = (struct event_constraint) |
1427 | __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, | 1363 | __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, |
1428 | 0, x86_pmu.num_events); | 1364 | 0, x86_pmu.num_counters); |
1429 | 1365 | ||
1430 | if (x86_pmu.event_constraints) { | 1366 | if (x86_pmu.event_constraints) { |
1431 | for_each_event_constraint(c, x86_pmu.event_constraints) { | 1367 | for_each_event_constraint(c, x86_pmu.event_constraints) { |
1432 | if (c->cmask != INTEL_ARCH_FIXED_MASK) | 1368 | if (c->cmask != X86_RAW_EVENT_MASK) |
1433 | continue; | 1369 | continue; |
1434 | 1370 | ||
1435 | c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; | 1371 | c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; |
1436 | c->weight += x86_pmu.num_events; | 1372 | c->weight += x86_pmu.num_counters; |
1437 | } | 1373 | } |
1438 | } | 1374 | } |
1439 | 1375 | ||
1440 | pr_info("... version: %d\n", x86_pmu.version); | 1376 | pr_info("... version: %d\n", x86_pmu.version); |
1441 | pr_info("... bit width: %d\n", x86_pmu.event_bits); | 1377 | pr_info("... bit width: %d\n", x86_pmu.cntval_bits); |
1442 | pr_info("... generic registers: %d\n", x86_pmu.num_events); | 1378 | pr_info("... generic registers: %d\n", x86_pmu.num_counters); |
1443 | pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); | 1379 | pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); |
1444 | pr_info("... max period: %016Lx\n", x86_pmu.max_period); | 1380 | pr_info("... max period: %016Lx\n", x86_pmu.max_period); |
1445 | pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); | 1381 | pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); |
1446 | pr_info("... event mask: %016Lx\n", perf_event_mask); | 1382 | pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); |
1447 | 1383 | ||
1448 | perf_cpu_notifier(x86_pmu_notifier); | 1384 | perf_cpu_notifier(x86_pmu_notifier); |
1449 | } | 1385 | } |
@@ -1453,6 +1389,67 @@ static inline void x86_pmu_read(struct perf_event *event) | |||
1453 | x86_perf_event_update(event); | 1389 | x86_perf_event_update(event); |
1454 | } | 1390 | } |
1455 | 1391 | ||
1392 | /* | ||
1393 | * Start group events scheduling transaction | ||
1394 | * Set the flag to make pmu::enable() not perform the | ||
1395 | * schedulability test; it will be performed at commit time | ||
1396 | */ | ||
1397 | static void x86_pmu_start_txn(const struct pmu *pmu) | ||
1398 | { | ||
1399 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1400 | |||
1401 | cpuc->group_flag |= PERF_EVENT_TXN; | ||
1402 | cpuc->n_txn = 0; | ||
1403 | } | ||
1404 | |||
1405 | /* | ||
1406 | * Stop group events scheduling transaction | ||
1407 | * Clear the flag and pmu::enable() will perform the | ||
1408 | * schedulability test. | ||
1409 | */ | ||
1410 | static void x86_pmu_cancel_txn(const struct pmu *pmu) | ||
1411 | { | ||
1412 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1413 | |||
1414 | cpuc->group_flag &= ~PERF_EVENT_TXN; | ||
1415 | /* | ||
1416 | * Truncate the collected events. | ||
1417 | */ | ||
1418 | cpuc->n_added -= cpuc->n_txn; | ||
1419 | cpuc->n_events -= cpuc->n_txn; | ||
1420 | } | ||
1421 | |||
1422 | /* | ||
1423 | * Commit group events scheduling transaction | ||
1424 | * Perform the group schedulability test as a whole | ||
1425 | * Return 0 on success | ||
1426 | */ | ||
1427 | static int x86_pmu_commit_txn(const struct pmu *pmu) | ||
1428 | { | ||
1429 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1430 | int assign[X86_PMC_IDX_MAX]; | ||
1431 | int n, ret; | ||
1432 | |||
1433 | n = cpuc->n_events; | ||
1434 | |||
1435 | if (!x86_pmu_initialized()) | ||
1436 | return -EAGAIN; | ||
1437 | |||
1438 | ret = x86_pmu.schedule_events(cpuc, n, assign); | ||
1439 | if (ret) | ||
1440 | return ret; | ||
1441 | |||
1442 | /* | ||
1443 | * copy the new assignment now that we know it is possible; | ||
1444 | * it will be used by hw_perf_enable() | ||
1445 | */ | ||
1446 | memcpy(cpuc->assign, assign, n*sizeof(int)); | ||
1447 | |||
1448 | cpuc->group_flag &= ~PERF_EVENT_TXN; | ||
1449 | |||
1450 | return 0; | ||
1451 | } | ||
1452 | |||
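Taken together, the three hooks replace the deleted hw_perf_group_sched_in(): the core starts a transaction, adds each group member without per-event schedulability tests, and validates the whole group once at commit. A simplified sketch of how a caller is expected to drive them (event-state bookkeeping omitted; this is an illustration, not the generic core's actual code):

	static int group_sched_in_sketch(const struct pmu *pmu,
					 struct perf_event *leader)
	{
		struct perf_event *sub;

		pmu->start_txn(pmu);		/* defer schedulability tests */

		if (pmu->enable(leader))
			goto fail;

		list_for_each_entry(sub, &leader->sibling_list, group_entry) {
			if (pmu->enable(sub))
				goto fail;
		}

		if (!pmu->commit_txn(pmu))	/* one test for the whole group */
			return 0;
	fail:
		pmu->cancel_txn(pmu);		/* truncates the collected events */
		return -EAGAIN;
	}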
1456 | static const struct pmu pmu = { | 1453 | static const struct pmu pmu = { |
1457 | .enable = x86_pmu_enable, | 1454 | .enable = x86_pmu_enable, |
1458 | .disable = x86_pmu_disable, | 1455 | .disable = x86_pmu_disable, |
@@ -1460,9 +1457,38 @@ static const struct pmu pmu = { | |||
1460 | .stop = x86_pmu_stop, | 1457 | .stop = x86_pmu_stop, |
1461 | .read = x86_pmu_read, | 1458 | .read = x86_pmu_read, |
1462 | .unthrottle = x86_pmu_unthrottle, | 1459 | .unthrottle = x86_pmu_unthrottle, |
1460 | .start_txn = x86_pmu_start_txn, | ||
1461 | .cancel_txn = x86_pmu_cancel_txn, | ||
1462 | .commit_txn = x86_pmu_commit_txn, | ||
1463 | }; | 1463 | }; |
1464 | 1464 | ||
1465 | /* | 1465 | /* |
1466 | * validate that we can schedule this event | ||
1467 | */ | ||
1468 | static int validate_event(struct perf_event *event) | ||
1469 | { | ||
1470 | struct cpu_hw_events *fake_cpuc; | ||
1471 | struct event_constraint *c; | ||
1472 | int ret = 0; | ||
1473 | |||
1474 | fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); | ||
1475 | if (!fake_cpuc) | ||
1476 | return -ENOMEM; | ||
1477 | |||
1478 | c = x86_pmu.get_event_constraints(fake_cpuc, event); | ||
1479 | |||
1480 | if (!c || !c->weight) | ||
1481 | ret = -ENOSPC; | ||
1482 | |||
1483 | if (x86_pmu.put_event_constraints) | ||
1484 | x86_pmu.put_event_constraints(fake_cpuc, event); | ||
1485 | |||
1486 | kfree(fake_cpuc); | ||
1487 | |||
1488 | return ret; | ||
1489 | } | ||
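Validating against a freshly zeroed fake cpu_hw_events is deliberate: the constraint check runs through the same get/put_event_constraints() callbacks without disturbing the live per-CPU scheduling state, so an event that can never be scheduled is rejected at creation time rather than failing later on a real CPU.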
1490 | |||
1491 | /* | ||
1466 | * validate a single event group | 1492 | * validate a single event group |
1467 | * | 1493 | * |
1468 | * validation includes: | 1494 |
@@ -1502,7 +1528,7 @@ static int validate_group(struct perf_event *event) | |||
1502 | 1528 | ||
1503 | fake_cpuc->n_events = n; | 1529 | fake_cpuc->n_events = n; |
1504 | 1530 | ||
1505 | ret = x86_schedule_events(fake_cpuc, n, NULL); | 1531 | ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); |
1506 | 1532 | ||
1507 | out_free: | 1533 | out_free: |
1508 | kfree(fake_cpuc); | 1534 | kfree(fake_cpuc); |
@@ -1527,6 +1553,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) | |||
1527 | 1553 | ||
1528 | if (event->group_leader != event) | 1554 | if (event->group_leader != event) |
1529 | err = validate_group(event); | 1555 | err = validate_group(event); |
1556 | else | ||
1557 | err = validate_event(event); | ||
1530 | 1558 | ||
1531 | event->pmu = tmp; | 1559 | event->pmu = tmp; |
1532 | } | 1560 | } |
@@ -1574,8 +1602,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) | |||
1574 | { | 1602 | { |
1575 | struct perf_callchain_entry *entry = data; | 1603 | struct perf_callchain_entry *entry = data; |
1576 | 1604 | ||
1577 | if (reliable) | 1605 | callchain_store(entry, addr); |
1578 | callchain_store(entry, addr); | ||
1579 | } | 1606 | } |
1580 | 1607 | ||
1581 | static const struct stacktrace_ops backtrace_ops = { | 1608 | static const struct stacktrace_ops backtrace_ops = { |
@@ -1586,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = { | |||
1586 | .walk_stack = print_context_stack_bp, | 1613 | .walk_stack = print_context_stack_bp, |
1587 | }; | 1614 | }; |
1588 | 1615 | ||
1589 | #include "../dumpstack.h" | ||
1590 | |||
1591 | static void | 1616 | static void |
1592 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | 1617 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) |
1593 | { | 1618 | { |
@@ -1597,41 +1622,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | |||
1597 | dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); | 1622 | dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); |
1598 | } | 1623 | } |
1599 | 1624 | ||
1600 | /* | ||
1601 | * best effort, GUP based copy_from_user() that assumes IRQ or NMI context | ||
1602 | */ | ||
1603 | static unsigned long | ||
1604 | copy_from_user_nmi(void *to, const void __user *from, unsigned long n) | ||
1605 | { | ||
1606 | unsigned long offset, addr = (unsigned long)from; | ||
1607 | int type = in_nmi() ? KM_NMI : KM_IRQ0; | ||
1608 | unsigned long size, len = 0; | ||
1609 | struct page *page; | ||
1610 | void *map; | ||
1611 | int ret; | ||
1612 | |||
1613 | do { | ||
1614 | ret = __get_user_pages_fast(addr, 1, 0, &page); | ||
1615 | if (!ret) | ||
1616 | break; | ||
1617 | |||
1618 | offset = addr & (PAGE_SIZE - 1); | ||
1619 | size = min(PAGE_SIZE - offset, n - len); | ||
1620 | |||
1621 | map = kmap_atomic(page, type); | ||
1622 | memcpy(to, map+offset, size); | ||
1623 | kunmap_atomic(map, type); | ||
1624 | put_page(page); | ||
1625 | |||
1626 | len += size; | ||
1627 | to += size; | ||
1628 | addr += size; | ||
1629 | |||
1630 | } while (len < n); | ||
1631 | |||
1632 | return len; | ||
1633 | } | ||
1634 | |||
1635 | #ifdef CONFIG_COMPAT | 1625 | #ifdef CONFIG_COMPAT |
1636 | static inline int | 1626 | static inline int |
1637 | perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) | 1627 | perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) |
@@ -1727,6 +1717,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
1727 | { | 1717 | { |
1728 | struct perf_callchain_entry *entry; | 1718 | struct perf_callchain_entry *entry; |
1729 | 1719 | ||
1720 | if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { | ||
1721 | /* TODO: We don't support guest OS callchains yet */ | ||
1722 | return NULL; | ||
1723 | } | ||
1724 | |||
1730 | if (in_nmi()) | 1725 | if (in_nmi()) |
1731 | entry = &__get_cpu_var(pmc_nmi_entry); | 1726 | entry = &__get_cpu_var(pmc_nmi_entry); |
1732 | else | 1727 | else |
@@ -1739,14 +1734,36 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
1739 | return entry; | 1734 | return entry; |
1740 | } | 1735 | } |
1741 | 1736 | ||
1742 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) | 1737 | unsigned long perf_instruction_pointer(struct pt_regs *regs) |
1743 | { | 1738 | { |
1744 | regs->ip = ip; | 1739 | unsigned long ip; |
1745 | /* | 1740 | |
1746 | * perf_arch_fetch_caller_regs adds another call, we need to increment | 1741 | if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) |
1747 | * the skip level | 1742 | ip = perf_guest_cbs->get_guest_ip(); |
1748 | */ | 1743 | else |
1749 | regs->bp = rewind_frame_pointer(skip + 1); | 1744 | ip = instruction_pointer(regs); |
1750 | regs->cs = __KERNEL_CS; | 1745 | |
1751 | local_save_flags(regs->flags); | 1746 | return ip; |
1747 | } | ||
1748 | |||
1749 | unsigned long perf_misc_flags(struct pt_regs *regs) | ||
1750 | { | ||
1751 | int misc = 0; | ||
1752 | |||
1753 | if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { | ||
1754 | if (perf_guest_cbs->is_user_mode()) | ||
1755 | misc |= PERF_RECORD_MISC_GUEST_USER; | ||
1756 | else | ||
1757 | misc |= PERF_RECORD_MISC_GUEST_KERNEL; | ||
1758 | } else { | ||
1759 | if (user_mode(regs)) | ||
1760 | misc |= PERF_RECORD_MISC_USER; | ||
1761 | else | ||
1762 | misc |= PERF_RECORD_MISC_KERNEL; | ||
1763 | } | ||
1764 | |||
1765 | if (regs->flags & PERF_EFLAGS_EXACT) | ||
1766 | misc |= PERF_RECORD_MISC_EXACT_IP; | ||
1767 | |||
1768 | return misc; | ||
1752 | } | 1769 | } |
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index db6f7d4056e1..c2897b7b4a3b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c | |||
@@ -2,7 +2,7 @@ | |||
2 | 2 | ||
3 | static DEFINE_RAW_SPINLOCK(amd_nb_lock); | 3 | static DEFINE_RAW_SPINLOCK(amd_nb_lock); |
4 | 4 | ||
5 | static __initconst u64 amd_hw_cache_event_ids | 5 | static __initconst const u64 amd_hw_cache_event_ids |
6 | [PERF_COUNT_HW_CACHE_MAX] | 6 | [PERF_COUNT_HW_CACHE_MAX] |
7 | [PERF_COUNT_HW_CACHE_OP_MAX] | 7 | [PERF_COUNT_HW_CACHE_OP_MAX] |
8 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | 8 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = |
@@ -102,8 +102,8 @@ static const u64 amd_perfmon_event_map[] = | |||
102 | [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, | 102 | [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, |
103 | [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, | 103 | [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, |
104 | [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, | 104 | [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, |
105 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, | 105 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, |
106 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, | 106 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, |
107 | }; | 107 | }; |
108 | 108 | ||
109 | static u64 amd_pmu_event_map(int hw_event) | 109 | static u64 amd_pmu_event_map(int hw_event) |
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event) | |||
111 | return amd_perfmon_event_map[hw_event]; | 111 | return amd_perfmon_event_map[hw_event]; |
112 | } | 112 | } |
113 | 113 | ||
114 | static u64 amd_pmu_raw_event(u64 hw_event) | 114 | static int amd_pmu_hw_config(struct perf_event *event) |
115 | { | 115 | { |
116 | #define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL | 116 | int ret = x86_pmu_hw_config(event); |
117 | #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL | 117 | |
118 | #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL | 118 | if (ret) |
119 | #define K7_EVNTSEL_INV_MASK 0x000800000ULL | 119 | return ret; |
120 | #define K7_EVNTSEL_REG_MASK 0x0FF000000ULL | 120 | |
121 | 121 | if (event->attr.type != PERF_TYPE_RAW) | |
122 | #define K7_EVNTSEL_MASK \ | 122 | return 0; |
123 | (K7_EVNTSEL_EVENT_MASK | \ | 123 | |
124 | K7_EVNTSEL_UNIT_MASK | \ | 124 | event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; |
125 | K7_EVNTSEL_EDGE_MASK | \ | 125 | |
126 | K7_EVNTSEL_INV_MASK | \ | 126 | return 0; |
127 | K7_EVNTSEL_REG_MASK) | ||
128 | |||
129 | return hw_event & K7_EVNTSEL_MASK; | ||
130 | } | 127 | } |
131 | 128 | ||
132 | /* | 129 | /* |
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, | |||
165 | * be removed on one CPU at a time AND PMU is disabled | 162 | * be removed on one CPU at a time AND PMU is disabled |
166 | * when we come here | 163 | * when we come here |
167 | */ | 164 | */ |
168 | for (i = 0; i < x86_pmu.num_events; i++) { | 165 | for (i = 0; i < x86_pmu.num_counters; i++) { |
169 | if (nb->owners[i] == event) { | 166 | if (nb->owners[i] == event) { |
170 | cmpxchg(nb->owners+i, event, NULL); | 167 | cmpxchg(nb->owners+i, event, NULL); |
171 | break; | 168 | break; |
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) | |||
215 | struct hw_perf_event *hwc = &event->hw; | 212 | struct hw_perf_event *hwc = &event->hw; |
216 | struct amd_nb *nb = cpuc->amd_nb; | 213 | struct amd_nb *nb = cpuc->amd_nb; |
217 | struct perf_event *old = NULL; | 214 | struct perf_event *old = NULL; |
218 | int max = x86_pmu.num_events; | 215 | int max = x86_pmu.num_counters; |
219 | int i, j, k = -1; | 216 | int i, j, k = -1; |
220 | 217 | ||
221 | /* | 218 | /* |
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) | |||
293 | /* | 290 | /* |
294 | * initialize all possible NB constraints | 291 | * initialize all possible NB constraints |
295 | */ | 292 | */ |
296 | for (i = 0; i < x86_pmu.num_events; i++) { | 293 | for (i = 0; i < x86_pmu.num_counters; i++) { |
297 | __set_bit(i, nb->event_constraints[i].idxmsk); | 294 | __set_bit(i, nb->event_constraints[i].idxmsk); |
298 | nb->event_constraints[i].weight = 1; | 295 | nb->event_constraints[i].weight = 1; |
299 | } | 296 | } |
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu) | |||
371 | raw_spin_unlock(&amd_nb_lock); | 368 | raw_spin_unlock(&amd_nb_lock); |
372 | } | 369 | } |
373 | 370 | ||
374 | static __initconst struct x86_pmu amd_pmu = { | 371 | static __initconst const struct x86_pmu amd_pmu = { |
375 | .name = "AMD", | 372 | .name = "AMD", |
376 | .handle_irq = x86_pmu_handle_irq, | 373 | .handle_irq = x86_pmu_handle_irq, |
377 | .disable_all = x86_pmu_disable_all, | 374 | .disable_all = x86_pmu_disable_all, |
378 | .enable_all = x86_pmu_enable_all, | 375 | .enable_all = x86_pmu_enable_all, |
379 | .enable = x86_pmu_enable_event, | 376 | .enable = x86_pmu_enable_event, |
380 | .disable = x86_pmu_disable_event, | 377 | .disable = x86_pmu_disable_event, |
378 | .hw_config = amd_pmu_hw_config, | ||
379 | .schedule_events = x86_schedule_events, | ||
381 | .eventsel = MSR_K7_EVNTSEL0, | 380 | .eventsel = MSR_K7_EVNTSEL0, |
382 | .perfctr = MSR_K7_PERFCTR0, | 381 | .perfctr = MSR_K7_PERFCTR0, |
383 | .event_map = amd_pmu_event_map, | 382 | .event_map = amd_pmu_event_map, |
384 | .raw_event = amd_pmu_raw_event, | ||
385 | .max_events = ARRAY_SIZE(amd_perfmon_event_map), | 383 | .max_events = ARRAY_SIZE(amd_perfmon_event_map), |
386 | .num_events = 4, | 384 | .num_counters = 4, |
387 | .event_bits = 48, | 385 | .cntval_bits = 48, |
388 | .event_mask = (1ULL << 48) - 1, | 386 | .cntval_mask = (1ULL << 48) - 1, |
389 | .apic = 1, | 387 | .apic = 1, |
390 | /* use highest bit to detect overflow */ | 388 | /* use highest bit to detect overflow */ |
391 | .max_period = (1ULL << 47) - 1, | 389 | .max_period = (1ULL << 47) - 1, |
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9c794ac87837..214ac860ebe0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
@@ -72,6 +72,7 @@ static struct event_constraint intel_westmere_event_constraints[] = | |||
72 | INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ | 72 | INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ |
73 | INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ | 73 | INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ |
74 | INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ | 74 | INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ |
75 | INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ | ||
75 | EVENT_CONSTRAINT_END | 76 | EVENT_CONSTRAINT_END |
76 | }; | 77 | }; |
77 | 78 | ||
@@ -88,7 +89,7 @@ static u64 intel_pmu_event_map(int hw_event) | |||
88 | return intel_perfmon_event_map[hw_event]; | 89 | return intel_perfmon_event_map[hw_event]; |
89 | } | 90 | } |
90 | 91 | ||
91 | static __initconst u64 westmere_hw_cache_event_ids | 92 | static __initconst const u64 westmere_hw_cache_event_ids |
92 | [PERF_COUNT_HW_CACHE_MAX] | 93 | [PERF_COUNT_HW_CACHE_MAX] |
93 | [PERF_COUNT_HW_CACHE_OP_MAX] | 94 | [PERF_COUNT_HW_CACHE_OP_MAX] |
94 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | 95 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = |
@@ -179,7 +180,7 @@ static __initconst u64 westmere_hw_cache_event_ids | |||
179 | }, | 180 | }, |
180 | }; | 181 | }; |
181 | 182 | ||
182 | static __initconst u64 nehalem_hw_cache_event_ids | 183 | static __initconst const u64 nehalem_hw_cache_event_ids |
183 | [PERF_COUNT_HW_CACHE_MAX] | 184 | [PERF_COUNT_HW_CACHE_MAX] |
184 | [PERF_COUNT_HW_CACHE_OP_MAX] | 185 | [PERF_COUNT_HW_CACHE_OP_MAX] |
185 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | 186 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = |
@@ -270,7 +271,7 @@ static __initconst u64 nehalem_hw_cache_event_ids | |||
270 | }, | 271 | }, |
271 | }; | 272 | }; |
272 | 273 | ||
273 | static __initconst u64 core2_hw_cache_event_ids | 274 | static __initconst const u64 core2_hw_cache_event_ids |
274 | [PERF_COUNT_HW_CACHE_MAX] | 275 | [PERF_COUNT_HW_CACHE_MAX] |
275 | [PERF_COUNT_HW_CACHE_OP_MAX] | 276 | [PERF_COUNT_HW_CACHE_OP_MAX] |
276 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | 277 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = |
@@ -361,7 +362,7 @@ static __initconst u64 core2_hw_cache_event_ids | |||
361 | }, | 362 | }, |
362 | }; | 363 | }; |
363 | 364 | ||
364 | static __initconst u64 atom_hw_cache_event_ids | 365 | static __initconst const u64 atom_hw_cache_event_ids |
365 | [PERF_COUNT_HW_CACHE_MAX] | 366 | [PERF_COUNT_HW_CACHE_MAX] |
366 | [PERF_COUNT_HW_CACHE_OP_MAX] | 367 | [PERF_COUNT_HW_CACHE_OP_MAX] |
367 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | 368 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = |
@@ -452,60 +453,6 @@ static __initconst u64 atom_hw_cache_event_ids | |||
452 | }, | 453 | }, |
453 | }; | 454 | }; |
454 | 455 | ||
455 | static u64 intel_pmu_raw_event(u64 hw_event) | ||
456 | { | ||
457 | #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL | ||
458 | #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL | ||
459 | #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL | ||
460 | #define CORE_EVNTSEL_INV_MASK 0x00800000ULL | ||
461 | #define CORE_EVNTSEL_REG_MASK 0xFF000000ULL | ||
462 | |||
463 | #define CORE_EVNTSEL_MASK \ | ||
464 | (INTEL_ARCH_EVTSEL_MASK | \ | ||
465 | INTEL_ARCH_UNIT_MASK | \ | ||
466 | INTEL_ARCH_EDGE_MASK | \ | ||
467 | INTEL_ARCH_INV_MASK | \ | ||
468 | INTEL_ARCH_CNT_MASK) | ||
469 | |||
470 | return hw_event & CORE_EVNTSEL_MASK; | ||
471 | } | ||
472 | |||
473 | static void intel_pmu_enable_bts(u64 config) | ||
474 | { | ||
475 | unsigned long debugctlmsr; | ||
476 | |||
477 | debugctlmsr = get_debugctlmsr(); | ||
478 | |||
479 | debugctlmsr |= X86_DEBUGCTL_TR; | ||
480 | debugctlmsr |= X86_DEBUGCTL_BTS; | ||
481 | debugctlmsr |= X86_DEBUGCTL_BTINT; | ||
482 | |||
483 | if (!(config & ARCH_PERFMON_EVENTSEL_OS)) | ||
484 | debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; | ||
485 | |||
486 | if (!(config & ARCH_PERFMON_EVENTSEL_USR)) | ||
487 | debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; | ||
488 | |||
489 | update_debugctlmsr(debugctlmsr); | ||
490 | } | ||
491 | |||
492 | static void intel_pmu_disable_bts(void) | ||
493 | { | ||
494 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
495 | unsigned long debugctlmsr; | ||
496 | |||
497 | if (!cpuc->ds) | ||
498 | return; | ||
499 | |||
500 | debugctlmsr = get_debugctlmsr(); | ||
501 | |||
502 | debugctlmsr &= | ||
503 | ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | | ||
504 | X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); | ||
505 | |||
506 | update_debugctlmsr(debugctlmsr); | ||
507 | } | ||
508 | |||
509 | static void intel_pmu_disable_all(void) | 456 | static void intel_pmu_disable_all(void) |
510 | { | 457 | { |
511 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 458 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
@@ -514,12 +461,17 @@ static void intel_pmu_disable_all(void) | |||
514 | 461 | ||
515 | if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) | 462 | if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) |
516 | intel_pmu_disable_bts(); | 463 | intel_pmu_disable_bts(); |
464 | |||
465 | intel_pmu_pebs_disable_all(); | ||
466 | intel_pmu_lbr_disable_all(); | ||
517 | } | 467 | } |
518 | 468 | ||
519 | static void intel_pmu_enable_all(void) | 469 | static void intel_pmu_enable_all(int added) |
520 | { | 470 | { |
521 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 471 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
522 | 472 | ||
473 | intel_pmu_pebs_enable_all(); | ||
474 | intel_pmu_lbr_enable_all(); | ||
523 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); | 475 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); |
524 | 476 | ||
525 | if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { | 477 | if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { |
@@ -533,6 +485,42 @@ static void intel_pmu_enable_all(void) | |||
533 | } | 485 | } |
534 | } | 486 | } |
535 | 487 | ||
488 | /* | ||
489 | * Workaround for: | ||
490 | * Intel Errata AAK100 (model 26) | ||
491 | * Intel Errata AAP53 (model 30) | ||
492 | * Intel Errata BD53 (model 44) | ||
493 | * | ||
494 | * These chips need to be 'reset' when adding counters by programming | ||
495 | * the magic three (non-counting) events 0x4300D2, 0x4300B1 and 0x4300B5 | ||
496 | * either in sequence on the same PMC or on different PMCs. | ||
497 | */ | ||
498 | static void intel_pmu_nhm_enable_all(int added) | ||
499 | { | ||
500 | if (added) { | ||
501 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
502 | int i; | ||
503 | |||
504 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); | ||
505 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); | ||
506 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); | ||
507 | |||
508 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); | ||
509 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); | ||
510 | |||
511 | for (i = 0; i < 3; i++) { | ||
512 | struct perf_event *event = cpuc->events[i]; | ||
513 | |||
514 | if (!event) | ||
515 | continue; | ||
516 | |||
517 | __x86_pmu_enable_event(&event->hw, | ||
518 | ARCH_PERFMON_EVENTSEL_ENABLE); | ||
519 | } | ||
520 | } | ||
521 | intel_pmu_enable_all(added); | ||
522 | } | ||
523 | |||
536 | static inline u64 intel_pmu_get_status(void) | 524 | static inline u64 intel_pmu_get_status(void) |
537 | { | 525 | { |
538 | u64 status; | 526 | u64 status; |
@@ -547,8 +535,7 @@ static inline void intel_pmu_ack_status(u64 ack) | |||
547 | wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); | 535 | wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); |
548 | } | 536 | } |
549 | 537 | ||
550 | static inline void | 538 | static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) |
551 | intel_pmu_disable_fixed(struct hw_perf_event *hwc) | ||
552 | { | 539 | { |
553 | int idx = hwc->idx - X86_PMC_IDX_FIXED; | 540 | int idx = hwc->idx - X86_PMC_IDX_FIXED; |
554 | u64 ctrl_val, mask; | 541 | u64 ctrl_val, mask; |
@@ -557,71 +544,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc) | |||
557 | 544 | ||
558 | rdmsrl(hwc->config_base, ctrl_val); | 545 | rdmsrl(hwc->config_base, ctrl_val); |
559 | ctrl_val &= ~mask; | 546 | ctrl_val &= ~mask; |
560 | (void)checking_wrmsrl(hwc->config_base, ctrl_val); | 547 | wrmsrl(hwc->config_base, ctrl_val); |
561 | } | ||
562 | |||
563 | static void intel_pmu_drain_bts_buffer(void) | ||
564 | { | ||
565 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
566 | struct debug_store *ds = cpuc->ds; | ||
567 | struct bts_record { | ||
568 | u64 from; | ||
569 | u64 to; | ||
570 | u64 flags; | ||
571 | }; | ||
572 | struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; | ||
573 | struct bts_record *at, *top; | ||
574 | struct perf_output_handle handle; | ||
575 | struct perf_event_header header; | ||
576 | struct perf_sample_data data; | ||
577 | struct pt_regs regs; | ||
578 | |||
579 | if (!event) | ||
580 | return; | ||
581 | |||
582 | if (!ds) | ||
583 | return; | ||
584 | |||
585 | at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; | ||
586 | top = (struct bts_record *)(unsigned long)ds->bts_index; | ||
587 | |||
588 | if (top <= at) | ||
589 | return; | ||
590 | |||
591 | ds->bts_index = ds->bts_buffer_base; | ||
592 | |||
593 | perf_sample_data_init(&data, 0); | ||
594 | |||
595 | data.period = event->hw.last_period; | ||
596 | regs.ip = 0; | ||
597 | |||
598 | /* | ||
599 | * Prepare a generic sample, i.e. fill in the invariant fields. | ||
600 | * We will overwrite the from and to address before we output | ||
601 | * the sample. | ||
602 | */ | ||
603 | perf_prepare_sample(&header, &data, event, ®s); | ||
604 | |||
605 | if (perf_output_begin(&handle, event, | ||
606 | header.size * (top - at), 1, 1)) | ||
607 | return; | ||
608 | |||
609 | for (; at < top; at++) { | ||
610 | data.ip = at->from; | ||
611 | data.addr = at->to; | ||
612 | |||
613 | perf_output_sample(&handle, &header, &data, event); | ||
614 | } | ||
615 | |||
616 | perf_output_end(&handle); | ||
617 | |||
618 | /* There's new data available. */ | ||
619 | event->hw.interrupts++; | ||
620 | event->pending_kill = POLL_IN; | ||
621 | } | 548 | } |
622 | 549 | ||
623 | static inline void | 550 | static void intel_pmu_disable_event(struct perf_event *event) |
624 | intel_pmu_disable_event(struct perf_event *event) | ||
625 | { | 551 | { |
626 | struct hw_perf_event *hwc = &event->hw; | 552 | struct hw_perf_event *hwc = &event->hw; |
627 | 553 | ||
@@ -637,14 +563,15 @@ intel_pmu_disable_event(struct perf_event *event) | |||
637 | } | 563 | } |
638 | 564 | ||
639 | x86_pmu_disable_event(event); | 565 | x86_pmu_disable_event(event); |
566 | |||
567 | if (unlikely(event->attr.precise_ip)) | ||
568 | intel_pmu_pebs_disable(event); | ||
640 | } | 569 | } |
641 | 570 | ||
642 | static inline void | 571 | static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) |
643 | intel_pmu_enable_fixed(struct hw_perf_event *hwc) | ||
644 | { | 572 | { |
645 | int idx = hwc->idx - X86_PMC_IDX_FIXED; | 573 | int idx = hwc->idx - X86_PMC_IDX_FIXED; |
646 | u64 ctrl_val, bits, mask; | 574 | u64 ctrl_val, bits, mask; |
647 | int err; | ||
648 | 575 | ||
649 | /* | 576 | /* |
650 | * Enable IRQ generation (0x8), | 577 | * Enable IRQ generation (0x8), |
@@ -669,7 +596,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc) | |||
669 | rdmsrl(hwc->config_base, ctrl_val); | 596 | rdmsrl(hwc->config_base, ctrl_val); |
670 | ctrl_val &= ~mask; | 597 | ctrl_val &= ~mask; |
671 | ctrl_val |= bits; | 598 | ctrl_val |= bits; |
672 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | 599 | wrmsrl(hwc->config_base, ctrl_val); |
673 | } | 600 | } |
674 | 601 | ||
675 | static void intel_pmu_enable_event(struct perf_event *event) | 602 | static void intel_pmu_enable_event(struct perf_event *event) |
@@ -689,7 +616,10 @@ static void intel_pmu_enable_event(struct perf_event *event) | |||
689 | return; | 616 | return; |
690 | } | 617 | } |
691 | 618 | ||
692 | __x86_pmu_enable_event(hwc); | 619 | if (unlikely(event->attr.precise_ip)) |
620 | intel_pmu_pebs_enable(event); | ||
621 | |||
622 | __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); | ||
693 | } | 623 | } |
694 | 624 | ||
695 | /* | 625 | /* |
@@ -708,20 +638,20 @@ static void intel_pmu_reset(void) | |||
708 | unsigned long flags; | 638 | unsigned long flags; |
709 | int idx; | 639 | int idx; |
710 | 640 | ||
711 | if (!x86_pmu.num_events) | 641 | if (!x86_pmu.num_counters) |
712 | return; | 642 | return; |
713 | 643 | ||
714 | local_irq_save(flags); | 644 | local_irq_save(flags); |
715 | 645 | ||
716 | printk("clearing PMU state on CPU#%d\n", smp_processor_id()); | 646 | printk("clearing PMU state on CPU#%d\n", smp_processor_id()); |
717 | 647 | ||
718 | for (idx = 0; idx < x86_pmu.num_events; idx++) { | 648 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
719 | checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); | 649 | checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); |
720 | checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); | 650 | checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); |
721 | } | 651 | } |
722 | for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { | 652 | for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) |
723 | checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); | 653 | checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); |
724 | } | 654 | |
725 | if (ds) | 655 | if (ds) |
726 | ds->bts_index = ds->bts_buffer_base; | 656 | ds->bts_index = ds->bts_buffer_base; |
727 | 657 | ||
@@ -747,7 +677,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
747 | intel_pmu_drain_bts_buffer(); | 677 | intel_pmu_drain_bts_buffer(); |
748 | status = intel_pmu_get_status(); | 678 | status = intel_pmu_get_status(); |
749 | if (!status) { | 679 | if (!status) { |
750 | intel_pmu_enable_all(); | 680 | intel_pmu_enable_all(0); |
751 | return 0; | 681 | return 0; |
752 | } | 682 | } |
753 | 683 | ||
@@ -762,6 +692,15 @@ again: | |||
762 | 692 | ||
763 | inc_irq_stat(apic_perf_irqs); | 693 | inc_irq_stat(apic_perf_irqs); |
764 | ack = status; | 694 | ack = status; |
695 | |||
696 | intel_pmu_lbr_read(); | ||
697 | |||
698 | /* | ||
699 | * PEBS overflow sets bit 62 in the global status register | ||
700 | */ | ||
701 | if (__test_and_clear_bit(62, (unsigned long *)&status)) | ||
702 | x86_pmu.drain_pebs(regs); | ||
703 | |||
765 | for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { | 704 | for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { |
766 | struct perf_event *event = cpuc->events[bit]; | 705 | struct perf_event *event = cpuc->events[bit]; |
767 | 706 | ||
@@ -787,26 +726,22 @@ again: | |||
787 | goto again; | 726 | goto again; |
788 | 727 | ||
789 | done: | 728 | done: |
790 | intel_pmu_enable_all(); | 729 | intel_pmu_enable_all(0); |
791 | return 1; | 730 | return 1; |
792 | } | 731 | } |
793 | 732 | ||
794 | static struct event_constraint bts_constraint = | ||
795 | EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); | ||
796 | |||
797 | static struct event_constraint * | 733 | static struct event_constraint * |
798 | intel_special_constraints(struct perf_event *event) | 734 | intel_bts_constraints(struct perf_event *event) |
799 | { | 735 | { |
800 | unsigned int hw_event; | 736 | struct hw_perf_event *hwc = &event->hw; |
801 | 737 | unsigned int hw_event, bts_event; | |
802 | hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; | ||
803 | 738 | ||
804 | if (unlikely((hw_event == | 739 | hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; |
805 | x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && | 740 | bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); |
806 | (event->hw.sample_period == 1))) { | ||
807 | 741 | ||
742 | if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) | ||
808 | return &bts_constraint; | 743 | return &bts_constraint; |
809 | } | 744 | |
810 | return NULL; | 745 | return NULL; |
811 | } | 746 | } |
812 | 747 | ||
@@ -815,24 +750,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event | |||
815 | { | 750 | { |
816 | struct event_constraint *c; | 751 | struct event_constraint *c; |
817 | 752 | ||
818 | c = intel_special_constraints(event); | 753 | c = intel_bts_constraints(event); |
754 | if (c) | ||
755 | return c; | ||
756 | |||
757 | c = intel_pebs_constraints(event); | ||
819 | if (c) | 758 | if (c) |
820 | return c; | 759 | return c; |
821 | 760 | ||
822 | return x86_get_event_constraints(cpuc, event); | 761 | return x86_get_event_constraints(cpuc, event); |
823 | } | 762 | } |
824 | 763 | ||
825 | static __initconst struct x86_pmu core_pmu = { | 764 | static int intel_pmu_hw_config(struct perf_event *event) |
765 | { | ||
766 | int ret = x86_pmu_hw_config(event); | ||
767 | |||
768 | if (ret) | ||
769 | return ret; | ||
770 | |||
771 | if (event->attr.type != PERF_TYPE_RAW) | ||
772 | return 0; | ||
773 | |||
774 | if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) | ||
775 | return 0; | ||
776 | |||
777 | if (x86_pmu.version < 3) | ||
778 | return -EINVAL; | ||
779 | |||
780 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
781 | return -EACCES; | ||
782 | |||
783 | event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; | ||
784 | |||
785 | return 0; | ||
786 | } | ||
787 | |||
788 | static __initconst const struct x86_pmu core_pmu = { | ||
826 | .name = "core", | 789 | .name = "core", |
827 | .handle_irq = x86_pmu_handle_irq, | 790 | .handle_irq = x86_pmu_handle_irq, |
828 | .disable_all = x86_pmu_disable_all, | 791 | .disable_all = x86_pmu_disable_all, |
829 | .enable_all = x86_pmu_enable_all, | 792 | .enable_all = x86_pmu_enable_all, |
830 | .enable = x86_pmu_enable_event, | 793 | .enable = x86_pmu_enable_event, |
831 | .disable = x86_pmu_disable_event, | 794 | .disable = x86_pmu_disable_event, |
795 | .hw_config = x86_pmu_hw_config, | ||
796 | .schedule_events = x86_schedule_events, | ||
832 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, | 797 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, |
833 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, | 798 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, |
834 | .event_map = intel_pmu_event_map, | 799 | .event_map = intel_pmu_event_map, |
835 | .raw_event = intel_pmu_raw_event, | ||
836 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), | 800 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), |
837 | .apic = 1, | 801 | .apic = 1, |
838 | /* | 802 | /* |
@@ -845,17 +809,32 @@ static __initconst struct x86_pmu core_pmu = { | |||
845 | .event_constraints = intel_core_event_constraints, | 809 | .event_constraints = intel_core_event_constraints, |
846 | }; | 810 | }; |
847 | 811 | ||
848 | static __initconst struct x86_pmu intel_pmu = { | 812 | static void intel_pmu_cpu_starting(int cpu) |
813 | { | ||
814 | init_debug_store_on_cpu(cpu); | ||
815 | /* | ||
816 | * Deal with CPUs that don't clear their LBRs on power-up. | ||
817 | */ | ||
818 | intel_pmu_lbr_reset(); | ||
819 | } | ||
820 | |||
821 | static void intel_pmu_cpu_dying(int cpu) | ||
822 | { | ||
823 | fini_debug_store_on_cpu(cpu); | ||
824 | } | ||
825 | |||
826 | static __initconst const struct x86_pmu intel_pmu = { | ||
849 | .name = "Intel", | 827 | .name = "Intel", |
850 | .handle_irq = intel_pmu_handle_irq, | 828 | .handle_irq = intel_pmu_handle_irq, |
851 | .disable_all = intel_pmu_disable_all, | 829 | .disable_all = intel_pmu_disable_all, |
852 | .enable_all = intel_pmu_enable_all, | 830 | .enable_all = intel_pmu_enable_all, |
853 | .enable = intel_pmu_enable_event, | 831 | .enable = intel_pmu_enable_event, |
854 | .disable = intel_pmu_disable_event, | 832 | .disable = intel_pmu_disable_event, |
833 | .hw_config = intel_pmu_hw_config, | ||
834 | .schedule_events = x86_schedule_events, | ||
855 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, | 835 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, |
856 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, | 836 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, |
857 | .event_map = intel_pmu_event_map, | 837 | .event_map = intel_pmu_event_map, |
858 | .raw_event = intel_pmu_raw_event, | ||
859 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), | 838 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), |
860 | .apic = 1, | 839 | .apic = 1, |
861 | /* | 840 | /* |
@@ -864,14 +843,38 @@ static __initconst struct x86_pmu intel_pmu = { | |||
864 | * the generic event period: | 843 | * the generic event period: |
865 | */ | 844 | */ |
866 | .max_period = (1ULL << 31) - 1, | 845 | .max_period = (1ULL << 31) - 1, |
867 | .enable_bts = intel_pmu_enable_bts, | ||
868 | .disable_bts = intel_pmu_disable_bts, | ||
869 | .get_event_constraints = intel_get_event_constraints, | 846 | .get_event_constraints = intel_get_event_constraints, |
870 | 847 | ||
871 | .cpu_starting = init_debug_store_on_cpu, | 848 | .cpu_starting = intel_pmu_cpu_starting, |
872 | .cpu_dying = fini_debug_store_on_cpu, | 849 | .cpu_dying = intel_pmu_cpu_dying, |
873 | }; | 850 | }; |
874 | 851 | ||
852 | static void intel_clovertown_quirks(void) | ||
853 | { | ||
854 | /* | ||
855 | * PEBS is unreliable due to: | ||
856 | * | ||
857 | * AJ67 - PEBS may experience CPL leaks | ||
858 | * AJ68 - PEBS PMI may be delayed by one event | ||
859 | * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] | ||
860 | * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS | ||
861 | * | ||
862 | * AJ67 could be worked around by restricting the OS/USR flags. | ||
863 | * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. | ||
864 | * | ||
865 | * AJ106 could possibly be worked around by not allowing LBR | ||
866 | * usage from PEBS, including the fixup. | ||
867 | * AJ68 could possibly be worked around by always programming | ||
868 | * a pebs_event_reset[0] value and coping with the lost events. | ||
869 | * | ||
870 | * But taken together it might just make sense to not enable PEBS on | ||
871 | * these chips. | ||
872 | */ | ||
873 | printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); | ||
874 | x86_pmu.pebs = 0; | ||
875 | x86_pmu.pebs_constraints = NULL; | ||
876 | } | ||
877 | |||
875 | static __init int intel_pmu_init(void) | 878 | static __init int intel_pmu_init(void) |
876 | { | 879 | { |
877 | union cpuid10_edx edx; | 880 | union cpuid10_edx edx; |
@@ -881,12 +884,13 @@ static __init int intel_pmu_init(void) | |||
881 | int version; | 884 | int version; |
882 | 885 | ||
883 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { | 886 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { |
884 | /* check for P6 processor family */ | 887 | switch (boot_cpu_data.x86) { |
885 | if (boot_cpu_data.x86 == 6) { | 888 | case 0x6: |
886 | return p6_pmu_init(); | 889 | return p6_pmu_init(); |
887 | } else { | 890 | case 0xf: |
891 | return p4_pmu_init(); | ||
892 | } | ||
888 | return -ENODEV; | 893 | return -ENODEV; |
889 | } | ||
890 | } | 894 | } |
891 | 895 | ||
892 | /* | 896 | /* |
@@ -904,16 +908,28 @@ static __init int intel_pmu_init(void) | |||
904 | x86_pmu = intel_pmu; | 908 | x86_pmu = intel_pmu; |
905 | 909 | ||
906 | x86_pmu.version = version; | 910 | x86_pmu.version = version; |
907 | x86_pmu.num_events = eax.split.num_events; | 911 | x86_pmu.num_counters = eax.split.num_counters; |
908 | x86_pmu.event_bits = eax.split.bit_width; | 912 | x86_pmu.cntval_bits = eax.split.bit_width; |
909 | x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; | 913 | x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; |
910 | 914 | ||
911 | /* | 915 | /* |
912 | * Quirk: v2 perfmon does not report fixed-purpose events, so | 916 | * Quirk: v2 perfmon does not report fixed-purpose events, so |
913 | * assume at least 3 events: | 917 | * assume at least 3 events: |
914 | */ | 918 | */ |
915 | if (version > 1) | 919 | if (version > 1) |
916 | x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); | 920 | x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); |
921 | |||
922 | /* | ||
923 | * v2 and above have a perf capabilities MSR | ||
924 | */ | ||
925 | if (version > 1) { | ||
926 | u64 capabilities; | ||
927 | |||
928 | rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); | ||
929 | x86_pmu.intel_cap.capabilities = capabilities; | ||
930 | } | ||
931 | |||
932 | intel_ds_init(); | ||
917 | 933 | ||
918 | /* | 934 | /* |
919 | * Install the hw-cache-events table: | 935 | * Install the hw-cache-events table: |
@@ -924,12 +940,15 @@ static __init int intel_pmu_init(void) | |||
924 | break; | 940 | break; |
925 | 941 | ||
926 | case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ | 942 | case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ |
943 | x86_pmu.quirks = intel_clovertown_quirks; | ||
927 | case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ | 944 | case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ |
928 | case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ | 945 | case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ |
929 | case 29: /* six-core 45 nm xeon "Dunnington" */ | 946 | case 29: /* six-core 45 nm xeon "Dunnington" */ |
930 | memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, | 947 | memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, |
931 | sizeof(hw_cache_event_ids)); | 948 | sizeof(hw_cache_event_ids)); |
932 | 949 | ||
950 | intel_pmu_lbr_init_core(); | ||
951 | |||
933 | x86_pmu.event_constraints = intel_core2_event_constraints; | 952 | x86_pmu.event_constraints = intel_core2_event_constraints; |
934 | pr_cont("Core2 events, "); | 953 | pr_cont("Core2 events, "); |
935 | break; | 954 | break; |
@@ -940,13 +959,19 @@ static __init int intel_pmu_init(void) | |||
940 | memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, | 959 | memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, |
941 | sizeof(hw_cache_event_ids)); | 960 | sizeof(hw_cache_event_ids)); |
942 | 961 | ||
962 | intel_pmu_lbr_init_nhm(); | ||
963 | |||
943 | x86_pmu.event_constraints = intel_nehalem_event_constraints; | 964 | x86_pmu.event_constraints = intel_nehalem_event_constraints; |
944 | pr_cont("Nehalem/Corei7 events, "); | 965 | x86_pmu.enable_all = intel_pmu_nhm_enable_all; |
966 | pr_cont("Nehalem events, "); | ||
945 | break; | 967 | break; |
968 | |||
946 | case 28: /* Atom */ | 969 | case 28: /* Atom */ |
947 | memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, | 970 | memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, |
948 | sizeof(hw_cache_event_ids)); | 971 | sizeof(hw_cache_event_ids)); |
949 | 972 | ||
973 | intel_pmu_lbr_init_atom(); | ||
974 | |||
950 | x86_pmu.event_constraints = intel_gen_event_constraints; | 975 | x86_pmu.event_constraints = intel_gen_event_constraints; |
951 | pr_cont("Atom events, "); | 976 | pr_cont("Atom events, "); |
952 | break; | 977 | break; |
@@ -956,7 +981,10 @@ static __init int intel_pmu_init(void) | |||
956 | memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, | 981 | memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, |
957 | sizeof(hw_cache_event_ids)); | 982 | sizeof(hw_cache_event_ids)); |
958 | 983 | ||
984 | intel_pmu_lbr_init_nhm(); | ||
985 | |||
959 | x86_pmu.event_constraints = intel_westmere_event_constraints; | 986 | x86_pmu.event_constraints = intel_westmere_event_constraints; |
987 | x86_pmu.enable_all = intel_pmu_nhm_enable_all; | ||
960 | pr_cont("Westmere events, "); | 988 | pr_cont("Westmere events, "); |
961 | break; | 989 | break; |
962 | 990 | ||
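
The MSR_IA32_PERF_CAPABILITIES value read in intel_pmu_init() lands in x86_pmu.intel_cap, whose sub-fields (pebs_trap, pebs_format, lbr_format) drive the two new files below. A sketch of the decoding; the field layout follows the SDM's description of IA32_PERF_CAPABILITIES and is an assumption of this note, not quoted from the patch:

    typedef unsigned long long u64;   /* for this stand-alone sketch */

    union perf_capabilities {
            struct {
                    u64 lbr_format    : 6;  /* bits  5:0 */
                    u64 pebs_trap     : 1;  /* bit     6 */
                    u64 pebs_arch_reg : 1;  /* bit     7 */
                    u64 pebs_format   : 4;  /* bits 11:8 */
                    u64 smm_freeze    : 1;  /* bit    12 */
            };
            u64 capabilities;
    };
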
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c new file mode 100644 index 000000000000..18018d1311cd --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c | |||
@@ -0,0 +1,641 @@ | |||
1 | #ifdef CONFIG_CPU_SUP_INTEL | ||
2 | |||
3 | /* The maximal number of PEBS events: */ | ||
4 | #define MAX_PEBS_EVENTS 4 | ||
5 | |||
6 | /* The size of a BTS record in bytes: */ | ||
7 | #define BTS_RECORD_SIZE 24 | ||
8 | |||
9 | #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) | ||
10 | #define PEBS_BUFFER_SIZE PAGE_SIZE | ||
11 | |||
12 | /* | ||
13 | * pebs_record_32 for p4 and core is not supported | ||
14 | |||
15 | struct pebs_record_32 { | ||
16 | u32 flags, ip; | ||
17 | u32 ax, bc, cx, dx; | ||
18 | u32 si, di, bp, sp; | ||
19 | }; | ||
20 | |||
21 | */ | ||
22 | |||
23 | struct pebs_record_core { | ||
24 | u64 flags, ip; | ||
25 | u64 ax, bx, cx, dx; | ||
26 | u64 si, di, bp, sp; | ||
27 | u64 r8, r9, r10, r11; | ||
28 | u64 r12, r13, r14, r15; | ||
29 | }; | ||
30 | |||
31 | struct pebs_record_nhm { | ||
32 | u64 flags, ip; | ||
33 | u64 ax, bx, cx, dx; | ||
34 | u64 si, di, bp, sp; | ||
35 | u64 r8, r9, r10, r11; | ||
36 | u64 r12, r13, r14, r15; | ||
37 | u64 status, dla, dse, lat; | ||
38 | }; | ||
39 | |||
40 | /* | ||
41 | * A debug store configuration. | ||
42 | * | ||
43 | * We only support architectures that use 64bit fields. | ||
44 | */ | ||
45 | struct debug_store { | ||
46 | u64 bts_buffer_base; | ||
47 | u64 bts_index; | ||
48 | u64 bts_absolute_maximum; | ||
49 | u64 bts_interrupt_threshold; | ||
50 | u64 pebs_buffer_base; | ||
51 | u64 pebs_index; | ||
52 | u64 pebs_absolute_maximum; | ||
53 | u64 pebs_interrupt_threshold; | ||
54 | u64 pebs_event_reset[MAX_PEBS_EVENTS]; | ||
55 | }; | ||
56 | |||
57 | static void init_debug_store_on_cpu(int cpu) | ||
58 | { | ||
59 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
60 | |||
61 | if (!ds) | ||
62 | return; | ||
63 | |||
64 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, | ||
65 | (u32)((u64)(unsigned long)ds), | ||
66 | (u32)((u64)(unsigned long)ds >> 32)); | ||
67 | } | ||
68 | |||
69 | static void fini_debug_store_on_cpu(int cpu) | ||
70 | { | ||
71 | if (!per_cpu(cpu_hw_events, cpu).ds) | ||
72 | return; | ||
73 | |||
74 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); | ||
75 | } | ||
76 | |||
77 | static void release_ds_buffers(void) | ||
78 | { | ||
79 | int cpu; | ||
80 | |||
81 | if (!x86_pmu.bts && !x86_pmu.pebs) | ||
82 | return; | ||
83 | |||
84 | get_online_cpus(); | ||
85 | |||
86 | for_each_online_cpu(cpu) | ||
87 | fini_debug_store_on_cpu(cpu); | ||
88 | |||
89 | for_each_possible_cpu(cpu) { | ||
90 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
91 | |||
92 | if (!ds) | ||
93 | continue; | ||
94 | |||
95 | per_cpu(cpu_hw_events, cpu).ds = NULL; | ||
96 | |||
97 | kfree((void *)(unsigned long)ds->pebs_buffer_base); | ||
98 | kfree((void *)(unsigned long)ds->bts_buffer_base); | ||
99 | kfree(ds); | ||
100 | } | ||
101 | |||
102 | put_online_cpus(); | ||
103 | } | ||
104 | |||
105 | static int reserve_ds_buffers(void) | ||
106 | { | ||
107 | int cpu, err = 0; | ||
108 | |||
109 | if (!x86_pmu.bts && !x86_pmu.pebs) | ||
110 | return 0; | ||
111 | |||
112 | get_online_cpus(); | ||
113 | |||
114 | for_each_possible_cpu(cpu) { | ||
115 | struct debug_store *ds; | ||
116 | void *buffer; | ||
117 | int max, thresh; | ||
118 | |||
119 | err = -ENOMEM; | ||
120 | ds = kzalloc(sizeof(*ds), GFP_KERNEL); | ||
121 | if (unlikely(!ds)) | ||
122 | break; | ||
123 | per_cpu(cpu_hw_events, cpu).ds = ds; | ||
124 | |||
125 | if (x86_pmu.bts) { | ||
126 | buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); | ||
127 | if (unlikely(!buffer)) | ||
128 | break; | ||
129 | |||
130 | max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; | ||
131 | thresh = max / 16; | ||
132 | |||
133 | ds->bts_buffer_base = (u64)(unsigned long)buffer; | ||
134 | ds->bts_index = ds->bts_buffer_base; | ||
135 | ds->bts_absolute_maximum = ds->bts_buffer_base + | ||
136 | max * BTS_RECORD_SIZE; | ||
137 | ds->bts_interrupt_threshold = ds->bts_absolute_maximum - | ||
138 | thresh * BTS_RECORD_SIZE; | ||
139 | } | ||
140 | |||
141 | if (x86_pmu.pebs) { | ||
142 | buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); | ||
143 | if (unlikely(!buffer)) | ||
144 | break; | ||
145 | |||
146 | max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; | ||
147 | |||
148 | ds->pebs_buffer_base = (u64)(unsigned long)buffer; | ||
149 | ds->pebs_index = ds->pebs_buffer_base; | ||
150 | ds->pebs_absolute_maximum = ds->pebs_buffer_base + | ||
151 | max * x86_pmu.pebs_record_size; | ||
152 | /* | ||
153 | * Always use single record PEBS | ||
154 | */ | ||
155 | ds->pebs_interrupt_threshold = ds->pebs_buffer_base + | ||
156 | x86_pmu.pebs_record_size; | ||
157 | } | ||
158 | |||
159 | err = 0; | ||
160 | } | ||
161 | |||
162 | if (err) | ||
163 | release_ds_buffers(); | ||
164 | else { | ||
165 | for_each_online_cpu(cpu) | ||
166 | init_debug_store_on_cpu(cpu); | ||
167 | } | ||
168 | |||
169 | put_online_cpus(); | ||
170 | |||
171 | return err; | ||
172 | } | ||
173 | |||
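
Worked numbers for reserve_ds_buffers(), assuming 4 KiB pages: BTS_BUFFER_SIZE is PAGE_SIZE << 4 = 64 KiB, so max = 65536 / 24 = 2730 records and thresh = 2730 / 16 = 170, i.e. the PMI threshold sits 170 records (~4 KiB) before the end of the buffer. The PEBS buffer is a single page with the threshold at base plus one record, so every PEBS write raises a PMI. A stand-alone check of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
            const long page     = 4096;       /* assumption: 4 KiB pages */
            const long bts_size = page << 4;  /* BTS_BUFFER_SIZE */
            const long bts_rec  = 24;         /* BTS_RECORD_SIZE */
            const long max      = bts_size / bts_rec;  /* 2730 records */
            const long thresh   = max / 16;            /* 170 records  */

            printf("BTS: %ld records, PMI %ld records before the end\n",
                   max, thresh);
            return 0;
    }
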
174 | /* | ||
175 | * BTS | ||
176 | */ | ||
177 | |||
178 | static struct event_constraint bts_constraint = | ||
179 | EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); | ||
180 | |||
181 | static void intel_pmu_enable_bts(u64 config) | ||
182 | { | ||
183 | unsigned long debugctlmsr; | ||
184 | |||
185 | debugctlmsr = get_debugctlmsr(); | ||
186 | |||
187 | debugctlmsr |= DEBUGCTLMSR_TR; | ||
188 | debugctlmsr |= DEBUGCTLMSR_BTS; | ||
189 | debugctlmsr |= DEBUGCTLMSR_BTINT; | ||
190 | |||
191 | if (!(config & ARCH_PERFMON_EVENTSEL_OS)) | ||
192 | debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; | ||
193 | |||
194 | if (!(config & ARCH_PERFMON_EVENTSEL_USR)) | ||
195 | debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; | ||
196 | |||
197 | update_debugctlmsr(debugctlmsr); | ||
198 | } | ||
199 | |||
200 | static void intel_pmu_disable_bts(void) | ||
201 | { | ||
202 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
203 | unsigned long debugctlmsr; | ||
204 | |||
205 | if (!cpuc->ds) | ||
206 | return; | ||
207 | |||
208 | debugctlmsr = get_debugctlmsr(); | ||
209 | |||
210 | debugctlmsr &= | ||
211 | ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | | ||
212 | DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); | ||
213 | |||
214 | update_debugctlmsr(debugctlmsr); | ||
215 | } | ||
216 | |||
217 | static void intel_pmu_drain_bts_buffer(void) | ||
218 | { | ||
219 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
220 | struct debug_store *ds = cpuc->ds; | ||
221 | struct bts_record { | ||
222 | u64 from; | ||
223 | u64 to; | ||
224 | u64 flags; | ||
225 | }; | ||
226 | struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; | ||
227 | struct bts_record *at, *top; | ||
228 | struct perf_output_handle handle; | ||
229 | struct perf_event_header header; | ||
230 | struct perf_sample_data data; | ||
231 | struct pt_regs regs; | ||
232 | |||
233 | if (!event) | ||
234 | return; | ||
235 | |||
236 | if (!ds) | ||
237 | return; | ||
238 | |||
239 | at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; | ||
240 | top = (struct bts_record *)(unsigned long)ds->bts_index; | ||
241 | |||
242 | if (top <= at) | ||
243 | return; | ||
244 | |||
245 | ds->bts_index = ds->bts_buffer_base; | ||
246 | |||
247 | perf_sample_data_init(&data, 0); | ||
248 | data.period = event->hw.last_period; | ||
249 | regs.ip = 0; | ||
250 | |||
251 | /* | ||
252 | * Prepare a generic sample, i.e. fill in the invariant fields. | ||
253 | * We will overwrite the from and to address before we output | ||
254 | * the sample. | ||
255 | */ | ||
256 | perf_prepare_sample(&header, &data, event, ®s); | ||
257 | |||
258 | if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) | ||
259 | return; | ||
260 | |||
261 | for (; at < top; at++) { | ||
262 | data.ip = at->from; | ||
263 | data.addr = at->to; | ||
264 | |||
265 | perf_output_sample(&handle, &header, &data, event); | ||
266 | } | ||
267 | |||
268 | perf_output_end(&handle); | ||
269 | |||
270 | /* There's new data available. */ | ||
271 | event->hw.interrupts++; | ||
272 | event->pending_kill = POLL_IN; | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * PEBS | ||
277 | */ | ||
278 | |||
279 | static struct event_constraint intel_core_pebs_events[] = { | ||
280 | PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ | ||
281 | PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ | ||
282 | PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ | ||
283 | PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */ | ||
284 | PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ | ||
285 | PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ | ||
286 | PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ | ||
287 | PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ | ||
288 | PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ | ||
289 | EVENT_CONSTRAINT_END | ||
290 | }; | ||
291 | |||
292 | static struct event_constraint intel_nehalem_pebs_events[] = { | ||
293 | PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ | ||
294 | PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ | ||
295 | PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ | ||
296 | PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */ | ||
297 | PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ | ||
298 | PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ | ||
299 | PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ | ||
300 | PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ | ||
301 | PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ | ||
302 | EVENT_CONSTRAINT_END | ||
303 | }; | ||
304 | |||
305 | static struct event_constraint * | ||
306 | intel_pebs_constraints(struct perf_event *event) | ||
307 | { | ||
308 | struct event_constraint *c; | ||
309 | |||
310 | if (!event->attr.precise_ip) | ||
311 | return NULL; | ||
312 | |||
313 | if (x86_pmu.pebs_constraints) { | ||
314 | for_each_event_constraint(c, x86_pmu.pebs_constraints) { | ||
315 | if ((event->hw.config & c->cmask) == c->code) | ||
316 | return c; | ||
317 | } | ||
318 | } | ||
319 | |||
320 | return &emptyconstraint; | ||
321 | } | ||
322 | |||
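
intel_pebs_constraints() keys off the new perf_event_attr.precise_ip field: 0 requests no precision (no PEBS), 1 a constant skid, and 2 or more zero skid, with precise_ip > 1 additionally engaging the LBR-based fixup for trap-like PEBS further down in this file. From user space it is a one-field change, e.g.:

    #include <linux/perf_event.h>

    static struct perf_event_attr precise_attr = {
            .type          = PERF_TYPE_HARDWARE,
            .config        = PERF_COUNT_HW_INSTRUCTIONS,
            .size          = sizeof(struct perf_event_attr),
            .sample_period = 100000,
            .precise_ip    = 2, /* 0 = no PEBS, 1 = constant skid, 2+ = zero skid */
    };
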
323 | static void intel_pmu_pebs_enable(struct perf_event *event) | ||
324 | { | ||
325 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
326 | struct hw_perf_event *hwc = &event->hw; | ||
327 | |||
328 | hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; | ||
329 | |||
330 | cpuc->pebs_enabled |= 1ULL << hwc->idx; | ||
331 | WARN_ON_ONCE(cpuc->enabled); | ||
332 | |||
333 | if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) | ||
334 | intel_pmu_lbr_enable(event); | ||
335 | } | ||
336 | |||
337 | static void intel_pmu_pebs_disable(struct perf_event *event) | ||
338 | { | ||
339 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
340 | struct hw_perf_event *hwc = &event->hw; | ||
341 | |||
342 | cpuc->pebs_enabled &= ~(1ULL << hwc->idx); | ||
343 | if (cpuc->enabled) | ||
344 | wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); | ||
345 | |||
346 | hwc->config |= ARCH_PERFMON_EVENTSEL_INT; | ||
347 | |||
348 | if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) | ||
349 | intel_pmu_lbr_disable(event); | ||
350 | } | ||
351 | |||
352 | static void intel_pmu_pebs_enable_all(void) | ||
353 | { | ||
354 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
355 | |||
356 | if (cpuc->pebs_enabled) | ||
357 | wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); | ||
358 | } | ||
359 | |||
360 | static void intel_pmu_pebs_disable_all(void) | ||
361 | { | ||
362 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
363 | |||
364 | if (cpuc->pebs_enabled) | ||
365 | wrmsrl(MSR_IA32_PEBS_ENABLE, 0); | ||
366 | } | ||
367 | |||
368 | #include <asm/insn.h> | ||
369 | |||
370 | static inline bool kernel_ip(unsigned long ip) | ||
371 | { | ||
372 | #ifdef CONFIG_X86_32 | ||
373 | return ip > PAGE_OFFSET; | ||
374 | #else | ||
375 | return (long)ip < 0; | ||
376 | #endif | ||
377 | } | ||
378 | |||
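
kernel_ip() above exploits the x86 address-space split: 32-bit kernel addresses sit above PAGE_OFFSET, and on 64-bit the kernel half of the canonical space has the top bits set, so casting to a signed long and testing for negative suffices. A small 64-bit demonstration (the two addresses are merely illustrative):

    #include <assert.h>

    int main(void)
    {
            unsigned long kernel_ish = 0xffff880000000000UL;
            unsigned long user_ish   = 0x00007f1234567000UL;

            assert((long)kernel_ish < 0);  /* top bit set: kernel half */
            assert((long)user_ish >= 0);   /* user half */
            return 0;
    }
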
379 | static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) | ||
380 | { | ||
381 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
382 | unsigned long from = cpuc->lbr_entries[0].from; | ||
383 | unsigned long old_to, to = cpuc->lbr_entries[0].to; | ||
384 | unsigned long ip = regs->ip; | ||
385 | |||
386 | /* | ||
387 | * We don't need to fixup if the PEBS assist is fault like | ||
388 | */ | ||
389 | if (!x86_pmu.intel_cap.pebs_trap) | ||
390 | return 1; | ||
391 | |||
392 | /* | ||
393 | * No LBR entry, no basic block, no rewinding | ||
394 | */ | ||
395 | if (!cpuc->lbr_stack.nr || !from || !to) | ||
396 | return 0; | ||
397 | |||
398 | /* | ||
399 | * Basic blocks should never cross user/kernel boundaries | ||
400 | */ | ||
401 | if (kernel_ip(ip) != kernel_ip(to)) | ||
402 | return 0; | ||
403 | |||
404 | /* | ||
405 | * unsigned math, either ip is before the start (impossible) or | ||
406 | * the basic block is larger than 1 page (sanity) | ||
407 | */ | ||
408 | if ((ip - to) > PAGE_SIZE) | ||
409 | return 0; | ||
410 | |||
411 | /* | ||
412 | * We sampled a branch insn, rewind using the LBR stack | ||
413 | */ | ||
414 | if (ip == to) { | ||
415 | regs->ip = from; | ||
416 | return 1; | ||
417 | } | ||
418 | |||
419 | do { | ||
420 | struct insn insn; | ||
421 | u8 buf[MAX_INSN_SIZE]; | ||
422 | void *kaddr; | ||
423 | |||
424 | old_to = to; | ||
425 | if (!kernel_ip(ip)) { | ||
426 | int bytes, size = MAX_INSN_SIZE; | ||
427 | |||
428 | bytes = copy_from_user_nmi(buf, (void __user *)to, size); | ||
429 | if (bytes != size) | ||
430 | return 0; | ||
431 | |||
432 | kaddr = buf; | ||
433 | } else | ||
434 | kaddr = (void *)to; | ||
435 | |||
436 | kernel_insn_init(&insn, kaddr); | ||
437 | insn_get_length(&insn); | ||
438 | to += insn.length; | ||
439 | } while (to < ip); | ||
440 | |||
441 | if (to == ip) { | ||
442 | regs->ip = old_to; | ||
443 | return 1; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Even though we decoded the basic block, the instruction stream | ||
448 | * never matched the given IP, either the TO or the IP got corrupted. | ||
449 | */ | ||
450 | return 0; | ||
451 | } | ||
452 | |||
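
To make the rewind in intel_pmu_pebs_fixup_ip() concrete: trap-like PEBS reports the instruction after the one that overflowed the counter, so the code starts at the target of the last taken branch (LBR entry 0) and decodes forward until it reaches the reported IP; the previously decoded address is the real sample point. A hypothetical trace (all addresses invented for illustration):

    /*
     * LBR[0]:  from = 0x1000, to = 0x2000   (last taken branch)
     * PEBS ip: 0x200b                       (one insn past the event)
     *
     *   decode at 0x2000, len 4  ->  to = 0x2004
     *   decode at 0x2004, len 7  ->  to = 0x200b == ip, stop
     *
     * regs->ip = old_to = 0x2004, and the sample is flagged
     * PERF_EFLAGS_EXACT.
     */
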
453 | static int intel_pmu_save_and_restart(struct perf_event *event); | ||
454 | |||
455 | static void __intel_pmu_pebs_event(struct perf_event *event, | ||
456 | struct pt_regs *iregs, void *__pebs) | ||
457 | { | ||
458 | /* | ||
459 | * We cast to pebs_record_core since that is a subset of | ||
460 | * both formats and we don't use the other fields in this | ||
461 | * routine. | ||
462 | */ | ||
463 | struct pebs_record_core *pebs = __pebs; | ||
464 | struct perf_sample_data data; | ||
465 | struct pt_regs regs; | ||
466 | |||
467 | if (!intel_pmu_save_and_restart(event)) | ||
468 | return; | ||
469 | |||
470 | perf_sample_data_init(&data, 0); | ||
471 | data.period = event->hw.last_period; | ||
472 | |||
473 | /* | ||
474 | * We use the interrupt regs as a base because the PEBS record | ||
475 | * does not contain a full regs set, specifically it seems to | ||
476 | * lack segment descriptors, which get used by things like | ||
477 | * user_mode(). | ||
478 | * | ||
479 | * In the simple case fix up only the IP and BP,SP regs, for | ||
480 | * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. | ||
481 | * A possible PERF_SAMPLE_REGS will have to transfer all regs. | ||
482 | */ | ||
483 | regs = *iregs; | ||
484 | regs.ip = pebs->ip; | ||
485 | regs.bp = pebs->bp; | ||
486 | regs.sp = pebs->sp; | ||
487 | |||
488 | if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) | ||
489 | regs.flags |= PERF_EFLAGS_EXACT; | ||
490 | else | ||
491 | regs.flags &= ~PERF_EFLAGS_EXACT; | ||
492 | |||
493 | if (perf_event_overflow(event, 1, &data, ®s)) | ||
494 | x86_pmu_stop(event); | ||
495 | } | ||
496 | |||
497 | static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) | ||
498 | { | ||
499 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
500 | struct debug_store *ds = cpuc->ds; | ||
501 | struct perf_event *event = cpuc->events[0]; /* PMC0 only */ | ||
502 | struct pebs_record_core *at, *top; | ||
503 | int n; | ||
504 | |||
505 | if (!ds || !x86_pmu.pebs) | ||
506 | return; | ||
507 | |||
508 | at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; | ||
509 | top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; | ||
510 | |||
511 | /* | ||
512 | * Whatever else happens, drain the thing | ||
513 | */ | ||
514 | ds->pebs_index = ds->pebs_buffer_base; | ||
515 | |||
516 | if (!test_bit(0, cpuc->active_mask)) | ||
517 | return; | ||
518 | |||
519 | WARN_ON_ONCE(!event); | ||
520 | |||
521 | if (!event->attr.precise_ip) | ||
522 | return; | ||
523 | |||
524 | n = top - at; | ||
525 | if (n <= 0) | ||
526 | return; | ||
527 | |||
528 | /* | ||
529 | * Should not happen, we program the threshold at 1 and do not | ||
530 | * set a reset value. | ||
531 | */ | ||
532 | WARN_ON_ONCE(n > 1); | ||
533 | at += n - 1; | ||
534 | |||
535 | __intel_pmu_pebs_event(event, iregs, at); | ||
536 | } | ||
537 | |||
538 | static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) | ||
539 | { | ||
540 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
541 | struct debug_store *ds = cpuc->ds; | ||
542 | struct pebs_record_nhm *at, *top; | ||
543 | struct perf_event *event = NULL; | ||
544 | u64 status = 0; | ||
545 | int bit, n; | ||
546 | |||
547 | if (!ds || !x86_pmu.pebs) | ||
548 | return; | ||
549 | |||
550 | at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; | ||
551 | top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; | ||
552 | |||
553 | ds->pebs_index = ds->pebs_buffer_base; | ||
554 | |||
555 | n = top - at; | ||
556 | if (n <= 0) | ||
557 | return; | ||
558 | |||
559 | /* | ||
560 | * Should not happen, we program the threshold at 1 and do not | ||
561 | * set a reset value. | ||
562 | */ | ||
563 | WARN_ON_ONCE(n > MAX_PEBS_EVENTS); | ||
564 | |||
565 | for ( ; at < top; at++) { | ||
566 | for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { | ||
567 | event = cpuc->events[bit]; | ||
568 | if (!test_bit(bit, cpuc->active_mask)) | ||
569 | continue; | ||
570 | |||
571 | WARN_ON_ONCE(!event); | ||
572 | |||
573 | if (!event->attr.precise_ip) | ||
574 | continue; | ||
575 | |||
576 | if (__test_and_set_bit(bit, (unsigned long *)&status)) | ||
577 | continue; | ||
578 | |||
579 | break; | ||
580 | } | ||
581 | |||
582 | if (!event || bit >= MAX_PEBS_EVENTS) | ||
583 | continue; | ||
584 | |||
585 | __intel_pmu_pebs_event(event, iregs, at); | ||
586 | } | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * BTS, PEBS probe and setup | ||
591 | */ | ||
592 | |||
593 | static void intel_ds_init(void) | ||
594 | { | ||
595 | /* | ||
596 | * No support for 32bit formats | ||
597 | */ | ||
598 | if (!boot_cpu_has(X86_FEATURE_DTES64)) | ||
599 | return; | ||
600 | |||
601 | x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); | ||
602 | x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); | ||
603 | if (x86_pmu.pebs) { | ||
604 | char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; | ||
605 | int format = x86_pmu.intel_cap.pebs_format; | ||
606 | |||
607 | switch (format) { | ||
608 | case 0: | ||
609 | printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); | ||
610 | x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); | ||
611 | x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; | ||
612 | x86_pmu.pebs_constraints = intel_core_pebs_events; | ||
613 | break; | ||
614 | |||
615 | case 1: | ||
616 | printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); | ||
617 | x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); | ||
618 | x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; | ||
619 | x86_pmu.pebs_constraints = intel_nehalem_pebs_events; | ||
620 | break; | ||
621 | |||
622 | default: | ||
623 | printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); | ||
624 | x86_pmu.pebs = 0; | ||
625 | break; | ||
626 | } | ||
627 | } | ||
628 | } | ||
629 | |||
630 | #else /* CONFIG_CPU_SUP_INTEL */ | ||
631 | |||
632 | static int reserve_ds_buffers(void) | ||
633 | { | ||
634 | return 0; | ||
635 | } | ||
636 | |||
637 | static void release_ds_buffers(void) | ||
638 | { | ||
639 | } | ||
640 | |||
641 | #endif /* CONFIG_CPU_SUP_INTEL */ | ||
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c new file mode 100644 index 000000000000..d202c1bece1a --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c | |||
@@ -0,0 +1,218 @@ | |||
1 | #ifdef CONFIG_CPU_SUP_INTEL | ||
2 | |||
3 | enum { | ||
4 | LBR_FORMAT_32 = 0x00, | ||
5 | LBR_FORMAT_LIP = 0x01, | ||
6 | LBR_FORMAT_EIP = 0x02, | ||
7 | LBR_FORMAT_EIP_FLAGS = 0x03, | ||
8 | }; | ||
9 | |||
10 | /* | ||
11 | * We only support LBR implementations that have FREEZE_LBRS_ON_PMI; | ||
12 | * otherwise it becomes nearly impossible to get a reliable stack. | ||
13 | */ | ||
14 | |||
15 | static void __intel_pmu_lbr_enable(void) | ||
16 | { | ||
17 | u64 debugctl; | ||
18 | |||
19 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | ||
20 | debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); | ||
21 | wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | ||
22 | } | ||
23 | |||
24 | static void __intel_pmu_lbr_disable(void) | ||
25 | { | ||
26 | u64 debugctl; | ||
27 | |||
28 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | ||
29 | debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); | ||
30 | wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | ||
31 | } | ||
32 | |||
33 | static void intel_pmu_lbr_reset_32(void) | ||
34 | { | ||
35 | int i; | ||
36 | |||
37 | for (i = 0; i < x86_pmu.lbr_nr; i++) | ||
38 | wrmsrl(x86_pmu.lbr_from + i, 0); | ||
39 | } | ||
40 | |||
41 | static void intel_pmu_lbr_reset_64(void) | ||
42 | { | ||
43 | int i; | ||
44 | |||
45 | for (i = 0; i < x86_pmu.lbr_nr; i++) { | ||
46 | wrmsrl(x86_pmu.lbr_from + i, 0); | ||
47 | wrmsrl(x86_pmu.lbr_to + i, 0); | ||
48 | } | ||
49 | } | ||
50 | |||
51 | static void intel_pmu_lbr_reset(void) | ||
52 | { | ||
53 | if (!x86_pmu.lbr_nr) | ||
54 | return; | ||
55 | |||
56 | if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) | ||
57 | intel_pmu_lbr_reset_32(); | ||
58 | else | ||
59 | intel_pmu_lbr_reset_64(); | ||
60 | } | ||
61 | |||
62 | static void intel_pmu_lbr_enable(struct perf_event *event) | ||
63 | { | ||
64 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
65 | |||
66 | if (!x86_pmu.lbr_nr) | ||
67 | return; | ||
68 | |||
69 | WARN_ON_ONCE(cpuc->enabled); | ||
70 | |||
71 | /* | ||
72 | * Reset the LBR stack if we changed task context to | ||
73 | * avoid data leaks. | ||
74 | */ | ||
75 | |||
76 | if (event->ctx->task && cpuc->lbr_context != event->ctx) { | ||
77 | intel_pmu_lbr_reset(); | ||
78 | cpuc->lbr_context = event->ctx; | ||
79 | } | ||
80 | |||
81 | cpuc->lbr_users++; | ||
82 | } | ||
83 | |||
84 | static void intel_pmu_lbr_disable(struct perf_event *event) | ||
85 | { | ||
86 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
87 | |||
88 | if (!x86_pmu.lbr_nr) | ||
89 | return; | ||
90 | |||
91 | cpuc->lbr_users--; | ||
92 | WARN_ON_ONCE(cpuc->lbr_users < 0); | ||
93 | |||
94 | if (cpuc->enabled && !cpuc->lbr_users) | ||
95 | __intel_pmu_lbr_disable(); | ||
96 | } | ||
97 | |||
98 | static void intel_pmu_lbr_enable_all(void) | ||
99 | { | ||
100 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
101 | |||
102 | if (cpuc->lbr_users) | ||
103 | __intel_pmu_lbr_enable(); | ||
104 | } | ||
105 | |||
106 | static void intel_pmu_lbr_disable_all(void) | ||
107 | { | ||
108 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
109 | |||
110 | if (cpuc->lbr_users) | ||
111 | __intel_pmu_lbr_disable(); | ||
112 | } | ||
113 | |||
114 | static inline u64 intel_pmu_lbr_tos(void) | ||
115 | { | ||
116 | u64 tos; | ||
117 | |||
118 | rdmsrl(x86_pmu.lbr_tos, tos); | ||
119 | |||
120 | return tos; | ||
121 | } | ||
122 | |||
123 | static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) | ||
124 | { | ||
125 | unsigned long mask = x86_pmu.lbr_nr - 1; | ||
126 | u64 tos = intel_pmu_lbr_tos(); | ||
127 | int i; | ||
128 | |||
129 | for (i = 0; i < x86_pmu.lbr_nr; i++) { | ||
130 | unsigned long lbr_idx = (tos - i) & mask; | ||
131 | union { | ||
132 | struct { | ||
133 | u32 from; | ||
134 | u32 to; | ||
135 | }; | ||
136 | u64 lbr; | ||
137 | } msr_lastbranch; | ||
138 | |||
139 | rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); | ||
140 | |||
141 | cpuc->lbr_entries[i].from = msr_lastbranch.from; | ||
142 | cpuc->lbr_entries[i].to = msr_lastbranch.to; | ||
143 | cpuc->lbr_entries[i].flags = 0; | ||
144 | } | ||
145 | cpuc->lbr_stack.nr = i; | ||
146 | } | ||
147 | |||
148 | #define LBR_FROM_FLAG_MISPRED (1ULL << 63) | ||
149 | |||
150 | /* | ||
151 | * Due to lack of segmentation in Linux the effective address (offset) | ||
152 | * is the same as the linear address, allowing us to merge the LIP and EIP | ||
153 | * LBR formats. | ||
154 | */ | ||
155 | static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) | ||
156 | { | ||
157 | unsigned long mask = x86_pmu.lbr_nr - 1; | ||
158 | int lbr_format = x86_pmu.intel_cap.lbr_format; | ||
159 | u64 tos = intel_pmu_lbr_tos(); | ||
160 | int i; | ||
161 | |||
162 | for (i = 0; i < x86_pmu.lbr_nr; i++) { | ||
163 | unsigned long lbr_idx = (tos - i) & mask; | ||
164 | u64 from, to, flags = 0; | ||
165 | |||
166 | rdmsrl(x86_pmu.lbr_from + lbr_idx, from); | ||
167 | rdmsrl(x86_pmu.lbr_to + lbr_idx, to); | ||
168 | |||
169 | if (lbr_format == LBR_FORMAT_EIP_FLAGS) { | ||
170 | flags = !!(from & LBR_FROM_FLAG_MISPRED); | ||
171 | from = (u64)((((s64)from) << 1) >> 1); | ||
172 | } | ||
173 | |||
174 | cpuc->lbr_entries[i].from = from; | ||
175 | cpuc->lbr_entries[i].to = to; | ||
176 | cpuc->lbr_entries[i].flags = flags; | ||
177 | } | ||
178 | cpuc->lbr_stack.nr = i; | ||
179 | } | ||
180 | |||
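
The flags handling in intel_pmu_lbr_read_64() above strips the mispredict flag with a shift pair: ((s64)from << 1) >> 1 arithmetic-shifts bit 62 back into bit 63, which both clears LBR_FROM_FLAG_MISPRED and sign-extends the remaining 63-bit address into canonical form. A quick demonstration (the FROM value is invented; the shift relies on gcc's arithmetic right shift for signed types, as the kernel does):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* mispredict flag (bit 63) on top of a user-half address */
            uint64_t from    = (1ULL << 63) | 0x00007f0012345678ULL;
            uint64_t mispred = !!(from & (1ULL << 63));
            uint64_t addr    = (uint64_t)(((int64_t)from << 1) >> 1);

            assert(mispred == 1);
            assert(addr == 0x00007f0012345678ULL); /* flag gone, address intact */
            return 0;
    }
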
181 | static void intel_pmu_lbr_read(void) | ||
182 | { | ||
183 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
184 | |||
185 | if (!cpuc->lbr_users) | ||
186 | return; | ||
187 | |||
188 | if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) | ||
189 | intel_pmu_lbr_read_32(cpuc); | ||
190 | else | ||
191 | intel_pmu_lbr_read_64(cpuc); | ||
192 | } | ||
193 | |||
194 | static void intel_pmu_lbr_init_core(void) | ||
195 | { | ||
196 | x86_pmu.lbr_nr = 4; | ||
197 | x86_pmu.lbr_tos = 0x01c9; | ||
198 | x86_pmu.lbr_from = 0x40; | ||
199 | x86_pmu.lbr_to = 0x60; | ||
200 | } | ||
201 | |||
202 | static void intel_pmu_lbr_init_nhm(void) | ||
203 | { | ||
204 | x86_pmu.lbr_nr = 16; | ||
205 | x86_pmu.lbr_tos = 0x01c9; | ||
206 | x86_pmu.lbr_from = 0x680; | ||
207 | x86_pmu.lbr_to = 0x6c0; | ||
208 | } | ||
209 | |||
210 | static void intel_pmu_lbr_init_atom(void) | ||
211 | { | ||
212 | x86_pmu.lbr_nr = 8; | ||
213 | x86_pmu.lbr_tos = 0x01c9; | ||
214 | x86_pmu.lbr_from = 0x40; | ||
215 | x86_pmu.lbr_to = 0x60; | ||
216 | } | ||
217 | |||
218 | #endif /* CONFIG_CPU_SUP_INTEL */ | ||
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c new file mode 100644 index 000000000000..107711bf0ee8 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p4.c | |||
@@ -0,0 +1,942 @@ | |||
1 | /* | ||
2 | * Netburst Performance Events (P4, old Xeon) | ||
3 | * | ||
4 | * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> | ||
5 | * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> | ||
6 | * | ||
7 | * For licencing details see kernel-base/COPYING | ||
8 | */ | ||
9 | |||
10 | #ifdef CONFIG_CPU_SUP_INTEL | ||
11 | |||
12 | #include <asm/perf_event_p4.h> | ||
13 | |||
14 | #define P4_CNTR_LIMIT 3 | ||
15 | /* | ||
16 | * array indices: 0,1 - HT threads, used on HT-enabled CPUs | ||
17 | */ | ||
18 | struct p4_event_bind { | ||
19 | unsigned int opcode; /* Event code and ESCR selector */ | ||
20 | unsigned int escr_msr[2]; /* ESCR MSR for this event */ | ||
21 | char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ | ||
22 | }; | ||
23 | |||
24 | struct p4_pebs_bind { | ||
25 | unsigned int metric_pebs; | ||
26 | unsigned int metric_vert; | ||
27 | }; | ||
28 | |||
29 | /* it sets P4_PEBS_ENABLE_UOP_TAG as well */ | ||
30 | #define P4_GEN_PEBS_BIND(name, pebs, vert) \ | ||
31 | [P4_PEBS_METRIC__##name] = { \ | ||
32 | .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ | ||
33 | .metric_vert = vert, \ | ||
34 | } | ||
35 | |||
36 | /* | ||
37 | * note we have P4_PEBS_ENABLE_UOP_TAG always set here | ||
38 | * | ||
39 | * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of | ||
40 | * event configuration to find out which values are to be | ||
41 | * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT | ||
42 | * registers | ||
43 | */ | ||
44 | static struct p4_pebs_bind p4_pebs_bind_map[] = { | ||
45 | P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), | ||
46 | P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), | ||
47 | P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), | ||
48 | P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), | ||
49 | P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), | ||
50 | P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), | ||
51 | P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), | ||
52 | P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), | ||
53 | P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), | ||
54 | }; | ||
55 | |||
56 | /* | ||
57 | * Note that we don't use CCCR1 here; there is an | ||
58 | * exception for P4_BSQ_ALLOCATION but we simply have | ||
59 | * no workaround | ||
60 | * | ||
61 | * Consider this binding as the resources which a particular | ||
62 | * event may borrow; it doesn't contain the EventMask, | ||
63 | * Tags and friends -- they are left to the caller | ||
64 | */ | ||
65 | static struct p4_event_bind p4_event_bind_map[] = { | ||
66 | [P4_EVENT_TC_DELIVER_MODE] = { | ||
67 | .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), | ||
68 | .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, | ||
69 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | ||
70 | }, | ||
71 | [P4_EVENT_BPU_FETCH_REQUEST] = { | ||
72 | .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), | ||
73 | .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, | ||
74 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
75 | }, | ||
76 | [P4_EVENT_ITLB_REFERENCE] = { | ||
77 | .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), | ||
78 | .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, | ||
79 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
80 | }, | ||
81 | [P4_EVENT_MEMORY_CANCEL] = { | ||
82 | .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), | ||
83 | .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, | ||
84 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
85 | }, | ||
86 | [P4_EVENT_MEMORY_COMPLETE] = { | ||
87 | .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), | ||
88 | .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, | ||
89 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
90 | }, | ||
91 | [P4_EVENT_LOAD_PORT_REPLAY] = { | ||
92 | .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), | ||
93 | .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, | ||
94 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
95 | }, | ||
96 | [P4_EVENT_STORE_PORT_REPLAY] = { | ||
97 | .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), | ||
98 | .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, | ||
99 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
100 | }, | ||
101 | [P4_EVENT_MOB_LOAD_REPLAY] = { | ||
102 | .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), | ||
103 | .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, | ||
104 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
105 | }, | ||
106 | [P4_EVENT_PAGE_WALK_TYPE] = { | ||
107 | .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), | ||
108 | .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, | ||
109 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
110 | }, | ||
111 | [P4_EVENT_BSQ_CACHE_REFERENCE] = { | ||
112 | .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), | ||
113 | .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, | ||
114 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
115 | }, | ||
116 | [P4_EVENT_IOQ_ALLOCATION] = { | ||
117 | .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), | ||
118 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | ||
119 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
120 | }, | ||
121 | [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ | ||
122 | .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), | ||
123 | .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, | ||
124 | .cntr = { {2, -1, -1}, {3, -1, -1} }, | ||
125 | }, | ||
126 | [P4_EVENT_FSB_DATA_ACTIVITY] = { | ||
127 | .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), | ||
128 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | ||
129 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
130 | }, | ||
131 | [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ | ||
132 | .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), | ||
133 | .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, | ||
134 | .cntr = { {0, -1, -1}, {1, -1, -1} }, | ||
135 | }, | ||
136 | [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ | ||
137 | .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), | ||
138 | .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, | ||
139 | .cntr = { {2, -1, -1}, {3, -1, -1} }, | ||
140 | }, | ||
141 | [P4_EVENT_SSE_INPUT_ASSIST] = { | ||
142 | .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), | ||
143 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
144 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
145 | }, | ||
146 | [P4_EVENT_PACKED_SP_UOP] = { | ||
147 | .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), | ||
148 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
149 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
150 | }, | ||
151 | [P4_EVENT_PACKED_DP_UOP] = { | ||
152 | .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), | ||
153 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
154 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
155 | }, | ||
156 | [P4_EVENT_SCALAR_SP_UOP] = { | ||
157 | .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), | ||
158 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
159 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
160 | }, | ||
161 | [P4_EVENT_SCALAR_DP_UOP] = { | ||
162 | .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), | ||
163 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
164 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
165 | }, | ||
166 | [P4_EVENT_64BIT_MMX_UOP] = { | ||
167 | .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), | ||
168 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
169 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
170 | }, | ||
171 | [P4_EVENT_128BIT_MMX_UOP] = { | ||
172 | .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), | ||
173 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
174 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
175 | }, | ||
176 | [P4_EVENT_X87_FP_UOP] = { | ||
177 | .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), | ||
178 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | ||
179 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
180 | }, | ||
181 | [P4_EVENT_TC_MISC] = { | ||
182 | .opcode = P4_OPCODE(P4_EVENT_TC_MISC), | ||
183 | .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, | ||
184 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | ||
185 | }, | ||
186 | [P4_EVENT_GLOBAL_POWER_EVENTS] = { | ||
187 | .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), | ||
188 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | ||
189 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
190 | }, | ||
191 | [P4_EVENT_TC_MS_XFER] = { | ||
192 | .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), | ||
193 | .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, | ||
194 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | ||
195 | }, | ||
196 | [P4_EVENT_UOP_QUEUE_WRITES] = { | ||
197 | .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), | ||
198 | .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, | ||
199 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | ||
200 | }, | ||
201 | [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { | ||
202 | .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), | ||
203 | .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, | ||
204 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | ||
205 | }, | ||
206 | [P4_EVENT_RETIRED_BRANCH_TYPE] = { | ||
207 | .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), | ||
208 | .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, | ||
209 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | ||
210 | }, | ||
211 | [P4_EVENT_RESOURCE_STALL] = { | ||
212 | .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), | ||
213 | .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, | ||
214 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
215 | }, | ||
216 | [P4_EVENT_WC_BUFFER] = { | ||
217 | .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), | ||
218 | .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, | ||
219 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | ||
220 | }, | ||
221 | [P4_EVENT_B2B_CYCLES] = { | ||
222 | .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), | ||
223 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | ||
224 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
225 | }, | ||
226 | [P4_EVENT_BNR] = { | ||
227 | .opcode = P4_OPCODE(P4_EVENT_BNR), | ||
228 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | ||
229 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
230 | }, | ||
231 | [P4_EVENT_SNOOP] = { | ||
232 | .opcode = P4_OPCODE(P4_EVENT_SNOOP), | ||
233 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | ||
234 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
235 | }, | ||
236 | [P4_EVENT_RESPONSE] = { | ||
237 | .opcode = P4_OPCODE(P4_EVENT_RESPONSE), | ||
238 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | ||
239 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | ||
240 | }, | ||
241 | [P4_EVENT_FRONT_END_EVENT] = { | ||
242 | .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), | ||
243 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | ||
244 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
245 | }, | ||
246 | [P4_EVENT_EXECUTION_EVENT] = { | ||
247 | .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), | ||
248 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | ||
249 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
250 | }, | ||
251 | [P4_EVENT_REPLAY_EVENT] = { | ||
252 | .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), | ||
253 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | ||
254 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
255 | }, | ||
256 | [P4_EVENT_INSTR_RETIRED] = { | ||
257 | .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), | ||
258 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | ||
259 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
260 | }, | ||
261 | [P4_EVENT_UOPS_RETIRED] = { | ||
262 | .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), | ||
263 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | ||
264 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
265 | }, | ||
266 | [P4_EVENT_UOP_TYPE] = { | ||
267 | .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), | ||
268 | .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, | ||
269 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
270 | }, | ||
271 | [P4_EVENT_BRANCH_RETIRED] = { | ||
272 | .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), | ||
273 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | ||
274 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
275 | }, | ||
276 | [P4_EVENT_MISPRED_BRANCH_RETIRED] = { | ||
277 | .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), | ||
278 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | ||
279 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
280 | }, | ||
281 | [P4_EVENT_X87_ASSIST] = { | ||
282 | .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), | ||
283 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | ||
284 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
285 | }, | ||
286 | [P4_EVENT_MACHINE_CLEAR] = { | ||
287 | .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), | ||
288 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | ||
289 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
290 | }, | ||
291 | [P4_EVENT_INSTR_COMPLETED] = { | ||
292 | .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), | ||
293 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | ||
294 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | ||
295 | }, | ||
296 | }; | ||
297 | |||
298 | #define P4_GEN_CACHE_EVENT(event, bit, metric) \ | ||
299 | p4_config_pack_escr(P4_ESCR_EVENT(event) | \ | ||
300 | P4_ESCR_EMASK_BIT(event, bit)) | \ | ||
301 | p4_config_pack_cccr(metric | \ | ||
302 | P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) | ||
303 | |||
304 | static __initconst const u64 p4_hw_cache_event_ids | ||
305 | [PERF_COUNT_HW_CACHE_MAX] | ||
306 | [PERF_COUNT_HW_CACHE_OP_MAX] | ||
307 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | ||
308 | { | ||
309 | [ C(L1D ) ] = { | ||
310 | [ C(OP_READ) ] = { | ||
311 | [ C(RESULT_ACCESS) ] = 0x0, | ||
312 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | ||
313 | P4_PEBS_METRIC__1stl_cache_load_miss_retired), | ||
314 | }, | ||
315 | }, | ||
316 | [ C(LL ) ] = { | ||
317 | [ C(OP_READ) ] = { | ||
318 | [ C(RESULT_ACCESS) ] = 0x0, | ||
319 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | ||
320 | P4_PEBS_METRIC__2ndl_cache_load_miss_retired), | ||
321 | }, | ||
322 | }, | ||
323 | [ C(DTLB) ] = { | ||
324 | [ C(OP_READ) ] = { | ||
325 | [ C(RESULT_ACCESS) ] = 0x0, | ||
326 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | ||
327 | P4_PEBS_METRIC__dtlb_load_miss_retired), | ||
328 | }, | ||
329 | [ C(OP_WRITE) ] = { | ||
330 | [ C(RESULT_ACCESS) ] = 0x0, | ||
331 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, | ||
332 | P4_PEBS_METRIC__dtlb_store_miss_retired), | ||
333 | }, | ||
334 | }, | ||
335 | [ C(ITLB) ] = { | ||
336 | [ C(OP_READ) ] = { | ||
337 | [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, | ||
338 | P4_PEBS_METRIC__none), | ||
339 | [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, | ||
340 | P4_PEBS_METRIC__none), | ||
341 | }, | ||
342 | [ C(OP_WRITE) ] = { | ||
343 | [ C(RESULT_ACCESS) ] = -1, | ||
344 | [ C(RESULT_MISS) ] = -1, | ||
345 | }, | ||
346 | [ C(OP_PREFETCH) ] = { | ||
347 | [ C(RESULT_ACCESS) ] = -1, | ||
348 | [ C(RESULT_MISS) ] = -1, | ||
349 | }, | ||
350 | }, | ||
351 | }; | ||
352 | |||
353 | static u64 p4_general_events[PERF_COUNT_HW_MAX] = { | ||
354 | /* non-halted CPU clocks */ | ||
355 | [PERF_COUNT_HW_CPU_CYCLES] = | ||
356 | p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | | ||
357 | P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), | ||
358 | |||
359 | /* | ||
360 | * retired instructions | ||
361 | * for the sake of simplicity we don't use the FSB tagging | ||
362 | */ | ||
363 | [PERF_COUNT_HW_INSTRUCTIONS] = | ||
364 | p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | | ||
365 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | | ||
366 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), | ||
367 | |||
368 | /* cache hits */ | ||
369 | [PERF_COUNT_HW_CACHE_REFERENCES] = | ||
370 | p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | | ||
371 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | | ||
372 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | | ||
373 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | | ||
374 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | | ||
375 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | | ||
376 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), | ||
377 | |||
378 | /* cache misses */ | ||
379 | [PERF_COUNT_HW_CACHE_MISSES] = | ||
380 | p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | | ||
381 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | | ||
382 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | | ||
383 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), | ||
384 | |||
385 | /* branch instructions retired */ | ||
386 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = | ||
387 | p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | | ||
388 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | | ||
389 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | | ||
390 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | | ||
391 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), | ||
392 | |||
393 | /* mispredicted branches retired */ | ||
394 | [PERF_COUNT_HW_BRANCH_MISSES] = | ||
395 | p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | | ||
396 | P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), | ||
397 | |||
398 | /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ | ||
399 | [PERF_COUNT_HW_BUS_CYCLES] = | ||
400 | p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | | ||
401 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | | ||
402 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | | ||
403 | p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), | ||
404 | }; | ||
405 | |||
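
p4_general_events[] above stores only the packed ESCR half of each template; p4_pmu_event_map() below completes the config by packing the CCCR ESEL field next to it. Assuming the helpers place the ESCR bits in the high 32 bits of the u64 config and the CCCR bits in the low 32 (which is how the rest of this file reads them), the round trip looks like:

    #include <stdint.h>

    /* sketch of the packing helpers; the 32/32 split is an assumption */
    static inline uint64_t pack_escr(uint32_t v)   { return (uint64_t)v << 32; }
    static inline uint64_t pack_cccr(uint32_t v)   { return (uint64_t)v; }
    static inline uint32_t unpack_escr(uint64_t c) { return (uint32_t)(c >> 32); }
    static inline uint32_t unpack_cccr(uint64_t c) { return (uint32_t)c; }

    /* e.g. config = pack_escr(escr_bits) | pack_cccr(P4_CCCR_ESEL(esel)); */
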
406 | static struct p4_event_bind *p4_config_get_bind(u64 config) | ||
407 | { | ||
408 | unsigned int evnt = p4_config_unpack_event(config); | ||
409 | struct p4_event_bind *bind = NULL; | ||
410 | |||
411 | if (evnt < ARRAY_SIZE(p4_event_bind_map)) | ||
412 | bind = &p4_event_bind_map[evnt]; | ||
413 | |||
414 | return bind; | ||
415 | } | ||
416 | |||
417 | static u64 p4_pmu_event_map(int hw_event) | ||
418 | { | ||
419 | struct p4_event_bind *bind; | ||
420 | unsigned int esel; | ||
421 | u64 config; | ||
422 | |||
423 | config = p4_general_events[hw_event]; | ||
424 | bind = p4_config_get_bind(config); | ||
425 | esel = P4_OPCODE_ESEL(bind->opcode); | ||
426 | config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); | ||
427 | |||
428 | return config; | ||
429 | } | ||
430 | |||
431 | static int p4_validate_raw_event(struct perf_event *event) | ||
432 | { | ||
433 | unsigned int v; | ||
434 | |||
435 | /* user data may have out-of-bound event index */ | ||
436 | v = p4_config_unpack_event(event->attr.config); | ||
437 | if (v >= ARRAY_SIZE(p4_event_bind_map)) { | ||
438 | pr_warning("P4 PMU: Unknown event code: %d\n", v); | ||
439 | return -EINVAL; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * it may have some screwed PEBS bits | ||
444 | */ | ||
445 | if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { | ||
446 | pr_warning("P4 PMU: PEBS are not supported yet\n"); | ||
447 | return -EINVAL; | ||
448 | } | ||
449 | v = p4_config_unpack_metric(event->attr.config); | ||
450 | if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { | ||
451 | pr_warning("P4 PMU: Unknown metric code: %d\n", v); | ||
452 | return -EINVAL; | ||
453 | } | ||
454 | |||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | static int p4_hw_config(struct perf_event *event) | ||
459 | { | ||
460 | int cpu = get_cpu(); | ||
461 | int rc = 0; | ||
462 | u32 escr, cccr; | ||
463 | |||
464 | /* | ||
465 | * the reason we take the cpu this early is that if we get scheduled | ||
466 | * for the first time on the same cpu -- we will not need to swap the | ||
467 | * thread-specific flags in the config (and will save some cpu cycles) | ||
468 | */ | ||
469 | |||
470 | cccr = p4_default_cccr_conf(cpu); | ||
471 | escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, | ||
472 | event->attr.exclude_user); | ||
473 | event->hw.config = p4_config_pack_escr(escr) | | ||
474 | p4_config_pack_cccr(cccr); | ||
475 | |||
476 | if (p4_ht_active() && p4_ht_thread(cpu)) | ||
477 | event->hw.config = p4_set_ht_bit(event->hw.config); | ||
478 | |||
479 | if (event->attr.type == PERF_TYPE_RAW) { | ||
480 | |||
481 | rc = p4_validate_raw_event(event); | ||
482 | if (rc) | ||
483 | goto out; | ||
484 | |||
485 | /* | ||
486 | * We don't control raw events, so it's up to the caller | ||
487 | * to pass sane values (and we don't count the thread number | ||
488 | * on an HT machine, but we do allow HT-compatible specifics | ||
489 | * to be passed on) | ||
490 | * | ||
491 | * Note that for RAW events we allow the user to use P4_CCCR_RESERVED | ||
492 | * bits, since we keep additional info there (for cache events etc.) | ||
493 | * | ||
494 | * XXX: HT-wide things should check perf_paranoid_cpu() && | ||
495 | * CAP_SYS_ADMIN | ||
496 | */ | ||
497 | event->hw.config |= event->attr.config & | ||
498 | (p4_config_pack_escr(P4_ESCR_MASK_HT) | | ||
499 | p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); | ||
500 | } | ||
501 | |||
502 | rc = x86_setup_perfctr(event); | ||
503 | out: | ||
504 | put_cpu(); | ||
505 | return rc; | ||
506 | } | ||
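For illustration, a hypothetical user-space counterpart of the PERF_TYPE_RAW path above: the caller packs an ESCR/CCCR pair into attr.config, and p4_hw_config() masks it down to the HT-compatible bits before use. A sketch only; producing a valid P4 config is the caller's problem, as the comment above says:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* hypothetical helper: open a raw Netburst event on the calling task */
    static int open_raw_p4_event(__u64 packed_config)
    {
            struct perf_event_attr attr = {
                    .type   = PERF_TYPE_RAW,
                    .size   = sizeof(attr),
                    .config = packed_config,   /* ESCR/CCCR pair */
            };

            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }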
507 | |||
508 | static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) | ||
509 | { | ||
510 | int overflow = 0; | ||
511 | u32 low, high; | ||
512 | |||
513 | rdmsr(hwc->config_base + hwc->idx, low, high); | ||
514 | |||
515 | /* we need to check the high bit for unflagged overflows */ | ||
516 | if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { | ||
517 | overflow = 1; | ||
518 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, | ||
519 | ((u64)low) & ~P4_CCCR_OVF); | ||
520 | } | ||
521 | |||
522 | return overflow; | ||
523 | } | ||
524 | |||
525 | static void p4_pmu_disable_pebs(void) | ||
526 | { | ||
527 | /* | ||
528 | * FIXME | ||
529 | * | ||
530 | * Two threads are still allowed to set up the same cache | ||
531 | * events, so we cannot simply clear the metrics until we know | ||
532 | * no one depends on us anymore; we would need some kind of | ||
533 | * reference counter for the "ReplayEvent" users. | ||
534 | * | ||
535 | * RAW events are more complex still: if the user (for some | ||
536 | * reason) passes a cache event metric with a mismatching | ||
537 | * event opcode, that is fine from the hardware's point of view | ||
538 | * but complete nonsense as far as the meaning of the action goes. | ||
539 | * | ||
540 | * So for the moment leave the metrics turned on forever -- it's | ||
541 | * ok for now but needs to be revisited! | ||
542 | * | ||
543 | * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); | ||
544 | * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); | ||
545 | */ | ||
546 | } | ||
547 | |||
548 | static inline void p4_pmu_disable_event(struct perf_event *event) | ||
549 | { | ||
550 | struct hw_perf_event *hwc = &event->hw; | ||
551 | |||
552 | /* | ||
553 | * If the event gets disabled while the counter is in an overflowed | ||
554 | * state we need to clear P4_CCCR_OVF, otherwise the interrupt gets | ||
555 | * asserted again and again | ||
556 | */ | ||
557 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, | ||
558 | (u64)(p4_config_unpack_cccr(hwc->config)) & | ||
559 | ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); | ||
560 | } | ||
561 | |||
562 | static void p4_pmu_disable_all(void) | ||
563 | { | ||
564 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
565 | int idx; | ||
566 | |||
567 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
568 | struct perf_event *event = cpuc->events[idx]; | ||
569 | if (!test_bit(idx, cpuc->active_mask)) | ||
570 | continue; | ||
571 | p4_pmu_disable_event(event); | ||
572 | } | ||
573 | |||
574 | p4_pmu_disable_pebs(); | ||
575 | } | ||
576 | |||
577 | /* configuration must be valid */ | ||
578 | static void p4_pmu_enable_pebs(u64 config) | ||
579 | { | ||
580 | struct p4_pebs_bind *bind; | ||
581 | unsigned int idx; | ||
582 | |||
583 | BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); | ||
584 | |||
585 | idx = p4_config_unpack_metric(config); | ||
586 | if (idx == P4_PEBS_METRIC__none) | ||
587 | return; | ||
588 | |||
589 | bind = &p4_pebs_bind_map[idx]; | ||
590 | |||
591 | (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); | ||
592 | (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); | ||
593 | } | ||
594 | |||
595 | static void p4_pmu_enable_event(struct perf_event *event) | ||
596 | { | ||
597 | struct hw_perf_event *hwc = &event->hw; | ||
598 | int thread = p4_ht_config_thread(hwc->config); | ||
599 | u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); | ||
600 | unsigned int idx = p4_config_unpack_event(hwc->config); | ||
601 | struct p4_event_bind *bind; | ||
602 | u64 escr_addr, cccr; | ||
603 | |||
604 | bind = &p4_event_bind_map[idx]; | ||
605 | escr_addr = (u64)bind->escr_msr[thread]; | ||
606 | |||
607 | /* | ||
608 | * - we don't support cascaded counters yet | ||
609 | * - and counter 1 is broken (erratum) | ||
610 | */ | ||
611 | WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); | ||
612 | WARN_ON_ONCE(hwc->idx == 1); | ||
613 | |||
614 | /* we need a real Event value */ | ||
615 | escr_conf &= ~P4_ESCR_EVENT_MASK; | ||
616 | escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); | ||
617 | |||
618 | cccr = p4_config_unpack_cccr(hwc->config); | ||
619 | |||
620 | /* | ||
621 | * it could be a cache event, so we need to write the metrics | ||
622 | * into additional MSRs | ||
623 | */ | ||
624 | p4_pmu_enable_pebs(hwc->config); | ||
625 | |||
626 | (void)checking_wrmsrl(escr_addr, escr_conf); | ||
627 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, | ||
628 | (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); | ||
629 | } | ||
630 | |||
631 | static void p4_pmu_enable_all(int added) | ||
632 | { | ||
633 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
634 | int idx; | ||
635 | |||
636 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
637 | struct perf_event *event = cpuc->events[idx]; | ||
638 | if (!test_bit(idx, cpuc->active_mask)) | ||
639 | continue; | ||
640 | p4_pmu_enable_event(event); | ||
641 | } | ||
642 | } | ||
643 | |||
644 | static int p4_pmu_handle_irq(struct pt_regs *regs) | ||
645 | { | ||
646 | struct perf_sample_data data; | ||
647 | struct cpu_hw_events *cpuc; | ||
648 | struct perf_event *event; | ||
649 | struct hw_perf_event *hwc; | ||
650 | int idx, handled = 0; | ||
651 | u64 val; | ||
652 | |||
653 | data.addr = 0; | ||
654 | data.raw = NULL; | ||
655 | |||
656 | cpuc = &__get_cpu_var(cpu_hw_events); | ||
657 | |||
658 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
659 | |||
660 | if (!test_bit(idx, cpuc->active_mask)) | ||
661 | continue; | ||
662 | |||
663 | event = cpuc->events[idx]; | ||
664 | hwc = &event->hw; | ||
665 | |||
666 | WARN_ON_ONCE(hwc->idx != idx); | ||
667 | |||
668 | /* it might be an unflagged overflow */ | ||
669 | handled = p4_pmu_clear_cccr_ovf(hwc); | ||
670 | |||
671 | val = x86_perf_event_update(event); | ||
672 | if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) | ||
673 | continue; | ||
674 | |||
675 | /* the event has overflowed for sure */ | ||
676 | data.period = event->hw.last_period; | ||
677 | |||
678 | if (!x86_perf_event_set_period(event)) | ||
679 | continue; | ||
680 | if (perf_event_overflow(event, 1, &data, regs)) | ||
681 | p4_pmu_disable_event(event); | ||
682 | } | ||
683 | |||
684 | if (handled) { | ||
685 | /* p4 quirk: unmask it again */ | ||
686 | apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); | ||
687 | inc_irq_stat(apic_perf_irqs); | ||
688 | } | ||
689 | |||
690 | return handled; | ||
691 | } | ||
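The `val & (1ULL << (x86_pmu.cntval_bits - 1))` test above works because the counters are programmed with the two's complement of the sampling period: they count up toward zero, and the top implemented bit stays set until the counter wraps. A minimal sketch of that arithmetic, assuming it mirrors x86_perf_event_set_period() (kernel u64/s64 types):

    /* sketch: initial value for an up-counting, cntval_bits-wide counter */
    static u64 sketch_initial_count(s64 period, int cntval_bits)
    {
            u64 cntval_mask = (1ULL << cntval_bits) - 1;

            /* starts at -period; the top bit clears once it wraps past zero */
            return (u64)(-period) & cntval_mask;
    }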
692 | |||
693 | /* | ||
694 | * swap the thread-specific fields according to the thread | ||
695 | * we are going to run on | ||
696 | */ | ||
697 | static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) | ||
698 | { | ||
699 | u32 escr, cccr; | ||
700 | |||
701 | /* | ||
702 | * either we are lucky and continue on the same cpu, or there is no HT support | ||
703 | */ | ||
704 | if (!p4_should_swap_ts(hwc->config, cpu)) | ||
705 | return; | ||
706 | |||
707 | /* | ||
708 | * the event has migrated from another logical | ||
709 | * cpu, so we need to swap the thread-specific flags | ||
710 | */ | ||
711 | |||
712 | escr = p4_config_unpack_escr(hwc->config); | ||
713 | cccr = p4_config_unpack_cccr(hwc->config); | ||
714 | |||
715 | if (p4_ht_thread(cpu)) { | ||
716 | cccr &= ~P4_CCCR_OVF_PMI_T0; | ||
717 | cccr |= P4_CCCR_OVF_PMI_T1; | ||
718 | if (escr & P4_ESCR_T0_OS) { | ||
719 | escr &= ~P4_ESCR_T0_OS; | ||
720 | escr |= P4_ESCR_T1_OS; | ||
721 | } | ||
722 | if (escr & P4_ESCR_T0_USR) { | ||
723 | escr &= ~P4_ESCR_T0_USR; | ||
724 | escr |= P4_ESCR_T1_USR; | ||
725 | } | ||
726 | hwc->config = p4_config_pack_escr(escr); | ||
727 | hwc->config |= p4_config_pack_cccr(cccr); | ||
728 | hwc->config |= P4_CONFIG_HT; | ||
729 | } else { | ||
730 | cccr &= ~P4_CCCR_OVF_PMI_T1; | ||
731 | cccr |= P4_CCCR_OVF_PMI_T0; | ||
732 | if (escr & P4_ESCR_T1_OS) { | ||
733 | escr &= ~P4_ESCR_T1_OS; | ||
734 | escr |= P4_ESCR_T0_OS; | ||
735 | } | ||
736 | if (escr & P4_ESCR_T1_USR) { | ||
737 | escr &= ~P4_ESCR_T1_USR; | ||
738 | escr |= P4_ESCR_T0_USR; | ||
739 | } | ||
740 | hwc->config = p4_config_pack_escr(escr); | ||
741 | hwc->config |= p4_config_pack_cccr(cccr); | ||
742 | hwc->config &= ~P4_CONFIG_HT; | ||
743 | } | ||
744 | } | ||
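The two branches above repeat one pattern four times: move a per-thread flag from its T0 bit position to the T1 position (or back) while leaving everything else intact. A generic sketch of that single step:

    /* sketch: migrate one flag from bit mask 'from' to bit mask 'to' */
    static u32 sketch_move_flag(u32 word, u32 from, u32 to)
    {
            if (word & from) {
                    word &= ~from;
                    word |= to;
            }
            return word;
    }

For instance, the first pair of if-blocks amounts to sketch_move_flag(escr, P4_ESCR_T0_OS, P4_ESCR_T1_OS) followed by the same step for the USR bits.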
745 | |||
746 | /* | ||
747 | * ESCR address mapping is tricky: the ESCRs are not sequential | ||
748 | * in MSR space, but they all start at MSR_P4_BSU_ESCR0 (0x03a0) and | ||
749 | * every ESCR address lies in the range [0x3a0, 0x3e1] | ||
750 | * | ||
751 | * so a direct-mapped table ends up ~70% filled | ||
752 | */ | ||
753 | |||
754 | #define P4_ESCR_MSR_BASE 0x000003a0 | ||
755 | #define P4_ESCR_MSR_MAX 0x000003e1 | ||
756 | #define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) | ||
757 | #define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) | ||
758 | #define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr | ||
759 | |||
760 | static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { | ||
761 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), | ||
762 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), | ||
763 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), | ||
764 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), | ||
765 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), | ||
766 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), | ||
767 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), | ||
768 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), | ||
769 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), | ||
770 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), | ||
771 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), | ||
772 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), | ||
773 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), | ||
774 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), | ||
775 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), | ||
776 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), | ||
777 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), | ||
778 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), | ||
779 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), | ||
780 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), | ||
781 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), | ||
782 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), | ||
783 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), | ||
784 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), | ||
785 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), | ||
786 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), | ||
787 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), | ||
788 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), | ||
789 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), | ||
790 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), | ||
791 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), | ||
792 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), | ||
793 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), | ||
794 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), | ||
795 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), | ||
796 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), | ||
797 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), | ||
798 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), | ||
799 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), | ||
800 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), | ||
801 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), | ||
802 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), | ||
803 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), | ||
804 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), | ||
805 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), | ||
806 | P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), | ||
807 | }; | ||
808 | |||
809 | static int p4_get_escr_idx(unsigned int addr) | ||
810 | { | ||
811 | unsigned int idx = P4_ESCR_MSR_IDX(addr); | ||
812 | |||
813 | if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || | ||
814 | !p4_escr_table[idx] || | ||
815 | p4_escr_table[idx] != addr)) { | ||
816 | WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); | ||
817 | return -1; | ||
818 | } | ||
819 | |||
820 | return idx; | ||
821 | } | ||
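Despite the word "hashing" above, this is a direct-mapped sparse table: the index is simply msr - 0x3a0, and since every valid slot stores the MSR value itself, the lookup rejects both holes and out-of-range addresses with a single comparison. A stand-alone model (the table entry here is a made-up address, not a real ESCR):

    #define SKETCH_ESCR_BASE  0x3a0
    #define SKETCH_ESCR_MAX   0x3e1
    #define SKETCH_TABLE_SIZE (SKETCH_ESCR_MAX - SKETCH_ESCR_BASE + 1)

    static const unsigned int sketch_table[SKETCH_TABLE_SIZE] = {
            [0x3c5 - SKETCH_ESCR_BASE] = 0x3c5,  /* hypothetical entry */
    };

    static int sketch_escr_idx(unsigned int msr)
    {
            unsigned int idx = msr - SKETCH_ESCR_BASE;

            /* holes are zero-filled, so the mismatch check catches them too */
            if (idx >= SKETCH_TABLE_SIZE || sketch_table[idx] != msr)
                    return -1;
            return idx;
    }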
822 | |||
823 | static int p4_next_cntr(int thread, unsigned long *used_mask, | ||
824 | struct p4_event_bind *bind) | ||
825 | { | ||
826 | int i, j; | ||
827 | |||
828 | for (i = 0; i < P4_CNTR_LIMIT; i++) { | ||
829 | j = bind->cntr[thread][i]; | ||
830 | if (j != -1 && !test_bit(j, used_mask)) | ||
831 | return j; | ||
832 | } | ||
833 | |||
834 | return -1; | ||
835 | } | ||
836 | |||
837 | static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) | ||
838 | { | ||
839 | unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | ||
840 | unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; | ||
841 | int cpu = smp_processor_id(); | ||
842 | struct hw_perf_event *hwc; | ||
843 | struct p4_event_bind *bind; | ||
844 | unsigned int i, thread, num; | ||
845 | int cntr_idx, escr_idx; | ||
846 | |||
847 | bitmap_zero(used_mask, X86_PMC_IDX_MAX); | ||
848 | bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); | ||
849 | |||
850 | for (i = 0, num = n; i < n; i++, num--) { | ||
851 | |||
852 | hwc = &cpuc->event_list[i]->hw; | ||
853 | thread = p4_ht_thread(cpu); | ||
854 | bind = p4_config_get_bind(hwc->config); | ||
855 | escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); | ||
856 | if (unlikely(escr_idx == -1)) | ||
857 | goto done; | ||
858 | |||
859 | if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { | ||
860 | cntr_idx = hwc->idx; | ||
861 | if (assign) | ||
862 | assign[i] = hwc->idx; | ||
863 | goto reserve; | ||
864 | } | ||
865 | |||
866 | cntr_idx = p4_next_cntr(thread, used_mask, bind); | ||
867 | if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) | ||
868 | goto done; | ||
869 | |||
870 | p4_pmu_swap_config_ts(hwc, cpu); | ||
871 | if (assign) | ||
872 | assign[i] = cntr_idx; | ||
873 | reserve: | ||
874 | set_bit(cntr_idx, used_mask); | ||
875 | set_bit(escr_idx, escr_mask); | ||
876 | } | ||
877 | |||
878 | done: | ||
879 | return num ? -ENOSPC : 0; | ||
880 | } | ||
881 | |||
882 | static __initconst const struct x86_pmu p4_pmu = { | ||
883 | .name = "Netburst P4/Xeon", | ||
884 | .handle_irq = p4_pmu_handle_irq, | ||
885 | .disable_all = p4_pmu_disable_all, | ||
886 | .enable_all = p4_pmu_enable_all, | ||
887 | .enable = p4_pmu_enable_event, | ||
888 | .disable = p4_pmu_disable_event, | ||
889 | .eventsel = MSR_P4_BPU_CCCR0, | ||
890 | .perfctr = MSR_P4_BPU_PERFCTR0, | ||
891 | .event_map = p4_pmu_event_map, | ||
892 | .max_events = ARRAY_SIZE(p4_general_events), | ||
893 | .get_event_constraints = x86_get_event_constraints, | ||
894 | /* | ||
895 | * If HT is disabled we may need to use all | ||
896 | * ARCH_P4_MAX_CCCR counters simultaneously, | ||
897 | * though leave it restricted for the moment, assuming | ||
898 | * HT is on | ||
899 | */ | ||
900 | .num_counters = ARCH_P4_MAX_CCCR, | ||
901 | .apic = 1, | ||
902 | .cntval_bits = 40, | ||
903 | .cntval_mask = (1ULL << 40) - 1, | ||
904 | .max_period = (1ULL << 39) - 1, | ||
905 | .hw_config = p4_hw_config, | ||
906 | .schedule_events = p4_pmu_schedule_events, | ||
907 | /* | ||
908 | * This handles erratum N15 in Intel doc 249199-029: | ||
909 | * the counter may not be updated correctly on a write, | ||
910 | * so we need a second write operation to do the trick | ||
911 | * (the official workaround didn't work) | ||
912 | * | ||
913 | * the idea is taken from the OProfile code | ||
914 | */ | ||
915 | .perfctr_second_write = 1, | ||
916 | }; | ||
917 | |||
918 | static __init int p4_pmu_init(void) | ||
919 | { | ||
920 | unsigned int low, high; | ||
921 | |||
922 | /* If we get stripped -- indexing fails */ | ||
923 | BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); | ||
924 | |||
925 | rdmsr(MSR_IA32_MISC_ENABLE, low, high); | ||
926 | if (!(low & (1 << 7))) { | ||
927 | pr_cont("unsupported Netburst CPU model %d ", | ||
928 | boot_cpu_data.x86_model); | ||
929 | return -ENODEV; | ||
930 | } | ||
931 | |||
932 | memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, | ||
933 | sizeof(hw_cache_event_ids)); | ||
934 | |||
935 | pr_cont("Netburst events, "); | ||
936 | |||
937 | x86_pmu = p4_pmu; | ||
938 | |||
939 | return 0; | ||
940 | } | ||
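The `low & (1 << 7)` test above checks the "performance monitoring available" flag: per the Intel SDM, bit 7 of MSR_IA32_MISC_ENABLE reports whether the PMU is present. A sketch with the bit named explicitly (the symbolic constant is an assumption; this code open-codes the bit):

    #define SKETCH_MISC_ENABLE_PERF_AVAIL (1 << 7) /* MSR_IA32_MISC_ENABLE bit 7 */

    static int sketch_pmu_available(u32 misc_enable_low)
    {
            return !!(misc_enable_low & SKETCH_MISC_ENABLE_PERF_AVAIL);
    }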
941 | |||
942 | #endif /* CONFIG_CPU_SUP_INTEL */ | ||
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index a330485d14da..34ba07be2cda 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c | |||
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event) | |||
27 | */ | 27 | */ |
28 | #define P6_NOP_EVENT 0x0000002EULL | 28 | #define P6_NOP_EVENT 0x0000002EULL |
29 | 29 | ||
30 | static u64 p6_pmu_raw_event(u64 hw_event) | ||
31 | { | ||
32 | #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL | ||
33 | #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL | ||
34 | #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL | ||
35 | #define P6_EVNTSEL_INV_MASK 0x00800000ULL | ||
36 | #define P6_EVNTSEL_REG_MASK 0xFF000000ULL | ||
37 | |||
38 | #define P6_EVNTSEL_MASK \ | ||
39 | (P6_EVNTSEL_EVENT_MASK | \ | ||
40 | P6_EVNTSEL_UNIT_MASK | \ | ||
41 | P6_EVNTSEL_EDGE_MASK | \ | ||
42 | P6_EVNTSEL_INV_MASK | \ | ||
43 | P6_EVNTSEL_REG_MASK) | ||
44 | |||
45 | return hw_event & P6_EVNTSEL_MASK; | ||
46 | } | ||
47 | |||
48 | static struct event_constraint p6_event_constraints[] = | 30 | static struct event_constraint p6_event_constraints[] = |
49 | { | 31 | { |
50 | INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ | 32 | INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ |
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void) | |||
66 | wrmsrl(MSR_P6_EVNTSEL0, val); | 48 | wrmsrl(MSR_P6_EVNTSEL0, val); |
67 | } | 49 | } |
68 | 50 | ||
69 | static void p6_pmu_enable_all(void) | 51 | static void p6_pmu_enable_all(int added) |
70 | { | 52 | { |
71 | unsigned long val; | 53 | unsigned long val; |
72 | 54 | ||
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event) | |||
102 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); | 84 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); |
103 | } | 85 | } |
104 | 86 | ||
105 | static __initconst struct x86_pmu p6_pmu = { | 87 | static __initconst const struct x86_pmu p6_pmu = { |
106 | .name = "p6", | 88 | .name = "p6", |
107 | .handle_irq = x86_pmu_handle_irq, | 89 | .handle_irq = x86_pmu_handle_irq, |
108 | .disable_all = p6_pmu_disable_all, | 90 | .disable_all = p6_pmu_disable_all, |
109 | .enable_all = p6_pmu_enable_all, | 91 | .enable_all = p6_pmu_enable_all, |
110 | .enable = p6_pmu_enable_event, | 92 | .enable = p6_pmu_enable_event, |
111 | .disable = p6_pmu_disable_event, | 93 | .disable = p6_pmu_disable_event, |
94 | .hw_config = x86_pmu_hw_config, | ||
95 | .schedule_events = x86_schedule_events, | ||
112 | .eventsel = MSR_P6_EVNTSEL0, | 96 | .eventsel = MSR_P6_EVNTSEL0, |
113 | .perfctr = MSR_P6_PERFCTR0, | 97 | .perfctr = MSR_P6_PERFCTR0, |
114 | .event_map = p6_pmu_event_map, | 98 | .event_map = p6_pmu_event_map, |
115 | .raw_event = p6_pmu_raw_event, | ||
116 | .max_events = ARRAY_SIZE(p6_perfmon_event_map), | 99 | .max_events = ARRAY_SIZE(p6_perfmon_event_map), |
117 | .apic = 1, | 100 | .apic = 1, |
118 | .max_period = (1ULL << 31) - 1, | 101 | .max_period = (1ULL << 31) - 1, |
119 | .version = 0, | 102 | .version = 0, |
120 | .num_events = 2, | 103 | .num_counters = 2, |
121 | /* | 104 | /* |
122 | * Events have 40 bits implemented. However they are designed such | 105 | * Events have 40 bits implemented. However they are designed such |
123 | * that bits [32-39] are sign extensions of bit 31. As such the | 106 | * that bits [32-39] are sign extensions of bit 31. As such the |
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = { | |||
125 | * | 108 | * |
126 | * See IA-32 Intel Architecture Software developer manual Vol 3B | 109 | * See IA-32 Intel Architecture Software developer manual Vol 3B |
127 | */ | 110 | */ |
128 | .event_bits = 32, | 111 | .cntval_bits = 32, |
129 | .event_mask = (1ULL << 32) - 1, | 112 | .cntval_mask = (1ULL << 32) - 1, |
130 | .get_event_constraints = x86_get_event_constraints, | 113 | .get_event_constraints = x86_get_event_constraints, |
131 | .event_constraints = p6_event_constraints, | 114 | .event_constraints = p6_event_constraints, |
132 | }; | 115 | }; |
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000000..34b4dad6f0b8 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c | |||
@@ -0,0 +1,63 @@ | |||
1 | /* | ||
2 | * Routines to identify additional cpu features that are scattered in | ||
3 | * cpuid space. | ||
4 | */ | ||
5 | #include <linux/cpu.h> | ||
6 | |||
7 | #include <asm/pat.h> | ||
8 | #include <asm/processor.h> | ||
9 | |||
10 | #include <asm/apic.h> | ||
11 | |||
12 | struct cpuid_bit { | ||
13 | u16 feature; | ||
14 | u8 reg; | ||
15 | u8 bit; | ||
16 | u32 level; | ||
17 | u32 sub_leaf; | ||
18 | }; | ||
19 | |||
20 | enum cpuid_regs { | ||
21 | CR_EAX = 0, | ||
22 | CR_ECX, | ||
23 | CR_EDX, | ||
24 | CR_EBX | ||
25 | }; | ||
26 | |||
27 | void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | ||
28 | { | ||
29 | u32 max_level; | ||
30 | u32 regs[4]; | ||
31 | const struct cpuid_bit *cb; | ||
32 | |||
33 | static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { | ||
34 | { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, | ||
35 | { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, | ||
36 | { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, | ||
37 | { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, | ||
38 | { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, | ||
39 | { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, | ||
40 | { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, | ||
41 | { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, | ||
42 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, | ||
43 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, | ||
44 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, | ||
45 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, | ||
46 | { 0, 0, 0, 0, 0 } | ||
47 | }; | ||
48 | |||
49 | for (cb = cpuid_bits; cb->feature; cb++) { | ||
50 | |||
51 | /* Verify that the level is valid */ | ||
52 | max_level = cpuid_eax(cb->level & 0xffff0000); | ||
53 | if (max_level < cb->level || | ||
54 | max_level > (cb->level | 0xffff)) | ||
55 | continue; | ||
56 | |||
57 | cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], | ||
58 | ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); | ||
59 | |||
60 | if (regs[cb->reg] & (1 << cb->bit)) | ||
61 | set_cpu_cap(c, cb->feature); | ||
62 | } | ||
63 | } | ||
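Each cpuid_bits[] entry boils down to "read one register of one (leaf, sub-leaf) pair and test one bit". A stand-alone sketch of that per-entry test, reusing the file's cpuid_count() helper and CR_* register indices; which leaf and bit to probe is up to the caller:

    /* sketch: does (leaf, sub_leaf) report 'bit' set in register 'reg'? */
    static bool sketch_has_scattered_bit(u32 leaf, u32 sub_leaf, int reg, int bit)
    {
            u32 regs[4];

            cpuid_count(leaf, sub_leaf, &regs[CR_EAX], &regs[CR_EBX],
                        &regs[CR_ECX], &regs[CR_EDX]);

            return regs[reg] & (1 << bit);
    }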
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c index 97ad79cdf688..4397e987a1cf 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/topology.c | |||
@@ -1,60 +1,14 @@ | |||
1 | /* | 1 | /* |
2 | * Routines to identify additional cpu features that are scattered in | 2 | * Check for extended topology enumeration cpuid leaf 0xb and if it
3 | * cpuid space. | 3 | * exists, use it for populating initial_apicid and cpu topology |
4 | * detection. | ||
4 | */ | 5 | */ |
5 | #include <linux/cpu.h> | ||
6 | 6 | ||
7 | #include <linux/cpu.h> | ||
8 | #include <asm/apic.h> | ||
7 | #include <asm/pat.h> | 9 | #include <asm/pat.h> |
8 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
9 | 11 | ||
10 | #include <asm/apic.h> | ||
11 | |||
12 | struct cpuid_bit { | ||
13 | u16 feature; | ||
14 | u8 reg; | ||
15 | u8 bit; | ||
16 | u32 level; | ||
17 | }; | ||
18 | |||
19 | enum cpuid_regs { | ||
20 | CR_EAX = 0, | ||
21 | CR_ECX, | ||
22 | CR_EDX, | ||
23 | CR_EBX | ||
24 | }; | ||
25 | |||
26 | void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | ||
27 | { | ||
28 | u32 max_level; | ||
29 | u32 regs[4]; | ||
30 | const struct cpuid_bit *cb; | ||
31 | |||
32 | static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { | ||
33 | { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, | ||
34 | { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, | ||
35 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, | ||
36 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, | ||
37 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, | ||
38 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, | ||
39 | { 0, 0, 0, 0 } | ||
40 | }; | ||
41 | |||
42 | for (cb = cpuid_bits; cb->feature; cb++) { | ||
43 | |||
44 | /* Verify that the level is valid */ | ||
45 | max_level = cpuid_eax(cb->level & 0xffff0000); | ||
46 | if (max_level < cb->level || | ||
47 | max_level > (cb->level | 0xffff)) | ||
48 | continue; | ||
49 | |||
50 | cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], | ||
51 | ®s[CR_ECX], ®s[CR_EDX]); | ||
52 | |||
53 | if (regs[cb->reg] & (1 << cb->bit)) | ||
54 | set_cpu_cap(c, cb->feature); | ||
55 | } | ||
56 | } | ||
57 | |||
58 | /* leaf 0xb SMT level */ | 12 | /* leaf 0xb SMT level */ |
59 | #define SMT_LEVEL 0 | 13 | #define SMT_LEVEL 0 |
60 | 14 | ||
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index dfdb4dba2320..227b0448960d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c | |||
@@ -24,8 +24,8 @@ | |||
24 | #include <linux/dmi.h> | 24 | #include <linux/dmi.h> |
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <asm/div64.h> | 26 | #include <asm/div64.h> |
27 | #include <asm/vmware.h> | ||
28 | #include <asm/x86_init.h> | 27 | #include <asm/x86_init.h> |
28 | #include <asm/hypervisor.h> | ||
29 | 29 | ||
30 | #define CPUID_VMWARE_INFO_LEAF 0x40000000 | 30 | #define CPUID_VMWARE_INFO_LEAF 0x40000000 |
31 | #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 | 31 | #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 |
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void) | |||
51 | 51 | ||
52 | static unsigned long vmware_get_tsc_khz(void) | 52 | static unsigned long vmware_get_tsc_khz(void) |
53 | { | 53 | { |
54 | uint64_t tsc_hz; | 54 | uint64_t tsc_hz, lpj; |
55 | uint32_t eax, ebx, ecx, edx; | 55 | uint32_t eax, ebx, ecx, edx; |
56 | 56 | ||
57 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); | 57 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); |
@@ -62,10 +62,17 @@ static unsigned long vmware_get_tsc_khz(void) | |||
62 | printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", | 62 | printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", |
63 | (unsigned long) tsc_hz / 1000, | 63 | (unsigned long) tsc_hz / 1000, |
64 | (unsigned long) tsc_hz % 1000); | 64 | (unsigned long) tsc_hz % 1000); |
65 | |||
66 | if (!preset_lpj) { | ||
67 | lpj = ((u64)tsc_hz * 1000); | ||
68 | do_div(lpj, HZ); | ||
69 | preset_lpj = lpj; | ||
70 | } | ||
71 | |||
65 | return tsc_hz; | 72 | return tsc_hz; |
66 | } | 73 | } |
67 | 74 | ||
68 | void __init vmware_platform_setup(void) | 75 | static void __init vmware_platform_setup(void) |
69 | { | 76 | { |
70 | uint32_t eax, ebx, ecx, edx; | 77 | uint32_t eax, ebx, ecx, edx; |
71 | 78 | ||
@@ -83,26 +90,22 @@ void __init vmware_platform_setup(void) | |||
83 | * serial key should be enough, as this will always have a VMware | 90 | * serial key should be enough, as this will always have a VMware |
84 | * specific string when running under VMware hypervisor. | 91 | * specific string when running under VMware hypervisor. |
85 | */ | 92 | */ |
86 | int vmware_platform(void) | 93 | static bool __init vmware_platform(void) |
87 | { | 94 | { |
88 | if (cpu_has_hypervisor) { | 95 | if (cpu_has_hypervisor) { |
89 | unsigned int eax, ebx, ecx, edx; | 96 | unsigned int eax; |
90 | char hyper_vendor_id[13]; | 97 | unsigned int hyper_vendor_id[3]; |
91 | 98 | ||
92 | cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); | 99 | cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], |
93 | memcpy(hyper_vendor_id + 0, &ebx, 4); | 100 | &hyper_vendor_id[1], &hyper_vendor_id[2]); |
94 | memcpy(hyper_vendor_id + 4, &ecx, 4); | 101 | if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) |
95 | memcpy(hyper_vendor_id + 8, &edx, 4); | 102 | return true; |
96 | hyper_vendor_id[12] = '\0'; | ||
97 | if (!strcmp(hyper_vendor_id, "VMwareVMware")) | ||
98 | return 1; | ||
99 | } else if (dmi_available && dmi_name_in_serial("VMware") && | 103 | } else if (dmi_available && dmi_name_in_serial("VMware") && |
100 | __vmware_platform()) | 104 | __vmware_platform()) |
101 | return 1; | 105 | return true; |
102 | 106 | ||
103 | return 0; | 107 | return false; |
104 | } | 108 | } |
105 | EXPORT_SYMBOL(vmware_platform); | ||
106 | 109 | ||
107 | /* | 110 | /* |
108 | * VMware hypervisor takes care of exporting a reliable TSC to the guest. | 111 | * VMware hypervisor takes care of exporting a reliable TSC to the guest. |
@@ -116,8 +119,16 @@ EXPORT_SYMBOL(vmware_platform); | |||
116 | * so that the kernel could just trust the hypervisor with providing a | 119 | * so that the kernel could just trust the hypervisor with providing a |
117 | * reliable virtual TSC that is suitable for timekeeping. | 120 | * reliable virtual TSC that is suitable for timekeeping. |
118 | */ | 121 | */ |
119 | void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) | 122 | static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c) |
120 | { | 123 | { |
121 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 124 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
122 | set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); | 125 | set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); |
123 | } | 126 | } |
127 | |||
128 | const __refconst struct hypervisor_x86 x86_hyper_vmware = { | ||
129 | .name = "VMware", | ||
130 | .detect = vmware_platform, | ||
131 | .set_cpu_features = vmware_set_cpu_features, | ||
132 | .init_platform = vmware_platform_setup, | ||
133 | }; | ||
134 | EXPORT_SYMBOL(x86_hyper_vmware); | ||
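The same detection can be mimicked from user space: CPUID.1:ECX bit 31 reports that a hypervisor is present, and leaf 0x40000000 then returns the vendor signature in EBX:ECX:EDX -- "VMwareVMware" under VMware. A sketch for a GCC/Clang x86 target:

    #include <cpuid.h>
    #include <string.h>

    static int sketch_running_on_vmware(void)
    {
            unsigned int a, b, c, d, sig[3];

            __cpuid(1, a, b, c, d);
            if (!(c & (1u << 31)))          /* no hypervisor present */
                    return 0;

            __cpuid(0x40000000, a, sig[0], sig[1], sig[2]);
            return !memcmp(sig, "VMwareVMware", 12);
    }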
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 8b862d5900fe..1b7b31ab7d86 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, | |||
170 | cpuid_device_destroy(cpu); | 170 | cpuid_device_destroy(cpu); |
171 | break; | 171 | break; |
172 | } | 172 | } |
173 | return err ? NOTIFY_BAD : NOTIFY_OK; | 173 | return notifier_from_errno(err); |
174 | } | 174 | } |
175 | 175 | ||
176 | static struct notifier_block __refdata cpuid_class_cpu_notifier = | 176 | static struct notifier_block __refdata cpuid_class_cpu_notifier = |
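notifier_from_errno() folds the errno into the notifier return value instead of collapsing every failure into NOTIFY_BAD, so the caller can recover it with notifier_to_errno(). For reference, a sketch matching the helper's definition in <linux/notifier.h>:

    static inline int sketch_notifier_from_errno(int err)
    {
            if (err)
                    return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
            return NOTIFY_OK;
    }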
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c deleted file mode 100644 index 1c47390dd0e5..000000000000 --- a/arch/x86/kernel/ds.c +++ /dev/null | |||
@@ -1,1437 +0,0 @@ | |||
1 | /* | ||
2 | * Debug Store support | ||
3 | * | ||
4 | * This provides a low-level interface to the hardware's Debug Store | ||
5 | * feature that is used for branch trace store (BTS) and | ||
6 | * precise-event based sampling (PEBS). | ||
7 | * | ||
8 | * It manages: | ||
9 | * - DS and BTS hardware configuration | ||
10 | * - buffer overflow handling (to be done) | ||
11 | * - buffer access | ||
12 | * | ||
13 | * It does not do: | ||
14 | * - security checking (is the caller allowed to trace the task) | ||
15 | * - buffer allocation (memory accounting) | ||
16 | * | ||
17 | * | ||
18 | * Copyright (C) 2007-2009 Intel Corporation. | ||
19 | * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <linux/sched.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/trace_clock.h> | ||
29 | |||
30 | #include <asm/ds.h> | ||
31 | |||
32 | #include "ds_selftest.h" | ||
33 | |||
34 | /* | ||
35 | * The configuration for a particular DS hardware implementation: | ||
36 | */ | ||
37 | struct ds_configuration { | ||
38 | /* The name of the configuration: */ | ||
39 | const char *name; | ||
40 | |||
41 | /* The size of pointer-typed fields in DS, BTS, and PEBS: */ | ||
42 | unsigned char sizeof_ptr_field; | ||
43 | |||
44 | /* The size of a BTS/PEBS record in bytes: */ | ||
45 | unsigned char sizeof_rec[2]; | ||
46 | |||
47 | /* The number of pebs counter reset values in the DS structure. */ | ||
48 | unsigned char nr_counter_reset; | ||
49 | |||
50 | /* Control bit-masks indexed by enum ds_feature: */ | ||
51 | unsigned long ctl[dsf_ctl_max]; | ||
52 | }; | ||
53 | static struct ds_configuration ds_cfg __read_mostly; | ||
54 | |||
55 | |||
56 | /* Maximal size of a DS configuration: */ | ||
57 | #define MAX_SIZEOF_DS 0x80 | ||
58 | |||
59 | /* Maximal size of a BTS record: */ | ||
60 | #define MAX_SIZEOF_BTS (3 * 8) | ||
61 | |||
62 | /* BTS and PEBS buffer alignment: */ | ||
63 | #define DS_ALIGNMENT (1 << 3) | ||
64 | |||
65 | /* Number of buffer pointers in DS: */ | ||
66 | #define NUM_DS_PTR_FIELDS 8 | ||
67 | |||
68 | /* Size of a pebs reset value in DS: */ | ||
69 | #define PEBS_RESET_FIELD_SIZE 8 | ||
70 | |||
71 | /* Mask of control bits in the DS MSR register: */ | ||
72 | #define BTS_CONTROL \ | ||
73 | ( ds_cfg.ctl[dsf_bts] | \ | ||
74 | ds_cfg.ctl[dsf_bts_kernel] | \ | ||
75 | ds_cfg.ctl[dsf_bts_user] | \ | ||
76 | ds_cfg.ctl[dsf_bts_overflow] ) | ||
77 | |||
78 | /* | ||
79 | * A BTS or PEBS tracer. | ||
80 | * | ||
81 | * This holds the configuration of the tracer and serves as a handle | ||
82 | * to identify tracers. | ||
83 | */ | ||
84 | struct ds_tracer { | ||
85 | /* The DS context (partially) owned by this tracer. */ | ||
86 | struct ds_context *context; | ||
87 | /* The buffer provided on ds_request() and its size in bytes. */ | ||
88 | void *buffer; | ||
89 | size_t size; | ||
90 | }; | ||
91 | |||
92 | struct bts_tracer { | ||
93 | /* The common DS part: */ | ||
94 | struct ds_tracer ds; | ||
95 | |||
96 | /* The trace including the DS configuration: */ | ||
97 | struct bts_trace trace; | ||
98 | |||
99 | /* Buffer overflow notification function: */ | ||
100 | bts_ovfl_callback_t ovfl; | ||
101 | |||
102 | /* Active flags affecting trace collection. */ | ||
103 | unsigned int flags; | ||
104 | }; | ||
105 | |||
106 | struct pebs_tracer { | ||
107 | /* The common DS part: */ | ||
108 | struct ds_tracer ds; | ||
109 | |||
110 | /* The trace including the DS configuration: */ | ||
111 | struct pebs_trace trace; | ||
112 | |||
113 | /* Buffer overflow notification function: */ | ||
114 | pebs_ovfl_callback_t ovfl; | ||
115 | }; | ||
116 | |||
117 | /* | ||
118 | * Debug Store (DS) save area configuration (see Intel64 and IA32 | ||
119 | * Architectures Software Developer's Manual, section 18.5) | ||
120 | * | ||
121 | * The DS configuration consists of the following fields; different | ||
122 | * architectures vary in the size of those fields. | ||
123 | * | ||
124 | * - double-word aligned base linear address of the BTS buffer | ||
125 | * - write pointer into the BTS buffer | ||
126 | * - end linear address of the BTS buffer (one byte beyond the end of | ||
127 | * the buffer) | ||
128 | * - interrupt pointer into BTS buffer | ||
129 | * (interrupt occurs when write pointer passes interrupt pointer) | ||
130 | * - double-word aligned base linear address of the PEBS buffer | ||
131 | * - write pointer into the PEBS buffer | ||
132 | * - end linear address of the PEBS buffer (one byte beyond the end of | ||
133 | * the buffer) | ||
134 | * - interrupt pointer into PEBS buffer | ||
135 | * (interrupt occurs when write pointer passes interrupt pointer) | ||
136 | * - value to which counter is reset following counter overflow | ||
137 | * | ||
138 | * Later architectures use 64bit pointers throughout, whereas earlier | ||
139 | * architectures use 32bit pointers in 32bit mode. | ||
140 | * | ||
141 | * | ||
142 | * We compute the base address for the first 8 fields based on: | ||
143 | * - the field size stored in the DS configuration | ||
144 | * - the relative field position | ||
145 | * - an offset giving the start of the respective region | ||
146 | * | ||
147 | * This offset is further used to index various arrays holding | ||
148 | * information for BTS and PEBS at the respective index. | ||
149 | * | ||
150 | * On later 32bit processors, we only access the lower 32bit of the | ||
151 | * 64bit pointer fields. The upper halves will be zeroed out. | ||
152 | */ | ||
153 | |||
154 | enum ds_field { | ||
155 | ds_buffer_base = 0, | ||
156 | ds_index, | ||
157 | ds_absolute_maximum, | ||
158 | ds_interrupt_threshold, | ||
159 | }; | ||
160 | |||
161 | enum ds_qualifier { | ||
162 | ds_bts = 0, | ||
163 | ds_pebs | ||
164 | }; | ||
165 | |||
166 | static inline unsigned long | ||
167 | ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) | ||
168 | { | ||
169 | base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); | ||
170 | return *(unsigned long *)base; | ||
171 | } | ||
172 | |||
173 | static inline void | ||
174 | ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, | ||
175 | unsigned long value) | ||
176 | { | ||
177 | base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); | ||
178 | (*(unsigned long *)base) = value; | ||
179 | } | ||
180 | |||
181 | |||
182 | /* | ||
183 | * Locking is done only for allocating BTS or PEBS resources. | ||
184 | */ | ||
185 | static DEFINE_SPINLOCK(ds_lock); | ||
186 | |||
187 | /* | ||
188 | * We support either (system-wide) per-cpu or per-thread allocation. | ||
189 | * We distinguish the two based on the task_struct pointer, where a | ||
190 | * NULL pointer indicates per-cpu allocation for the current cpu. | ||
191 | * | ||
192 | * Allocations are use-counted. As soon as resources are allocated, | ||
193 | * further allocations must be of the same type (per-cpu or | ||
194 | * per-thread). We model this by counting allocations (i.e. the number | ||
195 | * of tracers of a certain type) for one type negatively: | ||
196 | * =0 no tracers | ||
197 | * >0 number of per-thread tracers | ||
198 | * <0 number of per-cpu tracers | ||
199 | * | ||
200 | * The tracer count essentially gives the number of ds contexts for a certain | ||
201 | * type of allocation. | ||
202 | */ | ||
203 | static atomic_t tracers = ATOMIC_INIT(0); | ||
204 | |||
205 | static inline int get_tracer(struct task_struct *task) | ||
206 | { | ||
207 | int error; | ||
208 | |||
209 | spin_lock_irq(&ds_lock); | ||
210 | |||
211 | if (task) { | ||
212 | error = -EPERM; | ||
213 | if (atomic_read(&tracers) < 0) | ||
214 | goto out; | ||
215 | atomic_inc(&tracers); | ||
216 | } else { | ||
217 | error = -EPERM; | ||
218 | if (atomic_read(&tracers) > 0) | ||
219 | goto out; | ||
220 | atomic_dec(&tracers); | ||
221 | } | ||
222 | |||
223 | error = 0; | ||
224 | out: | ||
225 | spin_unlock_irq(&ds_lock); | ||
226 | return error; | ||
227 | } | ||
228 | |||
229 | static inline void put_tracer(struct task_struct *task) | ||
230 | { | ||
231 | if (task) | ||
232 | atomic_dec(&tracers); | ||
233 | else | ||
234 | atomic_inc(&tracers); | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * The DS context is either attached to a thread or to a cpu: | ||
239 | * - in the former case, the thread_struct contains a pointer to the | ||
240 | * attached context. | ||
241 | * - in the latter case, we use a static array of per-cpu context | ||
242 | * pointers. | ||
243 | * | ||
244 | * Contexts are use-counted. They are allocated on first access and | ||
245 | * deallocated when the last user puts the context. | ||
246 | */ | ||
247 | struct ds_context { | ||
248 | /* The DS configuration; goes into MSR_IA32_DS_AREA: */ | ||
249 | unsigned char ds[MAX_SIZEOF_DS]; | ||
250 | |||
251 | /* The owner of the BTS and PEBS configuration, respectively: */ | ||
252 | struct bts_tracer *bts_master; | ||
253 | struct pebs_tracer *pebs_master; | ||
254 | |||
255 | /* Use count: */ | ||
256 | unsigned long count; | ||
257 | |||
258 | /* Pointer to the context pointer field: */ | ||
259 | struct ds_context **this; | ||
260 | |||
261 | /* The traced task; NULL for cpu tracing: */ | ||
262 | struct task_struct *task; | ||
263 | |||
264 | /* The traced cpu; only valid if task is NULL: */ | ||
265 | int cpu; | ||
266 | }; | ||
267 | |||
268 | static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context); | ||
269 | |||
270 | |||
271 | static struct ds_context *ds_get_context(struct task_struct *task, int cpu) | ||
272 | { | ||
273 | struct ds_context **p_context = | ||
274 | (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu)); | ||
275 | struct ds_context *context = NULL; | ||
276 | struct ds_context *new_context = NULL; | ||
277 | |||
278 | /* Chances are small that we already have a context. */ | ||
279 | new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); | ||
280 | if (!new_context) | ||
281 | return NULL; | ||
282 | |||
283 | spin_lock_irq(&ds_lock); | ||
284 | |||
285 | context = *p_context; | ||
286 | if (likely(!context)) { | ||
287 | context = new_context; | ||
288 | |||
289 | context->this = p_context; | ||
290 | context->task = task; | ||
291 | context->cpu = cpu; | ||
292 | context->count = 0; | ||
293 | |||
294 | *p_context = context; | ||
295 | } | ||
296 | |||
297 | context->count++; | ||
298 | |||
299 | spin_unlock_irq(&ds_lock); | ||
300 | |||
301 | if (context != new_context) | ||
302 | kfree(new_context); | ||
303 | |||
304 | return context; | ||
305 | } | ||
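Note the allocation pattern: the new context is allocated with GFP_KERNEL before ds_lock is taken, because the allocation may sleep, and it is discarded again if another caller installed a context in the meantime. A generic sketch of the pattern with hypothetical names (usual slab/spinlock headers assumed):

    struct sketch_ctx {
            unsigned long count;
    };

    static DEFINE_SPINLOCK(sketch_lock);

    static struct sketch_ctx *sketch_get_ctx(struct sketch_ctx **slot)
    {
            struct sketch_ctx *fresh, *ctx;

            fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);    /* may sleep */
            if (!fresh)
                    return NULL;

            spin_lock_irq(&sketch_lock);
            ctx = *slot;
            if (!ctx)
                    ctx = *slot = fresh;    /* we won the race */
            ctx->count++;
            spin_unlock_irq(&sketch_lock);

            if (ctx != fresh)
                    kfree(fresh);           /* lost the race: drop the spare */

            return ctx;
    }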
306 | |||
307 | static void ds_put_context(struct ds_context *context) | ||
308 | { | ||
309 | struct task_struct *task; | ||
310 | unsigned long irq; | ||
311 | |||
312 | if (!context) | ||
313 | return; | ||
314 | |||
315 | spin_lock_irqsave(&ds_lock, irq); | ||
316 | |||
317 | if (--context->count) { | ||
318 | spin_unlock_irqrestore(&ds_lock, irq); | ||
319 | return; | ||
320 | } | ||
321 | |||
322 | *(context->this) = NULL; | ||
323 | |||
324 | task = context->task; | ||
325 | |||
326 | if (task) | ||
327 | clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); | ||
328 | |||
329 | /* | ||
330 | * We leave the (now dangling) pointer to the DS configuration in | ||
331 | * the DS_AREA msr. This is as good or as bad as replacing it with | ||
332 | * NULL - the hardware would crash if we enabled tracing. | ||
333 | * | ||
334 | * This saves us some problems with having to write an msr on a | ||
335 | * different cpu while preventing others from doing the same for the | ||
336 | * next context for that same cpu. | ||
337 | */ | ||
338 | |||
339 | spin_unlock_irqrestore(&ds_lock, irq); | ||
340 | |||
341 | /* The context might still be in use for context switching. */ | ||
342 | if (task && (task != current)) | ||
343 | wait_task_context_switch(task); | ||
344 | |||
345 | kfree(context); | ||
346 | } | ||
347 | |||
348 | static void ds_install_ds_area(struct ds_context *context) | ||
349 | { | ||
350 | unsigned long ds; | ||
351 | |||
352 | ds = (unsigned long)context->ds; | ||
353 | |||
354 | /* | ||
355 | * There is a race between the bts master and the pebs master. | ||
356 | * | ||
357 | * The thread/cpu access is synchronized via get/put_cpu() for | ||
358 | * task tracing and via wrmsr_on_cpu for cpu tracing. | ||
359 | * | ||
360 | * If bts and pebs are collected for the same task or same cpu, | ||
361 | * the same configuration is written twice. | ||
362 | */ | ||
363 | if (context->task) { | ||
364 | get_cpu(); | ||
365 | if (context->task == current) | ||
366 | wrmsrl(MSR_IA32_DS_AREA, ds); | ||
367 | set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); | ||
368 | put_cpu(); | ||
369 | } else | ||
370 | wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, | ||
371 | (u32)((u64)ds), (u32)((u64)ds >> 32)); | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * Call the tracer's callback on a buffer overflow. | ||
376 | * | ||
377 | * context: the ds context | ||
378 | * qual: the buffer type | ||
379 | */ | ||
380 | static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) | ||
381 | { | ||
382 | switch (qual) { | ||
383 | case ds_bts: | ||
384 | if (context->bts_master && | ||
385 | context->bts_master->ovfl) | ||
386 | context->bts_master->ovfl(context->bts_master); | ||
387 | break; | ||
388 | case ds_pebs: | ||
389 | if (context->pebs_master && | ||
390 | context->pebs_master->ovfl) | ||
391 | context->pebs_master->ovfl(context->pebs_master); | ||
392 | break; | ||
393 | } | ||
394 | } | ||
395 | |||
396 | |||
397 | /* | ||
398 | * Write raw data into the BTS or PEBS buffer. | ||
399 | * | ||
400 | * The remainder of any partially written record is zeroed out. | ||
401 | * | ||
402 | * context: the DS context | ||
403 | * qual: the buffer type | ||
404 | * record: the data to write | ||
405 | * size: the size of the data | ||
406 | */ | ||
407 | static int ds_write(struct ds_context *context, enum ds_qualifier qual, | ||
408 | const void *record, size_t size) | ||
409 | { | ||
410 | int bytes_written = 0; | ||
411 | |||
412 | if (!record) | ||
413 | return -EINVAL; | ||
414 | |||
415 | while (size) { | ||
416 | unsigned long base, index, end, write_end, int_th; | ||
417 | unsigned long write_size, adj_write_size; | ||
418 | |||
419 | /* | ||
420 | * Write as much as possible without producing an | ||
421 | * overflow interrupt. | ||
422 | * | ||
423 | * Interrupt_threshold must either be | ||
424 | * - bigger than absolute_maximum or | ||
425 | * - point to a record between buffer_base and absolute_maximum | ||
426 | * | ||
427 | * Index points to a valid record. | ||
428 | */ | ||
429 | base = ds_get(context->ds, qual, ds_buffer_base); | ||
430 | index = ds_get(context->ds, qual, ds_index); | ||
431 | end = ds_get(context->ds, qual, ds_absolute_maximum); | ||
432 | int_th = ds_get(context->ds, qual, ds_interrupt_threshold); | ||
433 | |||
434 | write_end = min(end, int_th); | ||
435 | |||
436 | /* | ||
437 | * If we are already beyond the interrupt threshold, | ||
438 | * we fill the entire buffer. | ||
439 | */ | ||
440 | if (write_end <= index) | ||
441 | write_end = end; | ||
442 | |||
443 | if (write_end <= index) | ||
444 | break; | ||
445 | |||
446 | write_size = min((unsigned long) size, write_end - index); | ||
447 | memcpy((void *)index, record, write_size); | ||
448 | |||
449 | record = (const char *)record + write_size; | ||
450 | size -= write_size; | ||
451 | bytes_written += write_size; | ||
452 | |||
453 | adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; | ||
454 | adj_write_size *= ds_cfg.sizeof_rec[qual]; | ||
455 | |||
456 | /* Zero out trailing bytes. */ | ||
457 | memset((char *)index + write_size, 0, | ||
458 | adj_write_size - write_size); | ||
459 | index += adj_write_size; | ||
460 | |||
461 | if (index >= end) | ||
462 | index = base; | ||
463 | ds_set(context->ds, qual, ds_index, index); | ||
464 | |||
465 | if (index >= int_th) | ||
466 | ds_overflow(context, qual); | ||
467 | } | ||
468 | |||
469 | return bytes_written; | ||
470 | } | ||
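The loop above advances the write index by whole records, wraps it back to the buffer base at the end, and signals an overflow once the interrupt threshold is reached. A compact model of just the index handling, with hypothetical field names:

    struct sketch_ring {
            unsigned long base, index, end, int_th;  /* linear addresses */
    };

    /* sketch: returns nonzero when the overflow callback should fire */
    static int sketch_advance(struct sketch_ring *r, unsigned long rec_size)
    {
            r->index += rec_size;
            if (r->index >= r->end)
                    r->index = r->base;

            return r->index >= r->int_th;
    }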
471 | |||
472 | |||
473 | /* | ||
474 | * Branch Trace Store (BTS) uses the following format. Different | ||
475 | * architectures vary in the size of those fields. | ||
476 | * - source linear address | ||
477 | * - destination linear address | ||
478 | * - flags | ||
479 | * | ||
480 | * Later architectures use 64bit pointers throughout, whereas earlier | ||
481 | * architectures use 32bit pointers in 32bit mode. | ||
482 | * | ||
483 | * We compute the base address for the fields based on: | ||
484 | * - the field size stored in the DS configuration | ||
485 | * - the relative field position | ||
486 | * | ||
487 | * In order to store additional information in the BTS buffer, we use | ||
488 | * a special source address to indicate that the record requires | ||
489 | * special interpretation. | ||
490 | * | ||
491 | * Netburst indicated via a bit in the flags field whether the branch | ||
492 | * was predicted; this is ignored. | ||
493 | * | ||
494 | * We use two levels of abstraction: | ||
495 | * - the raw data level defined here | ||
496 | * - an arch-independent level defined in ds.h | ||
497 | */ | ||
498 | |||
499 | enum bts_field { | ||
500 | bts_from, | ||
501 | bts_to, | ||
502 | bts_flags, | ||
503 | |||
504 | bts_qual = bts_from, | ||
505 | bts_clock = bts_to, | ||
506 | bts_pid = bts_flags, | ||
507 | |||
508 | bts_qual_mask = (bts_qual_max - 1), | ||
509 | bts_escape = ((unsigned long)-1 & ~bts_qual_mask) | ||
510 | }; | ||
511 | |||
512 | static inline unsigned long bts_get(const char *base, unsigned long field) | ||
513 | { | ||
514 | base += (ds_cfg.sizeof_ptr_field * field); | ||
515 | return *(unsigned long *)base; | ||
516 | } | ||
517 | |||
518 | static inline void bts_set(char *base, unsigned long field, unsigned long val) | ||
519 | { | ||
520 | base += (ds_cfg.sizeof_ptr_field * field); | ||
521 | (*(unsigned long *)base) = val; | ||
522 | } | ||
523 | |||
524 | |||
525 | /* | ||
526 | * The raw BTS data is architecture dependent. | ||
527 | * | ||
528 | * For higher-level users, we give an arch-independent view. | ||
529 | * - ds.h defines struct bts_struct | ||
530 | * - bts_read translates one raw bts record into a bts_struct | ||
531 | * - bts_write translates one bts_struct into the raw format and | ||
532 | * writes it into the top of the parameter tracer's buffer. | ||
533 | * | ||
534 | * return: bytes read/written on success; -Eerrno, otherwise | ||
535 | */ | ||
536 | static int | ||
537 | bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) | ||
538 | { | ||
539 | if (!tracer) | ||
540 | return -EINVAL; | ||
541 | |||
542 | if (at < tracer->trace.ds.begin) | ||
543 | return -EINVAL; | ||
544 | |||
545 | if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) | ||
546 | return -EINVAL; | ||
547 | |||
548 | memset(out, 0, sizeof(*out)); | ||
549 | if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { | ||
550 | out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); | ||
551 | out->variant.event.clock = bts_get(at, bts_clock); | ||
552 | out->variant.event.pid = bts_get(at, bts_pid); | ||
553 | } else { | ||
554 | out->qualifier = bts_branch; | ||
555 | out->variant.lbr.from = bts_get(at, bts_from); | ||
556 | out->variant.lbr.to = bts_get(at, bts_to); | ||
557 | |||
558 | if (!out->variant.lbr.from && !out->variant.lbr.to) | ||
559 | out->qualifier = bts_invalid; | ||
560 | } | ||
561 | |||
562 | return ds_cfg.sizeof_rec[ds_bts]; | ||
563 | } | ||
564 | |||
565 | static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) | ||
566 | { | ||
567 | unsigned char raw[MAX_SIZEOF_BTS]; | ||
568 | |||
569 | if (!tracer) | ||
570 | return -EINVAL; | ||
571 | |||
572 | if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) | ||
573 | return -EOVERFLOW; | ||
574 | |||
575 | switch (in->qualifier) { | ||
576 | case bts_invalid: | ||
577 | bts_set(raw, bts_from, 0); | ||
578 | bts_set(raw, bts_to, 0); | ||
579 | bts_set(raw, bts_flags, 0); | ||
580 | break; | ||
581 | case bts_branch: | ||
582 | bts_set(raw, bts_from, in->variant.lbr.from); | ||
583 | bts_set(raw, bts_to, in->variant.lbr.to); | ||
584 | bts_set(raw, bts_flags, 0); | ||
585 | break; | ||
586 | case bts_task_arrives: | ||
587 | case bts_task_departs: | ||
588 | bts_set(raw, bts_qual, (bts_escape | in->qualifier)); | ||
589 | bts_set(raw, bts_clock, in->variant.event.clock); | ||
590 | bts_set(raw, bts_pid, in->variant.event.pid); | ||
591 | break; | ||
592 | default: | ||
593 | return -EINVAL; | ||
594 | } | ||
595 | |||
596 | return ds_write(tracer->ds.context, ds_bts, raw, | ||
597 | ds_cfg.sizeof_rec[ds_bts]); | ||
598 | } | ||
599 | |||
600 | |||
601 | static void ds_write_config(struct ds_context *context, | ||
602 | struct ds_trace *cfg, enum ds_qualifier qual) | ||
603 | { | ||
604 | unsigned char *ds = context->ds; | ||
605 | |||
606 | ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); | ||
607 | ds_set(ds, qual, ds_index, (unsigned long)cfg->top); | ||
608 | ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); | ||
609 | ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); | ||
610 | } | ||
611 | |||
612 | static void ds_read_config(struct ds_context *context, | ||
613 | struct ds_trace *cfg, enum ds_qualifier qual) | ||
614 | { | ||
615 | unsigned char *ds = context->ds; | ||
616 | |||
617 | cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); | ||
618 | cfg->top = (void *)ds_get(ds, qual, ds_index); | ||
619 | cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); | ||
620 | cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); | ||
621 | } | ||
622 | |||
623 | static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, | ||
624 | void *base, size_t size, size_t ith, | ||
625 | unsigned int flags) { | ||
626 | unsigned long buffer, adj; | ||
627 | |||
628 | /* | ||
629 | * Adjust the buffer address and size to meet alignment | ||
630 | * constraints: | ||
631 | * - buffer is double-word aligned | ||
632 | * - size is multiple of record size | ||
633 | * | ||
634 | * We checked the size at the very beginning; we have enough | ||
635 | * space to do the adjustment. | ||
636 | */ | ||
637 | buffer = (unsigned long)base; | ||
638 | |||
639 | adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; | ||
640 | buffer += adj; | ||
641 | size -= adj; | ||
642 | |||
643 | trace->n = size / ds_cfg.sizeof_rec[qual]; | ||
644 | trace->size = ds_cfg.sizeof_rec[qual]; | ||
645 | |||
646 | size = (trace->n * trace->size); | ||
647 | |||
648 | trace->begin = (void *)buffer; | ||
649 | trace->top = trace->begin; | ||
650 | trace->end = (void *)(buffer + size); | ||
651 | /* | ||
652 | * The value for 'no threshold' is -1, which will set the | ||
653 | * threshold outside of the buffer, just like we want it. | ||
654 | */ | ||
655 | ith *= ds_cfg.sizeof_rec[qual]; | ||
656 | trace->ith = (void *)(buffer + size - ith); | ||
657 | |||
658 | trace->flags = flags; | ||
659 | } | ||
660 | |||
661 | |||
662 | static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, | ||
663 | enum ds_qualifier qual, struct task_struct *task, | ||
664 | int cpu, void *base, size_t size, size_t th) | ||
665 | { | ||
666 | struct ds_context *context; | ||
667 | int error; | ||
668 | size_t req_size; | ||
669 | |||
670 | error = -EOPNOTSUPP; | ||
671 | if (!ds_cfg.sizeof_rec[qual]) | ||
672 | goto out; | ||
673 | |||
674 | error = -EINVAL; | ||
675 | if (!base) | ||
676 | goto out; | ||
677 | |||
678 | req_size = ds_cfg.sizeof_rec[qual]; | ||
679 | /* We might need space for alignment adjustments. */ | ||
680 | if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) | ||
681 | req_size += DS_ALIGNMENT; | ||
682 | |||
683 | error = -EINVAL; | ||
684 | if (size < req_size) | ||
685 | goto out; | ||
686 | |||
687 | if (th != (size_t)-1) { | ||
688 | th *= ds_cfg.sizeof_rec[qual]; | ||
689 | |||
690 | error = -EINVAL; | ||
691 | if (size <= th) | ||
692 | goto out; | ||
693 | } | ||
694 | |||
695 | tracer->buffer = base; | ||
696 | tracer->size = size; | ||
697 | |||
698 | error = -ENOMEM; | ||
699 | context = ds_get_context(task, cpu); | ||
700 | if (!context) | ||
701 | goto out; | ||
702 | tracer->context = context; | ||
703 | |||
704 | /* | ||
705 | * Defer any tracer-specific initialization work for the context until | ||
706 | * context ownership has been clarified. | ||
707 | */ | ||
708 | |||
709 | error = 0; | ||
710 | out: | ||
711 | return error; | ||
712 | } | ||
713 | |||
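A worked instance of the size check above, with illustrative numbers: a BTS request with 24-byte records and an unaligned base needs req_size = 24 + DS_ALIGNMENT; assuming DS_ALIGNMENT is 8 (the double-word alignment mentioned in ds_init_ds_trace()), any buffer smaller than 32 bytes is rejected with -EINVAL even though a single record is only 24 bytes, since the extra 8 bytes guarantee room for the worst-case alignment shift.
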
714 | static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, | ||
715 | void *base, size_t size, | ||
716 | bts_ovfl_callback_t ovfl, size_t th, | ||
717 | unsigned int flags) | ||
718 | { | ||
719 | struct bts_tracer *tracer; | ||
720 | int error; | ||
721 | |||
722 | /* Buffer overflow notification is not yet implemented. */ | ||
723 | error = -EOPNOTSUPP; | ||
724 | if (ovfl) | ||
725 | goto out; | ||
726 | |||
727 | error = get_tracer(task); | ||
728 | if (error < 0) | ||
729 | goto out; | ||
730 | |||
731 | error = -ENOMEM; | ||
732 | tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); | ||
733 | if (!tracer) | ||
734 | goto out_put_tracer; | ||
735 | tracer->ovfl = ovfl; | ||
736 | |||
737 | /* Do some more error checking and acquire a tracing context. */ | ||
738 | error = ds_request(&tracer->ds, &tracer->trace.ds, | ||
739 | ds_bts, task, cpu, base, size, th); | ||
740 | if (error < 0) | ||
741 | goto out_tracer; | ||
742 | |||
743 | /* Claim the bts part of the tracing context we acquired above. */ | ||
744 | spin_lock_irq(&ds_lock); | ||
745 | |||
746 | error = -EPERM; | ||
747 | if (tracer->ds.context->bts_master) | ||
748 | goto out_unlock; | ||
749 | tracer->ds.context->bts_master = tracer; | ||
750 | |||
751 | spin_unlock_irq(&ds_lock); | ||
752 | |||
753 | /* | ||
754 | * Now that we own the bts part of the context, let's complete the | ||
755 | * initialization for that part. | ||
756 | */ | ||
757 | ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); | ||
758 | ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); | ||
759 | ds_install_ds_area(tracer->ds.context); | ||
760 | |||
761 | tracer->trace.read = bts_read; | ||
762 | tracer->trace.write = bts_write; | ||
763 | |||
764 | /* Start tracing. */ | ||
765 | ds_resume_bts(tracer); | ||
766 | |||
767 | return tracer; | ||
768 | |||
769 | out_unlock: | ||
770 | spin_unlock_irq(&ds_lock); | ||
771 | ds_put_context(tracer->ds.context); | ||
772 | out_tracer: | ||
773 | kfree(tracer); | ||
774 | out_put_tracer: | ||
775 | put_tracer(task); | ||
776 | out: | ||
777 | return ERR_PTR(error); | ||
778 | } | ||
779 | |||
780 | struct bts_tracer *ds_request_bts_task(struct task_struct *task, | ||
781 | void *base, size_t size, | ||
782 | bts_ovfl_callback_t ovfl, | ||
783 | size_t th, unsigned int flags) | ||
784 | { | ||
785 | return ds_request_bts(task, 0, base, size, ovfl, th, flags); | ||
786 | } | ||
787 | |||
788 | struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, | ||
789 | bts_ovfl_callback_t ovfl, | ||
790 | size_t th, unsigned int flags) | ||
791 | { | ||
792 | return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); | ||
793 | } | ||
794 | |||
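A hedged usage sketch of the task-tracing entry point above; the buffer size and the printk are illustrative, and the selftest further below exercises the same request/read/release sequence:

    /* Sketch: trace the current task's kernel-mode branches.
     * (size_t)-1 requests "no interrupt threshold". */
    static unsigned char bts_buffer[1024];

    static int bts_example(void)
    {
            struct bts_tracer *tracer;
            const struct bts_trace *trace;

            tracer = ds_request_bts_task(current, bts_buffer,
                                         sizeof(bts_buffer),
                                         NULL /* overflow callback: unsupported */,
                                         (size_t)-1, BTS_KERNEL);
            if (IS_ERR(tracer))
                    return PTR_ERR(tracer);

            /* ... run the code to be traced ... */

            trace = ds_read_bts(tracer);
            if (trace)
                    printk(KERN_INFO "bts: %lu records of %lu bytes\n",
                           (unsigned long)trace->ds.n,
                           (unsigned long)trace->ds.size);

            ds_release_bts(tracer);     /* suspends tracing, then frees */
            return 0;
    }
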
795 | static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, | ||
796 | void *base, size_t size, | ||
797 | pebs_ovfl_callback_t ovfl, size_t th, | ||
798 | unsigned int flags) | ||
799 | { | ||
800 | struct pebs_tracer *tracer; | ||
801 | int error; | ||
802 | |||
803 | /* Buffer overflow notification is not yet implemented. */ | ||
804 | error = -EOPNOTSUPP; | ||
805 | if (ovfl) | ||
806 | goto out; | ||
807 | |||
808 | error = get_tracer(task); | ||
809 | if (error < 0) | ||
810 | goto out; | ||
811 | |||
812 | error = -ENOMEM; | ||
813 | tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); | ||
814 | if (!tracer) | ||
815 | goto out_put_tracer; | ||
816 | tracer->ovfl = ovfl; | ||
817 | |||
818 | /* Do some more error checking and acquire a tracing context. */ | ||
819 | error = ds_request(&tracer->ds, &tracer->trace.ds, | ||
820 | ds_pebs, task, cpu, base, size, th); | ||
821 | if (error < 0) | ||
822 | goto out_tracer; | ||
823 | |||
824 | /* Claim the pebs part of the tracing context we acquired above. */ | ||
825 | spin_lock_irq(&ds_lock); | ||
826 | |||
827 | error = -EPERM; | ||
828 | if (tracer->ds.context->pebs_master) | ||
829 | goto out_unlock; | ||
830 | tracer->ds.context->pebs_master = tracer; | ||
831 | |||
832 | spin_unlock_irq(&ds_lock); | ||
833 | |||
834 | /* | ||
835 | * Now that we own the pebs part of the context, let's complete the | ||
836 | * initialization for that part. | ||
837 | */ | ||
838 | ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); | ||
839 | ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); | ||
840 | ds_install_ds_area(tracer->ds.context); | ||
841 | |||
842 | /* Start tracing. */ | ||
843 | ds_resume_pebs(tracer); | ||
844 | |||
845 | return tracer; | ||
846 | |||
847 | out_unlock: | ||
848 | spin_unlock_irq(&ds_lock); | ||
849 | ds_put_context(tracer->ds.context); | ||
850 | out_tracer: | ||
851 | kfree(tracer); | ||
852 | out_put_tracer: | ||
853 | put_tracer(task); | ||
854 | out: | ||
855 | return ERR_PTR(error); | ||
856 | } | ||
857 | |||
858 | struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, | ||
859 | void *base, size_t size, | ||
860 | pebs_ovfl_callback_t ovfl, | ||
861 | size_t th, unsigned int flags) | ||
862 | { | ||
863 | return ds_request_pebs(task, 0, base, size, ovfl, th, flags); | ||
864 | } | ||
865 | |||
866 | struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, | ||
867 | pebs_ovfl_callback_t ovfl, | ||
868 | size_t th, unsigned int flags) | ||
869 | { | ||
870 | return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); | ||
871 | } | ||
872 | |||
873 | static void ds_free_bts(struct bts_tracer *tracer) | ||
874 | { | ||
875 | struct task_struct *task; | ||
876 | |||
877 | task = tracer->ds.context->task; | ||
878 | |||
879 | WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); | ||
880 | tracer->ds.context->bts_master = NULL; | ||
881 | |||
882 | /* Make sure tracing stopped and the tracer is not in use. */ | ||
883 | if (task && (task != current)) | ||
884 | wait_task_context_switch(task); | ||
885 | |||
886 | ds_put_context(tracer->ds.context); | ||
887 | put_tracer(task); | ||
888 | |||
889 | kfree(tracer); | ||
890 | } | ||
891 | |||
892 | void ds_release_bts(struct bts_tracer *tracer) | ||
893 | { | ||
894 | might_sleep(); | ||
895 | |||
896 | if (!tracer) | ||
897 | return; | ||
898 | |||
899 | ds_suspend_bts(tracer); | ||
900 | ds_free_bts(tracer); | ||
901 | } | ||
902 | |||
903 | int ds_release_bts_noirq(struct bts_tracer *tracer) | ||
904 | { | ||
905 | struct task_struct *task; | ||
906 | unsigned long irq; | ||
907 | int error; | ||
908 | |||
909 | if (!tracer) | ||
910 | return 0; | ||
911 | |||
912 | task = tracer->ds.context->task; | ||
913 | |||
914 | local_irq_save(irq); | ||
915 | |||
916 | error = -EPERM; | ||
917 | if (!task && | ||
918 | (tracer->ds.context->cpu != smp_processor_id())) | ||
919 | goto out; | ||
920 | |||
921 | error = -EPERM; | ||
922 | if (task && (task != current)) | ||
923 | goto out; | ||
924 | |||
925 | ds_suspend_bts_noirq(tracer); | ||
926 | ds_free_bts(tracer); | ||
927 | |||
928 | error = 0; | ||
929 | out: | ||
930 | local_irq_restore(irq); | ||
931 | return error; | ||
932 | } | ||
933 | |||
934 | static void update_task_debugctlmsr(struct task_struct *task, | ||
935 | unsigned long debugctlmsr) | ||
936 | { | ||
937 | task->thread.debugctlmsr = debugctlmsr; | ||
938 | |||
939 | get_cpu(); | ||
940 | if (task == current) | ||
941 | update_debugctlmsr(debugctlmsr); | ||
942 | put_cpu(); | ||
943 | } | ||
944 | |||
945 | void ds_suspend_bts(struct bts_tracer *tracer) | ||
946 | { | ||
947 | struct task_struct *task; | ||
948 | unsigned long debugctlmsr; | ||
949 | int cpu; | ||
950 | |||
951 | if (!tracer) | ||
952 | return; | ||
953 | |||
954 | tracer->flags = 0; | ||
955 | |||
956 | task = tracer->ds.context->task; | ||
957 | cpu = tracer->ds.context->cpu; | ||
958 | |||
959 | WARN_ON(!task && irqs_disabled()); | ||
960 | |||
961 | debugctlmsr = (task ? | ||
962 | task->thread.debugctlmsr : | ||
963 | get_debugctlmsr_on_cpu(cpu)); | ||
964 | debugctlmsr &= ~BTS_CONTROL; | ||
965 | |||
966 | if (task) | ||
967 | update_task_debugctlmsr(task, debugctlmsr); | ||
968 | else | ||
969 | update_debugctlmsr_on_cpu(cpu, debugctlmsr); | ||
970 | } | ||
971 | |||
972 | int ds_suspend_bts_noirq(struct bts_tracer *tracer) | ||
973 | { | ||
974 | struct task_struct *task; | ||
975 | unsigned long debugctlmsr, irq; | ||
976 | int cpu, error = 0; | ||
977 | |||
978 | if (!tracer) | ||
979 | return 0; | ||
980 | |||
981 | tracer->flags = 0; | ||
982 | |||
983 | task = tracer->ds.context->task; | ||
984 | cpu = tracer->ds.context->cpu; | ||
985 | |||
986 | local_irq_save(irq); | ||
987 | |||
988 | error = -EPERM; | ||
989 | if (!task && (cpu != smp_processor_id())) | ||
990 | goto out; | ||
991 | |||
992 | debugctlmsr = (task ? | ||
993 | task->thread.debugctlmsr : | ||
994 | get_debugctlmsr()); | ||
995 | debugctlmsr &= ~BTS_CONTROL; | ||
996 | |||
997 | if (task) | ||
998 | update_task_debugctlmsr(task, debugctlmsr); | ||
999 | else | ||
1000 | update_debugctlmsr(debugctlmsr); | ||
1001 | |||
1002 | error = 0; | ||
1003 | out: | ||
1004 | local_irq_restore(irq); | ||
1005 | return error; | ||
1006 | } | ||
1007 | |||
1008 | static unsigned long ds_bts_control(struct bts_tracer *tracer) | ||
1009 | { | ||
1010 | unsigned long control; | ||
1011 | |||
1012 | control = ds_cfg.ctl[dsf_bts]; | ||
1013 | if (!(tracer->trace.ds.flags & BTS_KERNEL)) | ||
1014 | control |= ds_cfg.ctl[dsf_bts_kernel]; | ||
1015 | if (!(tracer->trace.ds.flags & BTS_USER)) | ||
1016 | control |= ds_cfg.ctl[dsf_bts_user]; | ||
1017 | |||
1018 | return control; | ||
1019 | } | ||
1020 | |||
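Note the inverted logic above: the hardware bits behind ctl[dsf_bts_kernel] and ctl[dsf_bts_user] suppress branch recording for the respective privilege level, so each one is set when the corresponding BTS_KERNEL or BTS_USER flag is absent. Worked example with the Core 2/Atom configuration below: flags = BTS_USER yields control = (1 << 6) | (1 << 7) | (1 << 9), i.e. tracing enabled with kernel-mode branches suppressed.
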
1021 | void ds_resume_bts(struct bts_tracer *tracer) | ||
1022 | { | ||
1023 | struct task_struct *task; | ||
1024 | unsigned long debugctlmsr; | ||
1025 | int cpu; | ||
1026 | |||
1027 | if (!tracer) | ||
1028 | return; | ||
1029 | |||
1030 | tracer->flags = tracer->trace.ds.flags; | ||
1031 | |||
1032 | task = tracer->ds.context->task; | ||
1033 | cpu = tracer->ds.context->cpu; | ||
1034 | |||
1035 | WARN_ON(!task && irqs_disabled()); | ||
1036 | |||
1037 | debugctlmsr = (task ? | ||
1038 | task->thread.debugctlmsr : | ||
1039 | get_debugctlmsr_on_cpu(cpu)); | ||
1040 | debugctlmsr |= ds_bts_control(tracer); | ||
1041 | |||
1042 | if (task) | ||
1043 | update_task_debugctlmsr(task, debugctlmsr); | ||
1044 | else | ||
1045 | update_debugctlmsr_on_cpu(cpu, debugctlmsr); | ||
1046 | } | ||
1047 | |||
1048 | int ds_resume_bts_noirq(struct bts_tracer *tracer) | ||
1049 | { | ||
1050 | struct task_struct *task; | ||
1051 | unsigned long debugctlmsr, irq; | ||
1052 | int cpu, error = 0; | ||
1053 | |||
1054 | if (!tracer) | ||
1055 | return 0; | ||
1056 | |||
1057 | tracer->flags = tracer->trace.ds.flags; | ||
1058 | |||
1059 | task = tracer->ds.context->task; | ||
1060 | cpu = tracer->ds.context->cpu; | ||
1061 | |||
1062 | local_irq_save(irq); | ||
1063 | |||
1064 | error = -EPERM; | ||
1065 | if (!task && (cpu != smp_processor_id())) | ||
1066 | goto out; | ||
1067 | |||
1068 | debugctlmsr = (task ? | ||
1069 | task->thread.debugctlmsr : | ||
1070 | get_debugctlmsr()); | ||
1071 | debugctlmsr |= ds_bts_control(tracer); | ||
1072 | |||
1073 | if (task) | ||
1074 | update_task_debugctlmsr(task, debugctlmsr); | ||
1075 | else | ||
1076 | update_debugctlmsr(debugctlmsr); | ||
1077 | |||
1078 | error = 0; | ||
1079 | out: | ||
1080 | local_irq_restore(irq); | ||
1081 | return error; | ||
1082 | } | ||
1083 | |||
1084 | static void ds_free_pebs(struct pebs_tracer *tracer) | ||
1085 | { | ||
1086 | struct task_struct *task; | ||
1087 | |||
1088 | task = tracer->ds.context->task; | ||
1089 | |||
1090 | WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); | ||
1091 | tracer->ds.context->pebs_master = NULL; | ||
1092 | |||
1093 | ds_put_context(tracer->ds.context); | ||
1094 | put_tracer(task); | ||
1095 | |||
1096 | kfree(tracer); | ||
1097 | } | ||
1098 | |||
1099 | void ds_release_pebs(struct pebs_tracer *tracer) | ||
1100 | { | ||
1101 | might_sleep(); | ||
1102 | |||
1103 | if (!tracer) | ||
1104 | return; | ||
1105 | |||
1106 | ds_suspend_pebs(tracer); | ||
1107 | ds_free_pebs(tracer); | ||
1108 | } | ||
1109 | |||
1110 | int ds_release_pebs_noirq(struct pebs_tracer *tracer) | ||
1111 | { | ||
1112 | struct task_struct *task; | ||
1113 | unsigned long irq; | ||
1114 | int error; | ||
1115 | |||
1116 | if (!tracer) | ||
1117 | return 0; | ||
1118 | |||
1119 | task = tracer->ds.context->task; | ||
1120 | |||
1121 | local_irq_save(irq); | ||
1122 | |||
1123 | error = -EPERM; | ||
1124 | if (!task && | ||
1125 | (tracer->ds.context->cpu != smp_processor_id())) | ||
1126 | goto out; | ||
1127 | |||
1128 | error = -EPERM; | ||
1129 | if (task && (task != current)) | ||
1130 | goto out; | ||
1131 | |||
1132 | ds_suspend_pebs_noirq(tracer); | ||
1133 | ds_free_pebs(tracer); | ||
1134 | |||
1135 | error = 0; | ||
1136 | out: | ||
1137 | local_irq_restore(irq); | ||
1138 | return error; | ||
1139 | } | ||
1140 | |||
1141 | void ds_suspend_pebs(struct pebs_tracer *tracer) | ||
1142 | { | ||
1143 | |||
1144 | } | ||
1145 | |||
1146 | int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) | ||
1147 | { | ||
1148 | return 0; | ||
1149 | } | ||
1150 | |||
1151 | void ds_resume_pebs(struct pebs_tracer *tracer) | ||
1152 | { | ||
1153 | |||
1154 | } | ||
1155 | |||
1156 | int ds_resume_pebs_noirq(struct pebs_tracer *tracer) | ||
1157 | { | ||
1158 | return 0; | ||
1159 | } | ||
1160 | |||
1161 | const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) | ||
1162 | { | ||
1163 | if (!tracer) | ||
1164 | return NULL; | ||
1165 | |||
1166 | ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); | ||
1167 | return &tracer->trace; | ||
1168 | } | ||
1169 | |||
1170 | const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) | ||
1171 | { | ||
1172 | if (!tracer) | ||
1173 | return NULL; | ||
1174 | |||
1175 | ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); | ||
1176 | |||
1177 | tracer->trace.counters = ds_cfg.nr_counter_reset; | ||
1178 | memcpy(tracer->trace.counter_reset, | ||
1179 | tracer->ds.context->ds + | ||
1180 | (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), | ||
1181 | ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); | ||
1182 | |||
1183 | return &tracer->trace; | ||
1184 | } | ||
1185 | |||
1186 | int ds_reset_bts(struct bts_tracer *tracer) | ||
1187 | { | ||
1188 | if (!tracer) | ||
1189 | return -EINVAL; | ||
1190 | |||
1191 | tracer->trace.ds.top = tracer->trace.ds.begin; | ||
1192 | |||
1193 | ds_set(tracer->ds.context->ds, ds_bts, ds_index, | ||
1194 | (unsigned long)tracer->trace.ds.top); | ||
1195 | |||
1196 | return 0; | ||
1197 | } | ||
1198 | |||
1199 | int ds_reset_pebs(struct pebs_tracer *tracer) | ||
1200 | { | ||
1201 | if (!tracer) | ||
1202 | return -EINVAL; | ||
1203 | |||
1204 | tracer->trace.ds.top = tracer->trace.ds.begin; | ||
1205 | |||
1206 | ds_set(tracer->ds.context->ds, ds_pebs, ds_index, | ||
1207 | (unsigned long)tracer->trace.ds.top); | ||
1208 | |||
1209 | return 0; | ||
1210 | } | ||
1211 | |||
1212 | int ds_set_pebs_reset(struct pebs_tracer *tracer, | ||
1213 | unsigned int counter, u64 value) | ||
1214 | { | ||
1215 | if (!tracer) | ||
1216 | return -EINVAL; | ||
1217 | |||
1218 | if (ds_cfg.nr_counter_reset <= counter) | ||
1219 | return -EINVAL; | ||
1220 | |||
1221 | *(u64 *)(tracer->ds.context->ds + | ||
1222 | (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + | ||
1223 | (counter * PEBS_RESET_FIELD_SIZE)) = value; | ||
1224 | |||
1225 | return 0; | ||
1226 | } | ||
1227 | |||
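A worked instance of the reset-field offset above, under the assumption that NUM_DS_PTR_FIELDS is 8 (four pointer fields each for BTS and PEBS) and that fields are 8 bytes wide on a DTES64 system: counter 0 lives at byte offset 8 * 8 = 64 into the DS area, counter 1 at 64 + PEBS_RESET_FIELD_SIZE (72 with an assumed 8-byte reset field), and so on up to nr_counter_reset - 1.
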
1228 | static const struct ds_configuration ds_cfg_netburst = { | ||
1229 | .name = "Netburst", | ||
1230 | .ctl[dsf_bts] = (1 << 2) | (1 << 3), | ||
1231 | .ctl[dsf_bts_kernel] = (1 << 5), | ||
1232 | .ctl[dsf_bts_user] = (1 << 6), | ||
1233 | .nr_counter_reset = 1, | ||
1234 | }; | ||
1235 | static const struct ds_configuration ds_cfg_pentium_m = { | ||
1236 | .name = "Pentium M", | ||
1237 | .ctl[dsf_bts] = (1 << 6) | (1 << 7), | ||
1238 | .nr_counter_reset = 1, | ||
1239 | }; | ||
1240 | static const struct ds_configuration ds_cfg_core2_atom = { | ||
1241 | .name = "Core 2/Atom", | ||
1242 | .ctl[dsf_bts] = (1 << 6) | (1 << 7), | ||
1243 | .ctl[dsf_bts_kernel] = (1 << 9), | ||
1244 | .ctl[dsf_bts_user] = (1 << 10), | ||
1245 | .nr_counter_reset = 1, | ||
1246 | }; | ||
1247 | static const struct ds_configuration ds_cfg_core_i7 = { | ||
1248 | .name = "Core i7", | ||
1249 | .ctl[dsf_bts] = (1 << 6) | (1 << 7), | ||
1250 | .ctl[dsf_bts_kernel] = (1 << 9), | ||
1251 | .ctl[dsf_bts_user] = (1 << 10), | ||
1252 | .nr_counter_reset = 4, | ||
1253 | }; | ||
1254 | |||
1255 | static void | ||
1256 | ds_configure(const struct ds_configuration *cfg, | ||
1257 | struct cpuinfo_x86 *cpu) | ||
1258 | { | ||
1259 | unsigned long nr_pebs_fields = 0; | ||
1260 | |||
1261 | printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); | ||
1262 | |||
1263 | #ifdef __i386__ | ||
1264 | nr_pebs_fields = 10; | ||
1265 | #else | ||
1266 | nr_pebs_fields = 18; | ||
1267 | #endif | ||
1268 | |||
1269 | /* | ||
1270 | * Starting with version 2, architectural performance | ||
1271 | * monitoring supports a format specifier. | ||
1272 | */ | ||
1273 | if ((cpuid_eax(0xa) & 0xff) > 1) { | ||
1274 | unsigned long perf_capabilities, format; | ||
1275 | |||
1276 | rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); | ||
1277 | |||
1278 | format = (perf_capabilities >> 8) & 0xf; | ||
1279 | |||
1280 | switch (format) { | ||
1281 | case 0: | ||
1282 | nr_pebs_fields = 18; | ||
1283 | break; | ||
1284 | case 1: | ||
1285 | nr_pebs_fields = 22; | ||
1286 | break; | ||
1287 | default: | ||
1288 | printk(KERN_INFO | ||
1289 | "[ds] unknown PEBS format: %lu\n", format); | ||
1290 | nr_pebs_fields = 0; | ||
1291 | break; | ||
1292 | } | ||
1293 | } | ||
1294 | |||
1295 | memset(&ds_cfg, 0, sizeof(ds_cfg)); | ||
1296 | ds_cfg = *cfg; | ||
1297 | |||
1298 | ds_cfg.sizeof_ptr_field = | ||
1299 | (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); | ||
1300 | |||
1301 | ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; | ||
1302 | ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; | ||
1303 | |||
1304 | if (!cpu_has(cpu, X86_FEATURE_BTS)) { | ||
1305 | ds_cfg.sizeof_rec[ds_bts] = 0; | ||
1306 | printk(KERN_INFO "[ds] bts not available\n"); | ||
1307 | } | ||
1308 | if (!cpu_has(cpu, X86_FEATURE_PEBS)) { | ||
1309 | ds_cfg.sizeof_rec[ds_pebs] = 0; | ||
1310 | printk(KERN_INFO "[ds] pebs not available\n"); | ||
1311 | } | ||
1312 | |||
1313 | printk(KERN_INFO "[ds] sizes: address: %u bit, ", | ||
1314 | 8 * ds_cfg.sizeof_ptr_field); | ||
1315 | printk("bts/pebs record: %u/%u bytes\n", | ||
1316 | ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); | ||
1317 | |||
1318 | WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); | ||
1319 | } | ||
1320 | |||
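A worked instance of the sizing above: a 64-bit DTES64 system reporting PEBS format 0 gets sizeof_ptr_field = 8, hence BTS records of 8 * 3 = 24 bytes and PEBS records of 8 * 18 = 144 bytes; a 32-bit system without DTES64 and without a PEBS format specifier gets 12- and 40-byte records respectively (4 * 3 and 4 * 10).
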
1321 | void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) | ||
1322 | { | ||
1323 | /* Only configure the first cpu. Others are identical. */ | ||
1324 | if (ds_cfg.name) | ||
1325 | return; | ||
1326 | |||
1327 | switch (c->x86) { | ||
1328 | case 0x6: | ||
1329 | switch (c->x86_model) { | ||
1330 | case 0x9: | ||
1331 | case 0xd: /* Pentium M */ | ||
1332 | ds_configure(&ds_cfg_pentium_m, c); | ||
1333 | break; | ||
1334 | case 0xf: | ||
1335 | case 0x17: /* Core2 */ | ||
1336 | case 0x1c: /* Atom */ | ||
1337 | ds_configure(&ds_cfg_core2_atom, c); | ||
1338 | break; | ||
1339 | case 0x1a: /* Core i7 */ | ||
1340 | ds_configure(&ds_cfg_core_i7, c); | ||
1341 | break; | ||
1342 | default: | ||
1343 | /* Sorry, don't know about them. */ | ||
1344 | break; | ||
1345 | } | ||
1346 | break; | ||
1347 | case 0xf: | ||
1348 | switch (c->x86_model) { | ||
1349 | case 0x0: | ||
1350 | case 0x1: | ||
1351 | case 0x2: /* Netburst */ | ||
1352 | ds_configure(&ds_cfg_netburst, c); | ||
1353 | break; | ||
1354 | default: | ||
1355 | /* Sorry, don't know about them. */ | ||
1356 | break; | ||
1357 | } | ||
1358 | break; | ||
1359 | default: | ||
1360 | /* Sorry, don't know about them. */ | ||
1361 | break; | ||
1362 | } | ||
1363 | } | ||
1364 | |||
1365 | static inline void ds_take_timestamp(struct ds_context *context, | ||
1366 | enum bts_qualifier qualifier, | ||
1367 | struct task_struct *task) | ||
1368 | { | ||
1369 | struct bts_tracer *tracer = context->bts_master; | ||
1370 | struct bts_struct ts; | ||
1371 | |||
1372 | /* Prevent compilers from reading the tracer pointer twice. */ | ||
1373 | barrier(); | ||
1374 | |||
1375 | if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) | ||
1376 | return; | ||
1377 | |||
1378 | memset(&ts, 0, sizeof(ts)); | ||
1379 | ts.qualifier = qualifier; | ||
1380 | ts.variant.event.clock = trace_clock_global(); | ||
1381 | ts.variant.event.pid = task->pid; | ||
1382 | |||
1383 | bts_write(tracer, &ts); | ||
1384 | } | ||
1385 | |||
1386 | /* | ||
1387 | * Change the DS configuration from tracing prev to tracing next. | ||
1388 | */ | ||
1389 | void ds_switch_to(struct task_struct *prev, struct task_struct *next) | ||
1390 | { | ||
1391 | struct ds_context *prev_ctx = prev->thread.ds_ctx; | ||
1392 | struct ds_context *next_ctx = next->thread.ds_ctx; | ||
1393 | unsigned long debugctlmsr = next->thread.debugctlmsr; | ||
1394 | |||
1395 | /* Make sure all data is read before we start. */ | ||
1396 | barrier(); | ||
1397 | |||
1398 | if (prev_ctx) { | ||
1399 | update_debugctlmsr(0); | ||
1400 | |||
1401 | ds_take_timestamp(prev_ctx, bts_task_departs, prev); | ||
1402 | } | ||
1403 | |||
1404 | if (next_ctx) { | ||
1405 | ds_take_timestamp(next_ctx, bts_task_arrives, next); | ||
1406 | |||
1407 | wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); | ||
1408 | } | ||
1409 | |||
1410 | update_debugctlmsr(debugctlmsr); | ||
1411 | } | ||
1412 | |||
1413 | static __init int ds_selftest(void) | ||
1414 | { | ||
1415 | if (ds_cfg.sizeof_rec[ds_bts]) { | ||
1416 | int error; | ||
1417 | |||
1418 | error = ds_selftest_bts(); | ||
1419 | if (error) { | ||
1420 | WARN(1, "[ds] selftest failed. disabling bts.\n"); | ||
1421 | ds_cfg.sizeof_rec[ds_bts] = 0; | ||
1422 | } | ||
1423 | } | ||
1424 | |||
1425 | if (ds_cfg.sizeof_rec[ds_pebs]) { | ||
1426 | int error; | ||
1427 | |||
1428 | error = ds_selftest_pebs(); | ||
1429 | if (error) { | ||
1430 | WARN(1, "[ds] selftest failed. disabling pebs.\n"); | ||
1431 | ds_cfg.sizeof_rec[ds_pebs] = 0; | ||
1432 | } | ||
1433 | } | ||
1434 | |||
1435 | return 0; | ||
1436 | } | ||
1437 | device_initcall(ds_selftest); | ||
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c deleted file mode 100644 index 6bc7c199ab99..000000000000 --- a/arch/x86/kernel/ds_selftest.c +++ /dev/null | |||
@@ -1,408 +0,0 @@ | |||
1 | /* | ||
2 | * Debug Store support - selftest | ||
3 | * | ||
4 | * | ||
5 | * Copyright (C) 2009 Intel Corporation. | ||
6 | * Markus Metzger <markus.t.metzger@intel.com>, 2009 | ||
7 | */ | ||
8 | |||
9 | #include "ds_selftest.h" | ||
10 | |||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/cpu.h> | ||
15 | |||
16 | #include <asm/ds.h> | ||
17 | |||
18 | |||
19 | #define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ | ||
20 | #define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ | ||
21 | |||
22 | struct ds_selftest_bts_conf { | ||
23 | struct bts_tracer *tracer; | ||
24 | int error; | ||
25 | int (*suspend)(struct bts_tracer *); | ||
26 | int (*resume)(struct bts_tracer *); | ||
27 | }; | ||
28 | |||
29 | static int ds_selftest_bts_consistency(const struct bts_trace *trace) | ||
30 | { | ||
31 | int error = 0; | ||
32 | |||
33 | if (!trace) { | ||
34 | printk(KERN_CONT "failed to access trace..."); | ||
35 | /* Bail out. Other tests are pointless. */ | ||
36 | return -1; | ||
37 | } | ||
38 | |||
39 | if (!trace->read) { | ||
40 | printk(KERN_CONT "bts read not available..."); | ||
41 | error = -1; | ||
42 | } | ||
43 | |||
44 | /* Do some sanity checks on the trace configuration. */ | ||
45 | if (!trace->ds.n) { | ||
46 | printk(KERN_CONT "empty bts buffer..."); | ||
47 | error = -1; | ||
48 | } | ||
49 | if (!trace->ds.size) { | ||
50 | printk(KERN_CONT "bad bts trace setup..."); | ||
51 | error = -1; | ||
52 | } | ||
53 | if (trace->ds.end != | ||
54 | (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { | ||
55 | printk(KERN_CONT "bad bts buffer setup..."); | ||
56 | error = -1; | ||
57 | } | ||
58 | /* | ||
59 | * We allow top in [begin; end], since it's not clear when the | ||
60 | * overflow adjustment happens: after the increment or before the | ||
61 | * write. | ||
62 | */ | ||
63 | if ((trace->ds.top < trace->ds.begin) || | ||
64 | (trace->ds.end < trace->ds.top)) { | ||
65 | printk(KERN_CONT "bts top out of bounds..."); | ||
66 | error = -1; | ||
67 | } | ||
68 | |||
69 | return error; | ||
70 | } | ||
71 | |||
72 | static int ds_selftest_bts_read(struct bts_tracer *tracer, | ||
73 | const struct bts_trace *trace, | ||
74 | const void *from, const void *to) | ||
75 | { | ||
76 | const unsigned char *at; | ||
77 | |||
78 | /* | ||
79 | * Check a few things which do not belong to this test. | ||
80 | * They should be covered by other tests. | ||
81 | */ | ||
82 | if (!trace) | ||
83 | return -1; | ||
84 | |||
85 | if (!trace->read) | ||
86 | return -1; | ||
87 | |||
88 | if (to < from) | ||
89 | return -1; | ||
90 | |||
91 | if (from < trace->ds.begin) | ||
92 | return -1; | ||
93 | |||
94 | if (trace->ds.end < to) | ||
95 | return -1; | ||
96 | |||
97 | if (!trace->ds.size) | ||
98 | return -1; | ||
99 | |||
100 | /* Now to the test itself. */ | ||
101 | for (at = from; (void *)at < to; at += trace->ds.size) { | ||
102 | struct bts_struct bts; | ||
103 | unsigned long index; | ||
104 | int error; | ||
105 | |||
106 | if (((void *)at - trace->ds.begin) % trace->ds.size) { | ||
107 | printk(KERN_CONT | ||
108 | "read from non-integer index..."); | ||
109 | return -1; | ||
110 | } | ||
111 | index = ((void *)at - trace->ds.begin) / trace->ds.size; | ||
112 | |||
113 | memset(&bts, 0, sizeof(bts)); | ||
114 | error = trace->read(tracer, at, &bts); | ||
115 | if (error < 0) { | ||
116 | printk(KERN_CONT | ||
117 | "error reading bts trace at [%lu] (0x%p)...", | ||
118 | index, at); | ||
119 | return error; | ||
120 | } | ||
121 | |||
122 | switch (bts.qualifier) { | ||
123 | case BTS_BRANCH: | ||
124 | break; | ||
125 | default: | ||
126 | printk(KERN_CONT | ||
127 | "unexpected bts entry %llu at [%lu] (0x%p)...", | ||
128 | bts.qualifier, index, at); | ||
129 | return -1; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | return 0; | ||
134 | } | ||
135 | |||
136 | static void ds_selftest_bts_cpu(void *arg) | ||
137 | { | ||
138 | struct ds_selftest_bts_conf *conf = arg; | ||
139 | const struct bts_trace *trace; | ||
140 | void *top; | ||
141 | |||
142 | if (IS_ERR(conf->tracer)) { | ||
143 | conf->error = PTR_ERR(conf->tracer); | ||
144 | conf->tracer = NULL; | ||
145 | |||
146 | printk(KERN_CONT | ||
147 | "initialization failed (err: %d)...", conf->error); | ||
148 | return; | ||
149 | } | ||
150 | |||
151 | /* By now, we should have collected enough trace. */ | ||
152 | conf->error = conf->suspend(conf->tracer); | ||
153 | if (conf->error < 0) | ||
154 | return; | ||
155 | |||
156 | /* Let's see if we can access the trace. */ | ||
157 | trace = ds_read_bts(conf->tracer); | ||
158 | |||
159 | conf->error = ds_selftest_bts_consistency(trace); | ||
160 | if (conf->error < 0) | ||
161 | return; | ||
162 | |||
163 | /* If everything went well, we should have a few trace entries. */ | ||
164 | if (trace->ds.top == trace->ds.begin) { | ||
165 | /* | ||
166 | * It is possible but highly unlikely that we got a | ||
167 | * buffer overflow and end up at exactly the same | ||
168 | * position we started from. | ||
169 | * Let's issue a warning, but continue. | ||
170 | */ | ||
171 | printk(KERN_CONT "no trace/overflow..."); | ||
172 | } | ||
173 | |||
174 | /* Let's try to read the trace we collected. */ | ||
175 | conf->error = | ||
176 | ds_selftest_bts_read(conf->tracer, trace, | ||
177 | trace->ds.begin, trace->ds.top); | ||
178 | if (conf->error < 0) | ||
179 | return; | ||
180 | |||
181 | /* | ||
182 | * Let's read the trace again. | ||
183 | * Since we suspended tracing, we should get the same result. | ||
184 | */ | ||
185 | top = trace->ds.top; | ||
186 | |||
187 | trace = ds_read_bts(conf->tracer); | ||
188 | conf->error = ds_selftest_bts_consistency(trace); | ||
189 | if (conf->error < 0) | ||
190 | return; | ||
191 | |||
192 | if (top != trace->ds.top) { | ||
193 | printk(KERN_CONT "suspend not working..."); | ||
194 | conf->error = -1; | ||
195 | return; | ||
196 | } | ||
197 | |||
198 | /* Let's collect some more trace - see if resume is working. */ | ||
199 | conf->error = conf->resume(conf->tracer); | ||
200 | if (conf->error < 0) | ||
201 | return; | ||
202 | |||
203 | conf->error = conf->suspend(conf->tracer); | ||
204 | if (conf->error < 0) | ||
205 | return; | ||
206 | |||
207 | trace = ds_read_bts(conf->tracer); | ||
208 | |||
209 | conf->error = ds_selftest_bts_consistency(trace); | ||
210 | if (conf->error < 0) | ||
211 | return; | ||
212 | |||
213 | if (trace->ds.top == top) { | ||
214 | /* | ||
215 | * It is possible but highly unlikely that we got a | ||
216 | * buffer overflow and end up at exactly the same | ||
217 | * position we started from. | ||
218 | * Let's issue a warning and check the full trace. | ||
219 | */ | ||
220 | printk(KERN_CONT | ||
221 | "no resume progress/overflow..."); | ||
222 | |||
223 | conf->error = | ||
224 | ds_selftest_bts_read(conf->tracer, trace, | ||
225 | trace->ds.begin, trace->ds.end); | ||
226 | } else if (trace->ds.top < top) { | ||
227 | /* | ||
228 | * We had a buffer overflow - the entire buffer should | ||
229 | * contain trace records. | ||
230 | */ | ||
231 | conf->error = | ||
232 | ds_selftest_bts_read(conf->tracer, trace, | ||
233 | trace->ds.begin, trace->ds.end); | ||
234 | } else { | ||
235 | /* | ||
236 | * It is quite likely that the buffer did not overflow. | ||
237 | * Let's just check the delta trace. | ||
238 | */ | ||
239 | conf->error = | ||
240 | ds_selftest_bts_read(conf->tracer, trace, top, | ||
241 | trace->ds.top); | ||
242 | } | ||
243 | if (conf->error < 0) | ||
244 | return; | ||
245 | |||
246 | conf->error = 0; | ||
247 | } | ||
248 | |||
249 | static int ds_suspend_bts_wrap(struct bts_tracer *tracer) | ||
250 | { | ||
251 | ds_suspend_bts(tracer); | ||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | static int ds_resume_bts_wrap(struct bts_tracer *tracer) | ||
256 | { | ||
257 | ds_resume_bts(tracer); | ||
258 | return 0; | ||
259 | } | ||
260 | |||
261 | static void ds_release_bts_noirq_wrap(void *tracer) | ||
262 | { | ||
263 | (void)ds_release_bts_noirq(tracer); | ||
264 | } | ||
265 | |||
266 | static int ds_selftest_bts_bad_release_noirq(int cpu, | ||
267 | struct bts_tracer *tracer) | ||
268 | { | ||
269 | int error = -EPERM; | ||
270 | |||
271 | /* Try to release the tracer on the wrong cpu. */ | ||
272 | get_cpu(); | ||
273 | if (cpu != smp_processor_id()) { | ||
274 | error = ds_release_bts_noirq(tracer); | ||
275 | if (error != -EPERM) | ||
276 | printk(KERN_CONT "release on wrong cpu..."); | ||
277 | } | ||
278 | put_cpu(); | ||
279 | |||
280 | return error ? 0 : -1; | ||
281 | } | ||
282 | |||
283 | static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) | ||
284 | { | ||
285 | struct bts_tracer *tracer; | ||
286 | int error; | ||
287 | |||
288 | /* Try to request cpu tracing while task tracing is active. */ | ||
289 | tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, | ||
290 | (size_t)-1, BTS_KERNEL); | ||
291 | error = PTR_ERR(tracer); | ||
292 | if (!IS_ERR(tracer)) { | ||
293 | ds_release_bts(tracer); | ||
294 | error = 0; | ||
295 | } | ||
296 | |||
297 | if (error != -EPERM) | ||
298 | printk(KERN_CONT "cpu/task tracing overlap..."); | ||
299 | |||
300 | return error ? 0 : -1; | ||
301 | } | ||
302 | |||
303 | static int ds_selftest_bts_bad_request_task(void *buffer) | ||
304 | { | ||
305 | struct bts_tracer *tracer; | ||
306 | int error; | ||
307 | |||
308 | /* Try to request task tracing while cpu tracing is active. */ | ||
309 | tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, | ||
310 | (size_t)-1, BTS_KERNEL); | ||
311 | error = PTR_ERR(tracer); | ||
312 | if (!IS_ERR(tracer)) { | ||
313 | error = 0; | ||
314 | ds_release_bts(tracer); | ||
315 | } | ||
316 | |||
317 | if (error != -EPERM) | ||
318 | printk(KERN_CONT "task/cpu tracing overlap..."); | ||
319 | |||
320 | return error ? 0 : -1; | ||
321 | } | ||
322 | |||
323 | int ds_selftest_bts(void) | ||
324 | { | ||
325 | struct ds_selftest_bts_conf conf; | ||
326 | unsigned char buffer[BUFFER_SIZE], *small_buffer; | ||
327 | unsigned long irq; | ||
328 | int cpu; | ||
329 | |||
330 | printk(KERN_INFO "[ds] bts selftest..."); | ||
331 | conf.error = 0; | ||
332 | |||
333 | small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; | ||
334 | |||
335 | get_online_cpus(); | ||
336 | for_each_online_cpu(cpu) { | ||
337 | conf.suspend = ds_suspend_bts_wrap; | ||
338 | conf.resume = ds_resume_bts_wrap; | ||
339 | conf.tracer = | ||
340 | ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, | ||
341 | NULL, (size_t)-1, BTS_KERNEL); | ||
342 | ds_selftest_bts_cpu(&conf); | ||
343 | if (conf.error >= 0) | ||
344 | conf.error = ds_selftest_bts_bad_request_task(buffer); | ||
345 | ds_release_bts(conf.tracer); | ||
346 | if (conf.error < 0) | ||
347 | goto out; | ||
348 | |||
349 | conf.suspend = ds_suspend_bts_noirq; | ||
350 | conf.resume = ds_resume_bts_noirq; | ||
351 | conf.tracer = | ||
352 | ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, | ||
353 | NULL, (size_t)-1, BTS_KERNEL); | ||
354 | smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); | ||
355 | if (conf.error >= 0) { | ||
356 | conf.error = | ||
357 | ds_selftest_bts_bad_release_noirq(cpu, | ||
358 | conf.tracer); | ||
359 | /* We must not release the tracer twice. */ | ||
360 | if (conf.error < 0) | ||
361 | conf.tracer = NULL; | ||
362 | } | ||
363 | if (conf.error >= 0) | ||
364 | conf.error = ds_selftest_bts_bad_request_task(buffer); | ||
365 | smp_call_function_single(cpu, ds_release_bts_noirq_wrap, | ||
366 | conf.tracer, 1); | ||
367 | if (conf.error < 0) | ||
368 | goto out; | ||
369 | } | ||
370 | |||
371 | conf.suspend = ds_suspend_bts_wrap; | ||
372 | conf.resume = ds_resume_bts_wrap; | ||
373 | conf.tracer = | ||
374 | ds_request_bts_task(current, buffer, BUFFER_SIZE, | ||
375 | NULL, (size_t)-1, BTS_KERNEL); | ||
376 | ds_selftest_bts_cpu(&conf); | ||
377 | if (conf.error >= 0) | ||
378 | conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); | ||
379 | ds_release_bts(conf.tracer); | ||
380 | if (conf.error < 0) | ||
381 | goto out; | ||
382 | |||
383 | conf.suspend = ds_suspend_bts_noirq; | ||
384 | conf.resume = ds_resume_bts_noirq; | ||
385 | conf.tracer = | ||
386 | ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, | ||
387 | NULL, (size_t)-1, BTS_KERNEL); | ||
388 | local_irq_save(irq); | ||
389 | ds_selftest_bts_cpu(&conf); | ||
390 | if (conf.error >= 0) | ||
391 | conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); | ||
392 | ds_release_bts_noirq(conf.tracer); | ||
393 | local_irq_restore(irq); | ||
394 | if (conf.error < 0) | ||
395 | goto out; | ||
396 | |||
397 | conf.error = 0; | ||
398 | out: | ||
399 | put_online_cpus(); | ||
400 | printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); | ||
401 | |||
402 | return conf.error; | ||
403 | } | ||
404 | |||
405 | int ds_selftest_pebs(void) | ||
406 | { | ||
407 | return 0; | ||
408 | } | ||
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h deleted file mode 100644 index 2ba8745c6663..000000000000 --- a/arch/x86/kernel/ds_selftest.h +++ /dev/null | |||
@@ -1,15 +0,0 @@ | |||
1 | /* | ||
2 | * Debug Store support - selftest | ||
3 | * | ||
4 | * | ||
5 | * Copyright (C) 2009 Intel Corporation. | ||
6 | * Markus Metzger <markus.t.metzger@intel.com>, 2009 | ||
7 | */ | ||
8 | |||
9 | #ifdef CONFIG_X86_DS_SELFTEST | ||
10 | extern int ds_selftest_bts(void); | ||
11 | extern int ds_selftest_pebs(void); | ||
12 | #else | ||
13 | static inline int ds_selftest_bts(void) { return 0; } | ||
14 | static inline int ds_selftest_pebs(void) { return 0; } | ||
15 | #endif | ||
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6d817554780a..6e8752c1bd52 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -18,7 +18,6 @@ | |||
18 | 18 | ||
19 | #include <asm/stacktrace.h> | 19 | #include <asm/stacktrace.h> |
20 | 20 | ||
21 | #include "dumpstack.h" | ||
22 | 21 | ||
23 | int panic_on_unrecovered_nmi; | 22 | int panic_on_unrecovered_nmi; |
24 | int panic_on_io_nmi; | 23 | int panic_on_io_nmi; |
@@ -224,11 +223,6 @@ unsigned __kprobes long oops_begin(void) | |||
224 | int cpu; | 223 | int cpu; |
225 | unsigned long flags; | 224 | unsigned long flags; |
226 | 225 | ||
227 | /* notify the hw-branch tracer so it may disable tracing and | ||
228 | add the last trace to the trace buffer - | ||
229 | the earlier this happens, the more useful the trace. */ | ||
230 | trace_hw_branch_oops(); | ||
231 | |||
232 | oops_enter(); | 226 | oops_enter(); |
233 | 227 | ||
234 | /* racy, but better than risking deadlock. */ | 228 | /* racy, but better than risking deadlock. */ |
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd44..000000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null | |||
@@ -1,56 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | |||
6 | #ifndef DUMPSTACK_H | ||
7 | #define DUMPSTACK_H | ||
8 | |||
9 | #ifdef CONFIG_X86_32 | ||
10 | #define STACKSLOTS_PER_LINE 8 | ||
11 | #define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) | ||
12 | #else | ||
13 | #define STACKSLOTS_PER_LINE 4 | ||
14 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) | ||
15 | #endif | ||
16 | |||
17 | #include <linux/uaccess.h> | ||
18 | |||
19 | extern void | ||
20 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
21 | unsigned long *stack, unsigned long bp, char *log_lvl); | ||
22 | |||
23 | extern void | ||
24 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
25 | unsigned long *sp, unsigned long bp, char *log_lvl); | ||
26 | |||
27 | extern unsigned int code_bytes; | ||
28 | |||
29 | /* The form of the top of the frame on the stack */ | ||
30 | struct stack_frame { | ||
31 | struct stack_frame *next_frame; | ||
32 | unsigned long return_address; | ||
33 | }; | ||
34 | |||
35 | struct stack_frame_ia32 { | ||
36 | u32 next_frame; | ||
37 | u32 return_address; | ||
38 | }; | ||
39 | |||
40 | static inline unsigned long rewind_frame_pointer(int n) | ||
41 | { | ||
42 | struct stack_frame *frame; | ||
43 | |||
44 | get_bp(frame); | ||
45 | |||
46 | #ifdef CONFIG_FRAME_POINTER | ||
47 | while (n--) { | ||
48 | if (probe_kernel_address(&frame->next_frame, frame)) | ||
49 | break; | ||
50 | } | ||
51 | #endif | ||
52 | |||
53 | return (unsigned long)frame; | ||
54 | } | ||
55 | |||
56 | #endif /* DUMPSTACK_H */ | ||
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d93..0f6376ffa2d9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -16,8 +16,6 @@ | |||
16 | 16 | ||
17 | #include <asm/stacktrace.h> | 17 | #include <asm/stacktrace.h> |
18 | 18 | ||
19 | #include "dumpstack.h" | ||
20 | |||
21 | 19 | ||
22 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | 20 | void dump_trace(struct task_struct *task, struct pt_regs *regs, |
23 | unsigned long *stack, unsigned long bp, | 21 | unsigned long *stack, unsigned long bp, |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f3..57a21f11c791 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -16,7 +16,6 @@ | |||
16 | 16 | ||
17 | #include <asm/stacktrace.h> | 17 | #include <asm/stacktrace.h> |
18 | 18 | ||
19 | #include "dumpstack.h" | ||
20 | 19 | ||
21 | #define N_EXCEPTION_STACKS_END \ | 20 | #define N_EXCEPTION_STACKS_END \ |
22 | (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) | 21 | (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7bca3c6a02fb..0d6fc71bedb1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -729,7 +729,7 @@ static int __init e820_mark_nvs_memory(void) | |||
729 | struct e820entry *ei = &e820.map[i]; | 729 | struct e820entry *ei = &e820.map[i]; |
730 | 730 | ||
731 | if (ei->type == E820_NVS) | 731 | if (ei->type == E820_NVS) |
732 | hibernate_nvs_register(ei->addr, ei->size); | 732 | suspend_nvs_register(ei->addr, ei->size); |
733 | } | 733 | } |
734 | 734 | ||
735 | return 0; | 735 | return 0; |
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index ebdb85cf2686..e5cc7e82e60d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/apic.h> | 18 | #include <asm/apic.h> |
19 | #include <asm/iommu.h> | 19 | #include <asm/iommu.h> |
20 | #include <asm/gart.h> | 20 | #include <asm/gart.h> |
21 | #include <asm/hpet.h> | ||
21 | 22 | ||
22 | static void __init fix_hypertransport_config(int num, int slot, int func) | 23 | static void __init fix_hypertransport_config(int num, int slot, int func) |
23 | { | 24 | { |
@@ -191,6 +192,21 @@ static void __init ati_bugs_contd(int num, int slot, int func) | |||
191 | } | 192 | } |
192 | #endif | 193 | #endif |
193 | 194 | ||
195 | /* | ||
196 | * Force the read back of the CMP register in hpet_next_event() | ||
197 | * to work around the problem that the CMP register write seems to be | ||
198 | * delayed. See hpet_next_event() for details. | ||
199 | * | ||
200 | * We do this on all SMBUS incarnations for now until we have more | ||
201 | * information about the affected chipsets. | ||
202 | */ | ||
203 | static void __init ati_hpet_bugs(int num, int slot, int func) | ||
204 | { | ||
205 | #ifdef CONFIG_HPET_TIMER | ||
206 | hpet_readback_cmp = 1; | ||
207 | #endif | ||
208 | } | ||
209 | |||
194 | #define QFLAG_APPLY_ONCE 0x1 | 210 | #define QFLAG_APPLY_ONCE 0x1 |
195 | #define QFLAG_APPLIED 0x2 | 211 | #define QFLAG_APPLIED 0x2 |
196 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) | 212 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) |
@@ -220,6 +236,8 @@ static struct chipset early_qrk[] __initdata = { | |||
220 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, | 236 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, |
221 | { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, | 237 | { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, |
222 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, | 238 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, |
239 | { PCI_VENDOR_ID_ATI, PCI_ANY_ID, | ||
240 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs }, | ||
223 | {} | 241 | {} |
224 | }; | 242 | }; |
225 | 243 | ||
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index b9c830c12b4a..fa99bae75ace 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c | |||
@@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n) | |||
41 | writew(0x720, VGABASE + 2*(max_xpos*j + i)); | 41 | writew(0x720, VGABASE + 2*(max_xpos*j + i)); |
42 | current_ypos = max_ypos-1; | 42 | current_ypos = max_ypos-1; |
43 | } | 43 | } |
44 | #ifdef CONFIG_KGDB_KDB | ||
45 | if (c == '\b') { | ||
46 | if (current_xpos > 0) | ||
47 | current_xpos--; | ||
48 | } else if (c == '\r') { | ||
49 | current_xpos = 0; | ||
50 | } else | ||
51 | #endif | ||
44 | if (c == '\n') { | 52 | if (c == '\n') { |
45 | current_xpos = 0; | 53 | current_xpos = 0; |
46 | current_ypos++; | 54 | current_ypos++; |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 44a8e0dc6737..227d00920d2f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <asm/processor-flags.h> | 53 | #include <asm/processor-flags.h> |
54 | #include <asm/ftrace.h> | 54 | #include <asm/ftrace.h> |
55 | #include <asm/irq_vectors.h> | 55 | #include <asm/irq_vectors.h> |
56 | #include <asm/cpufeature.h> | ||
56 | 57 | ||
57 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ | 58 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
58 | #include <linux/elf-em.h> | 59 | #include <linux/elf-em.h> |
@@ -610,14 +611,14 @@ ldt_ss: | |||
610 | * compensating for the offset by changing to the ESPFIX segment with | 611 | * compensating for the offset by changing to the ESPFIX segment with |
611 | * a base address that matches for the difference. | 612 | * a base address that matches for the difference. |
612 | */ | 613 | */ |
614 | #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) | ||
613 | mov %esp, %edx /* load kernel esp */ | 615 | mov %esp, %edx /* load kernel esp */ |
614 | mov PT_OLDESP(%esp), %eax /* load userspace esp */ | 616 | mov PT_OLDESP(%esp), %eax /* load userspace esp */ |
615 | mov %dx, %ax /* eax: new kernel esp */ | 617 | mov %dx, %ax /* eax: new kernel esp */ |
616 | sub %eax, %edx /* offset (low word is 0) */ | 618 | sub %eax, %edx /* offset (low word is 0) */ |
617 | PER_CPU(gdt_page, %ebx) | ||
618 | shr $16, %edx | 619 | shr $16, %edx |
619 | mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ | 620 | mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ |
620 | mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ | 621 | mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ |
621 | pushl $__ESPFIX_SS | 622 | pushl $__ESPFIX_SS |
622 | CFI_ADJUST_CFA_OFFSET 4 | 623 | CFI_ADJUST_CFA_OFFSET 4 |
623 | push %eax /* new kernel esp */ | 624 | push %eax /* new kernel esp */ |
@@ -790,9 +791,8 @@ ptregs_clone: | |||
790 | * normal stack and adjusts ESP with the matching offset. | 791 | * normal stack and adjusts ESP with the matching offset. |
791 | */ | 792 | */ |
792 | /* fixup the stack */ | 793 | /* fixup the stack */ |
793 | PER_CPU(gdt_page, %ebx) | 794 | mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ |
794 | mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ | 795 | mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ |
795 | mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ | ||
796 | shl $16, %eax | 796 | shl $16, %eax |
797 | addl %esp, %eax /* the adjusted stack pointer */ | 797 | addl %esp, %eax /* the adjusted stack pointer */ |
798 | pushl $__KERNEL_DS | 798 | pushl $__KERNEL_DS |
@@ -905,7 +905,25 @@ ENTRY(simd_coprocessor_error) | |||
905 | RING0_INT_FRAME | 905 | RING0_INT_FRAME |
906 | pushl $0 | 906 | pushl $0 |
907 | CFI_ADJUST_CFA_OFFSET 4 | 907 | CFI_ADJUST_CFA_OFFSET 4 |
908 | #ifdef CONFIG_X86_INVD_BUG | ||
909 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ | ||
910 | 661: pushl $do_general_protection | ||
911 | 662: | ||
912 | .section .altinstructions,"a" | ||
913 | .balign 4 | ||
914 | .long 661b | ||
915 | .long 663f | ||
916 | .word X86_FEATURE_XMM | ||
917 | .byte 662b-661b | ||
918 | .byte 664f-663f | ||
919 | .previous | ||
920 | .section .altinstr_replacement,"ax" | ||
921 | 663: pushl $do_simd_coprocessor_error | ||
922 | 664: | ||
923 | .previous | ||
924 | #else | ||
908 | pushl $do_simd_coprocessor_error | 925 | pushl $do_simd_coprocessor_error |
926 | #endif | ||
909 | CFI_ADJUST_CFA_OFFSET 4 | 927 | CFI_ADJUST_CFA_OFFSET 4 |
910 | jmp error_code | 928 | jmp error_code |
911 | CFI_ENDPROC | 929 | CFI_ENDPROC |
@@ -1147,6 +1165,9 @@ ENTRY(xen_failsafe_callback) | |||
1147 | .previous | 1165 | .previous |
1148 | ENDPROC(xen_failsafe_callback) | 1166 | ENDPROC(xen_failsafe_callback) |
1149 | 1167 | ||
1168 | BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, | ||
1169 | xen_evtchn_do_upcall) | ||
1170 | |||
1150 | #endif /* CONFIG_XEN */ | 1171 | #endif /* CONFIG_XEN */ |
1151 | 1172 | ||
1152 | #ifdef CONFIG_FUNCTION_TRACER | 1173 | #ifdef CONFIG_FUNCTION_TRACER |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0697ff139837..c5ea5cdbe7b3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -571,8 +571,8 @@ auditsys: | |||
571 | * masked off. | 571 | * masked off. |
572 | */ | 572 | */ |
573 | sysret_audit: | 573 | sysret_audit: |
574 | movq %rax,%rsi /* second arg, syscall return value */ | 574 | movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ |
575 | cmpq $0,%rax /* is it < 0? */ | 575 | cmpq $0,%rsi /* is it < 0? */ |
576 | setl %al /* 1 if so, 0 if not */ | 576 | setl %al /* 1 if so, 0 if not */ |
577 | movzbl %al,%edi /* zero-extend that into %edi */ | 577 | movzbl %al,%edi /* zero-extend that into %edi */ |
578 | inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ | 578 | inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ |
@@ -1065,6 +1065,7 @@ ENTRY(\sym) | |||
1065 | END(\sym) | 1065 | END(\sym) |
1066 | .endm | 1066 | .endm |
1067 | 1067 | ||
1068 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) | ||
1068 | .macro paranoidzeroentry_ist sym do_sym ist | 1069 | .macro paranoidzeroentry_ist sym do_sym ist |
1069 | ENTRY(\sym) | 1070 | ENTRY(\sym) |
1070 | INTR_FRAME | 1071 | INTR_FRAME |
@@ -1076,10 +1077,9 @@ ENTRY(\sym) | |||
1076 | TRACE_IRQS_OFF | 1077 | TRACE_IRQS_OFF |
1077 | movq %rsp,%rdi /* pt_regs pointer */ | 1078 | movq %rsp,%rdi /* pt_regs pointer */ |
1078 | xorl %esi,%esi /* no error code */ | 1079 | xorl %esi,%esi /* no error code */ |
1079 | PER_CPU(init_tss, %r12) | 1080 | subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) |
1080 | subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) | ||
1081 | call \do_sym | 1081 | call \do_sym |
1082 | addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) | 1082 | addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) |
1083 | jmp paranoid_exit /* %ebx: no swapgs flag */ | 1083 | jmp paranoid_exit /* %ebx: no swapgs flag */ |
1084 | CFI_ENDPROC | 1084 | CFI_ENDPROC |
1085 | END(\sym) | 1085 | END(\sym) |
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) | |||
1329 | CFI_ENDPROC | 1329 | CFI_ENDPROC |
1330 | END(xen_failsafe_callback) | 1330 | END(xen_failsafe_callback) |
1331 | 1331 | ||
1332 | apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ | ||
1333 | xen_hvm_callback_vector xen_evtchn_do_upcall | ||
1334 | |||
1332 | #endif /* CONFIG_XEN */ | 1335 | #endif /* CONFIG_XEN */ |
1333 | 1336 | ||
1334 | /* | 1337 | /* |
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e246037392..784360c0625c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c | |||
@@ -20,7 +20,7 @@ | |||
20 | 20 | ||
21 | static void __init i386_default_early_setup(void) | 21 | static void __init i386_default_early_setup(void) |
22 | { | 22 | { |
23 | /* Initilize 32bit specific setup functions */ | 23 | /* Initialize 32bit specific setup functions */ |
24 | x86_init.resources.probe_roms = probe_roms; | 24 | x86_init.resources.probe_roms = probe_roms; |
25 | x86_init.resources.reserve_resources = i386_reserve_resources; | 25 | x86_init.resources.reserve_resources = i386_reserve_resources; |
26 | x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; | 26 | x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d85..ff4c453e13f3 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -131,6 +131,12 @@ ENTRY(startup_32) | |||
131 | movsl | 131 | movsl |
132 | 1: | 132 | 1: |
133 | 133 | ||
134 | #ifdef CONFIG_OLPC_OPENFIRMWARE | ||
135 | /* save OFW's pgdir table for later use when calling into OFW */ | ||
136 | movl %cr3, %eax | ||
137 | movl %eax, pa(olpc_ofw_pgd) | ||
138 | #endif | ||
139 | |||
134 | #ifdef CONFIG_PARAVIRT | 140 | #ifdef CONFIG_PARAVIRT |
135 | /* This can only trip for a broken bootloader... */ | 141 | /* This can only trip for a broken bootloader... */ |
136 | cmpw $0x207, pa(boot_params + BP_version) | 142 | cmpw $0x207, pa(boot_params + BP_version) |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a6..239046bd447f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) | |||
234 | * init data section till per cpu areas are set up. | 234 | * init data section till per cpu areas are set up. |
235 | */ | 235 | */ |
236 | movl $MSR_GS_BASE,%ecx | 236 | movl $MSR_GS_BASE,%ecx |
237 | movq initial_gs(%rip),%rax | 237 | movl initial_gs(%rip),%eax |
238 | movq %rax,%rdx | 238 | movl initial_gs+4(%rip),%edx |
239 | shrq $32,%rdx | ||
240 | wrmsr | 239 | wrmsr |
241 | 240 | ||
242 | /* esi is pointer to real mode structure with interesting info. | 241 | /* esi is pointer to real mode structure with interesting info. |
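A hedged C rendering of the rewritten MSR load above; wrmsr consumes the 64-bit value as an EDX:EAX pair, so loading the two 32-bit halves of initial_gs directly saves the movq/shrq shuffle (the helper name below is made up for illustration):

    /* Sketch: the two movl instructions feed wrmsr's EDX:EAX pair. */
    static inline void wrmsr_split(unsigned int msr, unsigned long long val)
    {
            unsigned int lo = (unsigned int)val;          /* initial_gs(%rip)   -> %eax */
            unsigned int hi = (unsigned int)(val >> 32);  /* initial_gs+4(%rip) -> %edx */

            asm volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi));
    }
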
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 23b4ecdffa9b..33dbcc4ec5ff 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <asm/hpet.h> | 16 | #include <asm/hpet.h> |
17 | 17 | ||
18 | #define HPET_MASK CLOCKSOURCE_MASK(32) | 18 | #define HPET_MASK CLOCKSOURCE_MASK(32) |
19 | #define HPET_SHIFT 22 | ||
20 | 19 | ||
21 | /* FSEC = 10^-15 | 20 | /* FSEC = 10^-15 |
22 | NSEC = 10^-9 */ | 21 | NSEC = 10^-9 */ |
@@ -36,6 +35,7 @@ | |||
36 | unsigned long hpet_address; | 35 | unsigned long hpet_address; |
37 | u8 hpet_blockid; /* OS timer block num */ | 36 | u8 hpet_blockid; /* OS timer block num */ |
38 | u8 hpet_msi_disable; | 37 | u8 hpet_msi_disable; |
38 | u8 hpet_readback_cmp; | ||
39 | 39 | ||
40 | #ifdef CONFIG_PCI_MSI | 40 | #ifdef CONFIG_PCI_MSI |
41 | static unsigned long hpet_num_timers; | 41 | static unsigned long hpet_num_timers; |
@@ -395,19 +395,23 @@ static int hpet_next_event(unsigned long delta, | |||
395 | * at that point and we would wait for the next hpet interrupt | 395 | * at that point and we would wait for the next hpet interrupt |
396 | * forever. We found out that reading the CMP register back | 396 | * forever. We found out that reading the CMP register back |
397 | * forces the transfer so we can rely on the comparison with | 397 | * forces the transfer so we can rely on the comparison with |
398 | * the counter register below. If the read back from the | 398 | * the counter register below. |
399 | * compare register does not match the value we programmed | 399 | * |
400 | * then we might have a real hardware problem. We can not do | 400 | * That works fine on those ATI chipsets, but on newer Intel |
401 | * much about it here, but at least alert the user/admin with | 401 | * chipsets (ICH9...) this triggers due to an erratum: Reading |
402 | * a prominent warning. | 402 | * the comparator immediately following a write is returning |
403 | * An erratum on some chipsets (ICH9,..), results in comparator read | 403 | * the old value. |
404 | * immediately following a write returning old value. Workaround | 404 | * |
405 | * for this is to read this value second time, when first | 405 | * We restrict the read back to the affected ATI chipsets (set |
406 | * read returns old value. | 406 | * by quirks) and also run it with hpet=verbose for debugging |
407 | * purposes. | ||
407 | */ | 408 | */ |
408 | if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { | 409 | if (hpet_readback_cmp || hpet_verbose) { |
409 | WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, | 410 | u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); |
410 | KERN_WARNING "hpet: compare register read back failed.\n"); | 411 | |
412 | if (cmp != cnt) | ||
413 | printk_once(KERN_WARNING | ||
414 | "hpet: compare register read back failed.\n"); | ||
411 | } | 415 | } |
412 | 416 | ||
413 | return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; | 417 | return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; |
@@ -782,7 +786,6 @@ static struct clocksource clocksource_hpet = { | |||
782 | .rating = 250, | 786 | .rating = 250, |
783 | .read = read_hpet, | 787 | .read = read_hpet, |
784 | .mask = HPET_MASK, | 788 | .mask = HPET_MASK, |
785 | .shift = HPET_SHIFT, | ||
786 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 789 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
787 | .resume = hpet_resume_counter, | 790 | .resume = hpet_resume_counter, |
788 | #ifdef CONFIG_X86_64 | 791 | #ifdef CONFIG_X86_64 |
@@ -793,6 +796,7 @@ static struct clocksource clocksource_hpet = { | |||
793 | static int hpet_clocksource_register(void) | 796 | static int hpet_clocksource_register(void) |
794 | { | 797 | { |
795 | u64 start, now; | 798 | u64 start, now; |
799 | u64 hpet_freq; | ||
796 | cycle_t t1; | 800 | cycle_t t1; |
797 | 801 | ||
798 | /* Start the counter */ | 802 | /* Start the counter */ |
@@ -827,9 +831,15 @@ static int hpet_clocksource_register(void) | |||
827 | * mult = (hpet_period * 2^shift)/10^6 | 831 | * mult = (hpet_period * 2^shift)/10^6 |
828 | * mult = (hpet_period << shift)/FSEC_PER_NSEC | 832 | * mult = (hpet_period << shift)/FSEC_PER_NSEC |
829 | */ | 833 | */ |
830 | clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); | ||
831 | 834 | ||
832 | clocksource_register(&clocksource_hpet); | 835 | /* Need to convert hpet_period (fsec/cyc) to cyc/sec: |
836 | * | ||
837 | * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) | ||
838 | * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period | ||
839 | */ | ||
840 | hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; | ||
841 | do_div(hpet_freq, hpet_period); | ||
842 | clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); | ||
833 | 843 | ||
834 | return 0; | 844 | return 0; |
835 | } | 845 | } |
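The new period-to-frequency conversion can be checked in isolation. A userspace sketch, assuming the nominal 69841279 fsec/cyc period of a 14.31818 MHz HPET (an illustrative value only), with plain 64-bit division standing in for the kernel's do_div() helper:

    #include <stdint.h>
    #include <stdio.h>

    #define FSEC_PER_NSEC 1000000ULL
    #define NSEC_PER_SEC  1000000000ULL

    int main(void)
    {
            uint64_t hpet_period = 69841279;        /* femtoseconds per cycle */
            uint64_t hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; /* fsec per second */

            hpet_freq /= hpet_period;               /* cycles per second */
            printf("%llu Hz\n", (unsigned long long)hpet_freq); /* ~14318179 */
            return 0;
    }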
@@ -959,7 +969,7 @@ fs_initcall(hpet_late_init); | |||
959 | 969 | ||
960 | void hpet_disable(void) | 970 | void hpet_disable(void) |
961 | { | 971 | { |
962 | if (is_hpet_capable()) { | 972 | if (is_hpet_capable() && hpet_virt_address) { |
963 | unsigned int cfg = hpet_readl(HPET_CFG); | 973 | unsigned int cfg = hpet_readl(HPET_CFG); |
964 | 974 | ||
965 | if (hpet_legacy_int_enabled) { | 975 | if (hpet_legacy_int_enabled) { |
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d6cc065f519f..a474ec37c32f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c | |||
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len) | |||
189 | } | 189 | } |
190 | 190 | ||
191 | /* | 191 | /* |
192 | * Check for virtual address in user space. | ||
193 | */ | ||
194 | int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) | ||
195 | { | ||
196 | unsigned int len; | ||
197 | |||
198 | len = get_hbp_len(hbp_len); | ||
199 | |||
200 | return (va <= TASK_SIZE - len); | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Check for virtual address in kernel space. | 192 | * Check for virtual address in kernel space. |
205 | */ | 193 | */ |
206 | static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) | 194 | int arch_check_bp_in_kernelspace(struct perf_event *bp) |
207 | { | 195 | { |
208 | unsigned int len; | 196 | unsigned int len; |
197 | unsigned long va; | ||
198 | struct arch_hw_breakpoint *info = counter_arch_bp(bp); | ||
209 | 199 | ||
210 | len = get_hbp_len(hbp_len); | 200 | va = info->address; |
201 | len = get_hbp_len(info->len); | ||
211 | 202 | ||
212 | return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); | 203 | return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); |
213 | } | 204 | } |
@@ -217,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type, | |||
217 | { | 208 | { |
218 | /* Len */ | 209 | /* Len */ |
219 | switch (x86_len) { | 210 | switch (x86_len) { |
211 | case X86_BREAKPOINT_LEN_X: | ||
212 | *gen_len = sizeof(long); | ||
213 | break; | ||
220 | case X86_BREAKPOINT_LEN_1: | 214 | case X86_BREAKPOINT_LEN_1: |
221 | *gen_len = HW_BREAKPOINT_LEN_1; | 215 | *gen_len = HW_BREAKPOINT_LEN_1; |
222 | break; | 216 | break; |
@@ -260,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp) | |||
260 | 254 | ||
261 | info->address = bp->attr.bp_addr; | 255 | info->address = bp->attr.bp_addr; |
262 | 256 | ||
257 | /* Type */ | ||
258 | switch (bp->attr.bp_type) { | ||
259 | case HW_BREAKPOINT_W: | ||
260 | info->type = X86_BREAKPOINT_WRITE; | ||
261 | break; | ||
262 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
263 | info->type = X86_BREAKPOINT_RW; | ||
264 | break; | ||
265 | case HW_BREAKPOINT_X: | ||
266 | info->type = X86_BREAKPOINT_EXECUTE; | ||
267 | /* | ||
268 | * x86 inst breakpoints need to have a specific undefined len. | ||
269 | * But we still need to check that userspace is not trying to set up | ||
270 | * an unsupported length, to get a range breakpoint for example. | ||
271 | */ | ||
272 | if (bp->attr.bp_len == sizeof(long)) { | ||
273 | info->len = X86_BREAKPOINT_LEN_X; | ||
274 | return 0; | ||
275 | } | ||
276 | default: | ||
277 | return -EINVAL; | ||
278 | } | ||
279 | |||
263 | /* Len */ | 280 | /* Len */ |
264 | switch (bp->attr.bp_len) { | 281 | switch (bp->attr.bp_len) { |
265 | case HW_BREAKPOINT_LEN_1: | 282 | case HW_BREAKPOINT_LEN_1: |
@@ -280,28 +297,12 @@ static int arch_build_bp_info(struct perf_event *bp) | |||
280 | return -EINVAL; | 297 | return -EINVAL; |
281 | } | 298 | } |
282 | 299 | ||
283 | /* Type */ | ||
284 | switch (bp->attr.bp_type) { | ||
285 | case HW_BREAKPOINT_W: | ||
286 | info->type = X86_BREAKPOINT_WRITE; | ||
287 | break; | ||
288 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
289 | info->type = X86_BREAKPOINT_RW; | ||
290 | break; | ||
291 | case HW_BREAKPOINT_X: | ||
292 | info->type = X86_BREAKPOINT_EXECUTE; | ||
293 | break; | ||
294 | default: | ||
295 | return -EINVAL; | ||
296 | } | ||
297 | |||
298 | return 0; | 300 | return 0; |
299 | } | 301 | } |
300 | /* | 302 | /* |
301 | * Validate the arch-specific HW Breakpoint register settings | 303 | * Validate the arch-specific HW Breakpoint register settings |
302 | */ | 304 | */ |
303 | int arch_validate_hwbkpt_settings(struct perf_event *bp, | 305 | int arch_validate_hwbkpt_settings(struct perf_event *bp) |
304 | struct task_struct *tsk) | ||
305 | { | 306 | { |
306 | struct arch_hw_breakpoint *info = counter_arch_bp(bp); | 307 | struct arch_hw_breakpoint *info = counter_arch_bp(bp); |
307 | unsigned int align; | 308 | unsigned int align; |
@@ -314,17 +315,10 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, | |||
314 | 315 | ||
315 | ret = -EINVAL; | 316 | ret = -EINVAL; |
316 | 317 | ||
317 | if (info->type == X86_BREAKPOINT_EXECUTE) | ||
318 | /* | ||
319 | * Ptrace-refactoring code | ||
320 | * For now, we'll allow instruction breakpoint only for user-space | ||
321 | * addresses | ||
322 | */ | ||
323 | if ((!arch_check_va_in_userspace(info->address, info->len)) && | ||
324 | info->len != X86_BREAKPOINT_EXECUTE) | ||
325 | return ret; | ||
326 | |||
327 | switch (info->len) { | 318 | switch (info->len) { |
319 | case X86_BREAKPOINT_LEN_X: | ||
320 | align = sizeof(long) - 1; | ||
321 | break; | ||
328 | case X86_BREAKPOINT_LEN_1: | 322 | case X86_BREAKPOINT_LEN_1: |
329 | align = 0; | 323 | align = 0; |
330 | break; | 324 | break; |
@@ -350,15 +344,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, | |||
350 | if (info->address & align) | 344 | if (info->address & align) |
351 | return -EINVAL; | 345 | return -EINVAL; |
352 | 346 | ||
353 | /* Check that the virtual address is in the proper range */ | ||
354 | if (tsk) { | ||
355 | if (!arch_check_va_in_userspace(info->address, info->len)) | ||
356 | return -EFAULT; | ||
357 | } else { | ||
358 | if (!arch_check_va_in_kernelspace(info->address, info->len)) | ||
359 | return -EFAULT; | ||
360 | } | ||
361 | |||
362 | return 0; | 347 | return 0; |
363 | } | 348 | } |
364 | 349 | ||
@@ -495,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) | |||
495 | 480 | ||
496 | perf_bp_event(bp, args->regs); | 481 | perf_bp_event(bp, args->regs); |
497 | 482 | ||
483 | /* | ||
484 | * Set up resume flag to avoid breakpoint recursion when | ||
485 | * returning back to origin. | ||
486 | */ | ||
487 | if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) | ||
488 | args->regs->flags |= X86_EFLAGS_RF; | ||
489 | |||
498 | rcu_read_unlock(); | 490 | rcu_read_unlock(); |
499 | } | 491 | } |
500 | /* | 492 | /* |
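A compact sketch of the kernel-space test that the consolidated arch_check_bp_in_kernelspace() performs above, with TASK_SIZE reduced to an illustrative constant (the real value is per-architecture); the second term also rejects a range whose end wraps past the top of the address space:

    #include <stdbool.h>

    #define TASK_SIZE 0xc0000000UL  /* illustrative 32-bit split */

    static bool bp_in_kernelspace(unsigned long va, unsigned int len)
    {
            /* both the first and the last byte covered by the
             * breakpoint must sit at or above TASK_SIZE */
            return va >= TASK_SIZE && (va + len - 1) >= TASK_SIZE;
    }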
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 54c31c285488..1f11f5ce668f 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) | |||
59 | stts(); | 59 | stts(); |
60 | } | 60 | } |
61 | 61 | ||
62 | void __cpuinit init_thread_xstate(void) | 62 | static void __cpuinit init_thread_xstate(void) |
63 | { | 63 | { |
64 | /* | ||
65 | * Note that xstate_size might be overwritten later during | ||
66 | * xsave_init(). | ||
67 | */ | ||
68 | |||
64 | if (!HAVE_HWFP) { | 69 | if (!HAVE_HWFP) { |
65 | xstate_size = sizeof(struct i387_soft_struct); | 70 | xstate_size = sizeof(struct i387_soft_struct); |
66 | return; | 71 | return; |
67 | } | 72 | } |
68 | 73 | ||
69 | if (cpu_has_xsave) { | ||
70 | xsave_cntxt_init(); | ||
71 | return; | ||
72 | } | ||
73 | |||
74 | if (cpu_has_fxsr) | 74 | if (cpu_has_fxsr) |
75 | xstate_size = sizeof(struct i387_fxsave_struct); | 75 | xstate_size = sizeof(struct i387_fxsave_struct); |
76 | #ifdef CONFIG_X86_32 | 76 | #ifdef CONFIG_X86_32 |
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) | |||
84 | * Called at bootup to set up the initial FPU state that is later cloned | 84 | * Called at bootup to set up the initial FPU state that is later cloned |
85 | * into all processes. | 85 | * into all processes. |
86 | */ | 86 | */ |
87 | |||
87 | void __cpuinit fpu_init(void) | 88 | void __cpuinit fpu_init(void) |
88 | { | 89 | { |
89 | unsigned long oldcr0 = read_cr0(); | 90 | unsigned long oldcr0 = read_cr0(); |
@@ -93,74 +94,77 @@ void __cpuinit fpu_init(void) | |||
93 | 94 | ||
94 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ | 95 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ |
95 | 96 | ||
96 | /* | ||
97 | * Boot processor to setup the FP and extended state context info. | ||
98 | */ | ||
99 | if (!smp_processor_id()) | 97 | if (!smp_processor_id()) |
100 | init_thread_xstate(); | 98 | init_thread_xstate(); |
101 | xsave_init(); | ||
102 | 99 | ||
103 | mxcsr_feature_mask_init(); | 100 | mxcsr_feature_mask_init(); |
104 | /* clean state in init */ | 101 | /* clean state in init */ |
105 | if (cpu_has_xsave) | 102 | current_thread_info()->status = 0; |
106 | current_thread_info()->status = TS_XSAVE; | ||
107 | else | ||
108 | current_thread_info()->status = 0; | ||
109 | clear_used_math(); | 103 | clear_used_math(); |
110 | } | 104 | } |
111 | #endif /* CONFIG_X86_64 */ | ||
112 | 105 | ||
113 | /* | 106 | #else /* CONFIG_X86_64 */ |
114 | * The _current_ task is using the FPU for the first time | 107 | |
115 | * so initialize it and set the mxcsr to its default | 108 | void __cpuinit fpu_init(void) |
116 | * value at reset if we support XMM instructions and then | ||
117 | * remember the current task has used the FPU. | ||
118 | */ | ||
119 | int init_fpu(struct task_struct *tsk) | ||
120 | { | 109 | { |
121 | if (tsk_used_math(tsk)) { | 110 | if (!smp_processor_id()) |
122 | if (HAVE_HWFP && tsk == current) | 111 | init_thread_xstate(); |
123 | unlazy_fpu(tsk); | 112 | } |
124 | return 0; | ||
125 | } | ||
126 | 113 | ||
127 | /* | 114 | #endif /* CONFIG_X86_32 */ |
128 | * Memory allocation at the first usage of the FPU and other state. | ||
129 | */ | ||
130 | if (!tsk->thread.xstate) { | ||
131 | tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep, | ||
132 | GFP_KERNEL); | ||
133 | if (!tsk->thread.xstate) | ||
134 | return -ENOMEM; | ||
135 | } | ||
136 | 115 | ||
116 | void fpu_finit(struct fpu *fpu) | ||
117 | { | ||
137 | #ifdef CONFIG_X86_32 | 118 | #ifdef CONFIG_X86_32 |
138 | if (!HAVE_HWFP) { | 119 | if (!HAVE_HWFP) { |
139 | memset(tsk->thread.xstate, 0, xstate_size); | 120 | finit_soft_fpu(&fpu->state->soft); |
140 | finit_task(tsk); | 121 | return; |
141 | set_stopped_child_used_math(tsk); | ||
142 | return 0; | ||
143 | } | 122 | } |
144 | #endif | 123 | #endif |
145 | 124 | ||
146 | if (cpu_has_fxsr) { | 125 | if (cpu_has_fxsr) { |
147 | struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; | 126 | struct i387_fxsave_struct *fx = &fpu->state->fxsave; |
148 | 127 | ||
149 | memset(fx, 0, xstate_size); | 128 | memset(fx, 0, xstate_size); |
150 | fx->cwd = 0x37f; | 129 | fx->cwd = 0x37f; |
151 | if (cpu_has_xmm) | 130 | if (cpu_has_xmm) |
152 | fx->mxcsr = MXCSR_DEFAULT; | 131 | fx->mxcsr = MXCSR_DEFAULT; |
153 | } else { | 132 | } else { |
154 | struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; | 133 | struct i387_fsave_struct *fp = &fpu->state->fsave; |
155 | memset(fp, 0, xstate_size); | 134 | memset(fp, 0, xstate_size); |
156 | fp->cwd = 0xffff037fu; | 135 | fp->cwd = 0xffff037fu; |
157 | fp->swd = 0xffff0000u; | 136 | fp->swd = 0xffff0000u; |
158 | fp->twd = 0xffffffffu; | 137 | fp->twd = 0xffffffffu; |
159 | fp->fos = 0xffff0000u; | 138 | fp->fos = 0xffff0000u; |
160 | } | 139 | } |
140 | } | ||
141 | EXPORT_SYMBOL_GPL(fpu_finit); | ||
142 | |||
143 | /* | ||
144 | * The _current_ task is using the FPU for the first time | ||
145 | * so initialize it and set the mxcsr to its default | ||
146 | * value at reset if we support XMM instructions and then | ||
147 | * remember the current task has used the FPU. | ||
148 | */ | ||
149 | int init_fpu(struct task_struct *tsk) | ||
150 | { | ||
151 | int ret; | ||
152 | |||
153 | if (tsk_used_math(tsk)) { | ||
154 | if (HAVE_HWFP && tsk == current) | ||
155 | unlazy_fpu(tsk); | ||
156 | return 0; | ||
157 | } | ||
158 | |||
161 | /* | 159 | /* |
162 | * Only the device not available exception or ptrace can call init_fpu. | 160 | * Memory allocation at the first usage of the FPU and other state. |
163 | */ | 161 | */ |
162 | ret = fpu_alloc(&tsk->thread.fpu); | ||
163 | if (ret) | ||
164 | return ret; | ||
165 | |||
166 | fpu_finit(&tsk->thread.fpu); | ||
167 | |||
164 | set_stopped_child_used_math(tsk); | 168 | set_stopped_child_used_math(tsk); |
165 | return 0; | 169 | return 0; |
166 | } | 170 | } |
@@ -193,8 +197,10 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, | |||
193 | if (ret) | 197 | if (ret) |
194 | return ret; | 198 | return ret; |
195 | 199 | ||
200 | sanitize_i387_state(target); | ||
201 | |||
196 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, | 202 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, |
197 | &target->thread.xstate->fxsave, 0, -1); | 203 | &target->thread.fpu.state->fxsave, 0, -1); |
198 | } | 204 | } |
199 | 205 | ||
200 | int xfpregs_set(struct task_struct *target, const struct user_regset *regset, | 206 | int xfpregs_set(struct task_struct *target, const struct user_regset *regset, |
@@ -210,20 +216,22 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
210 | if (ret) | 216 | if (ret) |
211 | return ret; | 217 | return ret; |
212 | 218 | ||
219 | sanitize_i387_state(target); | ||
220 | |||
213 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, | 221 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, |
214 | &target->thread.xstate->fxsave, 0, -1); | 222 | &target->thread.fpu.state->fxsave, 0, -1); |
215 | 223 | ||
216 | /* | 224 | /* |
217 | * mxcsr reserved bits must be masked to zero for security reasons. | 225 | * mxcsr reserved bits must be masked to zero for security reasons. |
218 | */ | 226 | */ |
219 | target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; | 227 | target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; |
220 | 228 | ||
221 | /* | 229 | /* |
222 | * update the header bits in the xsave header, indicating the | 230 | * update the header bits in the xsave header, indicating the |
223 | * presence of FP and SSE state. | 231 | * presence of FP and SSE state. |
224 | */ | 232 | */ |
225 | if (cpu_has_xsave) | 233 | if (cpu_has_xsave) |
226 | target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; | 234 | target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; |
227 | 235 | ||
228 | return ret; | 236 | return ret; |
229 | } | 237 | } |
@@ -246,14 +254,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, | |||
246 | * memory layout in the thread struct, so that we can copy the entire | 254 | * memory layout in the thread struct, so that we can copy the entire |
247 | * xstateregs to the user using one user_regset_copyout(). | 255 | * xstateregs to the user using one user_regset_copyout(). |
248 | */ | 256 | */ |
249 | memcpy(&target->thread.xstate->fxsave.sw_reserved, | 257 | memcpy(&target->thread.fpu.state->fxsave.sw_reserved, |
250 | xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); | 258 | xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); |
251 | 259 | ||
252 | /* | 260 | /* |
253 | * Copy the xstate memory layout. | 261 | * Copy the xstate memory layout. |
254 | */ | 262 | */ |
255 | ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, | 263 | ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, |
256 | &target->thread.xstate->xsave, 0, -1); | 264 | &target->thread.fpu.state->xsave, 0, -1); |
257 | return ret; | 265 | return ret; |
258 | } | 266 | } |
259 | 267 | ||
@@ -272,14 +280,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, | |||
272 | return ret; | 280 | return ret; |
273 | 281 | ||
274 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, | 282 | ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, |
275 | &target->thread.xstate->xsave, 0, -1); | 283 | &target->thread.fpu.state->xsave, 0, -1); |
276 | 284 | ||
277 | /* | 285 | /* |
278 | * mxcsr reserved bits must be masked to zero for security reasons. | 286 | * mxcsr reserved bits must be masked to zero for security reasons. |
279 | */ | 287 | */ |
280 | target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; | 288 | target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; |
281 | 289 | ||
282 | xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; | 290 | xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr; |
283 | 291 | ||
284 | xsave_hdr->xstate_bv &= pcntxt_mask; | 292 | xsave_hdr->xstate_bv &= pcntxt_mask; |
285 | /* | 293 | /* |
@@ -365,7 +373,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) | |||
365 | static void | 373 | static void |
366 | convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) | 374 | convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) |
367 | { | 375 | { |
368 | struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; | 376 | struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; |
369 | struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; | 377 | struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; |
370 | struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; | 378 | struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; |
371 | int i; | 379 | int i; |
@@ -405,7 +413,7 @@ static void convert_to_fxsr(struct task_struct *tsk, | |||
405 | const struct user_i387_ia32_struct *env) | 413 | const struct user_i387_ia32_struct *env) |
406 | 414 | ||
407 | { | 415 | { |
408 | struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; | 416 | struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; |
409 | struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; | 417 | struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; |
410 | struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; | 418 | struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; |
411 | int i; | 419 | int i; |
@@ -445,10 +453,12 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, | |||
445 | 453 | ||
446 | if (!cpu_has_fxsr) { | 454 | if (!cpu_has_fxsr) { |
447 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, | 455 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, |
448 | &target->thread.xstate->fsave, 0, | 456 | &target->thread.fpu.state->fsave, 0, |
449 | -1); | 457 | -1); |
450 | } | 458 | } |
451 | 459 | ||
460 | sanitize_i387_state(target); | ||
461 | |||
452 | if (kbuf && pos == 0 && count == sizeof(env)) { | 462 | if (kbuf && pos == 0 && count == sizeof(env)) { |
453 | convert_from_fxsr(kbuf, target); | 463 | convert_from_fxsr(kbuf, target); |
454 | return 0; | 464 | return 0; |
@@ -470,12 +480,14 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
470 | if (ret) | 480 | if (ret) |
471 | return ret; | 481 | return ret; |
472 | 482 | ||
483 | sanitize_i387_state(target); | ||
484 | |||
473 | if (!HAVE_HWFP) | 485 | if (!HAVE_HWFP) |
474 | return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); | 486 | return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); |
475 | 487 | ||
476 | if (!cpu_has_fxsr) { | 488 | if (!cpu_has_fxsr) { |
477 | return user_regset_copyin(&pos, &count, &kbuf, &ubuf, | 489 | return user_regset_copyin(&pos, &count, &kbuf, &ubuf, |
478 | &target->thread.xstate->fsave, 0, -1); | 490 | &target->thread.fpu.state->fsave, 0, -1); |
479 | } | 491 | } |
480 | 492 | ||
481 | if (pos > 0 || count < sizeof(env)) | 493 | if (pos > 0 || count < sizeof(env)) |
@@ -490,7 +502,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
490 | * presence of FP. | 502 | * presence of FP. |
491 | */ | 503 | */ |
492 | if (cpu_has_xsave) | 504 | if (cpu_has_xsave) |
493 | target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; | 505 | target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; |
494 | return ret; | 506 | return ret; |
495 | } | 507 | } |
496 | 508 | ||
@@ -501,7 +513,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
501 | static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) | 513 | static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) |
502 | { | 514 | { |
503 | struct task_struct *tsk = current; | 515 | struct task_struct *tsk = current; |
504 | struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; | 516 | struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave; |
505 | 517 | ||
506 | fp->status = fp->swd; | 518 | fp->status = fp->swd; |
507 | if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) | 519 | if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) |
@@ -512,7 +524,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) | |||
512 | static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) | 524 | static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) |
513 | { | 525 | { |
514 | struct task_struct *tsk = current; | 526 | struct task_struct *tsk = current; |
515 | struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; | 527 | struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; |
516 | struct user_i387_ia32_struct env; | 528 | struct user_i387_ia32_struct env; |
517 | int err = 0; | 529 | int err = 0; |
518 | 530 | ||
@@ -536,6 +548,9 @@ static int save_i387_xsave(void __user *buf) | |||
536 | struct _fpstate_ia32 __user *fx = buf; | 548 | struct _fpstate_ia32 __user *fx = buf; |
537 | int err = 0; | 549 | int err = 0; |
538 | 550 | ||
551 | |||
552 | sanitize_i387_state(tsk); | ||
553 | |||
539 | /* | 554 | /* |
540 | * For legacy compatibility, we always set FP/SSE bits in the bit | 555 |
541 | * vector while saving the state to the user context. | 556 | * vector while saving the state to the user context. |
@@ -547,7 +562,7 @@ static int save_i387_xsave(void __user *buf) | |||
547 | * header as well as change any contents in the memory layout. | 562 | * header as well as change any contents in the memory layout. |
548 | * xrestore as part of sigreturn will capture all the changes. | 563 | * xrestore as part of sigreturn will capture all the changes. |
549 | */ | 564 | */ |
550 | tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; | 565 | tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; |
551 | 566 | ||
552 | if (save_i387_fxsave(fx) < 0) | 567 | if (save_i387_fxsave(fx) < 0) |
553 | return -1; | 568 | return -1; |
@@ -599,7 +614,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) | |||
599 | { | 614 | { |
600 | struct task_struct *tsk = current; | 615 | struct task_struct *tsk = current; |
601 | 616 | ||
602 | return __copy_from_user(&tsk->thread.xstate->fsave, buf, | 617 | return __copy_from_user(&tsk->thread.fpu.state->fsave, buf, |
603 | sizeof(struct i387_fsave_struct)); | 618 | sizeof(struct i387_fsave_struct)); |
604 | } | 619 | } |
605 | 620 | ||
@@ -610,10 +625,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, | |||
610 | struct user_i387_ia32_struct env; | 625 | struct user_i387_ia32_struct env; |
611 | int err; | 626 | int err; |
612 | 627 | ||
613 | err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], | 628 | err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0], |
614 | size); | 629 | size); |
615 | /* mxcsr reserved bits must be masked to zero for security reasons */ | 630 | /* mxcsr reserved bits must be masked to zero for security reasons */ |
616 | tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; | 631 | tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; |
617 | if (err || __copy_from_user(&env, buf, sizeof(env))) | 632 | if (err || __copy_from_user(&env, buf, sizeof(env))) |
618 | return 1; | 633 | return 1; |
619 | convert_to_fxsr(tsk, &env); | 634 | convert_to_fxsr(tsk, &env); |
@@ -629,7 +644,7 @@ static int restore_i387_xsave(void __user *buf) | |||
629 | struct i387_fxsave_struct __user *fx = | 644 | struct i387_fxsave_struct __user *fx = |
630 | (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; | 645 | (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; |
631 | struct xsave_hdr_struct *xsave_hdr = | 646 | struct xsave_hdr_struct *xsave_hdr = |
632 | ¤t->thread.xstate->xsave.xsave_hdr; | 647 | ¤t->thread.fpu.state->xsave.xsave_hdr; |
633 | u64 mask; | 648 | u64 mask; |
634 | int err; | 649 | int err; |
635 | 650 | ||
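The i387.c refactor splits first-use handling into fpu_alloc() plus the new, exported fpu_finit(). A minimal sketch of that allocate-once-then-reset pattern; the two-field state struct and plain calloc() are stand-ins for the real i387 union and the task_xstate kmem cache:

    #include <stdlib.h>

    struct fpu_state { unsigned short cwd; unsigned int mxcsr; };
    struct fpu { struct fpu_state *state; };

    static int fpu_alloc(struct fpu *fpu)
    {
            if (fpu->state)
                    return 0;               /* already allocated */
            fpu->state = calloc(1, sizeof(*fpu->state));
            return fpu->state ? 0 : -1;     /* -ENOMEM in the kernel */
    }

    static void fpu_finit_sketch(struct fpu *fpu)
    {
            fpu->state->cwd = 0x37f;        /* x87 reset: exceptions masked */
            fpu->state->mxcsr = 0x1f80;     /* MXCSR_DEFAULT */
    }

    static int init_fpu_sketch(struct fpu *fpu)
    {
            if (fpu_alloc(fpu))
                    return -1;
            fpu_finit_sketch(fpu);
            return 0;
    }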
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 23c167925a5c..2dfd31597443 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c | |||
@@ -16,7 +16,7 @@ | |||
16 | #include <asm/hpet.h> | 16 | #include <asm/hpet.h> |
17 | #include <asm/smp.h> | 17 | #include <asm/smp.h> |
18 | 18 | ||
19 | DEFINE_SPINLOCK(i8253_lock); | 19 | DEFINE_RAW_SPINLOCK(i8253_lock); |
20 | EXPORT_SYMBOL(i8253_lock); | 20 | EXPORT_SYMBOL(i8253_lock); |
21 | 21 | ||
22 | /* | 22 | /* |
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event; | |||
33 | static void init_pit_timer(enum clock_event_mode mode, | 33 | static void init_pit_timer(enum clock_event_mode mode, |
34 | struct clock_event_device *evt) | 34 | struct clock_event_device *evt) |
35 | { | 35 | { |
36 | spin_lock(&i8253_lock); | 36 | raw_spin_lock(&i8253_lock); |
37 | 37 | ||
38 | switch (mode) { | 38 | switch (mode) { |
39 | case CLOCK_EVT_MODE_PERIODIC: | 39 | case CLOCK_EVT_MODE_PERIODIC: |
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode, | |||
62 | /* Nothing to do here */ | 62 | /* Nothing to do here */ |
63 | break; | 63 | break; |
64 | } | 64 | } |
65 | spin_unlock(&i8253_lock); | 65 | raw_spin_unlock(&i8253_lock); |
66 | } | 66 | } |
67 | 67 | ||
68 | /* | 68 | /* |
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode, | |||
72 | */ | 72 | */ |
73 | static int pit_next_event(unsigned long delta, struct clock_event_device *evt) | 73 | static int pit_next_event(unsigned long delta, struct clock_event_device *evt) |
74 | { | 74 | { |
75 | spin_lock(&i8253_lock); | 75 | raw_spin_lock(&i8253_lock); |
76 | outb_pit(delta & 0xff , PIT_CH0); /* LSB */ | 76 | outb_pit(delta & 0xff , PIT_CH0); /* LSB */ |
77 | outb_pit(delta >> 8 , PIT_CH0); /* MSB */ | 77 | outb_pit(delta >> 8 , PIT_CH0); /* MSB */ |
78 | spin_unlock(&i8253_lock); | 78 | raw_spin_unlock(&i8253_lock); |
79 | 79 | ||
80 | return 0; | 80 | return 0; |
81 | } | 81 | } |
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs) | |||
130 | int count; | 130 | int count; |
131 | u32 jifs; | 131 | u32 jifs; |
132 | 132 | ||
133 | spin_lock_irqsave(&i8253_lock, flags); | 133 | raw_spin_lock_irqsave(&i8253_lock, flags); |
134 | /* | 134 | /* |
135 | * Although our caller may have the read side of xtime_lock, | 135 | * Although our caller may have the read side of xtime_lock, |
136 | * this is now a seqlock, and we are cheating in this routine | 136 | * this is now a seqlock, and we are cheating in this routine |
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs) | |||
176 | old_count = count; | 176 | old_count = count; |
177 | old_jifs = jifs; | 177 | old_jifs = jifs; |
178 | 178 | ||
179 | spin_unlock_irqrestore(&i8253_lock, flags); | 179 | raw_spin_unlock_irqrestore(&i8253_lock, flags); |
180 | 180 | ||
181 | count = (LATCH - 1) - count; | 181 | count = (LATCH - 1) - count; |
182 | 182 | ||
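The raw-spinlock conversion does not change the programming sequence itself: PIT channel 0 takes its 16-bit reload value one byte at a time, which is why the two port writes must stay paired under a single lock. A sketch with outb_pit() stubbed out as a hypothetical port write:

    #include <stdint.h>

    #define PIT_CH0 0x40

    /* stand-in for outb(); a real port write needs I/O privilege */
    static void outb_pit_sketch(uint8_t val, uint16_t port)
    {
            (void)val; (void)port;
    }

    /* caller holds the (now raw) i8253 lock so the two byte writes
     * that form one 16-bit reload value cannot be interleaved */
    static void pit_program_delta(uint16_t delta)
    {
            outb_pit_sketch(delta & 0xff, PIT_CH0); /* LSB first */
            outb_pit_sketch(delta >> 8, PIT_CH0);   /* then MSB */
    }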
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 7c9f02c130f3..cafa7c80ac95 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c | |||
@@ -276,16 +276,6 @@ static struct sys_device device_i8259A = { | |||
276 | .cls = &i8259_sysdev_class, | 276 | .cls = &i8259_sysdev_class, |
277 | }; | 277 | }; |
278 | 278 | ||
279 | static int __init i8259A_init_sysfs(void) | ||
280 | { | ||
281 | int error = sysdev_class_register(&i8259_sysdev_class); | ||
282 | if (!error) | ||
283 | error = sysdev_register(&device_i8259A); | ||
284 | return error; | ||
285 | } | ||
286 | |||
287 | device_initcall(i8259A_init_sysfs); | ||
288 | |||
289 | static void mask_8259A(void) | 279 | static void mask_8259A(void) |
290 | { | 280 | { |
291 | unsigned long flags; | 281 | unsigned long flags; |
@@ -407,3 +397,18 @@ struct legacy_pic default_legacy_pic = { | |||
407 | }; | 397 | }; |
408 | 398 | ||
409 | struct legacy_pic *legacy_pic = &default_legacy_pic; | 399 | struct legacy_pic *legacy_pic = &default_legacy_pic; |
400 | |||
401 | static int __init i8259A_init_sysfs(void) | ||
402 | { | ||
403 | int error; | ||
404 | |||
405 | if (legacy_pic != &default_legacy_pic) | ||
406 | return 0; | ||
407 | |||
408 | error = sysdev_class_register(&i8259_sysdev_class); | ||
409 | if (!error) | ||
410 | error = sysdev_register(&device_i8259A); | ||
411 | return error; | ||
412 | } | ||
413 | |||
414 | device_initcall(i8259A_init_sysfs); | ||
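The relocated initcall now bails out when another legacy_pic implementation has replaced the default; pointer identity against the compile-time default object is sufficient because it is a single static instance. A stripped-down sketch of that guard:

    struct legacy_pic_sketch { int unused; };

    static struct legacy_pic_sketch default_legacy_pic;
    static struct legacy_pic_sketch *legacy_pic = &default_legacy_pic;

    static int i8259A_init_sysfs_sketch(void)
    {
            if (legacy_pic != &default_legacy_pic)
                    return 0;       /* no physical i8259A: nothing to register */
            /* sysdev_class_register() + sysdev_register() would go here */
            return 0;
    }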
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 3a54dcb9cd0e..43e9ccf44947 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c | |||
@@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task); | |||
34 | /* | 34 | /* |
35 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | 35 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, |
36 | * no more per-task TSS's. The TSS size is kept cacheline-aligned | 36 | * no more per-task TSS's. The TSS size is kept cacheline-aligned |
37 | * so they are allowed to end up in the .data.cacheline_aligned | 37 | * so they are allowed to end up in the .data..cacheline_aligned |
38 | * section. Since TSS's are completely CPU-local, we want them | 38 | * section. Since TSS's are completely CPU-local, we want them |
39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | 39 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
40 | */ | 40 | */ |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 0ed2d300cd46..990ae7cfc578 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) | |||
60 | outb(0, 0xF0); | 60 | outb(0, 0xF0); |
61 | if (ignore_fpu_irq || !boot_cpu_data.hard_math) | 61 | if (ignore_fpu_irq || !boot_cpu_data.hard_math) |
62 | return IRQ_NONE; | 62 | return IRQ_NONE; |
63 | math_error((void __user *)get_irq_regs()->ip); | 63 | math_error(get_irq_regs(), 0, 16); |
64 | return IRQ_HANDLED; | 64 | return IRQ_HANDLED; |
65 | } | 65 | } |
66 | 66 | ||
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b2258ca91003..ef10940e1af0 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -47,69 +47,96 @@ | |||
47 | #include <asm/debugreg.h> | 47 | #include <asm/debugreg.h> |
48 | #include <asm/apicdef.h> | 48 | #include <asm/apicdef.h> |
49 | #include <asm/system.h> | 49 | #include <asm/system.h> |
50 | |||
51 | #include <asm/apic.h> | 50 | #include <asm/apic.h> |
52 | 51 | ||
53 | /* | 52 | struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = |
54 | * Put the error code here just in case the user cares: | ||
55 | */ | ||
56 | static int gdb_x86errcode; | ||
57 | |||
58 | /* | ||
59 | * Likewise, the vector number here (since GDB only gets the signal | ||
60 | * number through the usual means, and that's not very specific): | ||
61 | */ | ||
62 | static int gdb_x86vector = -1; | ||
63 | |||
64 | /** | ||
65 | * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs | ||
66 | * @gdb_regs: A pointer to hold the registers in the order GDB wants. | ||
67 | * @regs: The &struct pt_regs of the current process. | ||
68 | * | ||
69 | * Convert the pt_regs in @regs into the format for registers that | ||
70 | * GDB expects, stored in @gdb_regs. | ||
71 | */ | ||
72 | void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
73 | { | 53 | { |
74 | #ifndef CONFIG_X86_32 | 54 | #ifdef CONFIG_X86_32 |
75 | u32 *gdb_regs32 = (u32 *)gdb_regs; | 55 | { "ax", 4, offsetof(struct pt_regs, ax) }, |
56 | { "cx", 4, offsetof(struct pt_regs, cx) }, | ||
57 | { "dx", 4, offsetof(struct pt_regs, dx) }, | ||
58 | { "bx", 4, offsetof(struct pt_regs, bx) }, | ||
59 | { "sp", 4, offsetof(struct pt_regs, sp) }, | ||
60 | { "bp", 4, offsetof(struct pt_regs, bp) }, | ||
61 | { "si", 4, offsetof(struct pt_regs, si) }, | ||
62 | { "di", 4, offsetof(struct pt_regs, di) }, | ||
63 | { "ip", 4, offsetof(struct pt_regs, ip) }, | ||
64 | { "flags", 4, offsetof(struct pt_regs, flags) }, | ||
65 | { "cs", 4, offsetof(struct pt_regs, cs) }, | ||
66 | { "ss", 4, offsetof(struct pt_regs, ss) }, | ||
67 | { "ds", 4, offsetof(struct pt_regs, ds) }, | ||
68 | { "es", 4, offsetof(struct pt_regs, es) }, | ||
69 | { "fs", 4, -1 }, | ||
70 | { "gs", 4, -1 }, | ||
71 | #else | ||
72 | { "ax", 8, offsetof(struct pt_regs, ax) }, | ||
73 | { "bx", 8, offsetof(struct pt_regs, bx) }, | ||
74 | { "cx", 8, offsetof(struct pt_regs, cx) }, | ||
75 | { "dx", 8, offsetof(struct pt_regs, dx) }, | ||
76 | { "si", 8, offsetof(struct pt_regs, dx) }, | ||
77 | { "di", 8, offsetof(struct pt_regs, di) }, | ||
78 | { "bp", 8, offsetof(struct pt_regs, bp) }, | ||
79 | { "sp", 8, offsetof(struct pt_regs, sp) }, | ||
80 | { "r8", 8, offsetof(struct pt_regs, r8) }, | ||
81 | { "r9", 8, offsetof(struct pt_regs, r9) }, | ||
82 | { "r10", 8, offsetof(struct pt_regs, r10) }, | ||
83 | { "r11", 8, offsetof(struct pt_regs, r11) }, | ||
84 | { "r12", 8, offsetof(struct pt_regs, r12) }, | ||
85 | { "r13", 8, offsetof(struct pt_regs, r13) }, | ||
86 | { "r14", 8, offsetof(struct pt_regs, r14) }, | ||
87 | { "r15", 8, offsetof(struct pt_regs, r15) }, | ||
88 | { "ip", 8, offsetof(struct pt_regs, ip) }, | ||
89 | { "flags", 4, offsetof(struct pt_regs, flags) }, | ||
90 | { "cs", 4, offsetof(struct pt_regs, cs) }, | ||
91 | { "ss", 4, offsetof(struct pt_regs, ss) }, | ||
76 | #endif | 92 | #endif |
77 | gdb_regs[GDB_AX] = regs->ax; | 93 | }; |
78 | gdb_regs[GDB_BX] = regs->bx; | 94 | |
79 | gdb_regs[GDB_CX] = regs->cx; | 95 | int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) |
80 | gdb_regs[GDB_DX] = regs->dx; | 96 | { |
81 | gdb_regs[GDB_SI] = regs->si; | 97 | if ( |
82 | gdb_regs[GDB_DI] = regs->di; | ||
83 | gdb_regs[GDB_BP] = regs->bp; | ||
84 | gdb_regs[GDB_PC] = regs->ip; | ||
85 | #ifdef CONFIG_X86_32 | 98 | #ifdef CONFIG_X86_32 |
86 | gdb_regs[GDB_PS] = regs->flags; | 99 | regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || |
87 | gdb_regs[GDB_DS] = regs->ds; | 100 | #endif |
88 | gdb_regs[GDB_ES] = regs->es; | 101 | regno == GDB_SP || regno == GDB_ORIG_AX) |
89 | gdb_regs[GDB_CS] = regs->cs; | 102 | return 0; |
90 | gdb_regs[GDB_FS] = 0xFFFF; | 103 | |
91 | gdb_regs[GDB_GS] = 0xFFFF; | 104 | if (dbg_reg_def[regno].offset != -1) |
92 | if (user_mode_vm(regs)) { | 105 | memcpy((void *)regs + dbg_reg_def[regno].offset, mem, |
93 | gdb_regs[GDB_SS] = regs->ss; | 106 | dbg_reg_def[regno].size); |
94 | gdb_regs[GDB_SP] = regs->sp; | 107 | return 0; |
95 | } else { | 108 | } |
96 | gdb_regs[GDB_SS] = __KERNEL_DS; | 109 | |
97 | gdb_regs[GDB_SP] = kernel_stack_pointer(regs); | 110 | char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) |
111 | { | ||
112 | if (regno == GDB_ORIG_AX) { | ||
113 | memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); | ||
114 | return "orig_ax"; | ||
98 | } | 115 | } |
99 | #else | 116 | if (regno >= DBG_MAX_REG_NUM || regno < 0) |
100 | gdb_regs[GDB_R8] = regs->r8; | 117 | return NULL; |
101 | gdb_regs[GDB_R9] = regs->r9; | 118 | |
102 | gdb_regs[GDB_R10] = regs->r10; | 119 | if (dbg_reg_def[regno].offset != -1) |
103 | gdb_regs[GDB_R11] = regs->r11; | 120 | memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, |
104 | gdb_regs[GDB_R12] = regs->r12; | 121 | dbg_reg_def[regno].size); |
105 | gdb_regs[GDB_R13] = regs->r13; | 122 | |
106 | gdb_regs[GDB_R14] = regs->r14; | 123 | switch (regno) { |
107 | gdb_regs[GDB_R15] = regs->r15; | 124 | #ifdef CONFIG_X86_32 |
108 | gdb_regs32[GDB_PS] = regs->flags; | 125 | case GDB_SS: |
109 | gdb_regs32[GDB_CS] = regs->cs; | 126 | if (!user_mode_vm(regs)) |
110 | gdb_regs32[GDB_SS] = regs->ss; | 127 | *(unsigned long *)mem = __KERNEL_DS; |
111 | gdb_regs[GDB_SP] = kernel_stack_pointer(regs); | 128 | break; |
129 | case GDB_SP: | ||
130 | if (!user_mode_vm(regs)) | ||
131 | *(unsigned long *)mem = kernel_stack_pointer(regs); | ||
132 | break; | ||
133 | case GDB_GS: | ||
134 | case GDB_FS: | ||
135 | *(unsigned long *)mem = 0xFFFF; | ||
136 | break; | ||
112 | #endif | 137 | #endif |
138 | } | ||
139 | return dbg_reg_def[regno].name; | ||
113 | } | 140 | } |
114 | 141 | ||
115 | /** | 142 | /** |
@@ -162,66 +189,35 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) | |||
162 | gdb_regs[GDB_SP] = p->thread.sp; | 189 | gdb_regs[GDB_SP] = p->thread.sp; |
163 | } | 190 | } |
164 | 191 | ||
165 | /** | ||
166 | * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. | ||
167 | * @gdb_regs: A pointer to hold the registers we've received from GDB. | ||
168 | * @regs: A pointer to a &struct pt_regs to hold these values in. | ||
169 | * | ||
170 | * Convert the GDB regs in @gdb_regs into the pt_regs, and store them | ||
171 | * in @regs. | ||
172 | */ | ||
173 | void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
174 | { | ||
175 | #ifndef CONFIG_X86_32 | ||
176 | u32 *gdb_regs32 = (u32 *)gdb_regs; | ||
177 | #endif | ||
178 | regs->ax = gdb_regs[GDB_AX]; | ||
179 | regs->bx = gdb_regs[GDB_BX]; | ||
180 | regs->cx = gdb_regs[GDB_CX]; | ||
181 | regs->dx = gdb_regs[GDB_DX]; | ||
182 | regs->si = gdb_regs[GDB_SI]; | ||
183 | regs->di = gdb_regs[GDB_DI]; | ||
184 | regs->bp = gdb_regs[GDB_BP]; | ||
185 | regs->ip = gdb_regs[GDB_PC]; | ||
186 | #ifdef CONFIG_X86_32 | ||
187 | regs->flags = gdb_regs[GDB_PS]; | ||
188 | regs->ds = gdb_regs[GDB_DS]; | ||
189 | regs->es = gdb_regs[GDB_ES]; | ||
190 | regs->cs = gdb_regs[GDB_CS]; | ||
191 | #else | ||
192 | regs->r8 = gdb_regs[GDB_R8]; | ||
193 | regs->r9 = gdb_regs[GDB_R9]; | ||
194 | regs->r10 = gdb_regs[GDB_R10]; | ||
195 | regs->r11 = gdb_regs[GDB_R11]; | ||
196 | regs->r12 = gdb_regs[GDB_R12]; | ||
197 | regs->r13 = gdb_regs[GDB_R13]; | ||
198 | regs->r14 = gdb_regs[GDB_R14]; | ||
199 | regs->r15 = gdb_regs[GDB_R15]; | ||
200 | regs->flags = gdb_regs32[GDB_PS]; | ||
201 | regs->cs = gdb_regs32[GDB_CS]; | ||
202 | regs->ss = gdb_regs32[GDB_SS]; | ||
203 | #endif | ||
204 | } | ||
205 | |||
206 | static struct hw_breakpoint { | 192 | static struct hw_breakpoint { |
207 | unsigned enabled; | 193 | unsigned enabled; |
208 | unsigned long addr; | 194 | unsigned long addr; |
209 | int len; | 195 | int len; |
210 | int type; | 196 | int type; |
211 | struct perf_event **pev; | 197 | struct perf_event **pev; |
212 | } breakinfo[4]; | 198 | } breakinfo[HBP_NUM]; |
199 | |||
200 | static unsigned long early_dr7; | ||
213 | 201 | ||
214 | static void kgdb_correct_hw_break(void) | 202 | static void kgdb_correct_hw_break(void) |
215 | { | 203 | { |
216 | int breakno; | 204 | int breakno; |
217 | 205 | ||
218 | for (breakno = 0; breakno < 4; breakno++) { | 206 | for (breakno = 0; breakno < HBP_NUM; breakno++) { |
219 | struct perf_event *bp; | 207 | struct perf_event *bp; |
220 | struct arch_hw_breakpoint *info; | 208 | struct arch_hw_breakpoint *info; |
221 | int val; | 209 | int val; |
222 | int cpu = raw_smp_processor_id(); | 210 | int cpu = raw_smp_processor_id(); |
223 | if (!breakinfo[breakno].enabled) | 211 | if (!breakinfo[breakno].enabled) |
224 | continue; | 212 | continue; |
213 | if (dbg_is_early) { | ||
214 | set_debugreg(breakinfo[breakno].addr, breakno); | ||
215 | early_dr7 |= encode_dr7(breakno, | ||
216 | breakinfo[breakno].len, | ||
217 | breakinfo[breakno].type); | ||
218 | set_debugreg(early_dr7, 7); | ||
219 | continue; | ||
220 | } | ||
225 | bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); | 221 | bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); |
226 | info = counter_arch_bp(bp); | 222 | info = counter_arch_bp(bp); |
227 | if (bp->attr.disabled != 1) | 223 | if (bp->attr.disabled != 1) |
@@ -236,7 +232,8 @@ static void kgdb_correct_hw_break(void) | |||
236 | if (!val) | 232 | if (!val) |
237 | bp->attr.disabled = 0; | 233 | bp->attr.disabled = 0; |
238 | } | 234 | } |
239 | hw_breakpoint_restore(); | 235 | if (!dbg_is_early) |
236 | hw_breakpoint_restore(); | ||
240 | } | 237 | } |
241 | 238 | ||
242 | static int hw_break_reserve_slot(int breakno) | 239 | static int hw_break_reserve_slot(int breakno) |
@@ -245,6 +242,9 @@ static int hw_break_reserve_slot(int breakno) | |||
245 | int cnt = 0; | 242 | int cnt = 0; |
246 | struct perf_event **pevent; | 243 | struct perf_event **pevent; |
247 | 244 | ||
245 | if (dbg_is_early) | ||
246 | return 0; | ||
247 | |||
248 | for_each_online_cpu(cpu) { | 248 | for_each_online_cpu(cpu) { |
249 | cnt++; | 249 | cnt++; |
250 | pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); | 250 | pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); |
@@ -270,6 +270,9 @@ static int hw_break_release_slot(int breakno) | |||
270 | struct perf_event **pevent; | 270 | struct perf_event **pevent; |
271 | int cpu; | 271 | int cpu; |
272 | 272 | ||
273 | if (dbg_is_early) | ||
274 | return 0; | ||
275 | |||
273 | for_each_online_cpu(cpu) { | 276 | for_each_online_cpu(cpu) { |
274 | pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); | 277 | pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); |
275 | if (dbg_release_bp_slot(*pevent)) | 278 | if (dbg_release_bp_slot(*pevent)) |
@@ -287,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) | |||
287 | { | 290 | { |
288 | int i; | 291 | int i; |
289 | 292 | ||
290 | for (i = 0; i < 4; i++) | 293 | for (i = 0; i < HBP_NUM; i++) |
291 | if (breakinfo[i].addr == addr && breakinfo[i].enabled) | 294 | if (breakinfo[i].addr == addr && breakinfo[i].enabled) |
292 | break; | 295 | break; |
293 | if (i == 4) | 296 | if (i == HBP_NUM) |
294 | return -1; | 297 | return -1; |
295 | 298 | ||
296 | if (hw_break_release_slot(i)) { | 299 | if (hw_break_release_slot(i)) { |
@@ -308,13 +311,17 @@ static void kgdb_remove_all_hw_break(void) | |||
308 | int cpu = raw_smp_processor_id(); | 311 | int cpu = raw_smp_processor_id(); |
309 | struct perf_event *bp; | 312 | struct perf_event *bp; |
310 | 313 | ||
311 | for (i = 0; i < 4; i++) { | 314 | for (i = 0; i < HBP_NUM; i++) { |
312 | if (!breakinfo[i].enabled) | 315 | if (!breakinfo[i].enabled) |
313 | continue; | 316 | continue; |
314 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); | 317 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); |
315 | if (bp->attr.disabled == 1) | 318 | if (bp->attr.disabled == 1) |
316 | continue; | 319 | continue; |
317 | arch_uninstall_hw_breakpoint(bp); | 320 | if (dbg_is_early) |
321 | early_dr7 &= ~encode_dr7(i, breakinfo[i].len, | ||
322 | breakinfo[i].type); | ||
323 | else | ||
324 | arch_uninstall_hw_breakpoint(bp); | ||
318 | bp->attr.disabled = 1; | 325 | bp->attr.disabled = 1; |
319 | } | 326 | } |
320 | } | 327 | } |
@@ -324,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) | |||
324 | { | 331 | { |
325 | int i; | 332 | int i; |
326 | 333 | ||
327 | for (i = 0; i < 4; i++) | 334 | for (i = 0; i < HBP_NUM; i++) |
328 | if (!breakinfo[i].enabled) | 335 | if (!breakinfo[i].enabled) |
329 | break; | 336 | break; |
330 | if (i == 4) | 337 | if (i == HBP_NUM) |
331 | return -1; | 338 | return -1; |
332 | 339 | ||
333 | switch (bptype) { | 340 | switch (bptype) { |
@@ -388,9 +395,14 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) | |||
388 | 395 | ||
389 | /* Disable hardware debugging while we are in kgdb: */ | 396 | /* Disable hardware debugging while we are in kgdb: */ |
390 | set_debugreg(0UL, 7); | 397 | set_debugreg(0UL, 7); |
391 | for (i = 0; i < 4; i++) { | 398 | for (i = 0; i < HBP_NUM; i++) { |
392 | if (!breakinfo[i].enabled) | 399 | if (!breakinfo[i].enabled) |
393 | continue; | 400 | continue; |
401 | if (dbg_is_early) { | ||
402 | early_dr7 &= ~encode_dr7(i, breakinfo[i].len, | ||
403 | breakinfo[i].type); | ||
404 | continue; | ||
405 | } | ||
394 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); | 406 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); |
395 | if (bp->attr.disabled == 1) | 407 | if (bp->attr.disabled == 1) |
396 | continue; | 408 | continue; |
@@ -399,23 +411,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) | |||
399 | } | 411 | } |
400 | } | 412 | } |
401 | 413 | ||
402 | /** | ||
403 | * kgdb_post_primary_code - Save error vector/code numbers. | ||
404 | * @regs: Original pt_regs. | ||
405 | * @e_vector: Original error vector. | ||
406 | * @err_code: Original error code. | ||
407 | * | ||
408 | * This is needed on architectures which support SMP and KGDB. | ||
409 | * This function is called after all the slave cpus have been put | ||
410 | * to a known spin state and the primary CPU has control over KGDB. | ||
411 | */ | ||
412 | void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) | ||
413 | { | ||
414 | /* primary processor is completely in the debugger */ | ||
415 | gdb_x86vector = e_vector; | ||
416 | gdb_x86errcode = err_code; | ||
417 | } | ||
418 | |||
419 | #ifdef CONFIG_SMP | 414 | #ifdef CONFIG_SMP |
420 | /** | 415 | /** |
421 | * kgdb_roundup_cpus - Get other CPUs into a holding pattern | 416 | * kgdb_roundup_cpus - Get other CPUs into a holding pattern |
@@ -461,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
461 | { | 456 | { |
462 | unsigned long addr; | 457 | unsigned long addr; |
463 | char *ptr; | 458 | char *ptr; |
464 | int newPC; | ||
465 | 459 | ||
466 | switch (remcomInBuffer[0]) { | 460 | switch (remcomInBuffer[0]) { |
467 | case 'c': | 461 | case 'c': |
@@ -472,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
472 | linux_regs->ip = addr; | 466 | linux_regs->ip = addr; |
473 | case 'D': | 467 | case 'D': |
474 | case 'k': | 468 | case 'k': |
475 | newPC = linux_regs->ip; | ||
476 | |||
477 | /* clear the trace bit */ | 469 | /* clear the trace bit */ |
478 | linux_regs->flags &= ~X86_EFLAGS_TF; | 470 | linux_regs->flags &= ~X86_EFLAGS_TF; |
479 | atomic_set(&kgdb_cpu_doing_single_step, -1); | 471 | atomic_set(&kgdb_cpu_doing_single_step, -1); |
@@ -567,7 +559,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) | |||
567 | return NOTIFY_DONE; | 559 | return NOTIFY_DONE; |
568 | } | 560 | } |
569 | 561 | ||
570 | if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs)) | 562 | if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs)) |
571 | return NOTIFY_DONE; | 563 | return NOTIFY_DONE; |
572 | 564 | ||
573 | /* Must touch watchdog before return to normal operation */ | 565 | /* Must touch watchdog before return to normal operation */ |
@@ -575,6 +567,24 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) | |||
575 | return NOTIFY_STOP; | 567 | return NOTIFY_STOP; |
576 | } | 568 | } |
577 | 569 | ||
570 | int kgdb_ll_trap(int cmd, const char *str, | ||
571 | struct pt_regs *regs, long err, int trap, int sig) | ||
572 | { | ||
573 | struct die_args args = { | ||
574 | .regs = regs, | ||
575 | .str = str, | ||
576 | .err = err, | ||
577 | .trapnr = trap, | ||
578 | .signr = sig, | ||
579 | |||
580 | }; | ||
581 | |||
582 | if (!kgdb_io_module_registered) | ||
583 | return NOTIFY_DONE; | ||
584 | |||
585 | return __kgdb_notify(&args, cmd); | ||
586 | } | ||
587 | |||
578 | static int | 588 | static int |
579 | kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) | 589 | kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) |
580 | { | 590 | { |
@@ -605,14 +615,21 @@ static struct notifier_block kgdb_notifier = { | |||
605 | */ | 615 | */ |
606 | int kgdb_arch_init(void) | 616 | int kgdb_arch_init(void) |
607 | { | 617 | { |
618 | return register_die_notifier(&kgdb_notifier); | ||
619 | } | ||
620 | |||
621 | static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, | ||
622 | struct perf_sample_data *data, struct pt_regs *regs) | ||
623 | { | ||
624 | kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); | ||
625 | } | ||
626 | |||
627 | void kgdb_arch_late(void) | ||
628 | { | ||
608 | int i, cpu; | 629 | int i, cpu; |
609 | int ret; | ||
610 | struct perf_event_attr attr; | 630 | struct perf_event_attr attr; |
611 | struct perf_event **pevent; | 631 | struct perf_event **pevent; |
612 | 632 | ||
613 | ret = register_die_notifier(&kgdb_notifier); | ||
614 | if (ret != 0) | ||
615 | return ret; | ||
616 | /* | 633 | /* |
617 | * Pre-allocate the hw breakpoint structures in the non-atomic | 634 |
618 | * portion of kgdb because this operation requires mutexes to | 635 |
@@ -623,24 +640,27 @@ int kgdb_arch_init(void) | |||
623 | attr.bp_len = HW_BREAKPOINT_LEN_1; | 640 | attr.bp_len = HW_BREAKPOINT_LEN_1; |
624 | attr.bp_type = HW_BREAKPOINT_W; | 641 | attr.bp_type = HW_BREAKPOINT_W; |
625 | attr.disabled = 1; | 642 | attr.disabled = 1; |
626 | for (i = 0; i < 4; i++) { | 643 | for (i = 0; i < HBP_NUM; i++) { |
644 | if (breakinfo[i].pev) | ||
645 | continue; | ||
627 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); | 646 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); |
628 | if (IS_ERR(breakinfo[i].pev)) { | 647 | if (IS_ERR(breakinfo[i].pev)) { |
629 | printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); | 648 | printk(KERN_ERR "kgdb: Could not allocate hw" |
649 | "breakpoints\nDisabling the kernel debugger\n"); | ||
630 | breakinfo[i].pev = NULL; | 650 | breakinfo[i].pev = NULL; |
631 | kgdb_arch_exit(); | 651 | kgdb_arch_exit(); |
632 | return -1; | 652 | return; |
633 | } | 653 | } |
634 | for_each_online_cpu(cpu) { | 654 | for_each_online_cpu(cpu) { |
635 | pevent = per_cpu_ptr(breakinfo[i].pev, cpu); | 655 | pevent = per_cpu_ptr(breakinfo[i].pev, cpu); |
636 | pevent[0]->hw.sample_period = 1; | 656 | pevent[0]->hw.sample_period = 1; |
657 | pevent[0]->overflow_handler = kgdb_hw_overflow_handler; | ||
637 | if (pevent[0]->destroy != NULL) { | 658 | if (pevent[0]->destroy != NULL) { |
638 | pevent[0]->destroy = NULL; | 659 | pevent[0]->destroy = NULL; |
639 | release_bp_slot(*pevent); | 660 | release_bp_slot(*pevent); |
640 | } | 661 | } |
641 | } | 662 | } |
642 | } | 663 | } |
643 | return ret; | ||
644 | } | 664 | } |
645 | 665 | ||
646 | /** | 666 | /** |
@@ -690,6 +710,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) | |||
690 | return instruction_pointer(regs); | 710 | return instruction_pointer(regs); |
691 | } | 711 | } |
692 | 712 | ||
713 | void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) | ||
714 | { | ||
715 | regs->ip = ip; | ||
716 | } | ||
717 | |||
693 | struct kgdb_arch arch_kgdb_ops = { | 718 | struct kgdb_arch arch_kgdb_ops = { |
694 | /* Breakpoint instruction: */ | 719 | /* Breakpoint instruction: */ |
695 | .gdb_bpt_instr = { 0xcc }, | 720 | .gdb_bpt_instr = { 0xcc }, |
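The kgdb rewrite above replaces two hand-written register marshalling functions with the dbg_reg_def[] table, so get/set collapse to a memcpy at pt_regs plus an offset. A self-contained sketch of the same table-driven lookup, using a toy two-register pt_regs rather than the real layout (the real table marks registers not backed by pt_regs with offset -1):

    #include <stddef.h>
    #include <string.h>

    struct pt_regs_sketch { unsigned long ax, ip; };

    struct dbg_reg_def_sketch {
            const char *name;
            int size;
            int offset;
    };

    static const struct dbg_reg_def_sketch reg_def[] = {
            { "ax", sizeof(unsigned long), offsetof(struct pt_regs_sketch, ax) },
            { "ip", sizeof(unsigned long), offsetof(struct pt_regs_sketch, ip) },
    };

    static const char *get_reg_sketch(int regno, void *mem,
                                      struct pt_regs_sketch *regs)
    {
            if (regno < 0 || regno >= (int)(sizeof(reg_def) / sizeof(reg_def[0])))
                    return NULL;
            if (reg_def[regno].offset != -1)
                    memcpy(mem, (char *)regs + reg_def[regno].offset,
                           reg_def[regno].size);
            return reg_def[regno].name;
    }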
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1658efdfb4e5..1bfb6cf4dd55 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) | |||
126 | } | 126 | } |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * Check for the REX prefix which can only exist on X86_64 | 129 | * Skip the prefixes of the instruction. |
130 | * X86_32 always returns 0 | ||
131 | */ | 130 | */ |
132 | static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) | 131 | static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) |
133 | { | 132 | { |
133 | insn_attr_t attr; | ||
134 | |||
135 | attr = inat_get_opcode_attribute((insn_byte_t)*insn); | ||
136 | while (inat_is_legacy_prefix(attr)) { | ||
137 | insn++; | ||
138 | attr = inat_get_opcode_attribute((insn_byte_t)*insn); | ||
139 | } | ||
134 | #ifdef CONFIG_X86_64 | 140 | #ifdef CONFIG_X86_64 |
135 | if ((*insn & 0xf0) == 0x40) | 141 | if (inat_is_rex_prefix(attr)) |
136 | return 1; | 142 | insn++; |
137 | #endif | 143 | #endif |
138 | return 0; | 144 | return insn; |
139 | } | 145 | } |
140 | 146 | ||
141 | /* | 147 | /* |
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) | |||
272 | */ | 278 | */ |
273 | static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | 279 | static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) |
274 | { | 280 | { |
281 | /* Skip prefixes */ | ||
282 | insn = skip_prefixes(insn); | ||
283 | |||
275 | switch (*insn) { | 284 | switch (*insn) { |
276 | case 0xfa: /* cli */ | 285 | case 0xfa: /* cli */ |
277 | case 0xfb: /* sti */ | 286 | case 0xfb: /* sti */ |
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | |||
280 | return 1; | 289 | return 1; |
281 | } | 290 | } |
282 | 291 | ||
283 | /* | ||
284 | * on X86_64, 0x40-0x4f are REX prefixes so we need to look | ||
285 | * at the next byte instead.. but of course not recurse infinitely | ||
286 | */ | ||
287 | if (is_REX_prefix(insn)) | ||
288 | return is_IF_modifier(++insn); | ||
289 | |||
290 | return 0; | 292 | return 0; |
291 | } | 293 | } |
292 | 294 | ||
@@ -422,14 +424,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, | |||
422 | 424 | ||
423 | static void __kprobes clear_btf(void) | 425 | static void __kprobes clear_btf(void) |
424 | { | 426 | { |
425 | if (test_thread_flag(TIF_DEBUGCTLMSR)) | 427 | if (test_thread_flag(TIF_BLOCKSTEP)) { |
426 | update_debugctlmsr(0); | 428 | unsigned long debugctl = get_debugctlmsr(); |
429 | |||
430 | debugctl &= ~DEBUGCTLMSR_BTF; | ||
431 | update_debugctlmsr(debugctl); | ||
432 | } | ||
427 | } | 433 | } |
428 | 434 | ||
429 | static void __kprobes restore_btf(void) | 435 | static void __kprobes restore_btf(void) |
430 | { | 436 | { |
431 | if (test_thread_flag(TIF_DEBUGCTLMSR)) | 437 | if (test_thread_flag(TIF_BLOCKSTEP)) { |
432 | update_debugctlmsr(current->thread.debugctlmsr); | 438 | unsigned long debugctl = get_debugctlmsr(); |
439 | |||
440 | debugctl |= DEBUGCTLMSR_BTF; | ||
441 | update_debugctlmsr(debugctl); | ||
442 | } | ||
433 | } | 443 | } |
434 | 444 | ||
435 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | 445 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, |
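The clear_btf()/restore_btf() change above stops zeroing DEBUGCTL wholesale and instead toggles only the block-step flag (BTF, bit 1), leaving other DEBUGCTL users untouched. A sketch of the read-modify-write, with a shadow variable standing in for the get_debugctlmsr()/update_debugctlmsr() accessors:

    #define DEBUGCTLMSR_BTF (1UL << 1)      /* block-step trap flag */

    static unsigned long debugctl_shadow;   /* stands in for the MSR */

    static unsigned long get_debugctl(void) { return debugctl_shadow; }
    static void set_debugctl(unsigned long v) { debugctl_shadow = v; }

    static void clear_btf_sketch(void)
    {
            set_debugctl(get_debugctl() & ~DEBUGCTLMSR_BTF);
    }

    static void restore_btf_sketch(void)
    {
            set_debugctl(get_debugctl() | DEBUGCTLMSR_BTF);
    }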
@@ -632,8 +642,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) | |||
632 | /* Skip cs, ip, orig_ax and gs. */ \ | 642 | /* Skip cs, ip, orig_ax and gs. */ \ |
633 | " subl $16, %esp\n" \ | 643 | " subl $16, %esp\n" \ |
634 | " pushl %fs\n" \ | 644 | " pushl %fs\n" \ |
635 | " pushl %ds\n" \ | ||
636 | " pushl %es\n" \ | 645 | " pushl %es\n" \ |
646 | " pushl %ds\n" \ | ||
637 | " pushl %eax\n" \ | 647 | " pushl %eax\n" \ |
638 | " pushl %ebp\n" \ | 648 | " pushl %ebp\n" \ |
639 | " pushl %edi\n" \ | 649 | " pushl %edi\n" \ |
@@ -795,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p, | |||
795 | unsigned long orig_ip = (unsigned long)p->addr; | 805 | unsigned long orig_ip = (unsigned long)p->addr; |
796 | kprobe_opcode_t *insn = p->ainsn.insn; | 806 | kprobe_opcode_t *insn = p->ainsn.insn; |
797 | 807 | ||
798 | /*skip the REX prefix*/ | 808 | /* Skip prefixes */ |
799 | if (is_REX_prefix(insn)) | 809 | insn = skip_prefixes(insn); |
800 | insn++; | ||
801 | 810 | ||
802 | regs->flags &= ~X86_EFLAGS_TF; | 811 | regs->flags &= ~X86_EFLAGS_TF; |
803 | switch (*insn) { | 812 | switch (*insn) { |
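Both callers above now share skip_prefixes(), which consults the instruction attribute tables (inat_*) instead of hand-testing the 0x40-0x4f REX range, so instructions carrying several prefixes are handled uniformly. A hedged sketch of a caller; modifies_if_flag() is hypothetical and mirrors is_IF_modifier():

/* Detect cli/sti even when the copied-out instruction carries
 * legacy prefixes or, on x86-64, a REX prefix. */
static int modifies_if_flag(kprobe_opcode_t *insn)
{
	insn = skip_prefixes(insn);		/* lands on the opcode byte */
	return *insn == 0xfa || *insn == 0xfb;	/* cli / sti */
}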
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index feaeb0d3aa4f..eb9b76c716c2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -29,6 +29,8 @@ | |||
29 | #define KVM_SCALE 22 | 29 | #define KVM_SCALE 22 |
30 | 30 | ||
31 | static int kvmclock = 1; | 31 | static int kvmclock = 1; |
32 | static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; | ||
33 | static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; | ||
32 | 34 | ||
33 | static int parse_no_kvmclock(char *arg) | 35 | static int parse_no_kvmclock(char *arg) |
34 | { | 36 | { |
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void) | |||
54 | 56 | ||
55 | low = (int)__pa_symbol(&wall_clock); | 57 | low = (int)__pa_symbol(&wall_clock); |
56 | high = ((u64)__pa_symbol(&wall_clock) >> 32); | 58 | high = ((u64)__pa_symbol(&wall_clock) >> 32); |
57 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | 59 | |
60 | native_write_msr(msr_kvm_wall_clock, low, high); | ||
58 | 61 | ||
59 | vcpu_time = &get_cpu_var(hv_clock); | 62 | vcpu_time = &get_cpu_var(hv_clock); |
60 | pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); | 63 | pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); |
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt) | |||
130 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); | 133 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); |
131 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", | 134 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", |
132 | cpu, high, low, txt); | 135 | cpu, high, low, txt); |
133 | return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); | 136 | |
137 | return native_write_msr_safe(msr_kvm_system_time, low, high); | ||
134 | } | 138 | } |
135 | 139 | ||
136 | #ifdef CONFIG_X86_LOCAL_APIC | 140 | #ifdef CONFIG_X86_LOCAL_APIC |
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void) | |||
165 | #ifdef CONFIG_KEXEC | 169 | #ifdef CONFIG_KEXEC |
166 | static void kvm_crash_shutdown(struct pt_regs *regs) | 170 | static void kvm_crash_shutdown(struct pt_regs *regs) |
167 | { | 171 | { |
168 | native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); | 172 | native_write_msr(msr_kvm_system_time, 0, 0); |
169 | native_machine_crash_shutdown(regs); | 173 | native_machine_crash_shutdown(regs); |
170 | } | 174 | } |
171 | #endif | 175 | #endif |
172 | 176 | ||
173 | static void kvm_shutdown(void) | 177 | static void kvm_shutdown(void) |
174 | { | 178 | { |
175 | native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); | 179 | native_write_msr(msr_kvm_system_time, 0, 0); |
176 | native_machine_shutdown(); | 180 | native_machine_shutdown(); |
177 | } | 181 | } |
178 | 182 | ||
@@ -181,27 +185,37 @@ void __init kvmclock_init(void) | |||
181 | if (!kvm_para_available()) | 185 | if (!kvm_para_available()) |
182 | return; | 186 | return; |
183 | 187 | ||
184 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { | 188 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { |
185 | if (kvm_register_clock("boot clock")) | 189 | msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; |
186 | return; | 190 | msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; |
187 | pv_time_ops.sched_clock = kvm_clock_read; | 191 | } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) |
188 | x86_platform.calibrate_tsc = kvm_get_tsc_khz; | 192 | return; |
189 | x86_platform.get_wallclock = kvm_get_wallclock; | 193 | |
190 | x86_platform.set_wallclock = kvm_set_wallclock; | 194 | printk(KERN_INFO "kvm-clock: Using msrs %x and %x\n", |
195 | msr_kvm_system_time, msr_kvm_wall_clock); | ||
196 | |||
197 | if (kvm_register_clock("boot clock")) | ||
198 | return; | ||
199 | pv_time_ops.sched_clock = kvm_clock_read; | ||
200 | x86_platform.calibrate_tsc = kvm_get_tsc_khz; | ||
201 | x86_platform.get_wallclock = kvm_get_wallclock; | ||
202 | x86_platform.set_wallclock = kvm_set_wallclock; | ||
191 | #ifdef CONFIG_X86_LOCAL_APIC | 203 | #ifdef CONFIG_X86_LOCAL_APIC |
192 | x86_cpuinit.setup_percpu_clockev = | 204 | x86_cpuinit.setup_percpu_clockev = |
193 | kvm_setup_secondary_clock; | 205 | kvm_setup_secondary_clock; |
194 | #endif | 206 | #endif |
195 | #ifdef CONFIG_SMP | 207 | #ifdef CONFIG_SMP |
196 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 208 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
197 | #endif | 209 | #endif |
198 | machine_ops.shutdown = kvm_shutdown; | 210 | machine_ops.shutdown = kvm_shutdown; |
199 | #ifdef CONFIG_KEXEC | 211 | #ifdef CONFIG_KEXEC |
200 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 212 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
201 | #endif | 213 | #endif |
202 | kvm_get_preset_lpj(); | 214 | kvm_get_preset_lpj(); |
203 | clocksource_register(&kvm_clock); | 215 | clocksource_register(&kvm_clock); |
204 | pv_info.paravirt_enabled = 1; | 216 | pv_info.paravirt_enabled = 1; |
205 | pv_info.name = "KVM"; | 217 | pv_info.name = "KVM"; |
206 | } | 218 | |
219 | if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) | ||
220 | pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); | ||
207 | } | 221 | } |
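kvmclock_init() above probes the new MSR pair (KVM_FEATURE_CLOCKSOURCE2) first and falls back to the legacy pair, so a single registration path serves both host ABIs. The selection logic in isolation, as a sketch that omits the no-kvmclock command-line gate; kvmclock_select_msrs() is hypothetical:

#include <asm/kvm_para.h>

/* Pick the MSR pair the host actually implements, newest first. */
static bool kvmclock_select_msrs(u32 *sys_msr, u32 *wall_msr)
{
	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
		*sys_msr  = MSR_KVM_SYSTEM_TIME_NEW;
		*wall_msr = MSR_KVM_WALL_CLOCK_NEW;
		return true;
	}
	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
		*sys_msr  = MSR_KVM_SYSTEM_TIME;
		*wall_msr = MSR_KVM_WALL_CLOCK;
		return true;
	}
	return false;	/* no paravirt clock; keep native timekeeping */
}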
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index cceb5bc3c3c2..fa6551d36c10 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size) | |||
201 | return error; | 201 | return error; |
202 | } | 202 | } |
203 | 203 | ||
204 | static int microcode_open(struct inode *unused1, struct file *unused2) | 204 | static int microcode_open(struct inode *inode, struct file *file) |
205 | { | 205 | { |
206 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; | 206 | return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; |
207 | } | 207 | } |
208 | 208 | ||
209 | static ssize_t microcode_write(struct file *file, const char __user *buf, | 209 | static ssize_t microcode_write(struct file *file, const char __user *buf, |
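Returning nonseekable_open() instead of 0 marks the microcode device as stream-like: it clears the FMODE_*SEEK bits so llseek() and pread()/pwrite() fail instead of silently succeeding. The same idiom fits any privileged, write-only control node; ctl_open() below is a hypothetical sketch:

#include <linux/fs.h>
#include <linux/capability.h>

/* Open handler for a privileged, non-seekable character device. */
static int ctl_open(struct inode *inode, struct file *file)
{
	if (!capable(CAP_SYS_RAWIO))
		return -EPERM;
	return nonseekable_open(inode, file);
}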
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void) | |||
260 | } | 260 | } |
261 | 261 | ||
262 | MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); | 262 | MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); |
263 | MODULE_ALIAS("devname:cpu/microcode"); | ||
263 | #else | 264 | #else |
264 | #define microcode_dev_init() 0 | 265 | #define microcode_dev_init() 0 |
265 | #define microcode_dev_exit() do { } while (0) | 266 | #define microcode_dev_exit() do { } while (0) |
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 85a343e28937..356170262a93 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c | |||
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
343 | int (*get_ucode_data)(void *, const void *, size_t)) | 343 | int (*get_ucode_data)(void *, const void *, size_t)) |
344 | { | 344 | { |
345 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 345 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
346 | u8 *ucode_ptr = data, *new_mc = NULL, *mc; | 346 | u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; |
347 | int new_rev = uci->cpu_sig.rev; | 347 | int new_rev = uci->cpu_sig.rev; |
348 | unsigned int leftover = size; | 348 | unsigned int leftover = size; |
349 | enum ucode_state state = UCODE_OK; | 349 | enum ucode_state state = UCODE_OK; |
350 | unsigned int curr_mc_size = 0; | ||
350 | 351 | ||
351 | while (leftover) { | 352 | while (leftover) { |
352 | struct microcode_header_intel mc_header; | 353 | struct microcode_header_intel mc_header; |
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
361 | break; | 362 | break; |
362 | } | 363 | } |
363 | 364 | ||
364 | mc = vmalloc(mc_size); | 365 | /* For performance reasons, reuse mc area when possible */ |
365 | if (!mc) | 366 | if (!mc || mc_size > curr_mc_size) { |
366 | break; | 367 | if (mc) |
368 | vfree(mc); | ||
369 | mc = vmalloc(mc_size); | ||
370 | if (!mc) | ||
371 | break; | ||
372 | curr_mc_size = mc_size; | ||
373 | } | ||
367 | 374 | ||
368 | if (get_ucode_data(mc, ucode_ptr, mc_size) || | 375 | if (get_ucode_data(mc, ucode_ptr, mc_size) || |
369 | microcode_sanity_check(mc) < 0) { | 376 | microcode_sanity_check(mc) < 0) { |
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
376 | vfree(new_mc); | 383 | vfree(new_mc); |
377 | new_rev = mc_header.rev; | 384 | new_rev = mc_header.rev; |
378 | new_mc = mc; | 385 | new_mc = mc; |
379 | } else | 386 | mc = NULL; /* trigger new vmalloc */ |
380 | vfree(mc); | 387 | } |
381 | 388 | ||
382 | ucode_ptr += mc_size; | 389 | ucode_ptr += mc_size; |
383 | leftover -= mc_size; | 390 | leftover -= mc_size; |
384 | } | 391 | } |
385 | 392 | ||
393 | if (mc) | ||
394 | vfree(mc); | ||
395 | |||
386 | if (leftover) { | 396 | if (leftover) { |
387 | if (new_mc) | 397 | if (new_mc) |
388 | vfree(new_mc); | 398 | vfree(new_mc); |
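generic_load_microcode() above now keeps one vmalloc'ed scratch buffer alive across loop iterations, reallocating only when a larger update arrives and handing the buffer over (mc = NULL) when it becomes the new best match. The grow-only idiom in isolation; ensure_buf() is a hypothetical helper:

#include <linux/vmalloc.h>

/* Grow-only scratch buffer: reallocate only when the request
 * exceeds current capacity. The caller vfree()s the result once. */
static void *ensure_buf(void *buf, unsigned int *curr_size,
			unsigned int want)
{
	if (buf && want <= *curr_size)
		return buf;
	vfree(buf);		/* vfree(NULL) is a no-op */
	*curr_size = 0;
	buf = vmalloc(want);
	if (buf)
		*curr_size = want;
	return buf;
}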
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e81030f71a8f..d86dbf7e54be 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m) | |||
115 | printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); | 115 | printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); |
116 | } | 116 | } |
117 | 117 | ||
118 | static int bad_ioapic(unsigned long address) | ||
119 | { | ||
120 | if (nr_ioapics >= MAX_IO_APICS) { | ||
121 | printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | ||
122 | "(found %d)\n", MAX_IO_APICS, nr_ioapics); | ||
123 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
124 | } | ||
125 | if (!address) { | ||
126 | printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | ||
127 | " found in table, skipping!\n"); | ||
128 | return 1; | ||
129 | } | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | static void __init MP_ioapic_info(struct mpc_ioapic *m) | 118 | static void __init MP_ioapic_info(struct mpc_ioapic *m) |
134 | { | 119 | { |
135 | if (!(m->flags & MPC_APIC_USABLE)) | 120 | if (!(m->flags & MPC_APIC_USABLE)) |
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m) | |||
138 | printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", | 123 | printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", |
139 | m->apicid, m->apicver, m->apicaddr); | 124 | m->apicid, m->apicver, m->apicaddr); |
140 | 125 | ||
141 | if (bad_ioapic(m->apicaddr)) | 126 | mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); |
142 | return; | ||
143 | |||
144 | mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; | ||
145 | mp_ioapics[nr_ioapics].apicid = m->apicid; | ||
146 | mp_ioapics[nr_ioapics].type = m->type; | ||
147 | mp_ioapics[nr_ioapics].apicver = m->apicver; | ||
148 | mp_ioapics[nr_ioapics].flags = m->flags; | ||
149 | nr_ioapics++; | ||
150 | } | 127 | } |
151 | 128 | ||
152 | static void print_MP_intsrc_info(struct mpc_intsrc *m) | 129 | static void print_MP_intsrc_info(struct mpc_intsrc *m) |
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 0aad8670858e..79ae68154e87 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c | |||
@@ -25,8 +25,34 @@ | |||
25 | #include <asm/i8259.h> | 25 | #include <asm/i8259.h> |
26 | #include <asm/apb_timer.h> | 26 | #include <asm/apb_timer.h> |
27 | 27 | ||
28 | /* | ||
29 | * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, | ||
30 | * cmdline option x86_mrst_timer can be used to override the configuration | ||
31 | * to prefer one or the other. | ||
32 | * at runtime, there are basically three timer configurations: | ||
33 | * 1. per cpu apbt clock only | ||
34 | * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only | ||
35 | * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. | ||
36 | * | ||
37 | * by default (without cmdline option), platform code first detects cpu type | ||
38 | * to see if we are on lincroft or penwell, then set up both lapic or apbt | ||
39 | * clocks accordingly. | ||
40 | * i.e. by default, medfield uses configuration #2, moorestown uses #1. | ||
41 | * config #3 is supported but not recommended on medfield. | ||
42 | * | ||
43 | * rating and feature summary: | ||
44 | * lapic (with C3STOP) --------- 100 | ||
45 | * apbt (always-on) ------------ 110 | ||
46 | * lapic (always-on,ARAT) ------ 150 | ||
47 | */ | ||
48 | |||
49 | __cpuinitdata enum mrst_timer_options mrst_timer_options; | ||
50 | |||
28 | static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; | 51 | static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; |
29 | static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; | 52 | static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; |
53 | enum mrst_cpu_type __mrst_cpu_chip; | ||
54 | EXPORT_SYMBOL_GPL(__mrst_cpu_chip); | ||
55 | |||
30 | int sfi_mtimer_num; | 56 | int sfi_mtimer_num; |
31 | 57 | ||
32 | struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; | 58 | struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; |
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) | |||
167 | return 0; | 193 | return 0; |
168 | } | 194 | } |
169 | 195 | ||
170 | /* | ||
171 | * the secondary clock in Moorestown can be APBT or LAPIC clock, default to | ||
172 | * APBT but cmdline option can also override it. | ||
173 | */ | ||
174 | static void __cpuinit mrst_setup_secondary_clock(void) | ||
175 | { | ||
176 | /* restore default lapic clock if disabled by cmdline */ | ||
177 | if (disable_apbt_percpu) | ||
178 | return setup_secondary_APIC_clock(); | ||
179 | apbt_setup_secondary_clock(); | ||
180 | } | ||
181 | |||
182 | static unsigned long __init mrst_calibrate_tsc(void) | 196 | static unsigned long __init mrst_calibrate_tsc(void) |
183 | { | 197 | { |
184 | unsigned long flags, fast_calibrate; | 198 | unsigned long flags, fast_calibrate; |
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void) | |||
195 | 209 | ||
196 | void __init mrst_time_init(void) | 210 | void __init mrst_time_init(void) |
197 | { | 211 | { |
212 | switch (mrst_timer_options) { | ||
213 | case MRST_TIMER_APBT_ONLY: | ||
214 | break; | ||
215 | case MRST_TIMER_LAPIC_APBT: | ||
216 | x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; | ||
217 | x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; | ||
218 | break; | ||
219 | default: | ||
220 | if (!boot_cpu_has(X86_FEATURE_ARAT)) | ||
221 | break; | ||
222 | x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; | ||
223 | x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; | ||
224 | return; | ||
225 | } | ||
226 | /* we need at least one APB timer */ | ||
198 | sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); | 227 | sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); |
199 | pre_init_apic_IRQ0(); | 228 | pre_init_apic_IRQ0(); |
200 | apbt_time_init(); | 229 | apbt_time_init(); |
@@ -205,16 +234,27 @@ void __init mrst_rtc_init(void) | |||
205 | sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); | 234 | sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); |
206 | } | 235 | } |
207 | 236 | ||
208 | /* | 237 | void __cpuinit mrst_arch_setup(void) |
209 | * if we use per cpu apb timer, the bootclock already setup. if we use lapic | ||
210 | * timer and one apbt timer for broadcast, we need to set up lapic boot clock. | ||
211 | */ | ||
212 | static void __init mrst_setup_boot_clock(void) | ||
213 | { | 238 | { |
214 | pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); | 239 | if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) |
215 | if (disable_apbt_percpu) | 240 | __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; |
216 | setup_boot_APIC_clock(); | 241 | else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) |
217 | }; | 242 | __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; |
243 | else { | ||
244 | pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", | ||
245 | boot_cpu_data.x86, boot_cpu_data.x86_model); | ||
246 | __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; | ||
247 | } | ||
248 | pr_debug("Moorestown CPU %s identified\n", | ||
249 | (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? | ||
250 | "Lincroft" : "Penwell"); | ||
251 | } | ||
252 | |||
253 | /* MID systems don't have i8042 controller */ | ||
254 | static int mrst_i8042_detect(void) | ||
255 | { | ||
256 | return 0; | ||
257 | } | ||
218 | 258 | ||
219 | /* | 259 | /* |
220 | * Moorestown specific x86_init function overrides and early setup | 260 | * Moorestown specific x86_init function overrides and early setup |
@@ -226,15 +266,46 @@ void __init x86_mrst_early_setup(void) | |||
226 | x86_init.resources.reserve_resources = x86_init_noop; | 266 | x86_init.resources.reserve_resources = x86_init_noop; |
227 | 267 | ||
228 | x86_init.timers.timer_init = mrst_time_init; | 268 | x86_init.timers.timer_init = mrst_time_init; |
229 | x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; | 269 | x86_init.timers.setup_percpu_clockev = x86_init_noop; |
230 | 270 | ||
231 | x86_init.irqs.pre_vector_init = x86_init_noop; | 271 | x86_init.irqs.pre_vector_init = x86_init_noop; |
232 | 272 | ||
233 | x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; | 273 | x86_init.oem.arch_setup = mrst_arch_setup; |
274 | |||
275 | x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; | ||
234 | 276 | ||
235 | x86_platform.calibrate_tsc = mrst_calibrate_tsc; | 277 | x86_platform.calibrate_tsc = mrst_calibrate_tsc; |
278 | x86_platform.i8042_detect = mrst_i8042_detect; | ||
236 | x86_init.pci.init = pci_mrst_init; | 279 | x86_init.pci.init = pci_mrst_init; |
237 | x86_init.pci.fixup_irqs = x86_init_noop; | 280 | x86_init.pci.fixup_irqs = x86_init_noop; |
238 | 281 | ||
239 | legacy_pic = &null_legacy_pic; | 282 | legacy_pic = &null_legacy_pic; |
283 | |||
284 | /* Avoid searching for BIOS MP tables */ | ||
285 | x86_init.mpparse.find_smp_config = x86_init_noop; | ||
286 | x86_init.mpparse.get_smp_config = x86_init_uint_noop; | ||
287 | |||
288 | } | ||
289 | |||
290 | /* | ||
291 | * if user does not want to use per CPU apb timer, just give it a lower rating | ||
292 | * than local apic timer and skip the late per cpu timer init. | ||
293 | */ | ||
294 | static inline int __init setup_x86_mrst_timer(char *arg) | ||
295 | { | ||
296 | if (!arg) | ||
297 | return -EINVAL; | ||
298 | |||
299 | if (strcmp("apbt_only", arg) == 0) | ||
300 | mrst_timer_options = MRST_TIMER_APBT_ONLY; | ||
301 | else if (strcmp("lapic_and_apbt", arg) == 0) | ||
302 | mrst_timer_options = MRST_TIMER_LAPIC_APBT; | ||
303 | else { | ||
304 | pr_warning("X86 MRST timer option %s not recognised;" | ||
305 | " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", | ||
306 | arg); | ||
307 | return -EINVAL; | ||
308 | } | ||
309 | return 0; | ||
240 | } | 310 | } |
311 | __setup("x86_mrst_timer=", setup_x86_mrst_timer); | ||
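setup_x86_mrst_timer() is registered with __setup(), so the override is parsed from the kernel command line before mrst_time_init() runs. How the option maps onto the three configurations described in the comment block above, as a usage note:

/*
 * Boot-loader command line examples:
 *
 *   x86_mrst_timer=apbt_only       -> config #1: per-cpu APB timers only
 *   x86_mrst_timer=lapic_and_apbt  -> config #3: LAPIC clocks plus one
 *                                     APB timer for broadcast
 *   (no option)                    -> auto-detect; CPUs with ARAT
 *                                     (Penwell) take the always-on
 *                                     LAPIC path, config #2
 */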
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4d4468e9f47c..7bf2dc4c8f70 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, | |||
230 | msr_device_destroy(cpu); | 230 | msr_device_destroy(cpu); |
231 | break; | 231 | break; |
232 | } | 232 | } |
233 | return err ? NOTIFY_BAD : NOTIFY_OK; | 233 | return notifier_from_errno(err); |
234 | } | 234 | } |
235 | 235 | ||
236 | static struct notifier_block __refdata msr_class_cpu_notifier = { | 236 | static struct notifier_block __refdata msr_class_cpu_notifier = { |
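notifier_from_errno() replaces the lossy err ? NOTIFY_BAD : NOTIFY_OK, encoding the errno into the notifier return value so the core can recover it later via notifier_to_errno(). For reference, the helper is defined in <linux/notifier.h> roughly as:

static inline int notifier_from_errno(int err)
{
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_BAD - err);

	return NOTIFY_OK;
}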
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b3..0e0cdde519be 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c | |||
@@ -21,10 +21,7 @@ | |||
21 | #include <asm/geode.h> | 21 | #include <asm/geode.h> |
22 | #include <asm/setup.h> | 22 | #include <asm/setup.h> |
23 | #include <asm/olpc.h> | 23 | #include <asm/olpc.h> |
24 | 24 | #include <asm/olpc_ofw.h> | |
25 | #ifdef CONFIG_OPEN_FIRMWARE | ||
26 | #include <asm/ofw.h> | ||
27 | #endif | ||
28 | 25 | ||
29 | struct olpc_platform_t olpc_platform_info; | 26 | struct olpc_platform_t olpc_platform_info; |
30 | EXPORT_SYMBOL_GPL(olpc_platform_info); | 27 | EXPORT_SYMBOL_GPL(olpc_platform_info); |
@@ -145,7 +142,7 @@ restart: | |||
145 | * The OBF flag will sometimes misbehave due to what we believe | 142 | * The OBF flag will sometimes misbehave due to what we believe |
146 | * is a hardware quirk.. | 143 | * is a hardware quirk.. |
147 | */ | 144 | */ |
148 | printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); | 145 | pr_devel("olpc-ec: running cmd 0x%x\n", cmd); |
149 | outb(cmd, 0x6c); | 146 | outb(cmd, 0x6c); |
150 | 147 | ||
151 | if (wait_on_ibf(0x6c, 0)) { | 148 | if (wait_on_ibf(0x6c, 0)) { |
@@ -162,8 +159,7 @@ restart: | |||
162 | " EC accept data!\n"); | 159 | " EC accept data!\n"); |
163 | goto err; | 160 | goto err; |
164 | } | 161 | } |
165 | printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", | 162 | pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); |
166 | inbuf[i]); | ||
167 | outb(inbuf[i], 0x68); | 163 | outb(inbuf[i], 0x68); |
168 | } | 164 | } |
169 | } | 165 | } |
@@ -176,8 +172,7 @@ restart: | |||
176 | goto restart; | 172 | goto restart; |
177 | } | 173 | } |
178 | outbuf[i] = inb(0x68); | 174 | outbuf[i] = inb(0x68); |
179 | printk(KERN_DEBUG "olpc-ec: received 0x%x\n", | 175 | pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); |
180 | outbuf[i]); | ||
181 | } | 176 | } |
182 | } | 177 | } |
183 | 178 | ||
@@ -188,14 +183,15 @@ err: | |||
188 | } | 183 | } |
189 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); | 184 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); |
190 | 185 | ||
191 | #ifdef CONFIG_OPEN_FIRMWARE | 186 | #ifdef CONFIG_OLPC_OPENFIRMWARE |
192 | static void __init platform_detect(void) | 187 | static void __init platform_detect(void) |
193 | { | 188 | { |
194 | size_t propsize; | 189 | size_t propsize; |
195 | __be32 rev; | 190 | __be32 rev; |
191 | const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; | ||
192 | void *res[] = { &propsize }; | ||
196 | 193 | ||
197 | if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, | 194 | if (olpc_ofw("getprop", args, res) || propsize != 4) { |
198 | &propsize) || propsize != 4) { | ||
199 | printk(KERN_ERR "ofw: getprop call failed!\n"); | 195 | printk(KERN_ERR "ofw: getprop call failed!\n"); |
200 | rev = cpu_to_be32(0); | 196 | rev = cpu_to_be32(0); |
201 | } | 197 | } |
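The EC command path's printk(KERN_DEBUG ...) calls become pr_devel(), which compiles to nothing unless the file is built with DEBUG defined, keeping the polling loop free of logging overhead in production builds. The definition is roughly the following (from <linux/kernel.h> of this era; shown for reference):

#ifdef DEBUG
#define pr_devel(fmt, ...) \
	printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel(fmt, ...) \
	({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; })
#endif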
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 000000000000..3218aa71ab5e --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c | |||
@@ -0,0 +1,106 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <asm/page.h> | ||
5 | #include <asm/setup.h> | ||
6 | #include <asm/io.h> | ||
7 | #include <asm/pgtable.h> | ||
8 | #include <asm/olpc_ofw.h> | ||
9 | |||
10 | /* address of OFW callback interface; will be NULL if OFW isn't found */ | ||
11 | static int (*olpc_ofw_cif)(int *); | ||
12 | |||
13 | /* page dir entry containing OFW's pgdir table; filled in by head_32.S */ | ||
14 | u32 olpc_ofw_pgd __initdata; | ||
15 | |||
16 | static DEFINE_SPINLOCK(ofw_lock); | ||
17 | |||
18 | #define MAXARGS 10 | ||
19 | |||
20 | void __init setup_olpc_ofw_pgd(void) | ||
21 | { | ||
22 | pgd_t *base, *ofw_pde; | ||
23 | |||
24 | if (!olpc_ofw_cif) | ||
25 | return; | ||
26 | |||
27 | /* fetch OFW's PDE */ | ||
28 | base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); | ||
29 | if (!base) { | ||
30 | printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); | ||
31 | olpc_ofw_cif = NULL; | ||
32 | return; | ||
33 | } | ||
34 | ofw_pde = &base[OLPC_OFW_PDE_NR]; | ||
35 | |||
36 | /* install OFW's PDE permanently into the kernel's pgtable */ | ||
37 | set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); | ||
38 | /* implicit optimization barrier here due to the non-inlined function return */ | ||
39 | |||
40 | early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); | ||
41 | } | ||
42 | |||
43 | int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, | ||
44 | void **res) | ||
45 | { | ||
46 | int ofw_args[MAXARGS + 3]; | ||
47 | unsigned long flags; | ||
48 | int ret, i, *p; | ||
49 | |||
50 | BUG_ON(nr_args + nr_res > MAXARGS); | ||
51 | |||
52 | if (!olpc_ofw_cif) | ||
53 | return -EIO; | ||
54 | |||
55 | ofw_args[0] = (int)name; | ||
56 | ofw_args[1] = nr_args; | ||
57 | ofw_args[2] = nr_res; | ||
58 | |||
59 | p = &ofw_args[3]; | ||
60 | for (i = 0; i < nr_args; i++, p++) | ||
61 | *p = (int)args[i]; | ||
62 | |||
63 | /* call into ofw */ | ||
64 | spin_lock_irqsave(&ofw_lock, flags); | ||
65 | ret = olpc_ofw_cif(ofw_args); | ||
66 | spin_unlock_irqrestore(&ofw_lock, flags); | ||
67 | |||
68 | if (!ret) { | ||
69 | for (i = 0; i < nr_res; i++, p++) | ||
70 | *((int *)res[i]) = *p; | ||
71 | } | ||
72 | |||
73 | return ret; | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(__olpc_ofw); | ||
76 | |||
77 | /* OFW cif _should_ be above this address */ | ||
78 | #define OFW_MIN 0xff000000 | ||
79 | |||
80 | /* OFW starts on a 1MB boundary */ | ||
81 | #define OFW_BOUND (1<<20) | ||
82 | |||
83 | void __init olpc_ofw_detect(void) | ||
84 | { | ||
85 | struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; | ||
86 | unsigned long start; | ||
87 | |||
88 | /* ensure OFW booted us by checking for "OFW " string */ | ||
89 | if (hdr->ofw_magic != OLPC_OFW_SIG) | ||
90 | return; | ||
91 | |||
92 | olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; | ||
93 | |||
94 | if ((unsigned long)olpc_ofw_cif < OFW_MIN) { | ||
95 | printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", | ||
96 | (unsigned long)olpc_ofw_cif); | ||
97 | olpc_ofw_cif = NULL; | ||
98 | return; | ||
99 | } | ||
100 | |||
101 | /* determine where OFW starts in memory */ | ||
102 | start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); | ||
103 | printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", | ||
104 | (unsigned long)olpc_ofw_cif, (-start) >> 20); | ||
105 | reserve_top_address(-start); | ||
106 | } | ||
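__olpc_ofw() flattens a call name, an argument array, and a result array into the single int block the OFW client interface expects: { name, nr_args, nr_res, args..., results... }. The olpc_ofw() wrapper seen in platform_detect() above presumably derives the counts from the array sizes; a hypothetical reconstruction of <asm/olpc_ofw.h>, with a usage example mirroring that caller:

#include <linux/kernel.h>	/* ARRAY_SIZE() */

#define olpc_ofw(name, args, res) \
	__olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)

/* Usage, as in platform_detect():
 *
 *	__be32 rev;
 *	size_t propsize;
 *	const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
 *	void *res[] = { &propsize };
 *
 *	if (olpc_ofw("getprop", args, res) || propsize != 4)
 *		printk(KERN_ERR "ofw: getprop call failed!\n");
 */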
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index fb99f7edb341..078d4ec1a9d9 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -103,11 +103,16 @@ int use_calgary __read_mostly = 0; | |||
103 | #define PMR_SOFTSTOPFAULT 0x40000000 | 103 | #define PMR_SOFTSTOPFAULT 0x40000000 |
104 | #define PMR_HARDSTOP 0x20000000 | 104 | #define PMR_HARDSTOP 0x20000000 |
105 | 105 | ||
106 | #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ | 106 | /* |
107 | #define MAX_NUM_CHASSIS 8 /* max number of chassis */ | 107 | * The maximum PHB bus number. |
108 | /* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ | 108 | * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 |
109 | #define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) | 109 | * x3950M2: 4 chassis, 48 PHBs per chassis = 192 |
110 | #define PHBS_PER_CALGARY 4 | 110 | * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 |
111 | * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 | ||
112 | */ | ||
113 | #define MAX_PHB_BUS_NUM 256 | ||
114 | |||
115 | #define PHBS_PER_CALGARY 4 | ||
111 | 116 | ||
112 | /* register offsets in Calgary's internal register space */ | 117 | /* register offsets in Calgary's internal register space */ |
113 | static const unsigned long tar_offsets[] = { | 118 | static const unsigned long tar_offsets[] = { |
@@ -1051,8 +1056,6 @@ static int __init calgary_init_one(struct pci_dev *dev) | |||
1051 | struct iommu_table *tbl; | 1056 | struct iommu_table *tbl; |
1052 | int ret; | 1057 | int ret; |
1053 | 1058 | ||
1054 | BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); | ||
1055 | |||
1056 | bbar = busno_to_bbar(dev->bus->number); | 1059 | bbar = busno_to_bbar(dev->bus->number); |
1057 | ret = calgary_setup_tar(dev, bbar); | 1060 | ret = calgary_setup_tar(dev, bbar); |
1058 | if (ret) | 1061 | if (ret) |
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 7d2829dde20e..a5bc528d4328 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c | |||
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = { | |||
31 | .free_coherent = swiotlb_free_coherent, | 31 | .free_coherent = swiotlb_free_coherent, |
32 | .sync_single_for_cpu = swiotlb_sync_single_for_cpu, | 32 | .sync_single_for_cpu = swiotlb_sync_single_for_cpu, |
33 | .sync_single_for_device = swiotlb_sync_single_for_device, | 33 | .sync_single_for_device = swiotlb_sync_single_for_device, |
34 | .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, | ||
35 | .sync_single_range_for_device = swiotlb_sync_single_range_for_device, | ||
36 | .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, | 34 | .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, |
37 | .sync_sg_for_device = swiotlb_sync_sg_for_device, | 35 | .sync_sg_for_device = swiotlb_sync_sg_for_device, |
38 | .map_sg = swiotlb_map_sg_attrs, | 36 | .map_sg = swiotlb_map_sg_attrs, |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0415c3ef91b5..d401f1d2d06e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <asm/idle.h> | 20 | #include <asm/idle.h> |
21 | #include <asm/uaccess.h> | 21 | #include <asm/uaccess.h> |
22 | #include <asm/i387.h> | 22 | #include <asm/i387.h> |
23 | #include <asm/ds.h> | ||
24 | #include <asm/debugreg.h> | 23 | #include <asm/debugreg.h> |
25 | 24 | ||
26 | unsigned long idle_halt; | 25 | unsigned long idle_halt; |
@@ -29,29 +28,26 @@ unsigned long idle_nomwait; | |||
29 | EXPORT_SYMBOL(idle_nomwait); | 28 | EXPORT_SYMBOL(idle_nomwait); |
30 | 29 | ||
31 | struct kmem_cache *task_xstate_cachep; | 30 | struct kmem_cache *task_xstate_cachep; |
31 | EXPORT_SYMBOL_GPL(task_xstate_cachep); | ||
32 | 32 | ||
33 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 33 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
34 | { | 34 | { |
35 | int ret; | ||
36 | |||
35 | *dst = *src; | 37 | *dst = *src; |
36 | if (src->thread.xstate) { | 38 | if (fpu_allocated(&src->thread.fpu)) { |
37 | dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, | 39 | memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); |
38 | GFP_KERNEL); | 40 | ret = fpu_alloc(&dst->thread.fpu); |
39 | if (!dst->thread.xstate) | 41 | if (ret) |
40 | return -ENOMEM; | 42 | return ret; |
41 | WARN_ON((unsigned long)dst->thread.xstate & 15); | 43 | fpu_copy(&dst->thread.fpu, &src->thread.fpu); |
42 | memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); | ||
43 | } | 44 | } |
44 | return 0; | 45 | return 0; |
45 | } | 46 | } |
46 | 47 | ||
47 | void free_thread_xstate(struct task_struct *tsk) | 48 | void free_thread_xstate(struct task_struct *tsk) |
48 | { | 49 | { |
49 | if (tsk->thread.xstate) { | 50 | fpu_free(&tsk->thread.fpu); |
50 | kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); | ||
51 | tsk->thread.xstate = NULL; | ||
52 | } | ||
53 | |||
54 | WARN(tsk->thread.ds_ctx, "leaking DS context\n"); | ||
55 | } | 51 | } |
56 | 52 | ||
57 | void free_thread_info(struct thread_info *ti) | 53 | void free_thread_info(struct thread_info *ti) |
@@ -198,11 +194,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | |||
198 | prev = &prev_p->thread; | 194 | prev = &prev_p->thread; |
199 | next = &next_p->thread; | 195 | next = &next_p->thread; |
200 | 196 | ||
201 | if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || | 197 | if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ |
202 | test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) | 198 | test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { |
203 | ds_switch_to(prev_p, next_p); | 199 | unsigned long debugctl = get_debugctlmsr(); |
204 | else if (next->debugctlmsr != prev->debugctlmsr) | 200 | |
205 | update_debugctlmsr(next->debugctlmsr); | 201 | debugctl &= ~DEBUGCTLMSR_BTF; |
202 | if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) | ||
203 | debugctl |= DEBUGCTLMSR_BTF; | ||
204 | |||
205 | update_debugctlmsr(debugctl); | ||
206 | } | ||
206 | 207 | ||
207 | if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ | 208 | if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ |
208 | test_tsk_thread_flag(next_p, TIF_NOTSC)) { | 209 | test_tsk_thread_flag(next_p, TIF_NOTSC)) { |
@@ -371,7 +372,7 @@ static inline int hlt_use_halt(void) | |||
371 | void default_idle(void) | 372 | void default_idle(void) |
372 | { | 373 | { |
373 | if (hlt_use_halt()) { | 374 | if (hlt_use_halt()) { |
374 | trace_power_start(POWER_CSTATE, 1); | 375 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); |
375 | current_thread_info()->status &= ~TS_POLLING; | 376 | current_thread_info()->status &= ~TS_POLLING; |
376 | /* | 377 | /* |
377 | * TS_POLLING-cleared state must be visible before we | 378 | * TS_POLLING-cleared state must be visible before we |
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); | |||
441 | */ | 442 | */ |
442 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | 443 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) |
443 | { | 444 | { |
444 | trace_power_start(POWER_CSTATE, (ax>>4)+1); | 445 | trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); |
445 | if (!need_resched()) { | 446 | if (!need_resched()) { |
446 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 447 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) |
447 | clflush((void *)¤t_thread_info()->flags); | 448 | clflush((void *)¤t_thread_info()->flags); |
@@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | |||
457 | static void mwait_idle(void) | 458 | static void mwait_idle(void) |
458 | { | 459 | { |
459 | if (!need_resched()) { | 460 | if (!need_resched()) { |
460 | trace_power_start(POWER_CSTATE, 1); | 461 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); |
461 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 462 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) |
462 | clflush((void *)¤t_thread_info()->flags); | 463 | clflush((void *)¤t_thread_info()->flags); |
463 | 464 | ||
@@ -478,7 +479,7 @@ static void mwait_idle(void) | |||
478 | */ | 479 | */ |
479 | static void poll_idle(void) | 480 | static void poll_idle(void) |
480 | { | 481 | { |
481 | trace_power_start(POWER_CSTATE, 0); | 482 | trace_power_start(POWER_CSTATE, 0, smp_processor_id()); |
482 | local_irq_enable(); | 483 | local_irq_enable(); |
483 | while (!need_resched()) | 484 | while (!need_resched()) |
484 | cpu_relax(); | 485 | cpu_relax(); |
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) | |||
525 | return (edx & MWAIT_EDX_C1); | 526 | return (edx & MWAIT_EDX_C1); |
526 | } | 527 | } |
527 | 528 | ||
528 | /* | 529 | bool c1e_detected; |
529 | * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. | 530 | EXPORT_SYMBOL(c1e_detected); |
530 | * For more information see | ||
531 | * - Erratum #400 for NPT family 0xf and family 0x10 CPUs | ||
532 | * - Erratum #365 for family 0x11 (not affected because C1e not in use) | ||
533 | */ | ||
534 | static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) | ||
535 | { | ||
536 | u64 val; | ||
537 | if (c->x86_vendor != X86_VENDOR_AMD) | ||
538 | goto no_c1e_idle; | ||
539 | |||
540 | /* Family 0x0f models < rev F do not have C1E */ | ||
541 | if (c->x86 == 0x0F && c->x86_model >= 0x40) | ||
542 | return 1; | ||
543 | |||
544 | if (c->x86 == 0x10) { | ||
545 | /* | ||
546 | * check OSVW bit for CPUs that are not affected | ||
547 | * by erratum #400 | ||
548 | */ | ||
549 | if (cpu_has(c, X86_FEATURE_OSVW)) { | ||
550 | rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); | ||
551 | if (val >= 2) { | ||
552 | rdmsrl(MSR_AMD64_OSVW_STATUS, val); | ||
553 | if (!(val & BIT(1))) | ||
554 | goto no_c1e_idle; | ||
555 | } | ||
556 | } | ||
557 | return 1; | ||
558 | } | ||
559 | |||
560 | no_c1e_idle: | ||
561 | return 0; | ||
562 | } | ||
563 | 531 | ||
564 | static cpumask_var_t c1e_mask; | 532 | static cpumask_var_t c1e_mask; |
565 | static int c1e_detected; | ||
566 | 533 | ||
567 | void c1e_remove_cpu(int cpu) | 534 | void c1e_remove_cpu(int cpu) |
568 | { | 535 | { |
@@ -584,12 +551,12 @@ static void c1e_idle(void) | |||
584 | u32 lo, hi; | 551 | u32 lo, hi; |
585 | 552 | ||
586 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); | 553 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); |
554 | |||
587 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { | 555 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { |
588 | c1e_detected = 1; | 556 | c1e_detected = true; |
589 | if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) | 557 | if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) |
590 | mark_tsc_unstable("TSC halt in AMD C1E"); | 558 | mark_tsc_unstable("TSC halt in AMD C1E"); |
591 | printk(KERN_INFO "System has AMD C1E enabled\n"); | 559 | printk(KERN_INFO "System has AMD C1E enabled\n"); |
592 | set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); | ||
593 | } | 560 | } |
594 | } | 561 | } |
595 | 562 | ||
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | |||
638 | */ | 605 | */ |
639 | printk(KERN_INFO "using mwait in idle threads.\n"); | 606 | printk(KERN_INFO "using mwait in idle threads.\n"); |
640 | pm_idle = mwait_idle; | 607 | pm_idle = mwait_idle; |
641 | } else if (check_c1e_idle(c)) { | 608 | } else if (cpu_has_amd_erratum(amd_erratum_400)) { |
609 | /* E400: APIC timer interrupt does not wake up CPU from C1e */ | ||
642 | printk(KERN_INFO "using C1E aware idle routine\n"); | 610 | printk(KERN_INFO "using C1E aware idle routine\n"); |
643 | pm_idle = c1e_idle; | 611 | pm_idle = c1e_idle; |
644 | } else | 612 | } else |
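arch_dup_task_struct() and free_thread_xstate() above now go through an fpu_*() accessor family instead of open-coding kmem_cache handling of thread.xstate. A hedged sketch of what the allocation side presumably looks like, built on the task_xstate_cachep that the hunk exports:

#include <linux/slab.h>

static inline bool fpu_allocated(struct fpu *fpu)
{
	return fpu->state != NULL;
}

static inline int fpu_alloc(struct fpu *fpu)
{
	if (fpu_allocated(fpu))
		return 0;
	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
	if (!fpu->state)
		return -ENOMEM;
	WARN_ON((unsigned long)fpu->state & 15);	/* 16-byte aligned */
	return 0;
}

static inline void fpu_free(struct fpu *fpu)
{
	if (fpu_allocated(fpu)) {
		kmem_cache_free(task_xstate_cachep, fpu->state);
		fpu->state = NULL;
	}
}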
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f6c62667e30c..96586c3cbbbf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -55,9 +55,10 @@ | |||
55 | #include <asm/cpu.h> | 55 | #include <asm/cpu.h> |
56 | #include <asm/idle.h> | 56 | #include <asm/idle.h> |
57 | #include <asm/syscalls.h> | 57 | #include <asm/syscalls.h> |
58 | #include <asm/ds.h> | ||
59 | #include <asm/debugreg.h> | 58 | #include <asm/debugreg.h> |
60 | 59 | ||
60 | #include <trace/events/power.h> | ||
61 | |||
61 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 62 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
62 | 63 | ||
63 | /* | 64 | /* |
@@ -112,6 +113,8 @@ void cpu_idle(void) | |||
112 | stop_critical_timings(); | 113 | stop_critical_timings(); |
113 | pm_idle(); | 114 | pm_idle(); |
114 | start_critical_timings(); | 115 | start_critical_timings(); |
116 | |||
117 | trace_power_end(smp_processor_id()); | ||
115 | } | 118 | } |
116 | tick_nohz_restart_sched_tick(); | 119 | tick_nohz_restart_sched_tick(); |
117 | preempt_enable_no_resched(); | 120 | preempt_enable_no_resched(); |
@@ -238,13 +241,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
238 | kfree(p->thread.io_bitmap_ptr); | 241 | kfree(p->thread.io_bitmap_ptr); |
239 | p->thread.io_bitmap_max = 0; | 242 | p->thread.io_bitmap_max = 0; |
240 | } | 243 | } |
241 | |||
242 | clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); | ||
243 | p->thread.ds_ctx = NULL; | ||
244 | |||
245 | clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); | ||
246 | p->thread.debugctlmsr = 0; | ||
247 | |||
248 | return err; | 244 | return err; |
249 | } | 245 | } |
250 | 246 | ||
@@ -317,7 +313,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
317 | 313 | ||
318 | /* we're going to use this soon, after a few expensive things */ | 314 | /* we're going to use this soon, after a few expensive things */ |
319 | if (preload_fpu) | 315 | if (preload_fpu) |
320 | prefetch(next->xstate); | 316 | prefetch(next->fpu.state); |
321 | 317 | ||
322 | /* | 318 | /* |
323 | * Reload esp0. | 319 | * Reload esp0. |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 17cb3295cbf7..3d9ea531ddd1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -49,9 +49,10 @@ | |||
49 | #include <asm/ia32.h> | 49 | #include <asm/ia32.h> |
50 | #include <asm/idle.h> | 50 | #include <asm/idle.h> |
51 | #include <asm/syscalls.h> | 51 | #include <asm/syscalls.h> |
52 | #include <asm/ds.h> | ||
53 | #include <asm/debugreg.h> | 52 | #include <asm/debugreg.h> |
54 | 53 | ||
54 | #include <trace/events/power.h> | ||
55 | |||
55 | asmlinkage extern void ret_from_fork(void); | 56 | asmlinkage extern void ret_from_fork(void); |
56 | 57 | ||
57 | DEFINE_PER_CPU(unsigned long, old_rsp); | 58 | DEFINE_PER_CPU(unsigned long, old_rsp); |
@@ -139,6 +140,9 @@ void cpu_idle(void) | |||
139 | stop_critical_timings(); | 140 | stop_critical_timings(); |
140 | pm_idle(); | 141 | pm_idle(); |
141 | start_critical_timings(); | 142 | start_critical_timings(); |
143 | |||
144 | trace_power_end(smp_processor_id()); | ||
145 | |||
142 | /* In many cases the interrupt that ended idle | 146 | /* In many cases the interrupt that ended idle |
143 | has already called exit_idle. But some idle | 147 | has already called exit_idle. But some idle |
144 | loops can be woken up without interrupt. */ | 148 | loops can be woken up without interrupt. */ |
@@ -313,13 +317,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
313 | if (err) | 317 | if (err) |
314 | goto out; | 318 | goto out; |
315 | } | 319 | } |
316 | |||
317 | clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); | ||
318 | p->thread.ds_ctx = NULL; | ||
319 | |||
320 | clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); | ||
321 | p->thread.debugctlmsr = 0; | ||
322 | |||
323 | err = 0; | 320 | err = 0; |
324 | out: | 321 | out: |
325 | if (err && p->thread.io_bitmap_ptr) { | 322 | if (err && p->thread.io_bitmap_ptr) { |
@@ -396,7 +393,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
396 | 393 | ||
397 | /* we're going to use this soon, after a few expensive things */ | 394 | /* we're going to use this soon, after a few expensive things */ |
398 | if (preload_fpu) | 395 | if (preload_fpu) |
399 | prefetch(next->xstate); | 396 | prefetch(next->fpu.state); |
400 | 397 | ||
401 | /* | 398 | /* |
402 | * Reload esp0, LDT and the page table pointer: | 399 | * Reload esp0, LDT and the page table pointer: |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2e9b55027b7e..70c4872cd8aa 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -2,9 +2,6 @@ | |||
2 | /* | 2 | /* |
3 | * Pentium III FXSR, SSE support | 3 | * Pentium III FXSR, SSE support |
4 | * Gareth Hughes <gareth@valinux.com>, May 2000 | 4 | * Gareth Hughes <gareth@valinux.com>, May 2000 |
5 | * | ||
6 | * BTS tracing | ||
7 | * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 | ||
8 | */ | 5 | */ |
9 | 6 | ||
10 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
@@ -22,7 +19,6 @@ | |||
22 | #include <linux/audit.h> | 19 | #include <linux/audit.h> |
23 | #include <linux/seccomp.h> | 20 | #include <linux/seccomp.h> |
24 | #include <linux/signal.h> | 21 | #include <linux/signal.h> |
25 | #include <linux/workqueue.h> | ||
26 | #include <linux/perf_event.h> | 22 | #include <linux/perf_event.h> |
27 | #include <linux/hw_breakpoint.h> | 23 | #include <linux/hw_breakpoint.h> |
28 | 24 | ||
@@ -36,7 +32,6 @@ | |||
36 | #include <asm/desc.h> | 32 | #include <asm/desc.h> |
37 | #include <asm/prctl.h> | 33 | #include <asm/prctl.h> |
38 | #include <asm/proto.h> | 34 | #include <asm/proto.h> |
39 | #include <asm/ds.h> | ||
40 | #include <asm/hw_breakpoint.h> | 35 | #include <asm/hw_breakpoint.h> |
41 | 36 | ||
42 | #include "tls.h" | 37 | #include "tls.h" |
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, | |||
693 | struct perf_event_attr attr; | 688 | struct perf_event_attr attr; |
694 | 689 | ||
695 | if (!t->ptrace_bps[nr]) { | 690 | if (!t->ptrace_bps[nr]) { |
696 | hw_breakpoint_init(&attr); | 691 | ptrace_breakpoint_init(&attr); |
697 | /* | 692 | /* |
698 | * Put stub len and type to register (reserve) an inactive but | 693 | * Put stub len and type to register (reserve) an inactive but |
699 | * correct bp | 694 | * correct bp |
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target, | |||
789 | 0, IO_BITMAP_BYTES); | 784 | 0, IO_BITMAP_BYTES); |
790 | } | 785 | } |
791 | 786 | ||
792 | #ifdef CONFIG_X86_PTRACE_BTS | ||
793 | /* | ||
794 | * A branch trace store context. | ||
795 | * | ||
796 | * Contexts may only be installed by ptrace_bts_config() and only for | ||
797 | * ptraced tasks. | ||
798 | * | ||
799 | * Contexts are destroyed when the tracee is detached from the tracer. | ||
800 | * The actual destruction work requires interrupts enabled, so the | ||
801 | * work is deferred and will be scheduled during __ptrace_unlink(). | ||
802 | * | ||
803 | * Contexts hold an additional task_struct reference on the traced | ||
804 | * task, as well as a reference on the tracer's mm. | ||
805 | * | ||
806 | * Ptrace already holds a task_struct for the duration of ptrace operations, | ||
807 | * but since destruction is deferred, it may be executed after both | ||
808 | * tracer and tracee exited. | ||
809 | */ | ||
810 | struct bts_context { | ||
811 | /* The branch trace handle. */ | ||
812 | struct bts_tracer *tracer; | ||
813 | |||
814 | /* The buffer used to store the branch trace and its size. */ | ||
815 | void *buffer; | ||
816 | unsigned int size; | ||
817 | |||
818 | /* The mm that paid for the above buffer. */ | ||
819 | struct mm_struct *mm; | ||
820 | |||
821 | /* The task this context belongs to. */ | ||
822 | struct task_struct *task; | ||
823 | |||
824 | /* The signal to send on a bts buffer overflow. */ | ||
825 | unsigned int bts_ovfl_signal; | ||
826 | |||
827 | /* The work struct to destroy a context. */ | ||
828 | struct work_struct work; | ||
829 | }; | ||
830 | |||
831 | static int alloc_bts_buffer(struct bts_context *context, unsigned int size) | ||
832 | { | ||
833 | void *buffer = NULL; | ||
834 | int err = -ENOMEM; | ||
835 | |||
836 | err = account_locked_memory(current->mm, current->signal->rlim, size); | ||
837 | if (err < 0) | ||
838 | return err; | ||
839 | |||
840 | buffer = kzalloc(size, GFP_KERNEL); | ||
841 | if (!buffer) | ||
842 | goto out_refund; | ||
843 | |||
844 | context->buffer = buffer; | ||
845 | context->size = size; | ||
846 | context->mm = get_task_mm(current); | ||
847 | |||
848 | return 0; | ||
849 | |||
850 | out_refund: | ||
851 | refund_locked_memory(current->mm, size); | ||
852 | return err; | ||
853 | } | ||
854 | |||
855 | static inline void free_bts_buffer(struct bts_context *context) | ||
856 | { | ||
857 | if (!context->buffer) | ||
858 | return; | ||
859 | |||
860 | kfree(context->buffer); | ||
861 | context->buffer = NULL; | ||
862 | |||
863 | refund_locked_memory(context->mm, context->size); | ||
864 | context->size = 0; | ||
865 | |||
866 | mmput(context->mm); | ||
867 | context->mm = NULL; | ||
868 | } | ||
869 | |||
870 | static void free_bts_context_work(struct work_struct *w) | ||
871 | { | ||
872 | struct bts_context *context; | ||
873 | |||
874 | context = container_of(w, struct bts_context, work); | ||
875 | |||
876 | ds_release_bts(context->tracer); | ||
877 | put_task_struct(context->task); | ||
878 | free_bts_buffer(context); | ||
879 | kfree(context); | ||
880 | } | ||
881 | |||
882 | static inline void free_bts_context(struct bts_context *context) | ||
883 | { | ||
884 | INIT_WORK(&context->work, free_bts_context_work); | ||
885 | schedule_work(&context->work); | ||
886 | } | ||
887 | |||
888 | static inline struct bts_context *alloc_bts_context(struct task_struct *task) | ||
889 | { | ||
890 | struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); | ||
891 | if (context) { | ||
892 | context->task = task; | ||
893 | task->bts = context; | ||
894 | |||
895 | get_task_struct(task); | ||
896 | } | ||
897 | |||
898 | return context; | ||
899 | } | ||
900 | |||
901 | static int ptrace_bts_read_record(struct task_struct *child, size_t index, | ||
902 | struct bts_struct __user *out) | ||
903 | { | ||
904 | struct bts_context *context; | ||
905 | const struct bts_trace *trace; | ||
906 | struct bts_struct bts; | ||
907 | const unsigned char *at; | ||
908 | int error; | ||
909 | |||
910 | context = child->bts; | ||
911 | if (!context) | ||
912 | return -ESRCH; | ||
913 | |||
914 | trace = ds_read_bts(context->tracer); | ||
915 | if (!trace) | ||
916 | return -ESRCH; | ||
917 | |||
918 | at = trace->ds.top - ((index + 1) * trace->ds.size); | ||
919 | if ((void *)at < trace->ds.begin) | ||
920 | at += (trace->ds.n * trace->ds.size); | ||
921 | |||
922 | if (!trace->read) | ||
923 | return -EOPNOTSUPP; | ||
924 | |||
925 | error = trace->read(context->tracer, at, &bts); | ||
926 | if (error < 0) | ||
927 | return error; | ||
928 | |||
929 | if (copy_to_user(out, &bts, sizeof(bts))) | ||
930 | return -EFAULT; | ||
931 | |||
932 | return sizeof(bts); | ||
933 | } | ||
934 | |||
935 | static int ptrace_bts_drain(struct task_struct *child, | ||
936 | long size, | ||
937 | struct bts_struct __user *out) | ||
938 | { | ||
939 | struct bts_context *context; | ||
940 | const struct bts_trace *trace; | ||
941 | const unsigned char *at; | ||
942 | int error, drained = 0; | ||
943 | |||
944 | context = child->bts; | ||
945 | if (!context) | ||
946 | return -ESRCH; | ||
947 | |||
948 | trace = ds_read_bts(context->tracer); | ||
949 | if (!trace) | ||
950 | return -ESRCH; | ||
951 | |||
952 | if (!trace->read) | ||
953 | return -EOPNOTSUPP; | ||
954 | |||
955 | if (size < (trace->ds.top - trace->ds.begin)) | ||
956 | return -EIO; | ||
957 | |||
958 | for (at = trace->ds.begin; (void *)at < trace->ds.top; | ||
959 | out++, drained++, at += trace->ds.size) { | ||
960 | struct bts_struct bts; | ||
961 | |||
962 | error = trace->read(context->tracer, at, &bts); | ||
963 | if (error < 0) | ||
964 | return error; | ||
965 | |||
966 | if (copy_to_user(out, &bts, sizeof(bts))) | ||
967 | return -EFAULT; | ||
968 | } | ||
969 | |||
970 | memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); | ||
971 | |||
972 | error = ds_reset_bts(context->tracer); | ||
973 | if (error < 0) | ||
974 | return error; | ||
975 | |||
976 | return drained; | ||
977 | } | ||
978 | |||
979 | static int ptrace_bts_config(struct task_struct *child, | ||
980 | long cfg_size, | ||
981 | const struct ptrace_bts_config __user *ucfg) | ||
982 | { | ||
983 | struct bts_context *context; | ||
984 | struct ptrace_bts_config cfg; | ||
985 | unsigned int flags = 0; | ||
986 | |||
987 | if (cfg_size < sizeof(cfg)) | ||
988 | return -EIO; | ||
989 | |||
990 | if (copy_from_user(&cfg, ucfg, sizeof(cfg))) | ||
991 | return -EFAULT; | ||
992 | |||
993 | context = child->bts; | ||
994 | if (!context) | ||
995 | context = alloc_bts_context(child); | ||
996 | if (!context) | ||
997 | return -ENOMEM; | ||
998 | |||
999 | if (cfg.flags & PTRACE_BTS_O_SIGNAL) { | ||
1000 | if (!cfg.signal) | ||
1001 | return -EINVAL; | ||
1002 | |||
1003 | return -EOPNOTSUPP; | ||
1004 | context->bts_ovfl_signal = cfg.signal; | ||
1005 | } | ||
1006 | |||
1007 | ds_release_bts(context->tracer); | ||
1008 | context->tracer = NULL; | ||
1009 | |||
1010 | if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { | ||
1011 | int err; | ||
1012 | |||
1013 | free_bts_buffer(context); | ||
1014 | if (!cfg.size) | ||
1015 | return 0; | ||
1016 | |||
1017 | err = alloc_bts_buffer(context, cfg.size); | ||
1018 | if (err < 0) | ||
1019 | return err; | ||
1020 | } | ||
1021 | |||
1022 | if (cfg.flags & PTRACE_BTS_O_TRACE) | ||
1023 | flags |= BTS_USER; | ||
1024 | |||
1025 | if (cfg.flags & PTRACE_BTS_O_SCHED) | ||
1026 | flags |= BTS_TIMESTAMPS; | ||
1027 | |||
1028 | context->tracer = | ||
1029 | ds_request_bts_task(child, context->buffer, context->size, | ||
1030 | NULL, (size_t)-1, flags); | ||
1031 | if (unlikely(IS_ERR(context->tracer))) { | ||
1032 | int error = PTR_ERR(context->tracer); | ||
1033 | |||
1034 | free_bts_buffer(context); | ||
1035 | context->tracer = NULL; | ||
1036 | return error; | ||
1037 | } | ||
1038 | |||
1039 | return sizeof(cfg); | ||
1040 | } | ||
1041 | |||
1042 | static int ptrace_bts_status(struct task_struct *child, | ||
1043 | long cfg_size, | ||
1044 | struct ptrace_bts_config __user *ucfg) | ||
1045 | { | ||
1046 | struct bts_context *context; | ||
1047 | const struct bts_trace *trace; | ||
1048 | struct ptrace_bts_config cfg; | ||
1049 | |||
1050 | context = child->bts; | ||
1051 | if (!context) | ||
1052 | return -ESRCH; | ||
1053 | |||
1054 | if (cfg_size < sizeof(cfg)) | ||
1055 | return -EIO; | ||
1056 | |||
1057 | trace = ds_read_bts(context->tracer); | ||
1058 | if (!trace) | ||
1059 | return -ESRCH; | ||
1060 | |||
1061 | memset(&cfg, 0, sizeof(cfg)); | ||
1062 | cfg.size = trace->ds.end - trace->ds.begin; | ||
1063 | cfg.signal = context->bts_ovfl_signal; | ||
1064 | cfg.bts_size = sizeof(struct bts_struct); | ||
1065 | |||
1066 | if (cfg.signal) | ||
1067 | cfg.flags |= PTRACE_BTS_O_SIGNAL; | ||
1068 | |||
1069 | if (trace->ds.flags & BTS_USER) | ||
1070 | cfg.flags |= PTRACE_BTS_O_TRACE; | ||
1071 | |||
1072 | if (trace->ds.flags & BTS_TIMESTAMPS) | ||
1073 | cfg.flags |= PTRACE_BTS_O_SCHED; | ||
1074 | |||
1075 | if (copy_to_user(ucfg, &cfg, sizeof(cfg))) | ||
1076 | return -EFAULT; | ||
1077 | |||
1078 | return sizeof(cfg); | ||
1079 | } | ||
1080 | |||
1081 | static int ptrace_bts_clear(struct task_struct *child) | ||
1082 | { | ||
1083 | struct bts_context *context; | ||
1084 | const struct bts_trace *trace; | ||
1085 | |||
1086 | context = child->bts; | ||
1087 | if (!context) | ||
1088 | return -ESRCH; | ||
1089 | |||
1090 | trace = ds_read_bts(context->tracer); | ||
1091 | if (!trace) | ||
1092 | return -ESRCH; | ||
1093 | |||
1094 | memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); | ||
1095 | |||
1096 | return ds_reset_bts(context->tracer); | ||
1097 | } | ||
1098 | |||
1099 | static int ptrace_bts_size(struct task_struct *child) | ||
1100 | { | ||
1101 | struct bts_context *context; | ||
1102 | const struct bts_trace *trace; | ||
1103 | |||
1104 | context = child->bts; | ||
1105 | if (!context) | ||
1106 | return -ESRCH; | ||
1107 | |||
1108 | trace = ds_read_bts(context->tracer); | ||
1109 | if (!trace) | ||
1110 | return -ESRCH; | ||
1111 | |||
1112 | return (trace->ds.top - trace->ds.begin) / trace->ds.size; | ||
1113 | } | ||
1114 | |||
1115 | /* | ||
1116 | * Called from __ptrace_unlink() after the child has been moved back | ||
1117 | * to its original parent. | ||
1118 | */ | ||
1119 | void ptrace_bts_untrace(struct task_struct *child) | ||
1120 | { | ||
1121 | if (unlikely(child->bts)) { | ||
1122 | free_bts_context(child->bts); | ||
1123 | child->bts = NULL; | ||
1124 | } | ||
1125 | } | ||
1126 | #endif /* CONFIG_X86_PTRACE_BTS */ | ||
1127 | |||
1128 | /* | 787 | /* |
1129 | * Called by kernel/ptrace.c when detaching.. | 788 | * Called by kernel/ptrace.c when detaching.. |
1130 | * | 789 | * |
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
1252 | break; | 911 | break; |
1253 | #endif | 912 | #endif |
1254 | 913 | ||
1255 | /* | ||
1256 | * These bits need more cooking - not enabled yet: | ||
1257 | */ | ||
1258 | #ifdef CONFIG_X86_PTRACE_BTS | ||
1259 | case PTRACE_BTS_CONFIG: | ||
1260 | ret = ptrace_bts_config | ||
1261 | (child, data, (struct ptrace_bts_config __user *)addr); | ||
1262 | break; | ||
1263 | |||
1264 | case PTRACE_BTS_STATUS: | ||
1265 | ret = ptrace_bts_status | ||
1266 | (child, data, (struct ptrace_bts_config __user *)addr); | ||
1267 | break; | ||
1268 | |||
1269 | case PTRACE_BTS_SIZE: | ||
1270 | ret = ptrace_bts_size(child); | ||
1271 | break; | ||
1272 | |||
1273 | case PTRACE_BTS_GET: | ||
1274 | ret = ptrace_bts_read_record | ||
1275 | (child, data, (struct bts_struct __user *) addr); | ||
1276 | break; | ||
1277 | |||
1278 | case PTRACE_BTS_CLEAR: | ||
1279 | ret = ptrace_bts_clear(child); | ||
1280 | break; | ||
1281 | |||
1282 | case PTRACE_BTS_DRAIN: | ||
1283 | ret = ptrace_bts_drain | ||
1284 | (child, data, (struct bts_struct __user *) addr); | ||
1285 | break; | ||
1286 | #endif /* CONFIG_X86_PTRACE_BTS */ | ||
1287 | |||
1288 | default: | 914 | default: |
1289 | ret = ptrace_request(child, request, addr, data); | 915 | ret = ptrace_request(child, request, addr, data); |
1290 | break; | 916 | break; |
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, | |||
1544 | 1170 | ||
1545 | case PTRACE_GET_THREAD_AREA: | 1171 | case PTRACE_GET_THREAD_AREA: |
1546 | case PTRACE_SET_THREAD_AREA: | 1172 | case PTRACE_SET_THREAD_AREA: |
1547 | #ifdef CONFIG_X86_PTRACE_BTS | ||
1548 | case PTRACE_BTS_CONFIG: | ||
1549 | case PTRACE_BTS_STATUS: | ||
1550 | case PTRACE_BTS_SIZE: | ||
1551 | case PTRACE_BTS_GET: | ||
1552 | case PTRACE_BTS_CLEAR: | ||
1553 | case PTRACE_BTS_DRAIN: | ||
1554 | #endif /* CONFIG_X86_PTRACE_BTS */ | ||
1555 | return arch_ptrace(child, request, addr, data); | 1173 | return arch_ptrace(child, request, addr, data); |
1556 | 1174 | ||
1557 | default: | 1175 | default: |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 03801f2f761f..239427ca02af 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -31,8 +31,16 @@ struct pvclock_shadow_time { | |||
31 | u32 tsc_to_nsec_mul; | 31 | u32 tsc_to_nsec_mul; |
32 | int tsc_shift; | 32 | int tsc_shift; |
33 | u32 version; | 33 | u32 version; |
34 | u8 flags; | ||
34 | }; | 35 | }; |
35 | 36 | ||
37 | static u8 valid_flags __read_mostly = 0; | ||
38 | |||
39 | void pvclock_set_flags(u8 flags) | ||
40 | { | ||
41 | valid_flags = flags; | ||
42 | } | ||
43 | |||
36 | /* | 44 | /* |
37 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | 45 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, |
38 | * yielding a 64-bit result. | 46 | * yielding a 64-bit result. |
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, | |||
91 | dst->system_timestamp = src->system_time; | 99 | dst->system_timestamp = src->system_time; |
92 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; | 100 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; |
93 | dst->tsc_shift = src->tsc_shift; | 101 | dst->tsc_shift = src->tsc_shift; |
102 | dst->flags = src->flags; | ||
94 | rmb(); /* test version after fetching data */ | 103 | rmb(); /* test version after fetching data */ |
95 | } while ((src->version & 1) || (dst->version != src->version)); | 104 | } while ((src->version & 1) || (dst->version != src->version)); |
96 | 105 | ||
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) | |||
109 | return pv_tsc_khz; | 118 | return pv_tsc_khz; |
110 | } | 119 | } |
111 | 120 | ||
121 | static atomic64_t last_value = ATOMIC64_INIT(0); | ||
122 | |||
112 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | 123 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) |
113 | { | 124 | { |
114 | struct pvclock_shadow_time shadow; | 125 | struct pvclock_shadow_time shadow; |
115 | unsigned version; | 126 | unsigned version; |
116 | cycle_t ret, offset; | 127 | cycle_t ret, offset; |
128 | u64 last; | ||
117 | 129 | ||
118 | do { | 130 | do { |
119 | version = pvclock_get_time_values(&shadow, src); | 131 | version = pvclock_get_time_values(&shadow, src); |
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | |||
123 | barrier(); | 135 | barrier(); |
124 | } while (version != src->version); | 136 | } while (version != src->version); |
125 | 137 | ||
138 | if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && | ||
139 | (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) | ||
140 | return ret; | ||
141 | |||
142 | /* | ||
143 | * Assumption here is that last_value, a global accumulator, always goes | ||
144 | * forward. If we are less than that, we should not be much smaller. | ||
145 | * We assume there is an error margin we're inside, and then the correction | ||
146 | * does not sacrifice accuracy. | ||
147 | * | ||
148 | * For reads: global may have changed between test and return, | ||
149 | * but this means someone else updated the clock at a later time. | ||
150 | * We just need to make sure we are not seeing a backwards event. | ||
151 | * | ||
152 | * For updates: last_value = ret is not enough, since two vcpus could be | ||
153 | * updating at the same time, and one of them could be slightly behind, | ||
154 | * making the assumption that last_value always goes forward fail to hold. | ||
155 | */ | ||
156 | last = atomic64_read(&last_value); | ||
157 | do { | ||
158 | if (ret < last) | ||
159 | return last; | ||
160 | last = atomic64_cmpxchg(&last_value, last, ret); | ||
161 | } while (unlikely(last != ret)); | ||
162 | |||
126 | return ret; | 163 | return ret; |
127 | } | 164 | } |
128 | 165 | ||
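The last_value logic above is a lock-free monotonic clamp: each reader publishes its timestamp with a compare-and-swap and falls back to the global value whenever another vcpu has already returned a later time. A minimal user-space sketch of the same pattern, assuming C11 atomics in place of the kernel's atomic64_t (the function name is illustrative):

	#include <stdatomic.h>
	#include <stdint.h>

	static _Atomic uint64_t last_value;

	/* Clamp 'raw' so that successive calls never go backwards. */
	uint64_t monotonic_read(uint64_t raw)
	{
		uint64_t last = atomic_load(&last_value);

		do {
			/* Someone already returned a later time: use it. */
			if (raw < last)
				return last;
			/* On CAS failure, 'last' is refreshed with the
			 * current global value and we re-test. */
		} while (!atomic_compare_exchange_weak(&last_value, &last, raw));

		return raw;
	}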
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 12e9feaa2f7a..939b9e98245f 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -495,6 +495,9 @@ void force_hpet_resume(void) | |||
495 | /* | 495 | /* |
496 | * HPET MSI on some boards (ATI SB700/SB800) has side effect on | 496 | * HPET MSI on some boards (ATI SB700/SB800) has side effect on |
497 | * floppy DMA. Disable HPET MSI on such platforms. | 497 | * floppy DMA. Disable HPET MSI on such platforms. |
498 | * See erratum #27 (Misinterpreted MSI Requests May Result in | ||
499 | * Corrupted LPC DMA Data) in AMD Publication #46837, | ||
500 | * "SB700 Family Product Errata", Rev. 1.0, March 2010. | ||
498 | */ | 501 | */ |
499 | static void force_disable_hpet_msi(struct pci_dev *unused) | 502 | static void force_disable_hpet_msi(struct pci_dev *unused) |
500 | { | 503 | { |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 8e1aac86b50c..e3af342fe83a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -228,6 +228,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
228 | DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), | 228 | DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), |
229 | }, | 229 | }, |
230 | }, | 230 | }, |
231 | { /* Handle problems with rebooting on Dell T7400's */ | ||
232 | .callback = set_bios_reboot, | ||
233 | .ident = "Dell Precision T7400", | ||
234 | .matches = { | ||
235 | DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), | ||
236 | DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), | ||
237 | }, | ||
238 | }, | ||
231 | { /* Handle problems with rebooting on HP laptops */ | 239 | { /* Handle problems with rebooting on HP laptops */ |
232 | .callback = set_bios_reboot, | 240 | .callback = set_bios_reboot, |
233 | .ident = "HP Compaq Laptop", | 241 | .ident = "HP Compaq Laptop", |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4851eff57b3..b008e7883207 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -102,6 +102,7 @@ | |||
102 | 102 | ||
103 | #include <asm/paravirt.h> | 103 | #include <asm/paravirt.h> |
104 | #include <asm/hypervisor.h> | 104 | #include <asm/hypervisor.h> |
105 | #include <asm/olpc_ofw.h> | ||
105 | 106 | ||
106 | #include <asm/percpu.h> | 107 | #include <asm/percpu.h> |
107 | #include <asm/topology.h> | 108 | #include <asm/topology.h> |
@@ -676,6 +677,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { | |||
676 | DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), | 677 | DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), |
677 | }, | 678 | }, |
678 | }, | 679 | }, |
680 | /* | ||
681 | * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so | ||
682 | * match on the product name. | ||
683 | */ | ||
684 | { | ||
685 | .callback = dmi_low_memory_corruption, | ||
686 | .ident = "Phoenix BIOS", | ||
687 | .matches = { | ||
688 | DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), | ||
689 | }, | ||
690 | }, | ||
679 | #endif | 691 | #endif |
680 | {} | 692 | {} |
681 | }; | 693 | }; |
@@ -725,9 +737,15 @@ void __init setup_arch(char **cmdline_p) | |||
725 | /* VMI may relocate the fixmap; do this before touching ioremap area */ | 737 | /* VMI may relocate the fixmap; do this before touching ioremap area */ |
726 | vmi_init(); | 738 | vmi_init(); |
727 | 739 | ||
740 | /* OFW also may relocate the fixmap */ | ||
741 | olpc_ofw_detect(); | ||
742 | |||
743 | early_trap_init(); | ||
728 | early_cpu_init(); | 744 | early_cpu_init(); |
729 | early_ioremap_init(); | 745 | early_ioremap_init(); |
730 | 746 | ||
747 | setup_olpc_ofw_pgd(); | ||
748 | |||
731 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); | 749 | ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); |
732 | screen_info = boot_params.screen_info; | 750 | screen_info = boot_params.screen_info; |
733 | edid_info = boot_params.edid_info; | 751 | edid_info = boot_params.edid_info; |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef6370b00e70..a60df9ae6454 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -21,12 +21,6 @@ | |||
21 | #include <asm/cpu.h> | 21 | #include <asm/cpu.h> |
22 | #include <asm/stackprotector.h> | 22 | #include <asm/stackprotector.h> |
23 | 23 | ||
24 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
25 | # define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) | ||
26 | #else | ||
27 | # define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) | ||
28 | #endif | ||
29 | |||
30 | DEFINE_PER_CPU(int, cpu_number); | 24 | DEFINE_PER_CPU(int, cpu_number); |
31 | EXPORT_PER_CPU_SYMBOL(cpu_number); | 25 | EXPORT_PER_CPU_SYMBOL(cpu_number); |
32 | 26 | ||
@@ -244,10 +238,19 @@ void __init setup_per_cpu_areas(void) | |||
244 | #ifdef CONFIG_NUMA | 238 | #ifdef CONFIG_NUMA |
245 | per_cpu(x86_cpu_to_node_map, cpu) = | 239 | per_cpu(x86_cpu_to_node_map, cpu) = |
246 | early_per_cpu_map(x86_cpu_to_node_map, cpu); | 240 | early_per_cpu_map(x86_cpu_to_node_map, cpu); |
241 | /* | ||
242 | * Ensure that the boot cpu numa_node is correct when the boot | ||
243 | * cpu is on a node that doesn't have memory installed. | ||
244 | * Also cpu_up() will call cpu_to_node() for APs when | ||
245 | * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set | ||
246 | * up later by c_init (i.e., intel_init/amd_init). | ||
247 | * So set them all (boot cpu and all APs). | ||
248 | */ | ||
249 | set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); | ||
247 | #endif | 250 | #endif |
248 | #endif | 251 | #endif |
249 | /* | 252 | /* |
250 | * Up to this point, the boot CPU has been using .data.init | 253 | * Up to this point, the boot CPU has been using .init.data |
251 | * area. Reload any changed state for the boot CPU. | 254 | * area. Reload any changed state for the boot CPU. |
252 | */ | 255 | */ |
253 | if (cpu == boot_cpu_id) | 256 | if (cpu == boot_cpu_id) |
@@ -263,14 +266,6 @@ void __init setup_per_cpu_areas(void) | |||
263 | early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; | 266 | early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; |
264 | #endif | 267 | #endif |
265 | 268 | ||
266 | #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) | ||
267 | /* | ||
268 | * make sure boot cpu node_number is right, when boot cpu is on the | ||
269 | * node that doesn't have mem installed | ||
270 | */ | ||
271 | per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); | ||
272 | #endif | ||
273 | |||
274 | /* Setup node to cpumask map */ | 269 | /* Setup node to cpumask map */ |
275 | setup_node_to_cpumask_map(); | 270 | setup_node_to_cpumask_map(); |
276 | 271 | ||
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c index 34e099382651..cb22acf3ed09 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/kernel/sfi.c | |||
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) | |||
81 | #endif /* CONFIG_X86_LOCAL_APIC */ | 81 | #endif /* CONFIG_X86_LOCAL_APIC */ |
82 | 82 | ||
83 | #ifdef CONFIG_X86_IO_APIC | 83 | #ifdef CONFIG_X86_IO_APIC |
84 | static u32 gsi_base; | ||
85 | 84 | ||
86 | static int __init sfi_parse_ioapic(struct sfi_table_header *table) | 85 | static int __init sfi_parse_ioapic(struct sfi_table_header *table) |
87 | { | 86 | { |
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) | |||
94 | pentry = (struct sfi_apic_table_entry *)sb->pentry; | 93 | pentry = (struct sfi_apic_table_entry *)sb->pentry; |
95 | 94 | ||
96 | for (i = 0; i < num; i++) { | 95 | for (i = 0; i < num; i++) { |
97 | mp_register_ioapic(i, pentry->phys_addr, gsi_base); | 96 | mp_register_ioapic(i, pentry->phys_addr, gsi_top); |
98 | gsi_base += io_apic_get_redir_entries(i); | ||
99 | pentry++; | 97 | pentry++; |
100 | } | 98 | } |
101 | 99 | ||
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 763d815e27a0..a5e928b0cb5f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) | |||
686 | static void __cpuinit announce_cpu(int cpu, int apicid) | 686 | static void __cpuinit announce_cpu(int cpu, int apicid) |
687 | { | 687 | { |
688 | static int current_node = -1; | 688 | static int current_node = -1; |
689 | int node = cpu_to_node(cpu); | 689 | int node = early_cpu_to_node(cpu); |
690 | 690 | ||
691 | if (system_state == SYSTEM_BOOTING) { | 691 | if (system_state == SYSTEM_BOOTING) { |
692 | if (node != current_node) { | 692 | if (node != current_node) { |
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) | |||
735 | goto do_rest; | 735 | goto do_rest; |
736 | } | 736 | } |
737 | 737 | ||
738 | if (!keventd_up() || current_is_keventd()) | 738 | schedule_work(&c_idle.work); |
739 | c_idle.work.func(&c_idle.work); | 739 | wait_for_completion(&c_idle.done); |
740 | else { | ||
741 | schedule_work(&c_idle.work); | ||
742 | wait_for_completion(&c_idle.done); | ||
743 | } | ||
744 | 740 | ||
745 | if (IS_ERR(c_idle.idle)) { | 741 | if (IS_ERR(c_idle.idle)) { |
746 | printk("failed fork for CPU %d\n", cpu); | 742 | printk("failed fork for CPU %d\n", cpu); |
@@ -816,6 +812,13 @@ do_rest: | |||
816 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) | 812 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) |
817 | break; /* It has booted */ | 813 | break; /* It has booted */ |
818 | udelay(100); | 814 | udelay(100); |
815 | /* | ||
816 | * Allow other tasks to run while we wait for the | ||
817 | * AP to come online. This also gives a chance | ||
818 | * for the MTRR work (triggered by the AP coming online) | ||
819 | * to be completed in the stop machine context. | ||
820 | */ | ||
821 | schedule(); | ||
819 | } | 822 | } |
820 | 823 | ||
821 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) | 824 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) |
@@ -1215,9 +1218,17 @@ __init void prefill_possible_map(void) | |||
1215 | if (!num_processors) | 1218 | if (!num_processors) |
1216 | num_processors = 1; | 1219 | num_processors = 1; |
1217 | 1220 | ||
1218 | if (setup_possible_cpus == -1) | 1221 | i = setup_max_cpus ?: 1; |
1219 | possible = num_processors + disabled_cpus; | 1222 | if (setup_possible_cpus == -1) { |
1220 | else | 1223 | possible = num_processors; |
1224 | #ifdef CONFIG_HOTPLUG_CPU | ||
1225 | if (setup_max_cpus) | ||
1226 | possible += disabled_cpus; | ||
1227 | #else | ||
1228 | if (possible > i) | ||
1229 | possible = i; | ||
1230 | #endif | ||
1231 | } else | ||
1221 | possible = setup_possible_cpus; | 1232 | possible = setup_possible_cpus; |
1222 | 1233 | ||
1223 | total_cpus = max_t(int, possible, num_processors + disabled_cpus); | 1234 | total_cpus = max_t(int, possible, num_processors + disabled_cpus); |
@@ -1230,11 +1241,23 @@ __init void prefill_possible_map(void) | |||
1230 | possible = nr_cpu_ids; | 1241 | possible = nr_cpu_ids; |
1231 | } | 1242 | } |
1232 | 1243 | ||
1244 | #ifdef CONFIG_HOTPLUG_CPU | ||
1245 | if (!setup_max_cpus) | ||
1246 | #endif | ||
1247 | if (possible > i) { | ||
1248 | printk(KERN_WARNING | ||
1249 | "%d Processors exceeds max_cpus limit of %u\n", | ||
1250 | possible, setup_max_cpus); | ||
1251 | possible = i; | ||
1252 | } | ||
1253 | |||
1233 | printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", | 1254 | printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", |
1234 | possible, max_t(int, possible - num_processors, 0)); | 1255 | possible, max_t(int, possible - num_processors, 0)); |
1235 | 1256 | ||
1236 | for (i = 0; i < possible; i++) | 1257 | for (i = 0; i < possible; i++) |
1237 | set_cpu_possible(i, true); | 1258 | set_cpu_possible(i, true); |
1259 | for (; i < NR_CPUS; i++) | ||
1260 | set_cpu_possible(i, false); | ||
1238 | 1261 | ||
1239 | nr_cpu_ids = possible; | 1262 | nr_cpu_ids = possible; |
1240 | } | 1263 | } |
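Condensed, the new prefill_possible_map() policy above reads as follows. This is a hedged restatement of the hunk with the CONFIG_HOTPLUG_CPU ifdefs folded into a parameter; all names are hypothetical and the nr_cpu_ids clamp is omitted:

	/* possible = BIOS-reported cpus, optionally plus hotpluggable
	 * ones, never exceeding the maxcpus= limit (or 1 for maxcpus=0) */
	int compute_possible(int num_processors, int disabled_cpus,
			     int setup_possible_cpus, int setup_max_cpus,
			     int hotplug_cpu)
	{
		int limit = setup_max_cpus ? setup_max_cpus : 1;
		int possible;

		if (setup_possible_cpus == -1) {
			possible = num_processors;
			if (hotplug_cpu) {
				if (setup_max_cpus)
					possible += disabled_cpus;
			} else if (possible > limit) {
				possible = limit;
			}
		} else {
			possible = setup_possible_cpus;
		}

		/* the clamp is skipped only on hotplug kernels booted
		 * with a nonzero maxcpus= */
		if ((!hotplug_cpu || !setup_max_cpus) && possible > limit)
			possible = limit;

		return possible;
	}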
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6c..b53c525368a7 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name) | |||
23 | return 0; | 23 | return 0; |
24 | } | 24 | } |
25 | 25 | ||
26 | static void save_stack_address(void *data, unsigned long addr, int reliable) | 26 | static void |
27 | __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) | ||
27 | { | 28 | { |
28 | struct stack_trace *trace = data; | 29 | struct stack_trace *trace = data; |
30 | #ifdef CONFIG_FRAME_POINTER | ||
29 | if (!reliable) | 31 | if (!reliable) |
30 | return; | 32 | return; |
33 | #endif | ||
34 | if (nosched && in_sched_functions(addr)) | ||
35 | return; | ||
31 | if (trace->skip > 0) { | 36 | if (trace->skip > 0) { |
32 | trace->skip--; | 37 | trace->skip--; |
33 | return; | 38 | return; |
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) | |||
36 | trace->entries[trace->nr_entries++] = addr; | 41 | trace->entries[trace->nr_entries++] = addr; |
37 | } | 42 | } |
38 | 43 | ||
44 | static void save_stack_address(void *data, unsigned long addr, int reliable) | ||
45 | { | ||
46 | return __save_stack_address(data, addr, reliable, false); | ||
47 | } | ||
48 | |||
39 | static void | 49 | static void |
40 | save_stack_address_nosched(void *data, unsigned long addr, int reliable) | 50 | save_stack_address_nosched(void *data, unsigned long addr, int reliable) |
41 | { | 51 | { |
42 | struct stack_trace *trace = (struct stack_trace *)data; | 52 | return __save_stack_address(data, addr, reliable, true); |
43 | if (!reliable) | ||
44 | return; | ||
45 | if (in_sched_functions(addr)) | ||
46 | return; | ||
47 | if (trace->skip > 0) { | ||
48 | trace->skip--; | ||
49 | return; | ||
50 | } | ||
51 | if (trace->nr_entries < trace->max_entries) | ||
52 | trace->entries[trace->nr_entries++] = addr; | ||
53 | } | 53 | } |
54 | 54 | ||
55 | static const struct stacktrace_ops save_stack_ops = { | 55 | static const struct stacktrace_ops save_stack_ops = { |
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); | |||
96 | 96 | ||
97 | /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ | 97 | /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ |
98 | 98 | ||
99 | struct stack_frame { | 99 | struct stack_frame_user { |
100 | const void __user *next_fp; | 100 | const void __user *next_fp; |
101 | unsigned long ret_addr; | 101 | unsigned long ret_addr; |
102 | }; | 102 | }; |
103 | 103 | ||
104 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | 104 | static int |
105 | copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) | ||
105 | { | 106 | { |
106 | int ret; | 107 | int ret; |
107 | 108 | ||
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) | |||
126 | trace->entries[trace->nr_entries++] = regs->ip; | 127 | trace->entries[trace->nr_entries++] = regs->ip; |
127 | 128 | ||
128 | while (trace->nr_entries < trace->max_entries) { | 129 | while (trace->nr_entries < trace->max_entries) { |
129 | struct stack_frame frame; | 130 | struct stack_frame_user frame; |
130 | 131 | ||
131 | frame.next_fp = NULL; | 132 | frame.next_fp = NULL; |
132 | frame.ret_addr = 0; | 133 | frame.ret_addr = 0; |
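For context, the stack_frame_user walk driving __save_stack_trace_user() follows the classic frame-pointer chain: each frame holds the saved frame pointer and the return address above it. A user-space sketch of the same walk over the current process's own frames; it assumes -fno-omit-frame-pointer and drops the copy_from_user() fault handling the kernel version needs:

	#include <stdio.h>

	struct frame {
		const struct frame *next_fp;	/* saved frame pointer */
		unsigned long ret_addr;		/* return address above it */
	};

	static void walk_stack(void)
	{
		const struct frame *fp = __builtin_frame_address(0);
		int depth = 0;

		while (fp && depth < 16) {
			if (!fp->ret_addr)
				break;
			printf("#%d %#lx\n", depth++, fp->ret_addr);
			if (fp->next_fp <= fp)	/* frames must grow upward */
				break;
			fp = fp->next_fp;
		}
	}

	int main(void)
	{
		walk_stack();
		return 0;
	}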
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 3149032ff107..58de45ee08b6 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c | |||
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child) | |||
158 | } | 158 | } |
159 | 159 | ||
160 | /* | 160 | /* |
161 | * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running. | ||
162 | */ | ||
163 | static void write_debugctlmsr(struct task_struct *child, unsigned long val) | ||
164 | { | ||
165 | if (child->thread.debugctlmsr == val) | ||
166 | return; | ||
167 | |||
168 | child->thread.debugctlmsr = val; | ||
169 | |||
170 | if (child != current) | ||
171 | return; | ||
172 | |||
173 | update_debugctlmsr(val); | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * Enable single or block step. | 161 | * Enable single or block step. |
178 | */ | 162 | */ |
179 | static void enable_step(struct task_struct *child, bool block) | 163 | static void enable_step(struct task_struct *child, bool block) |
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block) | |||
186 | * that uses user-mode single stepping itself. | 170 | * that uses user-mode single stepping itself. |
187 | */ | 171 | */ |
188 | if (enable_single_step(child) && block) { | 172 | if (enable_single_step(child) && block) { |
189 | set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); | 173 | unsigned long debugctl = get_debugctlmsr(); |
190 | write_debugctlmsr(child, | 174 | |
191 | child->thread.debugctlmsr | DEBUGCTLMSR_BTF); | 175 | debugctl |= DEBUGCTLMSR_BTF; |
192 | } else { | 176 | update_debugctlmsr(debugctl); |
193 | write_debugctlmsr(child, | 177 | set_tsk_thread_flag(child, TIF_BLOCKSTEP); |
194 | child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); | 178 | } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { |
195 | 179 | unsigned long debugctl = get_debugctlmsr(); | |
196 | if (!child->thread.debugctlmsr) | 180 | |
197 | clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); | 181 | debugctl &= ~DEBUGCTLMSR_BTF; |
182 | update_debugctlmsr(debugctl); | ||
183 | clear_tsk_thread_flag(child, TIF_BLOCKSTEP); | ||
198 | } | 184 | } |
199 | } | 185 | } |
200 | 186 | ||
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child) | |||
213 | /* | 199 | /* |
214 | * Make sure block stepping (BTF) is disabled. | 200 | * Make sure block stepping (BTF) is disabled. |
215 | */ | 201 | */ |
216 | write_debugctlmsr(child, | 202 | if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { |
217 | child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); | 203 | unsigned long debugctl = get_debugctlmsr(); |
218 | 204 | ||
219 | if (!child->thread.debugctlmsr) | 205 | debugctl &= ~DEBUGCTLMSR_BTF; |
220 | clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); | 206 | update_debugctlmsr(debugctl); |
207 | clear_tsk_thread_flag(child, TIF_BLOCKSTEP); | ||
208 | } | ||
221 | 209 | ||
222 | /* Always clear TIF_SINGLESTEP... */ | 210 | /* Always clear TIF_SINGLESTEP... */ |
223 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | 211 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); |
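The TIF_BLOCKSTEP conversion above services the PTRACE_SINGLEBLOCK request, which resumes the tracee until the next taken branch by setting DEBUGCTLMSR_BTF. A rough tracer sketch exercising it; the fallback define is an assumption for libc headers that do not expose the x86-specific request:

	#include <sys/ptrace.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#ifndef PTRACE_SINGLEBLOCK
	#define PTRACE_SINGLEBLOCK 33	/* x86 value from asm/ptrace-abi.h */
	#endif

	int main(void)
	{
		pid_t pid = fork();

		if (pid == 0) {
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			execl("/bin/true", "true", (char *)NULL);
			_exit(1);
		}
		waitpid(pid, NULL, 0);		/* stopped at exec */

		/* run to the next branch; each stop sets BTF again */
		while (ptrace(PTRACE_SINGLEBLOCK, pid, NULL, NULL) == 0) {
			int status;

			if (waitpid(pid, &status, 0) < 0 || WIFEXITED(status))
				break;
		}
		return 0;
	}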
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..b35786dc9b8f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -337,3 +337,6 @@ ENTRY(sys_call_table) | |||
337 | .long sys_rt_tgsigqueueinfo /* 335 */ | 337 | .long sys_rt_tgsigqueueinfo /* 335 */ |
338 | .long sys_perf_event_open | 338 | .long sys_perf_event_open |
339 | .long sys_recvmmsg | 339 | .long sys_recvmmsg |
340 | .long sys_fanotify_init | ||
341 | .long sys_fanotify_mark | ||
342 | .long sys_prlimit64 /* 340 */ | ||
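The new entries wire up fanotify and prlimit64 for 32-bit x86; slot 340 can be exercised directly via syscall(2). A hedged example, with struct rlimit64 mirrored locally to avoid mixing kernel and libc resource headers (the __NR_prlimit64 fallback assumes the table above):

	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdint.h>
	#include <stdio.h>

	#ifndef __NR_prlimit64
	#define __NR_prlimit64 340	/* 32-bit slot added above */
	#endif

	struct rlimit64 {		/* mirrors the kernel definition */
		uint64_t rlim_cur;
		uint64_t rlim_max;
	};

	int main(void)
	{
		struct rlimit64 lim = { 1 << 20, 1 << 20 };
		struct rlimit64 old;

		/* pid 0 == self; resource 1 == RLIMIT_FSIZE */
		if (syscall(__NR_prlimit64, 0, 1, &lim, &old) != 0) {
			perror("prlimit64");
			return 1;
		}
		printf("old RLIMIT_FSIZE: cur=%llu max=%llu\n",
		       (unsigned long long)old.rlim_cur,
		       (unsigned long long)old.rlim_max);
		return 0;
	}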
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 86c9f91b48ae..c2f1b26141e2 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c | |||
@@ -46,6 +46,7 @@ | |||
46 | 46 | ||
47 | /* Global pointer to shared data; NULL means no measured launch. */ | 47 | /* Global pointer to shared data; NULL means no measured launch. */ |
48 | struct tboot *tboot __read_mostly; | 48 | struct tboot *tboot __read_mostly; |
49 | EXPORT_SYMBOL(tboot); | ||
49 | 50 | ||
50 | /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ | 51 | /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ |
51 | #define AP_WAIT_TIMEOUT 1 | 52 | #define AP_WAIT_TIMEOUT 1 |
@@ -175,6 +176,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size) | |||
175 | struct tboot_mac_region *mr; | 176 | struct tboot_mac_region *mr; |
176 | phys_addr_t end = start + size; | 177 | phys_addr_t end = start + size; |
177 | 178 | ||
179 | if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS) | ||
180 | panic("tboot: Too many MAC regions\n"); | ||
181 | |||
178 | if (start && size) { | 182 | if (start && size) { |
179 | mr = &tboot->mac_regions[tboot->num_mac_regions++]; | 183 | mr = &tboot->mac_regions[tboot->num_mac_regions++]; |
180 | mr->start = round_down(start, PAGE_SIZE); | 184 | mr->start = round_down(start, PAGE_SIZE); |
@@ -184,18 +188,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size) | |||
184 | 188 | ||
185 | static int tboot_setup_sleep(void) | 189 | static int tboot_setup_sleep(void) |
186 | { | 190 | { |
191 | int i; | ||
192 | |||
187 | tboot->num_mac_regions = 0; | 193 | tboot->num_mac_regions = 0; |
188 | 194 | ||
189 | /* S3 resume code */ | 195 | for (i = 0; i < e820.nr_map; i++) { |
190 | add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); | 196 | if ((e820.map[i].type != E820_RAM) |
197 | && (e820.map[i].type != E820_RESERVED_KERN)) | ||
198 | continue; | ||
191 | 199 | ||
192 | #ifdef CONFIG_X86_TRAMPOLINE | 200 | add_mac_region(e820.map[i].addr, e820.map[i].size); |
193 | /* AP trampoline code */ | 201 | } |
194 | add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); | ||
195 | #endif | ||
196 | |||
197 | /* kernel code + data + bss */ | ||
198 | add_mac_region(virt_to_phys(_text), _end - _text); | ||
199 | 202 | ||
200 | tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; | 203 | tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; |
201 | 204 | ||
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 17b03dd3a6b5..7fea555929e2 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * SGI UltraViolet TLB flush routines. | 2 | * SGI UltraViolet TLB flush routines. |
3 | * | 3 | * |
4 | * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. | 4 | * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. |
5 | * | 5 | * |
6 | * This code is released under the GNU General Public License version 2 or | 6 | * This code is released under the GNU General Public License version 2 or |
7 | * later. | 7 | * later. |
@@ -20,42 +20,67 @@ | |||
20 | #include <asm/idle.h> | 20 | #include <asm/idle.h> |
21 | #include <asm/tsc.h> | 21 | #include <asm/tsc.h> |
22 | #include <asm/irq_vectors.h> | 22 | #include <asm/irq_vectors.h> |
23 | #include <asm/timer.h> | ||
23 | 24 | ||
24 | static struct bau_control **uv_bau_table_bases __read_mostly; | 25 | struct msg_desc { |
25 | static int uv_bau_retry_limit __read_mostly; | 26 | struct bau_payload_queue_entry *msg; |
27 | int msg_slot; | ||
28 | int sw_ack_slot; | ||
29 | struct bau_payload_queue_entry *va_queue_first; | ||
30 | struct bau_payload_queue_entry *va_queue_last; | ||
31 | }; | ||
26 | 32 | ||
27 | /* base pnode in this partition */ | 33 | #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL |
28 | static int uv_partition_base_pnode __read_mostly; | 34 | |
35 | static int uv_bau_max_concurrent __read_mostly; | ||
36 | |||
37 | static int nobau; | ||
38 | static int __init setup_nobau(char *arg) | ||
39 | { | ||
40 | nobau = 1; | ||
41 | return 0; | ||
42 | } | ||
43 | early_param("nobau", setup_nobau); | ||
29 | 44 | ||
30 | static unsigned long uv_mmask __read_mostly; | 45 | /* base pnode in this partition */ |
46 | static int uv_partition_base_pnode __read_mostly; | ||
47 | /* position of pnode (which is nasid>>1): */ | ||
48 | static int uv_nshift __read_mostly; | ||
49 | static unsigned long uv_mmask __read_mostly; | ||
31 | 50 | ||
32 | static DEFINE_PER_CPU(struct ptc_stats, ptcstats); | 51 | static DEFINE_PER_CPU(struct ptc_stats, ptcstats); |
33 | static DEFINE_PER_CPU(struct bau_control, bau_control); | 52 | static DEFINE_PER_CPU(struct bau_control, bau_control); |
53 | static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); | ||
54 | |||
55 | struct reset_args { | ||
56 | int sender; | ||
57 | }; | ||
34 | 58 | ||
35 | /* | 59 | /* |
36 | * Determine the first node on a blade. | 60 | * Determine the first node on a uvhub. 'Nodes' are used for kernel |
61 | * memory allocation. | ||
37 | */ | 62 | */ |
38 | static int __init blade_to_first_node(int blade) | 63 | static int __init uvhub_to_first_node(int uvhub) |
39 | { | 64 | { |
40 | int node, b; | 65 | int node, b; |
41 | 66 | ||
42 | for_each_online_node(node) { | 67 | for_each_online_node(node) { |
43 | b = uv_node_to_blade_id(node); | 68 | b = uv_node_to_blade_id(node); |
44 | if (blade == b) | 69 | if (uvhub == b) |
45 | return node; | 70 | return node; |
46 | } | 71 | } |
47 | return -1; /* shouldn't happen */ | 72 | return -1; |
48 | } | 73 | } |
49 | 74 | ||
50 | /* | 75 | /* |
51 | * Determine the apicid of the first cpu on a blade. | 76 | * Determine the apicid of the first cpu on a uvhub. |
52 | */ | 77 | */ |
53 | static int __init blade_to_first_apicid(int blade) | 78 | static int __init uvhub_to_first_apicid(int uvhub) |
54 | { | 79 | { |
55 | int cpu; | 80 | int cpu; |
56 | 81 | ||
57 | for_each_present_cpu(cpu) | 82 | for_each_present_cpu(cpu) |
58 | if (blade == uv_cpu_to_blade_id(cpu)) | 83 | if (uvhub == uv_cpu_to_blade_id(cpu)) |
59 | return per_cpu(x86_cpu_to_apicid, cpu); | 84 | return per_cpu(x86_cpu_to_apicid, cpu); |
60 | return -1; | 85 | return -1; |
61 | } | 86 | } |
@@ -68,195 +93,459 @@ static int __init blade_to_first_apicid(int blade) | |||
68 | * clear of the Timeout bit (as well) will free the resource. No reply will | 93 | * clear of the Timeout bit (as well) will free the resource. No reply will |
69 | * be sent (the hardware will only do one reply per message). | 94 | * be sent (the hardware will only do one reply per message). |
70 | */ | 95 | */ |
71 | static void uv_reply_to_message(int resource, | 96 | static inline void uv_reply_to_message(struct msg_desc *mdp, |
72 | struct bau_payload_queue_entry *msg, | 97 | struct bau_control *bcp) |
73 | struct bau_msg_status *msp) | ||
74 | { | 98 | { |
75 | unsigned long dw; | 99 | unsigned long dw; |
100 | struct bau_payload_queue_entry *msg; | ||
76 | 101 | ||
77 | dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); | 102 | msg = mdp->msg; |
103 | if (!msg->canceled) { | ||
104 | dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | | ||
105 | msg->sw_ack_vector; | ||
106 | uv_write_local_mmr( | ||
107 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); | ||
108 | } | ||
78 | msg->replied_to = 1; | 109 | msg->replied_to = 1; |
79 | msg->sw_ack_vector = 0; | 110 | msg->sw_ack_vector = 0; |
80 | if (msp) | ||
81 | msp->seen_by.bits = 0; | ||
82 | uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); | ||
83 | } | 111 | } |
84 | 112 | ||
85 | /* | 113 | /* |
86 | * Do all the things a cpu should do for a TLB shootdown message. | 114 | * Process the receipt of a RETRY message |
87 | * Other cpu's may come here at the same time for this message. | ||
88 | */ | 115 | */ |
89 | static void uv_bau_process_message(struct bau_payload_queue_entry *msg, | 116 | static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, |
90 | int msg_slot, int sw_ack_slot) | 117 | struct bau_control *bcp) |
91 | { | 118 | { |
92 | unsigned long this_cpu_mask; | 119 | int i; |
93 | struct bau_msg_status *msp; | 120 | int cancel_count = 0; |
94 | int cpu; | 121 | int slot2; |
122 | unsigned long msg_res; | ||
123 | unsigned long mmr = 0; | ||
124 | struct bau_payload_queue_entry *msg; | ||
125 | struct bau_payload_queue_entry *msg2; | ||
126 | struct ptc_stats *stat; | ||
95 | 127 | ||
96 | msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; | 128 | msg = mdp->msg; |
97 | cpu = uv_blade_processor_id(); | 129 | stat = &per_cpu(ptcstats, bcp->cpu); |
98 | msg->number_of_cpus = | 130 | stat->d_retries++; |
99 | uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); | 131 | /* |
100 | this_cpu_mask = 1UL << cpu; | 132 | * cancel any message from msg+1 to the retry itself |
101 | if (msp->seen_by.bits & this_cpu_mask) | 133 | */ |
102 | return; | 134 | for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { |
103 | atomic_or_long(&msp->seen_by.bits, this_cpu_mask); | 135 | if (msg2 > mdp->va_queue_last) |
136 | msg2 = mdp->va_queue_first; | ||
137 | if (msg2 == msg) | ||
138 | break; | ||
139 | |||
140 | /* same conditions for cancellation as uv_do_reset */ | ||
141 | if ((msg2->replied_to == 0) && (msg2->canceled == 0) && | ||
142 | (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & | ||
143 | msg->sw_ack_vector) == 0) && | ||
144 | (msg2->sending_cpu == msg->sending_cpu) && | ||
145 | (msg2->msg_type != MSG_NOOP)) { | ||
146 | slot2 = msg2 - mdp->va_queue_first; | ||
147 | mmr = uv_read_local_mmr | ||
148 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); | ||
149 | msg_res = ((msg2->sw_ack_vector << 8) | | ||
150 | msg2->sw_ack_vector); | ||
151 | /* | ||
152 | * This is a message retry; clear the resources held | ||
153 | * by the previous message only if they timed out. | ||
154 | * If it has not timed out we have an unexpected | ||
155 | * situation to report. | ||
156 | */ | ||
157 | if (mmr & (msg_res << 8)) { | ||
158 | /* | ||
159 | * is the resource timed out? | ||
160 | * make everyone ignore the cancelled message. | ||
161 | */ | ||
162 | msg2->canceled = 1; | ||
163 | stat->d_canceled++; | ||
164 | cancel_count++; | ||
165 | uv_write_local_mmr( | ||
166 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, | ||
167 | (msg_res << 8) | msg_res); | ||
168 | } else | ||
169 | printk(KERN_INFO "note bau retry: no effect\n"); | ||
170 | } | ||
171 | } | ||
172 | if (!cancel_count) | ||
173 | stat->d_nocanceled++; | ||
174 | } | ||
104 | 175 | ||
105 | if (msg->replied_to == 1) | 176 | /* |
106 | return; | 177 | * Do all the things a cpu should do for a TLB shootdown message. |
178 | * Other cpu's may come here at the same time for this message. | ||
179 | */ | ||
180 | static void uv_bau_process_message(struct msg_desc *mdp, | ||
181 | struct bau_control *bcp) | ||
182 | { | ||
183 | int msg_ack_count; | ||
184 | short socket_ack_count = 0; | ||
185 | struct ptc_stats *stat; | ||
186 | struct bau_payload_queue_entry *msg; | ||
187 | struct bau_control *smaster = bcp->socket_master; | ||
107 | 188 | ||
189 | /* | ||
190 | * This must be a normal message, or retry of a normal message | ||
191 | */ | ||
192 | msg = mdp->msg; | ||
193 | stat = &per_cpu(ptcstats, bcp->cpu); | ||
108 | if (msg->address == TLB_FLUSH_ALL) { | 194 | if (msg->address == TLB_FLUSH_ALL) { |
109 | local_flush_tlb(); | 195 | local_flush_tlb(); |
110 | __get_cpu_var(ptcstats).alltlb++; | 196 | stat->d_alltlb++; |
111 | } else { | 197 | } else { |
112 | __flush_tlb_one(msg->address); | 198 | __flush_tlb_one(msg->address); |
113 | __get_cpu_var(ptcstats).onetlb++; | 199 | stat->d_onetlb++; |
114 | } | 200 | } |
201 | stat->d_requestee++; | ||
202 | |||
203 | /* | ||
204 | * One cpu on each uvhub has the additional job on a RETRY | ||
205 | * of releasing the resource held by the message that is | ||
206 | * being retried. That message is identified by sending | ||
207 | * cpu number. | ||
208 | */ | ||
209 | if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) | ||
210 | uv_bau_process_retry_msg(mdp, bcp); | ||
115 | 211 | ||
116 | __get_cpu_var(ptcstats).requestee++; | 212 | /* |
213 | * This is a sw_ack message, so we have to reply to it. | ||
214 | * Count each responding cpu on the socket. This avoids | ||
215 | * pinging the count's cache line back and forth between | ||
216 | * the sockets. | ||
217 | */ | ||
218 | socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) | ||
219 | &smaster->socket_acknowledge_count[mdp->msg_slot]); | ||
220 | if (socket_ack_count == bcp->cpus_in_socket) { | ||
221 | /* | ||
222 | * Both sockets dump their completed count total into | ||
223 | * the message's count. | ||
224 | */ | ||
225 | smaster->socket_acknowledge_count[mdp->msg_slot] = 0; | ||
226 | msg_ack_count = atomic_add_short_return(socket_ack_count, | ||
227 | (struct atomic_short *)&msg->acknowledge_count); | ||
228 | |||
229 | if (msg_ack_count == bcp->cpus_in_uvhub) { | ||
230 | /* | ||
231 | * All cpus in uvhub saw it; reply | ||
232 | */ | ||
233 | uv_reply_to_message(mdp, bcp); | ||
234 | } | ||
235 | } | ||
117 | 236 | ||
118 | atomic_inc_short(&msg->acknowledge_count); | 237 | return; |
119 | if (msg->number_of_cpus == msg->acknowledge_count) | ||
120 | uv_reply_to_message(sw_ack_slot, msg, msp); | ||
121 | } | 238 | } |
122 | 239 | ||
123 | /* | 240 | /* |
124 | * Examine the payload queue on one distribution node to see | 241 | * Determine the first cpu on a uvhub. |
125 | * which messages have not been seen, and which cpu(s) have not seen them. | 242 | */ |
243 | static int uvhub_to_first_cpu(int uvhub) | ||
244 | { | ||
245 | int cpu; | ||
246 | for_each_present_cpu(cpu) | ||
247 | if (uvhub == uv_cpu_to_blade_id(cpu)) | ||
248 | return cpu; | ||
249 | return -1; | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * Last resort when we get a large number of destination timeouts is | ||
254 | * to clear resources held by a given cpu. | ||
255 | * Do this with IPI so that all messages in the BAU message queue | ||
256 | * can be identified by their nonzero sw_ack_vector field. | ||
126 | * | 257 | * |
127 | * Returns the number of cpu's that have not responded. | 258 | * This is entered for a single cpu on the uvhub. |
259 | * The sender wants this uvhub to free a specific message's | ||
260 | * sw_ack resources. | ||
128 | */ | 261 | */ |
129 | static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) | 262 | static void |
263 | uv_do_reset(void *ptr) | ||
130 | { | 264 | { |
131 | struct bau_payload_queue_entry *msg; | ||
132 | struct bau_msg_status *msp; | ||
133 | int count = 0; | ||
134 | int i; | 265 | int i; |
135 | int j; | 266 | int slot; |
267 | int count = 0; | ||
268 | unsigned long mmr; | ||
269 | unsigned long msg_res; | ||
270 | struct bau_control *bcp; | ||
271 | struct reset_args *rap; | ||
272 | struct bau_payload_queue_entry *msg; | ||
273 | struct ptc_stats *stat; | ||
136 | 274 | ||
137 | for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; | 275 | bcp = &per_cpu(bau_control, smp_processor_id()); |
138 | msg++, i++) { | 276 | rap = (struct reset_args *)ptr; |
139 | if ((msg->sending_cpu == sender) && (!msg->replied_to)) { | 277 | stat = &per_cpu(ptcstats, bcp->cpu); |
140 | msp = bau_tablesp->msg_statuses + i; | 278 | stat->d_resets++; |
141 | printk(KERN_DEBUG | 279 | |
142 | "blade %d: address:%#lx %d of %d, not cpu(s): ", | 280 | /* |
143 | i, msg->address, msg->acknowledge_count, | 281 | * We're looking for the given sender, and |
144 | msg->number_of_cpus); | 282 | * will free its sw_ack resource. |
145 | for (j = 0; j < msg->number_of_cpus; j++) { | 283 | * If all cpu's finally responded after the timeout, its |
146 | if (!((1L << j) & msp->seen_by.bits)) { | 284 | * message 'replied_to' was set. |
147 | count++; | 285 | */ |
148 | printk("%d ", j); | 286 | for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { |
149 | } | 287 | /* uv_do_reset: same conditions for cancellation as |
288 | uv_bau_process_retry_msg() */ | ||
289 | if ((msg->replied_to == 0) && | ||
290 | (msg->canceled == 0) && | ||
291 | (msg->sending_cpu == rap->sender) && | ||
292 | (msg->sw_ack_vector) && | ||
293 | (msg->msg_type != MSG_NOOP)) { | ||
294 | /* | ||
295 | * make everyone else ignore this message | ||
296 | */ | ||
297 | msg->canceled = 1; | ||
298 | slot = msg - bcp->va_queue_first; | ||
299 | count++; | ||
300 | /* | ||
301 | * only reset the resource if it is still pending | ||
302 | */ | ||
303 | mmr = uv_read_local_mmr | ||
304 | (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); | ||
305 | msg_res = ((msg->sw_ack_vector << 8) | | ||
306 | msg->sw_ack_vector); | ||
307 | if (mmr & msg_res) { | ||
308 | stat->d_rcanceled++; | ||
309 | uv_write_local_mmr( | ||
310 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, | ||
311 | msg_res); | ||
150 | } | 312 | } |
151 | printk("\n"); | ||
152 | } | 313 | } |
153 | } | 314 | } |
154 | return count; | 315 | return; |
155 | } | 316 | } |
156 | 317 | ||
157 | /* | 318 | /* |
158 | * Examine the payload queue on all the distribution nodes to see | 319 | * Use IPI to get all target uvhubs to release resources held by |
159 | * which messages have not been seen, and which cpu(s) have not seen them. | 320 | * a given sending cpu number. |
160 | * | ||
161 | * Returns the number of cpu's that have not responded. | ||
162 | */ | 321 | */ |
163 | static int uv_examine_destinations(struct bau_target_nodemask *distribution) | 322 | static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, |
323 | int sender) | ||
164 | { | 324 | { |
165 | int sender; | 325 | int uvhub; |
166 | int i; | 326 | int cpu; |
167 | int count = 0; | 327 | cpumask_t mask; |
328 | struct reset_args reset_args; | ||
329 | |||
330 | reset_args.sender = sender; | ||
168 | 331 | ||
169 | sender = smp_processor_id(); | 332 | cpus_clear(mask); |
170 | for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { | 333 | /* find a single cpu for each uvhub in this distribution mask */ |
171 | if (!bau_node_isset(i, distribution)) | 334 | for (uvhub = 0; |
335 | uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; | ||
336 | uvhub++) { | ||
337 | if (!bau_uvhub_isset(uvhub, distribution)) | ||
172 | continue; | 338 | continue; |
173 | count += uv_examine_destination(uv_bau_table_bases[i], sender); | 339 | /* find a cpu for this uvhub */ |
340 | cpu = uvhub_to_first_cpu(uvhub); | ||
341 | cpu_set(cpu, mask); | ||
174 | } | 342 | } |
175 | return count; | 343 | /* IPI all cpus; Preemption is already disabled */ |
344 | smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); | ||
345 | return; | ||
346 | } | ||
347 | |||
348 | static inline unsigned long | ||
349 | cycles_2_us(unsigned long long cyc) | ||
350 | { | ||
351 | unsigned long long ns; | ||
352 | unsigned long us; | ||
353 | ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) | ||
354 | >> CYC2NS_SCALE_FACTOR; | ||
355 | us = ns / 1000; | ||
356 | return us; | ||
176 | } | 357 | } |
177 | 358 | ||
178 | /* | 359 | /* |
179 | * wait for completion of a broadcast message | 360 | * wait for all cpus on this hub to finish their sends and go quiet |
180 | * | 361 | * leaves uvhub_quiesce set so that no new broadcasts are started by |
181 | * return COMPLETE, RETRY or GIVEUP | 362 | * bau_flush_send_and_wait() |
363 | */ | ||
364 | static inline void | ||
365 | quiesce_local_uvhub(struct bau_control *hmaster) | ||
366 | { | ||
367 | atomic_add_short_return(1, (struct atomic_short *) | ||
368 | &hmaster->uvhub_quiesce); | ||
369 | } | ||
370 | |||
371 | /* | ||
372 | * mark this quiet-requestor as done | ||
373 | */ | ||
374 | static inline void | ||
375 | end_uvhub_quiesce(struct bau_control *hmaster) | ||
376 | { | ||
377 | atomic_add_short_return(-1, (struct atomic_short *) | ||
378 | &hmaster->uvhub_quiesce); | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * Wait for completion of a broadcast software ack message | ||
383 | * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP | ||
182 | */ | 384 | */ |
183 | static int uv_wait_completion(struct bau_desc *bau_desc, | 385 | static int uv_wait_completion(struct bau_desc *bau_desc, |
184 | unsigned long mmr_offset, int right_shift) | 386 | unsigned long mmr_offset, int right_shift, int this_cpu, |
387 | struct bau_control *bcp, struct bau_control *smaster, long try) | ||
185 | { | 388 | { |
186 | int exams = 0; | 389 | int relaxes = 0; |
187 | long destination_timeouts = 0; | ||
188 | long source_timeouts = 0; | ||
189 | unsigned long descriptor_status; | 390 | unsigned long descriptor_status; |
391 | unsigned long mmr; | ||
392 | unsigned long mask; | ||
393 | cycles_t ttime; | ||
394 | cycles_t timeout_time; | ||
395 | struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); | ||
396 | struct bau_control *hmaster; | ||
397 | |||
398 | hmaster = bcp->uvhub_master; | ||
399 | timeout_time = get_cycles() + bcp->timeout_interval; | ||
190 | 400 | ||
401 | /* spin on the status MMR, waiting for it to go idle */ | ||
191 | while ((descriptor_status = (((unsigned long) | 402 | while ((descriptor_status = (((unsigned long) |
192 | uv_read_local_mmr(mmr_offset) >> | 403 | uv_read_local_mmr(mmr_offset) >> |
193 | right_shift) & UV_ACT_STATUS_MASK)) != | 404 | right_shift) & UV_ACT_STATUS_MASK)) != |
194 | DESC_STATUS_IDLE) { | 405 | DESC_STATUS_IDLE) { |
195 | if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { | ||
196 | source_timeouts++; | ||
197 | if (source_timeouts > SOURCE_TIMEOUT_LIMIT) | ||
198 | source_timeouts = 0; | ||
199 | __get_cpu_var(ptcstats).s_retry++; | ||
200 | return FLUSH_RETRY; | ||
201 | } | ||
202 | /* | 406 | /* |
203 | * spin here looking for progress at the destinations | 407 | * Our software ack messages may be blocked because there are |
408 | * no swack resources available. As long as none of them | ||
409 | * has timed out, hardware will NACK our message and its | ||
410 | * state will stay IDLE. | ||
204 | */ | 411 | */ |
205 | if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { | 412 | if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { |
206 | destination_timeouts++; | 413 | stat->s_stimeout++; |
207 | if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { | 414 | return FLUSH_GIVEUP; |
208 | /* | 415 | } else if (descriptor_status == |
209 | * returns number of cpus not responding | 416 | DESC_STATUS_DESTINATION_TIMEOUT) { |
210 | */ | 417 | stat->s_dtimeout++; |
211 | if (uv_examine_destinations | 418 | ttime = get_cycles(); |
212 | (&bau_desc->distribution) == 0) { | 419 | |
213 | __get_cpu_var(ptcstats).d_retry++; | 420 | /* |
214 | return FLUSH_RETRY; | 421 | * Our retries may be blocked by all destination |
215 | } | 422 | * swack resources being consumed, and a timeout |
216 | exams++; | 423 | * pending. In that case hardware returns the |
217 | if (exams >= uv_bau_retry_limit) { | 424 | * ERROR that looks like a destination timeout. |
218 | printk(KERN_DEBUG | 425 | */ |
219 | "uv_flush_tlb_others"); | 426 | if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { |
220 | printk("giving up on cpu %d\n", | 427 | bcp->conseccompletes = 0; |
221 | smp_processor_id()); | 428 | return FLUSH_RETRY_PLUGGED; |
429 | } | ||
430 | |||
431 | bcp->conseccompletes = 0; | ||
432 | return FLUSH_RETRY_TIMEOUT; | ||
433 | } else { | ||
434 | /* | ||
435 | * descriptor_status is still BUSY | ||
436 | */ | ||
437 | cpu_relax(); | ||
438 | relaxes++; | ||
439 | if (relaxes >= 10000) { | ||
440 | relaxes = 0; | ||
441 | if (get_cycles() > timeout_time) { | ||
442 | quiesce_local_uvhub(hmaster); | ||
443 | |||
444 | /* single-thread the register change */ | ||
445 | spin_lock(&hmaster->masks_lock); | ||
446 | mmr = uv_read_local_mmr(mmr_offset); | ||
447 | mask = 0UL; | ||
448 | mask |= (3UL << right_shift); | ||
449 | mask = ~mask; | ||
450 | mmr &= mask; | ||
451 | uv_write_local_mmr(mmr_offset, mmr); | ||
452 | spin_unlock(&hmaster->masks_lock); | ||
453 | end_uvhub_quiesce(hmaster); | ||
454 | stat->s_busy++; | ||
222 | return FLUSH_GIVEUP; | 455 | return FLUSH_GIVEUP; |
223 | } | 456 | } |
224 | /* | ||
225 | * delays can hang the simulator | ||
226 | udelay(1000); | ||
227 | */ | ||
228 | destination_timeouts = 0; | ||
229 | } | 457 | } |
230 | } | 458 | } |
231 | cpu_relax(); | ||
232 | } | 459 | } |
460 | bcp->conseccompletes++; | ||
233 | return FLUSH_COMPLETE; | 461 | return FLUSH_COMPLETE; |
234 | } | 462 | } |
235 | 463 | ||
464 | static inline cycles_t | ||
465 | sec_2_cycles(unsigned long sec) | ||
466 | { | ||
467 | unsigned long ns; | ||
468 | cycles_t cyc; | ||
469 | |||
470 | ns = sec * 1000000000; | ||
471 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); | ||
472 | return cyc; | ||
473 | } | ||
474 | |||
475 | /* | ||
476 | * conditionally add 1 to *v, unless *v is >= u | ||
477 | * return 0 if we cannot add 1 to *v because it is >= u | ||
478 | * return 1 if we can add 1 to *v because it is < u | ||
479 | * the add is atomic | ||
480 | * | ||
481 | * This is close to atomic_add_unless(), but this allows the 'u' value | ||
482 | * to be lowered below the current 'v'. atomic_add_unless can only stop | ||
483 | * on equal. | ||
484 | */ | ||
485 | static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) | ||
486 | { | ||
487 | spin_lock(lock); | ||
488 | if (atomic_read(v) >= u) { | ||
489 | spin_unlock(lock); | ||
490 | return 0; | ||
491 | } | ||
492 | atomic_inc(v); | ||
493 | spin_unlock(lock); | ||
494 | return 1; | ||
495 | } | ||
496 | |||
236 | /** | 497 | /** |
237 | * uv_flush_send_and_wait | 498 | * uv_flush_send_and_wait |
238 | * | 499 | * |
239 | * Send a broadcast and wait for a broadcast message to complete. | 500 | * Send a broadcast and wait for it to complete. |
240 | * | 501 | * |
241 | * The flush_mask contains the cpus the broadcast was sent to. | 502 | * The flush_mask contains the cpus the broadcast is to be sent to, plus |
503 | * cpus that are on the local uvhub. | ||
242 | * | 504 | * |
243 | * Returns NULL if all remote flushing was done. The mask is zeroed. | 505 | * Returns NULL if all flushing represented in the mask was done. The mask |
506 | * is zeroed. | ||
244 | * Returns @flush_mask if some remote flushing remains to be done. The | 507 | * Returns @flush_mask if some remote flushing remains to be done. The |
245 | * mask will have some bits still set. | 508 | * mask will have some bits still set, representing any cpus on the local |
509 | * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. | ||
246 | */ | 510 | */ |
247 | const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, | 511 | const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, |
248 | struct bau_desc *bau_desc, | 512 | struct cpumask *flush_mask, |
249 | struct cpumask *flush_mask) | 513 | struct bau_control *bcp) |
250 | { | 514 | { |
251 | int completion_status = 0; | ||
252 | int right_shift; | 515 | int right_shift; |
253 | int tries = 0; | 516 | int uvhub; |
254 | int pnode; | ||
255 | int bit; | 517 | int bit; |
518 | int completion_status = 0; | ||
519 | int seq_number = 0; | ||
520 | long try = 0; | ||
521 | int cpu = bcp->uvhub_cpu; | ||
522 | int this_cpu = bcp->cpu; | ||
523 | int this_uvhub = bcp->uvhub; | ||
256 | unsigned long mmr_offset; | 524 | unsigned long mmr_offset; |
257 | unsigned long index; | 525 | unsigned long index; |
258 | cycles_t time1; | 526 | cycles_t time1; |
259 | cycles_t time2; | 527 | cycles_t time2; |
528 | struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); | ||
529 | struct bau_control *smaster = bcp->socket_master; | ||
530 | struct bau_control *hmaster = bcp->uvhub_master; | ||
531 | |||
532 | /* | ||
533 | * Spin here while there are hmaster->max_concurrent or more active | ||
534 | * descriptors. This is the per-uvhub 'throttle'. | ||
535 | */ | ||
536 | if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, | ||
537 | &hmaster->active_descriptor_count, | ||
538 | hmaster->max_concurrent)) { | ||
539 | stat->s_throttles++; | ||
540 | do { | ||
541 | cpu_relax(); | ||
542 | } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, | ||
543 | &hmaster->active_descriptor_count, | ||
544 | hmaster->max_concurrent)); | ||
545 | } | ||
546 | |||
547 | while (hmaster->uvhub_quiesce) | ||
548 | cpu_relax(); | ||
260 | 549 | ||
261 | if (cpu < UV_CPUS_PER_ACT_STATUS) { | 550 | if (cpu < UV_CPUS_PER_ACT_STATUS) { |
262 | mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; | 551 | mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; |
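atomic_inc_unless_ge() above is the heart of the per-uvhub throttle in uv_flush_send_and_wait(): a sender proceeds only while active_descriptor_count is below max_concurrent, spinning otherwise. A user-space sketch of the same pattern, with a pthread mutex standing in for the spinlock (all names are illustrative):

	#include <pthread.h>

	struct throttle {
		pthread_mutex_t lock;
		int active;	/* like active_descriptor_count */
		int max;	/* like max_concurrent */
	};

	static struct throttle thr = { PTHREAD_MUTEX_INITIALIZER, 0, 16 };

	/* add 1 to ->active unless it is already >= ->max */
	static int inc_unless_ge(struct throttle *t)
	{
		int ok = 0;

		pthread_mutex_lock(&t->lock);
		if (t->active < t->max) {
			t->active++;
			ok = 1;
		}
		pthread_mutex_unlock(&t->lock);
		return ok;
	}

	static void throttle_enter(struct throttle *t)
	{
		while (!inc_unless_ge(t))
			;	/* spin, like the cpu_relax() loop above */
	}

	static void throttle_exit(struct throttle *t)
	{
		pthread_mutex_lock(&t->lock);
		t->active--;
		pthread_mutex_unlock(&t->lock);
	}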
@@ -268,24 +557,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, | |||
268 | } | 557 | } |
269 | time1 = get_cycles(); | 558 | time1 = get_cycles(); |
270 | do { | 559 | do { |
271 | tries++; | 560 | /* |
561 | * Every message from any given cpu gets a unique message | ||
562 | * sequence number. But retries use that same number. | ||
563 | * Our message may have timed out at the destination because | ||
564 | * all sw-ack resources are in use and there is a timeout | ||
565 | * pending there. In that case, our last send never got | ||
566 | * placed into the queue and we need to persist until it | ||
567 | * does. | ||
568 | * | ||
569 | * Make any retry a type MSG_RETRY so that the destination will | ||
570 | * free any resource held by a previous message from this cpu. | ||
571 | */ | ||
572 | if (try == 0) { | ||
573 | /* use message type set by the caller the first time */ | ||
574 | seq_number = bcp->message_number++; | ||
575 | } else { | ||
576 | /* use RETRY type on all the rest; same sequence */ | ||
577 | bau_desc->header.msg_type = MSG_RETRY; | ||
578 | stat->s_retry_messages++; | ||
579 | } | ||
580 | bau_desc->header.sequence = seq_number; | ||
272 | index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | | 581 | index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | |
273 | cpu; | 582 | bcp->uvhub_cpu; |
583 | bcp->send_message = get_cycles(); | ||
584 | |||
274 | uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); | 585 | uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); |
586 | |||
587 | try++; | ||
275 | completion_status = uv_wait_completion(bau_desc, mmr_offset, | 588 | completion_status = uv_wait_completion(bau_desc, mmr_offset, |
276 | right_shift); | 589 | right_shift, this_cpu, bcp, smaster, try); |
277 | } while (completion_status == FLUSH_RETRY); | 590 | |
591 | if (completion_status == FLUSH_RETRY_PLUGGED) { | ||
592 | /* | ||
593 | * Our retries may be blocked by all destination swack | ||
594 | * resources being consumed, and a timeout pending. In | ||
595 | * that case hardware immediately returns the ERROR | ||
596 | * that looks like a destination timeout. | ||
597 | */ | ||
598 | udelay(TIMEOUT_DELAY); | ||
599 | bcp->plugged_tries++; | ||
600 | if (bcp->plugged_tries >= PLUGSB4RESET) { | ||
601 | bcp->plugged_tries = 0; | ||
602 | quiesce_local_uvhub(hmaster); | ||
603 | spin_lock(&hmaster->queue_lock); | ||
604 | uv_reset_with_ipi(&bau_desc->distribution, | ||
605 | this_cpu); | ||
606 | spin_unlock(&hmaster->queue_lock); | ||
607 | end_uvhub_quiesce(hmaster); | ||
608 | bcp->ipi_attempts++; | ||
609 | stat->s_resets_plug++; | ||
610 | } | ||
611 | } else if (completion_status == FLUSH_RETRY_TIMEOUT) { | ||
612 | hmaster->max_concurrent = 1; | ||
613 | bcp->timeout_tries++; | ||
614 | udelay(TIMEOUT_DELAY); | ||
615 | if (bcp->timeout_tries >= TIMEOUTSB4RESET) { | ||
616 | bcp->timeout_tries = 0; | ||
617 | quiesce_local_uvhub(hmaster); | ||
618 | spin_lock(&hmaster->queue_lock); | ||
619 | uv_reset_with_ipi(&bau_desc->distribution, | ||
620 | this_cpu); | ||
621 | spin_unlock(&hmaster->queue_lock); | ||
622 | end_uvhub_quiesce(hmaster); | ||
623 | bcp->ipi_attempts++; | ||
624 | stat->s_resets_timeout++; | ||
625 | } | ||
626 | } | ||
627 | if (bcp->ipi_attempts >= 3) { | ||
628 | bcp->ipi_attempts = 0; | ||
629 | completion_status = FLUSH_GIVEUP; | ||
630 | break; | ||
631 | } | ||
632 | cpu_relax(); | ||
633 | } while ((completion_status == FLUSH_RETRY_PLUGGED) || | ||
634 | (completion_status == FLUSH_RETRY_TIMEOUT)); | ||
278 | time2 = get_cycles(); | 635 | time2 = get_cycles(); |
279 | __get_cpu_var(ptcstats).sflush += (time2 - time1); | ||
280 | if (tries > 1) | ||
281 | __get_cpu_var(ptcstats).retriesok++; | ||
282 | 636 | ||
283 | if (completion_status == FLUSH_GIVEUP) { | 637 | if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) |
638 | && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) | ||
639 | hmaster->max_concurrent++; | ||
640 | |||
641 | /* | ||
642 | * hold any cpu not timing out here; no other cpu currently held by | ||
643 | * the 'throttle' should enter the activation code | ||
644 | */ | ||
645 | while (hmaster->uvhub_quiesce) | ||
646 | cpu_relax(); | ||
647 | atomic_dec(&hmaster->active_descriptor_count); | ||
648 | |||
649 | /* guard against cycles wrap */ | ||
650 | if (time2 > time1) | ||
651 | stat->s_time += (time2 - time1); | ||
652 | else | ||
653 | stat->s_requestor--; /* don't count this one */ | ||
654 | if (completion_status == FLUSH_COMPLETE && try > 1) | ||
655 | stat->s_retriesok++; | ||
656 | else if (completion_status == FLUSH_GIVEUP) { | ||
284 | /* | 657 | /* |
285 | * Cause the caller to do an IPI-style TLB shootdown on | 658 | * Cause the caller to do an IPI-style TLB shootdown on |
286 | * the cpu's, all of which are still in the mask. | 659 | * the target cpu's, all of which are still in the mask. |
287 | */ | 660 | */ |
288 | __get_cpu_var(ptcstats).ptc_i++; | 661 | stat->s_giveup++; |
289 | return flush_mask; | 662 | return flush_mask; |
290 | } | 663 | } |
291 | 664 | ||
@@ -294,18 +667,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, | |||
294 | * use the IPI method of shootdown on them. | 667 | * use the IPI method of shootdown on them. |
295 | */ | 668 | */ |
296 | for_each_cpu(bit, flush_mask) { | 669 | for_each_cpu(bit, flush_mask) { |
297 | pnode = uv_cpu_to_pnode(bit); | 670 | uvhub = uv_cpu_to_blade_id(bit); |
298 | if (pnode == this_pnode) | 671 | if (uvhub == this_uvhub) |
299 | continue; | 672 | continue; |
300 | cpumask_clear_cpu(bit, flush_mask); | 673 | cpumask_clear_cpu(bit, flush_mask); |
301 | } | 674 | } |
302 | if (!cpumask_empty(flush_mask)) | 675 | if (!cpumask_empty(flush_mask)) |
303 | return flush_mask; | 676 | return flush_mask; |
677 | |||
304 | return NULL; | 678 | return NULL; |
305 | } | 679 | } |
306 | 680 | ||
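
The do/while above encodes the sender's recovery policy: FLUSH_RETRY_PLUGGED (all destination software-ack resources consumed) and FLUSH_RETRY_TIMEOUT are retried after a udelay; PLUGSB4RESET or TIMEOUTSB4RESET consecutive occurrences escalate to quiescing the hub and resetting the destination resources by IPI; after three such resets the sender gives up and falls back to IPI-style shootdown. A minimal userspace model of that escalation follows; the threshold values are hypothetical and wait_completion() is a scripted stand-in for uv_wait_completion().

#include <stdio.h>

enum flush { FLUSH_COMPLETE, FLUSH_RETRY_PLUGGED, FLUSH_RETRY_TIMEOUT, FLUSH_GIVEUP };

#define PLUGSB4RESET    4       /* hypothetical; the real values live in uv_bau.h */
#define TIMEOUTSB4RESET 3
#define IPI_RESET_LIMIT 3       /* matches the ipi_attempts >= 3 test above */

/* scripted stand-in for uv_wait_completion() */
static enum flush wait_completion(int try)
{
        static const enum flush script[] = {
                FLUSH_RETRY_PLUGGED, FLUSH_RETRY_PLUGGED,
                FLUSH_RETRY_TIMEOUT, FLUSH_COMPLETE,
        };
        return script[try < 4 ? try : 3];
}

int main(void)
{
        int try = 0, plugged = 0, timeouts = 0, ipi_resets = 0;
        enum flush status;

        do {
                status = wait_completion(try++);
                if (status == FLUSH_RETRY_PLUGGED && ++plugged >= PLUGSB4RESET) {
                        plugged = 0;
                        ipi_resets++;           /* quiesce hub, reset via IPI */
                } else if (status == FLUSH_RETRY_TIMEOUT &&
                           ++timeouts >= TIMEOUTSB4RESET) {
                        timeouts = 0;
                        ipi_resets++;
                }
                if (ipi_resets >= IPI_RESET_LIMIT) {
                        status = FLUSH_GIVEUP;  /* caller falls back to IPIs */
                        break;
                }
        } while (status == FLUSH_RETRY_PLUGGED || status == FLUSH_RETRY_TIMEOUT);

        printf("finished after %d tries, status %d\n", try, status);
        return 0;
}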
307 | static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); | ||
308 | |||
309 | /** | 681 | /** |
310 | * uv_flush_tlb_others - globally purge translation cache of a virtual | 682 | * uv_flush_tlb_others - globally purge translation cache of a virtual |
311 | * address or all TLB's | 683 | * address or all TLB's |
@@ -322,8 +694,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); | |||
322 | * The caller has derived the cpumask from the mm_struct. This function | 694 | * The caller has derived the cpumask from the mm_struct. This function |
323 | * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) | 695 | * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) |
324 | * | 696 | * |
325 | * The cpumask is converted into a nodemask of the nodes containing | 697 | * The cpumask is converted into a uvhubmask of the uvhubs containing |
326 | * the cpus. | 698 | * those cpus. |
327 | * | 699 | * |
328 | * Note that this function should be called with preemption disabled. | 700 | * Note that this function should be called with preemption disabled. |
329 | * | 701 | * |
@@ -335,52 +707,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
335 | struct mm_struct *mm, | 707 | struct mm_struct *mm, |
336 | unsigned long va, unsigned int cpu) | 708 | unsigned long va, unsigned int cpu) |
337 | { | 709 | { |
338 | struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); | 710 | int remotes; |
339 | int i; | 711 | int tcpu; |
340 | int bit; | 712 | int uvhub; |
341 | int pnode; | ||
342 | int uv_cpu; | ||
343 | int this_pnode; | ||
344 | int locals = 0; | 713 | int locals = 0; |
345 | struct bau_desc *bau_desc; | 714 | struct bau_desc *bau_desc; |
715 | struct cpumask *flush_mask; | ||
716 | struct ptc_stats *stat; | ||
717 | struct bau_control *bcp; | ||
346 | 718 | ||
347 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); | 719 | if (nobau) |
720 | return cpumask; | ||
348 | 721 | ||
349 | uv_cpu = uv_blade_processor_id(); | 722 | bcp = &per_cpu(bau_control, cpu); |
350 | this_pnode = uv_hub_info->pnode; | 723 | /* |
351 | bau_desc = __get_cpu_var(bau_control).descriptor_base; | 724 | * Each sending cpu has a per-cpu mask which it fills from the caller's |
352 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; | 725 | * cpu mask. Only remote cpus are converted to uvhubs and copied. |
726 | */ | ||
727 | flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); | ||
728 | /* | ||
729 | * copy cpumask to flush_mask, removing current cpu | ||
730 | * (current cpu should already have been flushed by the caller and | ||
731 | * should never be returned if we return flush_mask) | ||
732 | */ | ||
733 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); | ||
734 | if (cpu_isset(cpu, *cpumask)) | ||
735 | locals++; /* current cpu was targeted */ | ||
353 | 736 | ||
354 | bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); | 737 | bau_desc = bcp->descriptor_base; |
738 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; | ||
355 | 739 | ||
356 | i = 0; | 740 | bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); |
357 | for_each_cpu(bit, flush_mask) { | 741 | remotes = 0; |
358 | pnode = uv_cpu_to_pnode(bit); | 742 | for_each_cpu(tcpu, flush_mask) { |
359 | BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); | 743 | uvhub = uv_cpu_to_blade_id(tcpu); |
360 | if (pnode == this_pnode) { | 744 | if (uvhub == bcp->uvhub) { |
361 | locals++; | 745 | locals++; |
362 | continue; | 746 | continue; |
363 | } | 747 | } |
364 | bau_node_set(pnode - uv_partition_base_pnode, | 748 | bau_uvhub_set(uvhub, &bau_desc->distribution); |
365 | &bau_desc->distribution); | 749 | remotes++; |
366 | i++; | ||
367 | } | 750 | } |
368 | if (i == 0) { | 751 | if (remotes == 0) { |
369 | /* | 752 | /* |
370 | * no off_node flushing; return status for local node | 753 | * No off_hub flushing; return status for local hub. |
754 | * Return the caller's mask if all were local (the current | ||
755 | * cpu may be in that mask). | ||
371 | */ | 756 | */ |
372 | if (locals) | 757 | if (locals) |
373 | return flush_mask; | 758 | return cpumask; |
374 | else | 759 | else |
375 | return NULL; | 760 | return NULL; |
376 | } | 761 | } |
377 | __get_cpu_var(ptcstats).requestor++; | 762 | stat = &per_cpu(ptcstats, cpu); |
378 | __get_cpu_var(ptcstats).ntargeted += i; | 763 | stat->s_requestor++; |
764 | stat->s_ntargcpu += remotes; | ||
765 | remotes = bau_uvhub_weight(&bau_desc->distribution); | ||
766 | stat->s_ntarguvhub += remotes; | ||
767 | if (remotes >= 16) | ||
768 | stat->s_ntarguvhub16++; | ||
769 | else if (remotes >= 8) | ||
770 | stat->s_ntarguvhub8++; | ||
771 | else if (remotes >= 4) | ||
772 | stat->s_ntarguvhub4++; | ||
773 | else if (remotes >= 2) | ||
774 | stat->s_ntarguvhub2++; | ||
775 | else | ||
776 | stat->s_ntarguvhub1++; | ||
379 | 777 | ||
380 | bau_desc->payload.address = va; | 778 | bau_desc->payload.address = va; |
381 | bau_desc->payload.sending_cpu = cpu; | 779 | bau_desc->payload.sending_cpu = cpu; |
382 | 780 | ||
383 | return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); | 781 | /* |
782 | * uv_flush_send_and_wait returns null if all cpu's were messaged, or | ||
783 | * the adjusted flush_mask if any cpu's were not messaged. | ||
784 | */ | ||
785 | return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); | ||
384 | } | 786 | } |
385 | 787 | ||
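
uv_flush_tlb_others() above splits the caller's mask into local cpus (already flushed by the caller) and remote hubs (set in the descriptor's distribution map), then buckets the per-request hub count into the s_ntarguvhub{16,8,4,2,1} counters. The bucketing is a plain threshold cascade; a standalone sketch of just that histogram logic:

#include <stdio.h>

/* model of the s_ntarguvhub{16,8,4,2,1} histogram kept per sending cpu */
struct stats { long h16, h8, h4, h2, h1; };

static void count_hubs(struct stats *s, int hubs)
{
        if (hubs >= 16)
                s->h16++;
        else if (hubs >= 8)
                s->h8++;
        else if (hubs >= 4)
                s->h4++;
        else if (hubs >= 2)
                s->h2++;
        else
                s->h1++;
}

int main(void)
{
        struct stats s = { 0 };
        int samples[] = { 1, 2, 5, 9, 17, 3 };  /* hub counts from 6 requests */
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                count_hubs(&s, samples[i]);
        printf(">=16:%ld >=8:%ld >=4:%ld >=2:%ld 1:%ld\n",
               s.h16, s.h8, s.h4, s.h2, s.h1);
        return 0;
}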
386 | /* | 788 | /* |
@@ -389,87 +791,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, | |||
389 | * | 791 | * |
390 | * We received a broadcast assist message. | 792 | * We received a broadcast assist message. |
391 | * | 793 | * |
392 | * Interrupts may have been disabled; this interrupt could represent | 794 | * Interrupts are disabled; this interrupt could represent |
393 | * the receipt of several messages. | 795 | * the receipt of several messages. |
394 | * | 796 | * |
395 | * All cores/threads on this node get this interrupt. | 797 | * All cores/threads on this hub get this interrupt. |
396 | * The last one to see it does the s/w ack. | 798 | * The last one to see it does the software ack. |
397 | * (the resource will not be freed until noninterruptible cpus see this | 799 | * (the resource will not be freed until noninterruptible cpus see this |
398 | * interrupt; hardware will timeout the s/w ack and reply ERROR) | 800 | * interrupt; hardware may timeout the s/w ack and reply ERROR) |
399 | */ | 801 | */ |
400 | void uv_bau_message_interrupt(struct pt_regs *regs) | 802 | void uv_bau_message_interrupt(struct pt_regs *regs) |
401 | { | 803 | { |
402 | struct bau_payload_queue_entry *va_queue_first; | ||
403 | struct bau_payload_queue_entry *va_queue_last; | ||
404 | struct bau_payload_queue_entry *msg; | ||
405 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
406 | cycles_t time1; | ||
407 | cycles_t time2; | ||
408 | int msg_slot; | ||
409 | int sw_ack_slot; | ||
410 | int fw; | ||
411 | int count = 0; | 804 | int count = 0; |
412 | unsigned long local_pnode; | 805 | cycles_t time_start; |
413 | 806 | struct bau_payload_queue_entry *msg; | |
414 | ack_APIC_irq(); | 807 | struct bau_control *bcp; |
415 | exit_idle(); | 808 | struct ptc_stats *stat; |
416 | irq_enter(); | 809 | struct msg_desc msgdesc; |
417 | 810 | ||
418 | time1 = get_cycles(); | 811 | time_start = get_cycles(); |
419 | 812 | bcp = &per_cpu(bau_control, smp_processor_id()); | |
420 | local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); | 813 | stat = &per_cpu(ptcstats, smp_processor_id()); |
421 | 814 | msgdesc.va_queue_first = bcp->va_queue_first; | |
422 | va_queue_first = __get_cpu_var(bau_control).va_queue_first; | 815 | msgdesc.va_queue_last = bcp->va_queue_last; |
423 | va_queue_last = __get_cpu_var(bau_control).va_queue_last; | 816 | msg = bcp->bau_msg_head; |
424 | |||
425 | msg = __get_cpu_var(bau_control).bau_msg_head; | ||
426 | while (msg->sw_ack_vector) { | 817 | while (msg->sw_ack_vector) { |
427 | count++; | 818 | count++; |
428 | fw = msg->sw_ack_vector; | 819 | msgdesc.msg_slot = msg - msgdesc.va_queue_first; |
429 | msg_slot = msg - va_queue_first; | 820 | msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; |
430 | sw_ack_slot = ffs(fw) - 1; | 821 | msgdesc.msg = msg; |
431 | 822 | uv_bau_process_message(&msgdesc, bcp); | |
432 | uv_bau_process_message(msg, msg_slot, sw_ack_slot); | ||
433 | |||
434 | msg++; | 823 | msg++; |
435 | if (msg > va_queue_last) | 824 | if (msg > msgdesc.va_queue_last) |
436 | msg = va_queue_first; | 825 | msg = msgdesc.va_queue_first; |
437 | __get_cpu_var(bau_control).bau_msg_head = msg; | 826 | bcp->bau_msg_head = msg; |
438 | } | 827 | } |
828 | stat->d_time += (get_cycles() - time_start); | ||
439 | if (!count) | 829 | if (!count) |
440 | __get_cpu_var(ptcstats).nomsg++; | 830 | stat->d_nomsg++; |
441 | else if (count > 1) | 831 | else if (count > 1) |
442 | __get_cpu_var(ptcstats).multmsg++; | 832 | stat->d_multmsg++; |
443 | 833 | ack_APIC_irq(); | |
444 | time2 = get_cycles(); | ||
445 | __get_cpu_var(ptcstats).dflush += (time2 - time1); | ||
446 | |||
447 | irq_exit(); | ||
448 | set_irq_regs(old_regs); | ||
449 | } | 834 | } |
450 | 835 | ||
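
The interrupt handler above drains a circular payload queue: it walks entries from bau_msg_head while sw_ack_vector is nonzero, wrapping from va_queue_last back to va_queue_first, and records whether the interrupt found zero, one, or several messages. A userspace model of the wrap-around walk, using a hypothetical 8-entry ring (the real handler never clears sw_ack_vector itself; the hardware does so when the software ack is replied via an MMR write, so the demo zeroes it only to terminate):

#include <stdio.h>
#include <strings.h>

#define DEST_Q_SIZE 8   /* hypothetical ring size for the demo */

struct entry { unsigned char sw_ack_vector; };

int main(void)
{
        struct entry ring[DEST_Q_SIZE] = { { 0 } };
        struct entry *first = ring, *last = ring + DEST_Q_SIZE - 1;
        struct entry *head = ring + 6;  /* pretend we stopped here last time */
        int count = 0;

        ring[6].sw_ack_vector = 0x2;    /* two pending messages that wrap */
        ring[7].sw_ack_vector = 0x1;

        while (head->sw_ack_vector) {
                count++;
                printf("msg_slot %ld, sw_ack_slot %d\n",
                       (long)(head - first),
                       ffs(head->sw_ack_vector) - 1);
                head->sw_ack_vector = 0;        /* demo only; hw acks for real */
                head++;
                if (head > last)
                        head = first;           /* wrap to the ring's start */
        }
        printf("%d message(s) in this interrupt\n", count);
        return 0;
}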
451 | /* | 836 | /* |
452 | * uv_enable_timeouts | 837 | * uv_enable_timeouts |
453 | * | 838 | * |
454 | * Each target blade (i.e. blades that have cpu's) needs to have | 839 | * Each target uvhub (i.e. a uvhub that has cpu's) needs to have |
455 | * shootdown message timeouts enabled. The timeout does not cause | 840 | * shootdown message timeouts enabled. The timeout does not cause |
456 | * an interrupt, but causes an error message to be returned to | 841 | * an interrupt, but causes an error message to be returned to |
457 | * the sender. | 842 | * the sender. |
458 | */ | 843 | */ |
459 | static void uv_enable_timeouts(void) | 844 | static void uv_enable_timeouts(void) |
460 | { | 845 | { |
461 | int blade; | 846 | int uvhub; |
462 | int nblades; | 847 | int nuvhubs; |
463 | int pnode; | 848 | int pnode; |
464 | unsigned long mmr_image; | 849 | unsigned long mmr_image; |
465 | 850 | ||
466 | nblades = uv_num_possible_blades(); | 851 | nuvhubs = uv_num_possible_blades(); |
467 | 852 | ||
468 | for (blade = 0; blade < nblades; blade++) { | 853 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) { |
469 | if (!uv_blade_nr_possible_cpus(blade)) | 854 | if (!uv_blade_nr_possible_cpus(uvhub)) |
470 | continue; | 855 | continue; |
471 | 856 | ||
472 | pnode = uv_blade_to_pnode(blade); | 857 | pnode = uv_blade_to_pnode(uvhub); |
473 | mmr_image = | 858 | mmr_image = |
474 | uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); | 859 | uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); |
475 | /* | 860 | /* |
@@ -479,16 +864,16 @@ static void uv_enable_timeouts(void) | |||
479 | * To program the period, the SOFT_ACK_MODE must be off. | 864 | * To program the period, the SOFT_ACK_MODE must be off. |
480 | */ | 865 | */ |
481 | mmr_image &= ~((unsigned long)1 << | 866 | mmr_image &= ~((unsigned long)1 << |
482 | UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); | 867 | UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); |
483 | uv_write_global_mmr64 | 868 | uv_write_global_mmr64 |
484 | (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); | 869 | (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); |
485 | /* | 870 | /* |
486 | * Set the 4-bit period. | 871 | * Set the 4-bit period. |
487 | */ | 872 | */ |
488 | mmr_image &= ~((unsigned long)0xf << | 873 | mmr_image &= ~((unsigned long)0xf << |
489 | UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); | 874 | UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); |
490 | mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << | 875 | mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << |
491 | UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); | 876 | UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); |
492 | uv_write_global_mmr64 | 877 | uv_write_global_mmr64 |
493 | (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); | 878 | (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); |
494 | /* | 879 | /* |
@@ -497,7 +882,7 @@ static void uv_enable_timeouts(void) | |||
497 | * indicated in bits 2:0 (7 causes all of them to timeout). | 882 | * indicated in bits 2:0 (7 causes all of them to timeout). |
498 | */ | 883 | */ |
499 | mmr_image |= ((unsigned long)1 << | 884 | mmr_image |= ((unsigned long)1 << |
500 | UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); | 885 | UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); |
501 | uv_write_global_mmr64 | 886 | uv_write_global_mmr64 |
502 | (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); | 887 | (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); |
503 | } | 888 | } |
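
Programming the timeouts is a sequence of read-modify-write cycles on UVH_LB_BAU_MISC_CONTROL: clear SOFT_ACK_MODE, program the 4-bit timeout period while the mode bit is off, then set the mode bit again, writing the register back between steps. The field manipulation reduces to ordinary mask-and-shift arithmetic; a standalone sketch with hypothetical bit positions (the sketch compresses the intermediate MMR writes into one image):

#include <stdio.h>

/* hypothetical bit positions; the real ones come from uv_mmrs.h */
#define SOFT_ACK_MODE_SHFT      15
#define TIMEOUT_PERIOD_SHFT     16
#define TIMEOUT_PERIOD          0xa     /* 4-bit period value */

int main(void)
{
        unsigned long mmr = 0x8badf00d; /* pretend value read from the MMR */

        mmr &= ~(1UL << SOFT_ACK_MODE_SHFT);            /* mode off */
        mmr &= ~(0xfUL << TIMEOUT_PERIOD_SHFT);         /* clear old period */
        mmr |= (unsigned long)TIMEOUT_PERIOD << TIMEOUT_PERIOD_SHFT;
        mmr |= 1UL << SOFT_ACK_MODE_SHFT;               /* mode back on */

        printf("mmr image: %#lx\n", mmr);
        return 0;
}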
@@ -522,9 +907,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) | |||
522 | { | 907 | { |
523 | } | 908 | } |
524 | 909 | ||
910 | static inline unsigned long long | ||
911 | millisec_2_cycles(unsigned long millisec) | ||
912 | { | ||
913 | unsigned long ns; | ||
914 | unsigned long long cyc; | ||
915 | |||
916 | ns = millisec * 1000000; | ||
917 | cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); | ||
918 | return cyc; | ||
919 | } | ||
920 | |||
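
millisec_2_cycles() inverts the kernel's cycles-to-nanoseconds scaling: since ns = cycles * cyc2ns >> CYC2NS_SCALE_FACTOR, the cycle count for an interval is cycles = (ns << CYC2NS_SCALE_FACTOR) / cyc2ns. A standalone sketch of the arithmetic, assuming CYC2NS_SCALE_FACTOR is 10 and an assumed 2.5 GHz TSC:

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10  /* assumed to match the x86 tsc code */

int main(void)
{
        unsigned long cpu_khz = 2500000;        /* assumed 2.5 GHz TSC */
        /* per-cpu cyc2ns scale: ns = cycles * cyc2ns >> CYC2NS_SCALE_FACTOR */
        unsigned long cyc2ns = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;
        unsigned long ns = 3 * 1000000UL;       /* 3 milliseconds in ns */
        unsigned long long cyc =
                ((unsigned long long)ns << CYC2NS_SCALE_FACTOR) / cyc2ns;

        /* ~7.5M cycles, i.e. 3 ms at 2.5 GHz */
        printf("cyc2ns scale %lu, 3 ms = %llu cycles\n", cyc2ns, cyc);
        return 0;
}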
525 | /* | 921 | /* |
526 | * Display the statistics thru /proc | 922 | * Display the statistics thru /proc. |
527 | * data points to the cpu number | 923 | * 'data' points to the cpu number |
528 | */ | 924 | */ |
529 | static int uv_ptc_seq_show(struct seq_file *file, void *data) | 925 | static int uv_ptc_seq_show(struct seq_file *file, void *data) |
530 | { | 926 | { |
@@ -535,78 +931,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
535 | 931 | ||
536 | if (!cpu) { | 932 | if (!cpu) { |
537 | seq_printf(file, | 933 | seq_printf(file, |
538 | "# cpu requestor requestee one all sretry dretry ptc_i "); | 934 | "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); |
539 | seq_printf(file, | 935 | seq_printf(file, |
540 | "sw_ack sflush dflush sok dnomsg dmult starget\n"); | 936 | "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); |
937 | seq_printf(file, | ||
938 | "retries rok resetp resett giveup sto bz throt "); | ||
939 | seq_printf(file, | ||
940 | "sw_ack recv rtime all "); | ||
941 | seq_printf(file, | ||
942 | "one mult none retry canc nocan reset rcan\n"); | ||
541 | } | 943 | } |
542 | if (cpu < num_possible_cpus() && cpu_online(cpu)) { | 944 | if (cpu < num_possible_cpus() && cpu_online(cpu)) { |
543 | stat = &per_cpu(ptcstats, cpu); | 945 | stat = &per_cpu(ptcstats, cpu); |
544 | seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", | 946 | /* source side statistics */ |
545 | cpu, stat->requestor, | 947 | seq_printf(file, |
546 | stat->requestee, stat->onetlb, stat->alltlb, | 948 | "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", |
547 | stat->s_retry, stat->d_retry, stat->ptc_i); | 949 | cpu, stat->s_requestor, cycles_2_us(stat->s_time), |
548 | seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", | 950 | stat->s_ntarguvhub, stat->s_ntarguvhub16, |
951 | stat->s_ntarguvhub8, stat->s_ntarguvhub4, | ||
952 | stat->s_ntarguvhub2, stat->s_ntarguvhub1, | ||
953 | stat->s_ntargcpu, stat->s_dtimeout); | ||
954 | seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", | ||
955 | stat->s_retry_messages, stat->s_retriesok, | ||
956 | stat->s_resets_plug, stat->s_resets_timeout, | ||
957 | stat->s_giveup, stat->s_stimeout, | ||
958 | stat->s_busy, stat->s_throttles); | ||
959 | /* destination side statistics */ | ||
960 | seq_printf(file, | ||
961 | "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", | ||
549 | uv_read_global_mmr64(uv_cpu_to_pnode(cpu), | 962 | uv_read_global_mmr64(uv_cpu_to_pnode(cpu), |
550 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), | 963 | UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), |
551 | stat->sflush, stat->dflush, | 964 | stat->d_requestee, cycles_2_us(stat->d_time), |
552 | stat->retriesok, stat->nomsg, | 965 | stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, |
553 | stat->multmsg, stat->ntargeted); | 966 | stat->d_nomsg, stat->d_retries, stat->d_canceled, |
967 | stat->d_nocanceled, stat->d_resets, | ||
968 | stat->d_rcanceled); | ||
554 | } | 969 | } |
555 | 970 | ||
556 | return 0; | 971 | return 0; |
557 | } | 972 | } |
558 | 973 | ||
559 | /* | 974 | /* |
975 | * -1: reset the statistics | ||
560 | * 0: display meaning of the statistics | 976 | * 0: display meaning of the statistics |
561 | * >0: retry limit | 977 | * >0: maximum concurrent active descriptors per uvhub (throttle) |
562 | */ | 978 | */ |
563 | static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, | 979 | static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, |
564 | size_t count, loff_t *data) | 980 | size_t count, loff_t *data) |
565 | { | 981 | { |
566 | long newmode; | 982 | int cpu; |
983 | long input_arg; | ||
567 | char optstr[64]; | 984 | char optstr[64]; |
985 | struct ptc_stats *stat; | ||
986 | struct bau_control *bcp; | ||
568 | 987 | ||
569 | if (count == 0 || count > sizeof(optstr)) | 988 | if (count == 0 || count > sizeof(optstr)) |
570 | return -EINVAL; | 989 | return -EINVAL; |
571 | if (copy_from_user(optstr, user, count)) | 990 | if (copy_from_user(optstr, user, count)) |
572 | return -EFAULT; | 991 | return -EFAULT; |
573 | optstr[count - 1] = '\0'; | 992 | optstr[count - 1] = '\0'; |
574 | if (strict_strtoul(optstr, 10, &newmode) < 0) { | 993 | if (strict_strtol(optstr, 10, &input_arg) < 0) { |
575 | printk(KERN_DEBUG "%s is invalid\n", optstr); | 994 | printk(KERN_DEBUG "%s is invalid\n", optstr); |
576 | return -EINVAL; | 995 | return -EINVAL; |
577 | } | 996 | } |
578 | 997 | ||
579 | if (newmode == 0) { | 998 | if (input_arg == 0) { |
580 | printk(KERN_DEBUG "# cpu: cpu number\n"); | 999 | printk(KERN_DEBUG "# cpu: cpu number\n"); |
1000 | printk(KERN_DEBUG "Sender statistics:\n"); | ||
1001 | printk(KERN_DEBUG | ||
1002 | "sent: number of shootdown messages sent\n"); | ||
1003 | printk(KERN_DEBUG | ||
1004 | "stime: time spent sending messages\n"); | ||
1005 | printk(KERN_DEBUG | ||
1006 | "numuvhubs: number of hubs targeted with shootdown\n"); | ||
1007 | printk(KERN_DEBUG | ||
1008 | "numuvhubs16: number times 16 or more hubs targeted\n"); | ||
1009 | printk(KERN_DEBUG | ||
1010 | "numuvhubs8: number times 8 or more hubs targeted\n"); | ||
1011 | printk(KERN_DEBUG | ||
1012 | "numuvhubs4: number times 4 or more hubs targeted\n"); | ||
1013 | printk(KERN_DEBUG | ||
1014 | "numuvhubs2: number times 2 or more hubs targeted\n"); | ||
1015 | printk(KERN_DEBUG | ||
1016 | "numuvhubs1: number times 1 hub targeted\n"); | ||
1017 | printk(KERN_DEBUG | ||
1018 | "numcpus: number of cpus targeted with shootdown\n"); | ||
1019 | printk(KERN_DEBUG | ||
1020 | "dto: number of destination timeouts\n"); | ||
1021 | printk(KERN_DEBUG | ||
1022 | "retries: destination timeout retries sent\n"); | ||
1023 | printk(KERN_DEBUG | ||
1024 | "rok: : destination timeouts successfully retried\n"); | ||
1025 | printk(KERN_DEBUG | ||
1026 | "resetp: ipi-style resource resets for plugs\n"); | ||
1027 | printk(KERN_DEBUG | ||
1028 | "resett: ipi-style resource resets for timeouts\n"); | ||
1029 | printk(KERN_DEBUG | ||
1030 | "giveup: fall-backs to ipi-style shootdowns\n"); | ||
1031 | printk(KERN_DEBUG | ||
1032 | "sto: number of source timeouts\n"); | ||
1033 | printk(KERN_DEBUG | ||
1034 | "bz: number of stay-busy's\n"); | ||
1035 | printk(KERN_DEBUG | ||
1036 | "throt: number times spun in throttle\n"); | ||
1037 | printk(KERN_DEBUG "Destination side statistics:\n"); | ||
581 | printk(KERN_DEBUG | 1038 | printk(KERN_DEBUG |
582 | "requestor: times this cpu was the flush requestor\n"); | 1039 | "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); |
583 | printk(KERN_DEBUG | 1040 | printk(KERN_DEBUG |
584 | "requestee: times this cpu was requested to flush its TLBs\n"); | 1041 | "recv: shootdown messages received\n"); |
585 | printk(KERN_DEBUG | 1042 | printk(KERN_DEBUG |
586 | "one: times requested to flush a single address\n"); | 1043 | "rtime: time spent processing messages\n"); |
587 | printk(KERN_DEBUG | 1044 | printk(KERN_DEBUG |
588 | "all: times requested to flush all TLB's\n"); | 1045 | "all: shootdown all-tlb messages\n"); |
589 | printk(KERN_DEBUG | 1046 | printk(KERN_DEBUG |
590 | "sretry: number of retries of source-side timeouts\n"); | 1047 | "one: shootdown one-tlb messages\n"); |
591 | printk(KERN_DEBUG | 1048 | printk(KERN_DEBUG |
592 | "dretry: number of retries of destination-side timeouts\n"); | 1049 | "mult: interrupts that found multiple messages\n"); |
593 | printk(KERN_DEBUG | 1050 | printk(KERN_DEBUG |
594 | "ptc_i: times UV fell through to IPI-style flushes\n"); | 1051 | "none: interrupts that found no messages\n"); |
595 | printk(KERN_DEBUG | 1052 | printk(KERN_DEBUG |
596 | "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); | 1053 | "retry: number of retry messages processed\n"); |
597 | printk(KERN_DEBUG | 1054 | printk(KERN_DEBUG |
598 | "sflush_us: cycles spent in uv_flush_tlb_others()\n"); | 1055 | "canc: number messages canceled by retries\n"); |
599 | printk(KERN_DEBUG | 1056 | printk(KERN_DEBUG |
600 | "dflush_us: cycles spent in handling flush requests\n"); | 1057 | "nocan: number retries that found nothing to cancel\n"); |
601 | printk(KERN_DEBUG "sok: successes on retry\n"); | ||
602 | printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); | ||
603 | printk(KERN_DEBUG | 1058 | printk(KERN_DEBUG |
604 | "dmult: interrupts with multiple messages\n"); | 1059 | "reset: number of ipi-style reset requests processed\n"); |
605 | printk(KERN_DEBUG "starget: nodes targeted\n"); | 1060 | printk(KERN_DEBUG |
1061 | "rcan: number messages canceled by reset requests\n"); | ||
1062 | } else if (input_arg == -1) { | ||
1063 | for_each_present_cpu(cpu) { | ||
1064 | stat = &per_cpu(ptcstats, cpu); | ||
1065 | memset(stat, 0, sizeof(struct ptc_stats)); | ||
1066 | } | ||
606 | } else { | 1067 | } else { |
607 | uv_bau_retry_limit = newmode; | 1068 | uv_bau_max_concurrent = input_arg; |
608 | printk(KERN_DEBUG "timeout retry limit:%d\n", | 1069 | bcp = &per_cpu(bau_control, smp_processor_id()); |
609 | uv_bau_retry_limit); | 1070 | if (uv_bau_max_concurrent < 1 || |
1071 | uv_bau_max_concurrent > bcp->cpus_in_uvhub) { | ||
1072 | printk(KERN_DEBUG | ||
1073 | "Error: BAU max concurrent %d; %d is invalid\n", | ||
1074 | bcp->max_concurrent, uv_bau_max_concurrent); | ||
1075 | return -EINVAL; | ||
1076 | } | ||
1077 | printk(KERN_DEBUG "Set BAU max concurrent:%d\n", | ||
1078 | uv_bau_max_concurrent); | ||
1079 | for_each_present_cpu(cpu) { | ||
1080 | bcp = &per_cpu(bau_control, cpu); | ||
1081 | bcp->max_concurrent = uv_bau_max_concurrent; | ||
1082 | } | ||
610 | } | 1083 | } |
611 | 1084 | ||
612 | return count; | 1085 | return count; |
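
The write side of the proc interface thus accepts three kinds of input: -1 clears every cpu's statistics, 0 prints the legend to the kernel log, and a positive value becomes the new per-uvhub concurrency throttle (validated against cpus_in_uvhub). A minimal userspace sketch; the path is an assumption based on the names used elsewhere in this driver, so check uv_ptc_init() for the real file name:

#include <stdio.h>

int main(void)
{
        /* assumed path; see uv_ptc_init() for the actual proc entry */
        FILE *f = fopen("/proc/sgi_uv/ptc_statistics", "w");

        if (!f) {
                perror("open");
                return 1;
        }
        fputs("-1\n", f);       /* reset all per-cpu BAU statistics */
        fclose(f);
        return 0;
}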
@@ -650,79 +1123,30 @@ static int __init uv_ptc_init(void) | |||
650 | } | 1123 | } |
651 | 1124 | ||
652 | /* | 1125 | /* |
653 | * begin the initialization of the per-blade control structures | ||
654 | */ | ||
655 | static struct bau_control * __init uv_table_bases_init(int blade, int node) | ||
656 | { | ||
657 | int i; | ||
658 | struct bau_msg_status *msp; | ||
659 | struct bau_control *bau_tabp; | ||
660 | |||
661 | bau_tabp = | ||
662 | kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); | ||
663 | BUG_ON(!bau_tabp); | ||
664 | |||
665 | bau_tabp->msg_statuses = | ||
666 | kmalloc_node(sizeof(struct bau_msg_status) * | ||
667 | DEST_Q_SIZE, GFP_KERNEL, node); | ||
668 | BUG_ON(!bau_tabp->msg_statuses); | ||
669 | |||
670 | for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) | ||
671 | bau_cpubits_clear(&msp->seen_by, (int) | ||
672 | uv_blade_nr_possible_cpus(blade)); | ||
673 | |||
674 | uv_bau_table_bases[blade] = bau_tabp; | ||
675 | |||
676 | return bau_tabp; | ||
677 | } | ||
678 | |||
679 | /* | ||
680 | * finish the initialization of the per-blade control structures | ||
681 | */ | ||
682 | static void __init | ||
683 | uv_table_bases_finish(int blade, | ||
684 | struct bau_control *bau_tablesp, | ||
685 | struct bau_desc *adp) | ||
686 | { | ||
687 | struct bau_control *bcp; | ||
688 | int cpu; | ||
689 | |||
690 | for_each_present_cpu(cpu) { | ||
691 | if (blade != uv_cpu_to_blade_id(cpu)) | ||
692 | continue; | ||
693 | |||
694 | bcp = (struct bau_control *)&per_cpu(bau_control, cpu); | ||
695 | bcp->bau_msg_head = bau_tablesp->va_queue_first; | ||
696 | bcp->va_queue_first = bau_tablesp->va_queue_first; | ||
697 | bcp->va_queue_last = bau_tablesp->va_queue_last; | ||
698 | bcp->msg_statuses = bau_tablesp->msg_statuses; | ||
699 | bcp->descriptor_base = adp; | ||
700 | } | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * initialize the sending side's sending buffers | 1126 | * initialize the sending side's sending buffers |
705 | */ | 1127 | */ |
706 | static struct bau_desc * __init | 1128 | static void |
707 | uv_activation_descriptor_init(int node, int pnode) | 1129 | uv_activation_descriptor_init(int node, int pnode) |
708 | { | 1130 | { |
709 | int i; | 1131 | int i; |
1132 | int cpu; | ||
710 | unsigned long pa; | 1133 | unsigned long pa; |
711 | unsigned long m; | 1134 | unsigned long m; |
712 | unsigned long n; | 1135 | unsigned long n; |
713 | struct bau_desc *adp; | 1136 | struct bau_desc *bau_desc; |
714 | struct bau_desc *ad2; | 1137 | struct bau_desc *bd2; |
1138 | struct bau_control *bcp; | ||
715 | 1139 | ||
716 | /* | 1140 | /* |
717 | * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) | 1141 | * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) |
718 | * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade | 1142 | * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub |
719 | */ | 1143 | */ |
720 | adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* | 1144 | bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* |
721 | UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); | 1145 | UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); |
722 | BUG_ON(!adp); | 1146 | BUG_ON(!bau_desc); |
723 | 1147 | ||
724 | pa = uv_gpa(adp); /* need the real nasid*/ | 1148 | pa = uv_gpa(bau_desc); /* need the real nasid*/ |
725 | n = uv_gpa_to_pnode(pa); | 1149 | n = pa >> uv_nshift; |
726 | m = pa & uv_mmask; | 1150 | m = pa & uv_mmask; |
727 | 1151 | ||
728 | uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, | 1152 | uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, |
@@ -731,96 +1155,188 @@ uv_activation_descriptor_init(int node, int pnode) | |||
731 | /* | 1155 | /* |
732 | * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each | 1156 | * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each |
733 | * cpu even though we only use the first one; one descriptor can | 1157 | * cpu even though we only use the first one; one descriptor can |
734 | * describe a broadcast to 256 nodes. | 1158 | * describe a broadcast to 256 uv hubs. |
735 | */ | 1159 | */ |
736 | for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); | 1160 | for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); |
737 | i++, ad2++) { | 1161 | i++, bd2++) { |
738 | memset(ad2, 0, sizeof(struct bau_desc)); | 1162 | memset(bd2, 0, sizeof(struct bau_desc)); |
739 | ad2->header.sw_ack_flag = 1; | 1163 | bd2->header.sw_ack_flag = 1; |
740 | /* | 1164 | /* |
741 | * base_dest_nodeid is the first node in the partition, so | 1165 | * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub |
742 | * the bit map will indicate partition-relative node numbers. | 1166 | * in the partition. The bit map will indicate uvhub numbers, |
743 | * note that base_dest_nodeid is actually a nasid. | 1167 | * which are 0-N in a partition. Pnodes are unique system-wide. |
744 | */ | 1168 | */ |
745 | ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; | 1169 | bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; |
746 | ad2->header.dest_subnodeid = 0x10; /* the LB */ | 1170 | bd2->header.dest_subnodeid = 0x10; /* the LB */ |
747 | ad2->header.command = UV_NET_ENDPOINT_INTD; | 1171 | bd2->header.command = UV_NET_ENDPOINT_INTD; |
748 | ad2->header.int_both = 1; | 1172 | bd2->header.int_both = 1; |
749 | /* | 1173 | /* |
750 | * all others need to be set to zero: | 1174 | * all others need to be set to zero: |
751 | * fairness chaining multilevel count replied_to | 1175 | * fairness chaining multilevel count replied_to |
752 | */ | 1176 | */ |
753 | } | 1177 | } |
754 | return adp; | 1178 | for_each_present_cpu(cpu) { |
1179 | if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) | ||
1180 | continue; | ||
1181 | bcp = &per_cpu(bau_control, cpu); | ||
1182 | bcp->descriptor_base = bau_desc; | ||
1183 | } | ||
755 | } | 1184 | } |
756 | 1185 | ||
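
The descriptor base handed to the hardware is a global physical address split into a node number and a node-local offset: with uv_nshift taken from m_val (the number of node-offset bits), n = pa >> uv_nshift and m = pa & uv_mmask. A standalone model of the split, using a hypothetical m_val of 37:

#include <stdio.h>

int main(void)
{
        int m_val = 37;                         /* assumed node-offset width */
        unsigned long long uv_mmask = (1ULL << m_val) - 1;
        unsigned long long pa = (3ULL << m_val) | 0x12340; /* pnode 3 */

        printf("pnode %llu, offset %#llx\n", pa >> m_val, pa & uv_mmask);
        return 0;
}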
757 | /* | 1186 | /* |
758 | * initialize the destination side's receiving buffers | 1187 | * initialize the destination side's receiving buffers |
1188 | * entered for each uvhub in the partition | ||
1189 | * - node is first node (kernel memory notion) on the uvhub | ||
1190 | * - pnode is the uvhub's physical identifier | ||
759 | */ | 1191 | */ |
760 | static struct bau_payload_queue_entry * __init | 1192 | static void |
761 | uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) | 1193 | uv_payload_queue_init(int node, int pnode) |
762 | { | 1194 | { |
763 | struct bau_payload_queue_entry *pqp; | ||
764 | unsigned long pa; | ||
765 | int pn; | 1195 | int pn; |
1196 | int cpu; | ||
766 | char *cp; | 1197 | char *cp; |
1198 | unsigned long pa; | ||
1199 | struct bau_payload_queue_entry *pqp; | ||
1200 | struct bau_payload_queue_entry *pqp_malloc; | ||
1201 | struct bau_control *bcp; | ||
767 | 1202 | ||
768 | pqp = (struct bau_payload_queue_entry *) kmalloc_node( | 1203 | pqp = (struct bau_payload_queue_entry *) kmalloc_node( |
769 | (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), | 1204 | (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), |
770 | GFP_KERNEL, node); | 1205 | GFP_KERNEL, node); |
771 | BUG_ON(!pqp); | 1206 | BUG_ON(!pqp); |
1207 | pqp_malloc = pqp; | ||
772 | 1208 | ||
773 | cp = (char *)pqp + 31; | 1209 | cp = (char *)pqp + 31; |
774 | pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); | 1210 | pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); |
775 | bau_tablesp->va_queue_first = pqp; | 1211 | |
1212 | for_each_present_cpu(cpu) { | ||
1213 | if (pnode != uv_cpu_to_pnode(cpu)) | ||
1214 | continue; | ||
1215 | /* for every cpu on this pnode: */ | ||
1216 | bcp = &per_cpu(bau_control, cpu); | ||
1217 | bcp->va_queue_first = pqp; | ||
1218 | bcp->bau_msg_head = pqp; | ||
1219 | bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); | ||
1220 | } | ||
776 | /* | 1221 | /* |
777 | * need the pnode of where the memory was really allocated | 1222 | * need the pnode of where the memory was really allocated |
778 | */ | 1223 | */ |
779 | pa = uv_gpa(pqp); | 1224 | pa = uv_gpa(pqp); |
780 | pn = uv_gpa_to_pnode(pa); | 1225 | pn = pa >> uv_nshift; |
781 | uv_write_global_mmr64(pnode, | 1226 | uv_write_global_mmr64(pnode, |
782 | UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, | 1227 | UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, |
783 | ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | | 1228 | ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | |
784 | uv_physnodeaddr(pqp)); | 1229 | uv_physnodeaddr(pqp)); |
785 | uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, | 1230 | uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, |
786 | uv_physnodeaddr(pqp)); | 1231 | uv_physnodeaddr(pqp)); |
787 | bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1); | ||
788 | uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, | 1232 | uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, |
789 | (unsigned long) | 1233 | (unsigned long) |
790 | uv_physnodeaddr(bau_tablesp->va_queue_last)); | 1234 | uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); |
1235 | /* in effect, all msg_type's are set to MSG_NOOP */ | ||
791 | memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); | 1236 | memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); |
792 | |||
793 | return pqp; | ||
794 | } | 1237 | } |
795 | 1238 | ||
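
The payload queue must be 32-byte aligned for the hardware, so the code above over-allocates by one entry and rounds the pointer up: cp = (char *)pqp + 31 followed by (((unsigned long)cp >> 5) << 5) clears the low five bits of the bumped address. The same trick in isolation:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* over-allocate so the aligned start still leaves room for the queue */
        size_t entry = 32, nr = 8;
        char *raw = malloc((nr + 1) * entry);
        char *cp, *aligned;

        if (!raw)
                return 1;
        cp = raw + 31;
        aligned = (char *)(((unsigned long)cp >> 5) << 5);

        printf("raw %p -> aligned %p (mod 32 = %lu)\n",
               (void *)raw, (void *)aligned, (unsigned long)aligned % 32);
        free(raw);
        return 0;
}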
796 | /* | 1239 | /* |
797 | * Initialization of each UV blade's structures | 1240 | * Initialization of each UV hub's structures |
798 | */ | 1241 | */ |
799 | static int __init uv_init_blade(int blade) | 1242 | static void __init uv_init_uvhub(int uvhub, int vector) |
800 | { | 1243 | { |
801 | int node; | 1244 | int node; |
802 | int pnode; | 1245 | int pnode; |
803 | unsigned long pa; | ||
804 | unsigned long apicid; | 1246 | unsigned long apicid; |
805 | struct bau_desc *adp; | 1247 | |
806 | struct bau_payload_queue_entry *pqp; | 1248 | node = uvhub_to_first_node(uvhub); |
807 | struct bau_control *bau_tablesp; | 1249 | pnode = uv_blade_to_pnode(uvhub); |
808 | 1250 | uv_activation_descriptor_init(node, pnode); | |
809 | node = blade_to_first_node(blade); | 1251 | uv_payload_queue_init(node, pnode); |
810 | bau_tablesp = uv_table_bases_init(blade, node); | ||
811 | pnode = uv_blade_to_pnode(blade); | ||
812 | adp = uv_activation_descriptor_init(node, pnode); | ||
813 | pqp = uv_payload_queue_init(node, pnode, bau_tablesp); | ||
814 | uv_table_bases_finish(blade, bau_tablesp, adp); | ||
815 | /* | 1252 | /* |
816 | * the below initialization can't be in firmware because the | 1253 | * the below initialization can't be in firmware because the |
817 | * messaging IRQ will be determined by the OS | 1254 | * messaging IRQ will be determined by the OS |
818 | */ | 1255 | */ |
819 | apicid = blade_to_first_apicid(blade); | 1256 | apicid = uvhub_to_first_apicid(uvhub); |
820 | pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); | ||
821 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, | 1257 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, |
822 | ((apicid << 32) | UV_BAU_MESSAGE)); | 1258 | ((apicid << 32) | vector)); |
823 | return 0; | 1259 | } |
1260 | |||
1261 | /* | ||
1262 | * initialize the bau_control structure for each cpu | ||
1263 | */ | ||
1264 | static void uv_init_per_cpu(int nuvhubs) | ||
1265 | { | ||
1266 | int i, j, k; | ||
1267 | int cpu; | ||
1268 | int pnode; | ||
1269 | int uvhub; | ||
1270 | short socket = 0; | ||
1271 | struct bau_control *bcp; | ||
1272 | struct uvhub_desc *bdp; | ||
1273 | struct socket_desc *sdp; | ||
1274 | struct bau_control *hmaster = NULL; | ||
1275 | struct bau_control *smaster = NULL; | ||
1276 | struct socket_desc { | ||
1277 | short num_cpus; | ||
1278 | short cpu_number[16]; | ||
1279 | }; | ||
1280 | struct uvhub_desc { | ||
1281 | short num_sockets; | ||
1282 | short num_cpus; | ||
1283 | short uvhub; | ||
1284 | short pnode; | ||
1285 | struct socket_desc socket[2]; | ||
1286 | }; | ||
1287 | struct uvhub_desc *uvhub_descs; | ||
1288 | |||
1289 | uvhub_descs = (struct uvhub_desc *) | ||
1290 | kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); | ||
1291 | memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); | ||
1292 | for_each_present_cpu(cpu) { | ||
1293 | bcp = &per_cpu(bau_control, cpu); | ||
1294 | memset(bcp, 0, sizeof(struct bau_control)); | ||
1295 | spin_lock_init(&bcp->masks_lock); | ||
1296 | bcp->max_concurrent = uv_bau_max_concurrent; | ||
1297 | pnode = uv_cpu_hub_info(cpu)->pnode; | ||
1298 | uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; | ||
1299 | bdp = &uvhub_descs[uvhub]; | ||
1300 | bdp->num_cpus++; | ||
1301 | bdp->uvhub = uvhub; | ||
1302 | bdp->pnode = pnode; | ||
1303 | /* time interval to catch a hardware stay-busy bug */ | ||
1304 | bcp->timeout_interval = millisec_2_cycles(3); | ||
1305 | /* kludge: assume uv_hub.h is constant */ | ||
1306 | socket = (cpu_physical_id(cpu)>>5)&1; | ||
1307 | if (socket >= bdp->num_sockets) | ||
1308 | bdp->num_sockets = socket+1; | ||
1309 | sdp = &bdp->socket[socket]; | ||
1310 | sdp->cpu_number[sdp->num_cpus] = cpu; | ||
1311 | sdp->num_cpus++; | ||
1312 | } | ||
1313 | socket = 0; | ||
1314 | for_each_possible_blade(uvhub) { | ||
1315 | bdp = &uvhub_descs[uvhub]; | ||
1316 | for (i = 0; i < bdp->num_sockets; i++) { | ||
1317 | sdp = &bdp->socket[i]; | ||
1318 | for (j = 0; j < sdp->num_cpus; j++) { | ||
1319 | cpu = sdp->cpu_number[j]; | ||
1320 | bcp = &per_cpu(bau_control, cpu); | ||
1321 | bcp->cpu = cpu; | ||
1322 | if (j == 0) { | ||
1323 | smaster = bcp; | ||
1324 | if (i == 0) | ||
1325 | hmaster = bcp; | ||
1326 | } | ||
1327 | bcp->cpus_in_uvhub = bdp->num_cpus; | ||
1328 | bcp->cpus_in_socket = sdp->num_cpus; | ||
1329 | bcp->socket_master = smaster; | ||
1330 | bcp->uvhub_master = hmaster; | ||
1331 | for (k = 0; k < DEST_Q_SIZE; k++) | ||
1332 | bcp->socket_acknowledge_count[k] = 0; | ||
1333 | bcp->uvhub_cpu = | ||
1334 | uv_cpu_hub_info(cpu)->blade_processor_id; | ||
1335 | } | ||
1336 | socket++; | ||
1337 | } | ||
1338 | } | ||
1339 | kfree(uvhub_descs); | ||
824 | } | 1340 | } |
825 | 1341 | ||
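
uv_init_per_cpu() builds a two-level hierarchy from the flat cpu list: cpus are grouped by uvhub and, within a hub, by socket, and the first cpu seen on each socket or hub becomes the socket master or hub master used for locking and resets. The socket index comes from bit 5 of the physical APIC id, the "kludge" the comment admits to. That derivation in isolation, with hypothetical APIC ids:

#include <stdio.h>

int main(void)
{
        /* hypothetical physical APIC ids on one uvhub */
        int physid[] = { 0, 1, 2, 3, 32, 33, 34, 35 };
        unsigned int i;

        for (i = 0; i < sizeof(physid) / sizeof(physid[0]); i++)
                printf("apicid %2d -> socket %d\n",
                       physid[i], (physid[i] >> 5) & 1);
        return 0;
}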
826 | /* | 1342 | /* |
@@ -828,38 +1344,54 @@ static int __init uv_init_blade(int blade) | |||
828 | */ | 1344 | */ |
829 | static int __init uv_bau_init(void) | 1345 | static int __init uv_bau_init(void) |
830 | { | 1346 | { |
831 | int blade; | 1347 | int uvhub; |
832 | int nblades; | 1348 | int pnode; |
1349 | int nuvhubs; | ||
833 | int cur_cpu; | 1350 | int cur_cpu; |
1351 | int vector; | ||
1352 | unsigned long mmr; | ||
834 | 1353 | ||
835 | if (!is_uv_system()) | 1354 | if (!is_uv_system()) |
836 | return 0; | 1355 | return 0; |
837 | 1356 | ||
1357 | if (nobau) | ||
1358 | return 0; | ||
1359 | |||
838 | for_each_possible_cpu(cur_cpu) | 1360 | for_each_possible_cpu(cur_cpu) |
839 | zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), | 1361 | zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), |
840 | GFP_KERNEL, cpu_to_node(cur_cpu)); | 1362 | GFP_KERNEL, cpu_to_node(cur_cpu)); |
841 | 1363 | ||
842 | uv_bau_retry_limit = 1; | 1364 | uv_bau_max_concurrent = MAX_BAU_CONCURRENT; |
1365 | uv_nshift = uv_hub_info->m_val; | ||
843 | uv_mmask = (1UL << uv_hub_info->m_val) - 1; | 1366 | uv_mmask = (1UL << uv_hub_info->m_val) - 1; |
844 | nblades = uv_num_possible_blades(); | 1367 | nuvhubs = uv_num_possible_blades(); |
845 | 1368 | ||
846 | uv_bau_table_bases = (struct bau_control **) | 1369 | uv_init_per_cpu(nuvhubs); |
847 | kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); | ||
848 | BUG_ON(!uv_bau_table_bases); | ||
849 | 1370 | ||
850 | uv_partition_base_pnode = 0x7fffffff; | 1371 | uv_partition_base_pnode = 0x7fffffff; |
851 | for (blade = 0; blade < nblades; blade++) | 1372 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) |
852 | if (uv_blade_nr_possible_cpus(blade) && | 1373 | if (uv_blade_nr_possible_cpus(uvhub) && |
853 | (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) | 1374 | (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) |
854 | uv_partition_base_pnode = uv_blade_to_pnode(blade); | 1375 | uv_partition_base_pnode = uv_blade_to_pnode(uvhub); |
855 | for (blade = 0; blade < nblades; blade++) | 1376 | |
856 | if (uv_blade_nr_possible_cpus(blade)) | 1377 | vector = UV_BAU_MESSAGE; |
857 | uv_init_blade(blade); | 1378 | for_each_possible_blade(uvhub) |
858 | 1379 | if (uv_blade_nr_possible_cpus(uvhub)) | |
859 | alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); | 1380 | uv_init_uvhub(uvhub, vector); |
1381 | |||
860 | uv_enable_timeouts(); | 1382 | uv_enable_timeouts(); |
1383 | alloc_intr_gate(vector, uv_bau_message_intr1); | ||
1384 | |||
1385 | for_each_possible_blade(uvhub) { | ||
1386 | pnode = uv_blade_to_pnode(uvhub); | ||
1387 | /* INIT the bau */ | ||
1388 | uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, | ||
1389 | ((unsigned long)1 << 63)); | ||
1390 | mmr = 1; /* should be 1 to broadcast to both sockets */ | ||
1391 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); | ||
1392 | } | ||
861 | 1393 | ||
862 | return 0; | 1394 | return 0; |
863 | } | 1395 | } |
864 | __initcall(uv_bau_init); | 1396 | core_initcall(uv_bau_init); |
865 | __initcall(uv_ptc_init); | 1397 | core_initcall(uv_ptc_init); |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1168e4454188..60788dee0f8a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/kprobes.h> | 15 | #include <linux/kprobes.h> |
16 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
17 | #include <linux/kdebug.h> | 17 | #include <linux/kdebug.h> |
18 | #include <linux/kgdb.h> | ||
18 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
19 | #include <linux/module.h> | 20 | #include <linux/module.h> |
20 | #include <linux/ptrace.h> | 21 | #include <linux/ptrace.h> |
@@ -108,15 +109,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) | |||
108 | dec_preempt_count(); | 109 | dec_preempt_count(); |
109 | } | 110 | } |
110 | 111 | ||
111 | #ifdef CONFIG_X86_32 | ||
112 | static inline void | ||
113 | die_if_kernel(const char *str, struct pt_regs *regs, long err) | ||
114 | { | ||
115 | if (!user_mode_vm(regs)) | ||
116 | die(str, regs, err); | ||
117 | } | ||
118 | #endif | ||
119 | |||
120 | static void __kprobes | 112 | static void __kprobes |
121 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, | 113 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, |
122 | long error_code, siginfo_t *info) | 114 | long error_code, siginfo_t *info) |
@@ -400,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
400 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | 392 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) |
401 | == NOTIFY_STOP) | 393 | == NOTIFY_STOP) |
402 | return; | 394 | return; |
395 | |||
403 | #ifdef CONFIG_X86_LOCAL_APIC | 396 | #ifdef CONFIG_X86_LOCAL_APIC |
397 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
398 | == NOTIFY_STOP) | ||
399 | return; | ||
400 | |||
401 | #ifndef CONFIG_LOCKUP_DETECTOR | ||
404 | /* | 402 | /* |
405 | * Ok, so this is none of the documented NMI sources, | 403 | * Ok, so this is none of the documented NMI sources, |
406 | * so it must be the NMI watchdog. | 404 | * so it must be the NMI watchdog. |
@@ -408,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
408 | if (nmi_watchdog_tick(regs, reason)) | 406 | if (nmi_watchdog_tick(regs, reason)) |
409 | return; | 407 | return; |
410 | if (!do_nmi_callback(regs, cpu)) | 408 | if (!do_nmi_callback(regs, cpu)) |
409 | #endif /* !CONFIG_LOCKUP_DETECTOR */ | ||
411 | unknown_nmi_error(reason, regs); | 410 | unknown_nmi_error(reason, regs); |
412 | #else | 411 | #else |
413 | unknown_nmi_error(reason, regs); | 412 | unknown_nmi_error(reason, regs); |
@@ -460,6 +459,11 @@ void restart_nmi(void) | |||
460 | /* May run on IST stack. */ | 459 | /* May run on IST stack. */ |
461 | dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) | 460 | dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) |
462 | { | 461 | { |
462 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP | ||
463 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) | ||
464 | == NOTIFY_STOP) | ||
465 | return; | ||
466 | #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ | ||
463 | #ifdef CONFIG_KPROBES | 467 | #ifdef CONFIG_KPROBES |
464 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) | 468 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) |
465 | == NOTIFY_STOP) | 469 | == NOTIFY_STOP) |
@@ -529,6 +533,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | |||
529 | dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | 533 | dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) |
530 | { | 534 | { |
531 | struct task_struct *tsk = current; | 535 | struct task_struct *tsk = current; |
536 | int user_icebp = 0; | ||
532 | unsigned long dr6; | 537 | unsigned long dr6; |
533 | int si_code; | 538 | int si_code; |
534 | 539 | ||
@@ -537,17 +542,25 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
537 | /* Filter out all the reserved bits which are preset to 1 */ | 542 | /* Filter out all the reserved bits which are preset to 1 */ |
538 | dr6 &= ~DR6_RESERVED; | 543 | dr6 &= ~DR6_RESERVED; |
539 | 544 | ||
545 | /* | ||
546 | * If dr6 has no reason to give us about the origin of this trap, | ||
547 | * then it's very likely the result of an icebp/int01 trap. | ||
548 | * User wants a sigtrap for that. | ||
549 | */ | ||
550 | if (!dr6 && user_mode(regs)) | ||
551 | user_icebp = 1; | ||
552 | |||
540 | /* Catch kmemcheck conditions first of all! */ | 553 | /* Catch kmemcheck conditions first of all! */ |
541 | if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) | 554 | if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) |
542 | return; | 555 | return; |
543 | 556 | ||
544 | /* DR6 may or may not be cleared by the CPU */ | 557 | /* DR6 may or may not be cleared by the CPU */ |
545 | set_debugreg(0, 6); | 558 | set_debugreg(0, 6); |
559 | |||
546 | /* | 560 | /* |
547 | * The processor cleared BTF, so don't mark that we need it set. | 561 | * The processor cleared BTF, so don't mark that we need it set. |
548 | */ | 562 | */ |
549 | clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); | 563 | clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); |
550 | tsk->thread.debugctlmsr = 0; | ||
551 | 564 | ||
552 | /* Store the virtualized DR6 value */ | 565 | /* Store the virtualized DR6 value */ |
553 | tsk->thread.debugreg6 = dr6; | 566 | tsk->thread.debugreg6 = dr6; |
@@ -578,62 +591,74 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
578 | regs->flags &= ~X86_EFLAGS_TF; | 591 | regs->flags &= ~X86_EFLAGS_TF; |
579 | } | 592 | } |
580 | si_code = get_si_code(tsk->thread.debugreg6); | 593 | si_code = get_si_code(tsk->thread.debugreg6); |
581 | if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) | 594 | if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) |
582 | send_sigtrap(tsk, regs, error_code, si_code); | 595 | send_sigtrap(tsk, regs, error_code, si_code); |
583 | preempt_conditional_cli(regs); | 596 | preempt_conditional_cli(regs); |
584 | 597 | ||
585 | return; | 598 | return; |
586 | } | 599 | } |
587 | 600 | ||
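
The user_icebp path above means a #DB whose dr6 reason bits are all clear, taken from user mode, is treated as the undocumented icebp/int01 instruction (opcode 0xf1) and delivered as SIGTRAP. A userspace program to exercise it, which only makes sense on x86 hardware:

#include <signal.h>
#include <unistd.h>

static void trap(int sig)
{
        (void)sig;
        write(1, "SIGTRAP from icebp\n", 19);
        _exit(0);
}

int main(void)
{
        signal(SIGTRAP, trap);
        asm volatile(".byte 0xf1");     /* icebp/int01: no dr6 reason bits */
        return 1;                       /* not reached if the trap fires */
}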
588 | #ifdef CONFIG_X86_64 | ||
589 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | ||
590 | { | ||
591 | if (fixup_exception(regs)) | ||
592 | return 1; | ||
593 | |||
594 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | ||
595 | /* Illegal floating point operation in the kernel */ | ||
596 | current->thread.trap_no = trapnr; | ||
597 | die(str, regs, 0); | ||
598 | return 0; | ||
599 | } | ||
600 | #endif | ||
601 | |||
602 | /* | 601 | /* |
603 | * Note that we play around with the 'TS' bit in an attempt to get | 602 | * Note that we play around with the 'TS' bit in an attempt to get |
604 | * the correct behaviour even in the presence of the asynchronous | 603 | * the correct behaviour even in the presence of the asynchronous |
605 | * IRQ13 behaviour | 604 | * IRQ13 behaviour |
606 | */ | 605 | */ |
607 | void math_error(void __user *ip) | 606 | void math_error(struct pt_regs *regs, int error_code, int trapnr) |
608 | { | 607 | { |
609 | struct task_struct *task; | 608 | struct task_struct *task = current; |
610 | siginfo_t info; | 609 | siginfo_t info; |
611 | unsigned short cwd, swd, err; | 610 | unsigned short err; |
611 | char *str = (trapnr == 16) ? "fpu exception" : "simd exception"; | ||
612 | |||
613 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) | ||
614 | return; | ||
615 | conditional_sti(regs); | ||
616 | |||
617 | if (!user_mode_vm(regs)) | ||
618 | { | ||
619 | if (!fixup_exception(regs)) { | ||
620 | task->thread.error_code = error_code; | ||
621 | task->thread.trap_no = trapnr; | ||
622 | die(str, regs, error_code); | ||
623 | } | ||
624 | return; | ||
625 | } | ||
612 | 626 | ||
613 | /* | 627 | /* |
614 | * Save the info for the exception handler and clear the error. | 628 | * Save the info for the exception handler and clear the error. |
615 | */ | 629 | */ |
616 | task = current; | ||
617 | save_init_fpu(task); | 630 | save_init_fpu(task); |
618 | task->thread.trap_no = 16; | 631 | task->thread.trap_no = trapnr; |
619 | task->thread.error_code = 0; | 632 | task->thread.error_code = error_code; |
620 | info.si_signo = SIGFPE; | 633 | info.si_signo = SIGFPE; |
621 | info.si_errno = 0; | 634 | info.si_errno = 0; |
622 | info.si_addr = ip; | 635 | info.si_addr = (void __user *)regs->ip; |
623 | /* | 636 | if (trapnr == 16) { |
624 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | 637 | unsigned short cwd, swd; |
625 | * status. 0x3f is the exception bits in these regs, 0x200 is the | 638 | /* |
626 | * C1 reg you need in case of a stack fault, 0x040 is the stack | 639 | * (~cwd & swd) will mask out exceptions that are not set to unmasked |
627 | * fault bit. We should only be taking one exception at a time, | 640 | * status. 0x3f is the exception bits in these regs, 0x200 is the |
628 | * so if this combination doesn't produce any single exception, | 641 | * C1 reg you need in case of a stack fault, 0x040 is the stack |
629 | * then we have a bad program that isn't synchronizing its FPU usage | 642 | * fault bit. We should only be taking one exception at a time, |
630 | * and it will suffer the consequences since we won't be able to | 643 | * so if this combination doesn't produce any single exception, |
631 | * fully reproduce the context of the exception | 644 | * then we have a bad program that isn't synchronizing its FPU usage |
632 | */ | 645 | * and it will suffer the consequences since we won't be able to |
633 | cwd = get_fpu_cwd(task); | 646 | * fully reproduce the context of the exception |
634 | swd = get_fpu_swd(task); | 647 | */ |
648 | cwd = get_fpu_cwd(task); | ||
649 | swd = get_fpu_swd(task); | ||
635 | 650 | ||
636 | err = swd & ~cwd; | 651 | err = swd & ~cwd; |
652 | } else { | ||
653 | /* | ||
654 | * The SIMD FPU exceptions are handled a little differently, as there | ||
655 | * is only a single status/control register. Thus, to determine which | ||
656 | * unmasked exception was caught we must mask the exception mask bits | ||
657 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
658 | */ | ||
659 | unsigned short mxcsr = get_fpu_mxcsr(task); | ||
660 | err = ~(mxcsr >> 7) & mxcsr; | ||
661 | } | ||
637 | 662 | ||
638 | if (err & 0x001) { /* Invalid op */ | 663 | if (err & 0x001) { /* Invalid op */ |
639 | /* | 664 | /* |
@@ -662,97 +687,17 @@ void math_error(void __user *ip) | |||
662 | 687 | ||
663 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) | 688 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) |
664 | { | 689 | { |
665 | conditional_sti(regs); | ||
666 | |||
667 | #ifdef CONFIG_X86_32 | 690 | #ifdef CONFIG_X86_32 |
668 | ignore_fpu_irq = 1; | 691 | ignore_fpu_irq = 1; |
669 | #else | ||
670 | if (!user_mode(regs) && | ||
671 | kernel_math_error(regs, "kernel x87 math error", 16)) | ||
672 | return; | ||
673 | #endif | 692 | #endif |
674 | 693 | ||
675 | math_error((void __user *)regs->ip); | 694 | math_error(regs, error_code, 16); |
676 | } | ||
677 | |||
678 | static void simd_math_error(void __user *ip) | ||
679 | { | ||
680 | struct task_struct *task; | ||
681 | siginfo_t info; | ||
682 | unsigned short mxcsr; | ||
683 | |||
684 | /* | ||
685 | * Save the info for the exception handler and clear the error. | ||
686 | */ | ||
687 | task = current; | ||
688 | save_init_fpu(task); | ||
689 | task->thread.trap_no = 19; | ||
690 | task->thread.error_code = 0; | ||
691 | info.si_signo = SIGFPE; | ||
692 | info.si_errno = 0; | ||
693 | info.si_code = __SI_FAULT; | ||
694 | info.si_addr = ip; | ||
695 | /* | ||
696 | * The SIMD FPU exceptions are handled a little differently, as there | ||
697 | * is only a single status/control register. Thus, to determine which | ||
698 | * unmasked exception was caught we must mask the exception mask bits | ||
699 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
700 | */ | ||
701 | mxcsr = get_fpu_mxcsr(task); | ||
702 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | ||
703 | case 0x000: | ||
704 | default: | ||
705 | break; | ||
706 | case 0x001: /* Invalid Op */ | ||
707 | info.si_code = FPE_FLTINV; | ||
708 | break; | ||
709 | case 0x002: /* Denormalize */ | ||
710 | case 0x010: /* Underflow */ | ||
711 | info.si_code = FPE_FLTUND; | ||
712 | break; | ||
713 | case 0x004: /* Zero Divide */ | ||
714 | info.si_code = FPE_FLTDIV; | ||
715 | break; | ||
716 | case 0x008: /* Overflow */ | ||
717 | info.si_code = FPE_FLTOVF; | ||
718 | break; | ||
719 | case 0x020: /* Precision */ | ||
720 | info.si_code = FPE_FLTRES; | ||
721 | break; | ||
722 | } | ||
723 | force_sig_info(SIGFPE, &info, task); | ||
724 | } | 695 | } |
725 | 696 | ||
726 | dotraplinkage void | 697 | dotraplinkage void |
727 | do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | 698 | do_simd_coprocessor_error(struct pt_regs *regs, long error_code) |
728 | { | 699 | { |
729 | conditional_sti(regs); | 700 | math_error(regs, error_code, 19); |
730 | |||
731 | #ifdef CONFIG_X86_32 | ||
732 | if (cpu_has_xmm) { | ||
733 | /* Handle SIMD FPU exceptions on PIII+ processors. */ | ||
734 | ignore_fpu_irq = 1; | ||
735 | simd_math_error((void __user *)regs->ip); | ||
736 | return; | ||
737 | } | ||
738 | /* | ||
739 | * Handle strange cache flush from user space exception | ||
740 | * in all other cases. This is undocumented behaviour. | ||
741 | */ | ||
742 | if (regs->flags & X86_VM_MASK) { | ||
743 | handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); | ||
744 | return; | ||
745 | } | ||
746 | current->thread.trap_no = 19; | ||
747 | current->thread.error_code = error_code; | ||
748 | die_if_kernel("cache flush denied", regs, error_code); | ||
749 | force_sig(SIGSEGV, current); | ||
750 | #else | ||
751 | if (!user_mode(regs) && | ||
752 | kernel_math_error(regs, "kernel simd math error", 19)) | ||
753 | return; | ||
754 | simd_math_error((void __user *)regs->ip); | ||
755 | #endif | ||
756 | } | 701 | } |
757 | 702 | ||
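
The consolidated math_error() computes the unmasked, raised SIMD exceptions as err = ~(mxcsr >> 7) & mxcsr: MXCSR keeps the exception flags in bits 0-5 and the corresponding masks in bits 7-12, so shifting the masks down, inverting, and ANDing with the flags isolates the unmasked ones. On the six exception bits this agrees with the older explicit-mask form that simd_math_error() used. A standalone worked example:

#include <stdio.h>

int main(void)
{
        /*
         * example MXCSR: divide-by-zero unmasked (ZM, bit 9, clear) and
         * its flag raised (ZE, bit 2, set); all other masks left set
         */
        unsigned short mxcsr = 0x1d84;
        unsigned short new_err = ~(mxcsr >> 7) & mxcsr;
        unsigned short old_err = ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f);

        /* both forms agree on the six exception bits: 0x4 -> FPE_FLTDIV */
        printf("new %#x, old %#x\n", new_err & 0x3f, old_err);
        return 0;
}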
758 | dotraplinkage void | 703 | dotraplinkage void |
@@ -879,6 +824,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | |||
879 | } | 824 | } |
880 | #endif | 825 | #endif |
881 | 826 | ||
827 | /* Set of traps needed for early debugging. */ | ||
828 | void __init early_trap_init(void) | ||
829 | { | ||
830 | set_intr_gate_ist(1, &debug, DEBUG_STACK); | ||
831 | /* int3 can be called from all */ | ||
832 | set_system_intr_gate_ist(3, &int3, DEBUG_STACK); | ||
833 | set_intr_gate(14, &page_fault); | ||
834 | load_idt(&idt_descr); | ||
835 | } | ||
836 | |||
882 | void __init trap_init(void) | 837 | void __init trap_init(void) |
883 | { | 838 | { |
884 | int i; | 839 | int i; |
@@ -892,10 +847,7 @@ void __init trap_init(void) | |||
892 | #endif | 847 | #endif |
893 | 848 | ||
894 | set_intr_gate(0, ÷_error); | 849 | set_intr_gate(0, ÷_error); |
895 | set_intr_gate_ist(1, &debug, DEBUG_STACK); | ||
896 | set_intr_gate_ist(2, &nmi, NMI_STACK); | 850 | set_intr_gate_ist(2, &nmi, NMI_STACK); |
897 | /* int3 can be called from all */ | ||
898 | set_system_intr_gate_ist(3, &int3, DEBUG_STACK); | ||
899 | /* int4 can be called from all */ | 851 | /* int4 can be called from all */ |
900 | set_system_intr_gate(4, &overflow); | 852 | set_system_intr_gate(4, &overflow); |
901 | set_intr_gate(5, &bounds); | 853 | set_intr_gate(5, &bounds); |
@@ -911,7 +863,6 @@ void __init trap_init(void) | |||
911 | set_intr_gate(11, &segment_not_present); | 863 | set_intr_gate(11, &segment_not_present); |
912 | set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); | 864 | set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); |
913 | set_intr_gate(13, &general_protection); | 865 | set_intr_gate(13, &general_protection); |
914 | set_intr_gate(14, &page_fault); | ||
915 | set_intr_gate(15, &spurious_interrupt_bug); | 866 | set_intr_gate(15, &spurious_interrupt_bug); |
916 | set_intr_gate(16, &coprocessor_error); | 867 | set_intr_gate(16, &coprocessor_error); |
917 | set_intr_gate(17, &alignment_check); | 868 | set_intr_gate(17, &alignment_check); |
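This traps.c change moves the #DB (vector 1), #BP (3) and #PF (14) gates out of trap_init() and into the new early_trap_init(), so breakpoints and page faults can be serviced long before the IDT is fully populated, which early setup code and kgdb rely on. A rough sketch of the resulting boot ordering; the call sites shown are assumptions, not part of this hunk:

        void __init setup_arch(char **cmdline_p)
        {
                early_trap_init();      /* #DB, #BP, #PF usable from here */
                /* ... early fixmap/memblock work that may fault ... */
        }

        /* trap_init() still runs later, from start_kernel(), and now
         * skips the three vectors installed above. */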
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae1841..ce8e50239332 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { | |||
751 | .read = read_tsc, | 751 | .read = read_tsc, |
752 | .resume = resume_tsc, | 752 | .resume = resume_tsc, |
753 | .mask = CLOCKSOURCE_MASK(64), | 753 | .mask = CLOCKSOURCE_MASK(64), |
754 | .shift = 22, | ||
755 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | 754 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | |
756 | CLOCK_SOURCE_MUST_VERIFY, | 755 | CLOCK_SOURCE_MUST_VERIFY, |
757 | #ifdef CONFIG_X86_64 | 756 | #ifdef CONFIG_X86_64 |
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) | |||
845 | 844 | ||
846 | static void __init init_tsc_clocksource(void) | 845 | static void __init init_tsc_clocksource(void) |
847 | { | 846 | { |
848 | clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, | ||
849 | clocksource_tsc.shift); | ||
850 | if (tsc_clocksource_reliable) | 847 | if (tsc_clocksource_reliable) |
851 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; | 848 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; |
852 | /* lower the rating if we already know it's unstable: */ | 849 | /* lower the rating if we already know it's unstable: */ |
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) | |||
854 | clocksource_tsc.rating = 0; | 851 | clocksource_tsc.rating = 0; |
855 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; | 852 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; |
856 | } | 853 | } |
857 | clocksource_register(&clocksource_tsc); | 854 | clocksource_register_khz(&clocksource_tsc, tsc_khz); |
858 | } | 855 | } |
859 | 856 | ||
860 | #ifdef CONFIG_X86_64 | 857 | #ifdef CONFIG_X86_64 |
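The tsc.c hunks drop the hard-coded shift of 22 together with the open-coded clocksource_khz2mult() call: clocksource_register_khz() derives a (mult, shift) pair from the frequency itself, trading off conversion precision against multiply overflow instead of baking one shift in. The underlying arithmetic, sketched as standalone C with khz2mult() standing in for the kernel helper:

        #include <stdio.h>
        #include <stdint.h>

        /* ns = (cycles * mult) >> shift, so mult ~ (10^6 << shift) / khz */
        static uint32_t khz2mult(uint32_t khz, uint32_t shift)
        {
                uint64_t tmp = ((uint64_t)1000000 << shift) + khz / 2;

                return (uint32_t)(tmp / khz);
        }

        int main(void)
        {
                uint32_t mult = khz2mult(2400000, 22); /* 2.4 GHz, old shift */
                uint64_t cyc = 2400000000ULL;          /* one second worth */

                printf("mult=%u ns=%llu\n", mult,
                       (unsigned long long)((cyc * mult) >> 22));
                return 0;
        }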
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 1d40336b030a..1132129db792 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c | |||
@@ -44,7 +44,7 @@ static void uv_ack_apic(unsigned int irq) | |||
44 | ack_APIC_irq(); | 44 | ack_APIC_irq(); |
45 | } | 45 | } |
46 | 46 | ||
47 | struct irq_chip uv_irq_chip = { | 47 | static struct irq_chip uv_irq_chip = { |
48 | .name = "UV-CORE", | 48 | .name = "UV-CORE", |
49 | .startup = uv_noop_ret, | 49 | .startup = uv_noop_ret, |
50 | .shutdown = uv_noop, | 50 | .shutdown = uv_noop, |
@@ -141,7 +141,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) | |||
141 | */ | 141 | */ |
142 | static int | 142 | static int |
143 | arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, | 143 | arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, |
144 | unsigned long mmr_offset, int restrict) | 144 | unsigned long mmr_offset, int limit) |
145 | { | 145 | { |
146 | const struct cpumask *eligible_cpu = cpumask_of(cpu); | 146 | const struct cpumask *eligible_cpu = cpumask_of(cpu); |
147 | struct irq_desc *desc = irq_to_desc(irq); | 147 | struct irq_desc *desc = irq_to_desc(irq); |
@@ -160,7 +160,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, | |||
160 | if (err != 0) | 160 | if (err != 0) |
161 | return err; | 161 | return err; |
162 | 162 | ||
163 | if (restrict == UV_AFFINITY_CPU) | 163 | if (limit == UV_AFFINITY_CPU) |
164 | desc->status |= IRQ_NO_BALANCING; | 164 | desc->status |= IRQ_NO_BALANCING; |
165 | else | 165 | else |
166 | desc->status |= IRQ_MOVE_PCNTXT; | 166 | desc->status |= IRQ_MOVE_PCNTXT; |
@@ -214,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
214 | unsigned long mmr_value; | 214 | unsigned long mmr_value; |
215 | struct uv_IO_APIC_route_entry *entry; | 215 | struct uv_IO_APIC_route_entry *entry; |
216 | unsigned long mmr_offset; | 216 | unsigned long mmr_offset; |
217 | unsigned mmr_pnode; | 217 | int mmr_pnode; |
218 | 218 | ||
219 | if (set_desc_affinity(desc, mask, &dest)) | 219 | if (set_desc_affinity(desc, mask, &dest)) |
220 | return -1; | 220 | return -1; |
@@ -248,7 +248,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) | |||
248 | * interrupt is raised. | 248 | * interrupt is raised. |
249 | */ | 249 | */ |
250 | int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, | 250 | int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, |
251 | unsigned long mmr_offset, int restrict) | 251 | unsigned long mmr_offset, int limit) |
252 | { | 252 | { |
253 | int irq, ret; | 253 | int irq, ret; |
254 | 254 | ||
@@ -258,7 +258,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, | |||
258 | return -EBUSY; | 258 | return -EBUSY; |
259 | 259 | ||
260 | ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, | 260 | ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, |
261 | restrict); | 261 | limit); |
262 | if (ret == irq) | 262 | if (ret == irq) |
263 | uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); | 263 | uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); |
264 | else | 264 | else |
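The uv_irq.c renames are not cosmetic: restrict is a type qualifier in C99, so using it as a parameter name is rejected by C99 compilers and flagged by sparse; limit carries the same meaning without the clash. A one-line reproduction, hypothetical code for illustration only:

        int f(int restrict);    /* error in C99: 'restrict' is a keyword,
                                   not a usable identifier */

The other tweaks are routine tightening: uv_irq_chip becomes static because nothing outside this file references it, and mmr_pnode becomes a plain int.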
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a1..56a8c2a867d9 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S | |||
@@ -31,6 +31,7 @@ | |||
31 | */ | 31 | */ |
32 | 32 | ||
33 | #include <asm/cpufeature.h> | 33 | #include <asm/cpufeature.h> |
34 | #include <asm/msr-index.h> | ||
34 | 35 | ||
35 | verify_cpu: | 36 | verify_cpu: |
36 | pushfl # Save caller passed flags | 37 | pushfl # Save caller passed flags |
@@ -88,7 +89,7 @@ verify_cpu_sse_test: | |||
88 | je verify_cpu_sse_ok | 89 | je verify_cpu_sse_ok |
89 | test %di,%di | 90 | test %di,%di |
90 | jz verify_cpu_no_longmode # only try to force SSE on AMD | 91 | jz verify_cpu_no_longmode # only try to force SSE on AMD |
91 | movl $0xc0010015,%ecx # HWCR | 92 | movl $MSR_K7_HWCR,%ecx |
92 | rdmsr | 93 | rdmsr |
93 | btr $15,%eax # enable SSE | 94 | btr $15,%eax # enable SSE |
94 | wrmsr | 95 | wrmsr |
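In verify_cpu_64.S the magic number 0xc0010015 becomes the symbolic MSR_K7_HWCR, which is why the new msr-index.h include is needed; the btr $15 clears the HWCR bit that disables SSE on these AMD parts. The constant as defined in the header (value unchanged by this patch):

        #define MSR_K7_HWCR             0xc0010015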
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 2cc249718c46..d0bb52296fa3 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -97,7 +97,7 @@ SECTIONS | |||
97 | HEAD_TEXT | 97 | HEAD_TEXT |
98 | #ifdef CONFIG_X86_32 | 98 | #ifdef CONFIG_X86_32 |
99 | . = ALIGN(PAGE_SIZE); | 99 | . = ALIGN(PAGE_SIZE); |
100 | *(.text.page_aligned) | 100 | *(.text..page_aligned) |
101 | #endif | 101 | #endif |
102 | . = ALIGN(8); | 102 | . = ALIGN(8); |
103 | _stext = .; | 103 | _stext = .; |
@@ -305,7 +305,7 @@ SECTIONS | |||
305 | . = ALIGN(PAGE_SIZE); | 305 | . = ALIGN(PAGE_SIZE); |
306 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { | 306 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { |
307 | __bss_start = .; | 307 | __bss_start = .; |
308 | *(.bss.page_aligned) | 308 | *(.bss..page_aligned) |
309 | *(.bss) | 309 | *(.bss) |
310 | . = ALIGN(4); | 310 | . = ALIGN(4); |
311 | __bss_stop = .; | 311 | __bss_stop = .; |
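The linker-script rename to .text..page_aligned and .bss..page_aligned follows the kernel convention that its own special sections use a double dot, keeping them out of the namespace gcc fills when compiling with -ffunction-sections/-fdata-sections (.text.<function>, .bss.<object>); with a single dot, a function or variable literally named page_aligned would land in the special section by accident. Roughly how code opts into these sections, macro quoted from <linux/linkage.h> of this era as an assumption:

        #define __page_aligned_bss \
                __attribute__((__section__(".bss..page_aligned"))) \
                __attribute__((__aligned__(PAGE_SIZE)))

        static char boot_stack[PAGE_SIZE] __page_aligned_bss;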
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60f..dcbb28c4b694 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void) | |||
73 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | 73 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
74 | } | 74 | } |
75 | 75 | ||
76 | void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, | 76 | void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, |
77 | u32 mult) | 77 | struct clocksource *clock, u32 mult) |
78 | { | 78 | { |
79 | unsigned long flags; | 79 | unsigned long flags; |
80 | 80 | ||
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, | |||
87 | vsyscall_gtod_data.clock.shift = clock->shift; | 87 | vsyscall_gtod_data.clock.shift = clock->shift; |
88 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; | 88 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; |
89 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | 89 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; |
90 | vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; | 90 | vsyscall_gtod_data.wall_to_monotonic = *wtm; |
91 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); | 91 | vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); |
92 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | 92 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); |
93 | } | 93 | } |
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | |||
169 | * unlikely */ | 169 | * unlikely */ |
170 | time_t __vsyscall(1) vtime(time_t *t) | 170 | time_t __vsyscall(1) vtime(time_t *t) |
171 | { | 171 | { |
172 | struct timeval tv; | 172 | unsigned seq; |
173 | time_t result; | 173 | time_t result; |
174 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) | 174 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) |
175 | return time_syscall(t); | 175 | return time_syscall(t); |
176 | 176 | ||
177 | vgettimeofday(&tv, NULL); | 177 | do { |
178 | result = tv.tv_sec; | 178 | seq = read_seqbegin(&__vsyscall_gtod_data.lock); |
179 | |||
180 | result = __vsyscall_gtod_data.wall_time_sec; | ||
181 | |||
182 | } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); | ||
183 | |||
179 | if (t) | 184 | if (t) |
180 | *t = result; | 185 | *t = result; |
181 | return result; | 186 | return result; |
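Two separate fixes land in vsyscall_64.c. update_vsyscall() now takes wall_to_monotonic by pointer instead of reading the timekeeping-internal global, so the core can stop exporting it. And vtime() no longer round-trips through vgettimeofday(), which read the clocksource and did full nanosecond arithmetic only to throw away everything but tv_sec; it samples wall_time_sec directly under the usual seqlock read protocol. The reader/writer pairing, sketched with an assumed gtod pointer:

        /* writer (update_vsyscall): publish a consistent snapshot */
        write_seqlock_irqsave(&gtod->lock, flags);
        gtod->wall_time_sec = wall_time->tv_sec;
        write_sequnlock_irqrestore(&gtod->lock, flags);

        /* reader (vtime): retry if the writer moved underneath us */
        do {
                seq = read_seqbegin(&gtod->lock);
                secs = gtod->wall_time_sec;
        } while (read_seqretry(&gtod->lock, seq));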
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 693920b22496..1b950d151e58 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c | |||
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy); | |||
54 | EXPORT_SYMBOL(__memcpy); | 54 | EXPORT_SYMBOL(__memcpy); |
55 | 55 | ||
56 | EXPORT_SYMBOL(empty_zero_page); | 56 | EXPORT_SYMBOL(empty_zero_page); |
57 | EXPORT_SYMBOL(init_level4_pgt); | ||
58 | #ifndef CONFIG_PARAVIRT | 57 | #ifndef CONFIG_PARAVIRT |
59 | EXPORT_SYMBOL(native_load_gs_index); | 58 | EXPORT_SYMBOL(native_load_gs_index); |
60 | #endif | 59 | #endif |
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 61a1e8c7e19f..cd6da6bf3eca 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c | |||
@@ -5,6 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
7 | #include <linux/ioport.h> | 7 | #include <linux/ioport.h> |
8 | #include <linux/module.h> | ||
8 | 9 | ||
9 | #include <asm/bios_ebda.h> | 10 | #include <asm/bios_ebda.h> |
10 | #include <asm/paravirt.h> | 11 | #include <asm/paravirt.h> |
@@ -85,6 +86,7 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { | |||
85 | }; | 86 | }; |
86 | 87 | ||
87 | static void default_nmi_init(void) { }; | 88 | static void default_nmi_init(void) { }; |
89 | static int default_i8042_detect(void) { return 1; }; | ||
88 | 90 | ||
89 | struct x86_platform_ops x86_platform = { | 91 | struct x86_platform_ops x86_platform = { |
90 | .calibrate_tsc = native_calibrate_tsc, | 92 | .calibrate_tsc = native_calibrate_tsc, |
@@ -92,5 +94,8 @@ struct x86_platform_ops x86_platform = { | |||
92 | .set_wallclock = mach_set_rtc_mmss, | 94 | .set_wallclock = mach_set_rtc_mmss, |
93 | .iommu_shutdown = iommu_shutdown_noop, | 95 | .iommu_shutdown = iommu_shutdown_noop, |
94 | .is_untracked_pat_range = is_ISA_range, | 96 | .is_untracked_pat_range = is_ISA_range, |
95 | .nmi_init = default_nmi_init | 97 | .nmi_init = default_nmi_init, |
98 | .i8042_detect = default_i8042_detect | ||
96 | }; | 99 | }; |
100 | |||
101 | EXPORT_SYMBOL_GPL(x86_platform); | ||
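x86_init.c gains a second platform hook, i8042_detect, defaulting to "controller present" (return 1) so existing platforms see no behaviour change, while boards with no legacy i8042 can override it and skip the probe. Because the i8042 driver may be modular, x86_platform itself is now exported GPL-only, which is what the new linux/module.h include is for. A sketch of the intended consumer; the exact call site is an assumption:

        if (!x86_platform.i8042_detect())
                return -ENODEV;         /* platform says: no i8042 here */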
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 782c3a362ec6..9c253bd65e24 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -16,11 +16,88 @@ | |||
16 | */ | 16 | */ |
17 | u64 pcntxt_mask; | 17 | u64 pcntxt_mask; |
18 | 18 | ||
19 | /* | ||
20 | * Represents init state for the supported extended state. | ||
21 | */ | ||
22 | static struct xsave_struct *init_xstate_buf; | ||
23 | |||
19 | struct _fpx_sw_bytes fx_sw_reserved; | 24 | struct _fpx_sw_bytes fx_sw_reserved; |
20 | #ifdef CONFIG_IA32_EMULATION | 25 | #ifdef CONFIG_IA32_EMULATION |
21 | struct _fpx_sw_bytes fx_sw_reserved_ia32; | 26 | struct _fpx_sw_bytes fx_sw_reserved_ia32; |
22 | #endif | 27 | #endif |
23 | 28 | ||
29 | static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; | ||
30 | |||
31 | /* | ||
32 | * If a processor implementation discerns that a processor state component is | ||
33 | * in its initialized state, it may set the corresponding bit in | ||
34 | * xsave_hdr.xstate_bv to '0', without modifying the corresponding memory | ||
35 | * layout in the case of xsaveopt. While presenting the xstate information to | ||
36 | * the user, we always ensure that the memory layout of a feature will be in | ||
37 | * the init state if the corresponding header bit is zero. This is to ensure | ||
38 | * that the user doesn't see some stale state in the memory layout during | ||
39 | * signal handling, debugging etc. | ||
40 | */ | ||
41 | void __sanitize_i387_state(struct task_struct *tsk) | ||
42 | { | ||
43 | u64 xstate_bv; | ||
44 | int feature_bit = 0x2; | ||
45 | struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; | ||
46 | |||
47 | if (!fx) | ||
48 | return; | ||
49 | |||
50 | BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); | ||
51 | |||
52 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; | ||
53 | |||
54 | /* | ||
55 | * None of the feature bits are in the init state, so nothing | ||
56 | * else to do for us: the memory layout is up to date. | ||
57 | */ | ||
58 | if ((xstate_bv & pcntxt_mask) == pcntxt_mask) | ||
59 | return; | ||
60 | |||
61 | /* | ||
62 | * FP is in init state | ||
63 | */ | ||
64 | if (!(xstate_bv & XSTATE_FP)) { | ||
65 | fx->cwd = 0x37f; | ||
66 | fx->swd = 0; | ||
67 | fx->twd = 0; | ||
68 | fx->fop = 0; | ||
69 | fx->rip = 0; | ||
70 | fx->rdp = 0; | ||
71 | memset(&fx->st_space[0], 0, 128); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * SSE is in init state | ||
76 | */ | ||
77 | if (!(xstate_bv & XSTATE_SSE)) | ||
78 | memset(&fx->xmm_space[0], 0, 256); | ||
79 | |||
80 | xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; | ||
81 | |||
82 | /* | ||
83 | * Update all the remaining memory layouts whose corresponding | ||
84 | * header bit is zero, i.e. whose feature is in its init state. | ||
85 | */ | ||
86 | while (xstate_bv) { | ||
87 | if (xstate_bv & 0x1) { | ||
88 | int offset = xstate_offsets[feature_bit]; | ||
89 | int size = xstate_sizes[feature_bit]; | ||
90 | |||
91 | memcpy(((void *) fx) + offset, | ||
92 | ((void *) init_xstate_buf) + offset, | ||
93 | size); | ||
94 | } | ||
95 | |||
96 | xstate_bv >>= 1; | ||
97 | feature_bit++; | ||
98 | } | ||
99 | } | ||
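The >> 2 before the loop is because FP (bit 0) and SSE (bit 1) were already special-cased above, so scanning starts at feature_bit = 2. One worked pass, with an assumed three-feature mask:

        u64 pcntxt_mask = 0x7;  /* FP | SSE | YMM supported */
        u64 xstate_bv   = 0x3;  /* task left YMM in its init state */

        xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;    /* == 0x1 */
        /* single loop iteration at feature_bit == 2: bytes
         * [xstate_offsets[2], xstate_offsets[2] + xstate_sizes[2]) of
         * the task buffer are refilled from init_xstate_buf */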
100 | |||
24 | /* | 101 | /* |
25 | * Check for the presence of extended state information in the | 102 | * Check for the presence of extended state information in the |
26 | * user fpstate pointer in the sigcontext. | 103 | * user fpstate pointer in the sigcontext. |
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, | |||
36 | 113 | ||
37 | err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], | 114 | err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], |
38 | sizeof(struct _fpx_sw_bytes)); | 115 | sizeof(struct _fpx_sw_bytes)); |
39 | |||
40 | if (err) | 116 | if (err) |
41 | return err; | 117 | return -EFAULT; |
42 | 118 | ||
43 | /* | 119 | /* |
44 | * First Magic check failed. | 120 | * First Magic check failed. |
45 | */ | 121 | */ |
46 | if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) | 122 | if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) |
47 | return -1; | 123 | return -EINVAL; |
48 | 124 | ||
49 | /* | 125 | /* |
50 | * Check for error scenarios. | 126 | * Check for error scenarios. |
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, | |||
52 | if (fx_sw_user->xstate_size < min_xstate_size || | 128 | if (fx_sw_user->xstate_size < min_xstate_size || |
53 | fx_sw_user->xstate_size > xstate_size || | 129 | fx_sw_user->xstate_size > xstate_size || |
54 | fx_sw_user->xstate_size > fx_sw_user->extended_size) | 130 | fx_sw_user->xstate_size > fx_sw_user->extended_size) |
55 | return -1; | 131 | return -EINVAL; |
56 | 132 | ||
57 | err = __get_user(magic2, (__u32 *) (((void *)fpstate) + | 133 | err = __get_user(magic2, (__u32 *) (((void *)fpstate) + |
58 | fx_sw_user->extended_size - | 134 | fx_sw_user->extended_size - |
59 | FP_XSTATE_MAGIC2_SIZE)); | 135 | FP_XSTATE_MAGIC2_SIZE)); |
136 | if (err) | ||
137 | return err; | ||
60 | /* | 138 | /* |
61 | * Check for the presence of second magic word at the end of memory | 139 | * Check for the presence of second magic word at the end of memory |
62 | * layout. This detects the case where the user just copied the legacy | 140 | * layout. This detects the case where the user just copied the legacy |
63 | * fpstate layout without copying the extended state information | 141 | * fpstate layout without copying the extended state information |
64 | * in the memory layout. | 142 | * in the memory layout. |
65 | */ | 143 | */ |
66 | if (err || magic2 != FP_XSTATE_MAGIC2) | 144 | if (magic2 != FP_XSTATE_MAGIC2) |
67 | return -1; | 145 | return -EFAULT; |
68 | 146 | ||
69 | return 0; | 147 | return 0; |
70 | } | 148 | } |
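check_for_xstate() now reports real errnos rather than a blanket -1 (which, read as an errno, is -EPERM): -EFAULT when the user buffer is unreadable or its closing magic is missing, -EINVAL when the sw_reserved fields are self-inconsistent. That lets a caller react per failure mode; a hypothetical sketch:

        err = check_for_xstate(fx, buf, &fx_sw_user);
        if (err == -EINVAL) {
                /* legacy-only frame: restore fxsave part, init the rest */
        } else if (err) {
                return err;     /* unreadable buffer: fail the restore */
        }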
@@ -91,15 +169,7 @@ int save_i387_xstate(void __user *buf) | |||
91 | return 0; | 169 | return 0; |
92 | 170 | ||
93 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | 171 | if (task_thread_info(tsk)->status & TS_USEDFPU) { |
94 | /* | 172 | if (use_xsave()) |
95 | * Start with clearing the user buffer. This will present a | ||
96 | * clean context for the bytes not touched by the fxsave/xsave. | ||
97 | */ | ||
98 | err = __clear_user(buf, sig_xstate_size); | ||
99 | if (err) | ||
100 | return err; | ||
101 | |||
102 | if (task_thread_info(tsk)->status & TS_XSAVE) | ||
103 | err = xsave_user(buf); | 173 | err = xsave_user(buf); |
104 | else | 174 | else |
105 | err = fxsave_user(buf); | 175 | err = fxsave_user(buf); |
@@ -109,14 +179,15 @@ int save_i387_xstate(void __user *buf) | |||
109 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | 179 | task_thread_info(tsk)->status &= ~TS_USEDFPU; |
110 | stts(); | 180 | stts(); |
111 | } else { | 181 | } else { |
112 | if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, | 182 | sanitize_i387_state(tsk); |
183 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, | ||
113 | xstate_size)) | 184 | xstate_size)) |
114 | return -1; | 185 | return -1; |
115 | } | 186 | } |
116 | 187 | ||
117 | clear_used_math(); /* trigger finit */ | 188 | clear_used_math(); /* trigger finit */ |
118 | 189 | ||
119 | if (task_thread_info(tsk)->status & TS_XSAVE) { | 190 | if (use_xsave()) { |
120 | struct _fpstate __user *fx = buf; | 191 | struct _fpstate __user *fx = buf; |
121 | struct _xstate __user *x = buf; | 192 | struct _xstate __user *x = buf; |
122 | u64 xstate_bv; | 193 | u64 xstate_bv; |
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf) | |||
184 | * init the state skipped by the user. | 255 | * init the state skipped by the user. |
185 | */ | 256 | */ |
186 | mask = pcntxt_mask & ~mask; | 257 | mask = pcntxt_mask & ~mask; |
187 | 258 | if (unlikely(mask)) | |
188 | xrstor_state(init_xstate_buf, mask); | 259 | xrstor_state(init_xstate_buf, mask); |
189 | 260 | ||
190 | return 0; | 261 | return 0; |
191 | 262 | ||
@@ -225,7 +296,7 @@ int restore_i387_xstate(void __user *buf) | |||
225 | clts(); | 296 | clts(); |
226 | task_thread_info(current)->status |= TS_USEDFPU; | 297 | task_thread_info(current)->status |= TS_USEDFPU; |
227 | } | 298 | } |
228 | if (task_thread_info(tsk)->status & TS_XSAVE) | 299 | if (use_xsave()) |
229 | err = restore_user_xstate(buf); | 300 | err = restore_user_xstate(buf); |
230 | else | 301 | else |
231 | err = fxrstor_checking((__force struct i387_fxsave_struct *) | 302 | err = fxrstor_checking((__force struct i387_fxsave_struct *) |
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void) | |||
274 | #endif | 345 | #endif |
275 | } | 346 | } |
276 | 347 | ||
277 | /* | ||
278 | * Represents init state for the supported extended state. | ||
279 | */ | ||
280 | struct xsave_struct *init_xstate_buf; | ||
281 | |||
282 | #ifdef CONFIG_X86_64 | 348 | #ifdef CONFIG_X86_64 |
283 | unsigned int sig_xstate_size = sizeof(struct _fpstate); | 349 | unsigned int sig_xstate_size = sizeof(struct _fpstate); |
284 | #endif | 350 | #endif |
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); | |||
286 | /* | 352 | /* |
287 | * Enable the extended processor state save/restore feature | 353 | * Enable the extended processor state save/restore feature |
288 | */ | 354 | */ |
289 | void __cpuinit xsave_init(void) | 355 | static inline void xstate_enable(void) |
290 | { | 356 | { |
291 | if (!cpu_has_xsave) | ||
292 | return; | ||
293 | |||
294 | set_in_cr4(X86_CR4_OSXSAVE); | 357 | set_in_cr4(X86_CR4_OSXSAVE); |
295 | |||
296 | /* | ||
297 | * Enable all the features that the HW is capable of | ||
298 | * and the Linux kernel is aware of. | ||
299 | */ | ||
300 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); | 358 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); |
301 | } | 359 | } |
302 | 360 | ||
303 | /* | 361 | /* |
362 | * Record the offsets and sizes of different state managed by the xsave | ||
363 | * memory layout. | ||
364 | */ | ||
365 | static void __init setup_xstate_features(void) | ||
366 | { | ||
367 | int eax, ebx, ecx, edx, leaf = 0x2; | ||
368 | |||
369 | xstate_features = fls64(pcntxt_mask); | ||
370 | xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); | ||
371 | xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); | ||
372 | |||
373 | do { | ||
374 | cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); | ||
375 | |||
376 | if (eax == 0) | ||
377 | break; | ||
378 | |||
379 | xstate_offsets[leaf] = ebx; | ||
380 | xstate_sizes[leaf] = eax; | ||
381 | |||
382 | leaf++; | ||
383 | } while (1); | ||
384 | } | ||
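setup_xstate_features() walks the sub-leaves of the XSAVE CPUID leaf: for each supported feature n >= 2, cpuid(0xd, n) returns the byte size of that feature's save area in EAX and its offset within the xsave buffer in EBX, exactly the pair __sanitize_i387_state() needs for its memcpy. For instance, with illustrative values typical of an AVX-capable part:

        unsigned int eax, ebx, ecx, edx;

        cpuid_count(XSTATE_CPUID, 2, &eax, &ebx, &ecx, &edx);
        /* commonly eax == 256 (YMM save-area size) and
         * ebx == 576 (its offset into the xsave buffer) */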
385 | |||
386 | /* | ||
304 | * setup the xstate image representing the init state | 387 | * setup the xstate image representing the init state |
305 | */ | 388 | */ |
306 | static void __init setup_xstate_init(void) | 389 | static void __init setup_xstate_init(void) |
307 | { | 390 | { |
391 | setup_xstate_features(); | ||
392 | |||
393 | /* | ||
394 | * Setup init_xstate_buf to represent the init state of | ||
395 | * all the features managed by the xsave | ||
396 | */ | ||
308 | init_xstate_buf = alloc_bootmem(xstate_size); | 397 | init_xstate_buf = alloc_bootmem(xstate_size); |
309 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; | 398 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; |
399 | |||
400 | clts(); | ||
401 | /* | ||
402 | * Init all the features state with header_bv being 0x0 | ||
403 | */ | ||
404 | xrstor_state(init_xstate_buf, -1); | ||
405 | /* | ||
406 | * Dump the init state again, to capture the init value of any | ||
407 | * feature whose init state is not all zeros. | ||
408 | */ | ||
409 | xsave_state(init_xstate_buf, -1); | ||
410 | stts(); | ||
310 | } | 411 | } |
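The clts()/stts() bracket exists because xrstor/xsave would raise #NM while CR0.TS is set. The two-step dance is the subtle part: XRSTOR against a header whose xstate_bv is zero loads the architectural init values for every requested feature, and the XSAVE that follows writes those values into init_xstate_buf. A plain memset would not do, since several init values are nonzero; architectural defaults, quoted from the SDM rather than from this patch:

        /* init_xstate_buf->i387.cwd   == 0x037f  x87 control word (FINIT) */
        /* init_xstate_buf->i387.mxcsr == 0x1f80  all SSE exceptions masked */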
311 | 412 | ||
312 | /* | 413 | /* |
313 | * Enable and initialize the xsave feature. | 414 | * Enable and initialize the xsave feature. |
314 | */ | 415 | */ |
315 | void __ref xsave_cntxt_init(void) | 416 | static void __init xstate_enable_boot_cpu(void) |
316 | { | 417 | { |
317 | unsigned int eax, ebx, ecx, edx; | 418 | unsigned int eax, ebx, ecx, edx; |
318 | 419 | ||
319 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | 420 | if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { |
421 | WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); | ||
320 | pcntxt_mask = eax + ((u64)edx << 32); | 426 | pcntxt_mask = eax + ((u64)edx << 32); |
321 | 427 | ||
322 | if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { | 428 | if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { |
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void) | |||
329 | * Support only the state known to OS. | 435 | * Support only the state known to OS. |
330 | */ | 436 | */ |
331 | pcntxt_mask = pcntxt_mask & XCNTXT_MASK; | 437 | pcntxt_mask = pcntxt_mask & XCNTXT_MASK; |
332 | xsave_init(); | 438 | |
439 | xstate_enable(); | ||
333 | 440 | ||
334 | /* | 441 | /* |
335 | * Recompute the context size for enabled features | 442 | * Recompute the context size for enabled features |
336 | */ | 443 | */ |
337 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | 444 | cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); |
338 | xstate_size = ebx; | 445 | xstate_size = ebx; |
339 | 446 | ||
340 | update_regset_xstate_info(xstate_size, pcntxt_mask); | 447 | update_regset_xstate_info(xstate_size, pcntxt_mask); |
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void) | |||
346 | "cntxt size 0x%x\n", | 453 | "cntxt size 0x%x\n", |
347 | pcntxt_mask, xstate_size); | 454 | pcntxt_mask, xstate_size); |
348 | } | 455 | } |
456 | |||
457 | /* | ||
458 | * For the very first instance, this calls xstate_enable_boot_cpu(); | ||
459 | * for all subsequent instances, this calls xstate_enable(). | ||
460 | * | ||
461 | * This is somewhat obfuscated due to the lack of powerful enough | ||
462 | * overrides for the section checks. | ||
463 | */ | ||
464 | void __cpuinit xsave_init(void) | ||
465 | { | ||
466 | static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; | ||
467 | void (*this_func)(void); | ||
468 | |||
469 | if (!cpu_has_xsave) | ||
470 | return; | ||
471 | |||
472 | this_func = next_func; | ||
473 | next_func = xstate_enable; | ||
474 | this_func(); | ||
475 | } | ||
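The reworked xsave_init() closes the file with a small self-patching dispatcher: a static function pointer starts out aimed at the boot-only path and is flipped to the lightweight per-CPU path on first call, letting the __init-section xstate_enable_boot_cpu() be reached from __cpuinit code without section-mismatch warnings (the __refdata annotation blesses that single reference, and CPU bringup is serialized, so no locking is needed). The pattern in miniature, as plain portable C:

        #include <stdio.h>

        static void first_time(void) { puts("boot-cpu setup"); }
        static void every_time(void) { puts("per-cpu enable"); }

        static void init_like_xsave(void)
        {
                static void (*next)(void) = first_time;
                void (*cur)(void) = next;

                next = every_time;      /* later callers take this path */
                cur();
        }

        int main(void)
        {
                init_like_xsave();      /* "boot-cpu setup" */
                init_like_xsave();      /* "per-cpu enable" */
                return 0;
        }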