aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-06-11 17:31:52 -0400
committerIngo Molnar <mingo@elte.hu>2009-06-11 17:31:52 -0400
commit0d5959723e1db3fd7323c198a50c16cecf96c7a9 (patch)
tree802b623fff261ebcbbddadf84af5524398364a18 /arch/x86/kernel
parent62fdac5913f71f8f200bd2c9bd59a02e9a1498e9 (diff)
parent512626a04e72aca60effe111fa0333ed0b195d21 (diff)
Merge branch 'linus' into x86/mce3
Conflicts: arch/x86/kernel/cpu/mcheck/mce_64.c arch/x86/kernel/irq.c Merge reason: Resolve the conflicts above. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c5
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile2
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/amd_iommu.c500
-rw-r--r--arch/x86/kernel/amd_iommu_init.c273
-rw-r--r--arch/x86/kernel/apic/apic.c23
-rw-r--r--arch/x86/kernel/apic/io_apic.c9
-rw-r--r--arch/x86/kernel/apic/nmi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c15
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c10
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c417
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c15
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c24
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1704
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/ds.c921
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/entry_64.S29
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/irqinit.c1
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c329
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/paravirt.c56
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c20
-rw-r--r--arch/x86/kernel/process_32.c20
-rw-r--r--arch/x86/kernel/process_64.c20
-rw-r--r--arch/x86/kernel/ptrace.c284
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/setup.c22
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/smp.c3
-rw-r--r--arch/x86/kernel/smpboot.c8
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c17
-rw-r--r--arch/x86/kernel/traps.c17
-rw-r--r--arch/x86/kernel/tsc.c19
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S430
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
75 files changed, 4747 insertions, 2086 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 235f5927bb97..4f78bd682125 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,7 @@ obj-y += process.o
44obj-y += i387.o xsave.o 44obj-y += i387.o xsave.o
45obj-y += ptrace.o 45obj-y += ptrace.o
46obj-$(CONFIG_X86_DS) += ds.o 46obj-$(CONFIG_X86_DS) += ds.o
47obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
47obj-$(CONFIG_X86_32) += tls.o 48obj-$(CONFIG_X86_32) += tls.o
48obj-$(CONFIG_IA32_EMULATION) += tls.o 49obj-$(CONFIG_IA32_EMULATION) += tls.o
49obj-y += step.o 50obj-y += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 844e5e25213b..631086159c53 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
985 985
986 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 986 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
987 mp_ioapics[idx].apicid = uniq_ioapic_id(id); 987 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
988#ifdef CONFIG_X86_32
989 mp_ioapics[idx].apicver = io_apic_get_version(idx); 988 mp_ioapics[idx].apicver = io_apic_get_version(idx);
990#else 989
991 mp_ioapics[idx].apicver = 0;
992#endif
993 /* 990 /*
994 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 991 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
995 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 992 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9def..167bc16ce0e5 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
9always := wakeup.bin 9always := wakeup.bin
10targets := wakeup.elf wakeup.lds 10targets := wakeup.elf wakeup.lds
11 11
12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o 12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
13 13
14# The link order of the video-*.o modules can matter. In particular, 14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o. 15# video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 000000000000..f51eb0bb56ce
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 000000000000..6206033ba202
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52..1c60554537c3 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 56 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 57static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom,
59 unsigned long address, u64
60 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page,
63 unsigned int pages);
58 64
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
67#endif
59 68
60#ifdef CONFIG_AMD_IOMMU_STATS 69#ifdef CONFIG_AMD_IOMMU_STATS
61 70
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
213{ 222{
214 struct amd_iommu *iommu; 223 struct amd_iommu *iommu;
215 224
216 list_for_each_entry(iommu, &amd_iommu_list, list) 225 for_each_iommu(iommu)
217 iommu_poll_events(iommu); 226 iommu_poll_events(iommu);
218 227
219 return IRQ_HANDLED; 228 return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
440 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 449 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
441 domid, 1, 1); 450 domid, 1, 1);
442 451
443 list_for_each_entry(iommu, &amd_iommu_list, list) { 452 for_each_iommu(iommu) {
444 spin_lock_irqsave(&iommu->lock, flags); 453 spin_lock_irqsave(&iommu->lock, flags);
445 __iommu_queue_command(iommu, &cmd); 454 __iommu_queue_command(iommu, &cmd);
446 __iommu_completion_wait(iommu); 455 __iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
449 } 458 }
450} 459}
451 460
461void amd_iommu_flush_all_domains(void)
462{
463 int i;
464
465 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
466 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
467 continue;
468 iommu_flush_domain(i);
469 }
470}
471
472void amd_iommu_flush_all_devices(void)
473{
474 struct amd_iommu *iommu;
475 int i;
476
477 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
478 if (amd_iommu_pd_table[i] == NULL)
479 continue;
480
481 iommu = amd_iommu_rlookup_table[i];
482 if (!iommu)
483 continue;
484
485 iommu_queue_inv_dev_entry(iommu, i);
486 iommu_completion_wait(iommu);
487 }
488}
489
452/**************************************************************************** 490/****************************************************************************
453 * 491 *
454 * The functions below are used the create the page table mappings for 492 * The functions below are used the create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
468 unsigned long phys_addr, 506 unsigned long phys_addr,
469 int prot) 507 int prot)
470{ 508{
471 u64 __pte, *pte, *page; 509 u64 __pte, *pte;
472 510
473 bus_addr = PAGE_ALIGN(bus_addr); 511 bus_addr = PAGE_ALIGN(bus_addr);
474 phys_addr = PAGE_ALIGN(phys_addr); 512 phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
477 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 515 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
478 return -EINVAL; 516 return -EINVAL;
479 517
480 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; 518 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
481
482 if (!IOMMU_PTE_PRESENT(*pte)) {
483 page = (u64 *)get_zeroed_page(GFP_KERNEL);
484 if (!page)
485 return -ENOMEM;
486 *pte = IOMMU_L2_PDE(virt_to_phys(page));
487 }
488
489 pte = IOMMU_PTE_PAGE(*pte);
490 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
491
492 if (!IOMMU_PTE_PRESENT(*pte)) {
493 page = (u64 *)get_zeroed_page(GFP_KERNEL);
494 if (!page)
495 return -ENOMEM;
496 *pte = IOMMU_L1_PDE(virt_to_phys(page));
497 }
498
499 pte = IOMMU_PTE_PAGE(*pte);
500 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
501 519
502 if (IOMMU_PTE_PRESENT(*pte)) 520 if (IOMMU_PTE_PRESENT(*pte))
503 return -EBUSY; 521 return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
595 * as allocated in the aperture 613 * as allocated in the aperture
596 */ 614 */
597 if (addr < dma_dom->aperture_size) 615 if (addr < dma_dom->aperture_size)
598 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); 616 __set_bit(addr >> PAGE_SHIFT,
617 dma_dom->aperture[0]->bitmap);
599 } 618 }
600 619
601 return 0; 620 return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
632 ****************************************************************************/ 651 ****************************************************************************/
633 652
634/* 653/*
635 * The address allocator core function. 654 * The address allocator core functions.
636 * 655 *
637 * called with domain->lock held 656 * called with domain->lock held
638 */ 657 */
658
659/*
660 * This function checks if there is a PTE for a given dma address. If
661 * there is one, it returns the pointer to it.
662 */
663static u64* fetch_pte(struct protection_domain *domain,
664 unsigned long address)
665{
666 u64 *pte;
667
668 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
669
670 if (!IOMMU_PTE_PRESENT(*pte))
671 return NULL;
672
673 pte = IOMMU_PTE_PAGE(*pte);
674 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
675
676 if (!IOMMU_PTE_PRESENT(*pte))
677 return NULL;
678
679 pte = IOMMU_PTE_PAGE(*pte);
680 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
681
682 return pte;
683}
684
685/*
686 * This function is used to add a new aperture range to an existing
687 * aperture in case of dma_ops domain allocation or address allocation
688 * failure.
689 */
690static int alloc_new_range(struct amd_iommu *iommu,
691 struct dma_ops_domain *dma_dom,
692 bool populate, gfp_t gfp)
693{
694 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
695 int i;
696
697#ifdef CONFIG_IOMMU_STRESS
698 populate = false;
699#endif
700
701 if (index >= APERTURE_MAX_RANGES)
702 return -ENOMEM;
703
704 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
705 if (!dma_dom->aperture[index])
706 return -ENOMEM;
707
708 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
709 if (!dma_dom->aperture[index]->bitmap)
710 goto out_free;
711
712 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
713
714 if (populate) {
715 unsigned long address = dma_dom->aperture_size;
716 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
717 u64 *pte, *pte_page;
718
719 for (i = 0; i < num_ptes; ++i) {
720 pte = alloc_pte(&dma_dom->domain, address,
721 &pte_page, gfp);
722 if (!pte)
723 goto out_free;
724
725 dma_dom->aperture[index]->pte_pages[i] = pte_page;
726
727 address += APERTURE_RANGE_SIZE / 64;
728 }
729 }
730
731 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
732
733 /* Intialize the exclusion range if necessary */
734 if (iommu->exclusion_start &&
735 iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
736 iommu->exclusion_start < dma_dom->aperture_size) {
737 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
738 int pages = iommu_num_pages(iommu->exclusion_start,
739 iommu->exclusion_length,
740 PAGE_SIZE);
741 dma_ops_reserve_addresses(dma_dom, startpage, pages);
742 }
743
744 /*
745 * Check for areas already mapped as present in the new aperture
746 * range and mark those pages as reserved in the allocator. Such
747 * mappings may already exist as a result of requested unity
748 * mappings for devices.
749 */
750 for (i = dma_dom->aperture[index]->offset;
751 i < dma_dom->aperture_size;
752 i += PAGE_SIZE) {
753 u64 *pte = fetch_pte(&dma_dom->domain, i);
754 if (!pte || !IOMMU_PTE_PRESENT(*pte))
755 continue;
756
757 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
758 }
759
760 return 0;
761
762out_free:
763 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
764
765 kfree(dma_dom->aperture[index]);
766 dma_dom->aperture[index] = NULL;
767
768 return -ENOMEM;
769}
770
771static unsigned long dma_ops_area_alloc(struct device *dev,
772 struct dma_ops_domain *dom,
773 unsigned int pages,
774 unsigned long align_mask,
775 u64 dma_mask,
776 unsigned long start)
777{
778 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
779 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
780 int i = start >> APERTURE_RANGE_SHIFT;
781 unsigned long boundary_size;
782 unsigned long address = -1;
783 unsigned long limit;
784
785 next_bit >>= PAGE_SHIFT;
786
787 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
788 PAGE_SIZE) >> PAGE_SHIFT;
789
790 for (;i < max_index; ++i) {
791 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
792
793 if (dom->aperture[i]->offset >= dma_mask)
794 break;
795
796 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
797 dma_mask >> PAGE_SHIFT);
798
799 address = iommu_area_alloc(dom->aperture[i]->bitmap,
800 limit, next_bit, pages, 0,
801 boundary_size, align_mask);
802 if (address != -1) {
803 address = dom->aperture[i]->offset +
804 (address << PAGE_SHIFT);
805 dom->next_address = address + (pages << PAGE_SHIFT);
806 break;
807 }
808
809 next_bit = 0;
810 }
811
812 return address;
813}
814
639static unsigned long dma_ops_alloc_addresses(struct device *dev, 815static unsigned long dma_ops_alloc_addresses(struct device *dev,
640 struct dma_ops_domain *dom, 816 struct dma_ops_domain *dom,
641 unsigned int pages, 817 unsigned int pages,
642 unsigned long align_mask, 818 unsigned long align_mask,
643 u64 dma_mask) 819 u64 dma_mask)
644{ 820{
645 unsigned long limit;
646 unsigned long address; 821 unsigned long address;
647 unsigned long boundary_size;
648 822
649 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 823#ifdef CONFIG_IOMMU_STRESS
650 PAGE_SIZE) >> PAGE_SHIFT; 824 dom->next_address = 0;
651 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, 825 dom->need_flush = true;
652 dma_mask >> PAGE_SHIFT); 826#endif
653 827
654 if (dom->next_bit >= limit) { 828 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
655 dom->next_bit = 0; 829 dma_mask, dom->next_address);
656 dom->need_flush = true;
657 }
658 830
659 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
660 0 , boundary_size, align_mask);
661 if (address == -1) { 831 if (address == -1) {
662 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 832 dom->next_address = 0;
663 0, boundary_size, align_mask); 833 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
834 dma_mask, 0);
664 dom->need_flush = true; 835 dom->need_flush = true;
665 } 836 }
666 837
667 if (likely(address != -1)) { 838 if (unlikely(address == -1))
668 dom->next_bit = address + pages;
669 address <<= PAGE_SHIFT;
670 } else
671 address = bad_dma_address; 839 address = bad_dma_address;
672 840
673 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 841 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
684 unsigned long address, 852 unsigned long address,
685 unsigned int pages) 853 unsigned int pages)
686{ 854{
687 address >>= PAGE_SHIFT; 855 unsigned i = address >> APERTURE_RANGE_SHIFT;
688 iommu_area_free(dom->bitmap, address, pages); 856 struct aperture_range *range = dom->aperture[i];
689 857
690 if (address >= dom->next_bit) 858 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
859
860#ifdef CONFIG_IOMMU_STRESS
861 if (i < 4)
862 return;
863#endif
864
865 if (address >= dom->next_address)
691 dom->need_flush = true; 866 dom->need_flush = true;
867
868 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
869
870 iommu_area_free(range->bitmap, address, pages);
871
692} 872}
693 873
694/**************************************************************************** 874/****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
736 unsigned long start_page, 916 unsigned long start_page,
737 unsigned int pages) 917 unsigned int pages)
738{ 918{
739 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; 919 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
740 920
741 if (start_page + pages > last_page) 921 if (start_page + pages > last_page)
742 pages = last_page - start_page; 922 pages = last_page - start_page;
743 923
744 iommu_area_reserve(dom->bitmap, start_page, pages); 924 for (i = start_page; i < start_page + pages; ++i) {
925 int index = i / APERTURE_RANGE_PAGES;
926 int page = i % APERTURE_RANGE_PAGES;
927 __set_bit(page, dom->aperture[index]->bitmap);
928 }
745} 929}
746 930
747static void free_pagetable(struct protection_domain *domain) 931static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
780 */ 964 */
781static void dma_ops_domain_free(struct dma_ops_domain *dom) 965static void dma_ops_domain_free(struct dma_ops_domain *dom)
782{ 966{
967 int i;
968
783 if (!dom) 969 if (!dom)
784 return; 970 return;
785 971
786 free_pagetable(&dom->domain); 972 free_pagetable(&dom->domain);
787 973
788 kfree(dom->pte_pages); 974 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
789 975 if (!dom->aperture[i])
790 kfree(dom->bitmap); 976 continue;
977 free_page((unsigned long)dom->aperture[i]->bitmap);
978 kfree(dom->aperture[i]);
979 }
791 980
792 kfree(dom); 981 kfree(dom);
793} 982}
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
797 * It also intializes the page table and the address allocator data 986 * It also intializes the page table and the address allocator data
798 * structures required for the dma_ops interface 987 * structures required for the dma_ops interface
799 */ 988 */
800static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 989static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
801 unsigned order)
802{ 990{
803 struct dma_ops_domain *dma_dom; 991 struct dma_ops_domain *dma_dom;
804 unsigned i, num_pte_pages;
805 u64 *l2_pde;
806 u64 address;
807
808 /*
809 * Currently the DMA aperture must be between 32 MB and 1GB in size
810 */
811 if ((order < 25) || (order > 30))
812 return NULL;
813 992
814 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); 993 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
815 if (!dma_dom) 994 if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
826 dma_dom->domain.priv = dma_dom; 1005 dma_dom->domain.priv = dma_dom;
827 if (!dma_dom->domain.pt_root) 1006 if (!dma_dom->domain.pt_root)
828 goto free_dma_dom; 1007 goto free_dma_dom;
829 dma_dom->aperture_size = (1ULL << order);
830 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
831 GFP_KERNEL);
832 if (!dma_dom->bitmap)
833 goto free_dma_dom;
834 /*
835 * mark the first page as allocated so we never return 0 as
836 * a valid dma-address. So we can use 0 as error value
837 */
838 dma_dom->bitmap[0] = 1;
839 dma_dom->next_bit = 0;
840 1008
841 dma_dom->need_flush = false; 1009 dma_dom->need_flush = false;
842 dma_dom->target_dev = 0xffff; 1010 dma_dom->target_dev = 0xffff;
843 1011
844 /* Intialize the exclusion range if necessary */ 1012 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
845 if (iommu->exclusion_start && 1013 goto free_dma_dom;
846 iommu->exclusion_start < dma_dom->aperture_size) {
847 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
848 int pages = iommu_num_pages(iommu->exclusion_start,
849 iommu->exclusion_length,
850 PAGE_SIZE);
851 dma_ops_reserve_addresses(dma_dom, startpage, pages);
852 }
853 1014
854 /* 1015 /*
855 * At the last step, build the page tables so we don't need to 1016 * mark the first page as allocated so we never return 0 as
856 * allocate page table pages in the dma_ops mapping/unmapping 1017 * a valid dma-address. So we can use 0 as error value
857 * path.
858 */ 1018 */
859 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 1019 dma_dom->aperture[0]->bitmap[0] = 1;
860 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 1020 dma_dom->next_address = 0;
861 GFP_KERNEL);
862 if (!dma_dom->pte_pages)
863 goto free_dma_dom;
864
865 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
866 if (l2_pde == NULL)
867 goto free_dma_dom;
868 1021
869 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
870
871 for (i = 0; i < num_pte_pages; ++i) {
872 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
873 if (!dma_dom->pte_pages[i])
874 goto free_dma_dom;
875 address = virt_to_phys(dma_dom->pte_pages[i]);
876 l2_pde[i] = IOMMU_L1_PDE(address);
877 }
878 1022
879 return dma_dom; 1023 return dma_dom;
880 1024
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
983 struct protection_domain *domain; 1127 struct protection_domain *domain;
984 struct dma_ops_domain *dma_domain; 1128 struct dma_ops_domain *dma_domain;
985 struct amd_iommu *iommu; 1129 struct amd_iommu *iommu;
986 int order = amd_iommu_aperture_order;
987 unsigned long flags; 1130 unsigned long flags;
988 1131
989 if (devid > amd_iommu_last_bdf) 1132 if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
1002 "to a non-dma-ops domain\n", dev_name(dev)); 1145 "to a non-dma-ops domain\n", dev_name(dev));
1003 1146
1004 switch (action) { 1147 switch (action) {
1005 case BUS_NOTIFY_BOUND_DRIVER: 1148 case BUS_NOTIFY_UNBOUND_DRIVER:
1006 if (domain)
1007 goto out;
1008 dma_domain = find_protection_domain(devid);
1009 if (!dma_domain)
1010 dma_domain = iommu->default_dom;
1011 attach_device(iommu, &dma_domain->domain, devid);
1012 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1013 "device %s\n", dma_domain->domain.id, dev_name(dev));
1014 break;
1015 case BUS_NOTIFY_UNBIND_DRIVER:
1016 if (!domain) 1149 if (!domain)
1017 goto out; 1150 goto out;
1018 detach_device(domain, devid); 1151 detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
1022 dma_domain = find_protection_domain(devid); 1155 dma_domain = find_protection_domain(devid);
1023 if (dma_domain) 1156 if (dma_domain)
1024 goto out; 1157 goto out;
1025 dma_domain = dma_ops_domain_alloc(iommu, order); 1158 dma_domain = dma_ops_domain_alloc(iommu);
1026 if (!dma_domain) 1159 if (!dma_domain)
1027 goto out; 1160 goto out;
1028 dma_domain->target_dev = devid; 1161 dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
1133 dma_dom = (*iommu)->default_dom; 1266 dma_dom = (*iommu)->default_dom;
1134 *domain = &dma_dom->domain; 1267 *domain = &dma_dom->domain;
1135 attach_device(*iommu, *domain, *bdf); 1268 attach_device(*iommu, *domain, *bdf);
1136 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1269 DUMP_printk("Using protection domain %d for device %s\n",
1137 "device %s\n", (*domain)->id, dev_name(dev)); 1270 (*domain)->id, dev_name(dev));
1138 } 1271 }
1139 1272
1140 if (domain_for_device(_bdf) == NULL) 1273 if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev,
1144} 1277}
1145 1278
1146/* 1279/*
1280 * If the pte_page is not yet allocated this function is called
1281 */
1282static u64* alloc_pte(struct protection_domain *dom,
1283 unsigned long address, u64 **pte_page, gfp_t gfp)
1284{
1285 u64 *pte, *page;
1286
1287 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
1288
1289 if (!IOMMU_PTE_PRESENT(*pte)) {
1290 page = (u64 *)get_zeroed_page(gfp);
1291 if (!page)
1292 return NULL;
1293 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1294 }
1295
1296 pte = IOMMU_PTE_PAGE(*pte);
1297 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
1298
1299 if (!IOMMU_PTE_PRESENT(*pte)) {
1300 page = (u64 *)get_zeroed_page(gfp);
1301 if (!page)
1302 return NULL;
1303 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1304 }
1305
1306 pte = IOMMU_PTE_PAGE(*pte);
1307
1308 if (pte_page)
1309 *pte_page = pte;
1310
1311 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
1312
1313 return pte;
1314}
1315
1316/*
1317 * This function fetches the PTE for a given address in the aperture
1318 */
1319static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1320 unsigned long address)
1321{
1322 struct aperture_range *aperture;
1323 u64 *pte, *pte_page;
1324
1325 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1326 if (!aperture)
1327 return NULL;
1328
1329 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1330 if (!pte) {
1331 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
1332 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1333 } else
1334 pte += IOMMU_PTE_L0_INDEX(address);
1335
1336 return pte;
1337}
1338
1339/*
1147 * This is the generic map function. It maps one 4kb page at paddr to 1340 * This is the generic map function. It maps one 4kb page at paddr to
1148 * the given address in the DMA address space for the domain. 1341 * the given address in the DMA address space for the domain.
1149 */ 1342 */
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1159 1352
1160 paddr &= PAGE_MASK; 1353 paddr &= PAGE_MASK;
1161 1354
1162 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1355 pte = dma_ops_get_pte(dom, address);
1163 pte += IOMMU_PTE_L0_INDEX(address); 1356 if (!pte)
1357 return bad_dma_address;
1164 1358
1165 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1359 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1166 1360
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1185 struct dma_ops_domain *dom, 1379 struct dma_ops_domain *dom,
1186 unsigned long address) 1380 unsigned long address)
1187{ 1381{
1382 struct aperture_range *aperture;
1188 u64 *pte; 1383 u64 *pte;
1189 1384
1190 if (address >= dom->aperture_size) 1385 if (address >= dom->aperture_size)
1191 return; 1386 return;
1192 1387
1193 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); 1388 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1389 if (!aperture)
1390 return;
1391
1392 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1393 if (!pte)
1394 return;
1194 1395
1195 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
1196 pte += IOMMU_PTE_L0_INDEX(address); 1396 pte += IOMMU_PTE_L0_INDEX(address);
1197 1397
1198 WARN_ON(!*pte); 1398 WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
1216 u64 dma_mask) 1416 u64 dma_mask)
1217{ 1417{
1218 dma_addr_t offset = paddr & ~PAGE_MASK; 1418 dma_addr_t offset = paddr & ~PAGE_MASK;
1219 dma_addr_t address, start; 1419 dma_addr_t address, start, ret;
1220 unsigned int pages; 1420 unsigned int pages;
1221 unsigned long align_mask = 0; 1421 unsigned long align_mask = 0;
1222 int i; 1422 int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
1232 if (align) 1432 if (align)
1233 align_mask = (1UL << get_order(size)) - 1; 1433 align_mask = (1UL << get_order(size)) - 1;
1234 1434
1435retry:
1235 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1436 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1236 dma_mask); 1437 dma_mask);
1237 if (unlikely(address == bad_dma_address)) 1438 if (unlikely(address == bad_dma_address)) {
1238 goto out; 1439 /*
1440 * setting next_address here will let the address
1441 * allocator only scan the new allocated range in the
1442 * first run. This is a small optimization.
1443 */
1444 dma_dom->next_address = dma_dom->aperture_size;
1445
1446 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
1447 goto out;
1448
1449 /*
1450 * aperture was sucessfully enlarged by 128 MB, try
1451 * allocation again
1452 */
1453 goto retry;
1454 }
1239 1455
1240 start = address; 1456 start = address;
1241 for (i = 0; i < pages; ++i) { 1457 for (i = 0; i < pages; ++i) {
1242 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1458 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
1459 if (ret == bad_dma_address)
1460 goto out_unmap;
1461
1243 paddr += PAGE_SIZE; 1462 paddr += PAGE_SIZE;
1244 start += PAGE_SIZE; 1463 start += PAGE_SIZE;
1245 } 1464 }
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
1255 1474
1256out: 1475out:
1257 return address; 1476 return address;
1477
1478out_unmap:
1479
1480 for (--i; i >= 0; --i) {
1481 start -= PAGE_SIZE;
1482 dma_ops_domain_unmap(iommu, dma_dom, start);
1483 }
1484
1485 dma_ops_free_addresses(dma_dom, address, pages);
1486
1487 return bad_dma_address;
1258} 1488}
1259 1489
1260/* 1490/*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
1537 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1767 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1538 size, DMA_BIDIRECTIONAL, true, dma_mask); 1768 size, DMA_BIDIRECTIONAL, true, dma_mask);
1539 1769
1540 if (*dma_addr == bad_dma_address) 1770 if (*dma_addr == bad_dma_address) {
1771 spin_unlock_irqrestore(&domain->lock, flags);
1541 goto out_free; 1772 goto out_free;
1773 }
1542 1774
1543 iommu_completion_wait(iommu); 1775 iommu_completion_wait(iommu);
1544 1776
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
1625 struct pci_dev *dev = NULL; 1857 struct pci_dev *dev = NULL;
1626 struct dma_ops_domain *dma_dom; 1858 struct dma_ops_domain *dma_dom;
1627 struct amd_iommu *iommu; 1859 struct amd_iommu *iommu;
1628 int order = amd_iommu_aperture_order;
1629 u16 devid; 1860 u16 devid;
1630 1861
1631 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1862 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
1638 iommu = amd_iommu_rlookup_table[devid]; 1869 iommu = amd_iommu_rlookup_table[devid];
1639 if (!iommu) 1870 if (!iommu)
1640 continue; 1871 continue;
1641 dma_dom = dma_ops_domain_alloc(iommu, order); 1872 dma_dom = dma_ops_domain_alloc(iommu);
1642 if (!dma_dom) 1873 if (!dma_dom)
1643 continue; 1874 continue;
1644 init_unity_mappings_for_device(dma_dom, devid); 1875 init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
1664int __init amd_iommu_init_dma_ops(void) 1895int __init amd_iommu_init_dma_ops(void)
1665{ 1896{
1666 struct amd_iommu *iommu; 1897 struct amd_iommu *iommu;
1667 int order = amd_iommu_aperture_order;
1668 int ret; 1898 int ret;
1669 1899
1670 /* 1900 /*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
1672 * found in the system. Devices not assigned to any other 1902 * found in the system. Devices not assigned to any other
1673 * protection domain will be assigned to the default one. 1903 * protection domain will be assigned to the default one.
1674 */ 1904 */
1675 list_for_each_entry(iommu, &amd_iommu_list, list) { 1905 for_each_iommu(iommu) {
1676 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1906 iommu->default_dom = dma_ops_domain_alloc(iommu);
1677 if (iommu->default_dom == NULL) 1907 if (iommu->default_dom == NULL)
1678 return -ENOMEM; 1908 return -ENOMEM;
1679 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 1909 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
1710 1940
1711free_domains: 1941free_domains:
1712 1942
1713 list_for_each_entry(iommu, &amd_iommu_list, list) { 1943 for_each_iommu(iommu) {
1714 if (iommu->default_dom) 1944 if (iommu->default_dom)
1715 dma_ops_domain_free(iommu->default_dom); 1945 dma_ops_domain_free(iommu->default_dom);
1716 } 1946 }
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
1842 2072
1843 old_domain = domain_for_device(devid); 2073 old_domain = domain_for_device(devid);
1844 if (old_domain) 2074 if (old_domain)
1845 return -EBUSY; 2075 detach_device(old_domain, devid);
1846 2076
1847 attach_device(iommu, domain, devid); 2077 attach_device(iommu, domain, devid);
1848 2078
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dac..238989ec077d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
115 u64 range_length; 115 u64 range_length;
116} __attribute__((packed)); 116} __attribute__((packed));
117 117
118bool amd_iommu_dump;
119
118static int __initdata amd_iommu_detected; 120static int __initdata amd_iommu_detected;
119 121
120u16 amd_iommu_last_bdf; /* largest PCI device id we have 122u16 amd_iommu_last_bdf; /* largest PCI device id we have
121 to handle */ 123 to handle */
122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
123 we find in ACPI */ 125 we find in ACPI */
124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
125bool amd_iommu_isolate = true; /* if true, device isolation is 129bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */ 130 enabled */
131#endif
132
127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
128 134
129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
175static inline unsigned long tbl_size(int entry_size) 181static inline unsigned long tbl_size(int entry_size)
176{ 182{
177 unsigned shift = PAGE_SHIFT + 183 unsigned shift = PAGE_SHIFT +
178 get_order(amd_iommu_last_bdf * entry_size); 184 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
179 185
180 return 1UL << shift; 186 return 1UL << shift;
181} 187}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
193 * This function set the exclusion range in the IOMMU. DMA accesses to the 199 * This function set the exclusion range in the IOMMU. DMA accesses to the
194 * exclusion range are passed through untranslated 200 * exclusion range are passed through untranslated
195 */ 201 */
196static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 202static void iommu_set_exclusion_range(struct amd_iommu *iommu)
197{ 203{
198 u64 start = iommu->exclusion_start & PAGE_MASK; 204 u64 start = iommu->exclusion_start & PAGE_MASK;
199 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; 205 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
225} 231}
226 232
227/* Generic functions to enable/disable certain features of the IOMMU. */ 233/* Generic functions to enable/disable certain features of the IOMMU. */
228static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 234static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
229{ 235{
230 u32 ctrl; 236 u32 ctrl;
231 237
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244} 250}
245 251
246/* Function to enable the hardware */ 252/* Function to enable the hardware */
247static void __init iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
248{ 254{
249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
250 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
253} 259}
254 260
255/* Function to enable IOMMU event logging and event interrupts */ 261static void iommu_disable(struct amd_iommu *iommu)
256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
257{ 262{
258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 263 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
260} 264}
261 265
262/* 266/*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
413{ 417{
414 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 418 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
415 get_order(CMD_BUFFER_SIZE)); 419 get_order(CMD_BUFFER_SIZE));
416 u64 entry;
417 420
418 if (cmd_buf == NULL) 421 if (cmd_buf == NULL)
419 return NULL; 422 return NULL;
420 423
421 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 424 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
422 425
423 entry = (u64)virt_to_phys(cmd_buf); 426 return cmd_buf;
427}
428
429/*
430 * This function writes the command buffer address to the hardware and
431 * enables it.
432 */
433static void iommu_enable_command_buffer(struct amd_iommu *iommu)
434{
435 u64 entry;
436
437 BUG_ON(iommu->cmd_buf == NULL);
438
439 entry = (u64)virt_to_phys(iommu->cmd_buf);
424 entry |= MMIO_CMD_SIZE_512; 440 entry |= MMIO_CMD_SIZE_512;
441
425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 442 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
426 &entry, sizeof(entry)); 443 &entry, sizeof(entry));
427 444
428 /* set head and tail to zero manually */ 445 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 446 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 447 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431 448
432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 449 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
433
434 return cmd_buf;
435} 450}
436 451
437static void __init free_command_buffer(struct amd_iommu *iommu) 452static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
443/* allocates the memory where the IOMMU will log its events to */ 458/* allocates the memory where the IOMMU will log its events to */
444static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) 459static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
445{ 460{
446 u64 entry;
447 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 461 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
448 get_order(EVT_BUFFER_SIZE)); 462 get_order(EVT_BUFFER_SIZE));
449 463
450 if (iommu->evt_buf == NULL) 464 if (iommu->evt_buf == NULL)
451 return NULL; 465 return NULL;
452 466
467 return iommu->evt_buf;
468}
469
470static void iommu_enable_event_buffer(struct amd_iommu *iommu)
471{
472 u64 entry;
473
474 BUG_ON(iommu->evt_buf == NULL);
475
453 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 476 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
477
454 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 478 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
455 &entry, sizeof(entry)); 479 &entry, sizeof(entry));
456 480
457 iommu->evt_buf_size = EVT_BUFFER_SIZE; 481 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
458
459 return iommu->evt_buf;
460} 482}
461 483
462static void __init free_event_buffer(struct amd_iommu *iommu) 484static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
596 p += sizeof(struct ivhd_header); 618 p += sizeof(struct ivhd_header);
597 end += h->length; 619 end += h->length;
598 620
621
599 while (p < end) { 622 while (p < end) {
600 e = (struct ivhd_entry *)p; 623 e = (struct ivhd_entry *)p;
601 switch (e->type) { 624 switch (e->type) {
602 case IVHD_DEV_ALL: 625 case IVHD_DEV_ALL:
626
627 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
628 " last device %02x:%02x.%x flags: %02x\n",
629 PCI_BUS(iommu->first_device),
630 PCI_SLOT(iommu->first_device),
631 PCI_FUNC(iommu->first_device),
632 PCI_BUS(iommu->last_device),
633 PCI_SLOT(iommu->last_device),
634 PCI_FUNC(iommu->last_device),
635 e->flags);
636
603 for (dev_i = iommu->first_device; 637 for (dev_i = iommu->first_device;
604 dev_i <= iommu->last_device; ++dev_i) 638 dev_i <= iommu->last_device; ++dev_i)
605 set_dev_entry_from_acpi(iommu, dev_i, 639 set_dev_entry_from_acpi(iommu, dev_i,
606 e->flags, 0); 640 e->flags, 0);
607 break; 641 break;
608 case IVHD_DEV_SELECT: 642 case IVHD_DEV_SELECT:
643
644 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
645 "flags: %02x\n",
646 PCI_BUS(e->devid),
647 PCI_SLOT(e->devid),
648 PCI_FUNC(e->devid),
649 e->flags);
650
609 devid = e->devid; 651 devid = e->devid;
610 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 652 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
611 break; 653 break;
612 case IVHD_DEV_SELECT_RANGE_START: 654 case IVHD_DEV_SELECT_RANGE_START:
655
656 DUMP_printk(" DEV_SELECT_RANGE_START\t "
657 "devid: %02x:%02x.%x flags: %02x\n",
658 PCI_BUS(e->devid),
659 PCI_SLOT(e->devid),
660 PCI_FUNC(e->devid),
661 e->flags);
662
613 devid_start = e->devid; 663 devid_start = e->devid;
614 flags = e->flags; 664 flags = e->flags;
615 ext_flags = 0; 665 ext_flags = 0;
616 alias = false; 666 alias = false;
617 break; 667 break;
618 case IVHD_DEV_ALIAS: 668 case IVHD_DEV_ALIAS:
669
670 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
671 "flags: %02x devid_to: %02x:%02x.%x\n",
672 PCI_BUS(e->devid),
673 PCI_SLOT(e->devid),
674 PCI_FUNC(e->devid),
675 e->flags,
676 PCI_BUS(e->ext >> 8),
677 PCI_SLOT(e->ext >> 8),
678 PCI_FUNC(e->ext >> 8));
679
619 devid = e->devid; 680 devid = e->devid;
620 devid_to = e->ext >> 8; 681 devid_to = e->ext >> 8;
621 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 682 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
622 amd_iommu_alias_table[devid] = devid_to; 683 amd_iommu_alias_table[devid] = devid_to;
623 break; 684 break;
624 case IVHD_DEV_ALIAS_RANGE: 685 case IVHD_DEV_ALIAS_RANGE:
686
687 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
688 "devid: %02x:%02x.%x flags: %02x "
689 "devid_to: %02x:%02x.%x\n",
690 PCI_BUS(e->devid),
691 PCI_SLOT(e->devid),
692 PCI_FUNC(e->devid),
693 e->flags,
694 PCI_BUS(e->ext >> 8),
695 PCI_SLOT(e->ext >> 8),
696 PCI_FUNC(e->ext >> 8));
697
625 devid_start = e->devid; 698 devid_start = e->devid;
626 flags = e->flags; 699 flags = e->flags;
627 devid_to = e->ext >> 8; 700 devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
629 alias = true; 702 alias = true;
630 break; 703 break;
631 case IVHD_DEV_EXT_SELECT: 704 case IVHD_DEV_EXT_SELECT:
705
706 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
707 "flags: %02x ext: %08x\n",
708 PCI_BUS(e->devid),
709 PCI_SLOT(e->devid),
710 PCI_FUNC(e->devid),
711 e->flags, e->ext);
712
632 devid = e->devid; 713 devid = e->devid;
633 set_dev_entry_from_acpi(iommu, devid, e->flags, 714 set_dev_entry_from_acpi(iommu, devid, e->flags,
634 e->ext); 715 e->ext);
635 break; 716 break;
636 case IVHD_DEV_EXT_SELECT_RANGE: 717 case IVHD_DEV_EXT_SELECT_RANGE:
718
719 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
720 "%02x:%02x.%x flags: %02x ext: %08x\n",
721 PCI_BUS(e->devid),
722 PCI_SLOT(e->devid),
723 PCI_FUNC(e->devid),
724 e->flags, e->ext);
725
637 devid_start = e->devid; 726 devid_start = e->devid;
638 flags = e->flags; 727 flags = e->flags;
639 ext_flags = e->ext; 728 ext_flags = e->ext;
640 alias = false; 729 alias = false;
641 break; 730 break;
642 case IVHD_DEV_RANGE_END: 731 case IVHD_DEV_RANGE_END:
732
733 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
734 PCI_BUS(e->devid),
735 PCI_SLOT(e->devid),
736 PCI_FUNC(e->devid));
737
643 devid = e->devid; 738 devid = e->devid;
644 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 739 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
645 if (alias) 740 if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
679{ 774{
680 struct amd_iommu *iommu, *next; 775 struct amd_iommu *iommu, *next;
681 776
682 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { 777 for_each_iommu_safe(iommu, next) {
683 list_del(&iommu->list); 778 list_del(&iommu->list);
684 free_iommu_one(iommu); 779 free_iommu_one(iommu);
685 kfree(iommu); 780 kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
710 if (!iommu->mmio_base) 805 if (!iommu->mmio_base)
711 return -ENOMEM; 806 return -ENOMEM;
712 807
713 iommu_set_device_table(iommu);
714 iommu->cmd_buf = alloc_command_buffer(iommu); 808 iommu->cmd_buf = alloc_command_buffer(iommu);
715 if (!iommu->cmd_buf) 809 if (!iommu->cmd_buf)
716 return -ENOMEM; 810 return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
746 h = (struct ivhd_header *)p; 840 h = (struct ivhd_header *)p;
747 switch (*p) { 841 switch (*p) {
748 case ACPI_IVHD_TYPE: 842 case ACPI_IVHD_TYPE:
843
844 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
845 "seg: %d flags: %01x info %04x\n",
846 PCI_BUS(h->devid), PCI_SLOT(h->devid),
847 PCI_FUNC(h->devid), h->cap_ptr,
848 h->pci_seg, h->flags, h->info);
849 DUMP_printk(" mmio-addr: %016llx\n",
850 h->mmio_phys);
851
749 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 852 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
750 if (iommu == NULL) 853 if (iommu == NULL)
751 return -ENOMEM; 854 return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
773 * 876 *
774 ****************************************************************************/ 877 ****************************************************************************/
775 878
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu) 879static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{ 880{
818 int r; 881 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826 882
827 if (pci_enable_msi(iommu->dev)) 883 if (pci_enable_msi(iommu->dev))
828 return 1; 884 return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
837 return 1; 893 return 1;
838 } 894 }
839 895
896 iommu->int_enabled = true;
897 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
898
840 return 0; 899 return 0;
841} 900}
842 901
843static int __init iommu_init_msi(struct amd_iommu *iommu) 902static int iommu_init_msi(struct amd_iommu *iommu)
844{ 903{
845 if (iommu->int_enabled) 904 if (iommu->int_enabled)
846 return 0; 905 return 0;
847 906
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) 907 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu); 908 return iommu_setup_msi(iommu);
852 909
853 return 1; 910 return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
899static int __init init_unity_map_range(struct ivmd_header *m) 956static int __init init_unity_map_range(struct ivmd_header *m)
900{ 957{
901 struct unity_map_entry *e = 0; 958 struct unity_map_entry *e = 0;
959 char *s;
902 960
903 e = kzalloc(sizeof(*e), GFP_KERNEL); 961 e = kzalloc(sizeof(*e), GFP_KERNEL);
904 if (e == NULL) 962 if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
906 964
907 switch (m->type) { 965 switch (m->type) {
908 default: 966 default:
967 kfree(e);
968 return 0;
909 case ACPI_IVMD_TYPE: 969 case ACPI_IVMD_TYPE:
970 s = "IVMD_TYPEi\t\t\t";
910 e->devid_start = e->devid_end = m->devid; 971 e->devid_start = e->devid_end = m->devid;
911 break; 972 break;
912 case ACPI_IVMD_TYPE_ALL: 973 case ACPI_IVMD_TYPE_ALL:
974 s = "IVMD_TYPE_ALL\t\t";
913 e->devid_start = 0; 975 e->devid_start = 0;
914 e->devid_end = amd_iommu_last_bdf; 976 e->devid_end = amd_iommu_last_bdf;
915 break; 977 break;
916 case ACPI_IVMD_TYPE_RANGE: 978 case ACPI_IVMD_TYPE_RANGE:
979 s = "IVMD_TYPE_RANGE\t\t";
917 e->devid_start = m->devid; 980 e->devid_start = m->devid;
918 e->devid_end = m->aux; 981 e->devid_end = m->aux;
919 break; 982 break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
922 e->address_end = e->address_start + PAGE_ALIGN(m->range_length); 985 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
923 e->prot = m->flags >> 1; 986 e->prot = m->flags >> 1;
924 987
988 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
989 " range_start: %016llx range_end: %016llx flags: %x\n", s,
990 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
991 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
992 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
993 e->address_start, e->address_end, m->flags);
994
925 list_add_tail(&e->list, &amd_iommu_unity_map); 995 list_add_tail(&e->list, &amd_iommu_unity_map);
926 996
927 return 0; 997 return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
967 * This function finally enables all IOMMUs found in the system after 1037 * This function finally enables all IOMMUs found in the system after
968 * they have been initialized 1038 * they have been initialized
969 */ 1039 */
970static void __init enable_iommus(void) 1040static void enable_iommus(void)
971{ 1041{
972 struct amd_iommu *iommu; 1042 struct amd_iommu *iommu;
973 1043
974 list_for_each_entry(iommu, &amd_iommu_list, list) { 1044 for_each_iommu(iommu) {
1045 iommu_set_device_table(iommu);
1046 iommu_enable_command_buffer(iommu);
1047 iommu_enable_event_buffer(iommu);
975 iommu_set_exclusion_range(iommu); 1048 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu); 1049 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
978 iommu_enable(iommu); 1050 iommu_enable(iommu);
979 } 1051 }
980} 1052}
981 1053
1054static void disable_iommus(void)
1055{
1056 struct amd_iommu *iommu;
1057
1058 for_each_iommu(iommu)
1059 iommu_disable(iommu);
1060}
1061
982/* 1062/*
983 * Suspend/Resume support 1063 * Suspend/Resume support
984 * disable suspend until real resume implemented 1064 * disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
986 1066
987static int amd_iommu_resume(struct sys_device *dev) 1067static int amd_iommu_resume(struct sys_device *dev)
988{ 1068{
1069 /*
1070 * Disable IOMMUs before reprogramming the hardware registers.
1071 * IOMMU is still enabled from the resume kernel.
1072 */
1073 disable_iommus();
1074
1075 /* re-load the hardware */
1076 enable_iommus();
1077
1078 /*
1079 * we have to flush after the IOMMUs are enabled because a
1080 * disabled IOMMU will never execute the commands we send
1081 */
1082 amd_iommu_flush_all_domains();
1083 amd_iommu_flush_all_devices();
1084
989 return 0; 1085 return 0;
990} 1086}
991 1087
992static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1088static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
993{ 1089{
994 return -EINVAL; 1090 /* disable IOMMUs to go out of the way for BIOS */
1091 disable_iommus();
1092
1093 return 0;
995} 1094}
996 1095
997static struct sysdev_class amd_iommu_sysdev_class = { 1096static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
1137 1236
1138 enable_iommus(); 1237 enable_iommus();
1139 1238
1140 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
1141 (1 << (amd_iommu_aperture_order-20)));
1142
1143 printk(KERN_INFO "AMD IOMMU: device isolation "); 1239 printk(KERN_INFO "AMD IOMMU: device isolation ");
1144 if (amd_iommu_isolate) 1240 if (amd_iommu_isolate)
1145 printk("enabled\n"); 1241 printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
1211 * 1307 *
1212 ****************************************************************************/ 1308 ****************************************************************************/
1213 1309
1310static int __init parse_amd_iommu_dump(char *str)
1311{
1312 amd_iommu_dump = true;
1313
1314 return 1;
1315}
1316
1214static int __init parse_amd_iommu_options(char *str) 1317static int __init parse_amd_iommu_options(char *str)
1215{ 1318{
1216 for (; *str; ++str) { 1319 for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
1225 return 1; 1328 return 1;
1226} 1329}
1227 1330
1228static int __init parse_amd_iommu_size_options(char *str) 1331__setup("amd_iommu_dump", parse_amd_iommu_dump);
1229{
1230 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1231
1232 if ((order > 24) && (order < 31))
1233 amd_iommu_aperture_order = order;
1234
1235 return 1;
1236}
1237
1238__setup("amd_iommu=", parse_amd_iommu_options); 1332__setup("amd_iommu=", parse_amd_iommu_options);
1239__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ee75d2a9b9cd..8c7c042ecad1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
19#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
34#include <linux/smp.h> 35#include <linux/smp.h>
35#include <linux/mm.h> 36#include <linux/mm.h>
36 37
38#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
38#include <asm/atomic.h> 40#include <asm/atomic.h>
39#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -249,7 +251,7 @@ static void native_apic_write_dummy(u32 reg, u32 v)
249 251
250static u32 native_apic_read_dummy(u32 reg) 252static u32 native_apic_read_dummy(u32 reg)
251{ 253{
252 WARN_ON_ONCE((cpu_has_apic || !disable_apic)); 254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
253 return 0; 255 return 0;
254} 256}
255 257
@@ -1187,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1187 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1188 } 1190 }
1189#endif 1191#endif
1192 perf_counters_lapic_init();
1190 1193
1191 preempt_disable(); 1194 preempt_disable();
1192 1195
@@ -1609,6 +1612,13 @@ void __init init_apic_mappings(void)
1609 new_apicid = read_apic_id(); 1612 new_apicid = read_apic_id();
1610 if (boot_cpu_physical_apicid != new_apicid) { 1613 if (boot_cpu_physical_apicid != new_apicid) {
1611 boot_cpu_physical_apicid = new_apicid; 1614 boot_cpu_physical_apicid = new_apicid;
1615 /*
 1616 * Yes -- we lie about apic_version here
 1617 * in case the APIC was disabled via a boot option,
 1618 * but that is not a problem for a kernel built with SMP support,
 1619 * since smp_sanity_check() is prepared for such a case
 1620 * and disables SMP mode.
1621 */
1612 apic_version[new_apicid] = 1622 apic_version[new_apicid] =
1613 GET_APIC_VERSION(apic_read(APIC_LVR)); 1623 GET_APIC_VERSION(apic_read(APIC_LVR));
1614 } 1624 }
@@ -2027,7 +2037,7 @@ static int lapic_resume(struct sys_device *dev)
2027 unsigned int l, h; 2037 unsigned int l, h;
2028 unsigned long flags; 2038 unsigned long flags;
2029 int maxlvt; 2039 int maxlvt;
2030 int ret; 2040 int ret = 0;
2031 struct IO_APIC_route_entry **ioapic_entries = NULL; 2041 struct IO_APIC_route_entry **ioapic_entries = NULL;
2032 2042
2033 if (!apic_pm_state.active) 2043 if (!apic_pm_state.active)
@@ -2038,14 +2048,15 @@ static int lapic_resume(struct sys_device *dev)
2038 ioapic_entries = alloc_ioapic_entries(); 2048 ioapic_entries = alloc_ioapic_entries();
2039 if (!ioapic_entries) { 2049 if (!ioapic_entries) {
2040 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2050 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
2041 return -ENOMEM; 2051 ret = -ENOMEM;
2052 goto restore;
2042 } 2053 }
2043 2054
2044 ret = save_IO_APIC_setup(ioapic_entries); 2055 ret = save_IO_APIC_setup(ioapic_entries);
2045 if (ret) { 2056 if (ret) {
2046 WARN(1, "Saving IO-APIC state failed: %d\n", ret); 2057 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2047 free_ioapic_entries(ioapic_entries); 2058 free_ioapic_entries(ioapic_entries);
2048 return ret; 2059 goto restore;
2049 } 2060 }
2050 2061
2051 mask_IO_APIC_setup(ioapic_entries); 2062 mask_IO_APIC_setup(ioapic_entries);
@@ -2097,10 +2108,10 @@ static int lapic_resume(struct sys_device *dev)
2097 restore_IO_APIC_setup(ioapic_entries); 2108 restore_IO_APIC_setup(ioapic_entries);
2098 free_ioapic_entries(ioapic_entries); 2109 free_ioapic_entries(ioapic_entries);
2099 } 2110 }
2100 2111restore:
2101 local_irq_restore(flags); 2112 local_irq_restore(flags);
2102 2113
2103 return 0; 2114 return ret;
2104} 2115}
2105 2116
2106/* 2117/*
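The lapic_resume() hunk above replaces the early-error returns with a single restore: exit so that local_irq_restore() always runs and the (now pre-initialised) ret value is propagated; the same file also flips the inverted WARN_ON_ONCE() condition from || to &&. A minimal stand-alone sketch of that goto-cleanup pattern, using made-up resource names rather than the real APIC helpers, might look like this:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helpers standing in for irq save/restore and the IO-APIC work. */
static void irq_save(void)    { puts("irqs saved/disabled"); }
static void irq_restore(void) { puts("irqs restored"); }

static int do_resume(int fail_alloc)
{
	int ret = 0;                    /* initialise so 'goto restore' is safe */
	void *entries = NULL;

	irq_save();

	entries = fail_alloc ? NULL : malloc(64);
	if (!entries) {
		ret = -12;              /* -ENOMEM */
		goto restore;           /* irqs are still restored on this path */
	}

	/* ... save, mask and reprogram state here ... */
	free(entries);
restore:
	irq_restore();
	return ret;
}

int main(void)
{
	printf("ok path: %d\n", do_resume(0));
	printf("error path: %d\n", do_resume(1));
	return 0;
}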
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac7f3b6ad583..94605e7f6a54 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
59#include <asm/setup.h> 59#include <asm/setup.h>
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h>
62#include <asm/uv/uv_hub.h> 63#include <asm/uv/uv_hub.h>
63#include <asm/uv/uv_irq.h> 64#include <asm/uv/uv_irq.h>
64 65
@@ -176,16 +177,18 @@ int __init arch_early_irq_init(void)
176 struct irq_cfg *cfg; 177 struct irq_cfg *cfg;
177 struct irq_desc *desc; 178 struct irq_desc *desc;
178 int count; 179 int count;
180 int node;
179 int i; 181 int i;
180 182
181 cfg = irq_cfgx; 183 cfg = irq_cfgx;
182 count = ARRAY_SIZE(irq_cfgx); 184 count = ARRAY_SIZE(irq_cfgx);
 185	node = cpu_to_node(boot_cpu_id);
183 186
184 for (i = 0; i < count; i++) { 187 for (i = 0; i < count; i++) {
185 desc = irq_to_desc(i); 188 desc = irq_to_desc(i);
186 desc->chip_data = &cfg[i]; 189 desc->chip_data = &cfg[i];
187 alloc_bootmem_cpumask_var(&cfg[i].domain); 190 alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
188 alloc_bootmem_cpumask_var(&cfg[i].old_domain); 191 alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
189 if (i < NR_IRQS_LEGACY) 192 if (i < NR_IRQS_LEGACY)
190 cpumask_setall(cfg[i].domain); 193 cpumask_setall(cfg[i].domain);
191 } 194 }
@@ -4012,6 +4015,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4012 4015
4013 return apic_id; 4016 return apic_id;
4014} 4017}
4018#endif
4015 4019
4016int __init io_apic_get_version(int ioapic) 4020int __init io_apic_get_version(int ioapic)
4017{ 4021{
@@ -4024,7 +4028,6 @@ int __init io_apic_get_version(int ioapic)
4024 4028
4025 return reg_01.bits.version; 4029 return reg_01.bits.version;
4026} 4030}
4027#endif
4028 4031
4029int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4032int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4030{ 4033{
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index c4762276c17e..b3025b43b63a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
104} 104}
105#endif 105#endif
106 106
107static void report_broken_nmi(int cpu, int *prev_nmi_count) 107static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
108{ 108{
109 printk(KERN_CONT "\n"); 109 printk(KERN_CONT "\n");
110 110
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e4..440a8bccd91a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
160extern struct apic apic_bigsmp; 160extern struct apic apic_bigsmp;
161extern struct apic apic_es7000; 161extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster; 162extern struct apic apic_es7000_cluster;
163extern struct apic apic_default;
164 163
165struct apic *apic = &apic_default; 164struct apic *apic = &apic_default;
166EXPORT_SYMBOL_GPL(apic); 165EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d17..8e4cbb255c38 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
10#include <asm/apic.h> 10#include <asm/apic.h>
11#include <asm/ipi.h> 11#include <asm/ipi.h>
12 12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14 14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 16{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 780a733a5e7a..ef0ae207a7c8 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
562 union uvh_node_id_u node_id; 562 union uvh_node_id_u node_id;
563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
565 int max_pnode = 0; 565 int gnode_extra, max_pnode = 0;
566 unsigned long mmr_base, present, paddr; 566 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 567 unsigned short pnode_mask;
568 568
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
574 mmr_base = 574 mmr_base =
575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
576 ~UV_MMR_ENABLE; 576 ~UV_MMR_ENABLE;
577 pnode_mask = (1 << n_val) - 1;
578 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
579 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
580 gnode_upper = ((unsigned long)gnode_extra << m_val);
581 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
582 n_val, m_val, gnode_upper, gnode_extra);
583
577 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 584 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
578 585
579 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 586 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -610,11 +617,6 @@ void __init uv_system_init(void)
610 } 617 }
611 } 618 }
612 619
613 pnode_mask = (1 << n_val) - 1;
614 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
615 gnode_upper = (((unsigned long)node_id.s.node_id) &
616 ~((1 << n_val) - 1)) << m_val;
617
618 uv_bios_init(); 620 uv_bios_init();
619 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 621 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
620 &sn_coherency_id, &sn_region_size); 622 &sn_coherency_id, &sn_region_size);
@@ -637,6 +639,7 @@ void __init uv_system_init(void)
637 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 639 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
638 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 640 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
639 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 641 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
642 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
640 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 643 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
641 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 644 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
642 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 645 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
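The uv_system_init() change above splits the old single-expression gnode_upper computation into an explicit gnode_extra (the node_id bits above the pnode field, shifted right by one), which is then shifted up by m_val, and publishes both values per CPU. A small worked example, using invented register values rather than a real UVH_NODE_ID read, shows the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned int node_id = 0x68;   /* hypothetical UVH_NODE_ID.node_id field */
	int n_val = 5, m_val = 26;     /* example pnode/offset widths */

	unsigned int pnode_mask   = (1 << n_val) - 1;               /* 0x1f */
	unsigned int gnode_extra  = (node_id & ~pnode_mask) >> 1;   /* 0x30 */
	unsigned long gnode_upper = (unsigned long)gnode_extra << m_val;

	printf("pnode_mask  = 0x%x\n", pnode_mask);
	printf("gnode_extra = 0x%x\n", gnode_extra);
	printf("gnode_upper = 0x%lx\n", gnode_upper);
	return 0;
}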
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162f..1a830cbd7015 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -146,4 +146,5 @@ void foo(void)
146 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 146 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
148 OFFSET(BP_version, boot_params, hdr.version); 148 OFFSET(BP_version, boot_params, hdr.version);
149 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 150}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b5..898ecc47e129 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
125 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 125 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
127 OFFSET(BP_version, boot_params, hdr.version); 127 OFFSET(BP_version, boot_params, hdr.version);
128 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
128 129
129 BLANK(); 130 BLANK();
130 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 131 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 728b3750a3e8..e5b27d8f1b47 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
9#include <asm/pci-direct.h>
9 10
10#ifdef CONFIG_X86_64 11#ifdef CONFIG_X86_64
11# include <asm/numa_64.h> 12# include <asm/numa_64.h>
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
351 (c->x86_model == 8 && c->x86_mask >= 8)) 352 (c->x86_model == 8 && c->x86_mask >= 8))
352 set_cpu_cap(c, X86_FEATURE_K6_MTRR); 353 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
353#endif 354#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) {
358 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
361 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
362 }
363#endif
354} 364}
355 365
356static void __cpuinit init_amd(struct cpuinfo_x86 *c) 366static void __cpuinit init_amd(struct cpuinfo_x86 *c)
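The early_init_amd() hunk above probes the northbridge's PCI configuration space (bus 0, device 0x18, function 0, dword at offset 0x68) and sets X86_FEATURE_EXTD_APICID only when bits 17 and 18 are both set. The both-bits test is easy to get wrong with a plain &, so here is a tiny stand-alone illustration of the mask comparison used in the patch (the sample register values are invented):

#include <stdio.h>

#define EXTD_APICID_BITS ((1u << 17) | (1u << 18))

static int has_extended_apicid(unsigned int val)
{
	/* true only when *both* bits are set, not merely one of them */
	return (val & EXTD_APICID_BITS) == EXTD_APICID_BITS;
}

int main(void)
{
	printf("%d\n", has_extended_apicid(0x00060000));  /* both bits  -> 1 */
	printf("%d\n", has_extended_apicid(0x00020000));  /* bit 17 only -> 0 */
	printf("%d\n", has_extended_apicid(0x00000000));  /* neither     -> 0 */
	return 0;
}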
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b0517aa2bd3b..3ffdcfa9abdf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -874,6 +875,7 @@ void __init identify_boot_cpu(void)
874#else 875#else
875 vgetcpu_set_mode(); 876 vgetcpu_set_mode();
876#endif 877#endif
878 init_hw_perf_counters();
877} 879}
878 880
879void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 881void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 2fc4f6bb9ca5..6b2a52dd0403 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38 36
39static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
40 38
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
80 { "value", CPU_REG_ALL, 1 }, 78 { "value", CPU_REG_ALL, 1 },
81}; 79};
82 80
83/* Intel Registers Range */ 81/* CPU Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = { 82static struct cpu_debug_range cpu_reg_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, 83 { 0x00000000, 0x00000001, CPU_MC, },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, 84 { 0x00000006, 0x00000007, CPU_MONITOR, },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, 85 { 0x00000010, 0x00000010, CPU_TIME, },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, 86 { 0x00000011, 0x00000013, CPU_PMC, },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, 87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, 88 { 0x0000001B, 0x0000001B, CPU_APIC, },
91 89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, 90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, 91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, 92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, 93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
96 94 { 0x00000079, 0x00000079, CPU_BIOS, },
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, 95 { 0x00000088, 0x0000008A, CPU_CACHE, },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, 96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, 97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, 98 { 0x000000C1, 0x000000C4, CPU_PMC, },
101 99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, 100 { 0x000000E7, 0x000000E8, CPU_PERF, },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, 101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, 102
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, 103 { 0x00000116, 0x0000011E, CPU_CACHE, },
106 104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, 105 { 0x00000179, 0x0000017B, CPU_MC, },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, 106 { 0x00000186, 0x00000189, CPU_PMC, },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, 107 { 0x00000198, 0x00000199, CPU_PERF, },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, 108 { 0x0000019A, 0x0000019A, CPU_TIME, },
111 109 { 0x0000019B, 0x0000019D, CPU_THERM, },
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, 110 { 0x000001A0, 0x000001A0, CPU_MISC, },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, 111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, 112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, 113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, 114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
117 115
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, 116 { 0x00000200, 0x0000020F, CPU_MTRR, },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, 117 { 0x00000250, 0x00000250, CPU_MTRR, },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, 118 { 0x00000258, 0x00000259, CPU_MTRR, },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, 119 { 0x00000268, 0x0000026F, CPU_MTRR, },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, 120 { 0x00000277, 0x00000277, CPU_PAT, },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, 121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, 122
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, 123 { 0x00000300, 0x00000311, CPU_PMC, },
126 124 { 0x00000345, 0x00000345, CPU_PMC, },
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, 125 { 0x00000360, 0x00000371, CPU_PMC, },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, 126 { 0x0000038D, 0x00000390, CPU_PMC, },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, 127 { 0x000003A0, 0x000003BE, CPU_PMC, },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, 128 { 0x000003C0, 0x000003CD, CPU_PMC, },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, 129 { 0x000003E0, 0x000003E1, CPU_PMC, },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, 130 { 0x000003F0, 0x000003F2, CPU_PMC, },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, 131
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, 132 { 0x00000400, 0x00000417, CPU_MC, },
135 133 { 0x00000480, 0x0000048B, CPU_VMX, },
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, 134
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, 135 { 0x00000600, 0x00000600, CPU_DEBUG, },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, 136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, 137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, 138
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, 139 { 0x000107CC, 0x000107D3, CPU_PMC, },
142 140
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, 141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, 142 { 0xC0000081, 0xC0000084, CPU_CALL, },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, 143 { 0xC0000100, 0xC0000102, CPU_BASE, },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, 144 { 0xC0000103, 0xC0000103, CPU_TIME, },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, 145
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, 146 { 0xC0010000, 0xC0010007, CPU_PMC, },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, 147 { 0xC0010010, 0xC0010010, CPU_CONF, },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, 148 { 0xC0010015, 0xC0010015, CPU_CONF, },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, 149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, 150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, 151 { 0xC001001F, 0xC001001F, CPU_CONF, },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, 152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
155 153 { 0xC0010044, 0xC0010048, CPU_MC, },
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, 154 { 0xC0010050, 0xC0010056, CPU_SMM, },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, 155 { 0xC0010058, 0xC0010058, CPU_CONF, },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, 156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, 157 { 0xC0010061, 0xC0010068, CPU_SMM, },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, 158 { 0xC0010069, 0xC001006B, CPU_SMM, },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, 159 { 0xC0010070, 0xC0010071, CPU_SMM, },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, 160 { 0xC0010111, 0xC0010113, CPU_SMM, },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, 161 { 0xC0010114, 0xC0010118, CPU_SVM, },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, 162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, 163 { 0xC0011022, 0xC0011023, CPU_CONF, },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178}; 164};
179 165
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag) 166static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{ 167{
355 unsigned vendor, modelflag; 168 int i;
356 int i, index;
357 169
358 /* Standard Registers should be always valid */ 170 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS) 171 if (flag >= CPU_TSS)
360 return 1; 172 return 1;
361 173
362 modelflag = per_cpu(cpu_modelflag, cpu); 174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
363 vendor = per_cpu(cpu_model, cpu) >> 16; 175 if (cpu_reg_range[i].flag == flag)
364 index = get_cpu_range_count(cpu); 176 return 1;
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 } 177 }
380 178
381 /* Invalid */ 179 /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, 183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag) 184 int index, unsigned flag)
387{ 185{
388 unsigned modelflag; 186 if (cpu_reg_range[index].flag == flag) {
389 187 *min = cpu_reg_range[index].min;
390 modelflag = per_cpu(cpu_modelflag, cpu); 188 *max = cpu_reg_range[index].max;
391 *max = 0; 189 } else
392 switch (per_cpu(cpu_model, cpu) >> 16) { 190 *max = 0;
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408 191
409 return *max; 192 return *max;
410} 193}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
434 unsigned msr, msr_min, msr_max; 217 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv; 218 struct cpu_private *priv;
436 u32 low, high; 219 u32 low, high;
437 int i, range; 220 int i;
438 221
439 if (seq) { 222 if (seq) {
440 priv = seq->private; 223 priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
446 } 229 }
447 } 230 }
448 231
449 range = get_cpu_range_count(cpu); 232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) 233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue; 234 continue;
454 235
@@ -800,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
800{ 581{
801 struct dentry *cpu_dentry = NULL; 582 struct dentry *cpu_dentry = NULL;
802 unsigned reg, reg_min, reg_max; 583 unsigned reg, reg_min, reg_max;
803 int i, range, err = 0; 584 int i, err = 0;
804 char reg_dir[12]; 585 char reg_dir[12];
805 u32 low, high; 586 u32 low, high;
806 587
807 range = get_cpu_range_count(cpu); 588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
808
809 for (i = 0; i < range; i++) {
810 if (!get_cpu_range(cpu, &reg_min, &reg_max, i, 589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
811 cpu_base[type].flag)) 590 cpu_base[type].flag))
812 continue; 591 continue;
@@ -862,10 +641,6 @@ static int cpu_init_cpu(void)
862 cpui = &cpu_data(cpu); 641 cpui = &cpu_data(cpu);
863 if (!cpu_has(cpui, X86_FEATURE_MSR)) 642 if (!cpu_has(cpui, X86_FEATURE_MSR))
864 continue; 643 continue;
865 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
866 (cpui->x86 << 8) |
867 (cpui->x86_model));
868 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
869 644
870 sprintf(cpu_dir, "cpu%d", cpu); 645 sprintf(cpu_dir, "cpu%d", cpu);
871 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); 646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
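The cpu_debug.c rewrite above drops the per-vendor and per-model tables plus the model-flag matching in favour of a single flat cpu_reg_range[] array that is simply walked with ARRAY_SIZE(). The sketch below mimics that lookup shape with a toy table and invented flag names, just to show how get_cpu_range()-style code reads min/max out of such an array:

#include <stdio.h>

enum { REG_MC, REG_APIC, REG_MTRR };       /* invented stand-ins for CPU_MC etc. */

struct reg_range { unsigned min, max, flag; };

static const struct reg_range reg_range[] = {
	{ 0x00000000, 0x00000001, REG_MC   },
	{ 0x0000001B, 0x0000001B, REG_APIC },
	{ 0x00000200, 0x0000020F, REG_MTRR },
};

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Fill *min/*max for a matching entry and return *max (0 if no match). */
static unsigned get_range(unsigned index, unsigned flag, unsigned *min, unsigned *max)
{
	if (reg_range[index].flag == flag) {
		*min = reg_range[index].min;
		*max = reg_range[index].max;
	} else
		*max = 0;
	return *max;
}

int main(void)
{
	unsigned i, min, max;

	for (i = 0; i < ARRAY_SIZE(reg_range); i++) {
		if (!get_range(i, REG_MTRR, &min, &max))
			continue;
		printf("MTRR registers: 0x%x - 0x%x\n", min, max);
	}
	return 0;
}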
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c839875478..f138c6c389b9 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
220 If in doubt, say N. 220 If in doubt, say N.
221 221
222config X86_E_POWERSAVER 222config X86_E_POWERSAVER
223 tristate "VIA C7 Enhanced PowerSaver" 223 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
224 select CPU_FREQ_TABLE 224 select CPU_FREQ_TABLE
225 depends on X86_32 225 depends on X86_32 && EXPERIMENTAL
226 help 226 help
227 This adds the CPUFreq driver for VIA C7 processors. 227 This adds the CPUFreq driver for VIA C7 processors. However, this driver
228 does not have any safeguards to prevent operating the CPU out of spec
229 and is thus considered dangerous. Please use the regular ACPI cpufreq
230 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
228 231
229 If in doubt, say N. 232 If in doubt, say N.
230 233
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 54b6de2cd947..ae9b503220ca 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
90{ 90{
91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid); 91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
92 92
93 if (cpu->x86_vendor != X86_VENDOR_INTEL || 93 return cpu_has(cpu, X86_FEATURE_EST);
94 !cpu_has(cpu, X86_FEATURE_EST))
95 return 0;
96
97 return 1;
98} 94}
99 95
100static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) 96static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
@@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void)
550 return -ENOMEM; 546 return -ENOMEM;
551 } 547 }
552 for_each_possible_cpu(i) { 548 for_each_possible_cpu(i) {
553 if (!alloc_cpumask_var_node( 549 if (!zalloc_cpumask_var_node(
554 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, 550 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
555 GFP_KERNEL, cpu_to_node(i))) { 551 GFP_KERNEL, cpu_to_node(i))) {
556 552
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index a8363e5be4ef..d47c775eb0ab 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -322,7 +322,7 @@ static int powernow_acpi_init(void)
322 goto err0; 322 goto err0;
323 } 323 }
324 324
325 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, 325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) { 326 GFP_KERNEL)) {
327 retval = -ENOMEM; 327 retval = -ENOMEM;
328 goto err05; 328 goto err05;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f6b32d112357..cf52215d9eb1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -835,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
835{ 835{
836 struct cpufreq_frequency_table *powernow_table; 836 struct cpufreq_frequency_table *powernow_table;
837 int ret_val = -ENODEV; 837 int ret_val = -ENODEV;
838 acpi_integer space_id; 838 acpi_integer control, status;
839 839
840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
841 dprintk("register performance failed: bad ACPI data\n"); 841 dprintk("register performance failed: bad ACPI data\n");
@@ -848,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
848 goto err_out; 848 goto err_out;
849 } 849 }
850 850
851 space_id = data->acpi_data.control_register.space_id; 851 control = data->acpi_data.control_register.space_id;
852 if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 852 status = data->acpi_data.status_register.space_id;
853 (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 853
854 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
855 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
854 dprintk("Invalid control/status registers (%x - %x)\n", 856 dprintk("Invalid control/status registers (%x - %x)\n",
855 data->acpi_data.control_register.space_id, 857 control, status);
856 space_id);
857 goto err_out; 858 goto err_out;
858 } 859 }
859 860
@@ -886,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
886 /* notify BIOS that we exist */ 887 /* notify BIOS that we exist */
887 acpi_processor_notify_smm(THIS_MODULE); 888 acpi_processor_notify_smm(THIS_MODULE);
888 889
889 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { 890 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
890 printk(KERN_ERR PFX 891 printk(KERN_ERR PFX
891 "unable to alloc powernow_k8_data cpumask\n"); 892 "unable to alloc powernow_k8_data cpumask\n");
892 ret_val = -ENOMEM; 893 ret_val = -ENOMEM;
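The powernow_k8_cpu_init_acpi() hunk above fixes a copy-and-paste bug: the old test compared space_id against ACPI_ADR_SPACE_FIXED_HARDWARE twice, and since (x != A) || (x != A) reduces to (x != A), the status register's address space was never actually validated. The new code reads control and status separately and checks each. A toy illustration, with an invented FIXED_HW constant, of why the second operand in the old condition was a no-op:

#include <stdio.h>

#define FIXED_HW 0x7f   /* stand-in for ACPI_ADR_SPACE_FIXED_HARDWARE */

int main(void)
{
	int control = FIXED_HW;   /* valid */
	int status  = 0x01;       /* invalid, should be rejected */

	/* Old, buggy shape: both operands test the same variable. */
	if ((control != FIXED_HW) || (control != FIXED_HW))
		puts("old check: rejected");
	else
		puts("old check: accepted (bug: status never looked at)");

	/* Fixed shape: each register is validated on its own. */
	if ((control != FIXED_HW) || (status != FIXED_HW))
		puts("new check: rejected");
	else
		puts("new check: accepted");
	return 0;
}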
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index c9f1fdc02830..55c831ed71ce 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy,
471 471
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
473 return -ENOMEM; 473 return -ENOMEM;
474 if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { 474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask); 475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 476 return -ENOMEM;
477 } 477 }
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e102..789efe217e1a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/k8.h>
20 21
21#define LVL_1_INST 1 22#define LVL_1_INST 1
22#define LVL_1_DATA 2 23#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
159 unsigned long can_disable; 160 unsigned long can_disable;
160}; 161};
161 162
162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
163static struct pci_device_id k8_nb_id[] = {
164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
166 {}
167};
168#endif
169
170unsigned short num_cache_leaves; 163unsigned short num_cache_leaves;
171 164
172/* AMD doesn't have CPUID4. Emulate it here to report the same 165/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
207}; 200};
208 201
209static const unsigned short __cpuinitconst assocs[] = { 202static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 203 [1] = 1,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 204 [2] = 2,
205 [4] = 4,
206 [6] = 8,
207 [8] = 16,
208 [0xa] = 32,
209 [0xb] = 48,
212 [0xc] = 64, 210 [0xc] = 64,
213 [0xf] = 0xffff // ?? 211 [0xd] = 96,
212 [0xe] = 128,
213 [0xf] = 0xffff /* fully associative - no way to show this currently */
214}; 214};
215 215
216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
271 eax->split.type = types[leaf]; 271 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 272 eax->split.level = levels[leaf];
273 if (leaf == 3) 273 if (leaf == 3)
274 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; 274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
275 else 276 else
276 eax->split.num_threads_sharing = 0; 277 eax->split.num_threads_sharing = 0;
277 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
291{ 292{
292 if (index < 3) 293 if (index < 3)
293 return; 294 return;
295
296 if (boot_cpu_data.x86 == 0x11)
297 return;
298
299 /* see erratum #382 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
301 return;
302
294 this_leaf->can_disable = 1; 303 this_leaf->can_disable = 1;
295} 304}
296 305
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
696#define to_object(k) container_of(k, struct _index_kobject, kobj) 705#define to_object(k) container_of(k, struct _index_kobject, kobj)
697#define to_attr(a) container_of(a, struct _cache_attr, attr) 706#define to_attr(a) container_of(a, struct _cache_attr, attr)
698 707
699#ifdef CONFIG_PCI 708static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
700static struct pci_dev *get_k8_northbridge(int node) 709 unsigned int index)
701{
702 struct pci_dev *dev = NULL;
703 int i;
704
705 for (i = 0; i <= node; i++) {
706 do {
707 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
708 if (!dev)
709 break;
710 } while (!pci_match_id(&k8_nb_id[0], dev));
711 if (!dev)
712 break;
713 }
714 return dev;
715}
716#else
717static struct pci_dev *get_k8_northbridge(int node)
718{
719 return NULL;
720}
721#endif
722
723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
724{ 710{
725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 711 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
726 int node = cpu_to_node(cpumask_first(mask)); 712 int node = cpu_to_node(cpu);
727 struct pci_dev *dev = NULL; 713 struct pci_dev *dev = node_to_k8_nb_misc(node);
728 ssize_t ret = 0; 714 unsigned int reg = 0;
729 int i;
730 715
731 if (!this_leaf->can_disable) 716 if (!this_leaf->can_disable)
732 return sprintf(buf, "Feature not enabled\n");
733
734 dev = get_k8_northbridge(node);
735 if (!dev) {
736 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
737 return -EINVAL; 717 return -EINVAL;
738 }
739 718
740 for (i = 0; i < 2; i++) { 719 if (!dev)
741 unsigned int reg; 720 return -EINVAL;
742 721
743 pci_read_config_dword(dev, 0x1BC + i * 4, &reg); 722 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
723 return sprintf(buf, "%x\n", reg);
724}
744 725
745 ret += sprintf(buf, "%sEntry: %d\n", buf, i); 726#define SHOW_CACHE_DISABLE(index) \
746 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", 727static ssize_t \
747 buf, 728show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
748 reg & 0x80000000 ? "Disabled" : "Allowed", 729{ \
749 reg & 0x40000000 ? "Disabled" : "Allowed"); 730 return show_cache_disable(this_leaf, buf, index); \
750 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
751 buf, (reg & 0x30000) >> 16, reg & 0xfff);
752 }
753 return ret;
754} 731}
732SHOW_CACHE_DISABLE(0)
733SHOW_CACHE_DISABLE(1)
755 734
756static ssize_t 735static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 736 const char *buf, size_t count, unsigned int index)
758 size_t count)
759{ 737{
760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 738 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
761 int node = cpu_to_node(cpumask_first(mask)); 739 int node = cpu_to_node(cpu);
762 struct pci_dev *dev = NULL; 740 struct pci_dev *dev = node_to_k8_nb_misc(node);
763 unsigned int ret, index, val; 741 unsigned long val = 0;
742 unsigned int scrubber = 0;
764 743
765 if (!this_leaf->can_disable) 744 if (!this_leaf->can_disable)
766 return 0;
767
768 if (strlen(buf) > 15)
769 return -EINVAL; 745 return -EINVAL;
770 746
771 ret = sscanf(buf, "%x %x", &index, &val); 747 if (!capable(CAP_SYS_ADMIN))
772 if (ret != 2) 748 return -EPERM;
749
750 if (!dev)
773 return -EINVAL; 751 return -EINVAL;
774 if (index > 1) 752
753 if (strict_strtoul(buf, 10, &val) < 0)
775 return -EINVAL; 754 return -EINVAL;
776 755
777 val |= 0xc0000000; 756 val |= 0xc0000000;
778 dev = get_k8_northbridge(node); 757
779 if (!dev) { 758 pci_read_config_dword(dev, 0x58, &scrubber);
780 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); 759 scrubber &= ~0x1f000000;
781 return -EINVAL; 760 pci_write_config_dword(dev, 0x58, scrubber);
782 }
783 761
784 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); 762 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
785 wbinvd(); 763 wbinvd();
786 pci_write_config_dword(dev, 0x1BC + index * 4, val); 764 pci_write_config_dword(dev, 0x1BC + index * 4, val);
765 return count;
766}
787 767
788 return 1; 768#define STORE_CACHE_DISABLE(index) \
769static ssize_t \
770store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
771 const char *buf, size_t count) \
772{ \
773 return store_cache_disable(this_leaf, buf, count, index); \
789} 774}
775STORE_CACHE_DISABLE(0)
776STORE_CACHE_DISABLE(1)
790 777
791struct _cache_attr { 778struct _cache_attr {
792 struct attribute attr; 779 struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
808define_one_ro(shared_cpu_map); 795define_one_ro(shared_cpu_map);
809define_one_ro(shared_cpu_list); 796define_one_ro(shared_cpu_list);
810 797
811static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); 798static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
799 show_cache_disable_0, store_cache_disable_0);
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1);
812 802
813static struct attribute * default_attrs[] = { 803static struct attribute * default_attrs[] = {
814 &type.attr, 804 &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
820 &size.attr, 810 &size.attr,
821 &shared_cpu_map.attr, 811 &shared_cpu_map.attr,
822 &shared_cpu_list.attr, 812 &shared_cpu_list.attr,
823 &cache_disable.attr, 813 &cache_disable_0.attr,
814 &cache_disable_1.attr,
824 NULL 815 NULL
825}; 816};
826 817
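The intel_cacheinfo.c rework above replaces the single two-entry cache_disable attribute with per-index cache_disable_0 / cache_disable_1 files, generated by the SHOW_CACHE_DISABLE() and STORE_CACHE_DISABLE() macros that wrap one common helper taking an index. The pattern of stamping thin per-index wrappers out of a macro is sketched below in plain C with hypothetical names:

#include <stdio.h>

/* Common worker, parameterised by index (mirrors show_cache_disable()). */
static int show_disable(char *buf, unsigned int index)
{
	return sprintf(buf, "index %u register would be read here\n", index);
}

/* Stamp out one thin wrapper per index, as SHOW_CACHE_DISABLE() does. */
#define SHOW_DISABLE(index)				\
static int show_disable_##index(char *buf)		\
{							\
	return show_disable(buf, index);		\
}
SHOW_DISABLE(0)
SHOW_DISABLE(1)

int main(void)
{
	char buf[64];

	show_disable_0(buf); fputs(buf, stdout);
	show_disable_1(buf); fputs(buf, stdout);
	return 0;
}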
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 046087e9808f..f2ef6952c400 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,6 @@
15#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
16#include <asm/idle.h> 16#include <asm/idle.h>
17#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
19 18
20#include "mce.h" 19#include "mce.h"
21 20
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04f..1d584a18a50d 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
808 808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy); 811 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 814 return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy); 1006 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1009 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d21d4fb161f7..0543f69f0b27 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
20}; 20};
21 21
22static struct fixed_range_block fixed_range_blocks[] = { 22static struct fixed_range_block fixed_range_blocks[] = {
23 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ 23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ 24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ 25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 26 {}
27}; 27};
28 28
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
194 194
195 k8_check_syscfg_dram_mod_en(); 195 k8_check_syscfg_dram_mod_en();
196 196
197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
198 198
199 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
200 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); 200 rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
201 for (i = 0; i < 8; i++) 201 for (i = 0; i < 8; i++)
202 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 202 rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
203} 203}
204 204
205void mtrr_save_fixed_ranges(void *info) 205void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
310 310
311 vrs = mtrr_state.var_ranges; 311 vrs = mtrr_state.var_ranges;
312 312
313 rdmsr(MTRRcap_MSR, lo, dummy); 313 rdmsr(MSR_MTRRcap, lo, dummy);
314 mtrr_state.have_fixed = (lo >> 8) & 1; 314 mtrr_state.have_fixed = (lo >> 8) & 1;
315 315
316 for (i = 0; i < num_var_ranges; i++) 316 for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
318 if (mtrr_state.have_fixed) 318 if (mtrr_state.have_fixed)
319 get_fixed_ranges(mtrr_state.fixed_ranges); 319 get_fixed_ranges(mtrr_state.fixed_ranges);
320 320
321 rdmsr(MTRRdefType_MSR, lo, dummy); 321 rdmsr(MSR_MTRRdefType, lo, dummy);
322 mtrr_state.def_type = (lo & 0xff); 322 mtrr_state.def_type = (lo & 0xff);
323 mtrr_state.enabled = (lo & 0xc00) >> 10; 323 mtrr_state.enabled = (lo & 0xc00) >> 10;
324 324
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
583 __flush_tlb(); 583 __flush_tlb();
584 584
585 /* Save MTRR state */ 585 /* Save MTRR state */
586 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
587 587
588 /* Disable MTRRs, and set the default type to uncached */ 588 /* Disable MTRRs, and set the default type to uncached */
589 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); 589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
590} 590}
591 591
592static void post_set(void) __releases(set_atomicity_lock) 592static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
595 __flush_tlb(); 595 __flush_tlb();
596 596
597 /* Intel (P6) standard MTRRs */ 597 /* Intel (P6) standard MTRRs */
598 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
599 599
600 /* Enable caches */ 600 /* Enable caches */
601 write_cr0(read_cr0() & 0xbfffffff); 601 write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
707static int generic_have_wrcomb(void) 707static int generic_have_wrcomb(void)
708{ 708{
709 unsigned long config, dummy; 709 unsigned long config, dummy;
710 rdmsr(MTRRcap_MSR, config, dummy); 710 rdmsr(MSR_MTRRcap, config, dummy);
711 return (config & (1 << 10)); 711 return (config & (1 << 10));
712} 712}
713 713
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c7..8fc248b5aeaf 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
104 unsigned long config = 0, dummy; 104 unsigned long config = 0, dummy;
105 105
106 if (use_intel()) { 106 if (use_intel()) {
107 rdmsr(MTRRcap_MSR, config, dummy); 107 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 108 } else if (is_cpu(AMD))
109 config = 2; 109 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347a..7538b767f206 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7 7
8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff
10
11#define MTRRfix64K_00000_MSR 0x250
12#define MTRRfix16K_80000_MSR 0x258
13#define MTRRfix16K_A0000_MSR 0x259
14#define MTRRfix4K_C0000_MSR 0x268
15#define MTRRfix4K_C8000_MSR 0x269
16#define MTRRfix4K_D0000_MSR 0x26a
17#define MTRRfix4K_D8000_MSR 0x26b
18#define MTRRfix4K_E0000_MSR 0x26c
19#define MTRRfix4K_E8000_MSR 0x26d
20#define MTRRfix4K_F0000_MSR 0x26e
21#define MTRRfix4K_F8000_MSR 0x26f
22
23#define MTRR_CHANGE_MASK_FIXED 0x01 8#define MTRR_CHANGE_MASK_FIXED 0x01
24#define MTRR_CHANGE_MASK_VARIABLE 0x02 9#define MTRR_CHANGE_MASK_VARIABLE 0x02
25#define MTRR_CHANGE_MASK_DEFTYPE 0x04 10#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685b..1f5fb1588d1f 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
35 35
36 if (use_intel()) 36 if (use_intel())
37 /* Save MTRR state */ 37 /* Save MTRR state */
38 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 39 else
40 /* Cyrix ARRs - everything else were excluded at the top */ 40 /* Cyrix ARRs - everything else were excluded at the top */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 41 ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 46{
47 if (use_intel()) 47 if (use_intel())
48 /* Disable MTRRs, and set the default type to uncached */ 48 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, 49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 50 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 51 else if (is_cpu(CYRIX))
52 /* Cyrix ARRs - everything else were excluded at the top */ 52 /* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
64 /* Restore MTRRdefType */ 64 /* Restore MTRRdefType */
65 if (use_intel()) 65 if (use_intel())
66 /* Intel (P6) standard MTRRs */ 66 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
68 else 68 else
69 /* Cyrix ARRs - everything else was excluded at the top */ 69 /* Cyrix ARRs - everything else was excluded at the top */
70 setCx86(CX86_CCR3, ctxt->ccr3); 70 setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..895c82e78455
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1704 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22
23#include <asm/apic.h>
24#include <asm/stacktrace.h>
25#include <asm/nmi.h>
26
27static u64 perf_counter_mask __read_mostly;
28
29struct cpu_hw_counters {
30 struct perf_counter *counters[X86_PMC_IDX_MAX];
31 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long interrupts;
34 int enabled;
35};
36
37/*
38 * struct x86_pmu - generic x86 pmu
39 */
40struct x86_pmu {
41 const char *name;
42 int version;
43 int (*handle_irq)(struct pt_regs *);
44 void (*disable_all)(void);
45 void (*enable_all)(void);
46 void (*enable)(struct hw_perf_counter *, int);
47 void (*disable)(struct hw_perf_counter *, int);
48 unsigned eventsel;
49 unsigned perfctr;
50 u64 (*event_map)(int);
51 u64 (*raw_event)(u64);
52 int max_events;
53 int num_counters;
54 int num_counters_fixed;
55 int counter_bits;
56 u64 counter_mask;
57 u64 max_period;
58 u64 intel_ctrl;
59};
60
61static struct x86_pmu x86_pmu __read_mostly;
62
63static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
64 .enabled = 1,
65};
66
67/*
68 * Intel PerfMon v3. Used on Core2 and later.
69 */
70static const u64 intel_perfmon_event_map[] =
71{
72 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
73 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
74 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
75 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
76 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
77 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
78 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
79};
80
81static u64 intel_pmu_event_map(int event)
82{
83 return intel_perfmon_event_map[event];
84}
85
86/*
87 * Generalized hw caching related event table, filled
88 * in on a per model basis. A value of 0 means
89 * 'not supported', -1 means 'event makes no sense on
90 * this CPU', any other value means the raw event
91 * ID.
92 */
93
94#define C(x) PERF_COUNT_HW_CACHE_##x
95
96static u64 __read_mostly hw_cache_event_ids
97 [PERF_COUNT_HW_CACHE_MAX]
98 [PERF_COUNT_HW_CACHE_OP_MAX]
99 [PERF_COUNT_HW_CACHE_RESULT_MAX];
100
101static const u64 nehalem_hw_cache_event_ids
102 [PERF_COUNT_HW_CACHE_MAX]
103 [PERF_COUNT_HW_CACHE_OP_MAX]
104 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
105{
106 [ C(L1D) ] = {
107 [ C(OP_READ) ] = {
108 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
109 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
110 },
111 [ C(OP_WRITE) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
113 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
114 },
115 [ C(OP_PREFETCH) ] = {
116 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
117 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
118 },
119 },
120 [ C(L1I ) ] = {
121 [ C(OP_READ) ] = {
122 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
123 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
124 },
125 [ C(OP_WRITE) ] = {
126 [ C(RESULT_ACCESS) ] = -1,
127 [ C(RESULT_MISS) ] = -1,
128 },
129 [ C(OP_PREFETCH) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0,
131 [ C(RESULT_MISS) ] = 0x0,
132 },
133 },
134 [ C(LL ) ] = {
135 [ C(OP_READ) ] = {
136 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
137 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
138 },
139 [ C(OP_WRITE) ] = {
140 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
141 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
142 },
143 [ C(OP_PREFETCH) ] = {
144 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
145 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
146 },
147 },
148 [ C(DTLB) ] = {
149 [ C(OP_READ) ] = {
150 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
151 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
152 },
153 [ C(OP_WRITE) ] = {
154 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
155 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
156 },
157 [ C(OP_PREFETCH) ] = {
158 [ C(RESULT_ACCESS) ] = 0x0,
159 [ C(RESULT_MISS) ] = 0x0,
160 },
161 },
162 [ C(ITLB) ] = {
163 [ C(OP_READ) ] = {
164 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
165 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
166 },
167 [ C(OP_WRITE) ] = {
168 [ C(RESULT_ACCESS) ] = -1,
169 [ C(RESULT_MISS) ] = -1,
170 },
171 [ C(OP_PREFETCH) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 },
176 [ C(BPU ) ] = {
177 [ C(OP_READ) ] = {
178 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
179 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
180 },
181 [ C(OP_WRITE) ] = {
182 [ C(RESULT_ACCESS) ] = -1,
183 [ C(RESULT_MISS) ] = -1,
184 },
185 [ C(OP_PREFETCH) ] = {
186 [ C(RESULT_ACCESS) ] = -1,
187 [ C(RESULT_MISS) ] = -1,
188 },
189 },
190};
191
192static const u64 core2_hw_cache_event_ids
193 [PERF_COUNT_HW_CACHE_MAX]
194 [PERF_COUNT_HW_CACHE_OP_MAX]
195 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
196{
197 [ C(L1D) ] = {
198 [ C(OP_READ) ] = {
199 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
200 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
201 },
202 [ C(OP_WRITE) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
204 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
205 },
206 [ C(OP_PREFETCH) ] = {
207 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
208 [ C(RESULT_MISS) ] = 0,
209 },
210 },
211 [ C(L1I ) ] = {
212 [ C(OP_READ) ] = {
213 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
214 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
215 },
216 [ C(OP_WRITE) ] = {
217 [ C(RESULT_ACCESS) ] = -1,
218 [ C(RESULT_MISS) ] = -1,
219 },
220 [ C(OP_PREFETCH) ] = {
221 [ C(RESULT_ACCESS) ] = 0,
222 [ C(RESULT_MISS) ] = 0,
223 },
224 },
225 [ C(LL ) ] = {
226 [ C(OP_READ) ] = {
227 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
228 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
229 },
230 [ C(OP_WRITE) ] = {
231 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
232 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
233 },
234 [ C(OP_PREFETCH) ] = {
235 [ C(RESULT_ACCESS) ] = 0,
236 [ C(RESULT_MISS) ] = 0,
237 },
238 },
239 [ C(DTLB) ] = {
240 [ C(OP_READ) ] = {
241 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
242 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
243 },
244 [ C(OP_WRITE) ] = {
245 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
246 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
247 },
248 [ C(OP_PREFETCH) ] = {
249 [ C(RESULT_ACCESS) ] = 0,
250 [ C(RESULT_MISS) ] = 0,
251 },
252 },
253 [ C(ITLB) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
256 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = -1,
260 [ C(RESULT_MISS) ] = -1,
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 },
267 [ C(BPU ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
270 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = -1,
278 [ C(RESULT_MISS) ] = -1,
279 },
280 },
281};
282
283static const u64 atom_hw_cache_event_ids
284 [PERF_COUNT_HW_CACHE_MAX]
285 [PERF_COUNT_HW_CACHE_OP_MAX]
286 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
287{
288 [ C(L1D) ] = {
289 [ C(OP_READ) ] = {
290 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
291 [ C(RESULT_MISS) ] = 0,
292 },
293 [ C(OP_WRITE) ] = {
294 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
295 [ C(RESULT_MISS) ] = 0,
296 },
297 [ C(OP_PREFETCH) ] = {
298 [ C(RESULT_ACCESS) ] = 0x0,
299 [ C(RESULT_MISS) ] = 0,
300 },
301 },
302 [ C(L1I ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
305 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
306 },
307 [ C(OP_WRITE) ] = {
308 [ C(RESULT_ACCESS) ] = -1,
309 [ C(RESULT_MISS) ] = -1,
310 },
311 [ C(OP_PREFETCH) ] = {
312 [ C(RESULT_ACCESS) ] = 0,
313 [ C(RESULT_MISS) ] = 0,
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
319 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
320 },
321 [ C(OP_WRITE) ] = {
322 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
323 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
324 },
325 [ C(OP_PREFETCH) ] = {
326 [ C(RESULT_ACCESS) ] = 0,
327 [ C(RESULT_MISS) ] = 0,
328 },
329 },
330 [ C(DTLB) ] = {
331 [ C(OP_READ) ] = {
332 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
333 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
334 },
335 [ C(OP_WRITE) ] = {
336 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
337 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
338 },
339 [ C(OP_PREFETCH) ] = {
340 [ C(RESULT_ACCESS) ] = 0,
341 [ C(RESULT_MISS) ] = 0,
342 },
343 },
344 [ C(ITLB) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
347 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = -1,
351 [ C(RESULT_MISS) ] = -1,
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 },
358 [ C(BPU ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
361 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = -1,
369 [ C(RESULT_MISS) ] = -1,
370 },
371 },
372};
373
374static u64 intel_pmu_raw_event(u64 event)
375{
376#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
377#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
378#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
379#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
380#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
381
382#define CORE_EVNTSEL_MASK \
383 (CORE_EVNTSEL_EVENT_MASK | \
384 CORE_EVNTSEL_UNIT_MASK | \
385 CORE_EVNTSEL_EDGE_MASK | \
386 CORE_EVNTSEL_INV_MASK | \
387 CORE_EVNTSEL_COUNTER_MASK)
388
389 return event & CORE_EVNTSEL_MASK;
390}
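As a sketch (not part of the patch), this is how a raw config value that passes straight through intel_pmu_raw_event() might be composed by a caller; the helper name below is made up for illustration:

static inline u64 make_raw_event(u8 event_select, u8 unit_mask)
{
	/* Architectural EVENTSEL layout: event select in bits 0-7, unit mask in bits 8-15. */
	return (u64)event_select | ((u64)unit_mask << 8);
}
/* Example: event 0x2e with unit mask 0x4f gives 0x4f2e, the LLC-reference encoding used in the tables above. */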
391
392static const u64 amd_0f_hw_cache_event_ids
393 [PERF_COUNT_HW_CACHE_MAX]
394 [PERF_COUNT_HW_CACHE_OP_MAX]
395 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
396{
397 [ C(L1D) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0,
400 [ C(RESULT_MISS) ] = 0,
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0,
404 [ C(RESULT_MISS) ] = 0,
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(L1I ) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
414 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = -1,
418 [ C(RESULT_MISS) ] = -1,
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(LL ) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0,
428 [ C(RESULT_MISS) ] = 0,
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = 0,
432 [ C(RESULT_MISS) ] = 0,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = 0,
436 [ C(RESULT_MISS) ] = 0,
437 },
438 },
439 [ C(DTLB) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0,
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = 0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = 0,
450 [ C(RESULT_MISS) ] = 0,
451 },
452 },
453 [ C(ITLB) ] = {
454 [ C(OP_READ) ] = {
 455 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
456 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
457 },
458 [ C(OP_WRITE) ] = {
459 [ C(RESULT_ACCESS) ] = -1,
460 [ C(RESULT_MISS) ] = -1,
461 },
462 [ C(OP_PREFETCH) ] = {
463 [ C(RESULT_ACCESS) ] = -1,
464 [ C(RESULT_MISS) ] = -1,
465 },
466 },
467 [ C(BPU ) ] = {
468 [ C(OP_READ) ] = {
469 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
470 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
471 },
472 [ C(OP_WRITE) ] = {
473 [ C(RESULT_ACCESS) ] = -1,
474 [ C(RESULT_MISS) ] = -1,
475 },
476 [ C(OP_PREFETCH) ] = {
477 [ C(RESULT_ACCESS) ] = -1,
478 [ C(RESULT_MISS) ] = -1,
479 },
480 },
481};
482
483/*
484 * AMD Performance Monitor K7 and later.
485 */
486static const u64 amd_perfmon_event_map[] =
487{
488 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
489 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
490 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
491 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
492 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
493 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
494};
495
496static u64 amd_pmu_event_map(int event)
497{
498 return amd_perfmon_event_map[event];
499}
500
501static u64 amd_pmu_raw_event(u64 event)
502{
503#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
504#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
505#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
506#define K7_EVNTSEL_INV_MASK 0x000800000ULL
507#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
508
509#define K7_EVNTSEL_MASK \
510 (K7_EVNTSEL_EVENT_MASK | \
511 K7_EVNTSEL_UNIT_MASK | \
512 K7_EVNTSEL_EDGE_MASK | \
513 K7_EVNTSEL_INV_MASK | \
514 K7_EVNTSEL_COUNTER_MASK)
515
516 return event & K7_EVNTSEL_MASK;
517}
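One detail worth noting about the mask above (an observation, not part of the patch):

/*
 * K7_EVNTSEL_EVENT_MASK is a split field: bits 0-7 carry the low part
 * of the event select and bits 32-34 the extended part, which is why
 * the mask is 0x7000000FFULL rather than a single low byte.
 */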
518
519/*
520 * Propagate counter elapsed time into the generic counter.
521 * Can only be executed on the CPU where the counter is active.
522 * Returns the delta events processed.
523 */
524static u64
525x86_perf_counter_update(struct perf_counter *counter,
526 struct hw_perf_counter *hwc, int idx)
527{
528 int shift = 64 - x86_pmu.counter_bits;
529 u64 prev_raw_count, new_raw_count;
530 s64 delta;
531
532 /*
533 * Careful: an NMI might modify the previous counter value.
534 *
535 * Our tactic to handle this is to first atomically read and
536 * exchange a new raw count - then add that new-prev delta
537 * count to the generic counter atomically:
538 */
539again:
540 prev_raw_count = atomic64_read(&hwc->prev_count);
541 rdmsrl(hwc->counter_base + idx, new_raw_count);
542
543 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
544 new_raw_count) != prev_raw_count)
545 goto again;
546
547 /*
548 * Now we have the new raw value and have updated the prev
549 * timestamp already. We can now calculate the elapsed delta
550 * (counter-)time and add that to the generic counter.
551 *
552 * Careful, not all hw sign-extends above the physical width
553 * of the count.
554 */
555 delta = (new_raw_count << shift) - (prev_raw_count << shift);
556 delta >>= shift;
557
558 atomic64_add(delta, &counter->count);
559 atomic64_sub(delta, &hwc->period_left);
560
561 return new_raw_count;
562}
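A worked example of the shift trick above (illustrative numbers only, not part of the patch):

/*
 * With counter_bits = 32 (shift = 32), a wrap from prev_raw_count =
 * 0xffffff00 to new_raw_count = 0x00000010 gives:
 *
 *   (0x10ULL << 32) - (0xffffff00ULL << 32)  = 0x0000011000000000
 *   0x0000011000000000 >> 32 (arithmetic)    = 0x110 = 272 events
 *
 * which matches the 272 events the 32-bit counter actually ticked
 * through, even though the raw values were not sign-extended by hw.
 */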
563
564static atomic_t active_counters;
565static DEFINE_MUTEX(pmc_reserve_mutex);
566
567static bool reserve_pmc_hardware(void)
568{
569 int i;
570
571 if (nmi_watchdog == NMI_LOCAL_APIC)
572 disable_lapic_nmi_watchdog();
573
574 for (i = 0; i < x86_pmu.num_counters; i++) {
575 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
576 goto perfctr_fail;
577 }
578
579 for (i = 0; i < x86_pmu.num_counters; i++) {
580 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
581 goto eventsel_fail;
582 }
583
584 return true;
585
586eventsel_fail:
587 for (i--; i >= 0; i--)
588 release_evntsel_nmi(x86_pmu.eventsel + i);
589
590 i = x86_pmu.num_counters;
591
592perfctr_fail:
593 for (i--; i >= 0; i--)
594 release_perfctr_nmi(x86_pmu.perfctr + i);
595
596 if (nmi_watchdog == NMI_LOCAL_APIC)
597 enable_lapic_nmi_watchdog();
598
599 return false;
600}
601
602static void release_pmc_hardware(void)
603{
604 int i;
605
606 for (i = 0; i < x86_pmu.num_counters; i++) {
607 release_perfctr_nmi(x86_pmu.perfctr + i);
608 release_evntsel_nmi(x86_pmu.eventsel + i);
609 }
610
611 if (nmi_watchdog == NMI_LOCAL_APIC)
612 enable_lapic_nmi_watchdog();
613}
614
615static void hw_perf_counter_destroy(struct perf_counter *counter)
616{
617 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
618 release_pmc_hardware();
619 mutex_unlock(&pmc_reserve_mutex);
620 }
621}
622
623static inline int x86_pmu_initialized(void)
624{
625 return x86_pmu.handle_irq != NULL;
626}
627
628static inline int
629set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
630{
631 unsigned int cache_type, cache_op, cache_result;
632 u64 config, val;
633
634 config = attr->config;
635
636 cache_type = (config >> 0) & 0xff;
637 if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
638 return -EINVAL;
639
640 cache_op = (config >> 8) & 0xff;
641 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
642 return -EINVAL;
643
644 cache_result = (config >> 16) & 0xff;
645 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
646 return -EINVAL;
647
648 val = hw_cache_event_ids[cache_type][cache_op][cache_result];
649
650 if (val == 0)
651 return -ENOENT;
652
653 if (val == -1)
654 return -EINVAL;
655
656 hwc->config |= val;
657
658 return 0;
659}
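A sketch (not part of the patch) of the attr->config layout that set_ext_hw_attr() decodes; the helper below is hypothetical:

static inline u64 make_hw_cache_config(u8 cache_type, u8 cache_op, u8 cache_result)
{
	/* Mirrors the decode above: type in bits 0-7, op in bits 8-15, result in bits 16-23. */
	return (u64)cache_type | ((u64)cache_op << 8) | ((u64)cache_result << 16);
}
/* Example: make_hw_cache_config(C(L1D), C(OP_READ), C(RESULT_MISS)) selects L1D read misses. */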
660
661/*
662 * Setup the hardware configuration for a given attr_type
663 */
664static int __hw_perf_counter_init(struct perf_counter *counter)
665{
666 struct perf_counter_attr *attr = &counter->attr;
667 struct hw_perf_counter *hwc = &counter->hw;
668 int err;
669
670 if (!x86_pmu_initialized())
671 return -ENODEV;
672
673 err = 0;
674 if (!atomic_inc_not_zero(&active_counters)) {
675 mutex_lock(&pmc_reserve_mutex);
676 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
677 err = -EBUSY;
678 else
679 atomic_inc(&active_counters);
680 mutex_unlock(&pmc_reserve_mutex);
681 }
682 if (err)
683 return err;
684
685 /*
686 * Generate PMC IRQs:
687 * (keep 'enabled' bit clear for now)
688 */
689 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
690
691 /*
692 * Count user and OS events unless requested not to.
693 */
694 if (!attr->exclude_user)
695 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
696 if (!attr->exclude_kernel)
697 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
698
699 if (!hwc->sample_period) {
700 hwc->sample_period = x86_pmu.max_period;
701 hwc->last_period = hwc->sample_period;
702 atomic64_set(&hwc->period_left, hwc->sample_period);
703 }
704
705 counter->destroy = hw_perf_counter_destroy;
706
707 /*
708 * Raw event type provide the config in the event structure
709 */
710 if (attr->type == PERF_TYPE_RAW) {
711 hwc->config |= x86_pmu.raw_event(attr->config);
712 return 0;
713 }
714
715 if (attr->type == PERF_TYPE_HW_CACHE)
716 return set_ext_hw_attr(hwc, attr);
717
718 if (attr->config >= x86_pmu.max_events)
719 return -EINVAL;
720 /*
721 * The generic map:
722 */
723 hwc->config |= x86_pmu.event_map(attr->config);
724
725 return 0;
726}
727
728static void intel_pmu_disable_all(void)
729{
730 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
731}
732
733static void amd_pmu_disable_all(void)
734{
735 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
736 int idx;
737
738 if (!cpuc->enabled)
739 return;
740
741 cpuc->enabled = 0;
742 /*
743 * ensure we write the disable before we start disabling the
744 * counters proper, so that amd_pmu_enable_counter() does the
745 * right thing.
746 */
747 barrier();
748
749 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
750 u64 val;
751
752 if (!test_bit(idx, cpuc->active_mask))
753 continue;
754 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
755 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
756 continue;
757 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
758 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
759 }
760}
761
762void hw_perf_disable(void)
763{
764 if (!x86_pmu_initialized())
765 return;
766 return x86_pmu.disable_all();
767}
768
769static void intel_pmu_enable_all(void)
770{
771 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
772}
773
774static void amd_pmu_enable_all(void)
775{
776 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
777 int idx;
778
779 if (cpuc->enabled)
780 return;
781
782 cpuc->enabled = 1;
783 barrier();
784
785 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
786 u64 val;
787
788 if (!test_bit(idx, cpuc->active_mask))
789 continue;
790 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
791 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
792 continue;
793 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
794 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
795 }
796}
797
798void hw_perf_enable(void)
799{
800 if (!x86_pmu_initialized())
801 return;
802 x86_pmu.enable_all();
803}
804
805static inline u64 intel_pmu_get_status(void)
806{
807 u64 status;
808
809 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
810
811 return status;
812}
813
814static inline void intel_pmu_ack_status(u64 ack)
815{
816 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
817}
818
819static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
820{
821 int err;
822 err = checking_wrmsrl(hwc->config_base + idx,
823 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
824}
825
826static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
827{
828 int err;
829 err = checking_wrmsrl(hwc->config_base + idx,
830 hwc->config);
831}
832
833static inline void
834intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
835{
836 int idx = __idx - X86_PMC_IDX_FIXED;
837 u64 ctrl_val, mask;
838 int err;
839
840 mask = 0xfULL << (idx * 4);
841
842 rdmsrl(hwc->config_base, ctrl_val);
843 ctrl_val &= ~mask;
844 err = checking_wrmsrl(hwc->config_base, ctrl_val);
845}
846
847static inline void
848intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
849{
850 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
851 intel_pmu_disable_fixed(hwc, idx);
852 return;
853 }
854
855 x86_pmu_disable_counter(hwc, idx);
856}
857
858static inline void
859amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
860{
861 x86_pmu_disable_counter(hwc, idx);
862}
863
864static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
865
866/*
867 * Set the next IRQ period, based on the hwc->period_left value.
868 * To be called with the counter disabled in hw:
869 */
870static int
871x86_perf_counter_set_period(struct perf_counter *counter,
872 struct hw_perf_counter *hwc, int idx)
873{
874 s64 left = atomic64_read(&hwc->period_left);
875 s64 period = hwc->sample_period;
876 int err, ret = 0;
877
878 /*
 879 * If we are way outside a reasonable range then just skip forward:
880 */
881 if (unlikely(left <= -period)) {
882 left = period;
883 atomic64_set(&hwc->period_left, left);
884 hwc->last_period = period;
885 ret = 1;
886 }
887
888 if (unlikely(left <= 0)) {
889 left += period;
890 atomic64_set(&hwc->period_left, left);
891 hwc->last_period = period;
892 ret = 1;
893 }
894 /*
 895 * Quirk: certain CPUs don't like it if just 1 event is left:
896 */
897 if (unlikely(left < 2))
898 left = 2;
899
900 if (left > x86_pmu.max_period)
901 left = x86_pmu.max_period;
902
903 per_cpu(prev_left[idx], smp_processor_id()) = left;
904
905 /*
906 * The hw counter starts counting from this counter offset,
 907 * mark it to be able to extract future deltas:
908 */
909 atomic64_set(&hwc->prev_count, (u64)-left);
910
911 err = checking_wrmsrl(hwc->counter_base + idx,
912 (u64)(-left) & x86_pmu.counter_mask);
913
914 return ret;
915}
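Why -left is written (a worked example with illustrative numbers, not part of the patch):

/*
 * With a 40-bit counter and left = 1000, the counter MSR is programmed
 * to (2^40 - 1000) & counter_mask, so it reaches the overflow point
 * after exactly 1000 more events and raises the PMI then; prev_count
 * is set to the same (u64)-left value so that the next
 * x86_perf_counter_update() sees a delta of precisely those events.
 */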
916
917static inline void
918intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
919{
920 int idx = __idx - X86_PMC_IDX_FIXED;
921 u64 ctrl_val, bits, mask;
922 int err;
923
924 /*
925 * Enable IRQ generation (0x8),
926 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
927 * if requested:
928 */
929 bits = 0x8ULL;
930 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
931 bits |= 0x2;
932 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
933 bits |= 0x1;
934 bits <<= (idx * 4);
935 mask = 0xfULL << (idx * 4);
936
937 rdmsrl(hwc->config_base, ctrl_val);
938 ctrl_val &= ~mask;
939 ctrl_val |= bits;
940 err = checking_wrmsrl(hwc->config_base, ctrl_val);
941}
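Field-layout illustration for the fixed-counter control MSR as used above (not part of the patch):

/*
 * MSR_ARCH_PERFMON_FIXED_CTR_CTRL gives each fixed counter a 4-bit
 * field at bit (idx * 4).  For fixed counter 1 counting both user and
 * kernel with PMI enabled: bits = 0x8 | 0x2 | 0x1 = 0xb, so ctrl_val
 * ends up with 0xb0 in that nibble.
 */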
942
943static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
944{
945 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
946 intel_pmu_enable_fixed(hwc, idx);
947 return;
948 }
949
950 x86_pmu_enable_counter(hwc, idx);
951}
952
953static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
954{
955 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
956
957 if (cpuc->enabled)
958 x86_pmu_enable_counter(hwc, idx);
959 else
960 x86_pmu_disable_counter(hwc, idx);
961}
962
963static int
964fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
965{
966 unsigned int event;
967
968 if (!x86_pmu.num_counters_fixed)
969 return -1;
970
971 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
972
973 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
974 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
975 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
976 return X86_PMC_IDX_FIXED_CPU_CYCLES;
977 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
978 return X86_PMC_IDX_FIXED_BUS_CYCLES;
979
980 return -1;
981}
982
983/*
984 * Find a PMC slot for the freshly enabled / scheduled in counter:
985 */
986static int x86_pmu_enable(struct perf_counter *counter)
987{
988 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
989 struct hw_perf_counter *hwc = &counter->hw;
990 int idx;
991
992 idx = fixed_mode_idx(counter, hwc);
993 if (idx >= 0) {
994 /*
995 * Try to get the fixed counter, if that is already taken
996 * then try to get a generic counter:
997 */
998 if (test_and_set_bit(idx, cpuc->used_mask))
999 goto try_generic;
1000
1001 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1002 /*
1003 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
1004 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1005 */
1006 hwc->counter_base =
1007 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1008 hwc->idx = idx;
1009 } else {
1010 idx = hwc->idx;
1011 /* Try to get the previous generic counter again */
1012 if (test_and_set_bit(idx, cpuc->used_mask)) {
1013try_generic:
1014 idx = find_first_zero_bit(cpuc->used_mask,
1015 x86_pmu.num_counters);
1016 if (idx == x86_pmu.num_counters)
1017 return -EAGAIN;
1018
1019 set_bit(idx, cpuc->used_mask);
1020 hwc->idx = idx;
1021 }
1022 hwc->config_base = x86_pmu.eventsel;
1023 hwc->counter_base = x86_pmu.perfctr;
1024 }
1025
1026 perf_counters_lapic_init();
1027
1028 x86_pmu.disable(hwc, idx);
1029
1030 cpuc->counters[idx] = counter;
1031 set_bit(idx, cpuc->active_mask);
1032
1033 x86_perf_counter_set_period(counter, hwc, idx);
1034 x86_pmu.enable(hwc, idx);
1035
1036 return 0;
1037}
1038
1039static void x86_pmu_unthrottle(struct perf_counter *counter)
1040{
1041 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1042 struct hw_perf_counter *hwc = &counter->hw;
1043
1044 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1045 cpuc->counters[hwc->idx] != counter))
1046 return;
1047
1048 x86_pmu.enable(hwc, hwc->idx);
1049}
1050
1051void perf_counter_print_debug(void)
1052{
1053 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1054 struct cpu_hw_counters *cpuc;
1055 unsigned long flags;
1056 int cpu, idx;
1057
1058 if (!x86_pmu.num_counters)
1059 return;
1060
1061 local_irq_save(flags);
1062
1063 cpu = smp_processor_id();
1064 cpuc = &per_cpu(cpu_hw_counters, cpu);
1065
1066 if (x86_pmu.version >= 2) {
1067 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1068 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1069 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1070 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1071
1072 pr_info("\n");
1073 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1074 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1075 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1076 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1077 }
1078 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1079
1080 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1081 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1082 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1083
1084 prev_left = per_cpu(prev_left[idx], cpu);
1085
1086 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1087 cpu, idx, pmc_ctrl);
1088 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1089 cpu, idx, pmc_count);
1090 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1091 cpu, idx, prev_left);
1092 }
1093 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1094 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1095
1096 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1097 cpu, idx, pmc_count);
1098 }
1099 local_irq_restore(flags);
1100}
1101
1102static void x86_pmu_disable(struct perf_counter *counter)
1103{
1104 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1105 struct hw_perf_counter *hwc = &counter->hw;
1106 int idx = hwc->idx;
1107
1108 /*
1109 * Must be done before we disable, otherwise the nmi handler
1110 * could reenable again:
1111 */
1112 clear_bit(idx, cpuc->active_mask);
1113 x86_pmu.disable(hwc, idx);
1114
1115 /*
1116 * Make sure the cleared pointer becomes visible before we
1117 * (potentially) free the counter:
1118 */
1119 barrier();
1120
1121 /*
1122 * Drain the remaining delta count out of a counter
1123 * that we are disabling:
1124 */
1125 x86_perf_counter_update(counter, hwc, idx);
1126 cpuc->counters[idx] = NULL;
1127 clear_bit(idx, cpuc->used_mask);
1128}
1129
1130/*
1131 * Save and restart an expired counter. Called by NMI contexts,
1132 * so it has to be careful about preempting normal counter ops:
1133 */
1134static int intel_pmu_save_and_restart(struct perf_counter *counter)
1135{
1136 struct hw_perf_counter *hwc = &counter->hw;
1137 int idx = hwc->idx;
1138 int ret;
1139
1140 x86_perf_counter_update(counter, hwc, idx);
1141 ret = x86_perf_counter_set_period(counter, hwc, idx);
1142
1143 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1144 intel_pmu_enable_counter(hwc, idx);
1145
1146 return ret;
1147}
1148
1149static void intel_pmu_reset(void)
1150{
1151 unsigned long flags;
1152 int idx;
1153
1154 if (!x86_pmu.num_counters)
1155 return;
1156
1157 local_irq_save(flags);
1158
1159 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1160
1161 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1162 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1163 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1164 }
1165 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1166 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1167 }
1168
1169 local_irq_restore(flags);
1170}
1171
1172
1173/*
1174 * This handler is triggered by the local APIC, so the APIC IRQ handling
1175 * rules apply:
1176 */
1177static int intel_pmu_handle_irq(struct pt_regs *regs)
1178{
1179 struct perf_sample_data data;
1180 struct cpu_hw_counters *cpuc;
1181 int bit, cpu, loops;
1182 u64 ack, status;
1183
1184 data.regs = regs;
1185 data.addr = 0;
1186
1187 cpu = smp_processor_id();
1188 cpuc = &per_cpu(cpu_hw_counters, cpu);
1189
1190 perf_disable();
1191 status = intel_pmu_get_status();
1192 if (!status) {
1193 perf_enable();
1194 return 0;
1195 }
1196
1197 loops = 0;
1198again:
1199 if (++loops > 100) {
1200 WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
1201 perf_counter_print_debug();
1202 intel_pmu_reset();
1203 perf_enable();
1204 return 1;
1205 }
1206
1207 inc_irq_stat(apic_perf_irqs);
1208 ack = status;
1209 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1210 struct perf_counter *counter = cpuc->counters[bit];
1211
1212 clear_bit(bit, (unsigned long *) &status);
1213 if (!test_bit(bit, cpuc->active_mask))
1214 continue;
1215
1216 if (!intel_pmu_save_and_restart(counter))
1217 continue;
1218
1219 if (perf_counter_overflow(counter, 1, &data))
1220 intel_pmu_disable_counter(&counter->hw, bit);
1221 }
1222
1223 intel_pmu_ack_status(ack);
1224
1225 /*
1226 * Repeat if there is more work to be done:
1227 */
1228 status = intel_pmu_get_status();
1229 if (status)
1230 goto again;
1231
1232 perf_enable();
1233
1234 return 1;
1235}
1236
1237static int amd_pmu_handle_irq(struct pt_regs *regs)
1238{
1239 struct perf_sample_data data;
1240 struct cpu_hw_counters *cpuc;
1241 struct perf_counter *counter;
1242 struct hw_perf_counter *hwc;
1243 int cpu, idx, handled = 0;
1244 u64 val;
1245
1246 data.regs = regs;
1247 data.addr = 0;
1248
1249 cpu = smp_processor_id();
1250 cpuc = &per_cpu(cpu_hw_counters, cpu);
1251
1252 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1253 if (!test_bit(idx, cpuc->active_mask))
1254 continue;
1255
1256 counter = cpuc->counters[idx];
1257 hwc = &counter->hw;
1258
1259 val = x86_perf_counter_update(counter, hwc, idx);
1260 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1261 continue;
1262
1263 /*
1264 * counter overflow
1265 */
1266 handled = 1;
1267 data.period = counter->hw.last_period;
1268
1269 if (!x86_perf_counter_set_period(counter, hwc, idx))
1270 continue;
1271
1272 if (perf_counter_overflow(counter, 1, &data))
1273 amd_pmu_disable_counter(hwc, idx);
1274 }
1275
1276 if (handled)
1277 inc_irq_stat(apic_perf_irqs);
1278
1279 return handled;
1280}
1281
1282void smp_perf_pending_interrupt(struct pt_regs *regs)
1283{
1284 irq_enter();
1285 ack_APIC_irq();
1286 inc_irq_stat(apic_pending_irqs);
1287 perf_counter_do_pending();
1288 irq_exit();
1289}
1290
1291void set_perf_counter_pending(void)
1292{
1293 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1294}
1295
1296void perf_counters_lapic_init(void)
1297{
1298 if (!x86_pmu_initialized())
1299 return;
1300
1301 /*
1302 * Always use NMI for PMU
1303 */
1304 apic_write(APIC_LVTPC, APIC_DM_NMI);
1305}
1306
1307static int __kprobes
1308perf_counter_nmi_handler(struct notifier_block *self,
1309 unsigned long cmd, void *__args)
1310{
1311 struct die_args *args = __args;
1312 struct pt_regs *regs;
1313
1314 if (!atomic_read(&active_counters))
1315 return NOTIFY_DONE;
1316
1317 switch (cmd) {
1318 case DIE_NMI:
1319 case DIE_NMI_IPI:
1320 break;
1321
1322 default:
1323 return NOTIFY_DONE;
1324 }
1325
1326 regs = args->regs;
1327
1328 apic_write(APIC_LVTPC, APIC_DM_NMI);
1329 /*
1330 * Can't rely on the handled return value to say it was our NMI, two
1331 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
1332 *
1333 * If the first NMI handles both, the latter will be empty and daze
1334 * the CPU.
1335 */
1336 x86_pmu.handle_irq(regs);
1337
1338 return NOTIFY_STOP;
1339}
1340
1341static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1342 .notifier_call = perf_counter_nmi_handler,
1343 .next = NULL,
1344 .priority = 1
1345};
1346
1347static struct x86_pmu intel_pmu = {
1348 .name = "Intel",
1349 .handle_irq = intel_pmu_handle_irq,
1350 .disable_all = intel_pmu_disable_all,
1351 .enable_all = intel_pmu_enable_all,
1352 .enable = intel_pmu_enable_counter,
1353 .disable = intel_pmu_disable_counter,
1354 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1355 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1356 .event_map = intel_pmu_event_map,
1357 .raw_event = intel_pmu_raw_event,
1358 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1359 /*
1360 * Intel PMCs cannot be accessed sanely above 32 bit width,
1361 * so we install an artificial 1<<31 period regardless of
1362 * the generic counter period:
1363 */
1364 .max_period = (1ULL << 31) - 1,
1365};
1366
1367static struct x86_pmu amd_pmu = {
1368 .name = "AMD",
1369 .handle_irq = amd_pmu_handle_irq,
1370 .disable_all = amd_pmu_disable_all,
1371 .enable_all = amd_pmu_enable_all,
1372 .enable = amd_pmu_enable_counter,
1373 .disable = amd_pmu_disable_counter,
1374 .eventsel = MSR_K7_EVNTSEL0,
1375 .perfctr = MSR_K7_PERFCTR0,
1376 .event_map = amd_pmu_event_map,
1377 .raw_event = amd_pmu_raw_event,
1378 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1379 .num_counters = 4,
1380 .counter_bits = 48,
1381 .counter_mask = (1ULL << 48) - 1,
1382 /* use highest bit to detect overflow */
1383 .max_period = (1ULL << 47) - 1,
1384};
1385
1386static int intel_pmu_init(void)
1387{
1388 union cpuid10_edx edx;
1389 union cpuid10_eax eax;
1390 unsigned int unused;
1391 unsigned int ebx;
1392 int version;
1393
1394 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
1395 return -ENODEV;
1396
1397 /*
1398 * Check whether the Architectural PerfMon supports
1399 * Branch Misses Retired Event or not.
1400 */
1401 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1402 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1403 return -ENODEV;
1404
1405 version = eax.split.version_id;
1406 if (version < 2)
1407 return -ENODEV;
1408
1409 x86_pmu = intel_pmu;
1410 x86_pmu.version = version;
1411 x86_pmu.num_counters = eax.split.num_counters;
1412 x86_pmu.counter_bits = eax.split.bit_width;
1413 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
1414
1415 /*
1416 * Quirk: v2 perfmon does not report fixed-purpose counters, so
1417 * assume at least 3 counters:
1418 */
1419 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1420
1421 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1422
1423 /*
1424 * Install the hw-cache-events table:
1425 */
1426 switch (boot_cpu_data.x86_model) {
1427 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1428 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1429 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1430 case 29: /* six-core 45 nm xeon "Dunnington" */
1431 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
1432 sizeof(hw_cache_event_ids));
1433
1434 pr_cont("Core2 events, ");
1435 break;
1436 default:
1437 case 26:
1438 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1439 sizeof(hw_cache_event_ids));
1440
1441 pr_cont("Nehalem/Corei7 events, ");
1442 break;
1443 case 28:
1444 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
1445 sizeof(hw_cache_event_ids));
1446
1447 pr_cont("Atom events, ");
1448 break;
1449 }
1450 return 0;
1451}
1452
1453static int amd_pmu_init(void)
1454{
1455 x86_pmu = amd_pmu;
1456
1457 switch (boot_cpu_data.x86) {
1458 case 0x0f:
1459 case 0x10:
1460 case 0x11:
1461 memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
1462 sizeof(hw_cache_event_ids));
1463
1464 pr_cont("AMD Family 0f/10/11 events, ");
1465 break;
1466 }
1467 return 0;
1468}
1469
1470void __init init_hw_perf_counters(void)
1471{
1472 int err;
1473
1474 pr_info("Performance Counters: ");
1475
1476 switch (boot_cpu_data.x86_vendor) {
1477 case X86_VENDOR_INTEL:
1478 err = intel_pmu_init();
1479 break;
1480 case X86_VENDOR_AMD:
1481 err = amd_pmu_init();
1482 break;
1483 default:
1484 return;
1485 }
1486 if (err != 0) {
1487 pr_cont("no PMU driver, software counters only.\n");
1488 return;
1489 }
1490
1491 pr_cont("%s PMU driver.\n", x86_pmu.name);
1492
1493 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1494 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1495 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1496 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1497 }
1498 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1499 perf_max_counters = x86_pmu.num_counters;
1500
1501 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1502 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1503 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1504 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1505 }
1506
1507 perf_counter_mask |=
1508 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1509
1510 perf_counters_lapic_init();
1511 register_die_notifier(&perf_counter_nmi_notifier);
1512
1513 pr_info("... version: %d\n", x86_pmu.version);
1514 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1515 pr_info("... generic counters: %d\n", x86_pmu.num_counters);
1516 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1517 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1518 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
1519 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1520}
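A worked example of the mask composed above (illustrative; assumes X86_PMC_IDX_FIXED is 32):

/*
 * For a PMU with 4 generic and 3 fixed counters:
 *
 *   generic part: (1 << 4) - 1              = 0x000000000000000f
 *   fixed part:   ((1LL << 3) - 1) << 32    = 0x0000000700000000
 *   perf_counter_mask                       = 0x000000070000000f
 */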
1521
1522static inline void x86_pmu_read(struct perf_counter *counter)
1523{
1524 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1525}
1526
1527static const struct pmu pmu = {
1528 .enable = x86_pmu_enable,
1529 .disable = x86_pmu_disable,
1530 .read = x86_pmu_read,
1531 .unthrottle = x86_pmu_unthrottle,
1532};
1533
1534const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1535{
1536 int err;
1537
1538 err = __hw_perf_counter_init(counter);
1539 if (err)
1540 return ERR_PTR(err);
1541
1542 return &pmu;
1543}
1544
1545/*
1546 * callchain support
1547 */
1548
1549static inline
1550void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1551{
1552 if (entry->nr < MAX_STACK_DEPTH)
1553 entry->ip[entry->nr++] = ip;
1554}
1555
1556static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1557static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1558
1559
1560static void
1561backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1562{
1563 /* Ignore warnings */
1564}
1565
1566static void backtrace_warning(void *data, char *msg)
1567{
1568 /* Ignore warnings */
1569}
1570
1571static int backtrace_stack(void *data, char *name)
1572{
1573 /* Don't bother with IRQ stacks for now */
1574 return -1;
1575}
1576
1577static void backtrace_address(void *data, unsigned long addr, int reliable)
1578{
1579 struct perf_callchain_entry *entry = data;
1580
1581 if (reliable)
1582 callchain_store(entry, addr);
1583}
1584
1585static const struct stacktrace_ops backtrace_ops = {
1586 .warning = backtrace_warning,
1587 .warning_symbol = backtrace_warning_symbol,
1588 .stack = backtrace_stack,
1589 .address = backtrace_address,
1590};
1591
1592static void
1593perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1594{
1595 unsigned long bp;
1596 char *stack;
1597 int nr = entry->nr;
1598
1599 callchain_store(entry, instruction_pointer(regs));
1600
1601 stack = ((char *)regs + sizeof(struct pt_regs));
1602#ifdef CONFIG_FRAME_POINTER
1603 bp = frame_pointer(regs);
1604#else
1605 bp = 0;
1606#endif
1607
1608 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1609
1610 entry->kernel = entry->nr - nr;
1611}
1612
1613
1614struct stack_frame {
1615 const void __user *next_fp;
1616 unsigned long return_address;
1617};
1618
1619static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1620{
1621 int ret;
1622
1623 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1624 return 0;
1625
1626 ret = 1;
1627 pagefault_disable();
1628 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1629 ret = 0;
1630 pagefault_enable();
1631
1632 return ret;
1633}
1634
1635static void
1636perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1637{
1638 struct stack_frame frame;
1639 const void __user *fp;
1640 int nr = entry->nr;
1641
1642 regs = (struct pt_regs *)current->thread.sp0 - 1;
1643 fp = (void __user *)regs->bp;
1644
1645 callchain_store(entry, regs->ip);
1646
1647 while (entry->nr < MAX_STACK_DEPTH) {
1648 frame.next_fp = NULL;
1649 frame.return_address = 0;
1650
1651 if (!copy_stack_frame(fp, &frame))
1652 break;
1653
1654 if ((unsigned long)fp < user_stack_pointer(regs))
1655 break;
1656
1657 callchain_store(entry, frame.return_address);
1658 fp = frame.next_fp;
1659 }
1660
1661 entry->user = entry->nr - nr;
1662}
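The loop above relies on frame-pointer call frames; a sketch of the assumed layout (not part of the patch):

/*
 * A prologue that preserves the frame pointer, e.g.
 *
 *     push %rbp
 *     mov  %rsp, %rbp
 *
 * leaves the saved frame pointer (struct stack_frame.next_fp) directly
 * below the caller's return address, which is what copy_stack_frame()
 * reads.  User code built with -fomit-frame-pointer terminates the
 * walk early.
 */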
1663
1664static void
1665perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1666{
1667 int is_user;
1668
1669 if (!regs)
1670 return;
1671
1672 is_user = user_mode(regs);
1673
1674 if (!current || current->pid == 0)
1675 return;
1676
1677 if (is_user && current->state != TASK_RUNNING)
1678 return;
1679
1680 if (!is_user)
1681 perf_callchain_kernel(regs, entry);
1682
1683 if (current->mm)
1684 perf_callchain_user(regs, entry);
1685}
1686
1687struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1688{
1689 struct perf_callchain_entry *entry;
1690
1691 if (in_nmi())
1692 entry = &__get_cpu_var(nmi_entry);
1693 else
1694 entry = &__get_cpu_var(irq_entry);
1695
1696 entry->nr = 0;
1697 entry->hv = 0;
1698 entry->kernel = 0;
1699 entry->user = 0;
1700
1701 perf_do_callchain(regs, entry);
1702
1703 return entry;
1704}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765a..48bfe1386038 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */ 20 */
21 21
22 22#include <linux/kernel.h>
23#include <asm/ds.h>
24
25#include <linux/errno.h>
26#include <linux/string.h> 23#include <linux/string.h>
27#include <linux/slab.h> 24#include <linux/errno.h>
28#include <linux/sched.h> 25#include <linux/sched.h>
26#include <linux/slab.h>
29#include <linux/mm.h> 27#include <linux/mm.h>
30#include <linux/kernel.h> 28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31 31
32#include "ds_selftest.h"
32 33
33/* 34/*
34 * The configuration for a particular DS hardware implementation. 35 * The configuration for a particular DS hardware implementation:
35 */ 36 */
36struct ds_configuration { 37struct ds_configuration {
37 /* the name of the configuration */ 38 /* The name of the configuration: */
38 const char *name; 39 const char *name;
39 /* the size of one pointer-typed field in the DS structure and 40
40 in the BTS and PEBS buffers in bytes; 41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
41 this covers the first 8 DS fields related to buffer management. */ 42 unsigned char sizeof_ptr_field;
42 unsigned char sizeof_field; 43
43 /* the size of a BTS/PEBS record in bytes */ 44 /* The size of a BTS/PEBS record in bytes: */
44 unsigned char sizeof_rec[2]; 45 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed 46
46 * by enum ds_feature */ 47 /* The number of pebs counter reset values in the DS structure. */
47 unsigned long ctl[dsf_ctl_max]; 48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
48}; 52};
49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); 53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
50 58
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) 59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
52 61
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ 62/* BTS and PEBS buffer alignment: */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ 63#define DS_ALIGNMENT (1 << 3)
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56 64
57#define BTS_CONTROL \ 65/* Number of buffer pointers in DS: */
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ 66#define NUM_DS_PTR_FIELDS 8
59 ds_cfg.ctl[dsf_bts_overflow])
60 67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
61 77
62/* 78/*
63 * A BTS or PEBS tracer. 79 * A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
66 * to identify tracers. 82 * to identify tracers.
67 */ 83 */
68struct ds_tracer { 84struct ds_tracer {
69 /* the DS context (partially) owned by this tracer */ 85 /* The DS context (partially) owned by this tracer. */
70 struct ds_context *context; 86 struct ds_context *context;
71 /* the buffer provided on ds_request() and its size in bytes */ 87 /* The buffer provided on ds_request() and its size in bytes. */
72 void *buffer; 88 void *buffer;
73 size_t size; 89 size_t size;
74}; 90};
75 91
76struct bts_tracer { 92struct bts_tracer {
77 /* the common DS part */ 93 /* The common DS part: */
78 struct ds_tracer ds; 94 struct ds_tracer ds;
79 /* the trace including the DS configuration */ 95
80 struct bts_trace trace; 96 /* The trace including the DS configuration: */
81 /* buffer overflow notification function */ 97 struct bts_trace trace;
82 bts_ovfl_callback_t ovfl; 98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
83}; 104};
84 105
85struct pebs_tracer { 106struct pebs_tracer {
86 /* the common DS part */ 107 /* The common DS part: */
87 struct ds_tracer ds; 108 struct ds_tracer ds;
88 /* the trace including the DS configuration */ 109
89 struct pebs_trace trace; 110 /* The trace including the DS configuration: */
90 /* buffer overflow notification function */ 111 struct pebs_trace trace;
91 pebs_ovfl_callback_t ovfl; 112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
92}; 115};
93 116
94/* 117/*
@@ -97,6 +120,7 @@ struct pebs_tracer {
97 * 120 *
98 * The DS configuration consists of the following fields; different 121 * The DS configuration consists of the following fields; different
 99 * architectures vary in the size of those fields. 122 * architectures vary in the size of those fields.
123 *
100 * - double-word aligned base linear address of the BTS buffer 124 * - double-word aligned base linear address of the BTS buffer
101 * - write pointer into the BTS buffer 125 * - write pointer into the BTS buffer
102 * - end linear address of the BTS buffer (one byte beyond the end of 126 * - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
135}; 159};
136 160
137enum ds_qualifier { 161enum ds_qualifier {
138 ds_bts = 0, 162 ds_bts = 0,
139 ds_pebs 163 ds_pebs
140}; 164};
141 165
142static inline unsigned long ds_get(const unsigned char *base, 166static inline unsigned long
143 enum ds_qualifier qual, enum ds_field field) 167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
144{ 168{
145 base += (ds_cfg.sizeof_field * (field + (4 * qual))); 169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
146 return *(unsigned long *)base; 170 return *(unsigned long *)base;
147} 171}
148 172
149static inline void ds_set(unsigned char *base, enum ds_qualifier qual, 173static inline void
150 enum ds_field field, unsigned long value) 174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
151{ 176{
152 base += (ds_cfg.sizeof_field * (field + (4 * qual))); 177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
153 (*(unsigned long *)base) = value; 178 (*(unsigned long *)base) = value;
154} 179}
155 180
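A quick offset example for ds_get()/ds_set() (assuming the 64-bit configuration where sizeof_ptr_field is 8; not part of the patch):

/*
 * The BTS pointer fields start at offset 8 * (field + 0) and the PEBS
 * pointer fields at 8 * (field + 4), so the BTS buffer base lives at
 * byte 0 of the DS area and the PEBS buffer base at byte 32.
 */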
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
159 */ 184 */
160static DEFINE_SPINLOCK(ds_lock); 185static DEFINE_SPINLOCK(ds_lock);
161 186
162
163/* 187/*
164 * We either support (system-wide) per-cpu or per-thread allocation. 188 * We either support (system-wide) per-cpu or per-thread allocation.
165 * We distinguish the two based on the task_struct pointer, where a 189 * We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
178 */ 202 */
179static atomic_t tracers = ATOMIC_INIT(0); 203static atomic_t tracers = ATOMIC_INIT(0);
180 204
181static inline void get_tracer(struct task_struct *task) 205static inline int get_tracer(struct task_struct *task)
182{ 206{
183 if (task) 207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
184 atomic_inc(&tracers); 215 atomic_inc(&tracers);
185 else 216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
186 atomic_dec(&tracers); 220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
187} 227}
188 228
189static inline void put_tracer(struct task_struct *task) 229static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
194 atomic_inc(&tracers); 234 atomic_inc(&tracers);
195} 235}
196 236
197static inline int check_tracer(struct task_struct *task)
198{
199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
202}
203
204
205/* 237/*
206 * The DS context is either attached to a thread or to a cpu: 238 * The DS context is either attached to a thread or to a cpu:
207 * - in the former case, the thread_struct contains a pointer to the 239 * - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
213 * deallocated when the last user puts the context. 245 * deallocated when the last user puts the context.
214 */ 246 */
215struct ds_context { 247struct ds_context {
216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ 248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
217 unsigned char ds[MAX_SIZEOF_DS]; 249 unsigned char ds[MAX_SIZEOF_DS];
218 /* the owner of the BTS and PEBS configuration, respectively */ 250
219 struct bts_tracer *bts_master; 251 /* The owner of the BTS and PEBS configuration, respectively: */
220 struct pebs_tracer *pebs_master; 252 struct bts_tracer *bts_master;
221 /* use count */ 253 struct pebs_tracer *pebs_master;
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
230 254
231static DEFINE_PER_CPU(struct ds_context *, system_context_array); 255 /* Use count: */
256 unsigned long count;
232 257
233#define system_context per_cpu(system_context_array, smp_processor_id()) 258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
234 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context);
235 269
236static inline struct ds_context *ds_get_context(struct task_struct *task) 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
237{ 272{
238 struct ds_context **p_context = 273 struct ds_context **p_context =
239 (task ? &task->thread.ds_ctx : &system_context); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
240 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
242 unsigned long irq;
243 277
244 /* Chances are small that we already have a context. */ 278 /* Chances are small that we already have a context. */
245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); 279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
246 if (!new_context) 280 if (!new_context)
247 return NULL; 281 return NULL;
248 282
249 spin_lock_irqsave(&ds_lock, irq); 283 spin_lock_irq(&ds_lock);
250 284
251 context = *p_context; 285 context = *p_context;
252 if (!context) { 286 if (likely(!context)) {
253 context = new_context; 287 context = new_context;
254 288
255 context->this = p_context; 289 context->this = p_context;
256 context->task = task; 290 context->task = task;
291 context->cpu = cpu;
257 context->count = 0; 292 context->count = 0;
258 293
259 if (task)
260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
261
262 if (!task || (task == current))
263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
264
265 *p_context = context; 294 *p_context = context;
266 } 295 }
267 296
268 context->count++; 297 context->count++;
269 298
270 spin_unlock_irqrestore(&ds_lock, irq); 299 spin_unlock_irq(&ds_lock);
271 300
272 if (context != new_context) 301 if (context != new_context)
273 kfree(new_context); 302 kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
275 return context; 304 return context;
276} 305}
277 306
278static inline void ds_put_context(struct ds_context *context) 307static void ds_put_context(struct ds_context *context)
279{ 308{
309 struct task_struct *task;
280 unsigned long irq; 310 unsigned long irq;
281 311
282 if (!context) 312 if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
291 321
292 *(context->this) = NULL; 322 *(context->this) = NULL;
293 323
294 if (context->task) 324 task = context->task;
295 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); 325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
296 328
297 if (!context->task || (context->task == current)) 329 /*
298 wrmsrl(MSR_IA32_DS_AREA, 0); 330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
299 338
300 spin_unlock_irqrestore(&ds_lock, irq); 339 spin_unlock_irqrestore(&ds_lock, irq);
301 340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
302 kfree(context); 345 kfree(context);
303} 346}
304 347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
 361	 * the same configuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
305 373
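ds_install_ds_area() above hands the 64-bit DS buffer address to wrmsr_on_cpu(), which wants the MSR value split into low and high 32-bit halves. A small illustration of that split in plain C (the address is invented for the example):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical DS area address, only to show the split. */
	uint64_t ds = 0xffff880012345680ULL;
	uint32_t lo = (uint32_t)ds;		/* bits  0..31 */
	uint32_t hi = (uint32_t)(ds >> 32);	/* bits 32..63 */

	/* These are the last two arguments passed to wrmsr_on_cpu(). */
	printf("lo=0x%08x hi=0x%08x\n", lo, hi);	/* lo=0x12345680 hi=0xffff8800 */
	return 0;
}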
306/* 374/*
307 * Call the tracer's callback on a buffer overflow. 375 * Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
332 * The remainder of any partially written record is zeroed out. 400 * The remainder of any partially written record is zeroed out.
333 * 401 *
334 * context: the DS context 402 * context: the DS context
335 * qual: the buffer type 403 * qual: the buffer type
336 * record: the data to write 404 * record: the data to write
337 * size: the size of the data 405 * size: the size of the data
338 */ 406 */
339static int ds_write(struct ds_context *context, enum ds_qualifier qual, 407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size) 408 const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
349 unsigned long write_size, adj_write_size; 417 unsigned long write_size, adj_write_size;
350 418
351 /* 419 /*
352 * write as much as possible without producing an 420 * Write as much as possible without producing an
353 * overflow interrupt. 421 * overflow interrupt.
354 * 422 *
355 * interrupt_threshold must either be 423 * Interrupt_threshold must either be
356 * - bigger than absolute_maximum or 424 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum 425 * - point to a record between buffer_base and absolute_maximum
358 * 426 *
359 * index points to a valid record. 427 * Index points to a valid record.
360 */ 428 */
361 base = ds_get(context->ds, qual, ds_buffer_base); 429 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index); 430 index = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
365 433
366 write_end = min(end, int_th); 434 write_end = min(end, int_th);
367 435
368 /* if we are already beyond the interrupt threshold, 436 /*
369 * we fill the entire buffer */ 437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
370 if (write_end <= index) 440 if (write_end <= index)
371 write_end = end; 441 write_end = end;
372 442
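The net effect of the threshold handling in ds_write() above: a write normally stops at the interrupt threshold so the overflow notification can fire, but once the index is already at or past the threshold the buffer is filled to its real end. A worked sketch of that decision with invented byte offsets:

#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long end = 960, int_th = 720;	/* bytes into the buffer */
	unsigned long index;

	for (index = 600; index <= 840; index += 120) {
		unsigned long write_end = MIN(end, int_th);

		/* Already beyond the threshold: fill the entire buffer. */
		if (write_end <= index)
			write_end = end;

		printf("index=%lu -> write up to %lu\n", index, write_end);
	}
	/* Prints 600 -> 720, 720 -> 960, 840 -> 960. */
	return 0;
}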
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual]; 454 adj_write_size *= ds_cfg.sizeof_rec[qual];
385 455
386 /* zero out trailing bytes */ 456 /* Zero out trailing bytes. */
387 memset((char *)index + write_size, 0, 457 memset((char *)index + write_size, 0,
388 adj_write_size - write_size); 458 adj_write_size - write_size);
389 index += adj_write_size; 459 index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
410 * Later architectures use 64bit pointers throughout, whereas earlier 480 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode. 481 * architectures use 32bit pointers in 32bit mode.
412 * 482 *
413 * We compute the base address for the first 8 fields based on: 483 * We compute the base address for the fields based on:
414 * - the field size stored in the DS configuration 484 * - the field size stored in the DS configuration
415 * - the relative field position 485 * - the relative field position
416 * 486 *
@@ -431,23 +501,23 @@ enum bts_field {
431 bts_to, 501 bts_to,
432 bts_flags, 502 bts_flags,
433 503
434 bts_qual = bts_from, 504 bts_qual = bts_from,
435 bts_jiffies = bts_to, 505 bts_clock = bts_to,
436 bts_pid = bts_flags, 506 bts_pid = bts_flags,
437 507
438 bts_qual_mask = (bts_qual_max - 1), 508 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask) 509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440}; 510};
441 511
442static inline unsigned long bts_get(const char *base, enum bts_field field) 512static inline unsigned long bts_get(const char *base, enum bts_field field)
443{ 513{
444 base += (ds_cfg.sizeof_field * field); 514 base += (ds_cfg.sizeof_ptr_field * field);
445 return *(unsigned long *)base; 515 return *(unsigned long *)base;
446} 516}
447 517
448static inline void bts_set(char *base, enum bts_field field, unsigned long val) 518static inline void bts_set(char *base, enum bts_field field, unsigned long val)
449{ 519{
450	 base += (ds_cfg.sizeof_field * field);;		 520	 base += (ds_cfg.sizeof_ptr_field * field);
451 (*(unsigned long *)base) = val; 521 (*(unsigned long *)base) = val;
452} 522}
453 523
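bts_get()/bts_set() above locate a field purely as ds_cfg.sizeof_ptr_field * field. Assuming the 8-byte (DTES64) field width that ds_configure() selects further down, a branch record is three consecutive 8-byte slots; a tiny recomputation of those offsets:

#include <stdio.h>

enum example_bts_field { ex_bts_from, ex_bts_to, ex_bts_flags };

int main(void)
{
	unsigned int sizeof_ptr_field = 8;	/* 4 on parts without DTES64 */
	unsigned int f;

	for (f = ex_bts_from; f <= ex_bts_flags; f++)
		printf("field %u at byte offset %u\n", f, sizeof_ptr_field * f);
	/* from at 0, to at 8, flags at 16: a 24-byte BTS record. */
	return 0;
}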
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
463 * 533 *
464 * return: bytes read/written on success; -Eerrno, otherwise 534 * return: bytes read/written on success; -Eerrno, otherwise
465 */ 535 */
466static int bts_read(struct bts_tracer *tracer, const void *at, 536static int
467 struct bts_struct *out) 537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
468{ 538{
469 if (!tracer) 539 if (!tracer)
470 return -EINVAL; 540 return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
478 memset(out, 0, sizeof(*out)); 548 memset(out, 0, sizeof(*out));
479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { 549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); 550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); 551 out->variant.event.clock = bts_get(at, bts_clock);
482 out->variant.timestamp.pid = bts_get(at, bts_pid); 552 out->variant.event.pid = bts_get(at, bts_pid);
483 } else { 553 } else {
484 out->qualifier = bts_branch; 554 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from); 555 out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
516 case bts_task_arrives: 586 case bts_task_arrives:
517 case bts_task_departs: 587 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier)); 588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); 589 bts_set(raw, bts_clock, in->variant.event.clock);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid); 590 bts_set(raw, bts_pid, in->variant.event.pid);
521 break; 591 break;
522 default: 592 default:
523 return -EINVAL; 593 return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
555 unsigned int flags) { 625 unsigned int flags) {
556 unsigned long buffer, adj; 626 unsigned long buffer, adj;
557 627
558 /* adjust the buffer address and size to meet alignment 628 /*
629 * Adjust the buffer address and size to meet alignment
559 * constraints: 630 * constraints:
560 * - buffer is double-word aligned 631 * - buffer is double-word aligned
561 * - size is multiple of record size 632 * - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
577 trace->begin = (void *)buffer; 648 trace->begin = (void *)buffer;
578 trace->top = trace->begin; 649 trace->top = trace->begin;
579 trace->end = (void *)(buffer + size); 650 trace->end = (void *)(buffer + size);
580 /* The value for 'no threshold' is -1, which will set the 651 /*
652 * The value for 'no threshold' is -1, which will set the
581 * threshold outside of the buffer, just like we want it. 653 * threshold outside of the buffer, just like we want it.
582 */ 654 */
655 ith *= ds_cfg.sizeof_rec[qual];
583 trace->ith = (void *)(buffer + size - ith); 656 trace->ith = (void *)(buffer + size - ith);
584 657
585 trace->flags = flags; 658 trace->flags = flags;
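With the added `ith *= ds_cfg.sizeof_rec[qual];` the threshold argument is now counted in records rather than bytes before it is subtracted from the end of the buffer. A worked example of that arithmetic with invented numbers; note that ds.c does this on unsigned longs, so the th = (size_t)-1 case wraps to one record past the end, exactly as the comment promises:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	unsigned long buffer = 0x1000;		/* pretend buffer start address */
	size_t size = 960;			/* buffer size in bytes */
	size_t sizeof_rec = 24;			/* one BTS record with 8-byte fields */
	size_t th[] = { 2, (size_t)-1 };	/* in records; -1 means no threshold */
	int i;

	for (i = 0; i < 2; i++) {
		size_t ith = th[i] * sizeof_rec;
		unsigned long thresh = buffer + size - ith;

		printf("th=%ld -> threshold at buffer start + %ld\n",
		       (long)th[i], (long)(thresh - buffer));
	}
	/* Prints: th=2 -> +912 (two records before the end), th=-1 -> +984 (outside). */
	return 0;
}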
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
588 661
589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, 662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
590 enum ds_qualifier qual, struct task_struct *task, 663 enum ds_qualifier qual, struct task_struct *task,
591 void *base, size_t size, size_t th, unsigned int flags) 664 int cpu, void *base, size_t size, size_t th)
592{ 665{
593 struct ds_context *context; 666 struct ds_context *context;
594 int error; 667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
595 673
596 error = -EINVAL; 674 error = -EINVAL;
597 if (!base) 675 if (!base)
598 goto out; 676 goto out;
599 677
600 /* we require some space to do alignment adjustments below */ 678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
601 error = -EINVAL; 683 error = -EINVAL;
602 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) 684 if (size < req_size)
603 goto out; 685 goto out;
604 686
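The reworked minimum-size check above only demands one record plus, if the caller's buffer is not DS-aligned, room for the alignment fix-up, instead of unconditionally requiring DS_ALIGNMENT + one record. A sketch of the check, assuming DS_ALIGNMENT is 8 (its real value is defined elsewhere in ds.c) and a 24-byte BTS record, which is why the selftest below can get away with its 24-byte, 8-byte-aligned small buffer:

#include <stdio.h>

#define DS_ALIGNMENT		8	/* assumed value for the example */
#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

static int size_ok(unsigned long base, unsigned long size, unsigned long sizeof_rec)
{
	unsigned long req_size = sizeof_rec;

	if (!IS_ALIGNED(base, DS_ALIGNMENT))
		req_size += DS_ALIGNMENT;	/* room for the later adjustment */

	return size >= req_size;		/* 0 becomes -EINVAL in ds_request() */
}

int main(void)
{
	printf("aligned,   24 bytes: %d\n", size_ok(0x1000, 24, 24));	/* 1 */
	printf("unaligned, 24 bytes: %d\n", size_ok(0x1001, 24, 24));	/* 0 */
	printf("unaligned, 32 bytes: %d\n", size_ok(0x1001, 32, 24));	/* 1 */
	return 0;
}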
605 if (th != (size_t)-1) { 687 if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
614 tracer->size = size; 696 tracer->size = size;
615 697
616 error = -ENOMEM; 698 error = -ENOMEM;
617 context = ds_get_context(task); 699 context = ds_get_context(task, cpu);
618 if (!context) 700 if (!context)
619 goto out; 701 goto out;
620 tracer->context = context; 702 tracer->context = context;
621 703
622 ds_init_ds_trace(trace, qual, base, size, th, flags); 704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
623 708
624 error = 0; 709 error = 0;
625 out: 710 out:
626 return error; 711 return error;
627} 712}
628 713
629struct bts_tracer *ds_request_bts(struct task_struct *task, 714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
630 void *base, size_t size, 715 void *base, size_t size,
631 bts_ovfl_callback_t ovfl, size_t th, 716 bts_ovfl_callback_t ovfl, size_t th,
632 unsigned int flags) 717 unsigned int flags)
633{ 718{
634 struct bts_tracer *tracer; 719 struct bts_tracer *tracer;
635 unsigned long irq;
636 int error; 720 int error;
637 721
722 /* Buffer overflow notification is not yet implemented. */
638 error = -EOPNOTSUPP; 723 error = -EOPNOTSUPP;
639 if (!ds_cfg.ctl[dsf_bts]) 724 if (ovfl)
640 goto out; 725 goto out;
641 726
642 /* buffer overflow notification is not yet implemented */ 727 error = get_tracer(task);
643 error = -EOPNOTSUPP; 728 if (error < 0)
644 if (ovfl)
645 goto out; 729 goto out;
646 730
647 error = -ENOMEM; 731 error = -ENOMEM;
648 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); 732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
649 if (!tracer) 733 if (!tracer)
650 goto out; 734 goto out_put_tracer;
651 tracer->ovfl = ovfl; 735 tracer->ovfl = ovfl;
652 736
737 /* Do some more error checking and acquire a tracing context. */
653 error = ds_request(&tracer->ds, &tracer->trace.ds, 738 error = ds_request(&tracer->ds, &tracer->trace.ds,
654 ds_bts, task, base, size, th, flags); 739 ds_bts, task, cpu, base, size, th);
655 if (error < 0) 740 if (error < 0)
656 goto out_tracer; 741 goto out_tracer;
657 742
658 743 /* Claim the bts part of the tracing context we acquired above. */
659 spin_lock_irqsave(&ds_lock, irq); 744 spin_lock_irq(&ds_lock);
660
661 error = -EPERM;
662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
665 745
666 error = -EPERM; 746 error = -EPERM;
667 if (tracer->ds.context->bts_master) 747 if (tracer->ds.context->bts_master)
668 goto out_put_tracer; 748 goto out_unlock;
669 tracer->ds.context->bts_master = tracer; 749 tracer->ds.context->bts_master = tracer;
670 750
671 spin_unlock_irqrestore(&ds_lock, irq); 751 spin_unlock_irq(&ds_lock);
672 752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
673 760
674 tracer->trace.read = bts_read; 761 tracer->trace.read = bts_read;
675 tracer->trace.write = bts_write; 762 tracer->trace.write = bts_write;
676 763
677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); 764 /* Start tracing. */
678 ds_resume_bts(tracer); 765 ds_resume_bts(tracer);
679 766
680 return tracer; 767 return tracer;
681 768
682 out_put_tracer:
683 put_tracer(task);
684 out_unlock: 769 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq); 770 spin_unlock_irq(&ds_lock);
686 ds_put_context(tracer->ds.context); 771 ds_put_context(tracer->ds.context);
687 out_tracer: 772 out_tracer:
688 kfree(tracer); 773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
689 out: 776 out:
690 return ERR_PTR(error); 777 return ERR_PTR(error);
691} 778}
692 779
693struct pebs_tracer *ds_request_pebs(struct task_struct *task, 780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
694 void *base, size_t size, 781 void *base, size_t size,
695 pebs_ovfl_callback_t ovfl, size_t th, 782 bts_ovfl_callback_t ovfl,
696 unsigned int flags) 783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
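ds_request_bts_task() and ds_request_bts_cpu() above form the new public request API: callers pick the task or cpu flavour explicitly instead of passing a task-or-NULL pointer. A hedged usage sketch, modelled directly on how ds_selftest.c further down in this patch drives the interface (buffer size, threshold and flags are taken from there):

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <asm/ds.h>

static unsigned char example_buffer[521];	/* same odd size the selftest picks */

static int example_trace_current(void)
{
	struct bts_tracer *tracer;

	tracer = ds_request_bts_task(current, example_buffer,
				     sizeof(example_buffer),
				     NULL,		/* overflow callback: not yet supported */
				     (size_t)-1,	/* no interrupt threshold */
				     BTS_KERNEL);
	if (IS_ERR(tracer))
		return PTR_ERR(tracer);

	/* ... run the code to be traced, then look at ds_read_bts(tracer) ... */

	ds_release_bts(tracer);		/* may sleep; *_noirq variants exist */
	return 0;
}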
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
697{ 799{
698 struct pebs_tracer *tracer; 800 struct pebs_tracer *tracer;
699 unsigned long irq;
700 int error; 801 int error;
701 802
702 /* buffer overflow notification is not yet implemented */ 803 /* Buffer overflow notification is not yet implemented. */
703 error = -EOPNOTSUPP; 804 error = -EOPNOTSUPP;
704 if (ovfl) 805 if (ovfl)
705 goto out; 806 goto out;
706 807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
707 error = -ENOMEM; 812 error = -ENOMEM;
708 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); 813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
709 if (!tracer) 814 if (!tracer)
710 goto out; 815 goto out_put_tracer;
711 tracer->ovfl = ovfl; 816 tracer->ovfl = ovfl;
712 817
818 /* Do some more error checking and acquire a tracing context. */
713 error = ds_request(&tracer->ds, &tracer->trace.ds, 819 error = ds_request(&tracer->ds, &tracer->trace.ds,
714 ds_pebs, task, base, size, th, flags); 820 ds_pebs, task, cpu, base, size, th);
715 if (error < 0) 821 if (error < 0)
716 goto out_tracer; 822 goto out_tracer;
717 823
718 spin_lock_irqsave(&ds_lock, irq); 824 /* Claim the pebs part of the tracing context we acquired above. */
719 825 spin_lock_irq(&ds_lock);
720 error = -EPERM;
721 if (!check_tracer(task))
722 goto out_unlock;
723 get_tracer(task);
724 826
725 error = -EPERM; 827 error = -EPERM;
726 if (tracer->ds.context->pebs_master) 828 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer; 829 goto out_unlock;
728 tracer->ds.context->pebs_master = tracer; 830 tracer->ds.context->pebs_master = tracer;
729 831
730 spin_unlock_irqrestore(&ds_lock, irq); 832 spin_unlock_irq(&ds_lock);
731 833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); 839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
733 ds_resume_pebs(tracer); 843 ds_resume_pebs(tracer);
734 844
735 return tracer; 845 return tracer;
736 846
737 out_put_tracer:
738 put_tracer(task);
739 out_unlock: 847 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq); 848 spin_unlock_irq(&ds_lock);
741 ds_put_context(tracer->ds.context); 849 ds_put_context(tracer->ds.context);
742 out_tracer: 850 out_tracer:
743 kfree(tracer); 851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
744 out: 854 out:
745 return ERR_PTR(error); 855 return ERR_PTR(error);
746} 856}
747 857
748void ds_release_bts(struct bts_tracer *tracer) 858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
749{ 862{
750 if (!tracer) 863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
751 return; 864}
752 865
753 ds_suspend_bts(tracer); 866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
754 878
755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); 879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
756 tracer->ds.context->bts_master = NULL; 880 tracer->ds.context->bts_master = NULL;
757 881
758 put_tracer(tracer->ds.context->task); 882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
759 ds_put_context(tracer->ds.context); 886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
760 888
761 kfree(tracer); 889 kfree(tracer);
762} 890}
763 891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
764void ds_suspend_bts(struct bts_tracer *tracer) 945void ds_suspend_bts(struct bts_tracer *tracer)
765{ 946{
766 struct task_struct *task; 947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
767 950
768 if (!tracer) 951 if (!tracer)
769 return; 952 return;
770 953
954 tracer->flags = 0;
955
771 task = tracer->ds.context->task; 956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
772 958
773 if (!task || (task == current)) 959 WARN_ON(!task && irqs_disabled());
774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
775 960
776 if (task) { 961 debugctlmsr = (task ?
777 task->thread.debugctlmsr &= ~BTS_CONTROL; 962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
778 965
779 if (!task->thread.debugctlmsr) 966 if (task)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); 967 update_task_debugctlmsr(task, debugctlmsr);
781 } 968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
782} 970}
783 971
784void ds_resume_bts(struct bts_tracer *tracer) 972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
785{ 973{
786 struct task_struct *task; 974 struct task_struct *task;
787 unsigned long control; 975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
788 977
789 if (!tracer) 978 if (!tracer)
790 return; 979 return 0;
980
981 tracer->flags = 0;
791 982
792 task = tracer->ds.context->task; 983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
793 1011
794 control = ds_cfg.ctl[dsf_bts]; 1012 control = ds_cfg.ctl[dsf_bts];
795 if (!(tracer->trace.ds.flags & BTS_KERNEL)) 1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
797 if (!(tracer->trace.ds.flags & BTS_USER)) 1015 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user]; 1016 control |= ds_cfg.ctl[dsf_bts_user];
799 1017
800 if (task) { 1018 return control;
801 task->thread.debugctlmsr |= control;
802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
803 }
804
805 if (!task || (task == current))
806 update_debugctlmsr(get_debugctlmsr() | control);
807} 1019}
808 1020
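ds_bts_control() builds the DEBUGCTL bits by starting from the BTS enable bits and OR-ing in ctl[dsf_bts_kernel] or ctl[dsf_bts_user] for each privilege level that was not requested, i.e. those bits switch the corresponding branches off. With the Core 2/Atom constants from the ds_cfg tables later in this patch, a BTS_KERNEL-only tracer ends up requesting 0x4c0; a small recomputation (the BTS_KERNEL/BTS_USER flag values here are placeholders):

#include <stdio.h>

#define EX_BTS_KERNEL	0x1	/* placeholder flag values */
#define EX_BTS_USER	0x2

int main(void)
{
	unsigned long ctl_bts = (1 << 6) | (1 << 7);	/* Core 2/Atom enable bits */
	unsigned long ctl_bts_kernel = 1 << 9;		/* set when kernel branches are NOT wanted */
	unsigned long ctl_bts_user = 1 << 10;		/* set when user branches are NOT wanted */
	unsigned long flags = EX_BTS_KERNEL;		/* trace kernel branches only */
	unsigned long control = ctl_bts;

	if (!(flags & EX_BTS_KERNEL))
		control |= ctl_bts_kernel;
	if (!(flags & EX_BTS_USER))
		control |= ctl_bts_user;

	printf("debugctl bits: 0x%lx\n", control);	/* 0x4c0 */
	return 0;
}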
809void ds_release_pebs(struct pebs_tracer *tracer) 1021void ds_resume_bts(struct bts_tracer *tracer)
810{ 1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
811 if (!tracer) 1027 if (!tracer)
812 return; 1028 return;
813 1029
814 ds_suspend_pebs(tracer); 1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
815 1089
816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); 1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
817 tracer->ds.context->pebs_master = NULL; 1091 tracer->ds.context->pebs_master = NULL;
818 1092
819 put_tracer(tracer->ds.context->task);
820 ds_put_context(tracer->ds.context); 1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
821 1095
822 kfree(tracer); 1096 kfree(tracer);
823} 1097}
824 1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
825void ds_suspend_pebs(struct pebs_tracer *tracer) 1141void ds_suspend_pebs(struct pebs_tracer *tracer)
826{ 1142{
827 1143
828} 1144}
829 1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
830void ds_resume_pebs(struct pebs_tracer *tracer) 1151void ds_resume_pebs(struct pebs_tracer *tracer)
831{ 1152{
832 1153
833} 1154}
834 1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) 1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
836{ 1162{
837 if (!tracer) 1163 if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
847 return NULL; 1173 return NULL;
848 1174
849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); 1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value = 1176
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); 1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
852 1182
853 return &tracer->trace; 1183 return &tracer->trace;
854} 1184}
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
873 1203
874 tracer->trace.ds.top = tracer->trace.ds.begin; 1204 tracer->trace.ds.top = tracer->trace.ds.begin;
875 1205
876 ds_set(tracer->ds.context->ds, ds_bts, ds_index, 1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
877 (unsigned long)tracer->trace.ds.top); 1207 (unsigned long)tracer->trace.ds.top);
878 1208
879 return 0; 1209 return 0;
880} 1210}
881 1211
882int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) 1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
883{ 1214{
884 if (!tracer) 1215 if (!tracer)
885 return -EINVAL; 1216 return -EINVAL;
886 1217
887 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; 1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
888 1224
889 return 0; 1225 return 0;
890} 1226}
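ds_read_pebs() and ds_set_pebs_reset() above address the per-counter reset values right behind the pointer-sized DS management fields, at NUM_DS_PTR_FIELDS * sizeof_ptr_field + counter * PEBS_RESET_FIELD_SIZE. Assuming the usual DS layout of eight pointer fields and 8-byte reset slots (both constants live in the header, so they are assumptions here) and 8-byte DTES64 pointers, counter 0 sits at offset 64 - the single slot the old sizeof_field * 8 code wrote - and a Core i7's four slots run from 64 to 88:

#include <stdio.h>

#define NUM_DS_PTR_FIELDS	8	/* assumed: bts/pebs base, index, max, threshold */
#define PEBS_RESET_FIELD_SIZE	8	/* assumed: one u64 per counter */

int main(void)
{
	unsigned int sizeof_ptr_field = 8;	/* DTES64 */
	unsigned int counter;

	for (counter = 0; counter < 4; counter++)	/* nr_counter_reset on Core i7 */
		printf("counter %u reset value at DS offset %u\n", counter,
		       NUM_DS_PTR_FIELDS * sizeof_ptr_field +
		       counter * PEBS_RESET_FIELD_SIZE);
	return 0;
}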
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
894 .ctl[dsf_bts] = (1 << 2) | (1 << 3), 1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
895 .ctl[dsf_bts_kernel] = (1 << 5), 1231 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6), 1232 .ctl[dsf_bts_user] = (1 << 6),
897 1233 .nr_counter_reset = 1,
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
900#ifdef __i386__
901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
902#else
903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
904#endif
905}; 1234};
906static const struct ds_configuration ds_cfg_pentium_m = { 1235static const struct ds_configuration ds_cfg_pentium_m = {
907 .name = "Pentium M", 1236 .name = "Pentium M",
908 .ctl[dsf_bts] = (1 << 6) | (1 << 7), 1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
909 1238 .nr_counter_reset = 1,
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
912#ifdef __i386__
913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
914#else
915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
916#endif
917}; 1239};
918static const struct ds_configuration ds_cfg_core2_atom = { 1240static const struct ds_configuration ds_cfg_core2_atom = {
919 .name = "Core 2/Atom", 1241 .name = "Core 2/Atom",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7), 1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9), 1243 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10), 1244 .ctl[dsf_bts_user] = (1 << 10),
923 1245 .nr_counter_reset = 1,
924 .sizeof_field = 8, 1246};
925 .sizeof_rec[ds_bts] = 8 * 3, 1247static const struct ds_configuration ds_cfg_core_i7 = {
926 .sizeof_rec[ds_pebs] = 8 * 18, 1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
927}; 1253};
928 1254
929static void 1255static void
930ds_configure(const struct ds_configuration *cfg) 1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
931{ 1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
932 memset(&ds_cfg, 0, sizeof(ds_cfg)); 1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
933 ds_cfg = *cfg; 1296 ds_cfg = *cfg;
934 1297
935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); 1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
936 1303
937 if (!cpu_has_bts) { 1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
938 ds_cfg.ctl[dsf_bts] = 0; 1305 ds_cfg.sizeof_rec[ds_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n"); 1306 printk(KERN_INFO "[ds] bts not available\n");
940 } 1307 }
941 if (!cpu_has_pebs) 1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
942 printk(KERN_INFO "[ds] pebs not available\n"); 1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
943 1317
944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); 1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
945} 1319}
946 1320
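ds_configure() now derives the record sizes at run time: the field width follows DTES64 (8 bytes with it, 4 without) and the PEBS field count follows the PERF_CAPABILITIES record format probed above. A quick recomputation of the sizes the "[ds] sizes:" printk would report, using the format-to-field-count mapping from the switch statement:

#include <stdio.h>

int main(void)
{
	unsigned int fields[] = { 18, 22 };	/* PEBS format 0 and format 1 */
	unsigned int widths[] = { 4, 8 };	/* without / with DTES64 */
	unsigned int w, f;

	for (w = 0; w < 2; w++)
		for (f = 0; f < 2; f++)
			printf("%u-byte fields, format %u: bts %2u bytes, pebs %3u bytes\n",
			       widths[w], f, widths[w] * 3, widths[w] * fields[f]);
	/* e.g. a Core i7 with DTES64 and format 1: 24-byte bts, 176-byte pebs records. */
	return 0;
}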
947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
948{ 1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
949 switch (c->x86) { 1327 switch (c->x86) {
950 case 0x6: 1328 case 0x6:
951 switch (c->x86_model) { 1329 switch (c->x86_model) {
952 case 0x9: 1330 case 0x9:
953 case 0xd: /* Pentium M */ 1331 case 0xd: /* Pentium M */
954 ds_configure(&ds_cfg_pentium_m); 1332 ds_configure(&ds_cfg_pentium_m, c);
955 break; 1333 break;
956 case 0xf: 1334 case 0xf:
957 case 0x17: /* Core2 */ 1335 case 0x17: /* Core2 */
958 case 0x1c: /* Atom */ 1336 case 0x1c: /* Atom */
959 ds_configure(&ds_cfg_core2_atom); 1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
960 break; 1341 break;
961 case 0x1a: /* i7 */
962 default: 1342 default:
963 /* sorry, don't know about them */ 1343 /* Sorry, don't know about them. */
964 break; 1344 break;
965 } 1345 }
966 break; 1346 break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
969 case 0x0: 1349 case 0x0:
970 case 0x1: 1350 case 0x1:
971 case 0x2: /* Netburst */ 1351 case 0x2: /* Netburst */
972 ds_configure(&ds_cfg_netburst); 1352 ds_configure(&ds_cfg_netburst, c);
973 break; 1353 break;
974 default: 1354 default:
975 /* sorry, don't know about them */ 1355 /* Sorry, don't know about them. */
976 break; 1356 break;
977 } 1357 }
978 break; 1358 break;
979 default: 1359 default:
980 /* sorry, don't know about them */ 1360 /* Sorry, don't know about them. */
981 break; 1361 break;
982 } 1362 }
983} 1363}
984 1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
985/* 1386/*
986 * Change the DS configuration from tracing prev to tracing next. 1387 * Change the DS configuration from tracing prev to tracing next.
987 */ 1388 */
988void ds_switch_to(struct task_struct *prev, struct task_struct *next) 1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
989{ 1390{
990 struct ds_context *prev_ctx = prev->thread.ds_ctx; 1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
991 struct ds_context *next_ctx = next->thread.ds_ctx; 1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
992 1397
993 if (prev_ctx) { 1398 if (prev_ctx) {
994 update_debugctlmsr(0); 1399 update_debugctlmsr(0);
995 1400
996 if (prev_ctx->bts_master && 1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
997 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
998 struct bts_struct ts = {
999 .qualifier = bts_task_departs,
1000 .variant.timestamp.jiffies = jiffies_64,
1001 .variant.timestamp.pid = prev->pid
1002 };
1003 bts_write(prev_ctx->bts_master, &ts);
1004 }
1005 } 1402 }
1006 1403
1007 if (next_ctx) { 1404 if (next_ctx) {
1008 if (next_ctx->bts_master && 1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1009 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1010 struct bts_struct ts = {
1011 .qualifier = bts_task_arrives,
1012 .variant.timestamp.jiffies = jiffies_64,
1013 .variant.timestamp.pid = next->pid
1014 };
1015 bts_write(next_ctx->bts_master, &ts);
1016 }
1017 1406
1018 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); 1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1019 } 1408 }
1020 1409
1021 update_debugctlmsr(next->thread.debugctlmsr); 1410 update_debugctlmsr(debugctlmsr);
1022} 1411}
1023 1412
1024void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) 1413static __init int ds_selftest(void)
1025{ 1414{
1026 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); 1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1027 tsk->thread.ds_ctx = NULL; 1416 int error;
1028}
1029 1417
1030void ds_exit_thread(struct task_struct *tsk) 1418 error = ds_selftest_bts();
1031{ 1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1032} 1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 000000000000..6bc7c199ab99
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,408 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
19#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
20#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
21
22struct ds_selftest_bts_conf {
23 struct bts_tracer *tracer;
24 int error;
25 int (*suspend)(struct bts_tracer *);
26 int (*resume)(struct bts_tracer *);
27};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
 59	 * We allow top in [begin; end], since it's not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151 /* We should meanwhile have enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167 * buffer overflow and end up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216 * buffer overflow and end up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
249static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
250{
251 ds_suspend_bts(tracer);
252 return 0;
253}
254
255static int ds_resume_bts_wrap(struct bts_tracer *tracer)
256{
257 ds_resume_bts(tracer);
258 return 0;
259}
260
261static void ds_release_bts_noirq_wrap(void *tracer)
262{
263 (void)ds_release_bts_noirq(tracer);
264}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
303static int ds_selftest_bts_bad_request_task(void *buffer)
304{
305 struct bts_tracer *tracer;
306 int error;
307
308	 /* Try to request task tracing while cpu tracing is active. */
309 tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
310 (size_t)-1, BTS_KERNEL);
311 error = PTR_ERR(tracer);
312 if (!IS_ERR(tracer)) {
313 error = 0;
314 ds_release_bts(tracer);
315 }
316
317 if (error != -EPERM)
318 printk(KERN_CONT "task/cpu tracing overlap...");
319
320 return error ? 0 : -1;
321}
322
323int ds_selftest_bts(void)
324{
325 struct ds_selftest_bts_conf conf;
326 unsigned char buffer[BUFFER_SIZE], *small_buffer;
327 unsigned long irq;
328 int cpu;
329
330 printk(KERN_INFO "[ds] bts selftest...");
331 conf.error = 0;
332
333 small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
334
335 get_online_cpus();
336 for_each_online_cpu(cpu) {
337 conf.suspend = ds_suspend_bts_wrap;
338 conf.resume = ds_resume_bts_wrap;
339 conf.tracer =
340 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
341 NULL, (size_t)-1, BTS_KERNEL);
342 ds_selftest_bts_cpu(&conf);
343 if (conf.error >= 0)
344 conf.error = ds_selftest_bts_bad_request_task(buffer);
345 ds_release_bts(conf.tracer);
346 if (conf.error < 0)
347 goto out;
348
349 conf.suspend = ds_suspend_bts_noirq;
350 conf.resume = ds_resume_bts_noirq;
351 conf.tracer =
352 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
353 NULL, (size_t)-1, BTS_KERNEL);
354 smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
355 if (conf.error >= 0) {
356 conf.error =
357 ds_selftest_bts_bad_release_noirq(cpu,
358 conf.tracer);
359 /* We must not release the tracer twice. */
360 if (conf.error < 0)
361 conf.tracer = NULL;
362 }
363 if (conf.error >= 0)
364 conf.error = ds_selftest_bts_bad_request_task(buffer);
365 smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
366 conf.tracer, 1);
367 if (conf.error < 0)
368 goto out;
369 }
370
371 conf.suspend = ds_suspend_bts_wrap;
372 conf.resume = ds_resume_bts_wrap;
373 conf.tracer =
374 ds_request_bts_task(current, buffer, BUFFER_SIZE,
375 NULL, (size_t)-1, BTS_KERNEL);
376 ds_selftest_bts_cpu(&conf);
377 if (conf.error >= 0)
378 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
379 ds_release_bts(conf.tracer);
380 if (conf.error < 0)
381 goto out;
382
383 conf.suspend = ds_suspend_bts_noirq;
384 conf.resume = ds_resume_bts_noirq;
385 conf.tracer =
386 ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
387 NULL, (size_t)-1, BTS_KERNEL);
388 local_irq_save(irq);
389 ds_selftest_bts_cpu(&conf);
390 if (conf.error >= 0)
391 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
392 ds_release_bts_noirq(conf.tracer);
393 local_irq_restore(irq);
394 if (conf.error < 0)
395 goto out;
396
397 conf.error = 0;
398 out:
399 put_online_cpus();
400 printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
401
402 return conf.error;
403}
404
405int ds_selftest_pebs(void)
406{
407 return 0;
408}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 000000000000..2ba8745c6663
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#ifdef CONFIG_X86_DS_SELFTEST
10extern int ds_selftest_bts(void);
11extern int ds_selftest_pebs(void);
12#else
13static inline int ds_selftest_bts(void) { return 0; }
14static inline int ds_selftest_pebs(void) { return 0; }
15#endif
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b8698..81086c227ab7 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl); 29 unsigned long *sp, unsigned long bp, char *log_lvl);
30 30
31extern unsigned int code_bytes; 31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33 32
34/* The form of the top of the frame on the stack */ 33/* The form of the top of the frame on the stack */
35struct stack_frame { 34struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 006281302925..7271fa33d791 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
617 */ 617 */
618__init void e820_setup_gap(void) 618__init void e820_setup_gap(void)
619{ 619{
620 unsigned long gapstart, gapsize, round; 620 unsigned long gapstart, gapsize;
621 int found; 621 int found;
622 622
623 gapstart = 0x10000000; 623 gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
635#endif 635#endif
636 636
637 /* 637 /*
638	 * See how much we want to round up: start off with		 638	 * e820_reserve_resources_late protects stolen RAM already
639 * rounding to the next 1MB area.
640 */ 639 */
641 round = 0x100000; 640 pci_mem_start = gapstart;
642 while ((gapsize >> 4) > round)
643 round += round;
644 /* Fun with two's complement */
645 pci_mem_start = (gapstart + round) & -round;
646 641
647 printk(KERN_INFO 642 printk(KERN_INFO
648 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 643 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
1371 } 1366 }
1372} 1367}
1373 1368
1369/* How much should we pad RAM ending depending on where it is? */
1370static unsigned long ram_alignment(resource_size_t pos)
1371{
1372 unsigned long mb = pos >> 20;
1373
1374 /* To 64kB in the first megabyte */
1375 if (!mb)
1376 return 64*1024;
1377
1378 /* To 1MB in the first 16MB */
1379 if (mb < 16)
1380 return 1024*1024;
1381
1382 /* To 32MB for anything above that */
1383 return 32*1024*1024;
1384}
1385
1374void __init e820_reserve_resources_late(void) 1386void __init e820_reserve_resources_late(void)
1375{ 1387{
1376 int i; 1388 int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
1382 insert_resource_expand_to_fit(&iomem_resource, res); 1394 insert_resource_expand_to_fit(&iomem_resource, res);
1383 res++; 1395 res++;
1384 } 1396 }
1397
1398 /*
1399 * Try to bump up RAM regions to reasonable boundaries to
1400 * avoid stolen RAM:
1401 */
1402 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i];
1404 resource_size_t start, end;
1405
1406 if (entry->type != E820_RAM)
1407 continue;
1408 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start));
1410 if (start == end)
1411 continue;
1412 reserve_region_with_split(&iomem_resource, start,
1413 end - 1, "RAM buffer");
1414 }
1385} 1415}
1386 1416
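The new e820 code above pads the end of every RAM region out to a position-dependent boundary (64 KiB below 1 MiB, 1 MiB below 16 MiB, 32 MiB above that) and reserves the padding as a "RAM buffer" resource, so that firmware-stolen memory just past the reported end is not handed out to PCI BARs. A small recomputation of that rounding with invented region ends:

#include <stdio.h>

typedef unsigned long long u64;

static u64 ram_alignment(u64 pos)			/* mirrors e820.c above */
{
	u64 mb = pos >> 20;

	if (!mb)
		return 64 * 1024;			/* below 1 MiB  */
	if (mb < 16)
		return 1024 * 1024;			/* below 16 MiB */
	return 32 * 1024 * 1024;			/* everything else */
}

static u64 round_up_pow2(u64 x, u64 align)		/* align is a power of two */
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	u64 ends[] = { 0x9fc00ULL, 0xffe000ULL, 0x3ffe0000ULL };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		u64 start = ends[i];
		u64 end = round_up_pow2(start, ram_alignment(start));

		if (start != end)
			printf("RAM buffer: %#llx - %#llx\n", start, end - 1);
	}
	return 0;
}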
1387char *__init default_machine_specific_memory_setup(void) 1417char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953dee..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 101static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
101{ 102{
102 u32 d; 103 u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
114 d &= 0xff; 115 d &= 0xff;
115 return d; 116 return d;
116} 117}
118#endif
117 119
118static void __init ati_bugs(int num, int slot, int func) 120static void __init ati_bugs(int num, int slot, int func)
119{ 121{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4234b1235652..de74f0a3e0ed 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
147GLOBAL(return_to_handler) 147GLOBAL(return_to_handler)
148 subq $80, %rsp 148 subq $80, %rsp
149 149
150 /* Save the return values */
150 movq %rax, (%rsp) 151 movq %rax, (%rsp)
151 movq %rcx, 8(%rsp) 152 movq %rdx, 8(%rsp)
152 movq %rdx, 16(%rsp)
153 movq %rsi, 24(%rsp)
154 movq %rdi, 32(%rsp)
155 movq %r8, 40(%rsp)
156 movq %r9, 48(%rsp)
157 movq %r10, 56(%rsp)
158 movq %r11, 64(%rsp)
159 153
160 call ftrace_return_to_handler 154 call ftrace_return_to_handler
161 155
162 movq %rax, 72(%rsp) 156 movq %rax, 72(%rsp)
163 movq 64(%rsp), %r11 157 movq 8(%rsp), %rdx
164 movq 56(%rsp), %r10
165 movq 48(%rsp), %r9
166 movq 40(%rsp), %r8
167 movq 32(%rsp), %rdi
168 movq 24(%rsp), %rsi
169 movq 16(%rsp), %rdx
170 movq 8(%rsp), %rcx
171 movq (%rsp), %rax 158 movq (%rsp), %rax
172 addq $72, %rsp 159 addq $72, %rsp
173 retq 160 retq
@@ -1032,6 +1019,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1032apicinterrupt SPURIOUS_APIC_VECTOR \ 1019apicinterrupt SPURIOUS_APIC_VECTOR \
1033 spurious_interrupt smp_spurious_interrupt 1020 spurious_interrupt smp_spurious_interrupt
1034 1021
1022#ifdef CONFIG_PERF_COUNTERS
1023apicinterrupt LOCAL_PENDING_VECTOR \
1024 perf_pending_interrupt smp_perf_pending_interrupt
1025#endif
1026
1035/* 1027/*
1036 * Exception entry points. 1028 * Exception entry points.
1037 */ 1029 */
@@ -1386,6 +1378,11 @@ END(xen_failsafe_callback)
1386paranoidzeroentry_ist debug do_debug DEBUG_STACK 1378paranoidzeroentry_ist debug do_debug DEBUG_STACK
1387paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1379paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1388paranoiderrorentry stack_segment do_stack_segment 1380paranoiderrorentry stack_segment do_stack_segment
1381#ifdef CONFIG_XEN
1382zeroentry xen_debug do_debug
1383zeroentry xen_int3 do_int3
1384errorentry xen_stack_segment do_stack_segment
1385#endif
1389errorentry general_protection do_general_protection 1386errorentry general_protection do_general_protection
1390errorentry page_fault do_page_fault 1387errorentry page_fault do_page_fault
1391#ifdef CONFIG_X86_MCE 1388#ifdef CONFIG_X86_MCE
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0cd..dc5ed4bdd88d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
608ENTRY(initial_code) 608ENTRY(initial_code)
609 .long i386_start_kernel 609 .long i386_start_kernel
610 610
611.section .text
612/*
613 * Real beginning of normal "text" segment
614 */
615ENTRY(stext)
616ENTRY(_stext)
617
618/* 611/*
619 * BSS section 612 * BSS section
620 */ 613 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9773395aa758..b0cdde6932f5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -13,6 +13,7 @@
 #include <asm/irq.h>
 #include <asm/idle.h>
 #include <asm/mce.h>
+#include <asm/hw_irq.h>
 
 atomic_t irq_err_count;
 
@@ -62,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
 	seq_printf(p, " Spurious interrupts\n");
+	seq_printf(p, "%*s: ", prec, "CNT");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+	seq_printf(p, " Performance counter interrupts\n");
+	seq_printf(p, "%*s: ", prec, "PND");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+	seq_printf(p, " Performance pending work\n");
 #endif
 	if (generic_interrupt_extension) {
 		seq_printf(p, "%*s: ", prec, "PLT");
@@ -175,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
+	sum += irq_stats(cpu)->apic_perf_irqs;
+	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
 	if (generic_interrupt_extension)
 		sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 4a69ec55be3d..696f0e475c2d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -207,7 +207,6 @@ static void __init apic_intr_init(void)
 
 	/* Performance monitoring interrupts: */
 # ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b1f4dffb919e..8d82a77a3f3b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 	gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
 	gdb_regs32[GDB_CS] = __KERNEL_CS;
 	gdb_regs32[GDB_SS] = __KERNEL_DS;
-	gdb_regs[GDB_PC] = p->thread.ip;
+	gdb_regs[GDB_PC] = 0;
 	gdb_regs[GDB_R8] = 0;
 	gdb_regs[GDB_R9] = 0;
 	gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b4..a78ecad0c900 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
+#include <asm/timer.h>
 
 #define MMU_QUEUE_SIZE 1024
 
@@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)
 	struct kvm_para_state *state = kvm_para_state();
 
 	mmu_queue_flush(state);
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
+	paravirt_leave_lazy_mmu();
 	state->mode = paravirt_get_lazy_mode();
 }
 
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
 		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
 		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
 	}
+#ifdef CONFIG_X86_IO_APIC
+	no_timer_check = 1;
+#endif
 }
 
 void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c6..366baa179913 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
  * Licensed under the terms of the GNU General Public
  * License version 2. See file COPYING for details.
  */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
 #include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
 #include <linux/pci_ids.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
 #include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
 #define UCODE_CONTAINER_SECTION_HDR 8
 #define UCODE_CONTAINER_HEADER_SIZE 12
 
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
 static struct equiv_cpu_entry *equiv_cpu_table;
 
 static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
 	return 1;
 }
 
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
 {
-	unsigned long flags;
 	u32 rev, dummy;
 	int cpu_num = raw_smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
 	BUG_ON(cpu_num != cpu);
 
 	if (mc_amd == NULL)
-		return;
+		return 0;
 
-	spin_lock_irqsave(&microcode_update_lock, flags);
 	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
 	/* get patch id after patching */
 	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
 
 	/* check current patch id and patch's id for match */
 	if (rev != mc_amd->hdr.patch_id) {
 		printk(KERN_ERR "microcode: CPU%d: update failed "
 		       "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
-		return;
+		return -1;
 	}
 
 	printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
 	       cpu, rev);
 
 	uci->cpu_sig.rev = rev;
+
+	return 0;
 }
 
 static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
 
 static void free_equiv_cpu_table(void)
 {
-	if (equiv_cpu_table) {
-		vfree(equiv_cpu_table);
-		equiv_cpu_table = NULL;
-	}
+	vfree(equiv_cpu_table);
+	equiv_cpu_table = NULL;
 }
 
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover;
 	unsigned long offset;
+	enum ucode_state state = UCODE_OK;
 
 	offset = install_equiv_cpu_table(ucode_ptr);
 	if (!offset) {
 		printk(KERN_ERR "microcode: failed to create "
 		       "equivalent cpu table\n");
-		return -EINVAL;
+		return UCODE_ERROR;
 	}
 
 	ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 		mc_header = (struct microcode_header_amd *)mc;
 		if (get_matching_microcode(cpu, mc, new_rev)) {
-			if (new_mc)
-				vfree(new_mc);
+			vfree(new_mc);
 			new_rev = mc_header->patch_id;
 			new_mc = mc;
 		} else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 	if (new_mc) {
 		if (!leftover) {
-			if (uci->mc)
-				vfree(uci->mc);
+			vfree(uci->mc);
 			uci->mc = new_mc;
 			pr_debug("microcode: CPU%d found a matching microcode "
 				 "update with version 0x%x (current=0x%x)\n",
 				 cpu, new_rev, uci->cpu_sig.rev);
-		} else
+		} else {
 			vfree(new_mc);
-	}
+			state = UCODE_ERROR;
+		}
+	} else
+		state = UCODE_NFOUND;
 
 	free_equiv_cpu_table();
 
-	return (int)leftover;
+	return state;
 }
 
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
 	const char *fw_name = "amd-ucode/microcode_amd.bin";
 	const struct firmware *firmware;
-	int ret;
-
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
+	enum ucode_state ret;
 
-	ret = request_firmware(&firmware, fw_name, device);
-	if (ret) {
+	if (request_firmware(&firmware, fw_name, device)) {
 		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
-		return ret;
+		return UCODE_NFOUND;
 	}
 
 	ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
 	return ret;
 }
 
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
 	printk(KERN_INFO "microcode: AMD microcode update via "
 	       "/dev/cpu/microcode not supported\n");
-	return -1;
+	return UCODE_ERROR;
 }
 
 static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d1..9c4461501fcb 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
76#include <linux/firmware.h> 75#include <linux/capability.h>
77#include <linux/smp_lock.h> 76#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 77#include <linux/kernel.h>
83#include <linux/module.h> 78#include <linux/module.h>
84#include <linux/mutex.h> 79#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h> 80#include <linux/cpu.h>
89#include <linux/fs.h> 81#include <linux/fs.h>
90#include <linux/mm.h> 82#include <linux/mm.h>
91 83
92#include <asm/microcode.h> 84#include <asm/microcode.h>
93#include <asm/processor.h> 85#include <asm/processor.h>
94#include <asm/msr.h>
95 86
96MODULE_DESCRIPTION("Microcode Update Driver"); 87MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 88MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
101 92
102static struct microcode_ops *microcode_ops; 93static struct microcode_ops *microcode_ops;
103 94
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 95/*
96 * Synchronization.
97 *
98 * All non cpu-hotplug-callback call sites use:
99 *
100 * - microcode_mutex to synchronize with each other;
101 * - get/put_online_cpus() to synchronize with
102 * the cpu-hotplug-callback call sites.
103 *
104 * We guarantee that only a single cpu is being
105 * updated at any particular moment of time.
106 */
105static DEFINE_MUTEX(microcode_mutex); 107static DEFINE_MUTEX(microcode_mutex);
106 108
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 109struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 110EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 111
112/*
113 * Operations that are run on a target cpu:
114 */
115
116struct cpu_info_ctx {
117 struct cpu_signature *cpu_sig;
118 int err;
119};
120
121static void collect_cpu_info_local(void *arg)
122{
123 struct cpu_info_ctx *ctx = arg;
124
125 ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
126 ctx->cpu_sig);
127}
128
129static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
130{
131 struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
132 int ret;
133
134 ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
135 if (!ret)
136 ret = ctx.err;
137
138 return ret;
139}
140
141static int collect_cpu_info(int cpu)
142{
143 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
144 int ret;
145
146 memset(uci, 0, sizeof(*uci));
147
148 ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
149 if (!ret)
150 uci->valid = 1;
151
152 return ret;
153}
154
155struct apply_microcode_ctx {
156 int err;
157};
158
159static void apply_microcode_local(void *arg)
160{
161 struct apply_microcode_ctx *ctx = arg;
162
163 ctx->err = microcode_ops->apply_microcode(smp_processor_id());
164}
165
166static int apply_microcode_on_target(int cpu)
167{
168 struct apply_microcode_ctx ctx = { .err = 0 };
169 int ret;
170
171 ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
172 if (!ret)
173 ret = ctx.err;
174
175 return ret;
176}
177
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 178#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size) 179static int do_microcode_update(const void __user *buf, size_t size)
112{ 180{
113 cpumask_t old;
114 int error = 0; 181 int error = 0;
115 int cpu; 182 int cpu;
116 183
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) { 184 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 185 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
186 enum ucode_state ustate;
121 187
122 if (!uci->valid) 188 if (!uci->valid)
123 continue; 189 continue;
124 190
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 191 ustate = microcode_ops->request_microcode_user(cpu, buf, size);
126 error = microcode_ops->request_microcode_user(cpu, buf, size); 192 if (ustate == UCODE_ERROR) {
127 if (error < 0) 193 error = -1;
128 goto out; 194 break;
129 if (!error) 195 } else if (ustate == UCODE_OK)
130 microcode_ops->apply_microcode(cpu); 196 apply_microcode_on_target(cpu);
131 } 197 }
132out: 198
133 set_cpus_allowed_ptr(current, &old);
134 return error; 199 return error;
135} 200}
136 201
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
143static ssize_t microcode_write(struct file *file, const char __user *buf, 208static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos) 209 size_t len, loff_t *ppos)
145{ 210{
146 ssize_t ret; 211 ssize_t ret = -EINVAL;
147 212
148 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", 214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
150 num_physpages); 215 return ret;
151 return -EINVAL;
152 } 216 }
153 217
154 get_online_cpus(); 218 get_online_cpus();
155 mutex_lock(&microcode_mutex); 219 mutex_lock(&microcode_mutex);
156 220
157 ret = do_microcode_update(buf, len); 221 if (do_microcode_update(buf, len) == 0)
158 if (!ret)
159 ret = (ssize_t)len; 222 ret = (ssize_t)len;
160 223
161 mutex_unlock(&microcode_mutex); 224 mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
165} 228}
166 229
167static const struct file_operations microcode_fops = { 230static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
169 .write = microcode_write, 232 .write = microcode_write,
170 .open = microcode_open, 233 .open = microcode_open,
171}; 234};
172 235
173static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
175 .name = "microcode", 238 .name = "microcode",
176 .fops = &microcode_fops, 239 .fops = &microcode_fops,
177}; 240};
178 241
179static int __init microcode_dev_init(void) 242static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
182 245
183 error = misc_register(&microcode_dev); 246 error = misc_register(&microcode_dev);
184 if (error) { 247 if (error) {
185 printk(KERN_ERR 248 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error; 249 return error;
189 } 250 }
190 251
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
205/* fake device for request_firmware */ 266/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 267static struct platform_device *microcode_pdev;
207 268
208static long reload_for_cpu(void *unused) 269static int reload_for_cpu(int cpu)
209{ 270{
210 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 271 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
211 int err = 0; 272 int err = 0;
212 273
213 mutex_lock(&microcode_mutex); 274 mutex_lock(&microcode_mutex);
214 if (uci->valid) { 275 if (uci->valid) {
215 err = microcode_ops->request_microcode_fw(smp_processor_id(), 276 enum ucode_state ustate;
216 &microcode_pdev->dev); 277
217 if (!err) 278 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
218 microcode_ops->apply_microcode(smp_processor_id()); 279 if (ustate == UCODE_OK)
280 apply_microcode_on_target(cpu);
281 else
282 if (ustate == UCODE_ERROR)
283 err = -EINVAL;
219 } 284 }
220 mutex_unlock(&microcode_mutex); 285 mutex_unlock(&microcode_mutex);
286
221 return err; 287 return err;
222} 288}
223 289
224static ssize_t reload_store(struct sys_device *dev, 290static ssize_t reload_store(struct sys_device *dev,
225 struct sysdev_attribute *attr, 291 struct sysdev_attribute *attr,
226 const char *buf, size_t sz) 292 const char *buf, size_t size)
227{ 293{
228 char *end; 294 unsigned long val;
229 unsigned long val = simple_strtoul(buf, &end, 0);
230 int err = 0;
231 int cpu = dev->id; 295 int cpu = dev->id;
296 int ret = 0;
297 char *end;
232 298
299 val = simple_strtoul(buf, &end, 0);
233 if (end == buf) 300 if (end == buf)
234 return -EINVAL; 301 return -EINVAL;
302
235 if (val == 1) { 303 if (val == 1) {
236 get_online_cpus(); 304 get_online_cpus();
237 if (cpu_online(cpu)) 305 if (cpu_online(cpu))
238 err = work_on_cpu(cpu, reload_for_cpu, NULL); 306 ret = reload_for_cpu(cpu);
239 put_online_cpus(); 307 put_online_cpus();
240 } 308 }
241 if (err) 309
242 return err; 310 if (!ret)
243 return sz; 311 ret = size;
312
313 return ret;
244} 314}
245 315
246static ssize_t version_show(struct sys_device *dev, 316static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
271}; 341};
272 342
273static struct attribute_group mc_attr_group = { 343static struct attribute_group mc_attr_group = {
274 .attrs = mc_default_attrs, 344 .attrs = mc_default_attrs,
275 .name = "microcode", 345 .name = "microcode",
276}; 346};
277 347
278static void __microcode_fini_cpu(int cpu) 348static void microcode_fini_cpu(int cpu)
279{ 349{
280 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
281 351
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
283 uci->valid = 0; 353 uci->valid = 0;
284} 354}
285 355
286static void microcode_fini_cpu(int cpu) 356static enum ucode_state microcode_resume_cpu(int cpu)
287{
288 mutex_lock(&microcode_mutex);
289 __microcode_fini_cpu(cpu);
290 mutex_unlock(&microcode_mutex);
291}
292
293static void collect_cpu_info(int cpu)
294{ 357{
295 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 358 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
296 359
297 memset(uci, 0, sizeof(*uci)); 360 if (!uci->mc)
298 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) 361 return UCODE_NFOUND;
299 uci->valid = 1; 362
363 pr_debug("microcode: CPU%d updated upon resume\n", cpu);
364 apply_microcode_on_target(cpu);
365
366 return UCODE_OK;
300} 367}
301 368
302static int microcode_resume_cpu(int cpu) 369static enum ucode_state microcode_init_cpu(int cpu)
303{ 370{
304 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 371 enum ucode_state ustate;
305 struct cpu_signature nsig;
306 372
307 pr_debug("microcode: CPU%d resumed\n", cpu); 373 if (collect_cpu_info(cpu))
374 return UCODE_ERROR;
308 375
309 if (!uci->mc) 376 /* --dimm. Trigger a delayed update? */
310 return 1; 377 if (system_state != SYSTEM_RUNNING)
378 return UCODE_NFOUND;
311 379
312 /* 380 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
313 * Let's verify that the 'cached' ucode does belong
314 * to this cpu (a bit of paranoia):
315 */
316 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
317 __microcode_fini_cpu(cpu);
318 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
319 cpu);
320 return -1;
321 }
322 381
323 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { 382 if (ustate == UCODE_OK) {
324 __microcode_fini_cpu(cpu); 383 pr_debug("microcode: CPU%d updated upon init\n", cpu);
325 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", 384 apply_microcode_on_target(cpu);
326 cpu);
327 /* Should we look for a new ucode here? */
328 return 1;
329 } 385 }
330 386
331 return 0; 387 return ustate;
332} 388}
333 389
334static long microcode_update_cpu(void *unused) 390static enum ucode_state microcode_update_cpu(int cpu)
335{ 391{
336 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 392 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
337 int err = 0; 393 enum ucode_state ustate;
338 394
339 /* 395 if (uci->valid)
340 * Check if the system resume is in progress (uci->valid != NULL), 396 ustate = microcode_resume_cpu(cpu);
341 * otherwise just request a firmware: 397 else
342 */ 398 ustate = microcode_init_cpu(cpu);
343 if (uci->valid) {
344 err = microcode_resume_cpu(smp_processor_id());
345 } else {
346 collect_cpu_info(smp_processor_id());
347 if (uci->valid && system_state == SYSTEM_RUNNING)
348 err = microcode_ops->request_microcode_fw(
349 smp_processor_id(),
350 &microcode_pdev->dev);
351 }
352 if (!err)
353 microcode_ops->apply_microcode(smp_processor_id());
354 return err;
355}
356 399
357static int microcode_init_cpu(int cpu) 400 return ustate;
358{
359 int err;
360 mutex_lock(&microcode_mutex);
361 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex);
363
364 return err;
365} 401}
366 402
367static int mc_sysdev_add(struct sys_device *sys_dev) 403static int mc_sysdev_add(struct sys_device *sys_dev)
368{ 404{
369 int err, cpu = sys_dev->id; 405 int err, cpu = sys_dev->id;
370 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
371 406
372 if (!cpu_online(cpu)) 407 if (!cpu_online(cpu))
373 return 0; 408 return 0;
374 409
375 pr_debug("microcode: CPU%d added\n", cpu); 410 pr_debug("microcode: CPU%d added\n", cpu);
376 memset(uci, 0, sizeof(*uci));
377 411
378 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 412 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
379 if (err) 413 if (err)
380 return err; 414 return err;
381 415
382 err = microcode_init_cpu(cpu); 416 if (microcode_init_cpu(cpu) == UCODE_ERROR)
417 err = -EINVAL;
383 418
384 return err; 419 return err;
385} 420}
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
400static int mc_sysdev_resume(struct sys_device *dev) 435static int mc_sysdev_resume(struct sys_device *dev)
401{ 436{
402 int cpu = dev->id; 437 int cpu = dev->id;
438 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
403 439
404 if (!cpu_online(cpu)) 440 if (!cpu_online(cpu))
405 return 0; 441 return 0;
406 442
407 /* only CPU 0 will apply ucode here */ 443 /*
408 microcode_update_cpu(NULL); 444 * All non-bootup cpus are still disabled,
445 * so only CPU 0 will apply ucode here.
446 *
447 * Moreover, there can be no concurrent
448 * updates from any other places at this point.
449 */
450 WARN_ON(cpu != 0);
451
452 if (uci->valid && uci->mc)
453 microcode_ops->apply_microcode(cpu);
454
409 return 0; 455 return 0;
410} 456}
411 457
412static struct sysdev_driver mc_sysdev_driver = { 458static struct sysdev_driver mc_sysdev_driver = {
413 .add = mc_sysdev_add, 459 .add = mc_sysdev_add,
414 .remove = mc_sysdev_remove, 460 .remove = mc_sysdev_remove,
415 .resume = mc_sysdev_resume, 461 .resume = mc_sysdev_resume,
416}; 462};
417 463
418static __cpuinit int 464static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
425 switch (action) { 471 switch (action) {
426 case CPU_ONLINE: 472 case CPU_ONLINE:
427 case CPU_ONLINE_FROZEN: 473 case CPU_ONLINE_FROZEN:
428 if (microcode_init_cpu(cpu)) 474 microcode_update_cpu(cpu);
429 printk(KERN_ERR "microcode: failed to init CPU%d\n",
430 cpu);
431 case CPU_DOWN_FAILED: 475 case CPU_DOWN_FAILED:
432 case CPU_DOWN_FAILED_FROZEN: 476 case CPU_DOWN_FAILED_FROZEN:
433 pr_debug("microcode: CPU%d added\n", cpu); 477 pr_debug("microcode: CPU%d added\n", cpu);
434 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 478 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
435 printk(KERN_ERR "microcode: Failed to create the sysfs " 479 pr_err("microcode: Failed to create group for CPU%d\n", cpu);
436 "group for CPU%d\n", cpu);
437 break; 480 break;
438 case CPU_DOWN_PREPARE: 481 case CPU_DOWN_PREPARE:
439 case CPU_DOWN_PREPARE_FROZEN: 482 case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
465 microcode_ops = init_amd_microcode(); 508 microcode_ops = init_amd_microcode();
466 509
467 if (!microcode_ops) { 510 if (!microcode_ops) {
468 printk(KERN_ERR "microcode: no support for this CPU vendor\n"); 511 pr_err("microcode: no support for this CPU vendor\n");
469 return -ENODEV; 512 return -ENODEV;
470 } 513 }
471 514
472 error = microcode_dev_init();
473 if (error)
474 return error;
475 microcode_pdev = platform_device_register_simple("microcode", -1, 515 microcode_pdev = platform_device_register_simple("microcode", -1,
476 NULL, 0); 516 NULL, 0);
477 if (IS_ERR(microcode_pdev)) { 517 if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
480 } 520 }
481 521
482 get_online_cpus(); 522 get_online_cpus();
523 mutex_lock(&microcode_mutex);
524
483 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 525 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
526
527 mutex_unlock(&microcode_mutex);
484 put_online_cpus(); 528 put_online_cpus();
529
485 if (error) { 530 if (error) {
486 microcode_dev_exit();
487 platform_device_unregister(microcode_pdev); 531 platform_device_unregister(microcode_pdev);
488 return error; 532 return error;
489 } 533 }
490 534
535 error = microcode_dev_init();
536 if (error)
537 return error;
538
491 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
492 540
493 printk(KERN_INFO 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
494 "Microcode Update Driver: v" MICROCODE_VERSION
495 " <tigran@aivazian.fsnet.co.uk>," 542 " <tigran@aivazian.fsnet.co.uk>,"
496 " Peter Oruba\n"); 543 " Peter Oruba\n");
497 544
498 return 0; 545 return 0;
499} 546}
547module_init(microcode_init);
500 548
501static void __exit microcode_exit(void) 549static void __exit microcode_exit(void)
502{ 550{
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
505 unregister_hotcpu_notifier(&mc_cpu_notifier); 553 unregister_hotcpu_notifier(&mc_cpu_notifier);
506 554
507 get_online_cpus(); 555 get_online_cpus();
556 mutex_lock(&microcode_mutex);
557
508 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 558 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
559
560 mutex_unlock(&microcode_mutex);
509 put_online_cpus(); 561 put_online_cpus();
510 562
511 platform_device_unregister(microcode_pdev); 563 platform_device_unregister(microcode_pdev);
512 564
513 microcode_ops = NULL; 565 microcode_ops = NULL;
514 566
515 printk(KERN_INFO 567 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
516 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
517} 568}
518
519module_init(microcode_init);
520module_exit(microcode_exit); 569module_exit(microcode_exit);
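The core rework above drops the old "migrate the caller onto the target CPU" approach (set_cpus_allowed_ptr()/work_on_cpu()) in favour of small helpers that run the vendor callback on the target CPU via an IPI. A condensed sketch of that pattern, using only the standard smp_call_function_single() API; the ucode_* names here are illustrative, not part of the patch:

/* Sketch only: mirrors the *_on_target() helpers introduced above. */
struct ucode_ctx {
	int err;		/* result reported back from the target CPU */
};

static void ucode_apply_local(void *arg)
{
	struct ucode_ctx *ctx = arg;

	/* Runs on the target CPU, so smp_processor_id() is that CPU. */
	ctx->err = microcode_ops->apply_microcode(smp_processor_id());
}

static int ucode_apply_on(int cpu)
{
	struct ucode_ctx ctx = { .err = 0 };
	int ret;

	/* wait == 1: the IPI handler finishes before we read ctx.err */
	ret = smp_call_function_single(cpu, ucode_apply_local, &ctx, 1);

	return ret ? ret : ctx.err;
}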
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1ab..0d334ddd0a96 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h>
76#include <linux/firmware.h> 73#include <linux/firmware.h>
77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h> 74#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 75#include <linux/kernel.h>
83#include <linux/module.h> 76#include <linux/module.h>
84#include <linux/mutex.h> 77#include <linux/vmalloc.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h>
89#include <linux/fs.h>
90#include <linux/mm.h>
91 78
92#include <asm/microcode.h> 79#include <asm/microcode.h>
93#include <asm/processor.h> 80#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
150 137
151#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 138#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
152 139
153/* serialize access to the physical write to MSR 0x79 */
154static DEFINE_SPINLOCK(microcode_update_lock);
155
156static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 140static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
157{ 141{
158 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 142 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
159 unsigned long flags;
160 unsigned int val[2]; 143 unsigned int val[2];
161 144
162 memset(csig, 0, sizeof(*csig)); 145 memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
176 csig->pf = 1 << ((val[1] >> 18) & 7); 159 csig->pf = 1 << ((val[1] >> 18) & 7);
177 } 160 }
178 161
179 /* serialize access to the physical write to MSR 0x79 */
180 spin_lock_irqsave(&microcode_update_lock, flags);
181
182 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 162 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
183 /* see notes above for revision 1.07. Apparent chip bug */ 163 /* see notes above for revision 1.07. Apparent chip bug */
184 sync_core(); 164 sync_core();
185 /* get the current revision from MSR 0x8B */ 165 /* get the current revision from MSR 0x8B */
186 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
187 spin_unlock_irqrestore(&microcode_update_lock, flags);
188 167
189 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
190 csig->sig, csig->pf, csig->rev); 169 cpu_num, csig->sig, csig->pf, csig->rev);
191 170
192 return 0; 171 return 0;
193} 172}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 return 0; 297 return 0;
319} 298}
320 299
321static void apply_microcode(int cpu) 300static int apply_microcode(int cpu)
322{ 301{
323 struct microcode_intel *mc_intel; 302 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci; 303 struct ucode_cpu_info *uci;
325 unsigned long flags;
326 unsigned int val[2]; 304 unsigned int val[2];
327 int cpu_num; 305 int cpu_num;
328 306
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
334 BUG_ON(cpu_num != cpu); 312 BUG_ON(cpu_num != cpu);
335 313
336 if (mc_intel == NULL) 314 if (mc_intel == NULL)
337 return; 315 return 0;
338
339 /* serialize access to the physical write to MSR 0x79 */
340 spin_lock_irqsave(&microcode_update_lock, flags);
341 316
342 /* write microcode via MSR 0x79 */ 317 /* write microcode via MSR 0x79 */
343 wrmsr(MSR_IA32_UCODE_WRITE, 318 wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
351 /* get the current revision from MSR 0x8B */ 326 /* get the current revision from MSR 0x8B */
352 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
353 328
354 spin_unlock_irqrestore(&microcode_update_lock, flags);
355 if (val[1] != mc_intel->hdr.rev) { 329 if (val[1] != mc_intel->hdr.rev) {
356 printk(KERN_ERR "microcode: CPU%d update from revision " 330 printk(KERN_ERR "microcode: CPU%d update "
357 "0x%x to 0x%x failed\n", 331 "to revision 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]); 332 cpu_num, mc_intel->hdr.rev);
359 return; 333 return -1;
360 } 334 }
361 printk(KERN_INFO "microcode: CPU%d updated from revision " 335 printk(KERN_INFO "microcode: CPU%d updated to revision "
362 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 336 "0x%x, date = %04x-%02x-%02x \n",
363 cpu_num, uci->cpu_sig.rev, val[1], 337 cpu_num, val[1],
364 mc_intel->hdr.date & 0xffff, 338 mc_intel->hdr.date & 0xffff,
365 mc_intel->hdr.date >> 24, 339 mc_intel->hdr.date >> 24,
366 (mc_intel->hdr.date >> 16) & 0xff); 340 (mc_intel->hdr.date >> 16) & 0xff);
367 341
368 uci->cpu_sig.rev = val[1]; 342 uci->cpu_sig.rev = val[1];
343
344 return 0;
369} 345}
370 346
371static int generic_load_microcode(int cpu, void *data, size_t size, 347static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
372 int (*get_ucode_data)(void *, const void *, size_t)) 348 int (*get_ucode_data)(void *, const void *, size_t))
373{ 349{
374 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
375 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 351 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
376 int new_rev = uci->cpu_sig.rev; 352 int new_rev = uci->cpu_sig.rev;
377 unsigned int leftover = size; 353 unsigned int leftover = size;
354 enum ucode_state state = UCODE_OK;
378 355
379 while (leftover) { 356 while (leftover) {
380 struct microcode_header_intel mc_header; 357 struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
412 leftover -= mc_size; 389 leftover -= mc_size;
413 } 390 }
414 391
415 if (!new_mc) 392 if (leftover) {
393 if (new_mc)
394 vfree(new_mc);
395 state = UCODE_ERROR;
416 goto out; 396 goto out;
397 }
417 398
418 if (leftover) { 399 if (!new_mc) {
419 vfree(new_mc); 400 state = UCODE_NFOUND;
420 goto out; 401 goto out;
421 } 402 }
422 403
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
427 pr_debug("microcode: CPU%d found a matching microcode update with" 408 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n", 409 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev); 410 cpu, new_rev, uci->cpu_sig.rev);
430 411out:
431 out: 412 return state;
432 return (int)leftover;
433} 413}
434 414
435static int get_ucode_fw(void *to, const void *from, size_t n) 415static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
438 return 0; 418 return 0;
439} 419}
440 420
441static int request_microcode_fw(int cpu, struct device *device) 421static enum ucode_state request_microcode_fw(int cpu, struct device *device)
442{ 422{
443 char name[30]; 423 char name[30];
444 struct cpuinfo_x86 *c = &cpu_data(cpu); 424 struct cpuinfo_x86 *c = &cpu_data(cpu);
445 const struct firmware *firmware; 425 const struct firmware *firmware;
446 int ret; 426 enum ucode_state ret;
447 427
448 /* We should bind the task to the CPU */
449 BUG_ON(cpu != raw_smp_processor_id());
450 sprintf(name, "intel-ucode/%02x-%02x-%02x", 428 sprintf(name, "intel-ucode/%02x-%02x-%02x",
451 c->x86, c->x86_model, c->x86_mask); 429 c->x86, c->x86_model, c->x86_mask);
452 ret = request_firmware(&firmware, name, device); 430
453 if (ret) { 431 if (request_firmware(&firmware, name, device)) {
454 pr_debug("microcode: data file %s load failed\n", name); 432 pr_debug("microcode: data file %s load failed\n", name);
455 return ret; 433 return UCODE_NFOUND;
456 } 434 }
457 435
458 ret = generic_load_microcode(cpu, (void *)firmware->data, 436 ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
468 return copy_from_user(to, from, n); 446 return copy_from_user(to, from, n);
469} 447}
470 448
471static int request_microcode_user(int cpu, const void __user *buf, size_t size) 449static enum ucode_state
450request_microcode_user(int cpu, const void __user *buf, size_t size)
472{ 451{
473 /* We should bind the task to the CPU */
474 BUG_ON(cpu != raw_smp_processor_id());
475
476 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); 452 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
477} 453}
478 454
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9faf43bea336..70ec9b951d76 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-	BUG_ON(preemptible());
+	BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
 
-	__get_cpu_var(paravirt_lazy_mode) = mode;
+	percpu_write(paravirt_lazy_mode, mode);
 }
 
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
-	BUG_ON(preemptible());
+	BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
 
-	__get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+	percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
 }
 
 void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
 
 void paravirt_leave_lazy_mmu(void)
 {
-	paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+	leave_lazy(PARAVIRT_LAZY_MMU);
 }
 
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
 {
+	BUG_ON(preemptible());
+
+	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+		arch_leave_lazy_mmu_mode();
+		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+	}
 	enter_lazy(PARAVIRT_LAZY_CPU);
 }
 
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
 {
-	paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+	BUG_ON(preemptible());
+
+	leave_lazy(PARAVIRT_LAZY_CPU);
+
+	if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+		arch_enter_lazy_mmu_mode();
 }
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 {
-	return __get_cpu_var(paravirt_lazy_mode);
+	if (in_interrupt())
+		return PARAVIRT_LAZY_NONE;
+
+	return percpu_read(paravirt_lazy_mode);
 }
 
 void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
 	preempt_disable();
 
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
-		WARN_ON(preempt_count() == 1);
 		arch_leave_lazy_mmu_mode();
 		arch_enter_lazy_mmu_mode();
 	}
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
 	preempt_enable();
 }
 
-void arch_flush_lazy_cpu_mode(void)
-{
-	preempt_disable();
-
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
-		WARN_ON(preempt_count() == 1);
-		arch_leave_lazy_cpu_mode();
-		arch_enter_lazy_cpu_mode();
-	}
-
-	preempt_enable();
-}
-
 struct pv_info pv_info = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.set_iopl_mask = native_set_iopl_mask,
 	.io_delay = native_io_delay,
 
-	.lazy_mode = {
-		.enter = paravirt_nop,
-		.leave = paravirt_nop,
-	},
+	.start_context_switch = paravirt_nop,
+	.end_context_switch = paravirt_nop,
 };
 
 struct pv_apic_ops pv_apic_ops = {
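With the lazy-CPU enter/leave hooks removed above, a paravirt backend that batches CPU state updates would now hook the context-switch entry points instead. A hypothetical sketch; the myhv_* names are invented for illustration, and only the pv_cpu_ops fields and paravirt_{start,end}_context_switch() come from the code above:

/* Illustrative only: how a backend might wire up the new hooks. */
static void myhv_start_context_switch(struct task_struct *prev)
{
	paravirt_start_context_switch(prev);	/* enters PARAVIRT_LAZY_CPU */
	/* ...start batching CPU state updates for the hypervisor... */
}

static void myhv_end_context_switch(struct task_struct *next)
{
	/* ...flush the batched updates to the hypervisor... */
	paravirt_end_context_switch(next);	/* leaves PARAVIRT_LAZY_CPU */
}

static void __init myhv_setup_pv_ops(void)
{
	pv_cpu_ops.start_context_switch = myhv_start_context_switch;
	pv_cpu_ops.end_context_switch = myhv_end_context_switch;
}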
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f3..971a3bec47a8 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
 
 static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
 
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-	int expected, unsigned long start, unsigned long end)
-{
-	unsigned long idx = start;
-
-	BUG_ON(start >= end);
-
-	while (idx < end) {
-		if (!!test_bit(idx, bitmap) != expected)
-			return idx;
-		++idx;
-	}
-
-	/* all bits have the expected value */
-	return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-	int expected, unsigned long start, unsigned long end)
-{
-	return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
 static inline int translation_enabled(struct iommu_table *tbl)
 {
 	/* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 {
 	unsigned long index;
 	unsigned long end;
-	unsigned long badbit;
 	unsigned long flags;
 
 	index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	badbit = verify_bit_range(tbl->it_map, 0, index, end);
-	if (badbit != ~0UL) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "Calgary: entry already allocated at "
-			       "0x%lx tbl %p dma 0x%lx npages %u\n",
-			       badbit, tbl, start_addr, npages);
-	}
-
 	iommu_area_reserve(tbl->it_map, index, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	unsigned int npages)
 {
 	unsigned long entry;
-	unsigned long badbit;
 	unsigned long badend;
 	unsigned long flags;
 
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
-	if (badbit != ~0UL) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "Calgary: bit is off at 0x%lx "
-			       "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
-			       badbit, tbl, dma_addr, entry, npages);
-	}
-
 	iommu_area_free(tbl->it_map, entry, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
 	iommu_detected = 1;
 	calgary_detected = 1;
 	printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
-	printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
-	       "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
-	       debugging ? "enabled" : "disabled");
+	printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+	       specified_table_size);
 
 	/* swiotlb for devices that aren't behind the Calgary. */
 	if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035c..cfd9f9063896 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
 }
 
 #ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x)						\
-	do {							\
-		if (iommu_leak_tab)				\
-			iommu_leak_tab[x] = __builtin_return_address(0);\
-	} while (0)
-
-#define CLEAR_LEAK(x)						\
-	do {							\
-		if (iommu_leak_tab)				\
-			iommu_leak_tab[x] = NULL;		\
-	} while (0)
-
 /* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
 static int leak_trace;
 static int iommu_leak_pages = 20;
 
 static void dump_leak(void)
 {
-	int i;
 	static int dump;
 
-	if (dump || !iommu_leak_tab)
+	if (dump)
 		return;
 	dump = 1;
-	show_stack(NULL, NULL);
 
-	/* Very crude. dump some from the end of the table too */
-	printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
-	       iommu_leak_pages);
-	for (i = 0; i < iommu_leak_pages; i += 2) {
-		printk(KERN_DEBUG "%lu: ", iommu_pages-i);
-		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
-			       0);
-		printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
-	}
-	printk(KERN_DEBUG "\n");
+	show_stack(NULL, NULL);
+	debug_dma_dump_mappings(NULL);
 }
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
 #endif
 
 static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 
 	for (i = 0; i < npages; i++) {
 		iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
-		SET_LEAK(iommu_page + i);
 		phys_mem += PAGE_SIZE;
 	}
 	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
 	for (i = 0; i < npages; i++) {
 		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
-		CLEAR_LEAK(iommu_page + i);
 	}
 	free_iommu(iommu_page, npages);
 }
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
 		pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
 		while (pages--) {
 			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
-			SET_LEAK(iommu_page);
 			addr += PAGE_SIZE;
 			iommu_page++;
 		}
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 	agp_gatt_table = gatt;
 
-	enable_gart_translations();
-
 	error = sysdev_class_register(&gart_sysdev_class);
 	if (!error)
 		error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
 
 #ifdef CONFIG_IOMMU_LEAK
 	if (leak_trace) {
-		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
-				  get_order(iommu_pages*sizeof(void *)));
-		if (!iommu_leak_tab)
+		int ret;
+
+		ret = dma_debug_resize_entries(iommu_pages);
+		if (ret)
 			printk(KERN_DEBUG
-			       "PCI-DMA: Cannot allocate leak trace area\n");
+			       "PCI-DMA: Cannot trace all the entries\n");
 	}
 #endif
 
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
 	 * the pages as Not-Present:
 	 */
 	wbinvd();
+
+	/*
+	 * Now all caches are flushed and we can safely enable
+	 * GART hardware. Doing it early leaves the possibility
+	 * of stale cache entries that can lead to GART PTE
+	 * errors.
+	 */
+	enable_gart_translations();
 
 	/*
 	 * Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e268..a1712f2b50f1 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
 	return paddr;
 }
 
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
 {
 	return baddr;
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e847..3bb2be1649bd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,12 +8,15 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/random.h>
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
+#include <asm/syscalls.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/ds.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
+
+	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
 void free_thread_info(struct thread_info *ti)
@@ -83,8 +88,6 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
-
-	ds_exit_thread(current);
 }
 
 void flush_thread(void)
@@ -613,3 +616,16 @@ static int __init idle_setup(char *str)
 }
 early_param("idle", idle_setup);
 
+unsigned long arch_align_stack(unsigned long sp)
+{
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+		sp -= get_random_int() % 8192;
+	return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long range_end = mm->brk + 0x02000000;
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a2..59f4524984af 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <stdarg.h>
-
 #include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
@@ -33,7 +31,6 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>
 #include <linux/personality.h>
 #include <linux/tick.h>
 #include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		p->thread.io_bitmap_max = 0;
 	}
 
-	ds_copy_thread(p, current);
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
 
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_leave_lazy_cpu_mode();
+	arch_end_context_switch(next_p);
 
 	/* If the task has used fpu the last 5 timeslices, just do a full
 	 * restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }
 
-unsigned long arch_align_stack(unsigned long sp)
-{
-	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-		sp -= get_random_int() % 8192;
-	return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	unsigned long range_end = mm->brk + 0x02000000;
-	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b1..ebefb5407b9d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
14 * This file handles the architecture-dependent parts of process handling.. 14 * This file handles the architecture-dependent parts of process handling..
15 */ 15 */
16 16
17#include <stdarg.h>
18
19#include <linux/stackprotector.h> 17#include <linux/stackprotector.h>
20#include <linux/cpu.h> 18#include <linux/cpu.h>
21#include <linux/errno.h> 19#include <linux/errno.h>
@@ -32,7 +30,6 @@
32#include <linux/delay.h> 30#include <linux/delay.h>
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/ptrace.h> 32#include <linux/ptrace.h>
35#include <linux/random.h>
36#include <linux/notifier.h> 33#include <linux/notifier.h>
37#include <linux/kprobes.h> 34#include <linux/kprobes.h>
38#include <linux/kdebug.h> 35#include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
335 goto out; 332 goto out;
336 } 333 }
337 334
338 ds_copy_thread(p, me); 335 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
336 p->thread.ds_ctx = NULL;
339 337
340 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); 338 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
341 p->thread.debugctlmsr = 0; 339 p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
428 * done before math_state_restore, so the TS bit is up 426 * done before math_state_restore, so the TS bit is up
429 * to date. 427 * to date.
430 */ 428 */
431 arch_leave_lazy_cpu_mode(); 429 arch_end_context_switch(next_p);
432 430
433 /* 431 /*
434 * Switch FS and GS. 432 * Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
660 return do_arch_prctl(current, code, addr); 658 return do_arch_prctl(current, code, addr);
661} 659}
662 660
663unsigned long arch_align_stack(unsigned long sp)
664{
665 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
666 sp -= get_random_int() % 8192;
667 return sp & ~0xf;
668}
669
670unsigned long arch_randomize_brk(struct mm_struct *mm)
671{
672 unsigned long range_end = mm->brk + 0x02000000;
673 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
674}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 23b7c8f017e2..09ecbde91c13 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
578} 579}
579 580
580#ifdef CONFIG_X86_PTRACE_BTS 581#ifdef CONFIG_X86_PTRACE_BTS
582/*
583 * A branch trace store context.
584 *
585 * Contexts may only be installed by ptrace_bts_config() and only for
586 * ptraced tasks.
587 *
588 * Contexts are destroyed when the tracee is detached from the tracer.
589 * The actual destruction work requires interrupts enabled, so the
590 * work is deferred and will be scheduled during __ptrace_unlink().
591 *
592 * Contexts hold an additional task_struct reference on the traced
593 * task, as well as a reference on the tracer's mm.
594 *
595 * Ptrace already holds a task_struct for the duration of ptrace operations,
596 * but since destruction is deferred, it may be executed after both
597 * tracer and tracee exited.
598 */
599struct bts_context {
600 /* The branch trace handle. */
601 struct bts_tracer *tracer;
602
603 /* The buffer used to store the branch trace and its size. */
604 void *buffer;
605 unsigned int size;
606
607 /* The mm that paid for the above buffer. */
608 struct mm_struct *mm;
609
610 /* The task this context belongs to. */
611 struct task_struct *task;
612
613 /* The signal to send on a bts buffer overflow. */
614 unsigned int bts_ovfl_signal;
615
616 /* The work struct to destroy a context. */
617 struct work_struct work;
618};
619
620static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
621{
622 void *buffer = NULL;
623 int err = -ENOMEM;
624
625 err = account_locked_memory(current->mm, current->signal->rlim, size);
626 if (err < 0)
627 return err;
628
629 buffer = kzalloc(size, GFP_KERNEL);
630 if (!buffer)
631 goto out_refund;
632
633 context->buffer = buffer;
634 context->size = size;
635 context->mm = get_task_mm(current);
636
637 return 0;
638
639 out_refund:
640 refund_locked_memory(current->mm, size);
641 return err;
642}
643
644static inline void free_bts_buffer(struct bts_context *context)
645{
646 if (!context->buffer)
647 return;
648
649 kfree(context->buffer);
650 context->buffer = NULL;
651
652 refund_locked_memory(context->mm, context->size);
653 context->size = 0;
654
655 mmput(context->mm);
656 context->mm = NULL;
657}
658
659static void free_bts_context_work(struct work_struct *w)
660{
661 struct bts_context *context;
662
663 context = container_of(w, struct bts_context, work);
664
665 ds_release_bts(context->tracer);
666 put_task_struct(context->task);
667 free_bts_buffer(context);
668 kfree(context);
669}
670
671static inline void free_bts_context(struct bts_context *context)
672{
673 INIT_WORK(&context->work, free_bts_context_work);
674 schedule_work(&context->work);
675}
676
677static inline struct bts_context *alloc_bts_context(struct task_struct *task)
678{
679 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
680 if (context) {
681 context->task = task;
682 task->bts = context;
683
684 get_task_struct(task);
685 }
686
687 return context;
688}
689
581static int ptrace_bts_read_record(struct task_struct *child, size_t index, 690static int ptrace_bts_read_record(struct task_struct *child, size_t index,
582 struct bts_struct __user *out) 691 struct bts_struct __user *out)
583{ 692{
693 struct bts_context *context;
584 const struct bts_trace *trace; 694 const struct bts_trace *trace;
585 struct bts_struct bts; 695 struct bts_struct bts;
586 const unsigned char *at; 696 const unsigned char *at;
587 int error; 697 int error;
588 698
589 trace = ds_read_bts(child->bts); 699 context = child->bts;
700 if (!context)
701 return -ESRCH;
702
703 trace = ds_read_bts(context->tracer);
590 if (!trace) 704 if (!trace)
591 return -EPERM; 705 return -ESRCH;
592 706
593 at = trace->ds.top - ((index + 1) * trace->ds.size); 707 at = trace->ds.top - ((index + 1) * trace->ds.size);
594 if ((void *)at < trace->ds.begin) 708 if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
597 if (!trace->read) 711 if (!trace->read)
598 return -EOPNOTSUPP; 712 return -EOPNOTSUPP;
599 713
600 error = trace->read(child->bts, at, &bts); 714 error = trace->read(context->tracer, at, &bts);
601 if (error < 0) 715 if (error < 0)
602 return error; 716 return error;
603 717
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
611 long size, 725 long size,
612 struct bts_struct __user *out) 726 struct bts_struct __user *out)
613{ 727{
728 struct bts_context *context;
614 const struct bts_trace *trace; 729 const struct bts_trace *trace;
615 const unsigned char *at; 730 const unsigned char *at;
616 int error, drained = 0; 731 int error, drained = 0;
617 732
618 trace = ds_read_bts(child->bts); 733 context = child->bts;
734 if (!context)
735 return -ESRCH;
736
737 trace = ds_read_bts(context->tracer);
619 if (!trace) 738 if (!trace)
620 return -EPERM; 739 return -ESRCH;
621 740
622 if (!trace->read) 741 if (!trace->read)
623 return -EOPNOTSUPP; 742 return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
628 for (at = trace->ds.begin; (void *)at < trace->ds.top; 747 for (at = trace->ds.begin; (void *)at < trace->ds.top;
629 out++, drained++, at += trace->ds.size) { 748 out++, drained++, at += trace->ds.size) {
630 struct bts_struct bts; 749 struct bts_struct bts;
631 int error;
632 750
633 error = trace->read(child->bts, at, &bts); 751 error = trace->read(context->tracer, at, &bts);
634 if (error < 0) 752 if (error < 0)
635 return error; 753 return error;
636 754
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
640 758
641 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); 759 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
642 760
643 error = ds_reset_bts(child->bts); 761 error = ds_reset_bts(context->tracer);
644 if (error < 0) 762 if (error < 0)
645 return error; 763 return error;
646 764
647 return drained; 765 return drained;
648} 766}
649 767
650static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
651{
652 child->bts_buffer = alloc_locked_buffer(size);
653 if (!child->bts_buffer)
654 return -ENOMEM;
655
656 child->bts_size = size;
657
658 return 0;
659}
660
661static void ptrace_bts_free_buffer(struct task_struct *child)
662{
663 free_locked_buffer(child->bts_buffer, child->bts_size);
664 child->bts_buffer = NULL;
665 child->bts_size = 0;
666}
667
668static int ptrace_bts_config(struct task_struct *child, 768static int ptrace_bts_config(struct task_struct *child,
669 long cfg_size, 769 long cfg_size,
670 const struct ptrace_bts_config __user *ucfg) 770 const struct ptrace_bts_config __user *ucfg)
671{ 771{
772 struct bts_context *context;
672 struct ptrace_bts_config cfg; 773 struct ptrace_bts_config cfg;
673 unsigned int flags = 0; 774 unsigned int flags = 0;
674 775
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
678 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 779 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
679 return -EFAULT; 780 return -EFAULT;
680 781
681 if (child->bts) { 782 context = child->bts;
682 ds_release_bts(child->bts); 783 if (!context)
683 child->bts = NULL; 784 context = alloc_bts_context(child);
684 } 785 if (!context)
786 return -ENOMEM;
685 787
686 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 788 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
687 if (!cfg.signal) 789 if (!cfg.signal)
688 return -EINVAL; 790 return -EINVAL;
689 791
690 child->thread.bts_ovfl_signal = cfg.signal;
691 return -EOPNOTSUPP; 792 return -EOPNOTSUPP;
793 context->bts_ovfl_signal = cfg.signal;
692 } 794 }
693 795
694 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && 796 ds_release_bts(context->tracer);
695 (cfg.size != child->bts_size)) { 797 context->tracer = NULL;
696 int error;
697 798
698 ptrace_bts_free_buffer(child); 799 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
800 int err;
699 801
700 error = ptrace_bts_allocate_buffer(child, cfg.size); 802 free_bts_buffer(context);
701 if (error < 0) 803 if (!cfg.size)
702 return error; 804 return 0;
805
806 err = alloc_bts_buffer(context, cfg.size);
807 if (err < 0)
808 return err;
703 } 809 }
704 810
705 if (cfg.flags & PTRACE_BTS_O_TRACE) 811 if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
708 if (cfg.flags & PTRACE_BTS_O_SCHED) 814 if (cfg.flags & PTRACE_BTS_O_SCHED)
709 flags |= BTS_TIMESTAMPS; 815 flags |= BTS_TIMESTAMPS;
710 816
711 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, 817 context->tracer =
712 /* ovfl = */ NULL, /* th = */ (size_t)-1, 818 ds_request_bts_task(child, context->buffer, context->size,
713 flags); 819 NULL, (size_t)-1, flags);
714 if (IS_ERR(child->bts)) { 820 if (unlikely(IS_ERR(context->tracer))) {
715 int error = PTR_ERR(child->bts); 821 int error = PTR_ERR(context->tracer);
716
717 ptrace_bts_free_buffer(child);
718 child->bts = NULL;
719 822
823 free_bts_buffer(context);
824 context->tracer = NULL;
720 return error; 825 return error;
721 } 826 }
722 827
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
727 long cfg_size, 832 long cfg_size,
728 struct ptrace_bts_config __user *ucfg) 833 struct ptrace_bts_config __user *ucfg)
729{ 834{
835 struct bts_context *context;
730 const struct bts_trace *trace; 836 const struct bts_trace *trace;
731 struct ptrace_bts_config cfg; 837 struct ptrace_bts_config cfg;
732 838
839 context = child->bts;
840 if (!context)
841 return -ESRCH;
842
733 if (cfg_size < sizeof(cfg)) 843 if (cfg_size < sizeof(cfg))
734 return -EIO; 844 return -EIO;
735 845
736 trace = ds_read_bts(child->bts); 846 trace = ds_read_bts(context->tracer);
737 if (!trace) 847 if (!trace)
738 return -EPERM; 848 return -ESRCH;
739 849
740 memset(&cfg, 0, sizeof(cfg)); 850 memset(&cfg, 0, sizeof(cfg));
741 cfg.size = trace->ds.end - trace->ds.begin; 851 cfg.size = trace->ds.end - trace->ds.begin;
742 cfg.signal = child->thread.bts_ovfl_signal; 852 cfg.signal = context->bts_ovfl_signal;
743 cfg.bts_size = sizeof(struct bts_struct); 853 cfg.bts_size = sizeof(struct bts_struct);
744 854
745 if (cfg.signal) 855 if (cfg.signal)
746 cfg.flags |= PTRACE_BTS_O_SIGNAL; 856 cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
759 869
760static int ptrace_bts_clear(struct task_struct *child) 870static int ptrace_bts_clear(struct task_struct *child)
761{ 871{
872 struct bts_context *context;
762 const struct bts_trace *trace; 873 const struct bts_trace *trace;
763 874
764 trace = ds_read_bts(child->bts); 875 context = child->bts;
876 if (!context)
877 return -ESRCH;
878
879 trace = ds_read_bts(context->tracer);
765 if (!trace) 880 if (!trace)
766 return -EPERM; 881 return -ESRCH;
767 882
768 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); 883 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
769 884
770 return ds_reset_bts(child->bts); 885 return ds_reset_bts(context->tracer);
771} 886}
772 887
773static int ptrace_bts_size(struct task_struct *child) 888static int ptrace_bts_size(struct task_struct *child)
774{ 889{
890 struct bts_context *context;
775 const struct bts_trace *trace; 891 const struct bts_trace *trace;
776 892
777 trace = ds_read_bts(child->bts); 893 context = child->bts;
894 if (!context)
895 return -ESRCH;
896
897 trace = ds_read_bts(context->tracer);
778 if (!trace) 898 if (!trace)
779 return -EPERM; 899 return -ESRCH;
780 900
781 return (trace->ds.top - trace->ds.begin) / trace->ds.size; 901 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
782} 902}
783 903
784static void ptrace_bts_fork(struct task_struct *tsk) 904/*
785{ 905 * Called from __ptrace_unlink() after the child has been moved back
786 tsk->bts = NULL; 906 * to its original parent.
787 tsk->bts_buffer = NULL; 907 */
788 tsk->bts_size = 0; 908void ptrace_bts_untrace(struct task_struct *child)
789 tsk->thread.bts_ovfl_signal = 0;
790}
791
792static void ptrace_bts_untrace(struct task_struct *child)
793{ 909{
794 if (unlikely(child->bts)) { 910 if (unlikely(child->bts)) {
795 ds_release_bts(child->bts); 911 free_bts_context(child->bts);
796 child->bts = NULL; 912 child->bts = NULL;
797
798 /* We cannot update total_vm and locked_vm since
799 child's mm is already gone. But we can reclaim the
800 memory. */
801 kfree(child->bts_buffer);
802 child->bts_buffer = NULL;
803 child->bts_size = 0;
804 } 913 }
805} 914}
806
807static void ptrace_bts_detach(struct task_struct *child)
808{
809 /*
810 * Ptrace_detach() races with ptrace_untrace() in case
811 * the child dies and is reaped by another thread.
812 *
813 * We only do the memory accounting at this point and
814 * leave the buffer deallocation and the bts tracer
815 * release to ptrace_bts_untrace() which will be called
816 * later on with tasklist_lock held.
817 */
818 release_locked_buffer(child->bts_buffer, child->bts_size);
819}
820#else
821static inline void ptrace_bts_fork(struct task_struct *tsk) {}
822static inline void ptrace_bts_detach(struct task_struct *child) {}
823static inline void ptrace_bts_untrace(struct task_struct *child) {}
824#endif /* CONFIG_X86_PTRACE_BTS */ 915#endif /* CONFIG_X86_PTRACE_BTS */
825 916
826void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
827{
828 ptrace_bts_fork(child);
829}
830
831void x86_ptrace_untrace(struct task_struct *child)
832{
833 ptrace_bts_untrace(child);
834}
835
836/* 917/*
837 * Called by kernel/ptrace.c when detaching.. 918 * Called by kernel/ptrace.c when detaching..
838 * 919 *
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
844#ifdef TIF_SYSCALL_EMU 925#ifdef TIF_SYSCALL_EMU
845 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 926 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
846#endif 927#endif
847 ptrace_bts_detach(child);
848} 928}
849 929
850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 930#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
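
The new bts_context teardown above cannot run with interrupts disabled, so free_bts_context() only queues free_bts_context_work() and returns. A minimal user-space sketch of that defer-the-teardown pattern, with a detached pthread standing in for the kernel workqueue (names and sizes are illustrative; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct context {
    void *buffer;
};

/* Runs later, off the caller's path, like free_bts_context_work(). */
static void *teardown_work(void *arg)
{
    struct context *ctx = arg;

    free(ctx->buffer);
    free(ctx);
    return NULL;
}

/* Like free_bts_context(): schedule the work and return immediately. */
static void schedule_teardown(struct context *ctx)
{
    pthread_t worker;

    if (pthread_create(&worker, NULL, teardown_work, ctx) == 0)
        pthread_detach(worker);
}

int main(void)
{
    struct context *ctx = calloc(1, sizeof(*ctx));

    if (!ctx)
        return 1;
    ctx->buffer = malloc(4096);
    schedule_teardown(ctx);     /* caller never blocks on the free */
    pthread_exit(NULL);         /* let the detached worker finish */
}
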
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f03..af71d06624bf 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494#endif
495
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
497/* Set correct numa_node information for AMD NB functions */
498static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{
500 struct pci_dev *nb_ht;
501 unsigned int devfn;
502 u32 val;
503
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
505 nb_ht = pci_get_slot(dev->bus, devfn);
506 if (!nb_ht)
507 return;
508
509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev);
512}
494 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
515 quirk_amd_nb_node);
516DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
517 quirk_amd_nb_node);
518DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
519 quirk_amd_nb_node);
520DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
521 quirk_amd_nb_node);
522DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
523 quirk_amd_nb_node);
524DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
525 quirk_amd_nb_node);
526DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
527 quirk_amd_nb_node);
528DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
529 quirk_amd_nb_node);
530DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
531 quirk_amd_nb_node);
495#endif 532#endif
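
quirk_amd_nb_node() above takes the NUMA node id from the low three bits of config-space dword 0x60 on function 0 of the AMD northbridge. The same dword can be inspected from user space through sysfs; a small sketch (the BDF used as the default path is only a typical example, the 0x60 offset and the & 7 mask come from the hunk above, and reading past the first 64 config bytes generally needs root):

#include <stdio.h>
#include <stdint.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1]
                                : "/sys/bus/pci/devices/0000:00:18.0/config";
    FILE *f = fopen(path, "rb");
    uint32_t val = 0;

    if (!f) {
        perror("fopen");
        return 1;
    }
    if (fseek(f, 0x60, SEEK_SET) != 0 || fread(&val, sizeof(val), 1, f) != 1) {
        fprintf(stderr, "could not read dword at 0x60\n");
        fclose(f);
        return 1;
    }
    fclose(f);

    printf("node id: %u\n", val & 7);   /* low three bits, as in the quirk */
    return 0;
}
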
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e0b5a0..d2d1ce8170f0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
193 }, 193 },
194 }, 194 },
195 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
196 .callback = set_bios_reboot,
197 .ident = "Dell OptiPlex 360",
198 .matches = {
199 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
200 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
201 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
202 },
203 },
195 { /* Handle problems with rebooting on Dell 2400's */ 204 { /* Handle problems with rebooting on Dell 2400's */
196 .callback = set_bios_reboot, 205 .callback = set_bios_reboot,
197 .ident = "Dell PowerEdge 2400", 206 .ident = "Dell PowerEdge 2400",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 523bb697120d..d1c636bf31a7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access
118 * apertures, ACPI and other tables without having to play with fixmaps.
119 */
120unsigned long max_low_pfn_mapped;
121unsigned long max_pfn_mapped;
122
115RESERVE_BRK(dmi_alloc, 65536); 123RESERVE_BRK(dmi_alloc, 65536);
116 124
117unsigned int boot_cpu_id __read_mostly; 125unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
214unsigned long mmu_cr4_features = X86_CR4_PAE; 222unsigned long mmu_cr4_features = X86_CR4_PAE;
215#endif 223#endif
216 224
217/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 225/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
218int bootloader_type; 226int bootloader_type, bootloader_version;
219 227
220/* 228/*
221 * Setup options 229 * Setup options
@@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p)
706#endif 714#endif
707 saved_video_mode = boot_params.hdr.vid_mode; 715 saved_video_mode = boot_params.hdr.vid_mode;
708 bootloader_type = boot_params.hdr.type_of_loader; 716 bootloader_type = boot_params.hdr.type_of_loader;
717 if ((bootloader_type >> 4) == 0xe) {
718 bootloader_type &= 0xf;
719 bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
720 }
721 bootloader_version = bootloader_type & 0xf;
722 bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
709 723
710#ifdef CONFIG_BLK_DEV_RAM 724#ifdef CONFIG_BLK_DEV_RAM
711 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; 725 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p)
854 max_low_pfn = max_pfn; 868 max_low_pfn = max_pfn;
855 869
856 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 870 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
871 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
857#endif 872#endif
858 873
859#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 874#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
860 setup_bios_corruption_check(); 875 setup_bios_corruption_check();
861#endif 876#endif
862 877
878 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
879 max_pfn_mapped<<PAGE_SHIFT);
880
863 reserve_brk(); 881 reserve_brk();
864 882
865 /* max_pfn_mapped is updated here */ 883 /* max_pfn_mapped is updated here */
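
The new setup_arch() hunk folds the extended loader fields of the boot header into bootloader_type and bootloader_version. A stand-alone sketch of that decoding; the bit layout is copied from the hunk, the sample field values are made up:

#include <stdio.h>
#include <stdint.h>

static void decode_loader(uint8_t type_of_loader, uint8_t ext_loader_type,
                          uint8_t ext_loader_ver)
{
    unsigned int type = type_of_loader;
    unsigned int version;

    if ((type >> 4) == 0xe) {                  /* 0xE marks an extended id */
        type &= 0xf;
        type |= (ext_loader_type + 0x10) << 4;
    }
    version  = type & 0xf;
    version |= (unsigned int)ext_loader_ver << 4;

    printf("type_of_loader=%#x -> bootloader_type=%#x bootloader_version=%#x\n",
           type_of_loader, type, version);
}

int main(void)
{
    decode_loader(0x71, 0x00, 0x00);  /* e.g. plain loader id 7, version 1 */
    decode_loader(0xe1, 0x05, 0x02);  /* extended loader id example */
    return 0;
}
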
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8f0e13be36b3..9c3f0823e6aa 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
426#endif 426#endif
427 427
428#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
429 /*
430 * make sure boot cpu node_number is right, when boot cpu is on the
431 * node that doesn't have mem installed
432 */
433 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
434#endif
435
428 /* Setup node to cpumask map */ 436 /* Setup node to cpumask map */
429 setup_node_to_cpumask_map(); 437 setup_node_to_cpumask_map();
430 438
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4976888094f0..4c578751e94e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index bf1831aa14fa..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -198,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
198{ 198{
199 ack_APIC_irq(); 199 ack_APIC_irq();
200 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 /*
202 * KVM uses this interrupt to force a cpu out of guest mode
203 */
201} 204}
202 205
203void smp_call_function_interrupt(struct pt_regs *regs) 206void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d2e8de958156..7c80007ea5f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
992 */ 992 */
993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && 993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
994 !cpu_has_apic) { 994 !cpu_has_apic) {
995 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 995 if (!disable_apic) {
996 boot_cpu_physical_apicid); 996 pr_err("BIOS bug, local APIC #%d not detected!...\n",
997 printk(KERN_ERR "... forcing use of dummy APIC emulation." 997 boot_cpu_physical_apicid);
998 pr_err("... forcing use of dummy APIC emulation."
998 "(tell your hw vendor)\n"); 999 "(tell your hw vendor)\n");
1000 }
999 smpboot_clear_io_apic(); 1001 smpboot_clear_io_apic();
1000 arch_disable_smp_support(); 1002 arch_disable_smp_support();
1001 return -1; 1003 return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2e37d1..4aaf7e48394f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
20 20
21static int save_stack_stack(void *data, char *name) 21static int save_stack_stack(void *data, char *name)
22{ 22{
23 return -1; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void save_stack_address(void *data, unsigned long addr, int reliable)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..d51321ddafda 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c33761e6d..124d40c575df 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
715 struct bau_desc *adp; 715 struct bau_desc *adp;
716 struct bau_desc *ad2; 716 struct bau_desc *ad2;
717 717
718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); 718 /*
719 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
720 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
721 */
722 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
723 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
719 BUG_ON(!adp); 724 BUG_ON(!adp);
720 725
721 pa = uv_gpa(adp); /* need the real nasid*/ 726 pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
729 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
730 } 735 }
731 736
732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { 737 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
739 * cpu even though we only use the first one; one descriptor can
740 * describe a broadcast to 256 nodes.
741 */
742 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
743 i++, ad2++) {
733 memset(ad2, 0, sizeof(struct bau_desc)); 744 memset(ad2, 0, sizeof(struct bau_desc));
734 ad2->header.sw_ack_flag = 1; 745 ad2->header.sw_ack_flag = 1;
735 /* 746 /*
@@ -832,7 +843,7 @@ static int __init uv_bau_init(void)
832 return 0; 843 return 0;
833 844
834 for_each_possible_cpu(cur_cpu) 845 for_each_possible_cpu(cur_cpu)
835 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 846 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
836 GFP_KERNEL, cpu_to_node(cur_cpu)); 847 GFP_KERNEL, cpu_to_node(cur_cpu));
837 848
838 uv_bau_retry_limit = 1; 849 uv_bau_retry_limit = 1;
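
The kmalloc_node() change above replaces a hard-coded 16384 with sizeof(struct bau_desc) * UV_ITEMS_PER_DESCRIPTOR * UV_ADP_SIZE. Using the sizes quoted in the new comment (64 bytes per descriptor, 8 descriptors per cpu, 32 cpus per blade, taken on trust here), the two forms agree:

#include <stdio.h>

#define BAU_DESC_BYTES           64  /* sizeof(struct bau_desc), per the comment */
#define UV_ITEMS_PER_DESCRIPTOR   8  /* descriptors per cpu */
#define UV_ADP_SIZE              32  /* cpus per blade */

int main(void)
{
    /* 64 * 8 * 32 == 16384, the constant the old call hard-coded. */
    printf("%d bytes\n", BAU_DESC_BYTES * UV_ITEMS_PER_DESCRIPTOR * UV_ADP_SIZE);
    return 0;
}
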
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f4d683b630ba..1e1e27b7d438 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
839 } 839 }
840 840
841 clts(); /* Allow maths ops (or we recurse) */ 841 clts(); /* Allow maths ops (or we recurse) */
842#ifdef CONFIG_X86_32
843 restore_fpu(tsk);
844#else
845 /* 842 /*
846 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 843 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
847 */ 844 */
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
850 force_sig(SIGSEGV, tsk); 847 force_sig(SIGSEGV, tsk);
851 return; 848 return;
852 } 849 }
853#endif 850
854 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 851 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
855 tsk->fpu_counter++; 852 tsk->fpu_counter++;
856} 853}
@@ -945,8 +942,13 @@ void __init trap_init(void)
945#endif 942#endif
946 set_intr_gate(19, &simd_coprocessor_error); 943 set_intr_gate(19, &simd_coprocessor_error);
947 944
945 /* Reserve all the builtin and the syscall vector: */
946 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
947 set_bit(i, used_vectors);
948
948#ifdef CONFIG_IA32_EMULATION 949#ifdef CONFIG_IA32_EMULATION
949 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 950 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
951 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
950#endif 952#endif
951 953
952#ifdef CONFIG_X86_32 954#ifdef CONFIG_X86_32
@@ -963,14 +965,9 @@ void __init trap_init(void)
963 } 965 }
964 966
965 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 967 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
968 set_bit(SYSCALL_VECTOR, used_vectors);
966#endif 969#endif
967 970
968 /* Reserve all the builtin and the syscall vector: */
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors);
971
972 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
973
974 /* 971 /*
975 * Should be a barrier for any external CPU state: 972 * Should be a barrier for any external CPU state:
976 */ 973 */
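
trap_init() now marks the builtin vectors and the syscall vector(s) in used_vectors right where the gates are installed instead of in one late loop. A user-space sketch of that bitmap bookkeeping; 0x20 for FIRST_EXTERNAL_VECTOR and 0x80 for the syscall vector are the conventional x86 values and are assumed here:

#include <stdio.h>

#define NR_VECTORS              256
#define FIRST_EXTERNAL_VECTOR   0x20    /* assumed value */
#define SYSCALL_VECTOR          0x80    /* assumed value */

static unsigned char used_vectors[NR_VECTORS / 8];

static void set_vector(int v)  { used_vectors[v / 8] |= 1u << (v % 8); }
static int  test_vector(int v) { return used_vectors[v / 8] & (1u << (v % 8)); }

int main(void)
{
    int i, reserved = 0;

    /* Reserve all the builtin vectors and the syscall vector. */
    for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
        set_vector(i);
    set_vector(SYSCALL_VECTOR);

    for (i = 0; i < NR_VECTORS; i++)
        reserved += !!test_vector(i);
    printf("%d of %d vectors reserved\n", reserved, NR_VECTORS);
    return 0;
}
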
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc430..3e1c057e98fe 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
384{ 384{
385 u64 tsc1, tsc2, delta, ref1, ref2; 385 u64 tsc1, tsc2, delta, ref1, ref2;
386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
387 unsigned long flags, latch, ms, fast_calibrate, tsc_khz; 387 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
388 int hpet = is_hpet_enabled(), i, loopmin; 388 int hpet = is_hpet_enabled(), i, loopmin;
389 389
390 tsc_khz = get_hypervisor_tsc_freq(); 390 hv_tsc_khz = get_hypervisor_tsc_freq();
391 if (tsc_khz) { 391 if (hv_tsc_khz) {
392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); 392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
393 return tsc_khz; 393 return hv_tsc_khz;
394 } 394 }
395 395
396 local_irq_save(flags); 396 local_irq_save(flags);
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
710#ifdef CONFIG_X86_64 710#ifdef CONFIG_X86_64
711static cycle_t __vsyscall_fn vread_tsc(void) 711static cycle_t __vsyscall_fn vread_tsc(void)
712{ 712{
713 cycle_t ret = (cycle_t)vget_cycles(); 713 cycle_t ret;
714
715 /*
716 * Surround the RDTSC by barriers, to make sure it's not
717 * speculated to outside the seqlock critical section and
718 * does not cause time warps:
719 */
720 rdtsc_barrier();
721 ret = (cycle_t)vget_cycles();
722 rdtsc_barrier();
714 723
715 return ret >= __vsyscall_gtod_data.clock.cycle_last ? 724 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
716 ret : __vsyscall_gtod_data.clock.cycle_last; 725 ret : __vsyscall_gtod_data.clock.cycle_last;
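
vread_tsc() now brackets the RDTSC with rdtsc_barrier() so the read cannot be speculated out of the seqlock critical section and produce time warps. A user-space approximation of the ordered read, using LFENCE on both sides (the kernel helper may resolve to MFENCE on some CPUs; GCC-style inline asm on x86-64 is assumed):

#include <stdio.h>
#include <stdint.h>

static inline uint64_t rdtsc_ordered(void)
{
    uint32_t lo, hi;

    __asm__ __volatile__("lfence" ::: "memory");     /* fence before the read */
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    __asm__ __volatile__("lfence" ::: "memory");     /* fence after the read */

    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    uint64_t t1 = rdtsc_ordered();
    uint64_t t2 = rdtsc_ordered();

    printf("delta: %llu cycles\n", (unsigned long long)(t2 - t1));
    return 0;
}
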
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef9..027b5b498993 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37
37static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps; 40static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 return; 114 return;
114 115
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO 117 pr_info("Skipping synchronization checks as TSC is reliable.\n");
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
123 123
124 /* 124 /*
125 * Reset it - in case this is a second bootup: 125 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
143 143
144 if (nr_warps) { 144 if (nr_warps) {
145 printk("\n"); 145 printk("\n");
146 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 " turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
195 while (atomic_read(&stop_count) != cpus) 195 while (atomic_read(&stop_count) != cpus)
196 cpu_relax(); 196 cpu_relax();
197} 197}
198#undef NR_LOOPS
199
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1c..9c4e62539058 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
287 info->regs.pt.ds = 0; 287 info->regs.pt.ds = 0;
288 info->regs.pt.es = 0; 288 info->regs.pt.es = 0;
289 info->regs.pt.fs = 0; 289 info->regs.pt.fs = 0;
290 290#ifndef CONFIG_X86_32_LAZY_GS
291/* we are clearing gs later just before "jmp resume_userspace", 291 info->regs.pt.gs = 0;
292 * because it is not saved/restored. 292#endif
293 */
294 293
295/* 294/*
296 * The flags register is also special: we cannot trust that the user 295 * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 } 317 }
319 318
320/* 319/*
321 * Save old state, set default return value (%ax) to 0 320 * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
322 */ 321 */
323 info->regs32->ax = 0; 322 info->regs32->ax = VM86_SIGNAL;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 323 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 324 tsk->thread.saved_fs = info->regs32->fs;
326 tsk->thread.saved_gs = get_user_gs(info->regs32); 325 tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
343 __asm__ __volatile__( 342 __asm__ __volatile__(
344 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
345 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345#ifdef CONFIG_X86_32_LAZY_GS
346 "mov %2, %%gs\n\t" 346 "mov %2, %%gs\n\t"
347#endif
347 "jmp resume_userspace" 348 "jmp resume_userspace"
348 : /* no outputs */ 349 : /* no outputs */
349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 350 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211e..b263423fbe2a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
462} 462}
463#endif 463#endif
464 464
465static void vmi_enter_lazy_cpu(void) 465static void vmi_start_context_switch(struct task_struct *prev)
466{ 466{
467 paravirt_enter_lazy_cpu(); 467 paravirt_start_context_switch(prev);
468 vmi_ops.set_lazy_mode(2); 468 vmi_ops.set_lazy_mode(2);
469} 469}
470 470
471static void vmi_end_context_switch(struct task_struct *next)
472{
473 vmi_ops.set_lazy_mode(0);
474 paravirt_end_context_switch(next);
475}
476
471static void vmi_enter_lazy_mmu(void) 477static void vmi_enter_lazy_mmu(void)
472{ 478{
473 paravirt_enter_lazy_mmu(); 479 paravirt_enter_lazy_mmu();
474 vmi_ops.set_lazy_mode(1); 480 vmi_ops.set_lazy_mode(1);
475} 481}
476 482
477static void vmi_leave_lazy(void) 483static void vmi_leave_lazy_mmu(void)
478{ 484{
479 paravirt_leave_lazy(paravirt_get_lazy_mode());
480 vmi_ops.set_lazy_mode(0); 485 vmi_ops.set_lazy_mode(0);
486 paravirt_leave_lazy_mmu();
481} 487}
482 488
483static inline int __init check_vmi_rom(struct vrom_header *rom) 489static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
711 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 717 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
712 para_fill(pv_cpu_ops.io_delay, IODelay); 718 para_fill(pv_cpu_ops.io_delay, IODelay);
713 719
714 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, 720 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
715 set_lazy_mode, SetLazyMode); 721 set_lazy_mode, SetLazyMode);
716 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, 722 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
717 set_lazy_mode, SetLazyMode); 723 set_lazy_mode, SetLazyMode);
718 724
719 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, 725 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
720 set_lazy_mode, SetLazyMode); 726 set_lazy_mode, SetLazyMode);
721 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, 727 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
722 set_lazy_mode, SetLazyMode); 728 set_lazy_mode, SetLazyMode);
723 729
724 /* user and kernel flush are just handled with different flags to FlushTLB */ 730 /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f013..4c85b2e2bb65 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,431 @@
1/*
2 * ld script for the x86 kernel
3 *
4 * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
5 *
6 * Modernisation, unification and other changes and fixes:
7 * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
8 *
9 *
10 * Don't define absolute symbols until and unless you know that symbol
11 * value should remain constant even if kernel image is relocated
12 * at run time. Absolute symbols are not relocated. If symbol value should
13 * change if kernel is relocated, make the symbol section relative and
14 * put it inside the section definition.
15 */
16
1#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S" 18#define LOAD_OFFSET __PAGE_OFFSET
3#else 19#else
4# include "vmlinux_64.lds.S" 20#define LOAD_OFFSET __START_KERNEL_map
5#endif 21#endif
22
23#include <asm-generic/vmlinux.lds.h>
24#include <asm/asm-offsets.h>
25#include <asm/thread_info.h>
26#include <asm/page_types.h>
27#include <asm/cache.h>
28#include <asm/boot.h>
29
30#undef i386 /* in case the preprocessor is a 32bit one */
31
32OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
33
34#ifdef CONFIG_X86_32
35OUTPUT_ARCH(i386)
36ENTRY(phys_startup_32)
37jiffies = jiffies_64;
38#else
39OUTPUT_ARCH(i386:x86-64)
40ENTRY(phys_startup_64)
41jiffies_64 = jiffies;
42#endif
43
44PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */
49 data.init PT_LOAD FLAGS(7); /* RWE */
50#ifdef CONFIG_SMP
51 percpu PT_LOAD FLAGS(7); /* RWE */
52#endif
53 data.init2 PT_LOAD FLAGS(7); /* RWE */
54#endif
55 note PT_NOTE FLAGS(0); /* ___ */
56}
57
58SECTIONS
59{
60#ifdef CONFIG_X86_32
61 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
62 phys_startup_32 = startup_32 - LOAD_OFFSET;
63#else
64 . = __START_KERNEL;
65 phys_startup_64 = startup_64 - LOAD_OFFSET;
66#endif
67
68 /* Text and read-only data */
69
70 /* bootstrapping code */
71 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
72 _text = .;
73 *(.text.head)
74 } :text = 0x9090
75
76 /* The rest of the text */
77 .text : AT(ADDR(.text) - LOAD_OFFSET) {
78#ifdef CONFIG_X86_32
79 /* not really needed, already page aligned */
80 . = ALIGN(PAGE_SIZE);
81 *(.text.page_aligned)
82#endif
83 . = ALIGN(8);
84 _stext = .;
85 TEXT_TEXT
86 SCHED_TEXT
87 LOCK_TEXT
88 KPROBES_TEXT
89 IRQENTRY_TEXT
90 *(.fixup)
91 *(.gnu.warning)
92 /* End of text section */
93 _etext = .;
94 } :text = 0x9090
95
96 NOTES :text :note
97
98 /* Exception table */
99 . = ALIGN(16);
100 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
101 __start___ex_table = .;
102 *(__ex_table)
103 __stop___ex_table = .;
104 } :text = 0x9090
105
106 RODATA
107
108 /* Data */
109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 DATA_DATA
112 CONSTRUCTORS
113
114#ifdef CONFIG_X86_64
115 /* End of data section */
116 _edata = .;
117#endif
118 } :data
119
120#ifdef CONFIG_X86_32
121 /* 32 bit has nosave before _edata */
122 . = ALIGN(PAGE_SIZE);
123 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
124 __nosave_begin = .;
125 *(.data.nosave)
126 . = ALIGN(PAGE_SIZE);
127 __nosave_end = .;
128 }
129#endif
130
131 . = ALIGN(PAGE_SIZE);
132 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
133 *(.data.page_aligned)
134 *(.data.idt)
135 }
136
137#ifdef CONFIG_X86_32
138 . = ALIGN(32);
139#else
140 . = ALIGN(PAGE_SIZE);
141 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
142#endif
143 .data.cacheline_aligned :
144 AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
145 *(.data.cacheline_aligned)
146 }
147
148 /* rarely changed data like cpu maps */
149#ifdef CONFIG_X86_32
150 . = ALIGN(32);
151#else
152 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
153#endif
154 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
155 *(.data.read_mostly)
156
157#ifdef CONFIG_X86_32
158 /* End of data section */
159 _edata = .;
160#endif
161 }
162
163#ifdef CONFIG_X86_64
164
165#define VSYSCALL_ADDR (-10*1024*1024)
166#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
167 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
168#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
169 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
170
171#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
172#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
173
174#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
175#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
176
177 . = VSYSCALL_ADDR;
178 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
179 *(.vsyscall_0)
180 } :user
181
182 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
183
184 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
185 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
186 *(.vsyscall_fn)
187 }
188
189 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
190 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
191 *(.vsyscall_gtod_data)
192 }
193
194 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
195 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
196 *(.vsyscall_clock)
197 }
198 vsyscall_clock = VVIRT(.vsyscall_clock);
199
200
201 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
202 *(.vsyscall_1)
203 }
204 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
205 *(.vsyscall_2)
206 }
207
208 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
209 *(.vgetcpu_mode)
210 }
211 vgetcpu_mode = VVIRT(.vgetcpu_mode);
212
213 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
214 .jiffies : AT(VLOAD(.jiffies)) {
215 *(.jiffies)
216 }
217 jiffies = VVIRT(.jiffies);
218
219 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
220 *(.vsyscall_3)
221 }
222
223 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
224
225#undef VSYSCALL_ADDR
226#undef VSYSCALL_PHYS_ADDR
227#undef VSYSCALL_VIRT_ADDR
228#undef VLOAD_OFFSET
229#undef VLOAD
230#undef VVIRT_OFFSET
231#undef VVIRT
232
233#endif /* CONFIG_X86_64 */
234
235 /* init_task */
236 . = ALIGN(THREAD_SIZE);
237 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
238 *(.data.init_task)
239 }
240#ifdef CONFIG_X86_64
241 :data.init
242#endif
243
244 /*
245 * smp_locks might be freed after init
246 * start/end must be page aligned
247 */
248 . = ALIGN(PAGE_SIZE);
249 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
250 __smp_locks = .;
251 *(.smp_locks)
252 __smp_locks_end = .;
253 . = ALIGN(PAGE_SIZE);
254 }
255
256 /* Init code and data - will be freed after init */
257 . = ALIGN(PAGE_SIZE);
258 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
259 __init_begin = .; /* paired with __init_end */
260 _sinittext = .;
261 INIT_TEXT
262 _einittext = .;
263 }
264
265 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
266 INIT_DATA
267 }
268
269 . = ALIGN(16);
270 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
271 __setup_start = .;
272 *(.init.setup)
273 __setup_end = .;
274 }
275 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
276 __initcall_start = .;
277 INITCALLS
278 __initcall_end = .;
279 }
280
281 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
282 __con_initcall_start = .;
283 *(.con_initcall.init)
284 __con_initcall_end = .;
285 }
286
287 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
288 __x86_cpu_dev_start = .;
289 *(.x86_cpu_dev.init)
290 __x86_cpu_dev_end = .;
291 }
292
293 SECURITY_INIT
294
295 . = ALIGN(8);
296 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
297 __parainstructions = .;
298 *(.parainstructions)
299 __parainstructions_end = .;
300 }
301
302 . = ALIGN(8);
303 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
304 __alt_instructions = .;
305 *(.altinstructions)
306 __alt_instructions_end = .;
307 }
308
309 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
310 *(.altinstr_replacement)
311 }
312
313 /*
314 * .exit.text is discarded at runtime, not link time, to deal with
315 * references from .altinstructions and .eh_frame
316 */
317 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
318 EXIT_TEXT
319 }
320
321 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
322 EXIT_DATA
323 }
324
325#ifdef CONFIG_BLK_DEV_INITRD
326 . = ALIGN(PAGE_SIZE);
327 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
328 __initramfs_start = .;
329 *(.init.ramfs)
330 __initramfs_end = .;
331 }
332#endif
333
334#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
335 /*
336 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
337 * output PHDR, so the next output section - __data_nosave - should
338 * start another section data.init2. Also, pda should be at the head of
339 * percpu area. Preallocate it and define the percpu offset symbol
340 * so that it can be accessed as a percpu variable.
341 */
342 . = ALIGN(PAGE_SIZE);
343 PERCPU_VADDR(0, :percpu)
344#else
345 PERCPU(PAGE_SIZE)
346#endif
347
348 . = ALIGN(PAGE_SIZE);
349
350 /* freed after init ends here */
351 .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
352 __init_end = .;
353 }
354
355#ifdef CONFIG_X86_64
356 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
357 . = ALIGN(PAGE_SIZE);
358 __nosave_begin = .;
359 *(.data.nosave)
360 . = ALIGN(PAGE_SIZE);
361 __nosave_end = .;
362 } :data.init2
363 /* use another section data.init2, see PERCPU_VADDR() above */
364#endif
365
366 /* BSS */
367 . = ALIGN(PAGE_SIZE);
368 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
369 __bss_start = .;
370 *(.bss.page_aligned)
371 *(.bss)
372 . = ALIGN(4);
373 __bss_stop = .;
374 }
375
376 . = ALIGN(PAGE_SIZE);
377 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
378 __brk_base = .;
379 . += 64 * 1024; /* 64k alignment slop space */
380 *(.brk_reservation) /* areas brk users have reserved */
381 __brk_limit = .;
382 }
383
384 .end : AT(ADDR(.end) - LOAD_OFFSET) {
385 _end = .;
386 }
387
388 /* Sections to be discarded */
389 /DISCARD/ : {
390 *(.exitcall.exit)
391 *(.eh_frame)
392 *(.discard)
393 }
394
395 STABS_DEBUG
396 DWARF_DEBUG
397}
398
399
400#ifdef CONFIG_X86_32
401ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
402 "kernel image bigger than KERNEL_IMAGE_SIZE")
403#else
404/*
405 * Per-cpu symbols which need to be offset from __per_cpu_load
406 * for the boot processor.
407 */
408#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
409INIT_PER_CPU(gdt_page);
410INIT_PER_CPU(irq_stack_union);
411
412/*
413 * Build-time check on the image size:
414 */
415ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
416 "kernel image bigger than KERNEL_IMAGE_SIZE")
417
418#ifdef CONFIG_SMP
419ASSERT((per_cpu__irq_stack_union == 0),
420 "irq_stack_union is not at start of per-cpu area");
421#endif
422
423#endif /* CONFIG_X86_32 */
424
425#ifdef CONFIG_KEXEC
426#include <asm/kexec.h>
427
428ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
429 "kexec control code size is too big")
430#endif
431
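
The unified script keeps the old definition VSYSCALL_ADDR = -10*1024*1024; interpreted as a 64-bit kernel virtual address, that negative offset is the familiar fixed vsyscall page. A two-line check of the arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    int64_t vsyscall_addr = -10 * 1024 * 1024;   /* as in the linker script */

    /* Prints 0xffffffffff600000, the fixed vsyscall page address. */
    printf("VSYSCALL_ADDR = %#llx\n", (unsigned long long)(uint64_t)vsyscall_addr);
    return 0;
}
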
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f3..000000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11#define LOAD_OFFSET __PAGE_OFFSET
12
13#include <asm-generic/vmlinux.lds.h>
14#include <asm/thread_info.h>
15#include <asm/page_types.h>
16#include <asm/cache.h>
17#include <asm/boot.h>
18
19OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
20OUTPUT_ARCH(i386)
21ENTRY(phys_startup_32)
22jiffies = jiffies_64;
23
24PHDRS {
25 text PT_LOAD FLAGS(5); /* R_E */
26 data PT_LOAD FLAGS(7); /* RWE */
27 note PT_NOTE FLAGS(0); /* ___ */
28}
29SECTIONS
30{
31 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
32 phys_startup_32 = startup_32 - LOAD_OFFSET;
33
34 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
35 _text = .; /* Text and read-only data */
36 *(.text.head)
37 } :text = 0x9090
38
39 /* read-only */
40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
42 *(.text.page_aligned)
43 TEXT_TEXT
44 SCHED_TEXT
45 LOCK_TEXT
46 KPROBES_TEXT
47 IRQENTRY_TEXT
48 *(.fixup)
49 *(.gnu.warning)
50 _etext = .; /* End of text section */
51 } :text = 0x9090
52
53 NOTES :text :note
54
55 . = ALIGN(16); /* Exception table */
56 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
57 __start___ex_table = .;
58 *(__ex_table)
59 __stop___ex_table = .;
60 } :text = 0x9090
61
62 RODATA
63
64 /* writeable */
65 . = ALIGN(PAGE_SIZE);
66 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 . = ALIGN(PAGE_SIZE);
72 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
73 __nosave_begin = .;
74 *(.data.nosave)
75 . = ALIGN(PAGE_SIZE);
76 __nosave_end = .;
77 }
78
79 . = ALIGN(PAGE_SIZE);
80 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
81 *(.data.page_aligned)
82 *(.data.idt)
83 }
84
85 . = ALIGN(32);
86 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
87 *(.data.cacheline_aligned)
88 }
89
90 /* rarely changed data like cpu maps */
91 . = ALIGN(32);
92 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
93 *(.data.read_mostly)
94 _edata = .; /* End of data section */
95 }
96
97 . = ALIGN(THREAD_SIZE); /* init_task */
98 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
99 *(.data.init_task)
100 }
101
102 /* might get freed after init */
103 . = ALIGN(PAGE_SIZE);
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
105 __smp_locks = .;
106 *(.smp_locks)
107 __smp_locks_end = .;
108 }
109 /* will be freed after init
110 * Following ALIGN() is required to make sure no other data falls on the
111 * same page where __smp_alt_end is pointing as that page might be freed
112 * after boot. Always make sure that ALIGN() directive is present after
113 * the section which contains __smp_alt_end.
114 */
115 . = ALIGN(PAGE_SIZE);
116
117 /* will be freed after init */
118 . = ALIGN(PAGE_SIZE); /* Init code and data */
119 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
120 __init_begin = .;
121 _sinittext = .;
122 INIT_TEXT
123 _einittext = .;
124 }
125 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
126 INIT_DATA
127 }
128 . = ALIGN(16);
129 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
130 __setup_start = .;
131 *(.init.setup)
132 __setup_end = .;
133 }
134 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
135 __initcall_start = .;
136 INITCALLS
137 __initcall_end = .;
138 }
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
140 __con_initcall_start = .;
141 *(.con_initcall.init)
142 __con_initcall_end = .;
143 }
144 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
145 __x86_cpu_dev_start = .;
146 *(.x86_cpu_dev.init)
147 __x86_cpu_dev_end = .;
148 }
149 SECURITY_INIT
150 . = ALIGN(4);
151 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
152 __alt_instructions = .;
153 *(.altinstructions)
154 __alt_instructions_end = .;
155 }
156 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
157 *(.altinstr_replacement)
158 }
159 . = ALIGN(4);
160 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
161 __parainstructions = .;
162 *(.parainstructions)
163 __parainstructions_end = .;
164 }
165 /* .exit.text is discard at runtime, not link time, to deal with references
166 from .altinstructions and .eh_frame */
167 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
168 EXIT_TEXT
169 }
170 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
171 EXIT_DATA
172 }
173#if defined(CONFIG_BLK_DEV_INITRD)
174 . = ALIGN(PAGE_SIZE);
175 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
176 __initramfs_start = .;
177 *(.init.ramfs)
178 __initramfs_end = .;
179 }
180#endif
181 PERCPU(PAGE_SIZE)
182 . = ALIGN(PAGE_SIZE);
183 /* freed after init ends here */
184
185 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
186 __init_end = .;
187 __bss_start = .; /* BSS */
188 *(.bss.page_aligned)
189 *(.bss)
190 . = ALIGN(4);
191 __bss_stop = .;
192 }
193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
195 . = ALIGN(PAGE_SIZE);
196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
204 }
205
206 /* Sections to be discarded */
207 /DISCARD/ : {
208 *(.exitcall.exit)
209 *(.discard)
210 }
211
212 STABS_DEBUG
213
214 DWARF_DEBUG
215}
216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
223#ifdef CONFIG_KEXEC
224/* Link time checks */
225#include <asm/kexec.h>
226
227ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
228 "kexec control code size is too big")
229#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b030..000000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
9#include <asm/page_types.h>
10
11#undef i386 /* in case the preprocessor is a 32bit one */
12
13OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
14OUTPUT_ARCH(i386:x86-64)
15ENTRY(phys_startup_64)
16jiffies_64 = jiffies;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
26 note PT_NOTE FLAGS(0); /* ___ */
27}
28SECTIONS
29{
30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */
35 *(.text.head)
36 _stext = .;
37 /* Then the rest */
38 TEXT_TEXT
39 SCHED_TEXT
40 LOCK_TEXT
41 KPROBES_TEXT
42 IRQENTRY_TEXT
43 *(.fixup)
44 *(.gnu.warning)
45 _etext = .; /* End of text section */
46 } :text = 0x9090
47
48 NOTES :text :note
49
50 . = ALIGN(16); /* Exception table */
51 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
52 __start___ex_table = .;
53 *(__ex_table)
54 __stop___ex_table = .;
55 } :text = 0x9090
56
57 RODATA
58
59 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
60 /* Data */
61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA
63 CONSTRUCTORS
64 _edata = .; /* End of data section */
65 } :data
66
67
68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned)
72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
74 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
75 *(.data.read_mostly)
76 }
77
78#define VSYSCALL_ADDR (-10*1024*1024)
79#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
80#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
81
82#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
83#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
84
85#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
86#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
87
88 . = VSYSCALL_ADDR;
89 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
90 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
91
92 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
93 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
94 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
95 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
96 { *(.vsyscall_gtod_data) }
97 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
98 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
99 { *(.vsyscall_clock) }
100 vsyscall_clock = VVIRT(.vsyscall_clock);
101
102
103 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
104 { *(.vsyscall_1) }
105 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
106 { *(.vsyscall_2) }
107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
111 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
112 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
113 jiffies = VVIRT(.jiffies);
114
115 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
116 { *(.vsyscall_3) }
117
118 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
119
120#undef VSYSCALL_ADDR
121#undef VSYSCALL_PHYS_ADDR
122#undef VSYSCALL_VIRT_ADDR
123#undef VLOAD_OFFSET
124#undef VLOAD
125#undef VVIRT_OFFSET
126#undef VVIRT
127
128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task)
131 }:data.init
132
133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned)
136 }
137
138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
147 }
148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .;
153 INIT_TEXT
154 _einittext = .;
155 }
156 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
157 __initdata_begin = .;
158 INIT_DATA
159 __initdata_end = .;
160 }
161
162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 . = ALIGN(16);
164 __setup_start = .;
165 *(.init.setup)
166 __setup_end = .;
167 }
168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
170 INITCALLS
171 __initcall_end = .;
172 }
173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
175 *(.con_initcall.init)
176 __con_initcall_end = .;
177 }
178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
182 }
183 SECURITY_INIT
184
185 . = ALIGN(8);
186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
187 __parainstructions = .;
188 *(.parainstructions)
189 __parainstructions_end = .;
190 }
191
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
195 *(.altinstructions)
196 __alt_instructions_end = .;
197 }
198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
199 *(.altinstr_replacement)
200 }
201	/* .exit.text is discarded at runtime, not link time, to deal with references
202	   from .altinstructions and .eh_frame */
203 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
204 EXIT_TEXT
205 }
206 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
207 EXIT_DATA
208 }
209
210#ifdef CONFIG_BLK_DEV_INITRD
211 . = ALIGN(PAGE_SIZE);
212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
213 __initramfs_start = .;
214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
217#endif
218
219#ifdef CONFIG_SMP
220 /*
221 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
222 * output PHDR, so the next output section - __data_nosave - should
223 * start another section data.init2. Also, pda should be at the head of
224 * percpu area. Preallocate it and define the percpu offset symbol
225 * so that it can be accessed as a percpu variable.
226 */
227 . = ALIGN(PAGE_SIZE);
228 PERCPU_VADDR(0, :percpu)
229#else
230 PERCPU(PAGE_SIZE)
231#endif
232
233 . = ALIGN(PAGE_SIZE);
234 __init_end = .;
235
236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
243
244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
247 *(.bss.page_aligned)
248 *(.bss)
249 __bss_stop = .;
250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
259
260 _end = . ;
261
262 /* Sections to be discarded */
263 /DISCARD/ : {
264 *(.exitcall.exit)
265 *(.eh_frame)
266 *(.discard)
267 }
268
269 STABS_DEBUG
270
271 DWARF_DEBUG
272}
273
274 /*
275 * Per-cpu symbols which need to be offset from __per_cpu_load
276 * for the boot processor.
277 */
278#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
279INIT_PER_CPU(gdt_page);
280INIT_PER_CPU(irq_stack_union);
281
282/*
283 * Build-time check on the image size:
284 */
285ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
286 "kernel image bigger than KERNEL_IMAGE_SIZE")
287
288#ifdef CONFIG_SMP
289ASSERT((per_cpu__irq_stack_union == 0),
290 "irq_stack_union is not at start of per-cpu area");
291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
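
The INIT_PER_CPU() macro near the end of the deleted 64-bit script, together with the ASSERT that per_cpu__irq_stack_union sits at offset 0, reflects the zero-based per-cpu layout described in the comments above: a per-cpu symbol is an offset into the per-cpu area, and the boot processor's copy lives at __per_cpu_load plus that offset. Below is a minimal C model of that idea; the array, offsets, and macro are invented for illustration and are not the kernel's implementation:

/* percpu_model.c - illustrative sketch of zero-based per-cpu offsets */
#include <stdio.h>
#include <stdint.h>

#define PERCPU_AREA_SIZE 4096

/* Stand-in for the load-time copy of the per-cpu section (__per_cpu_load). */
static unsigned char boot_percpu_area[PERCPU_AREA_SIZE];

/* With zero-based linking, a per-cpu "symbol" is just an offset into the area;
 * irq_stack_union must come first, hence offset 0 (the gdt_page offset is made up). */
static const uintptr_t per_cpu__irq_stack_union = 0;
static const uintptr_t per_cpu__gdt_page        = 256;

/* Mirrors the INIT_PER_CPU(x) idea: the boot CPU's instance is the offset
 * plus the address of the load-time copy. */
#define INIT_PER_CPU(x) ((void *)(boot_percpu_area + per_cpu__##x))

int main(void)
{
	printf("init_per_cpu__irq_stack_union = %p\n", INIT_PER_CPU(irq_stack_union));
	printf("init_per_cpu__gdt_page        = %p\n", INIT_PER_CPU(gdt_page));
	return 0;
}
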
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc9067..25ee06a80aad 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
 			return;
 		}
 
-		/*
-		 * Surround the RDTSC by barriers, to make sure it's not
-		 * speculated to outside the seqlock critical section and
-		 * does not cause time warps:
-		 */
-		rdtsc_barrier();
 		now = vread();
-		rdtsc_barrier();
-
 		base = __vsyscall_gtod_data.clock.cycle_last;
 		mask = __vsyscall_gtod_data.clock.mask;
 		mult = __vsyscall_gtod_data.clock.mult;