From d5a2601734bcc740ee78dc4cb0c56b5687da7bd9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:42 +0200 Subject: [PATCH] i386/x86-64: Add user_mode checks to profile_pc for oprofile Fixes a obscure user space triggerable crash during oprofiling. Oprofile calls profile_pc from NMIs even when user_mode(regs) is not true and the program counter is inside the kernel lock section. This opens a race - when a user program jumps to a kernel lock address and a NMI happens before the illegal page fault exception is raised and the program has a unmapped esp or ebp then the kernel could oops. NMIs have a higher priority than exceptions so that could happen. Add user_mode checks to i386/x86-64 profile_pc to prevent that. Cc: John Levon Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index b9ff75992c16..e0341c6808e5 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -193,7 +193,7 @@ unsigned long profile_pc(struct pt_regs *regs) is just accounted to the spinlock function. Better would be to write these functions in assembler again and check exactly. */ - if (in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { char *v = *(char **)regs->rsp; if ((v >= _stext && v <= _etext) || (v >= _sinittext && v <= _einittext) || -- cgit v1.2.2 From 0e92da4acb763272c6060f0b14adc2377b627d07 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:45 +0200 Subject: [PATCH] x86_64: Don't clobber r8-r11 in int 0x80 handler When int 0x80 is called from long mode r8-r11 would leak out of the kernel (or rather they would be filled with some values from the kernel stack). I don't think it's a security issue because the values come from the fixed stack frame which should be near always user registers from a previous interrupt. Still better fix it. Longer term the register save macros need to be cleaned up to avoid such mistakes in the future. Original analysis from Richard Brunner, fix by me. Cc: Richard.Brunner@amd.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/ia32/ia32entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 9b5bb413a6e9..5d4a7d125ed0 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -103,7 +103,7 @@ ENTRY(ia32_sysenter_target) pushq %rax CFI_ADJUST_CFA_OFFSET 8 cld - SAVE_ARGS 0,0,1 + SAVE_ARGS 0,0,0 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%r9d -- cgit v1.2.2 From b13761ecd1d9977d2083da243e051e9f29097aef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:51 +0200 Subject: [PATCH] x86_64: Dump leftover backtrace entries when dwarf2 unwinder got stuck The dwarf2 unwinder currently often gets stuck because a lot of assembly code doesn't have proper dwarf2 annotiation yet. This currently often happens with __down. Should fix this by adding proper dwarf2 annotation to all inline assembly. However until that's done we need a quick fix for 2.6.18 to avoid incomplete backtraces. So when this happens dump the rest of the stack with the old unwinder instead of silently not dumping it. There was already a optional "both" mode that dumped both, but that was too ugly. I also clarified the headers for the different backtraces a bit. Also add a clear error message for missing dwarf2 annotation that people can work on. And I removed a dead variable left over from Ingo's changes. Cc: mingo@elte.hu Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index eb39a2775236..f7a9d1421078 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -254,7 +254,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i = 11; unsigned used = 0; printk("\nCall Trace:\n"); @@ -275,11 +274,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (unwind_init_blocked(&info, tsk) == 0) unw_ret = show_trace_unwind(&info, NULL); } - if (unw_ret > 0) { - if (call_trace > 0) + if (unw_ret > 0 && !arch_unw_user_mode(&info)) { +#ifdef CONFIG_STACK_UNWIND + unsigned long rip = info.regs.rip; + print_symbol("DWARF2 unwinder stuck at %s\n", rip); + if (call_trace == 1) { + printk("Leftover inexact backtrace:\n"); + stack = (unsigned long *)info.regs.rsp; + } else if (call_trace > 1) return; - printk("Legacy call trace:"); - i = 18; + else + printk("Full inexact backtrace again:\n"); +#else + printk("Inexact backtrace:\n"); +#endif } } @@ -1118,8 +1126,10 @@ static int __init call_trace_setup(char *s) call_trace = -1; else if (strcmp(s, "both") == 0) call_trace = 0; - else if (strcmp(s, "new") == 0) + else if (strcmp(s, "newfallback") == 0) call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; return 1; } __setup("call_trace=", call_trace_setup); -- cgit v1.2.2 From 260f659b232b17889e3f0c9bf411675898b222c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:34 +0200 Subject: [PATCH] x86_64: Update defconfig Update defconfig Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/defconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 83d389b8ebd8..840d5d93d5cc 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.17-git22 -# Tue Jul 4 14:24:40 2006 +# Linux kernel version: 2.6.18-rc2 +# Tue Jul 18 17:13:20 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -37,6 +37,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set +# CONFIG_TASKSTATS is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y @@ -413,6 +414,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set @@ -1195,7 +1197,7 @@ CONFIG_USB_MON=y # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set -# CONFIG_USB_CY7C63 is not set +# CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set # CONFIG_USB_PHIDGETKIT is not set # CONFIG_USB_PHIDGETSERVO is not set @@ -1373,7 +1375,6 @@ CONFIG_SUNRPC=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set # CONFIG_CIFS is not set -# CONFIG_CIFS_DEBUG2 is not set # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -- cgit v1.2.2 From 0e5f61b00c577da698fb00cd9c91a96b79044dfd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:37 +0200 Subject: [PATCH] x86_64: On Intel systems when CPU has C3 don't use TSC On Intel systems generally the TSC stops in C3 or deeper, so don't use it there. Follows similar logic on i386. This should fix problems on Meroms. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index e0341c6808e5..7a9b18224182 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -28,6 +28,7 @@ #include #ifdef CONFIG_ACPI #include /* for PM timer frequency */ +#include #endif #include #include @@ -953,11 +954,18 @@ __cpuinit int unsynchronized_tsc(void) #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; - /* Intel systems are normally all synchronized. Exceptions - are handled in the check above. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; #endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + return 1; +#endif + return 0; + } + /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; } -- cgit v1.2.2 From 089bbbcb36979166131868a89ca5f4e695d6637d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Sat, 29 Jul 2006 21:42:40 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - fix off by one error Fixed off-by-one error in detect_calgary and calgary_init which will cause arrays to overflow. Also, removed impossible to hit BUG_ON. Signed-off-by: Jon Mason Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index e71ed53b08fb..92744abff133 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -812,7 +812,7 @@ static int __init calgary_init(void) int i, ret = -ENODEV; struct pci_dev *dev = NULL; - for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); @@ -890,9 +890,8 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0, table_idx = 0; - bus <= num_online_nodes() * MAX_PHB_BUS_NUM; + bus < num_online_nodes() * MAX_PHB_BUS_NUM; bus++) { - BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM); if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; if (test_bit(bus, translation_disabled)) { @@ -1002,7 +1001,7 @@ static int __init calgary_parse_options(char *p) if (p == endp) break; - if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) { + if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); set_bit(bridge, translation_disabled); -- cgit v1.2.2 From d2105b10fe0f460c388fe4e09226313f519d8c00 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Sat, 29 Jul 2006 21:42:43 +0200 Subject: [PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix Calgary hits a NULL pointer dereference when booting in a multi-chassis NUMA system. See Redhat bugzilla number 198498, found by Konrad Rzeszutek (konradr@redhat.com). There are many issues that had to be resolved to fix this problem. Firstly when I originally wrote the code to handle NUMA systems, I had a large misunderstanding that was not corrected until now. That was that I thought the "number of nodes online" referred to number of physical systems connected. So that if NUMA was disabled, there would only be 1 node and it would only show that node's PCI bus. In reality if NUMA is disabled, the system displays all of the connected chassis as one node but is only ignorant of the delays in accessing main memory. Therefore, references to num_online_nodes() and MAX_NUMNODES are incorrect and need to be set to the maximum number of nodes that can be accessed (which are 8). I created a variable, MAX_NUM_CHASSIS, and set it to 8 to fix this. Secondly, when walking the PCI in detect_calgary, the code only checked the first "slot" when looking to see if a device is present. This will work for most cases, but unfortunately it isn't always the case. In the NUMA MXE drawers, there are USB devices present on the 3rd slot (with slot 1 being empty). So, to work around this, all slots (up to 8) are scanned to see if there are any devices present. Lastly, the bus is being enumerated on large systems in a different way the we originally thought. This throws the ugly logic we had out the window. To more elegantly handle this, I reorganized the kva array to be sparse (which removed the need to have any bus number to kva slot logic in tce.c) and created a secondary space array to contain the bus number to phb mapping. With these changes Calgary boots on an x460 with 4 nodes with and without NUMA enabled. Signed-off-by: Jon Mason Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-calgary.c | 76 +++++++++++++++++++++++----------------- arch/x86_64/kernel/tce.c | 4 +-- 2 files changed, 45 insertions(+), 35 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 92744abff133..146924ba5df5 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -85,7 +85,8 @@ #define CSR_AGENT_MASK 0xffe0ffff #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ +#define MAX_NUM_CHASSIS 8 /* max number of chassis */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ #define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ @@ -110,7 +111,8 @@ static const unsigned long phb_offsets[] = { 0xB000 /* PHB3 */ }; -void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; +static char bus_to_phb[MAX_PHB_BUS_NUM]; +void* tce_table_kva[MAX_PHB_BUS_NUM]; unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; @@ -119,7 +121,7 @@ static int calgary_detected __read_mostly = 0; * the bitmap of PHBs the user requested that we disable * translation on. */ -static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); +static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); static void tce_cache_blast(struct iommu_table *tbl); @@ -452,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = { static inline int busno_to_phbid(unsigned char num) { - return bus_to_phb(num) % PHBS_PER_CALGARY; + return bus_to_phb[num]; } static inline unsigned long split_queue_offset(unsigned char num) @@ -812,7 +814,7 @@ static int __init calgary_init(void) int i, ret = -ENODEV; struct pci_dev *dev = NULL; - for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { + for (i = 0; i < MAX_PHB_BUS_NUM; i++) { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); @@ -822,7 +824,7 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!tce_table_kva[i] && !translate_empty_slots) { + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { pci_dev_put(dev); continue; } @@ -842,7 +844,7 @@ error: pci_dev_put(dev); continue; } - if (!tce_table_kva[i] && !translate_empty_slots) + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) continue; calgary_disable_translation(dev); calgary_free_tar(dev); @@ -876,9 +878,10 @@ static inline int __init determine_tce_table_size(u64 ram) void __init detect_calgary(void) { u32 val; - int bus, table_idx; + int bus; void *tbl; - int detected = 0; + int calgary_found = 0; + int phb = -1; /* * if the user specified iommu=off or iommu=soft or we found @@ -889,37 +892,46 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); - for (bus = 0, table_idx = 0; - bus < num_online_nodes() * MAX_PHB_BUS_NUM; - bus++) { + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + int dev; + + tce_table_kva[bus] = NULL; + bus_to_phb[bus] = -1; + if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; + + /* + * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3) + * it is connected to releative to the clagary chip. + */ + phb = (phb + 1) % PHBS_PER_CALGARY; + if (test_bit(bus, translation_disabled)) { printk(KERN_INFO "Calgary: translation is disabled for " "PHB 0x%x\n", bus); /* skip this phb, don't allocate a tbl for it */ - tce_table_kva[table_idx] = NULL; - table_idx++; continue; } /* - * scan the first slot of the PCI bus to see if there - * are any devices present + * Scan the slots of the PCI bus to see if there is a device present. + * The parent bus will be the zero-ith device, so start at 1. */ - val = read_pci_config(bus, 1, 0, 0); - if (val != 0xffffffff || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - detected = 1; - } else - tbl = NULL; - - tce_table_kva[table_idx] = tbl; - table_idx++; + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff || translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + tce_table_kva[bus] = tbl; + bus_to_phb[bus] = phb; + calgary_found = 1; + break; + } + } } - if (detected) { + if (calgary_found) { iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " @@ -928,9 +940,9 @@ void __init detect_calgary(void) return; cleanup: - for (--table_idx; table_idx >= 0; --table_idx) - if (tce_table_kva[table_idx]) - free_tce_table(tce_table_kva[table_idx]); + for (--bus; bus >= 0; --bus) + if (tce_table_kva[bus]) + free_tce_table(tce_table_kva[bus]); } int __init calgary_iommu_init(void) @@ -1001,7 +1013,7 @@ static int __init calgary_parse_options(char *p) if (p == endp) break; - if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { + if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); set_bit(bridge, translation_disabled); diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index d3a9e79e954c..5530dda3f27a 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -96,7 +96,6 @@ static inline unsigned int table_size_to_number_of_entries(unsigned char size) static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) { unsigned int bitmapsz; - unsigned int tce_table_index; unsigned long bmppages; int ret; @@ -105,8 +104,7 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) /* set the tce table size - measured in entries */ tbl->it_size = table_size_to_number_of_entries(specified_table_size); - tce_table_index = bus_to_phb(tbl->it_busno); - tbl->it_base = (unsigned long)tce_table_kva[tce_table_index]; + tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; if (!tbl->it_base) { printk(KERN_ERR "Calgary: iommu_table_setparms: " "no table allocated?!\n"); -- cgit v1.2.2 From 355540f3338e1408dac98413f05d612a76d4f5e3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:46 +0200 Subject: [PATCH] x86_64: Revert k8-bus.c northbridge access change As Travis Betak points out it accesses the wrong northbridge subfunction now. Switch back to the old code. Cc: "Travis Betak" Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/pci/k8-bus.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index b50a7c7c47f8..3acf60ded2a0 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -2,7 +2,6 @@ #include #include #include -#include /* * This discovers the pcibus <-> node mapping on AMD K8. @@ -19,6 +18,7 @@ #define NR_LDT_BUS_NUMBER_REGISTERS 3 #define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF) #define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) +#define PCI_DEVICE_ID_K8HTCONFIG 0x1100 /** * fill_mp_bus_to_cpumask() @@ -28,7 +28,8 @@ __init static int fill_mp_bus_to_cpumask(void) { - int i, j, k; + struct pci_dev *nb_dev = NULL; + int i, j; u32 ldtbus, nid; static int lbnr[3] = { LDT_BUS_NUMBER_REGISTER_0, @@ -36,9 +37,8 @@ fill_mp_bus_to_cpumask(void) LDT_BUS_NUMBER_REGISTER_2 }; - cache_k8_northbridges(); - for (k = 0; k < num_k8_northbridges; k++) { - struct pci_dev *nb_dev = k8_northbridges[k]; + while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) { pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid); for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { -- cgit v1.2.2 From 65f87d8a8a6e1b560c61951d0a68ed80f7c8ff19 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:49 +0200 Subject: [PATCH] x86_64: Fix swiotlb=force It was broken before. But having it is important as possible hardware bug workaround. And previously there was no way to force swiotlb if there is another IOMMU. Side effect is that iommu=force won't force swiotlb anymore even if there isn't another IOMMU. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-swiotlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index ebdb77fe2057..6a55f87ba97f 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -31,9 +31,10 @@ struct dma_mapping_ops swiotlb_dma_ops = { void pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_detected && !no_iommu && - (end_pfn > MAX_DMA32_PFN || force_iommu)) + if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) swiotlb = 1; + if (swiotlb_force) + swiotlb = 1; if (swiotlb) { printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_init(); -- cgit v1.2.2 From 2a8a3d5b65e86ec1dfef7d268c64a909eab94af7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 30 Jul 2006 03:03:20 -0700 Subject: [PATCH] machine_kexec.c: Fix the description of segment handling One of my original comments in machine_kexec was unclear and this should fix it. Signed-off-by: Eric W. Biederman Cc: Andi Kleen Acked-by: Horms Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/machine_kexec.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 83fb24a02821..106076b370fc 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -207,14 +207,11 @@ NORET_TYPE void machine_kexec(struct kimage *image) __flush_tlb(); - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again unless you set the segment to a different selector. - * - * The more common model are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. * * I take advantage of this here by force loading the * segments, before I zap the gdt with an invalid value. -- cgit v1.2.2 From cea6a4ba8acfba6f59cc9ed71e0d05cb770b9d9c Mon Sep 17 00:00:00 2001 From: Horms Date: Sun, 30 Jul 2006 03:03:34 -0700 Subject: [PATCH] panic_on_oops: remove ssleep() This patch is part of an effort to unify the panic_on_oops behaviour across all architectures that implement it. It was pointed out to me by Andi Kleen that if an oops has occured in interrupt context, then calling sleep() in the oops path will only cause a panic, and that it would be really better for it not to be in the path at all. This patch removes the ssleep() call and reworks the console message accordinly. I have a slght concern that the resulting console message is too long, feedback welcome. For powerpc it also unifies the 32bit and 64bit behaviour. Fror x86_64, this patch only updates the console message, as ssleep() is already not present. Signed-off-by: Horms Acked-by: Paul Mackerras Cc: Russell King Cc: "Luck, Tony" Cc: Andi Kleen Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index f7a9d1421078..4e9938dee060 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -529,7 +529,7 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) - panic("Oops"); + panic("Fatal exception: panic_on_oops"); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) -- cgit v1.2.2 From be6b5a3505fa0cd54c3b5959a39293f47c648980 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:37 -0700 Subject: [PATCH] cpu hotplug: use hotplug version of registration in late inits Use hotplug version of register_cpu_notifier in late init functions. Signed-off-by: Chandra Seetharaman Cc: "Luck, Tony" Cc: Martin Schwidefsky Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 12 +++++------- arch/x86_64/kernel/mce_amd.c | 19 +++++++------------ 2 files changed, 12 insertions(+), 19 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 88845674c661..4e017fb30fb3 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -615,7 +615,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static __cpuinit void mce_remove_device(unsigned int cpu) +static void mce_remove_device(unsigned int cpu) { int i; @@ -626,10 +626,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); sysdev_unregister(&per_cpu(device_mce,cpu)); } -#endif /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -638,18 +637,17 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: mce_create_device(cpu); break; -#ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: mce_remove_device(cpu); break; -#endif } return NOTIFY_OK; } -static struct notifier_block __cpuinitdata mce_cpu_notifier = { +static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; +#endif static __init int mce_init_device(void) { @@ -664,7 +662,7 @@ static __init int mce_init_device(void) mce_create_device(i); } - register_cpu_notifier(&mce_cpu_notifier); + register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index db2acbf7ad28..883fe747f64c 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -558,7 +558,7 @@ out: * of shared sysfs dir/files, and rest of the cores will be symlinked to it. */ -static __cpuinit void deallocate_threshold_block(unsigned int cpu, +static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { struct threshold_block *pos = NULL; @@ -578,7 +578,7 @@ static __cpuinit void deallocate_threshold_block(unsigned int cpu, per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; } -static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) +static void threshold_remove_bank(unsigned int cpu, int bank) { int i = 0; struct threshold_bank *b; @@ -618,7 +618,7 @@ free_out: per_cpu(threshold_banks, cpu)[bank] = NULL; } -static __cpuinit void threshold_remove_device(unsigned int cpu) +static void threshold_remove_device(unsigned int cpu) { unsigned int bank; @@ -629,14 +629,8 @@ static __cpuinit void threshold_remove_device(unsigned int cpu) } } -#else /* !CONFIG_HOTPLUG_CPU */ -static void threshold_remove_device(unsigned int cpu) -{ -} -#endif - /* get notified when a cpu comes on/off */ -static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, +static int threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ @@ -659,9 +653,10 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier __cpuinitdata = { +static struct notifier_block threshold_cpu_notifier = { .notifier_call = threshold_cpu_callback, }; +#endif /* CONFIG_HOTPLUG_CPU */ static __init int threshold_init_device(void) { @@ -673,7 +668,7 @@ static __init int threshold_init_device(void) if (err) return err; } - register_cpu_notifier(&threshold_cpu_notifier); + register_hotcpu_notifier(&threshold_cpu_notifier); return 0; } -- cgit v1.2.2 From 0b0bf7a3ccb6f0b38ead71980e79f875046047b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Jul 2006 03:04:06 -0700 Subject: [PATCH] vDSO hash-style fix The latest toolchains can produce a new ELF section in DSOs and dynamically-linked executables. The new section ".gnu.hash" replaces ".hash", and allows for more efficient runtime symbol lookups by the dynamic linker. The new ld option --hash-style={sysv|gnu|both} controls whether to produce the old ".hash", the new ".gnu.hash", or both. In some new systems such as Fedora Core 6, gcc by default passes --hash-style=gnu to the linker, so that a standard invocation of "gcc -shared" results in producing a DSO with only ".gnu.hash". The new ".gnu.hash" sections need to be dealt with the same way as ".hash" sections in all respects; only the dynamic linker cares about their contents. To work with older dynamic linkers (i.e. preexisting releases of glibc), a binary must have the old ".hash" section. The --hash-style=both option produces binaries that a new dynamic linker can use more efficiently, but an old dynamic linker can still handle. The new section runs afoul of the custom linker scripts used to build vDSO images for the kernel. On ia64, the failure mode for this is a boot-time panic because the vDSO's PT_IA_64_UNWIND segment winds up ill-formed. This patch addresses the problem in two ways. First, it mentions ".gnu.hash" in all the linker scripts alongside ".hash". This produces correct vDSO images with --hash-style=sysv (or old tools), with --hash-style=gnu, or with --hash-style=both. Second, it passes the --hash-style=sysv option when building the vDSO images, so that ".gnu.hash" is not actually produced. This is the most conservative choice for compatibility with any old userland. There is some concern that some ancient glibc builds (though not any known old production system) might choke on --hash-style=both binaries. The optimizations provided by the new style of hash section do not really matter for a DSO with a tiny number of symbols, as the vDSO has. If someone wants to use =gnu or =both for their vDSO builds and worry less about that compatibility, just change the option and the linker script changes will make any choice work fine. Signed-off-by: Roland McGrath Cc: "Luck, Tony" Cc: Kyle McMartin Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Jeff Dike Cc: Andi Kleen Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/ia32/Makefile | 1 + arch/x86_64/ia32/vsyscall.lds | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index 62bc5f56da9e..cdae36435e21 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -23,6 +23,7 @@ targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so) # The DSO images are built using a special linker script quiet_cmd_syscall = SYSCALL $@ cmd_syscall = $(CC) -m32 -nostdlib -shared -s \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) \ -Wl,-soname=linux-gate.so.1 -o $@ \ -Wl,-T,$(filter-out FORCE,$^) diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds index f2e75ed4c6c7..1dc86ff5bcb9 100644 --- a/arch/x86_64/ia32/vsyscall.lds +++ b/arch/x86_64/ia32/vsyscall.lds @@ -11,6 +11,7 @@ SECTIONS . = VSYSCALL_BASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } -- cgit v1.2.2 From 2699500b31f41fc25656c42548c8a388c8a329fe Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 Aug 2006 22:37:28 +0200 Subject: [PATCH] x86_64: Fix backtracing for interrupt stacks Re-add backlink for old style unwinder to stack switching. Add proper stack frame and CFI annotations to call_softirq This prevents a oops when backtracing with fallback through the interrupt stack top. Suggested by Jan Beulich and Herbert Xu wanted it in 2.6.18. Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index d464dded68c0..6f810424df44 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -513,6 +513,7 @@ END(stub_rt_sigreturn) swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count cmoveq %gs:pda_irqstackptr,%rsp + push %rbp # backlink for old unwinder /* * We entered an interrupt context - irqs are off: */ @@ -1139,18 +1140,21 @@ ENTRY(machine_check) END(machine_check) #endif +/* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC - movq %gs:pda_irqstackptr,%rax - movq %rsp,%rdx - CFI_DEF_CFA_REGISTER rdx + push %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp incl %gs:pda_irqcount - cmove %rax,%rsp - pushq %rdx - /*todo CFI_DEF_CFA_EXPRESSION ...*/ + cmove %gs:pda_irqstackptr,%rsp + push %rbp # backlink for old unwinder call __do_softirq - popq %rsp + leaveq CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 decl %gs:pda_irqcount ret CFI_ENDPROC -- cgit v1.2.2 From a166222cde740b34d97fe49dca70348197f4534e Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Wed, 2 Aug 2006 22:37:31 +0200 Subject: [PATCH] x86_64: Fix CONFIG_IOMMU_DEBUG If CONFIG_IOMMU_DEBUG is set force_iommu defaults to 1. In the case where no HW IOMMU is present in the machine and we end up using nommu, leaving force_iommu set to 1 causes dma_alloc_coherent to do the wrong thing. Therefore, if we end up using nommu, make sure force_iommu is 0. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-nommu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index c4c3cc36ac5b..aad7609d8e92 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -92,5 +92,7 @@ void __init no_iommu_init(void) { if (dma_ops) return; + + force_iommu = 0; /* no HW IOMMU */ dma_ops = &nommu_dma_ops; } -- cgit v1.2.2 From 825e037fb8912f38e9d9ddd3ec79fa7c584db708 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 5 Aug 2006 12:14:34 -0700 Subject: [PATCH] Fix more per-cpu typos Signed-off-by: Alexey Dobriyan Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 5a1c0a3bf872..06af6ca60129 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -203,7 +203,7 @@ int __cpuinit init_smp_flush(void) { int i; for_each_cpu_mask(i, cpu_possible_map) { - spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i)); + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); } return 0; } -- cgit v1.2.2 From 012c437d03cb299814e58ac8d574f7510f5989a5 Mon Sep 17 00:00:00 2001 From: Horms Date: Sun, 13 Aug 2006 23:24:22 -0700 Subject: [PATCH] Change panic_on_oops message to "Fatal exception" Previously the message was "Fatal exception: panic_on_oops", as introduced in a recent patch whith removed a somewhat dangerous call to ssleep() in the panic_on_oops path. However, Paul Mackerras suggested that this was somewhat confusing, leadind people to believe that it was panic_on_oops that was the root cause of the fatal exception. On his suggestion, this patch changes the message to simply "Fatal exception". A suitable oops message should already have been displayed. Signed-off-by: Simon Horman Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- arch/x86_64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 4e9938dee060..14052f089814 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -529,7 +529,7 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) - panic("Fatal exception: panic_on_oops"); + panic("Fatal exception"); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) -- cgit v1.2.2