From 6d9c00c67b4768105e8ae3d213484095c744eea8 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 22 Aug 2012 20:30:43 +0000 Subject: powerpc: Fix null pointer deref in perf hardware breakpoints Currently if you are doing a global perf recording with hardware breakpoints (ie perf record -e mem:0xdeadbeef -a), you can oops with: Faulting instruction address: 0xc000000000738890 cpu 0xc: Vector: 300 (Data Access) at [c0000003f76af8d0] pc: c000000000738890: .hw_breakpoint_handler+0xa0/0x1e0 lr: c000000000738830: .hw_breakpoint_handler+0x40/0x1e0 sp: c0000003f76afb50 msr: 8000000000001032 dar: 6f0 dsisr: 42000000 current = 0xc0000003f765ac00 paca = 0xc00000000f262a00 softe: 0 irq_happened: 0x01 pid = 6810, comm = loop-read enter ? for help [c0000003f76afbe0] c00000000073cd04 .notifier_call_chain.isra.0+0x84/0xe0 [c0000003f76afc80] c00000000073cdbc .notify_die+0x3c/0x60 [c0000003f76afd20] c0000000000139f0 .do_dabr+0x40/0xf0 [c0000003f76afe30] c000000000005a9c handle_dabr_fault+0x14/0x48 --- Exception: 300 (Data Access) at 0000000010000480 SP (ff8679e0) is in userspace This is because we don't check to see if the break point is associated with task before we deference the task_struct pointer. This changes the update to use current. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index f3a82dde61db..956a4c496de9 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -253,7 +253,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) /* Do not emulate user-space instructions, instead single-step them */ if (user_mode(regs)) { - bp->ctx->task->thread.last_hit_ubp = bp; + current->thread.last_hit_ubp = bp; regs->msr |= MSR_SE; goto out; } -- cgit v1.2.2 From 572b411cb4f1b208bb8ea278752f956b3554371e Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 22 Aug 2012 16:10:18 +0000 Subject: powerpc/kgdb: Do not set kgdb_single_step on ppc The kgdb_single_step flag has the possibility to indefinitely hang the system on an SMP system. The x86 arch have the same problem, and that problem was fixed by commit 8097551d9ab9b9e3630(kgdb,x86: do not set kgdb_single_step on x86). This patch does the same behaviors as x86's patch. Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/kgdb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 782bd0a3c2f0..bbabc5abb1c4 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -410,7 +410,6 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code, #else linux_regs->msr |= MSR_SE; #endif - kgdb_single_step = 1; atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); } -- cgit v1.2.2 From 949616cf2d3095d1bb6b3d155c1cc963abd98b5c Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 22 Aug 2012 16:10:19 +0000 Subject: powerpc/kgdb: Bail out of KGDB when we've been triggered We need to skip a breakpoint exception when it occurs after a breakpoint has already been removed. Signed-off-by: Tiejun Chen Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/kgdb.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index bbabc5abb1c4..05adb69febf4 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -101,6 +101,21 @@ static int computeSignal(unsigned int tt) return SIGHUP; /* default for things we don't know about */ } +/** + * + * kgdb_skipexception - Bail out of KGDB when we've been triggered. + * @exception: Exception vector number + * @regs: Current &struct pt_regs. + * + * On some architectures we need to skip a breakpoint exception when + * it occurs after a breakpoint has been removed. + * + */ +int kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return kgdb_isremovedbreak(regs->nip); +} + static int kgdb_call_nmi_hook(struct pt_regs *regs) { kgdb_nmicallback(raw_smp_processor_id(), regs); -- cgit v1.2.2 From 5f630401f9e98bd062733b5bbef096dbf2158066 Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 22 Aug 2012 16:10:20 +0000 Subject: powerpc/kgdb: Restore current_thread_info properly For powerpc BooKE and e200, singlestep is handled on the critical/dbg exception stack. This causes current_thread_info() to fail for kgdb internal, so previously We work around this issue by copying the thread_info from the kernel stack before calling kgdb_handle_exception, and copying it back afterwards. But actually we don't do this properly. We should backup current_thread_info then restore that when exit. Signed-off-by: Tiejun Chen Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/kgdb.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 05adb69febf4..c470a40b29f5 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * This table contains the mapping between PowerPC hardware trap types, and @@ -153,6 +154,8 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) static int kgdb_singlestep(struct pt_regs *regs) { struct thread_info *thread_info, *exception_thread_info; + struct thread_info *backup_current_thread_info = \ + (struct thread_info *)kmalloc(sizeof(struct thread_info), GFP_KERNEL); if (user_mode(regs)) return 0; @@ -170,13 +173,17 @@ static int kgdb_singlestep(struct pt_regs *regs) thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); exception_thread_info = current_thread_info(); - if (thread_info != exception_thread_info) + if (thread_info != exception_thread_info) { + /* Save the original current_thread_info. */ + memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info); memcpy(exception_thread_info, thread_info, sizeof *thread_info); + } kgdb_handle_exception(0, SIGTRAP, 0, regs); if (thread_info != exception_thread_info) - memcpy(thread_info, exception_thread_info, sizeof *thread_info); + /* Restore current_thread_info lastly. */ + memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info); return 1; } -- cgit v1.2.2 From 4c374af5fdee4bc6b4f5ea96c1a0f0ad7d3566be Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sat, 18 Aug 2012 07:34:15 +0000 Subject: powerpc/dma-iommu: Fix IOMMU window check Checking for device mask to cover the whole IOMMU table is too strict. IOMMU allocators should handle mask constraint properly for each allocation. The patch enables to use old AirPort Extreme cards on PowerMacs with more than 1GB of memory; without the patch the driver init fails with: b43-pci-bridge 0001:01:01.0: Warning: IOMMU window too big for device mask b43-pci-bridge 0001:01:01.0: mask: 0x3fffffff, table end: 0x80000000 b43-phy0 ERROR: The machine/kernel does not support the required 30-bit DMA mask Signed-off-by: Aaro Koskinen Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/dma-iommu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 2d7bb8ced136..e4897523de41 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -83,11 +83,10 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 0; } - if ((tbl->it_offset + tbl->it_size) > (mask >> IOMMU_PAGE_SHIFT)) { - dev_info(dev, "Warning: IOMMU window too big for device mask\n"); - dev_info(dev, "mask: 0x%08llx, table end: 0x%08lx\n", - mask, (tbl->it_offset + tbl->it_size) << - IOMMU_PAGE_SHIFT); + if (tbl->it_offset > (mask >> IOMMU_PAGE_SHIFT)) { + dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); + dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", + mask, tbl->it_offset << IOMMU_PAGE_SHIFT); return 0; } else return 1; -- cgit v1.2.2 From 7256a5d2da56f2ea8ad49e8dbe9e2984f0899b42 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 13 Aug 2012 03:18:28 +0000 Subject: powerpc: Fix personality handling in ppc64_personality() Directly comparing current->personality against PER_LINUX32 doesn't work in cases when any of the personality flags stored in the top three bytes are used. Directly forcefully setting personality to PER_LINUX32 or PER_LINUX discards any flags stored in the top three bytes Use personality() macro to compare only PER_MASK bytes and make sure that we are setting only the bits that should be set, instead of overwriting the whole value. Signed-off-by: Jiri Kosina Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index f2496f2faecc..4e3cc47f26b9 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -107,11 +107,11 @@ long ppc64_personality(unsigned long personality) long ret; if (personality(current->personality) == PER_LINUX32 - && personality == PER_LINUX) - personality = PER_LINUX32; + && personality(personality) == PER_LINUX) + personality = (personality & ~PER_MASK) | PER_LINUX32; ret = sys_personality(personality); - if (ret == PER_LINUX32) - ret = PER_LINUX; + if (personality(ret) == PER_LINUX32) + ret = (ret & ~PER_MASK) | PER_LINUX; return ret; } #endif -- cgit v1.2.2 From dabe859ec6360a12e71f39bf695d174e19ff2688 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Jul 2012 13:56:11 +0000 Subject: powerpc: Give hypervisor decrementer interrupts their own handler At the moment the handler for hypervisor decrementer interrupts is the same as for decrementer interrupts, i.e. timer_interrupt(). This is bogus; if we ever do get a hypervisor decrementer interrupt it won't have anything to do with the next timer event. In fact the only time we get hypervisor decrementer interrupts is when one is left pending on exit from a KVM guest. When we get a hypervisor decrementer interrupt we don't need to do anything special to clear it, since they are edge-triggered on the transition of HDEC from 0 to -1. Thus this adds an empty handler function for them. We don't need to have them masked when interrupts are soft-disabled, so we use STD_EXCEPTION_HV instead of MASKABLE_EXCEPTION_HV. Signed-off-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 3 ++- arch/powerpc/kernel/time.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e894515e77bb..39aa97d3ff88 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -186,7 +186,7 @@ hardware_interrupt_hv: KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800) MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer) - MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer) + STD_EXCEPTION_HV(0x980, 0x982, hdecrementer) STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00) @@ -486,6 +486,7 @@ machine_check_common: STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ) STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, .timer_interrupt) + STD_EXCEPTION_COMMON(0x980, hdecrementer, .hdec_interrupt) STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception) STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception) STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index be171ee73bf8..e49e93191b69 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -535,6 +535,15 @@ void timer_interrupt(struct pt_regs * regs) trace_timer_interrupt_exit(regs); } +/* + * Hypervisor decrementer interrupts shouldn't occur but are sometimes + * left pending on exit from a KVM guest. We don't need to do anything + * to clear them, as they are edge-triggered. + */ +void hdec_interrupt(struct pt_regs *regs) +{ +} + #ifdef CONFIG_SUSPEND static void generic_suspend_disable_irqs(void) { -- cgit v1.2.2 From 375f561a4131a0f501c8845a2a20f2ca1abc8f7a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Jul 2012 18:51:09 +0000 Subject: powerpc/powernv: Always go into nap mode when CPU is offline The CPU hotplug code for the powernv platform currently only puts offline CPUs into nap mode if the powersave_nap variable is set. However, HV-style KVM on this platform requires secondary CPU threads to be offline and in nap mode. Since we know nap mode works just fine on all POWER7 machines, and the only machines that support the powernv platform are POWER7 machines, this changes the code to always put offline CPUs into nap mode, regardless of powersave_nap. Powersave_nap still controls whether or not CPUs go into nap mode when idle, as before. Signed-off-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/idle_power7.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index 7140d838339e..e11863f4e595 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -28,7 +28,9 @@ _GLOBAL(power7_idle) lwz r4,ADDROFF(powersave_nap)(r3) cmpwi 0,r4,0 beqlr + /* fall through */ +_GLOBAL(power7_nap) /* NAP is a state loss, we create a regs frame on the * stack, fill it up with the state we care about and * stick a pointer to it in PACAR1. We really only -- cgit v1.2.2 From 1b6ca2a6fe56e7697d57348646e07df08f43b1bb Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:47:56 +0000 Subject: powerpc: Update DSCR on all CPUs when writing sysfs dscr_default Writing to dscr_default in sysfs doesn't actually change the DSCR - we rely on a context switch on each CPU to do the work. There is no guarantee we will get a context switch in a reasonable amount of time so fire off an IPI to force an immediate change. This issue was found with the following test case: http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 3529446c2abd..d4cbbd1fa75f 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -194,6 +194,12 @@ static ssize_t show_dscr_default(struct device *dev, return sprintf(buf, "%lx\n", dscr_default); } +static void update_dscr(void *dummy) +{ + if (!current->thread.dscr_inherit) + mtspr(SPRN_DSCR, dscr_default); +} + static ssize_t __used store_dscr_default(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -206,6 +212,8 @@ static ssize_t __used store_dscr_default(struct device *dev, return -EINVAL; dscr_default = val; + on_each_cpu(update_dscr, NULL, 1); + return count; } -- cgit v1.2.2 From 00ca0de02f80924dfff6b4f630e1dff3db005e35 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:48:46 +0000 Subject: powerpc: Keep thread.dscr and thread.dscr_inherit in sync When we update the DSCR either via emulation of mtspr(DSCR) or via a change to dscr_default in sysfs we don't update thread.dscr. We will eventually update it at context switch time but there is a period where thread.dscr is incorrect. If we fork at this point we will copy the old value of thread.dscr into the child. To avoid this, always keep thread.dscr in sync with reality. This issue was found with the following testcase: http://ozlabs.org/~anton/junkcode/dscr_inherit_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/sysfs.c | 4 +++- arch/powerpc/kernel/traps.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index d4cbbd1fa75f..8302af649219 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -196,8 +196,10 @@ static ssize_t show_dscr_default(struct device *dev, static void update_dscr(void *dummy) { - if (!current->thread.dscr_inherit) + if (!current->thread.dscr_inherit) { + current->thread.dscr = dscr_default; mtspr(SPRN_DSCR, dscr_default); + } } static ssize_t __used store_dscr_default(struct device *dev, diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 158972341a2d..ae0843fa7a61 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -972,8 +972,9 @@ static int emulate_instruction(struct pt_regs *regs) cpu_has_feature(CPU_FTR_DSCR)) { PPC_WARN_EMULATED(mtdscr, regs); rd = (instword >> 21) & 0x1f; - mtspr(SPRN_DSCR, regs->gpr[rd]); + current->thread.dscr = regs->gpr[rd]; current->thread.dscr_inherit = 1; + mtspr(SPRN_DSCR, current->thread.dscr); return 0; } #endif -- cgit v1.2.2 From 1021cb268b3025573c4811f1dee4a11260c4507b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:49:47 +0000 Subject: powerpc: Fix DSCR inheritance in copy_thread() If the default DSCR is non zero we set thread.dscr_inherit in copy_thread() meaning the new thread and all its children will ignore future updates to the default DSCR. This is not intended and is a change in behaviour that a number of our users have hit. We just need to inherit thread.dscr and thread.dscr_inherit from the parent which ends up being much simpler. This was found with the following test case: http://ozlabs.org/~anton/junkcode/dscr_default_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/process.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 710f400476de..1a1f2ddfb581 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -802,16 +802,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, #endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_PPC64 if (cpu_has_feature(CPU_FTR_DSCR)) { - if (current->thread.dscr_inherit) { - p->thread.dscr_inherit = 1; - p->thread.dscr = current->thread.dscr; - } else if (0 != dscr_default) { - p->thread.dscr_inherit = 1; - p->thread.dscr = dscr_default; - } else { - p->thread.dscr_inherit = 0; - p->thread.dscr = 0; - } + p->thread.dscr_inherit = current->thread.dscr_inherit; + p->thread.dscr = current->thread.dscr; } #endif -- cgit v1.2.2 From 714332858bfd40dcf8f741498336d93875c23aa7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 3 Sep 2012 16:51:10 +0000 Subject: powerpc: Restore correct DSCR in context switch During a context switch we always restore the per thread DSCR value. If we aren't doing explicit DSCR management (ie thread.dscr_inherit == 0) and the default DSCR changed while the process has been sleeping we end up with the wrong value. Check thread.dscr_inherit and select the default DSCR or per thread DSCR as required. This was found with the following test case, when running with more threads than CPUs (ie forcing context switching): http://ozlabs.org/~anton/junkcode/dscr_default_test.c With the four patches applied I can run a combination of all test cases successfully at the same time: http://ozlabs.org/~anton/junkcode/dscr_default_test.c http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c http://ozlabs.org/~anton/junkcode/dscr_inherit_test.c Signed-off-by: Anton Blanchard Cc: # 3.0+ Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_64.S | 23 +++++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 85b05c463fae..e8995727b1c1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -76,6 +76,7 @@ int main(void) DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr)); + DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit)); #else DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 4b01a25e29ef..b40e0b4815b3 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -370,6 +370,12 @@ _GLOBAL(ret_from_fork) li r3,0 b syscall_exit + .section ".toc","aw" +DSCR_DEFAULT: + .tc dscr_default[TC],dscr_default + + .section ".text" + /* * This routine switches between two different tasks. The process * state of one is saved on its kernel stack. Then the state @@ -509,9 +515,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) - ld r6,_CCR(r1) - mtcrf 0xFF,r6 - #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION ld r0,THREAD_VRSAVE(r4) @@ -520,14 +523,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_PPC64 BEGIN_FTR_SECTION + lwz r6,THREAD_DSCR_INHERIT(r4) + ld r7,DSCR_DEFAULT@toc(2) ld r0,THREAD_DSCR(r4) - cmpd r0,r25 - beq 1f + cmpwi r6,0 + bne 1f + ld r0,0(r7) +1: cmpd r0,r25 + beq 2f mtspr SPRN_DSCR,r0 -1: +2: END_FTR_SECTION_IFSET(CPU_FTR_DSCR) #endif + ld r6,_CCR(r1) + mtcrf 0xFF,r6 + /* r3-r13 are destroyed -- Cort */ REST_8GPRS(14, r1) REST_10GPRS(22, r1) -- cgit v1.2.2 From 9fb1b36ca1234e64a5d1cc573175303395e3354d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 4 Sep 2012 18:33:08 +0000 Subject: powerpc: Make sure IPI handlers see data written by IPI senders We have been observing hangs, both of KVM guest vcpu tasks and more generally, where a process that is woken doesn't properly wake up and continue to run, but instead sticks in TASK_WAKING state. This happens because the update of rq->wake_list in ttwu_queue_remote() is not ordered with the update of ipi_message in smp_muxed_ipi_message_pass(), and the reading of rq->wake_list in scheduler_ipi() is not ordered with the reading of ipi_message in smp_ipi_demux(). Thus it is possible for the IPI receiver not to see the updated rq->wake_list and therefore conclude that there is nothing for it to do. In order to make sure that anything done before smp_send_reschedule() is ordered before anything done in the resulting call to scheduler_ipi(), this adds barriers in smp_muxed_message_pass() and smp_ipi_demux(). The barrier in smp_muxed_message_pass() is a full barrier to ensure that there is a full ordering between the smp_send_reschedule() caller and scheduler_ipi(). In smp_ipi_demux(), we use xchg() rather than xchg_local() because xchg() includes release and acquire barriers. Using xchg() rather than xchg_local() makes sense given that ipi_message is not just accessed locally. This moves the barrier between setting the message and calling the cause_ipi() function into the individual cause_ipi implementations. Most of them -- those that used outb, out_8 or similar -- already had a full barrier because out_8 etc. include a sync before the MMIO store. This adds an explicit barrier in the two remaining cases. These changes made no measurable difference to the speed of IPIs as measured using a simple ping-pong latency test across two CPUs on different cores of a POWER7 machine. The analysis of the reason why processes were not waking up properly is due to Milton Miller. Cc: stable@vger.kernel.org # v3.0+ Reported-by: Milton Miller Signed-off-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/dbell.c | 2 ++ arch/powerpc/kernel/smp.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 5b25c8060fd6..a892680668d8 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -28,6 +28,8 @@ void doorbell_setup_this_cpu(void) void doorbell_cause_ipi(int cpu, unsigned long data) { + /* Order previous accesses vs. msgsnd, which is treated as a store */ + mb(); ppc_msgsnd(PPC_DBELL, 0, data); } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 0321007086f7..8d4214afc21d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -198,8 +198,15 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) struct cpu_messages *info = &per_cpu(ipi_message, cpu); char *message = (char *)&info->messages; + /* + * Order previous accesses before accesses in the IPI handler. + */ + smp_mb(); message[msg] = 1; - mb(); + /* + * cause_ipi functions are required to include a full barrier + * before doing whatever causes the IPI. + */ smp_ops->cause_ipi(cpu, info->data); } @@ -211,7 +218,7 @@ irqreturn_t smp_ipi_demux(void) mb(); /* order any irq clear */ do { - all = xchg_local(&info->messages, 0); + all = xchg(&info->messages, 0); #ifdef __BIG_ENDIAN if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION))) -- cgit v1.2.2