aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHeiko Carstens <heiko.carstens@de.ibm.com>2011-05-23 04:24:34 -0400
committerMartin Schwidefsky <schwidefsky@de.ibm.com>2011-05-23 04:24:29 -0400
commitf2db2e6cb3f5f766cbb3788af44705685ff2445a (patch)
tree11fbf5522f332e13f9bfb6cf4552513e4d865003
parentb456d94a9757db54eca4677c1b3a13e7170c9bb3 (diff)
[S390] pfault: cpu hotplug vs missing completion interrupts
On cpu hot remove a PFAULT CANCEL command is sent to the hypervisor which in turn will cancel all outstanding pfault requests that have been issued on that cpu (the same happens with a SIGP cpu reset). The result is that we end up with uninterruptible processes where the interrupt that would wake up these processes never arrives. In order to solve this all processes which wait for a pfault completion interrupt get woken up after a cpu hot remove. The worst case that could happen is that they fault again and in turn need to wait again. Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
-rw-r--r--arch/s390/include/asm/lowcore.h4
-rw-r--r--arch/s390/include/asm/processor.h1
-rw-r--r--arch/s390/kernel/asm-offsets.c1
-rw-r--r--arch/s390/kernel/entry.S1
-rw-r--r--arch/s390/kernel/entry64.S1
-rw-r--r--arch/s390/mm/fault.c89
6 files changed, 71 insertions, 26 deletions
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index b8624d53c37..228cf0b295d 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -124,7 +124,7 @@ struct _lowcore {
124 /* Address space pointer. */ 124 /* Address space pointer. */
125 __u32 kernel_asce; /* 0x02ac */ 125 __u32 kernel_asce; /* 0x02ac */
126 __u32 user_asce; /* 0x02b0 */ 126 __u32 user_asce; /* 0x02b0 */
127 __u8 pad_0x02b4[0x02b8-0x02b4]; /* 0x02b4 */ 127 __u32 current_pid; /* 0x02b4 */
128 128
129 /* SMP info area */ 129 /* SMP info area */
130 __u32 cpu_nr; /* 0x02b8 */ 130 __u32 cpu_nr; /* 0x02b8 */
@@ -255,7 +255,7 @@ struct _lowcore {
255 /* Address space pointer. */ 255 /* Address space pointer. */
256 __u64 kernel_asce; /* 0x0310 */ 256 __u64 kernel_asce; /* 0x0310 */
257 __u64 user_asce; /* 0x0318 */ 257 __u64 user_asce; /* 0x0318 */
258 __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ 258 __u64 current_pid; /* 0x0320 */
259 259
260 /* SMP info area */ 260 /* SMP info area */
261 __u32 cpu_nr; /* 0x0328 */ 261 __u32 cpu_nr; /* 0x0328 */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 2c79b641627..1300c302533 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -84,6 +84,7 @@ struct thread_struct {
84 struct per_event per_event; /* Cause of the last PER trap */ 84 struct per_event per_event; /* Cause of the last PER trap */
85 /* pfault_wait is used to block the process on a pfault event */ 85 /* pfault_wait is used to block the process on a pfault event */
86 unsigned long pfault_wait; 86 unsigned long pfault_wait;
87 struct list_head list;
87}; 88};
88 89
89typedef struct thread_struct thread_struct; 90typedef struct thread_struct thread_struct;
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index ef455561101..edfbd17d708 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -124,6 +124,7 @@ int main(void)
124 DEFINE(__LC_LAST_UPDATE_TIMER, offsetof(struct _lowcore, last_update_timer)); 124 DEFINE(__LC_LAST_UPDATE_TIMER, offsetof(struct _lowcore, last_update_timer));
125 DEFINE(__LC_LAST_UPDATE_CLOCK, offsetof(struct _lowcore, last_update_clock)); 125 DEFINE(__LC_LAST_UPDATE_CLOCK, offsetof(struct _lowcore, last_update_clock));
126 DEFINE(__LC_CURRENT, offsetof(struct _lowcore, current_task)); 126 DEFINE(__LC_CURRENT, offsetof(struct _lowcore, current_task));
127 DEFINE(__LC_CURRENT_PID, offsetof(struct _lowcore, current_pid));
127 DEFINE(__LC_THREAD_INFO, offsetof(struct _lowcore, thread_info)); 128 DEFINE(__LC_THREAD_INFO, offsetof(struct _lowcore, thread_info));
128 DEFINE(__LC_KERNEL_STACK, offsetof(struct _lowcore, kernel_stack)); 129 DEFINE(__LC_KERNEL_STACK, offsetof(struct _lowcore, kernel_stack));
129 DEFINE(__LC_ASYNC_STACK, offsetof(struct _lowcore, async_stack)); 130 DEFINE(__LC_ASYNC_STACK, offsetof(struct _lowcore, async_stack));
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 1b67fc6ebdc..0476174dfff 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -212,6 +212,7 @@ __switch_to:
212 lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 212 lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4
213 lm %r6,%r15,__SF_GPRS(%r15) # load gprs of next task 213 lm %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
214 st %r3,__LC_CURRENT # store task struct of next 214 st %r3,__LC_CURRENT # store task struct of next
215 mvc __LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next
215 st %r5,__LC_THREAD_INFO # store thread info of next 216 st %r5,__LC_THREAD_INFO # store thread info of next
216 ahi %r5,STACK_SIZE # end of kernel stack of next 217 ahi %r5,STACK_SIZE # end of kernel stack of next
217 st %r5,__LC_KERNEL_STACK # store end of kernel stack 218 st %r5,__LC_KERNEL_STACK # store end of kernel stack
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 9fd86456349..d61967e2eab 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -220,6 +220,7 @@ __switch_to:
220 lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 220 lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4
221 lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task 221 lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
222 stg %r3,__LC_CURRENT # store task struct of next 222 stg %r3,__LC_CURRENT # store task struct of next
223 mvc __LC_CURRENT_PID+4(4,%r0),__TASK_pid(%r3) # store pid of next
223 stg %r5,__LC_THREAD_INFO # store thread info of next 224 stg %r5,__LC_THREAD_INFO # store thread info of next
224 aghi %r5,STACK_SIZE # end of kernel stack of next 225 aghi %r5,STACK_SIZE # end of kernel stack of next
225 stg %r5,__LC_KERNEL_STACK # store end of kernel stack 226 stg %r5,__LC_KERNEL_STACK # store end of kernel stack
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 177745c520c..1ca65647832 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -466,7 +466,7 @@ typedef struct {
466int pfault_init(void) 466int pfault_init(void)
467{ 467{
468 pfault_refbk_t refbk = 468 pfault_refbk_t refbk =
469 { 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48, 469 { 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48,
470 __PF_RES_FIELD }; 470 __PF_RES_FIELD };
471 int rc; 471 int rc;
472 472
@@ -498,11 +498,15 @@ void pfault_fini(void)
498 : : "a" (&refbk), "m" (refbk) : "cc"); 498 : : "a" (&refbk), "m" (refbk) : "cc");
499} 499}
500 500
501static DEFINE_SPINLOCK(pfault_lock);
502static LIST_HEAD(pfault_list);
503
501static void pfault_interrupt(unsigned int ext_int_code, 504static void pfault_interrupt(unsigned int ext_int_code,
502 unsigned int param32, unsigned long param64) 505 unsigned int param32, unsigned long param64)
503{ 506{
504 struct task_struct *tsk; 507 struct task_struct *tsk;
505 __u16 subcode; 508 __u16 subcode;
509 pid_t pid;
506 510
507 /* 511 /*
508 * Get the external interruption subcode & pfault 512 * Get the external interruption subcode & pfault
@@ -514,44 +518,79 @@ static void pfault_interrupt(unsigned int ext_int_code,
514 if ((subcode & 0xff00) != __SUBCODE_MASK) 518 if ((subcode & 0xff00) != __SUBCODE_MASK)
515 return; 519 return;
516 kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++; 520 kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++;
517 521 if (subcode & 0x0080) {
518 /* 522 /* Get the token (= pid of the affected task). */
519 * Get the token (= address of the task structure of the affected task). 523 pid = sizeof(void *) == 4 ? param32 : param64;
520 */ 524 rcu_read_lock();
521#ifdef CONFIG_64BIT 525 tsk = find_task_by_pid_ns(pid, &init_pid_ns);
522 tsk = (struct task_struct *) param64; 526 if (tsk)
523#else 527 get_task_struct(tsk);
524 tsk = (struct task_struct *) param32; 528 rcu_read_unlock();
525#endif 529 if (!tsk)
526 530 return;
531 } else {
532 tsk = current;
533 }
534 spin_lock(&pfault_lock);
527 if (subcode & 0x0080) { 535 if (subcode & 0x0080) {
528 /* signal bit is set -> a page has been swapped in by VM */ 536 /* signal bit is set -> a page has been swapped in by VM */
529 if (xchg(&tsk->thread.pfault_wait, -1) != 0) { 537 if (tsk->thread.pfault_wait == 1) {
530 /* Initial interrupt was faster than the completion 538 /* Initial interrupt was faster than the completion
531 * interrupt. pfault_wait is valid. Set pfault_wait 539 * interrupt. pfault_wait is valid. Set pfault_wait
532 * back to zero and wake up the process. This can 540 * back to zero and wake up the process. This can
533 * safely be done because the task is still sleeping 541 * safely be done because the task is still sleeping
534 * and can't produce new pfaults. */ 542 * and can't produce new pfaults. */
535 tsk->thread.pfault_wait = 0; 543 tsk->thread.pfault_wait = 0;
544 list_del(&tsk->thread.list);
536 wake_up_process(tsk); 545 wake_up_process(tsk);
537 put_task_struct(tsk); 546 } else {
547 /* Completion interrupt was faster than initial
548 * interrupt. Set pfault_wait to -1 so the initial
549 * interrupt doesn't put the task to sleep. */
550 tsk->thread.pfault_wait = -1;
538 } 551 }
552 put_task_struct(tsk);
539 } else { 553 } else {
540 /* signal bit not set -> a real page is missing. */ 554 /* signal bit not set -> a real page is missing. */
541 get_task_struct(tsk); 555 if (tsk->thread.pfault_wait == -1) {
542 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
543 if (xchg(&tsk->thread.pfault_wait, 1) != 0) {
544 /* Completion interrupt was faster than the initial 556 /* Completion interrupt was faster than the initial
545 * interrupt (swapped in a -1 for pfault_wait). Set 557 * interrupt (pfault_wait == -1). Set pfault_wait
546 * pfault_wait back to zero and exit. This can be 558 * back to zero and exit. */
547 * done safely because tsk is running in kernel
548 * mode and can't produce new pfaults. */
549 tsk->thread.pfault_wait = 0; 559 tsk->thread.pfault_wait = 0;
550 set_task_state(tsk, TASK_RUNNING); 560 } else {
551 put_task_struct(tsk); 561 /* Initial interrupt arrived before completion
552 } else 562 * interrupt. Let the task sleep. */
563 tsk->thread.pfault_wait = 1;
564 list_add(&tsk->thread.list, &pfault_list);
565 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
553 set_tsk_need_resched(tsk); 566 set_tsk_need_resched(tsk);
567 }
568 }
569 spin_unlock(&pfault_lock);
570}
571
572static int __cpuinit pfault_cpu_notify(struct notifier_block *self,
573 unsigned long action, void *hcpu)
574{
575 struct thread_struct *thread, *next;
576 struct task_struct *tsk;
577
578 switch (action) {
579 case CPU_DEAD:
580 case CPU_DEAD_FROZEN:
581 spin_lock_irq(&pfault_lock);
582 list_for_each_entry_safe(thread, next, &pfault_list, list) {
583 thread->pfault_wait = 0;
584 list_del(&thread->list);
585 tsk = container_of(thread, struct task_struct, thread);
586 wake_up_process(tsk);
587 }
588 spin_unlock_irq(&pfault_lock);
589 break;
590 default:
591 break;
554 } 592 }
593 return NOTIFY_OK;
555} 594}
556 595
557static int __init pfault_irq_init(void) 596static int __init pfault_irq_init(void)
@@ -568,8 +607,10 @@ static int __init pfault_irq_init(void)
568 pfault_disable = 1; 607 pfault_disable = 1;
569 return rc; 608 return rc;
570 } 609 }
571 if (pfault_init() == 0) 610 if (pfault_init() == 0) {
611 hotcpu_notifier(pfault_cpu_notify, 0);
572 return 0; 612 return 0;
613 }
573 614
574 /* Tough luck, no pfault. */ 615 /* Tough luck, no pfault. */
575 pfault_disable = 1; 616 pfault_disable = 1;