diff options
author | Heiko Carstens <heiko.carstens@de.ibm.com> | 2011-05-23 04:24:34 -0400 |
---|---|---|
committer | Martin Schwidefsky <schwidefsky@de.ibm.com> | 2011-05-23 04:24:29 -0400 |
commit | f2db2e6cb3f5f766cbb3788af44705685ff2445a (patch) | |
tree | 11fbf5522f332e13f9bfb6cf4552513e4d865003 | |
parent | b456d94a9757db54eca4677c1b3a13e7170c9bb3 (diff) |
[S390] pfault: cpu hotplug vs missing completion interrupts
On cpu hot remove, a PFAULT CANCEL command is sent to the hypervisor,
which in turn cancels all outstanding pfault requests that have
been issued on that cpu (the same happens with a SIGP cpu reset).
The result is that we end up with uninterruptible processes where
the interrupt that would wake up these processes never arrives.
In order to solve this, all processes that wait for a pfault
completion interrupt get woken up after a cpu hot remove. The worst
case that could happen is that they fault again and in turn need to
wait again.
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
-rw-r--r-- | arch/s390/include/asm/lowcore.h | 4 | ||||
-rw-r--r-- | arch/s390/include/asm/processor.h | 1 | ||||
-rw-r--r-- | arch/s390/kernel/asm-offsets.c | 1 | ||||
-rw-r--r-- | arch/s390/kernel/entry.S | 1 | ||||
-rw-r--r-- | arch/s390/kernel/entry64.S | 1 | ||||
-rw-r--r-- | arch/s390/mm/fault.c | 89 |
6 files changed, 71 insertions, 26 deletions
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index b8624d53c37..228cf0b295d 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h | |||
@@ -124,7 +124,7 @@ struct _lowcore { | |||
124 | /* Address space pointer. */ | 124 | /* Address space pointer. */ |
125 | __u32 kernel_asce; /* 0x02ac */ | 125 | __u32 kernel_asce; /* 0x02ac */ |
126 | __u32 user_asce; /* 0x02b0 */ | 126 | __u32 user_asce; /* 0x02b0 */ |
127 | __u8 pad_0x02b4[0x02b8-0x02b4]; /* 0x02b4 */ | 127 | __u32 current_pid; /* 0x02b4 */ |
128 | 128 | ||
129 | /* SMP info area */ | 129 | /* SMP info area */ |
130 | __u32 cpu_nr; /* 0x02b8 */ | 130 | __u32 cpu_nr; /* 0x02b8 */ |
@@ -255,7 +255,7 @@ struct _lowcore { | |||
255 | /* Address space pointer. */ | 255 | /* Address space pointer. */ |
256 | __u64 kernel_asce; /* 0x0310 */ | 256 | __u64 kernel_asce; /* 0x0310 */ |
257 | __u64 user_asce; /* 0x0318 */ | 257 | __u64 user_asce; /* 0x0318 */ |
258 | __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ | 258 | __u64 current_pid; /* 0x0320 */ |
259 | 259 | ||
260 | /* SMP info area */ | 260 | /* SMP info area */ |
261 | __u32 cpu_nr; /* 0x0328 */ | 261 | __u32 cpu_nr; /* 0x0328 */ |
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 2c79b641627..1300c302533 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h | |||
@@ -84,6 +84,7 @@ struct thread_struct { | |||
84 | struct per_event per_event; /* Cause of the last PER trap */ | 84 | struct per_event per_event; /* Cause of the last PER trap */ |
85 | /* pfault_wait is used to block the process on a pfault event */ | 85 | /* pfault_wait is used to block the process on a pfault event */ |
86 | unsigned long pfault_wait; | 86 | unsigned long pfault_wait; |
87 | struct list_head list; | ||
87 | }; | 88 | }; |
88 | 89 | ||
89 | typedef struct thread_struct thread_struct; | 90 | typedef struct thread_struct thread_struct; |
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index ef455561101..edfbd17d708 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c | |||
@@ -124,6 +124,7 @@ int main(void) | |||
124 | DEFINE(__LC_LAST_UPDATE_TIMER, offsetof(struct _lowcore, last_update_timer)); | 124 | DEFINE(__LC_LAST_UPDATE_TIMER, offsetof(struct _lowcore, last_update_timer)); |
125 | DEFINE(__LC_LAST_UPDATE_CLOCK, offsetof(struct _lowcore, last_update_clock)); | 125 | DEFINE(__LC_LAST_UPDATE_CLOCK, offsetof(struct _lowcore, last_update_clock)); |
126 | DEFINE(__LC_CURRENT, offsetof(struct _lowcore, current_task)); | 126 | DEFINE(__LC_CURRENT, offsetof(struct _lowcore, current_task)); |
127 | DEFINE(__LC_CURRENT_PID, offsetof(struct _lowcore, current_pid)); | ||
127 | DEFINE(__LC_THREAD_INFO, offsetof(struct _lowcore, thread_info)); | 128 | DEFINE(__LC_THREAD_INFO, offsetof(struct _lowcore, thread_info)); |
128 | DEFINE(__LC_KERNEL_STACK, offsetof(struct _lowcore, kernel_stack)); | 129 | DEFINE(__LC_KERNEL_STACK, offsetof(struct _lowcore, kernel_stack)); |
129 | DEFINE(__LC_ASYNC_STACK, offsetof(struct _lowcore, async_stack)); | 130 | DEFINE(__LC_ASYNC_STACK, offsetof(struct _lowcore, async_stack)); |
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 1b67fc6ebdc..0476174dfff 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S | |||
@@ -212,6 +212,7 @@ __switch_to: | |||
212 | lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 | 212 | lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 |
213 | lm %r6,%r15,__SF_GPRS(%r15) # load gprs of next task | 213 | lm %r6,%r15,__SF_GPRS(%r15) # load gprs of next task |
214 | st %r3,__LC_CURRENT # store task struct of next | 214 | st %r3,__LC_CURRENT # store task struct of next |
215 | mvc __LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next | ||
215 | st %r5,__LC_THREAD_INFO # store thread info of next | 216 | st %r5,__LC_THREAD_INFO # store thread info of next |
216 | ahi %r5,STACK_SIZE # end of kernel stack of next | 217 | ahi %r5,STACK_SIZE # end of kernel stack of next |
217 | st %r5,__LC_KERNEL_STACK # store end of kernel stack | 218 | st %r5,__LC_KERNEL_STACK # store end of kernel stack |
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 9fd86456349..d61967e2eab 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S | |||
@@ -220,6 +220,7 @@ __switch_to: | |||
220 | lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 | 220 | lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 |
221 | lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task | 221 | lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task |
222 | stg %r3,__LC_CURRENT # store task struct of next | 222 | stg %r3,__LC_CURRENT # store task struct of next |
223 | mvc __LC_CURRENT_PID+4(4,%r0),__TASK_pid(%r3) # store pid of next | ||
223 | stg %r5,__LC_THREAD_INFO # store thread info of next | 224 | stg %r5,__LC_THREAD_INFO # store thread info of next |
224 | aghi %r5,STACK_SIZE # end of kernel stack of next | 225 | aghi %r5,STACK_SIZE # end of kernel stack of next |
225 | stg %r5,__LC_KERNEL_STACK # store end of kernel stack | 226 | stg %r5,__LC_KERNEL_STACK # store end of kernel stack |
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 177745c520c..1ca65647832 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c | |||
@@ -466,7 +466,7 @@ typedef struct { | |||
466 | int pfault_init(void) | 466 | int pfault_init(void) |
467 | { | 467 | { |
468 | pfault_refbk_t refbk = | 468 | pfault_refbk_t refbk = |
469 | { 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48, | 469 | { 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48, |
470 | __PF_RES_FIELD }; | 470 | __PF_RES_FIELD }; |
471 | int rc; | 471 | int rc; |
472 | 472 | ||
@@ -498,11 +498,15 @@ void pfault_fini(void) | |||
498 | : : "a" (&refbk), "m" (refbk) : "cc"); | 498 | : : "a" (&refbk), "m" (refbk) : "cc"); |
499 | } | 499 | } |
500 | 500 | ||
501 | static DEFINE_SPINLOCK(pfault_lock); | ||
502 | static LIST_HEAD(pfault_list); | ||
503 | |||
501 | static void pfault_interrupt(unsigned int ext_int_code, | 504 | static void pfault_interrupt(unsigned int ext_int_code, |
502 | unsigned int param32, unsigned long param64) | 505 | unsigned int param32, unsigned long param64) |
503 | { | 506 | { |
504 | struct task_struct *tsk; | 507 | struct task_struct *tsk; |
505 | __u16 subcode; | 508 | __u16 subcode; |
509 | pid_t pid; | ||
506 | 510 | ||
507 | /* | 511 | /* |
508 | * Get the external interruption subcode & pfault | 512 | * Get the external interruption subcode & pfault |
@@ -514,44 +518,79 @@ static void pfault_interrupt(unsigned int ext_int_code, | |||
514 | if ((subcode & 0xff00) != __SUBCODE_MASK) | 518 | if ((subcode & 0xff00) != __SUBCODE_MASK) |
515 | return; | 519 | return; |
516 | kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++; | 520 | kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++; |
517 | 521 | if (subcode & 0x0080) { | |
518 | /* | 522 | /* Get the token (= pid of the affected task). */ |
519 | * Get the token (= address of the task structure of the affected task). | 523 | pid = sizeof(void *) == 4 ? param32 : param64; |
520 | */ | 524 | rcu_read_lock(); |
521 | #ifdef CONFIG_64BIT | 525 | tsk = find_task_by_pid_ns(pid, &init_pid_ns); |
522 | tsk = (struct task_struct *) param64; | 526 | if (tsk) |
523 | #else | 527 | get_task_struct(tsk); |
524 | tsk = (struct task_struct *) param32; | 528 | rcu_read_unlock(); |
525 | #endif | 529 | if (!tsk) |
526 | 530 | return; | |
531 | } else { | ||
532 | tsk = current; | ||
533 | } | ||
534 | spin_lock(&pfault_lock); | ||
527 | if (subcode & 0x0080) { | 535 | if (subcode & 0x0080) { |
528 | /* signal bit is set -> a page has been swapped in by VM */ | 536 | /* signal bit is set -> a page has been swapped in by VM */ |
529 | if (xchg(&tsk->thread.pfault_wait, -1) != 0) { | 537 | if (tsk->thread.pfault_wait == 1) { |
530 | /* Initial interrupt was faster than the completion | 538 | /* Initial interrupt was faster than the completion |
531 | * interrupt. pfault_wait is valid. Set pfault_wait | 539 | * interrupt. pfault_wait is valid. Set pfault_wait |
532 | * back to zero and wake up the process. This can | 540 | * back to zero and wake up the process. This can |
533 | * safely be done because the task is still sleeping | 541 | * safely be done because the task is still sleeping |
534 | * and can't produce new pfaults. */ | 542 | * and can't produce new pfaults. */ |
535 | tsk->thread.pfault_wait = 0; | 543 | tsk->thread.pfault_wait = 0; |
544 | list_del(&tsk->thread.list); | ||
536 | wake_up_process(tsk); | 545 | wake_up_process(tsk); |
537 | put_task_struct(tsk); | 546 | } else { |
547 | /* Completion interrupt was faster than initial | ||
548 | * interrupt. Set pfault_wait to -1 so the initial | ||
549 | * interrupt doesn't put the task to sleep. */ | ||
550 | tsk->thread.pfault_wait = -1; | ||
538 | } | 551 | } |
552 | put_task_struct(tsk); | ||
539 | } else { | 553 | } else { |
540 | /* signal bit not set -> a real page is missing. */ | 554 | /* signal bit not set -> a real page is missing. */ |
541 | get_task_struct(tsk); | 555 | if (tsk->thread.pfault_wait == -1) { |
542 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
543 | if (xchg(&tsk->thread.pfault_wait, 1) != 0) { | ||
544 | /* Completion interrupt was faster than the initial | 556 | /* Completion interrupt was faster than the initial |
545 | * interrupt (swapped in a -1 for pfault_wait). Set | 557 | * interrupt (pfault_wait == -1). Set pfault_wait |
546 | * pfault_wait back to zero and exit. This can be | 558 | * back to zero and exit. */ |
547 | * done safely because tsk is running in kernel | ||
548 | * mode and can't produce new pfaults. */ | ||
549 | tsk->thread.pfault_wait = 0; | 559 | tsk->thread.pfault_wait = 0; |
550 | set_task_state(tsk, TASK_RUNNING); | 560 | } else { |
551 | put_task_struct(tsk); | 561 | /* Initial interrupt arrived before completion |
552 | } else | 562 | * interrupt. Let the task sleep. */ |
563 | tsk->thread.pfault_wait = 1; | ||
564 | list_add(&tsk->thread.list, &pfault_list); | ||
565 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
553 | set_tsk_need_resched(tsk); | 566 | set_tsk_need_resched(tsk); |
567 | } | ||
568 | } | ||
569 | spin_unlock(&pfault_lock); | ||
570 | } | ||
571 | |||
572 | static int __cpuinit pfault_cpu_notify(struct notifier_block *self, | ||
573 | unsigned long action, void *hcpu) | ||
574 | { | ||
575 | struct thread_struct *thread, *next; | ||
576 | struct task_struct *tsk; | ||
577 | |||
578 | switch (action) { | ||
579 | case CPU_DEAD: | ||
580 | case CPU_DEAD_FROZEN: | ||
581 | spin_lock_irq(&pfault_lock); | ||
582 | list_for_each_entry_safe(thread, next, &pfault_list, list) { | ||
583 | thread->pfault_wait = 0; | ||
584 | list_del(&thread->list); | ||
585 | tsk = container_of(thread, struct task_struct, thread); | ||
586 | wake_up_process(tsk); | ||
587 | } | ||
588 | spin_unlock_irq(&pfault_lock); | ||
589 | break; | ||
590 | default: | ||
591 | break; | ||
554 | } | 592 | } |
593 | return NOTIFY_OK; | ||
555 | } | 594 | } |
556 | 595 | ||
557 | static int __init pfault_irq_init(void) | 596 | static int __init pfault_irq_init(void) |
@@ -568,8 +607,10 @@ static int __init pfault_irq_init(void) | |||
568 | pfault_disable = 1; | 607 | pfault_disable = 1; |
569 | return rc; | 608 | return rc; |
570 | } | 609 | } |
571 | if (pfault_init() == 0) | 610 | if (pfault_init() == 0) { |
611 | hotcpu_notifier(pfault_cpu_notify, 0); | ||
572 | return 0; | 612 | return 0; |
613 | } | ||
573 | 614 | ||
574 | /* Tough luck, no pfault. */ | 615 | /* Tough luck, no pfault. */ |
575 | pfault_disable = 1; | 616 | pfault_disable = 1; |