Diffstat (limited to 'arch/s390/mm/fault.c')
-rw-r--r--	arch/s390/mm/fault.c	321
1 file changed, 183 insertions(+), 138 deletions(-)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 2505b2ea0ef1..fe103e891e7a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -10,6 +10,7 @@
  * Copyright (C) 1995 Linus Torvalds
  */
 
+#include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
@@ -33,7 +34,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/system.h>
 #include <asm/pgtable.h>
-#include <asm/s390_ext.h>
+#include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/compat.h>
 #include "../kernel/entry.h"
@@ -52,6 +53,14 @@
 #define VM_FAULT_BADMAP		0x020000
 #define VM_FAULT_BADACCESS	0x040000
 
+static unsigned long store_indication;
+
+void fault_init(void)
+{
+	if (test_facility(2) && test_facility(75))
+		store_indication = 0xc00;
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 	int ret = 0;
@@ -199,42 +208,22 @@ static noinline void do_sigbus(struct pt_regs *regs, long int_code,
 				 unsigned long trans_exc_code)
 {
 	struct task_struct *tsk = current;
+	unsigned long address;
+	struct siginfo si;
 
 	/*
 	 * Send a sigbus, regardless of whether we were in kernel
 	 * or user mode.
 	 */
-	tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK;
+	address = trans_exc_code & __FAIL_ADDR_MASK;
+	tsk->thread.prot_addr = address;
 	tsk->thread.trap_no = int_code;
-	force_sig(SIGBUS, tsk);
-}
-
-#ifdef CONFIG_S390_EXEC_PROTECT
-static noinline int signal_return(struct pt_regs *regs, long int_code,
-				  unsigned long trans_exc_code)
-{
-	u16 instruction;
-	int rc;
-
-	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
-
-	if (!rc && instruction == 0x0a77) {
-		clear_tsk_thread_flag(current, TIF_SINGLE_STEP);
-		if (is_compat_task())
-			sys32_sigreturn();
-		else
-			sys_sigreturn();
-	} else if (!rc && instruction == 0x0aad) {
-		clear_tsk_thread_flag(current, TIF_SINGLE_STEP);
-		if (is_compat_task())
-			sys32_rt_sigreturn();
-		else
-			sys_rt_sigreturn();
-	} else
-		do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code);
-	return 0;
+	si.si_signo = SIGBUS;
+	si.si_errno = 0;
+	si.si_code = BUS_ADRERR;
+	si.si_addr = (void __user *) address;
+	force_sig_info(SIGBUS, &si, tsk);
 }
-#endif /* CONFIG_S390_EXEC_PROTECT */
 
 static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 				    unsigned long trans_exc_code, int fault)
@@ -243,13 +232,6 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 
 	switch (fault) {
 	case VM_FAULT_BADACCESS:
-#ifdef CONFIG_S390_EXEC_PROTECT
-		if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
-		    (trans_exc_code & 3) == 0) {
-			signal_return(regs, int_code, trans_exc_code);
-			break;
-		}
-#endif /* CONFIG_S390_EXEC_PROTECT */
 	case VM_FAULT_BADMAP:
 		/* Bad memory access. Check if it is kernel or user space. */
 		if (regs->psw.mask & PSW_MASK_PSTATE) {
@@ -263,13 +245,17 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 		do_no_context(regs, int_code, trans_exc_code);
 		break;
 	default: /* fault & VM_FAULT_ERROR */
-		if (fault & VM_FAULT_OOM)
-			pagefault_out_of_memory();
-		else if (fault & VM_FAULT_SIGBUS) {
-			do_sigbus(regs, int_code, trans_exc_code);
+		if (fault & VM_FAULT_OOM) {
+			if (!(regs->psw.mask & PSW_MASK_PSTATE))
+				do_no_context(regs, int_code, trans_exc_code);
+			else
+				pagefault_out_of_memory();
+		} else if (fault & VM_FAULT_SIGBUS) {
 			/* Kernel mode? Handle exceptions or die */
 			if (!(regs->psw.mask & PSW_MASK_PSTATE))
 				do_no_context(regs, int_code, trans_exc_code);
+			else
+				do_sigbus(regs, int_code, trans_exc_code);
 		} else
 			BUG();
 		break;
@@ -294,6 +280,7 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	unsigned long address;
+	unsigned int flags;
 	int fault;
 
 	if (notify_page_fault(regs))
@@ -312,13 +299,11 @@ static inline int do_exception(struct pt_regs *regs, int access,
 		goto out;
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
-	/*
-	 * When we get here, the fault happened in the current
-	 * task's user address space, so we can switch on the
-	 * interrupts again and then search the VMAs
-	 */
-	local_irq_enable();
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	flags = FAULT_FLAG_ALLOW_RETRY;
+	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+		flags |= FAULT_FLAG_WRITE;
+retry:
 	down_read(&mm->mmap_sem);
 
 	fault = VM_FAULT_BADMAP;
@@ -348,25 +333,37 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, address,
-				(access == VM_WRITE) ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, address, flags);
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
-	if (fault & VM_FAULT_MAJOR) {
-		tsk->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-			      regs, address);
-	} else {
-		tsk->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-			      regs, address);
+	/*
+	 * Major/minor page fault accounting is only done on the
+	 * initial attempt. If we go through a retry, it is extremely
+	 * likely that the page will be found in page cache at that point.
+	 */
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR) {
+			tsk->maj_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+				      regs, address);
+		} else {
+			tsk->min_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+				      regs, address);
+		}
+		if (fault & VM_FAULT_RETRY) {
+			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation. */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			goto retry;
+		}
 	}
 	/*
 	 * The instruction that caused the program check will
 	 * be repeated. Don't signal single step via SIGTRAP.
 	 */
-	clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP);
+	clear_tsk_thread_flag(tsk, TIF_PER_TRAP);
 	fault = 0;
 out_up:
 	up_read(&mm->mmap_sem);
@@ -374,20 +371,20 @@ out:
 	return fault;
 }
 
-void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_protection_exception(struct pt_regs *regs, long pgm_int_code,
+				       unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	int fault;
 
-	/* Protection exception is supressing, decrement psw address. */
-	regs->psw.addr -= (int_code >> 16);
+	/* Protection exception is suppressing, decrement psw address. */
+	regs->psw.addr -= (pgm_int_code >> 16);
 	/*
 	 * Check for low-address protection. This needs to be treated
 	 * as a special case because the translation exception code
 	 * field is not guaranteed to contain valid data in this case.
 	 */
 	if (unlikely(!(trans_exc_code & 4))) {
-		do_low_address(regs, int_code, trans_exc_code);
+		do_low_address(regs, pgm_int_code, trans_exc_code);
 		return;
 	}
 	fault = do_exception(regs, VM_WRITE, trans_exc_code);
@@ -395,34 +392,27 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
 		do_fault_error(regs, 4, trans_exc_code, fault);
 }
 
-void __kprobes do_dat_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code,
+				unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	int access, fault;
 
 	access = VM_READ | VM_EXEC | VM_WRITE;
-#ifdef CONFIG_S390_EXEC_PROTECT
-	if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
-	    (trans_exc_code & 3) == 0)
-		access = VM_EXEC;
-#endif
 	fault = do_exception(regs, access, trans_exc_code);
 	if (unlikely(fault))
-		do_fault_error(regs, int_code & 255, trans_exc_code, fault);
+		do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault);
 }
 
 #ifdef CONFIG_64BIT
-void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_asce_exception(struct pt_regs *regs, long pgm_int_code,
+				 unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 
 	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
 		goto no_context;
 
-	local_irq_enable();
-
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK);
 	up_read(&mm->mmap_sem);
@@ -434,16 +424,16 @@ void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
 
 	/* User mode accesses just cause a SIGSEGV */
 	if (regs->psw.mask & PSW_MASK_PSTATE) {
-		do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code);
+		do_sigsegv(regs, pgm_int_code, SEGV_MAPERR, trans_exc_code);
 		return;
 	}
 
 no_context:
-	do_no_context(regs, int_code, trans_exc_code);
+	do_no_context(regs, pgm_int_code, trans_exc_code);
 }
 #endif
 
-int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
+int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write)
 {
 	struct pt_regs regs;
 	int access, fault;
@@ -454,14 +444,13 @@ int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
 	regs.psw.addr = (unsigned long) __builtin_return_address(0);
 	regs.psw.addr |= PSW_ADDR_AMODE;
 	uaddr &= PAGE_MASK;
-	access = write_user ? VM_WRITE : VM_READ;
+	access = write ? VM_WRITE : VM_READ;
 	fault = do_exception(&regs, access, uaddr | 2);
 	if (unlikely(fault)) {
-		if (fault & VM_FAULT_OOM) {
-			pagefault_out_of_memory();
-			fault = 0;
-		} else if (fault & VM_FAULT_SIGBUS)
-			do_sigbus(&regs, int_code, uaddr);
+		if (fault & VM_FAULT_OOM)
+			return -EFAULT;
+		else if (fault & VM_FAULT_SIGBUS)
+			do_sigbus(&regs, pgm_int_code, uaddr);
 	}
 	return fault ? -EFAULT : 0;
 }
@@ -470,8 +459,7 @@ int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
 /*
  * 'pfault' pseudo page faults routines.
  */
-static ext_int_info_t ext_int_pfault;
-static int pfault_disable = 0;
+static int pfault_disable;
 
 static int __init nopfault(char *str)
 {
@@ -481,22 +469,28 @@ static int __init nopfault(char *str)
 
 __setup("nopfault", nopfault);
 
-typedef struct {
-	__u16 refdiagc;
-	__u16 reffcode;
-	__u16 refdwlen;
-	__u16 refversn;
-	__u64 refgaddr;
-	__u64 refselmk;
-	__u64 refcmpmk;
-	__u64 reserved;
-} __attribute__ ((packed, aligned(8))) pfault_refbk_t;
+struct pfault_refbk {
+	u16 refdiagc;
+	u16 reffcode;
+	u16 refdwlen;
+	u16 refversn;
+	u64 refgaddr;
+	u64 refselmk;
+	u64 refcmpmk;
+	u64 reserved;
+} __attribute__ ((packed, aligned(8)));
 
 int pfault_init(void)
 {
-	pfault_refbk_t refbk =
-		{ 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48,
-		  __PF_RES_FIELD };
+	struct pfault_refbk refbk = {
+		.refdiagc = 0x258,
+		.reffcode = 0,
+		.refdwlen = 5,
+		.refversn = 2,
+		.refgaddr = __LC_CURRENT_PID,
+		.refselmk = 1ULL << 48,
+		.refcmpmk = 1ULL << 48,
+		.reserved = __PF_RES_FIELD };
 	int rc;
 
 	if (!MACHINE_IS_VM || pfault_disable)
@@ -508,18 +502,20 @@ int pfault_init(void)
 		"2:\n"
 		EX_TABLE(0b,1b)
 		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
-	__ctl_set_bit(0, 9);
 	return rc;
 }
 
 void pfault_fini(void)
 {
-	pfault_refbk_t refbk =
-	{ 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL };
+	struct pfault_refbk refbk = {
+		.refdiagc = 0x258,
+		.reffcode = 1,
+		.refdwlen = 5,
+		.refversn = 2,
+	};
 
 	if (!MACHINE_IS_VM || pfault_disable)
 		return;
-	__ctl_clear_bit(0,9);
 	asm volatile(
 		"	diag	%0,0,0x258\n"
 		"0:\n"
@@ -527,10 +523,15 @@ void pfault_fini(void)
 		: : "a" (&refbk), "m" (refbk) : "cc");
 }
 
-static void pfault_interrupt(__u16 int_code)
+static DEFINE_SPINLOCK(pfault_lock);
+static LIST_HEAD(pfault_list);
+
+static void pfault_interrupt(unsigned int ext_int_code,
+			     unsigned int param32, unsigned long param64)
 {
 	struct task_struct *tsk;
 	__u16 subcode;
+	pid_t pid;
 
 	/*
 	 * Get the external interruption subcode & pfault
@@ -538,63 +539,107 @@ static void pfault_interrupt(__u16 int_code)
 	 * in the 'cpu address' field associated with the
 	 * external interrupt.
 	 */
-	subcode = S390_lowcore.cpu_addr;
+	subcode = ext_int_code >> 16;
 	if ((subcode & 0xff00) != __SUBCODE_MASK)
 		return;
-
-	/*
-	 * Get the token (= address of the task structure of the affected task).
-	 */
-	tsk = *(struct task_struct **) __LC_PFAULT_INTPARM;
-
+	kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++;
+	if (subcode & 0x0080) {
+		/* Get the token (= pid of the affected task). */
+		pid = sizeof(void *) == 4 ? param32 : param64;
+		rcu_read_lock();
+		tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+		if (tsk)
+			get_task_struct(tsk);
+		rcu_read_unlock();
+		if (!tsk)
+			return;
+	} else {
+		tsk = current;
+	}
+	spin_lock(&pfault_lock);
 	if (subcode & 0x0080) {
 		/* signal bit is set -> a page has been swapped in by VM */
-		if (xchg(&tsk->thread.pfault_wait, -1) != 0) {
+		if (tsk->thread.pfault_wait == 1) {
 			/* Initial interrupt was faster than the completion
 			 * interrupt. pfault_wait is valid. Set pfault_wait
 			 * back to zero and wake up the process. This can
 			 * safely be done because the task is still sleeping
 			 * and can't produce new pfaults. */
 			tsk->thread.pfault_wait = 0;
+			list_del(&tsk->thread.list);
 			wake_up_process(tsk);
-			put_task_struct(tsk);
+		} else {
+			/* Completion interrupt was faster than initial
+			 * interrupt. Set pfault_wait to -1 so the initial
+			 * interrupt doesn't put the task to sleep. */
+			tsk->thread.pfault_wait = -1;
 		}
+		put_task_struct(tsk);
 	} else {
 		/* signal bit not set -> a real page is missing. */
-		get_task_struct(tsk);
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-		if (xchg(&tsk->thread.pfault_wait, 1) != 0) {
+		if (tsk->thread.pfault_wait == -1) {
 			/* Completion interrupt was faster than the initial
-			 * interrupt (swapped in a -1 for pfault_wait). Set
-			 * pfault_wait back to zero and exit. This can be
-			 * done safely because tsk is running in kernel
-			 * mode and can't produce new pfaults. */
+			 * interrupt (pfault_wait == -1). Set pfault_wait
+			 * back to zero and exit. */
 			tsk->thread.pfault_wait = 0;
-			set_task_state(tsk, TASK_RUNNING);
-			put_task_struct(tsk);
-		} else
+		} else {
+			/* Initial interrupt arrived before completion
+			 * interrupt. Let the task sleep. */
+			tsk->thread.pfault_wait = 1;
+			list_add(&tsk->thread.list, &pfault_list);
+			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 			set_tsk_need_resched(tsk);
+		}
 	}
+	spin_unlock(&pfault_lock);
 }
 
-void __init pfault_irq_init(void)
+static int __cpuinit pfault_cpu_notify(struct notifier_block *self,
+				       unsigned long action, void *hcpu)
 {
-	if (!MACHINE_IS_VM)
-		return;
+	struct thread_struct *thread, *next;
+	struct task_struct *tsk;
 
-	/*
-	 * Try to get pfault pseudo page faults going.
-	 */
-	if (register_early_external_interrupt(0x2603, pfault_interrupt,
-					      &ext_int_pfault) != 0)
-		panic("Couldn't request external interrupt 0x2603");
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		spin_lock_irq(&pfault_lock);
+		list_for_each_entry_safe(thread, next, &pfault_list, list) {
+			thread->pfault_wait = 0;
+			list_del(&thread->list);
+			tsk = container_of(thread, struct task_struct, thread);
+			wake_up_process(tsk);
+		}
+		spin_unlock_irq(&pfault_lock);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
 
-	if (pfault_init() == 0)
-		return;
+static int __init pfault_irq_init(void)
+{
+	int rc;
 
-	/* Tough luck, no pfault. */
+	if (!MACHINE_IS_VM)
+		return 0;
+	rc = register_external_interrupt(0x2603, pfault_interrupt);
+	if (rc)
+		goto out_extint;
+	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
+	if (rc)
+		goto out_pfault;
+	service_subclass_irq_register();
+	hotcpu_notifier(pfault_cpu_notify, 0);
+	return 0;
+
+out_pfault:
+	unregister_external_interrupt(0x2603, pfault_interrupt);
+out_extint:
 	pfault_disable = 1;
-	unregister_early_external_interrupt(0x2603, pfault_interrupt,
-					    &ext_int_pfault);
+	return rc;
 }
-#endif
+early_initcall(pfault_irq_init);
+
+#endif /* CONFIG_PFAULT */
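
The do_exception() changes above introduce the generic retryable-fault pattern: FAULT_FLAG_ALLOW_RETRY is set before the first call to handle_mm_fault(), major/minor accounting runs only on that first attempt, and a VM_FAULT_RETRY result leads to exactly one retry with the flag cleared so the loop cannot starve. The write decision also no longer depends solely on the access type: with facilities 2 and 75 present, fault_init() sets store_indication to 0xc00, and do_exception() treats a fault whose store-indication bits in trans_exc_code equal 0x400 as a write. The sketch below is a minimal user-space analogue of the retry control flow only, not kernel code; the stub handle_mm_fault_stub() and the do_fault() wrapper are invented for illustration, while the flag handling mirrors the patch.

/*
 * Illustrative analogue of the retry loop added to do_exception().
 * handle_mm_fault_stub() is a made-up stand-in: it asks for one retry
 * on the first attempt and succeeds on the second.
 */
#include <stdio.h>

#define FAULT_FLAG_ALLOW_RETRY	0x01
#define FAULT_FLAG_WRITE	0x02
#define VM_FAULT_RETRY		0x04

static int handle_mm_fault_stub(unsigned int flags)
{
	static int attempts;

	return (attempts++ == 0 && (flags & FAULT_FLAG_ALLOW_RETRY)) ?
		VM_FAULT_RETRY : 0;
}

static int do_fault(int is_write)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY;
	int fault;

	if (is_write)
		flags |= FAULT_FLAG_WRITE;
retry:
	fault = handle_mm_fault_stub(flags);
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		/* fault accounting would go here, on the first pass only */
		if (fault & VM_FAULT_RETRY) {
			/* clear the flag so a second retry cannot loop forever */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			goto retry;
		}
	}
	return fault;
}

int main(void)
{
	printf("fault result: %d\n", do_fault(1));
	return 0;
}

Compiled and run, this prints "fault result: 0" after exactly one retry, matching the assumption stated in the patch's comment that a retried fault will very likely find the page in the page cache and complete without a further VM_FAULT_RETRY.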