Diffstat (limited to 'arch/s390/mm/fault.c')

 arch/s390/mm/fault.c | 321 +++++++++++++++++++++++---------------
 1 file changed, 183 insertions(+), 138 deletions(-)

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 2505b2ea0ef1..fe103e891e7a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -10,6 +10,7 @@
  * Copyright (C) 1995 Linus Torvalds
  */
 
+#include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
@@ -33,7 +34,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/system.h>
 #include <asm/pgtable.h>
-#include <asm/s390_ext.h>
+#include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/compat.h>
 #include "../kernel/entry.h"
@@ -52,6 +53,14 @@
 #define VM_FAULT_BADMAP		0x020000
 #define VM_FAULT_BADACCESS	0x040000
 
+static unsigned long store_indication;
+
+void fault_init(void)
+{
+	if (test_facility(2) && test_facility(75))
+		store_indication = 0xc00;
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 	int ret = 0;
@@ -199,42 +208,22 @@ static noinline void do_sigbus(struct pt_regs *regs, long int_code,
 			       unsigned long trans_exc_code)
 {
 	struct task_struct *tsk = current;
+	unsigned long address;
+	struct siginfo si;
 
 	/*
 	 * Send a sigbus, regardless of whether we were in kernel
 	 * or user mode.
 	 */
-	tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK;
+	address = trans_exc_code & __FAIL_ADDR_MASK;
+	tsk->thread.prot_addr = address;
 	tsk->thread.trap_no = int_code;
-	force_sig(SIGBUS, tsk);
-}
-
-#ifdef CONFIG_S390_EXEC_PROTECT
-static noinline int signal_return(struct pt_regs *regs, long int_code,
-				  unsigned long trans_exc_code)
-{
-	u16 instruction;
-	int rc;
-
-	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
-
-	if (!rc && instruction == 0x0a77) {
-		clear_tsk_thread_flag(current, TIF_SINGLE_STEP);
-		if (is_compat_task())
-			sys32_sigreturn();
-		else
-			sys_sigreturn();
-	} else if (!rc && instruction == 0x0aad) {
-		clear_tsk_thread_flag(current, TIF_SINGLE_STEP);
-		if (is_compat_task())
-			sys32_rt_sigreturn();
-		else
-			sys_rt_sigreturn();
-	} else
-		do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code);
-	return 0;
+	si.si_signo = SIGBUS;
+	si.si_errno = 0;
+	si.si_code = BUS_ADRERR;
+	si.si_addr = (void __user *) address;
+	force_sig_info(SIGBUS, &si, tsk);
 }
-#endif /* CONFIG_S390_EXEC_PROTECT */
 
 static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 				    unsigned long trans_exc_code, int fault)
@@ -243,13 +232,6 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 
 	switch (fault) {
 	case VM_FAULT_BADACCESS:
-#ifdef CONFIG_S390_EXEC_PROTECT
-		if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
-		    (trans_exc_code & 3) == 0) {
-			signal_return(regs, int_code, trans_exc_code);
-			break;
-		}
-#endif /* CONFIG_S390_EXEC_PROTECT */
 	case VM_FAULT_BADMAP:
 		/* Bad memory access. Check if it is kernel or user space. */
 		if (regs->psw.mask & PSW_MASK_PSTATE) {
@@ -263,13 +245,17 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
 		do_no_context(regs, int_code, trans_exc_code);
 		break;
 	default: /* fault & VM_FAULT_ERROR */
-		if (fault & VM_FAULT_OOM)
-			pagefault_out_of_memory();
-		else if (fault & VM_FAULT_SIGBUS) {
-			do_sigbus(regs, int_code, trans_exc_code);
+		if (fault & VM_FAULT_OOM) {
+			if (!(regs->psw.mask & PSW_MASK_PSTATE))
+				do_no_context(regs, int_code, trans_exc_code);
+			else
+				pagefault_out_of_memory();
+		} else if (fault & VM_FAULT_SIGBUS) {
 			/* Kernel mode? Handle exceptions or die */
 			if (!(regs->psw.mask & PSW_MASK_PSTATE))
 				do_no_context(regs, int_code, trans_exc_code);
+			else
+				do_sigbus(regs, int_code, trans_exc_code);
 		} else
 			BUG();
 		break;
@@ -294,6 +280,7 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	unsigned long address;
+	unsigned int flags;
 	int fault;
 
 	if (notify_page_fault(regs))
@@ -312,13 +299,11 @@ static inline int do_exception(struct pt_regs *regs, int access,
 		goto out;
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
-	/*
-	 * When we get here, the fault happened in the current
-	 * task's user address space, so we can switch on the
-	 * interrupts again and then search the VMAs
-	 */
-	local_irq_enable();
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	flags = FAULT_FLAG_ALLOW_RETRY;
+	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+		flags |= FAULT_FLAG_WRITE;
+retry:
 	down_read(&mm->mmap_sem);
 
 	fault = VM_FAULT_BADMAP;
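A note on the write-flag logic above: fault_init(), added earlier in this diff, sets store_indication to 0xc00 only when facility bits 2 and 75 are present, after which the hardware reports store accesses in those bits of the translation-exception code. A minimal userspace sketch of just that predicate; the flag values and the fault_flags() helper are illustrative stand-ins, not the kernel's definitions:

#include <stdio.h>

#define FAULT_FLAG_WRITE        0x01    /* illustrative values, not the kernel's */
#define FAULT_FLAG_ALLOW_RETRY  0x02
#define VM_WRITE                0x04

static unsigned long store_indication;  /* 0, or 0xc00 after fault_init() */

static unsigned int fault_flags(int access, unsigned long trans_exc_code)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY;

        /*
         * Same test as in the hunk above: an explicit write access, or a
         * hardware store indication (bits 0xc00 of the translation
         * exception code reading exactly 0x400), makes it a write fault.
         * With store_indication == 0 the masked value is 0 != 0x400, so
         * only VM_WRITE accesses count.
         */
        if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
                flags |= FAULT_FLAG_WRITE;
        return flags;
}

int main(void)
{
        store_indication = 0xc00;       /* as if facilities 2 and 75 were present */
        printf("%#x\n", fault_flags(0, 0x400));  /* store-indicated: write fault */
        printf("%#x\n", fault_flags(0, 0x800));  /* no indication: read fault */
        return 0;
}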
@@ -348,25 +333,37 @@ static inline int do_exception(struct pt_regs *regs, int access,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, address,
-				(access == VM_WRITE) ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, address, flags);
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
-	if (fault & VM_FAULT_MAJOR) {
-		tsk->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-			      regs, address);
-	} else {
-		tsk->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-			      regs, address);
+	/*
+	 * Major/minor page fault accounting is only done on the
+	 * initial attempt. If we go through a retry, it is extremely
+	 * likely that the page will be found in page cache at that point.
+	 */
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR) {
+			tsk->maj_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+				      regs, address);
+		} else {
+			tsk->min_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+				      regs, address);
+		}
+		if (fault & VM_FAULT_RETRY) {
+			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation. */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			goto retry;
+		}
 	}
 	/*
 	 * The instruction that caused the program check will
 	 * be repeated. Don't signal single step via SIGTRAP.
 	 */
-	clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP);
+	clear_tsk_thread_flag(tsk, TIF_PER_TRAP);
 	fault = 0;
 out_up:
 	up_read(&mm->mmap_sem);
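The retry flow above grants FAULT_FLAG_ALLOW_RETRY exactly once: when handle_mm_fault() returns VM_FAULT_RETRY the flag is cleared before jumping back to the retry label, so the second pass must run to completion and, as the hunk's comment says, cannot starve. A self-contained sketch of the pattern with a stubbed fault handler; handle_fault() and the constants are stand-ins, not kernel interfaces:

#include <stdio.h>

#define FAULT_FLAG_ALLOW_RETRY  0x01    /* illustrative values */
#define VM_FAULT_RETRY          0x02

/* Stand-in for handle_mm_fault(): pretend the first attempt has to
 * drop its locks and asks the caller to try again. */
static int handle_fault(unsigned int flags)
{
        return (flags & FAULT_FLAG_ALLOW_RETRY) ? VM_FAULT_RETRY : 0;
}

int main(void)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY;
        int fault;

retry:
        fault = handle_fault(flags);
        if (fault & VM_FAULT_RETRY) {
                /* One retry only: clearing the flag bounds the loop,
                 * which is what rules out starvation in the hunk above. */
                flags &= ~FAULT_FLAG_ALLOW_RETRY;
                goto retry;
        }
        printf("done, flags=%#x fault=%#x\n", flags, fault);
        return 0;
}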
@@ -374,20 +371,20 @@ out:
 	return fault;
 }
 
-void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_protection_exception(struct pt_regs *regs, long pgm_int_code,
+				       unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	int fault;
 
-	/* Protection exception is supressing, decrement psw address. */
-	regs->psw.addr -= (int_code >> 16);
+	/* Protection exception is suppressing, decrement psw address. */
+	regs->psw.addr -= (pgm_int_code >> 16);
 	/*
 	 * Check for low-address protection. This needs to be treated
 	 * as a special case because the translation exception code
 	 * field is not guaranteed to contain valid data in this case.
 	 */
 	if (unlikely(!(trans_exc_code & 4))) {
-		do_low_address(regs, int_code, trans_exc_code);
+		do_low_address(regs, pgm_int_code, trans_exc_code);
 		return;
 	}
 	fault = do_exception(regs, VM_WRITE, trans_exc_code);
@@ -395,34 +392,27 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
 		do_fault_error(regs, 4, trans_exc_code, fault);
 }
 
-void __kprobes do_dat_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code,
+				unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	int access, fault;
 
 	access = VM_READ | VM_EXEC | VM_WRITE;
-#ifdef CONFIG_S390_EXEC_PROTECT
-	if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY &&
-	    (trans_exc_code & 3) == 0)
-		access = VM_EXEC;
-#endif
 	fault = do_exception(regs, access, trans_exc_code);
 	if (unlikely(fault))
-		do_fault_error(regs, int_code & 255, trans_exc_code, fault);
+		do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault);
 }
 
 #ifdef CONFIG_64BIT
-void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_asce_exception(struct pt_regs *regs, long pgm_int_code,
+				 unsigned long trans_exc_code)
 {
-	unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 
 	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
 		goto no_context;
 
-	local_irq_enable();
-
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK);
 	up_read(&mm->mmap_sem);
@@ -434,16 +424,16 @@ void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
 
 	/* User mode accesses just cause a SIGSEGV */
 	if (regs->psw.mask & PSW_MASK_PSTATE) {
-		do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code);
+		do_sigsegv(regs, pgm_int_code, SEGV_MAPERR, trans_exc_code);
 		return;
 	}
 
 no_context:
-	do_no_context(regs, int_code, trans_exc_code);
+	do_no_context(regs, pgm_int_code, trans_exc_code);
 }
 #endif
 
-int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
+int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write)
 {
 	struct pt_regs regs;
 	int access, fault;
@@ -454,14 +444,13 @@ int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
 	regs.psw.addr = (unsigned long) __builtin_return_address(0);
 	regs.psw.addr |= PSW_ADDR_AMODE;
 	uaddr &= PAGE_MASK;
-	access = write_user ? VM_WRITE : VM_READ;
+	access = write ? VM_WRITE : VM_READ;
 	fault = do_exception(&regs, access, uaddr | 2);
 	if (unlikely(fault)) {
-		if (fault & VM_FAULT_OOM) {
-			pagefault_out_of_memory();
-			fault = 0;
-		} else if (fault & VM_FAULT_SIGBUS)
-			do_sigbus(&regs, int_code, uaddr);
+		if (fault & VM_FAULT_OOM)
+			return -EFAULT;
+		else if (fault & VM_FAULT_SIGBUS)
+			do_sigbus(&regs, pgm_int_code, uaddr);
 	}
 	return fault ? -EFAULT : 0;
 }
@@ -470,8 +459,7 @@ int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
 /*
  * 'pfault' pseudo page faults routines.
  */
-static ext_int_info_t ext_int_pfault;
-static int pfault_disable = 0;
+static int pfault_disable;
 
 static int __init nopfault(char *str)
 {
@@ -481,22 +469,28 @@ static int __init nopfault(char *str)
 
 __setup("nopfault", nopfault);
 
-typedef struct {
-	__u16 refdiagc;
-	__u16 reffcode;
-	__u16 refdwlen;
-	__u16 refversn;
-	__u64 refgaddr;
-	__u64 refselmk;
-	__u64 refcmpmk;
-	__u64 reserved;
-} __attribute__ ((packed, aligned(8))) pfault_refbk_t;
+struct pfault_refbk {
+	u16 refdiagc;
+	u16 reffcode;
+	u16 refdwlen;
+	u16 refversn;
+	u64 refgaddr;
+	u64 refselmk;
+	u64 refcmpmk;
+	u64 reserved;
+} __attribute__ ((packed, aligned(8)));
 
 int pfault_init(void)
 {
-	pfault_refbk_t refbk =
-		{ 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48,
-		  __PF_RES_FIELD };
+	struct pfault_refbk refbk = {
+		.refdiagc = 0x258,
+		.reffcode = 0,
+		.refdwlen = 5,
+		.refversn = 2,
+		.refgaddr = __LC_CURRENT_PID,
+		.refselmk = 1ULL << 48,
+		.refcmpmk = 1ULL << 48,
+		.reserved = __PF_RES_FIELD };
 	int rc;
 
 	if (!MACHINE_IS_VM || pfault_disable)
@@ -508,18 +502,20 @@ int pfault_init(void)
 		"2:\n"
 		EX_TABLE(0b,1b)
 		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
-	__ctl_set_bit(0, 9);
 	return rc;
 }
 
 void pfault_fini(void)
 {
-	pfault_refbk_t refbk =
-	{ 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL };
+	struct pfault_refbk refbk = {
+		.refdiagc = 0x258,
+		.reffcode = 1,
+		.refdwlen = 5,
+		.refversn = 2,
+	};
 
 	if (!MACHINE_IS_VM || pfault_disable)
 		return;
-	__ctl_clear_bit(0,9);
 	asm volatile(
 		"	diag	%0,0,0x258\n"
 		"0:\n"
@@ -527,10 +523,15 @@ void pfault_fini(void)
 		: : "a" (&refbk), "m" (refbk) : "cc");
 }
 
-static void pfault_interrupt(__u16 int_code)
+static DEFINE_SPINLOCK(pfault_lock);
+static LIST_HEAD(pfault_list);
+
+static void pfault_interrupt(unsigned int ext_int_code,
+			     unsigned int param32, unsigned long param64)
 {
 	struct task_struct *tsk;
 	__u16 subcode;
+	pid_t pid;
 
 	/*
 	 * Get the external interruption subcode & pfault
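The hunk that follows rewrites pfault_interrupt() around the pfault_lock/pfault_list pair declared above. The initial interrupt (a real page is missing, the task should sleep) and the completion interrupt (the page was swapped in, the task should run) can arrive in either order, and instead of xchg() the handler now resolves the race with a tri-state pfault_wait inspected under the spinlock: 1 means the task is sleeping on the list, -1 means the completion interrupt won the race. A minimal single-threaded sketch of that handshake; the real code does this under pfault_lock and also maintains the wait list:

#include <stdio.h>

/* 0: idle, 1: task is sleeping on the list, -1: completion arrived first */
static int pfault_wait;

/* "A real page is missing": the faulting task would go to sleep. */
static void initial_interrupt(void)
{
        if (pfault_wait == -1)
                pfault_wait = 0;        /* completion already arrived: don't sleep */
        else
                pfault_wait = 1;        /* normal order: mark the task as waiting */
}

/* "Page has been swapped in": the task can run again. */
static void completion_interrupt(void)
{
        if (pfault_wait == 1)
                pfault_wait = 0;        /* normal order: wake the sleeping task */
        else
                pfault_wait = -1;       /* we beat the initial interrupt: leave a marker */
}

int main(void)
{
        /* Reversed order: completion first, then the initial interrupt. */
        completion_interrupt();
        initial_interrupt();
        printf("pfault_wait = %d (task never sleeps)\n", pfault_wait);
        return 0;
}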
@@ -538,63 +539,107 @@ static void pfault_interrupt(__u16 int_code)
 	 * in the 'cpu address' field associated with the
 	 * external interrupt.
 	 */
-	subcode = S390_lowcore.cpu_addr;
+	subcode = ext_int_code >> 16;
 	if ((subcode & 0xff00) != __SUBCODE_MASK)
 		return;
-
-	/*
-	 * Get the token (= address of the task structure of the affected task).
-	 */
-	tsk = *(struct task_struct **) __LC_PFAULT_INTPARM;
-
+	kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++;
+	if (subcode & 0x0080) {
+		/* Get the token (= pid of the affected task). */
+		pid = sizeof(void *) == 4 ? param32 : param64;
+		rcu_read_lock();
+		tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+		if (tsk)
+			get_task_struct(tsk);
+		rcu_read_unlock();
+		if (!tsk)
+			return;
+	} else {
+		tsk = current;
+	}
+	spin_lock(&pfault_lock);
 	if (subcode & 0x0080) {
 		/* signal bit is set -> a page has been swapped in by VM */
-		if (xchg(&tsk->thread.pfault_wait, -1) != 0) {
+		if (tsk->thread.pfault_wait == 1) {
 			/* Initial interrupt was faster than the completion
 			 * interrupt. pfault_wait is valid. Set pfault_wait
 			 * back to zero and wake up the process. This can
 			 * safely be done because the task is still sleeping
 			 * and can't produce new pfaults. */
 			tsk->thread.pfault_wait = 0;
+			list_del(&tsk->thread.list);
 			wake_up_process(tsk);
-			put_task_struct(tsk);
+		} else {
+			/* Completion interrupt was faster than initial
+			 * interrupt. Set pfault_wait to -1 so the initial
+			 * interrupt doesn't put the task to sleep. */
+			tsk->thread.pfault_wait = -1;
 		}
+		put_task_struct(tsk);
 	} else {
 		/* signal bit not set -> a real page is missing. */
-		get_task_struct(tsk);
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-		if (xchg(&tsk->thread.pfault_wait, 1) != 0) {
+		if (tsk->thread.pfault_wait == -1) {
 			/* Completion interrupt was faster than the initial
-			 * interrupt (swapped in a -1 for pfault_wait). Set
-			 * pfault_wait back to zero and exit. This can be
-			 * done safely because tsk is running in kernel
-			 * mode and can't produce new pfaults. */
+			 * interrupt (pfault_wait == -1). Set pfault_wait
+			 * back to zero and exit. */
 			tsk->thread.pfault_wait = 0;
-			set_task_state(tsk, TASK_RUNNING);
-			put_task_struct(tsk);
-		} else
+		} else {
+			/* Initial interrupt arrived before completion
+			 * interrupt. Let the task sleep. */
+			tsk->thread.pfault_wait = 1;
+			list_add(&tsk->thread.list, &pfault_list);
+			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 			set_tsk_need_resched(tsk);
+		}
 	}
+	spin_unlock(&pfault_lock);
 }
 
-void __init pfault_irq_init(void)
+static int __cpuinit pfault_cpu_notify(struct notifier_block *self,
+				       unsigned long action, void *hcpu)
 {
-	if (!MACHINE_IS_VM)
-		return;
+	struct thread_struct *thread, *next;
+	struct task_struct *tsk;
 
-	/*
-	 * Try to get pfault pseudo page faults going.
-	 */
-	if (register_early_external_interrupt(0x2603, pfault_interrupt,
-					      &ext_int_pfault) != 0)
-		panic("Couldn't request external interrupt 0x2603");
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		spin_lock_irq(&pfault_lock);
+		list_for_each_entry_safe(thread, next, &pfault_list, list) {
+			thread->pfault_wait = 0;
+			list_del(&thread->list);
+			tsk = container_of(thread, struct task_struct, thread);
+			wake_up_process(tsk);
+		}
+		spin_unlock_irq(&pfault_lock);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
 
-	if (pfault_init() == 0)
-		return;
+static int __init pfault_irq_init(void)
+{
+	int rc;
 
-	/* Tough luck, no pfault. */
+	if (!MACHINE_IS_VM)
+		return 0;
+	rc = register_external_interrupt(0x2603, pfault_interrupt);
+	if (rc)
+		goto out_extint;
+	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
+	if (rc)
+		goto out_pfault;
+	service_subclass_irq_register();
+	hotcpu_notifier(pfault_cpu_notify, 0);
+	return 0;
+
+out_pfault:
+	unregister_external_interrupt(0x2603, pfault_interrupt);
+out_extint:
 	pfault_disable = 1;
-	unregister_early_external_interrupt(0x2603, pfault_interrupt,
-			&ext_int_pfault);
+	return rc;
 }
-#endif
+early_initcall(pfault_irq_init);
+
+#endif /* CONFIG_PFAULT */
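A closing note on the new pfault_irq_init() above: it uses the usual goto-based unwind, one label per acquired resource, so a failure at any step releases exactly what was already set up and leaves pfault_disable set on every error path. A compact sketch of the same shape; register_ext(), init_feature() and the other names are hypothetical stand-ins, not the kernel's interfaces:

#include <errno.h>
#include <stdio.h>

static int feature_disabled;

static int register_ext(void)    { return 0; }            /* stand-in: succeeds */
static void unregister_ext(void) { }
static int init_feature(void)    { return -EOPNOTSUPP; }  /* stand-in: fails */

static int feature_init(void)
{
        int rc;

        rc = register_ext();
        if (rc)
                goto out_extint;        /* nothing acquired yet */
        rc = init_feature();
        if (rc)
                goto out_feature;       /* undo only the registration */
        return 0;

out_feature:
        unregister_ext();
out_extint:
        feature_disabled = 1;           /* both error paths disable the feature */
        return rc;
}

int main(void)
{
        int rc = feature_init();

        printf("rc=%d disabled=%d\n", rc, feature_disabled);
        return 0;
}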