diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-01-06 13:17:26 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-01-06 13:17:26 -0500 |
commit | 28d9bfc37c861aa9c8386dff1ac7e9a10e5c5162 (patch) | |
tree | 85bcc2db18ff20e380a40aba375e70d14c2671b4 /kernel | |
parent | f3b0cfa9b017a9d4686c9b14b908a1685f97a077 (diff) | |
parent | 4b95f135f606c87e4056b6d7fd3c5781c818858b (diff) |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (146 commits)
tools, perf: Documentation for the power events API
perf: Add calls to suspend trace point
perf script: Make some lists static
perf script: Use the default lost event handler
perf session: Warn about errors when processing pipe events too
perf tools: Fix perf_event.h header usage
perf test: Clarify some error reports in the open syscall test
x86, NMI: Add touch_nmi_watchdog to io_check_error delay
x86: Avoid calling arch_trigger_all_cpu_backtrace() at the same time
x86: Only call smp_processor_id in non-preempt cases
perf timechart: Adjust perf timechart to the new power events
perf: Clean up power events by introducing new, more generic ones
perf: Do not export power_frequency, but power_start event
perf test: Add test for counting open syscalls
perf evsel: Auto allocate resources needed for some methods
perf evsel: Use {cpu,thread}_map to shorten list of parameters
perf tools: Refactor all_tids to hold nr and the map
perf tools: Refactor cpumap to hold nr and the map
perf evsel: Introduce per cpu and per thread open helpers
perf evsel: Steal the counter reading routines from stat
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/hw_breakpoint.c | 2 | ||||
-rw-r--r-- | kernel/kprobes.c | 565 | ||||
-rw-r--r-- | kernel/perf_event.c | 573 | ||||
-rw-r--r-- | kernel/power/suspend.c | 3 | ||||
-rw-r--r-- | kernel/sched.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 16 | ||||
-rw-r--r-- | kernel/sysctl_binary.c | 1 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 15 | ||||
-rw-r--r-- | kernel/trace/power-traces.c | 5 | ||||
-rw-r--r-- | kernel/trace/trace_event_perf.c | 31 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace_export.c | 14 | ||||
-rw-r--r-- | kernel/watchdog.c | 9 |
13 files changed, 894 insertions, 348 deletions
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index e5325825aeb6..086adf25a55e 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
@@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void) | |||
641 | 641 | ||
642 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
643 | 643 | ||
644 | perf_pmu_register(&perf_breakpoint); | 644 | perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); |
645 | 645 | ||
646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
647 | 647 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9737a76e106f..7663e5df0e6f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p) | |||
354 | return p->pre_handler == aggr_pre_handler; | 354 | return p->pre_handler == aggr_pre_handler; |
355 | } | 355 | } |
356 | 356 | ||
357 | /* Return true(!0) if the kprobe is unused */ | ||
358 | static inline int kprobe_unused(struct kprobe *p) | ||
359 | { | ||
360 | return kprobe_aggrprobe(p) && kprobe_disabled(p) && | ||
361 | list_empty(&p->list); | ||
362 | } | ||
363 | |||
357 | /* | 364 | /* |
358 | * Keep all fields in the kprobe consistent | 365 | * Keep all fields in the kprobe consistent |
359 | */ | 366 | */ |
360 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | 367 | static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) |
361 | { | 368 | { |
362 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | 369 | memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); |
363 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | 370 | memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); |
364 | } | 371 | } |
365 | 372 | ||
366 | #ifdef CONFIG_OPTPROBES | 373 | #ifdef CONFIG_OPTPROBES |
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
384 | } | 391 | } |
385 | } | 392 | } |
386 | 393 | ||
394 | /* Free optimized instructions and optimized_kprobe */ | ||
395 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
396 | { | ||
397 | struct optimized_kprobe *op; | ||
398 | |||
399 | op = container_of(p, struct optimized_kprobe, kp); | ||
400 | arch_remove_optimized_kprobe(op); | ||
401 | arch_remove_kprobe(p); | ||
402 | kfree(op); | ||
403 | } | ||
404 | |||
387 | /* Return true(!0) if the kprobe is ready for optimization. */ | 405 | /* Return true(!0) if the kprobe is ready for optimization. */ |
388 | static inline int kprobe_optready(struct kprobe *p) | 406 | static inline int kprobe_optready(struct kprobe *p) |
389 | { | 407 | { |
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p) | |||
397 | return 0; | 415 | return 0; |
398 | } | 416 | } |
399 | 417 | ||
418 | /* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ | ||
419 | static inline int kprobe_disarmed(struct kprobe *p) | ||
420 | { | ||
421 | struct optimized_kprobe *op; | ||
422 | |||
423 | /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ | ||
424 | if (!kprobe_aggrprobe(p)) | ||
425 | return kprobe_disabled(p); | ||
426 | |||
427 | op = container_of(p, struct optimized_kprobe, kp); | ||
428 | |||
429 | return kprobe_disabled(p) && list_empty(&op->list); | ||
430 | } | ||
431 | |||
432 | /* Return true(!0) if the probe is queued on (un)optimizing lists */ | ||
433 | static int __kprobes kprobe_queued(struct kprobe *p) | ||
434 | { | ||
435 | struct optimized_kprobe *op; | ||
436 | |||
437 | if (kprobe_aggrprobe(p)) { | ||
438 | op = container_of(p, struct optimized_kprobe, kp); | ||
439 | if (!list_empty(&op->list)) | ||
440 | return 1; | ||
441 | } | ||
442 | return 0; | ||
443 | } | ||
444 | |||
400 | /* | 445 | /* |
401 | * Return an optimized kprobe whose optimizing code replaces | 446 | * Return an optimized kprobe whose optimizing code replaces |
402 | * instructions including addr (exclude breakpoint). | 447 | * instructions including addr (exclude breakpoint). |
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | |||
422 | 467 | ||
423 | /* Optimization staging list, protected by kprobe_mutex */ | 468 | /* Optimization staging list, protected by kprobe_mutex */ |
424 | static LIST_HEAD(optimizing_list); | 469 | static LIST_HEAD(optimizing_list); |
470 | static LIST_HEAD(unoptimizing_list); | ||
425 | 471 | ||
426 | static void kprobe_optimizer(struct work_struct *work); | 472 | static void kprobe_optimizer(struct work_struct *work); |
427 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | 473 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); |
474 | static DECLARE_COMPLETION(optimizer_comp); | ||
428 | #define OPTIMIZE_DELAY 5 | 475 | #define OPTIMIZE_DELAY 5 |
429 | 476 | ||
430 | /* Kprobe jump optimizer */ | 477 | /* |
431 | static __kprobes void kprobe_optimizer(struct work_struct *work) | 478 | * Optimize (replace a breakpoint with a jump) kprobes listed on |
479 | * optimizing_list. | ||
480 | */ | ||
481 | static __kprobes void do_optimize_kprobes(void) | ||
432 | { | 482 | { |
433 | struct optimized_kprobe *op, *tmp; | 483 | /* Optimization must never be done when disarmed */ |
434 | 484 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | |
435 | /* Lock modules while optimizing kprobes */ | 485 | list_empty(&optimizing_list)) |
436 | mutex_lock(&module_mutex); | 486 | return; |
437 | mutex_lock(&kprobe_mutex); | ||
438 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
439 | goto end; | ||
440 | |||
441 | /* | ||
442 | * Wait for quiescence period to ensure all running interrupts | ||
443 | * are done. Because optprobe may modify multiple instructions | ||
444 | * there is a chance that Nth instruction is interrupted. In that | ||
445 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
446 | * instruction. This wait is for avoiding it. | ||
447 | */ | ||
448 | synchronize_sched(); | ||
449 | 487 | ||
450 | /* | 488 | /* |
451 | * The optimization/unoptimization refers online_cpus via | 489 | * The optimization/unoptimization refers online_cpus via |
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
459 | */ | 497 | */ |
460 | get_online_cpus(); | 498 | get_online_cpus(); |
461 | mutex_lock(&text_mutex); | 499 | mutex_lock(&text_mutex); |
462 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | 500 | arch_optimize_kprobes(&optimizing_list); |
463 | WARN_ON(kprobe_disabled(&op->kp)); | 501 | mutex_unlock(&text_mutex); |
464 | if (arch_optimize_kprobe(op) < 0) | 502 | put_online_cpus(); |
465 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 503 | } |
466 | list_del_init(&op->list); | 504 | |
505 | /* | ||
506 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint | ||
507 | * if needed) kprobes listed on unoptimizing_list. | ||
508 | */ | ||
509 | static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | ||
510 | { | ||
511 | struct optimized_kprobe *op, *tmp; | ||
512 | |||
513 | /* Unoptimization must be done anytime */ | ||
514 | if (list_empty(&unoptimizing_list)) | ||
515 | return; | ||
516 | |||
517 | /* Ditto to do_optimize_kprobes */ | ||
518 | get_online_cpus(); | ||
519 | mutex_lock(&text_mutex); | ||
520 | arch_unoptimize_kprobes(&unoptimizing_list, free_list); | ||
521 | /* Loop free_list for disarming */ | ||
522 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
523 | /* Disarm probes if marked disabled */ | ||
524 | if (kprobe_disabled(&op->kp)) | ||
525 | arch_disarm_kprobe(&op->kp); | ||
526 | if (kprobe_unused(&op->kp)) { | ||
527 | /* | ||
528 | * Remove unused probes from hash list. After waiting | ||
529 | * for synchronization, these probes are reclaimed. | ||
530 | * (reclaiming is done by do_free_cleaned_kprobes.) | ||
531 | */ | ||
532 | hlist_del_rcu(&op->kp.hlist); | ||
533 | } else | ||
534 | list_del_init(&op->list); | ||
467 | } | 535 | } |
468 | mutex_unlock(&text_mutex); | 536 | mutex_unlock(&text_mutex); |
469 | put_online_cpus(); | 537 | put_online_cpus(); |
470 | end: | 538 | } |
539 | |||
540 | /* Reclaim all kprobes on the free_list */ | ||
541 | static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) | ||
542 | { | ||
543 | struct optimized_kprobe *op, *tmp; | ||
544 | |||
545 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
546 | BUG_ON(!kprobe_unused(&op->kp)); | ||
547 | list_del_init(&op->list); | ||
548 | free_aggr_kprobe(&op->kp); | ||
549 | } | ||
550 | } | ||
551 | |||
552 | /* Start optimizer after OPTIMIZE_DELAY passed */ | ||
553 | static __kprobes void kick_kprobe_optimizer(void) | ||
554 | { | ||
555 | if (!delayed_work_pending(&optimizing_work)) | ||
556 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
557 | } | ||
558 | |||
559 | /* Kprobe jump optimizer */ | ||
560 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
561 | { | ||
562 | LIST_HEAD(free_list); | ||
563 | |||
564 | /* Lock modules while optimizing kprobes */ | ||
565 | mutex_lock(&module_mutex); | ||
566 | mutex_lock(&kprobe_mutex); | ||
567 | |||
568 | /* | ||
569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | ||
570 | * kprobes before waiting for quiescence period. | ||
571 | */ | ||
572 | do_unoptimize_kprobes(&free_list); | ||
573 | |||
574 | /* | ||
575 | * Step 2: Wait for quiescence period to ensure all running interrupts | ||
576 | * are done. Because optprobe may modify multiple instructions | ||
577 | * there is a chance that Nth instruction is interrupted. In that | ||
578 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
579 | * instruction. This wait is for avoiding it. | ||
580 | */ | ||
581 | synchronize_sched(); | ||
582 | |||
583 | /* Step 3: Optimize kprobes after quiescence period */ | ||
584 | do_optimize_kprobes(); | ||
585 | |||
586 | /* Step 4: Free cleaned kprobes after quiescence period */ | ||
587 | do_free_cleaned_kprobes(&free_list); | ||
588 | |||
471 | mutex_unlock(&kprobe_mutex); | 589 | mutex_unlock(&kprobe_mutex); |
472 | mutex_unlock(&module_mutex); | 590 | mutex_unlock(&module_mutex); |
591 | |||
592 | /* Step 5: Kick optimizer again if needed */ | ||
593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | ||
594 | kick_kprobe_optimizer(); | ||
595 | else | ||
596 | /* Wake up all waiters */ | ||
597 | complete_all(&optimizer_comp); | ||
598 | } | ||
599 | |||
600 | /* Wait for completing optimization and unoptimization */ | ||
601 | static __kprobes void wait_for_kprobe_optimizer(void) | ||
602 | { | ||
603 | if (delayed_work_pending(&optimizing_work)) | ||
604 | wait_for_completion(&optimizer_comp); | ||
473 | } | 605 | } |
474 | 606 | ||
475 | /* Optimize kprobe if p is ready to be optimized */ | 607 | /* Optimize kprobe if p is ready to be optimized */ |
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p) | |||
495 | /* Check if it is already optimized. */ | 627 | /* Check if it is already optimized. */ |
496 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | 628 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) |
497 | return; | 629 | return; |
498 | |||
499 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | 630 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; |
500 | list_add(&op->list, &optimizing_list); | 631 | |
501 | if (!delayed_work_pending(&optimizing_work)) | 632 | if (!list_empty(&op->list)) |
502 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | 633 | /* This is under unoptimizing. Just dequeue the probe */ |
634 | list_del_init(&op->list); | ||
635 | else { | ||
636 | list_add(&op->list, &optimizing_list); | ||
637 | kick_kprobe_optimizer(); | ||
638 | } | ||
639 | } | ||
640 | |||
641 | /* Short cut to direct unoptimizing */ | ||
642 | static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) | ||
643 | { | ||
644 | get_online_cpus(); | ||
645 | arch_unoptimize_kprobe(op); | ||
646 | put_online_cpus(); | ||
647 | if (kprobe_disabled(&op->kp)) | ||
648 | arch_disarm_kprobe(&op->kp); | ||
503 | } | 649 | } |
504 | 650 | ||
505 | /* Unoptimize a kprobe if p is optimized */ | 651 | /* Unoptimize a kprobe if p is optimized */ |
506 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | 652 | static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) |
507 | { | 653 | { |
508 | struct optimized_kprobe *op; | 654 | struct optimized_kprobe *op; |
509 | 655 | ||
510 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | 656 | if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) |
511 | op = container_of(p, struct optimized_kprobe, kp); | 657 | return; /* This is not an optprobe nor optimized */ |
512 | if (!list_empty(&op->list)) | 658 | |
513 | /* Dequeue from the optimization queue */ | 659 | op = container_of(p, struct optimized_kprobe, kp); |
660 | if (!kprobe_optimized(p)) { | ||
661 | /* Unoptimized or unoptimizing case */ | ||
662 | if (force && !list_empty(&op->list)) { | ||
663 | /* | ||
664 | * Only if this is unoptimizing kprobe and forced, | ||
665 | * forcibly unoptimize it. (No need to unoptimize | ||
666 | * unoptimized kprobe again :) | ||
667 | */ | ||
514 | list_del_init(&op->list); | 668 | list_del_init(&op->list); |
515 | else | 669 | force_unoptimize_kprobe(op); |
516 | /* Replace jump with break */ | 670 | } |
517 | arch_unoptimize_kprobe(op); | 671 | return; |
518 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 672 | } |
673 | |||
674 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
675 | if (!list_empty(&op->list)) { | ||
676 | /* Dequeue from the optimization queue */ | ||
677 | list_del_init(&op->list); | ||
678 | return; | ||
679 | } | ||
680 | /* Optimized kprobe case */ | ||
681 | if (force) | ||
682 | /* Forcibly update the code: this is a special case */ | ||
683 | force_unoptimize_kprobe(op); | ||
684 | else { | ||
685 | list_add(&op->list, &unoptimizing_list); | ||
686 | kick_kprobe_optimizer(); | ||
519 | } | 687 | } |
520 | } | 688 | } |
521 | 689 | ||
690 | /* Cancel unoptimizing for reusing */ | ||
691 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
692 | { | ||
693 | struct optimized_kprobe *op; | ||
694 | |||
695 | BUG_ON(!kprobe_unused(ap)); | ||
696 | /* | ||
697 | * Unused kprobe MUST be on the way of delayed unoptimizing (means | ||
698 | * there is still a relative jump) and disabled. | ||
699 | */ | ||
700 | op = container_of(ap, struct optimized_kprobe, kp); | ||
701 | if (unlikely(list_empty(&op->list))) | ||
702 | printk(KERN_WARNING "Warning: found a stray unused " | ||
703 | "aggrprobe@%p\n", ap->addr); | ||
704 | /* Enable the probe again */ | ||
705 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
706 | /* Optimize it again (remove from op->list) */ | ||
707 | BUG_ON(!kprobe_optready(ap)); | ||
708 | optimize_kprobe(ap); | ||
709 | } | ||
710 | |||
522 | /* Remove optimized instructions */ | 711 | /* Remove optimized instructions */ |
523 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | 712 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) |
524 | { | 713 | { |
525 | struct optimized_kprobe *op; | 714 | struct optimized_kprobe *op; |
526 | 715 | ||
527 | op = container_of(p, struct optimized_kprobe, kp); | 716 | op = container_of(p, struct optimized_kprobe, kp); |
528 | if (!list_empty(&op->list)) { | 717 | if (!list_empty(&op->list)) |
529 | /* Dequeue from the optimization queue */ | 718 | /* Dequeue from the (un)optimization queue */ |
530 | list_del_init(&op->list); | 719 | list_del_init(&op->list); |
531 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 720 | |
532 | } | 721 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; |
533 | /* Don't unoptimize, because the target code will be freed. */ | 722 | /* Don't touch the code, because it is already freed. */ |
534 | arch_remove_optimized_kprobe(op); | 723 | arch_remove_optimized_kprobe(op); |
535 | } | 724 | } |
536 | 725 | ||
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | |||
543 | arch_prepare_optimized_kprobe(op); | 732 | arch_prepare_optimized_kprobe(op); |
544 | } | 733 | } |
545 | 734 | ||
546 | /* Free optimized instructions and optimized_kprobe */ | ||
547 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
548 | { | ||
549 | struct optimized_kprobe *op; | ||
550 | |||
551 | op = container_of(p, struct optimized_kprobe, kp); | ||
552 | arch_remove_optimized_kprobe(op); | ||
553 | kfree(op); | ||
554 | } | ||
555 | |||
556 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 735 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
557 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | 736 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) |
558 | { | 737 | { |
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
587 | op = container_of(ap, struct optimized_kprobe, kp); | 766 | op = container_of(ap, struct optimized_kprobe, kp); |
588 | if (!arch_prepared_optinsn(&op->optinsn)) { | 767 | if (!arch_prepared_optinsn(&op->optinsn)) { |
589 | /* If failed to setup optimizing, fallback to kprobe */ | 768 | /* If failed to setup optimizing, fallback to kprobe */ |
590 | free_aggr_kprobe(ap); | 769 | arch_remove_optimized_kprobe(op); |
770 | kfree(op); | ||
591 | return; | 771 | return; |
592 | } | 772 | } |
593 | 773 | ||
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
631 | return; | 811 | return; |
632 | 812 | ||
633 | kprobes_allow_optimization = false; | 813 | kprobes_allow_optimization = false; |
634 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
635 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
636 | mutex_lock(&text_mutex); | ||
637 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 814 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
638 | head = &kprobe_table[i]; | 815 | head = &kprobe_table[i]; |
639 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 816 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
640 | if (!kprobe_disabled(p)) | 817 | if (!kprobe_disabled(p)) |
641 | unoptimize_kprobe(p); | 818 | unoptimize_kprobe(p, false); |
642 | } | 819 | } |
643 | } | 820 | } |
644 | 821 | /* Wait for unoptimizing completion */ | |
645 | mutex_unlock(&text_mutex); | 822 | wait_for_kprobe_optimizer(); |
646 | put_online_cpus(); | 823 | printk(KERN_INFO "Kprobes globally unoptimized\n"); |
647 | /* Allow all currently running kprobes to complete */ | ||
648 | synchronize_sched(); | ||
649 | } | 824 | } |
650 | 825 | ||
651 | int sysctl_kprobes_optimization; | 826 | int sysctl_kprobes_optimization; |
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
669 | } | 844 | } |
670 | #endif /* CONFIG_SYSCTL */ | 845 | #endif /* CONFIG_SYSCTL */ |
671 | 846 | ||
847 | /* Put a breakpoint for a probe. Must be called with text_mutex locked */ | ||
672 | static void __kprobes __arm_kprobe(struct kprobe *p) | 848 | static void __kprobes __arm_kprobe(struct kprobe *p) |
673 | { | 849 | { |
674 | struct kprobe *old_p; | 850 | struct kprobe *_p; |
675 | 851 | ||
676 | /* Check collision with other optimized kprobes */ | 852 | /* Check collision with other optimized kprobes */ |
677 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 853 | _p = get_optimized_kprobe((unsigned long)p->addr); |
678 | if (unlikely(old_p)) | 854 | if (unlikely(_p)) |
679 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | 855 | /* Fallback to unoptimized kprobe */ |
856 | unoptimize_kprobe(_p, true); | ||
680 | 857 | ||
681 | arch_arm_kprobe(p); | 858 | arch_arm_kprobe(p); |
682 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | 859 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ |
683 | } | 860 | } |
684 | 861 | ||
685 | static void __kprobes __disarm_kprobe(struct kprobe *p) | 862 | /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ |
863 | static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) | ||
686 | { | 864 | { |
687 | struct kprobe *old_p; | 865 | struct kprobe *_p; |
688 | 866 | ||
689 | unoptimize_kprobe(p); /* Try to unoptimize */ | 867 | unoptimize_kprobe(p, false); /* Try to unoptimize */ |
690 | arch_disarm_kprobe(p); | ||
691 | 868 | ||
692 | /* If another kprobe was blocked, optimize it. */ | 869 | if (!kprobe_queued(p)) { |
693 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 870 | arch_disarm_kprobe(p); |
694 | if (unlikely(old_p)) | 871 | /* If another kprobe was blocked, optimize it. */ |
695 | optimize_kprobe(old_p); | 872 | _p = get_optimized_kprobe((unsigned long)p->addr); |
873 | if (unlikely(_p) && reopt) | ||
874 | optimize_kprobe(_p); | ||
875 | } | ||
876 | /* TODO: reoptimize others after unoptimized this probe */ | ||
696 | } | 877 | } |
697 | 878 | ||
698 | #else /* !CONFIG_OPTPROBES */ | 879 | #else /* !CONFIG_OPTPROBES */ |
699 | 880 | ||
700 | #define optimize_kprobe(p) do {} while (0) | 881 | #define optimize_kprobe(p) do {} while (0) |
701 | #define unoptimize_kprobe(p) do {} while (0) | 882 | #define unoptimize_kprobe(p, f) do {} while (0) |
702 | #define kill_optimized_kprobe(p) do {} while (0) | 883 | #define kill_optimized_kprobe(p) do {} while (0) |
703 | #define prepare_optimized_kprobe(p) do {} while (0) | 884 | #define prepare_optimized_kprobe(p) do {} while (0) |
704 | #define try_to_optimize_kprobe(p) do {} while (0) | 885 | #define try_to_optimize_kprobe(p) do {} while (0) |
705 | #define __arm_kprobe(p) arch_arm_kprobe(p) | 886 | #define __arm_kprobe(p) arch_arm_kprobe(p) |
706 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | 887 | #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) |
888 | #define kprobe_disarmed(p) kprobe_disabled(p) | ||
889 | #define wait_for_kprobe_optimizer() do {} while (0) | ||
890 | |||
891 | /* Without optimization, there should be no unused kprobes that can be reused */ | ||
892 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
893 | { | ||
894 | printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); | ||
895 | BUG_ON(kprobe_unused(ap)); | ||
896 | } | ||
707 | 897 | ||
708 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | 898 | static __kprobes void free_aggr_kprobe(struct kprobe *p) |
709 | { | 899 | { |
900 | arch_remove_kprobe(p); | ||
710 | kfree(p); | 901 | kfree(p); |
711 | } | 902 | } |
712 | 903 | ||
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
732 | /* Disarm a kprobe with text_mutex */ | 923 | /* Disarm a kprobe with text_mutex */ |
733 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 924 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
734 | { | 925 | { |
735 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | 926 | /* Ditto */ |
736 | mutex_lock(&text_mutex); | 927 | mutex_lock(&text_mutex); |
737 | __disarm_kprobe(kp); | 928 | __disarm_kprobe(kp, true); |
738 | mutex_unlock(&text_mutex); | 929 | mutex_unlock(&text_mutex); |
739 | put_online_cpus(); | ||
740 | } | 930 | } |
741 | 931 | ||
742 | /* | 932 | /* |
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
942 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 1132 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
943 | 1133 | ||
944 | if (p->break_handler || p->post_handler) | 1134 | if (p->break_handler || p->post_handler) |
945 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | 1135 | unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ |
946 | 1136 | ||
947 | if (p->break_handler) { | 1137 | if (p->break_handler) { |
948 | if (ap->break_handler) | 1138 | if (ap->break_handler) |
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
993 | * This is the second or subsequent kprobe at the address - handle | 1183 | * This is the second or subsequent kprobe at the address - handle |
994 | * the intricacies | 1184 | * the intricacies |
995 | */ | 1185 | */ |
996 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 1186 | static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, |
997 | struct kprobe *p) | 1187 | struct kprobe *p) |
998 | { | 1188 | { |
999 | int ret = 0; | 1189 | int ret = 0; |
1000 | struct kprobe *ap = old_p; | 1190 | struct kprobe *ap = orig_p; |
1001 | 1191 | ||
1002 | if (!kprobe_aggrprobe(old_p)) { | 1192 | if (!kprobe_aggrprobe(orig_p)) { |
1003 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ | 1193 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ |
1004 | ap = alloc_aggr_kprobe(old_p); | 1194 | ap = alloc_aggr_kprobe(orig_p); |
1005 | if (!ap) | 1195 | if (!ap) |
1006 | return -ENOMEM; | 1196 | return -ENOMEM; |
1007 | init_aggr_kprobe(ap, old_p); | 1197 | init_aggr_kprobe(ap, orig_p); |
1008 | } | 1198 | } else if (kprobe_unused(ap)) |
1199 | /* This probe is going to die. Rescue it */ | ||
1200 | reuse_unused_kprobe(ap); | ||
1009 | 1201 | ||
1010 | if (kprobe_gone(ap)) { | 1202 | if (kprobe_gone(ap)) { |
1011 | /* | 1203 | /* |
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
1039 | return add_new_kprobe(ap, p); | 1231 | return add_new_kprobe(ap, p); |
1040 | } | 1232 | } |
1041 | 1233 | ||
1042 | /* Try to disable aggr_kprobe, and return 1 if succeeded.*/ | ||
1043 | static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) | ||
1044 | { | ||
1045 | struct kprobe *kp; | ||
1046 | |||
1047 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
1048 | if (!kprobe_disabled(kp)) | ||
1049 | /* | ||
1050 | * There is an active probe on the list. | ||
1051 | * We can't disable aggr_kprobe. | ||
1052 | */ | ||
1053 | return 0; | ||
1054 | } | ||
1055 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1056 | return 1; | ||
1057 | } | ||
1058 | |||
1059 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1234 | static int __kprobes in_kprobes_functions(unsigned long addr) |
1060 | { | 1235 | { |
1061 | struct kprobe_blackpoint *kb; | 1236 | struct kprobe_blackpoint *kb; |
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | |||
1098 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1273 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
1099 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) | 1274 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) |
1100 | { | 1275 | { |
1101 | struct kprobe *old_p, *list_p; | 1276 | struct kprobe *ap, *list_p; |
1102 | 1277 | ||
1103 | old_p = get_kprobe(p->addr); | 1278 | ap = get_kprobe(p->addr); |
1104 | if (unlikely(!old_p)) | 1279 | if (unlikely(!ap)) |
1105 | return NULL; | 1280 | return NULL; |
1106 | 1281 | ||
1107 | if (p != old_p) { | 1282 | if (p != ap) { |
1108 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 1283 | list_for_each_entry_rcu(list_p, &ap->list, list) |
1109 | if (list_p == p) | 1284 | if (list_p == p) |
1110 | /* kprobe p is a valid probe */ | 1285 | /* kprobe p is a valid probe */ |
1111 | goto valid; | 1286 | goto valid; |
1112 | return NULL; | 1287 | return NULL; |
1113 | } | 1288 | } |
1114 | valid: | 1289 | valid: |
1115 | return old_p; | 1290 | return ap; |
1116 | } | 1291 | } |
1117 | 1292 | ||
1118 | /* Return error if the kprobe is being re-registered */ | 1293 | /* Return error if the kprobe is being re-registered */ |
1119 | static inline int check_kprobe_rereg(struct kprobe *p) | 1294 | static inline int check_kprobe_rereg(struct kprobe *p) |
1120 | { | 1295 | { |
1121 | int ret = 0; | 1296 | int ret = 0; |
1122 | struct kprobe *old_p; | ||
1123 | 1297 | ||
1124 | mutex_lock(&kprobe_mutex); | 1298 | mutex_lock(&kprobe_mutex); |
1125 | old_p = __get_valid_kprobe(p); | 1299 | if (__get_valid_kprobe(p)) |
1126 | if (old_p) | ||
1127 | ret = -EINVAL; | 1300 | ret = -EINVAL; |
1128 | mutex_unlock(&kprobe_mutex); | 1301 | mutex_unlock(&kprobe_mutex); |
1302 | |||
1129 | return ret; | 1303 | return ret; |
1130 | } | 1304 | } |
1131 | 1305 | ||
@@ -1229,67 +1403,121 @@ fail_with_jump_label: | |||
1229 | } | 1403 | } |
1230 | EXPORT_SYMBOL_GPL(register_kprobe); | 1404 | EXPORT_SYMBOL_GPL(register_kprobe); |
1231 | 1405 | ||
1406 | /* Check if all probes on the aggrprobe are disabled */ | ||
1407 | static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) | ||
1408 | { | ||
1409 | struct kprobe *kp; | ||
1410 | |||
1411 | list_for_each_entry_rcu(kp, &ap->list, list) | ||
1412 | if (!kprobe_disabled(kp)) | ||
1413 | /* | ||
1414 | * There is an active probe on the list. | ||
1415 | * We can't disable this ap. | ||
1416 | */ | ||
1417 | return 0; | ||
1418 | |||
1419 | return 1; | ||
1420 | } | ||
1421 | |||
1422 | /* Disable one kprobe: Make sure called under kprobe_mutex is locked */ | ||
1423 | static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | ||
1424 | { | ||
1425 | struct kprobe *orig_p; | ||
1426 | |||
1427 | /* Get an original kprobe for return */ | ||
1428 | orig_p = __get_valid_kprobe(p); | ||
1429 | if (unlikely(orig_p == NULL)) | ||
1430 | return NULL; | ||
1431 | |||
1432 | if (!kprobe_disabled(p)) { | ||
1433 | /* Disable probe if it is a child probe */ | ||
1434 | if (p != orig_p) | ||
1435 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1436 | |||
1437 | /* Try to disarm and disable this/parent probe */ | ||
1438 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | ||
1439 | disarm_kprobe(orig_p); | ||
1440 | orig_p->flags |= KPROBE_FLAG_DISABLED; | ||
1441 | } | ||
1442 | } | ||
1443 | |||
1444 | return orig_p; | ||
1445 | } | ||
1446 | |||
1232 | /* | 1447 | /* |
1233 | * Unregister a kprobe without a scheduler synchronization. | 1448 | * Unregister a kprobe without a scheduler synchronization. |
1234 | */ | 1449 | */ |
1235 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) | 1450 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) |
1236 | { | 1451 | { |
1237 | struct kprobe *old_p, *list_p; | 1452 | struct kprobe *ap, *list_p; |
1238 | 1453 | ||
1239 | old_p = __get_valid_kprobe(p); | 1454 | /* Disable kprobe. This will disarm it if needed. */ |
1240 | if (old_p == NULL) | 1455 | ap = __disable_kprobe(p); |
1456 | if (ap == NULL) | ||
1241 | return -EINVAL; | 1457 | return -EINVAL; |
1242 | 1458 | ||
1243 | if (old_p == p || | 1459 | if (ap == p) |
1244 | (kprobe_aggrprobe(old_p) && | ||
1245 | list_is_singular(&old_p->list))) { | ||
1246 | /* | 1460 | /* |
1247 | * Only probe on the hash list. Disarm only if kprobes are | 1461 | * This probe is an independent(and non-optimized) kprobe |
1248 | * enabled and not gone - otherwise, the breakpoint would | 1462 | * (not an aggrprobe). Remove from the hash list. |
1249 | * already have been removed. We save on flushing icache. | ||
1250 | */ | 1463 | */ |
1251 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1464 | goto disarmed; |
1252 | disarm_kprobe(old_p); | 1465 | |
1253 | hlist_del_rcu(&old_p->hlist); | 1466 | /* Following process expects this probe is an aggrprobe */ |
1254 | } else { | 1467 | WARN_ON(!kprobe_aggrprobe(ap)); |
1468 | |||
1469 | if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) | ||
1470 | /* | ||
1471 | * !disarmed could be happen if the probe is under delayed | ||
1472 | * unoptimizing. | ||
1473 | */ | ||
1474 | goto disarmed; | ||
1475 | else { | ||
1476 | /* If disabling probe has special handlers, update aggrprobe */ | ||
1255 | if (p->break_handler && !kprobe_gone(p)) | 1477 | if (p->break_handler && !kprobe_gone(p)) |
1256 | old_p->break_handler = NULL; | 1478 | ap->break_handler = NULL; |
1257 | if (p->post_handler && !kprobe_gone(p)) { | 1479 | if (p->post_handler && !kprobe_gone(p)) { |
1258 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | 1480 | list_for_each_entry_rcu(list_p, &ap->list, list) { |
1259 | if ((list_p != p) && (list_p->post_handler)) | 1481 | if ((list_p != p) && (list_p->post_handler)) |
1260 | goto noclean; | 1482 | goto noclean; |
1261 | } | 1483 | } |
1262 | old_p->post_handler = NULL; | 1484 | ap->post_handler = NULL; |
1263 | } | 1485 | } |
1264 | noclean: | 1486 | noclean: |
1487 | /* | ||
1488 | * Remove from the aggrprobe: this path will do nothing in | ||
1489 | * __unregister_kprobe_bottom(). | ||
1490 | */ | ||
1265 | list_del_rcu(&p->list); | 1491 | list_del_rcu(&p->list); |
1266 | if (!kprobe_disabled(old_p)) { | 1492 | if (!kprobe_disabled(ap) && !kprobes_all_disarmed) |
1267 | try_to_disable_aggr_kprobe(old_p); | 1493 | /* |
1268 | if (!kprobes_all_disarmed) { | 1494 | * Try to optimize this probe again, because post |
1269 | if (kprobe_disabled(old_p)) | 1495 | * handler may have been changed. |
1270 | disarm_kprobe(old_p); | 1496 | */ |
1271 | else | 1497 | optimize_kprobe(ap); |
1272 | /* Try to optimize this probe again */ | ||
1273 | optimize_kprobe(old_p); | ||
1274 | } | ||
1275 | } | ||
1276 | } | 1498 | } |
1277 | return 0; | 1499 | return 0; |
1500 | |||
1501 | disarmed: | ||
1502 | BUG_ON(!kprobe_disarmed(ap)); | ||
1503 | hlist_del_rcu(&ap->hlist); | ||
1504 | return 0; | ||
1278 | } | 1505 | } |
1279 | 1506 | ||
1280 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | 1507 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
1281 | { | 1508 | { |
1282 | struct kprobe *old_p; | 1509 | struct kprobe *ap; |
1283 | 1510 | ||
1284 | if (list_empty(&p->list)) | 1511 | if (list_empty(&p->list)) |
1512 | /* This is an independent kprobe */ | ||
1285 | arch_remove_kprobe(p); | 1513 | arch_remove_kprobe(p); |
1286 | else if (list_is_singular(&p->list)) { | 1514 | else if (list_is_singular(&p->list)) { |
1287 | /* "p" is the last child of an aggr_kprobe */ | 1515 | /* This is the last child of an aggrprobe */ |
1288 | old_p = list_entry(p->list.next, struct kprobe, list); | 1516 | ap = list_entry(p->list.next, struct kprobe, list); |
1289 | list_del(&p->list); | 1517 | list_del(&p->list); |
1290 | arch_remove_kprobe(old_p); | 1518 | free_aggr_kprobe(ap); |
1291 | free_aggr_kprobe(old_p); | ||
1292 | } | 1519 | } |
1520 | /* Otherwise, do nothing. */ | ||
1293 | } | 1521 | } |
1294 | 1522 | ||
1295 | int __kprobes register_kprobes(struct kprobe **kps, int num) | 1523 | int __kprobes register_kprobes(struct kprobe **kps, int num) |
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1607 | int __kprobes disable_kprobe(struct kprobe *kp) | 1835 | int __kprobes disable_kprobe(struct kprobe *kp) |
1608 | { | 1836 | { |
1609 | int ret = 0; | 1837 | int ret = 0; |
1610 | struct kprobe *p; | ||
1611 | 1838 | ||
1612 | mutex_lock(&kprobe_mutex); | 1839 | mutex_lock(&kprobe_mutex); |
1613 | 1840 | ||
1614 | /* Check whether specified probe is valid. */ | 1841 | /* Disable this kprobe */ |
1615 | p = __get_valid_kprobe(kp); | 1842 | if (__disable_kprobe(kp) == NULL) |
1616 | if (unlikely(p == NULL)) { | ||
1617 | ret = -EINVAL; | 1843 | ret = -EINVAL; |
1618 | goto out; | ||
1619 | } | ||
1620 | 1844 | ||
1621 | /* If the probe is already disabled (or gone), just return */ | ||
1622 | if (kprobe_disabled(kp)) | ||
1623 | goto out; | ||
1624 | |||
1625 | kp->flags |= KPROBE_FLAG_DISABLED; | ||
1626 | if (p != kp) | ||
1627 | /* When kp != p, p is always enabled. */ | ||
1628 | try_to_disable_aggr_kprobe(p); | ||
1629 | |||
1630 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
1631 | disarm_kprobe(p); | ||
1632 | out: | ||
1633 | mutex_unlock(&kprobe_mutex); | 1845 | mutex_unlock(&kprobe_mutex); |
1634 | return ret; | 1846 | return ret; |
1635 | } | 1847 | } |
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void) | |||
1927 | mutex_lock(&kprobe_mutex); | 2139 | mutex_lock(&kprobe_mutex); |
1928 | 2140 | ||
1929 | /* If kprobes are already disarmed, just return */ | 2141 | /* If kprobes are already disarmed, just return */ |
1930 | if (kprobes_all_disarmed) | 2142 | if (kprobes_all_disarmed) { |
1931 | goto already_disabled; | 2143 | mutex_unlock(&kprobe_mutex); |
2144 | return; | ||
2145 | } | ||
1932 | 2146 | ||
1933 | kprobes_all_disarmed = true; | 2147 | kprobes_all_disarmed = true; |
1934 | printk(KERN_INFO "Kprobes globally disabled\n"); | 2148 | printk(KERN_INFO "Kprobes globally disabled\n"); |
1935 | 2149 | ||
1936 | /* | ||
1937 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
1938 | * because disarming may also unoptimize kprobes. | ||
1939 | */ | ||
1940 | get_online_cpus(); | ||
1941 | mutex_lock(&text_mutex); | 2150 | mutex_lock(&text_mutex); |
1942 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2151 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1943 | head = &kprobe_table[i]; | 2152 | head = &kprobe_table[i]; |
1944 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2153 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1945 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2154 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
1946 | __disarm_kprobe(p); | 2155 | __disarm_kprobe(p, false); |
1947 | } | 2156 | } |
1948 | } | 2157 | } |
1949 | |||
1950 | mutex_unlock(&text_mutex); | 2158 | mutex_unlock(&text_mutex); |
1951 | put_online_cpus(); | ||
1952 | mutex_unlock(&kprobe_mutex); | 2159 | mutex_unlock(&kprobe_mutex); |
1953 | /* Allow all currently running kprobes to complete */ | ||
1954 | synchronize_sched(); | ||
1955 | return; | ||
1956 | 2160 | ||
1957 | already_disabled: | 2161 | /* Wait for disarming all kprobes by optimizer */ |
1958 | mutex_unlock(&kprobe_mutex); | 2162 | wait_for_kprobe_optimizer(); |
1959 | return; | ||
1960 | } | 2163 | } |
1961 | 2164 | ||
1962 | /* | 2165 | /* |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2870feee81dd..11847bf1e8cc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/smp.h> | 15 | #include <linux/smp.h> |
16 | #include <linux/idr.h> | ||
16 | #include <linux/file.h> | 17 | #include <linux/file.h> |
17 | #include <linux/poll.h> | 18 | #include <linux/poll.h> |
18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
@@ -21,7 +22,9 @@ | |||
21 | #include <linux/dcache.h> | 22 | #include <linux/dcache.h> |
22 | #include <linux/percpu.h> | 23 | #include <linux/percpu.h> |
23 | #include <linux/ptrace.h> | 24 | #include <linux/ptrace.h> |
25 | #include <linux/reboot.h> | ||
24 | #include <linux/vmstat.h> | 26 | #include <linux/vmstat.h> |
27 | #include <linux/device.h> | ||
25 | #include <linux/vmalloc.h> | 28 | #include <linux/vmalloc.h> |
26 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
27 | #include <linux/rculist.h> | 30 | #include <linux/rculist.h> |
@@ -133,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
133 | } | 136 | } |
134 | } | 137 | } |
135 | 138 | ||
139 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
140 | { | ||
141 | /* | ||
142 | * only top level events have the pid namespace they were created in | ||
143 | */ | ||
144 | if (event->parent) | ||
145 | event = event->parent; | ||
146 | |||
147 | return task_tgid_nr_ns(p, event->ns); | ||
148 | } | ||
149 | |||
150 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
151 | { | ||
152 | /* | ||
153 | * only top level events have the pid namespace they were created in | ||
154 | */ | ||
155 | if (event->parent) | ||
156 | event = event->parent; | ||
157 | |||
158 | return task_pid_nr_ns(p, event->ns); | ||
159 | } | ||
160 | |||
136 | /* | 161 | /* |
137 | * If we inherit events we want to return the parent event id | 162 | * If we inherit events we want to return the parent event id |
138 | * to userspace. | 163 | * to userspace. |
@@ -312,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
312 | ctx->nr_stat++; | 337 | ctx->nr_stat++; |
313 | } | 338 | } |
314 | 339 | ||
340 | /* | ||
341 | * Called at perf_event creation and when events are attached/detached from a | ||
342 | * group. | ||
343 | */ | ||
344 | static void perf_event__read_size(struct perf_event *event) | ||
345 | { | ||
346 | int entry = sizeof(u64); /* value */ | ||
347 | int size = 0; | ||
348 | int nr = 1; | ||
349 | |||
350 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
351 | size += sizeof(u64); | ||
352 | |||
353 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
354 | size += sizeof(u64); | ||
355 | |||
356 | if (event->attr.read_format & PERF_FORMAT_ID) | ||
357 | entry += sizeof(u64); | ||
358 | |||
359 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
360 | nr += event->group_leader->nr_siblings; | ||
361 | size += sizeof(u64); | ||
362 | } | ||
363 | |||
364 | size += entry * nr; | ||
365 | event->read_size = size; | ||
366 | } | ||
367 | |||
368 | static void perf_event__header_size(struct perf_event *event) | ||
369 | { | ||
370 | struct perf_sample_data *data; | ||
371 | u64 sample_type = event->attr.sample_type; | ||
372 | u16 size = 0; | ||
373 | |||
374 | perf_event__read_size(event); | ||
375 | |||
376 | if (sample_type & PERF_SAMPLE_IP) | ||
377 | size += sizeof(data->ip); | ||
378 | |||
379 | if (sample_type & PERF_SAMPLE_ADDR) | ||
380 | size += sizeof(data->addr); | ||
381 | |||
382 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
383 | size += sizeof(data->period); | ||
384 | |||
385 | if (sample_type & PERF_SAMPLE_READ) | ||
386 | size += event->read_size; | ||
387 | |||
388 | event->header_size = size; | ||
389 | } | ||
390 | |||
391 | static void perf_event__id_header_size(struct perf_event *event) | ||
392 | { | ||
393 | struct perf_sample_data *data; | ||
394 | u64 sample_type = event->attr.sample_type; | ||
395 | u16 size = 0; | ||
396 | |||
397 | if (sample_type & PERF_SAMPLE_TID) | ||
398 | size += sizeof(data->tid_entry); | ||
399 | |||
400 | if (sample_type & PERF_SAMPLE_TIME) | ||
401 | size += sizeof(data->time); | ||
402 | |||
403 | if (sample_type & PERF_SAMPLE_ID) | ||
404 | size += sizeof(data->id); | ||
405 | |||
406 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
407 | size += sizeof(data->stream_id); | ||
408 | |||
409 | if (sample_type & PERF_SAMPLE_CPU) | ||
410 | size += sizeof(data->cpu_entry); | ||
411 | |||
412 | event->id_header_size = size; | ||
413 | } | ||
414 | |||
315 | static void perf_group_attach(struct perf_event *event) | 415 | static void perf_group_attach(struct perf_event *event) |
316 | { | 416 | { |
317 | struct perf_event *group_leader = event->group_leader; | 417 | struct perf_event *group_leader = event->group_leader, *pos; |
318 | 418 | ||
319 | /* | 419 | /* |
320 | * We can have double attach due to group movement in perf_event_open. | 420 | * We can have double attach due to group movement in perf_event_open. |
@@ -333,6 +433,11 @@ static void perf_group_attach(struct perf_event *event) | |||
333 | 433 | ||
334 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 434 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
335 | group_leader->nr_siblings++; | 435 | group_leader->nr_siblings++; |
436 | |||
437 | perf_event__header_size(group_leader); | ||
438 | |||
439 | list_for_each_entry(pos, &group_leader->sibling_list, group_entry) | ||
440 | perf_event__header_size(pos); | ||
336 | } | 441 | } |
337 | 442 | ||
338 | /* | 443 | /* |
@@ -391,7 +496,7 @@ static void perf_group_detach(struct perf_event *event) | |||
391 | if (event->group_leader != event) { | 496 | if (event->group_leader != event) { |
392 | list_del_init(&event->group_entry); | 497 | list_del_init(&event->group_entry); |
393 | event->group_leader->nr_siblings--; | 498 | event->group_leader->nr_siblings--; |
394 | return; | 499 | goto out; |
395 | } | 500 | } |
396 | 501 | ||
397 | if (!list_empty(&event->group_entry)) | 502 | if (!list_empty(&event->group_entry)) |
@@ -410,6 +515,12 @@ static void perf_group_detach(struct perf_event *event) | |||
410 | /* Inherit group flags from the previous leader */ | 515 | /* Inherit group flags from the previous leader */ |
411 | sibling->group_flags = event->group_flags; | 516 | sibling->group_flags = event->group_flags; |
412 | } | 517 | } |
518 | |||
519 | out: | ||
520 | perf_event__header_size(event->group_leader); | ||
521 | |||
522 | list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) | ||
523 | perf_event__header_size(tmp); | ||
413 | } | 524 | } |
414 | 525 | ||
415 | static inline int | 526 | static inline int |
@@ -1073,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1073 | /* | 1184 | /* |
1074 | * not supported on inherited events | 1185 | * not supported on inherited events |
1075 | */ | 1186 | */ |
1076 | if (event->attr.inherit) | 1187 | if (event->attr.inherit || !is_sampling_event(event)) |
1077 | return -EINVAL; | 1188 | return -EINVAL; |
1078 | 1189 | ||
1079 | atomic_add(refresh, &event->event_limit); | 1190 | atomic_add(refresh, &event->event_limit); |
@@ -2289,31 +2400,6 @@ static int perf_release(struct inode *inode, struct file *file) | |||
2289 | return perf_event_release_kernel(event); | 2400 | return perf_event_release_kernel(event); |
2290 | } | 2401 | } |
2291 | 2402 | ||
2292 | static int perf_event_read_size(struct perf_event *event) | ||
2293 | { | ||
2294 | int entry = sizeof(u64); /* value */ | ||
2295 | int size = 0; | ||
2296 | int nr = 1; | ||
2297 | |||
2298 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
2299 | size += sizeof(u64); | ||
2300 | |||
2301 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
2302 | size += sizeof(u64); | ||
2303 | |||
2304 | if (event->attr.read_format & PERF_FORMAT_ID) | ||
2305 | entry += sizeof(u64); | ||
2306 | |||
2307 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
2308 | nr += event->group_leader->nr_siblings; | ||
2309 | size += sizeof(u64); | ||
2310 | } | ||
2311 | |||
2312 | size += entry * nr; | ||
2313 | |||
2314 | return size; | ||
2315 | } | ||
2316 | |||
2317 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 2403 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
2318 | { | 2404 | { |
2319 | struct perf_event *child; | 2405 | struct perf_event *child; |
@@ -2428,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) | |||
2428 | if (event->state == PERF_EVENT_STATE_ERROR) | 2514 | if (event->state == PERF_EVENT_STATE_ERROR) |
2429 | return 0; | 2515 | return 0; |
2430 | 2516 | ||
2431 | if (count < perf_event_read_size(event)) | 2517 | if (count < event->read_size) |
2432 | return -ENOSPC; | 2518 | return -ENOSPC; |
2433 | 2519 | ||
2434 | WARN_ON_ONCE(event->ctx->parent_ctx); | 2520 | WARN_ON_ONCE(event->ctx->parent_ctx); |
@@ -2514,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
2514 | int ret = 0; | 2600 | int ret = 0; |
2515 | u64 value; | 2601 | u64 value; |
2516 | 2602 | ||
2517 | if (!event->attr.sample_period) | 2603 | if (!is_sampling_event(event)) |
2518 | return -EINVAL; | 2604 | return -EINVAL; |
2519 | 2605 | ||
2520 | if (copy_from_user(&value, arg, sizeof(value))) | 2606 | if (copy_from_user(&value, arg, sizeof(value))) |
@@ -3305,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
3305 | } while (len); | 3391 | } while (len); |
3306 | } | 3392 | } |
3307 | 3393 | ||
3394 | static void __perf_event_header__init_id(struct perf_event_header *header, | ||
3395 | struct perf_sample_data *data, | ||
3396 | struct perf_event *event) | ||
3397 | { | ||
3398 | u64 sample_type = event->attr.sample_type; | ||
3399 | |||
3400 | data->type = sample_type; | ||
3401 | header->size += event->id_header_size; | ||
3402 | |||
3403 | if (sample_type & PERF_SAMPLE_TID) { | ||
3404 | /* namespace issues */ | ||
3405 | data->tid_entry.pid = perf_event_pid(event, current); | ||
3406 | data->tid_entry.tid = perf_event_tid(event, current); | ||
3407 | } | ||
3408 | |||
3409 | if (sample_type & PERF_SAMPLE_TIME) | ||
3410 | data->time = perf_clock(); | ||
3411 | |||
3412 | if (sample_type & PERF_SAMPLE_ID) | ||
3413 | data->id = primary_event_id(event); | ||
3414 | |||
3415 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
3416 | data->stream_id = event->id; | ||
3417 | |||
3418 | if (sample_type & PERF_SAMPLE_CPU) { | ||
3419 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
3420 | data->cpu_entry.reserved = 0; | ||
3421 | } | ||
3422 | } | ||
3423 | |||
3424 | static void perf_event_header__init_id(struct perf_event_header *header, | ||
3425 | struct perf_sample_data *data, | ||
3426 | struct perf_event *event) | ||
3427 | { | ||
3428 | if (event->attr.sample_id_all) | ||
3429 | __perf_event_header__init_id(header, data, event); | ||
3430 | } | ||
3431 | |||
3432 | static void __perf_event__output_id_sample(struct perf_output_handle *handle, | ||
3433 | struct perf_sample_data *data) | ||
3434 | { | ||
3435 | u64 sample_type = data->type; | ||
3436 | |||
3437 | if (sample_type & PERF_SAMPLE_TID) | ||
3438 | perf_output_put(handle, data->tid_entry); | ||
3439 | |||
3440 | if (sample_type & PERF_SAMPLE_TIME) | ||
3441 | perf_output_put(handle, data->time); | ||
3442 | |||
3443 | if (sample_type & PERF_SAMPLE_ID) | ||
3444 | perf_output_put(handle, data->id); | ||
3445 | |||
3446 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
3447 | perf_output_put(handle, data->stream_id); | ||
3448 | |||
3449 | if (sample_type & PERF_SAMPLE_CPU) | ||
3450 | perf_output_put(handle, data->cpu_entry); | ||
3451 | } | ||
3452 | |||
3453 | static void perf_event__output_id_sample(struct perf_event *event, | ||
3454 | struct perf_output_handle *handle, | ||
3455 | struct perf_sample_data *sample) | ||
3456 | { | ||
3457 | if (event->attr.sample_id_all) | ||
3458 | __perf_event__output_id_sample(handle, sample); | ||
3459 | } | ||
3460 | |||
3308 | int perf_output_begin(struct perf_output_handle *handle, | 3461 | int perf_output_begin(struct perf_output_handle *handle, |
3309 | struct perf_event *event, unsigned int size, | 3462 | struct perf_event *event, unsigned int size, |
3310 | int nmi, int sample) | 3463 | int nmi, int sample) |
@@ -3312,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3312 | struct perf_buffer *buffer; | 3465 | struct perf_buffer *buffer; |
3313 | unsigned long tail, offset, head; | 3466 | unsigned long tail, offset, head; |
3314 | int have_lost; | 3467 | int have_lost; |
3468 | struct perf_sample_data sample_data; | ||
3315 | struct { | 3469 | struct { |
3316 | struct perf_event_header header; | 3470 | struct perf_event_header header; |
3317 | u64 id; | 3471 | u64 id; |
@@ -3338,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3338 | goto out; | 3492 | goto out; |
3339 | 3493 | ||
3340 | have_lost = local_read(&buffer->lost); | 3494 | have_lost = local_read(&buffer->lost); |
3341 | if (have_lost) | 3495 | if (have_lost) { |
3342 | size += sizeof(lost_event); | 3496 | lost_event.header.size = sizeof(lost_event); |
3497 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
3498 | event); | ||
3499 | size += lost_event.header.size; | ||
3500 | } | ||
3343 | 3501 | ||
3344 | perf_output_get_handle(handle); | 3502 | perf_output_get_handle(handle); |
3345 | 3503 | ||
@@ -3370,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3370 | if (have_lost) { | 3528 | if (have_lost) { |
3371 | lost_event.header.type = PERF_RECORD_LOST; | 3529 | lost_event.header.type = PERF_RECORD_LOST; |
3372 | lost_event.header.misc = 0; | 3530 | lost_event.header.misc = 0; |
3373 | lost_event.header.size = sizeof(lost_event); | ||
3374 | lost_event.id = event->id; | 3531 | lost_event.id = event->id; |
3375 | lost_event.lost = local_xchg(&buffer->lost, 0); | 3532 | lost_event.lost = local_xchg(&buffer->lost, 0); |
3376 | 3533 | ||
3377 | perf_output_put(handle, lost_event); | 3534 | perf_output_put(handle, lost_event); |
3535 | perf_event__output_id_sample(event, handle, &sample_data); | ||
3378 | } | 3536 | } |
3379 | 3537 | ||
3380 | return 0; | 3538 | return 0; |
@@ -3407,28 +3565,6 @@ void perf_output_end(struct perf_output_handle *handle) | |||
3407 | rcu_read_unlock(); | 3565 | rcu_read_unlock(); |
3408 | } | 3566 | } |
3409 | 3567 | ||
3410 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
3411 | { | ||
3412 | /* | ||
3413 | * only top level events have the pid namespace they were created in | ||
3414 | */ | ||
3415 | if (event->parent) | ||
3416 | event = event->parent; | ||
3417 | |||
3418 | return task_tgid_nr_ns(p, event->ns); | ||
3419 | } | ||
3420 | |||
3421 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
3422 | { | ||
3423 | /* | ||
3424 | * only top level events have the pid namespace they were created in | ||
3425 | */ | ||
3426 | if (event->parent) | ||
3427 | event = event->parent; | ||
3428 | |||
3429 | return task_pid_nr_ns(p, event->ns); | ||
3430 | } | ||
3431 | |||
3432 | static void perf_output_read_one(struct perf_output_handle *handle, | 3568 | static void perf_output_read_one(struct perf_output_handle *handle, |
3433 | struct perf_event *event, | 3569 | struct perf_event *event, |
3434 | u64 enabled, u64 running) | 3570 | u64 enabled, u64 running) |
@@ -3603,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
3603 | { | 3739 | { |
3604 | u64 sample_type = event->attr.sample_type; | 3740 | u64 sample_type = event->attr.sample_type; |
3605 | 3741 | ||
3606 | data->type = sample_type; | ||
3607 | |||
3608 | header->type = PERF_RECORD_SAMPLE; | 3742 | header->type = PERF_RECORD_SAMPLE; |
3609 | header->size = sizeof(*header); | 3743 | header->size = sizeof(*header) + event->header_size; |
3610 | 3744 | ||
3611 | header->misc = 0; | 3745 | header->misc = 0; |
3612 | header->misc |= perf_misc_flags(regs); | 3746 | header->misc |= perf_misc_flags(regs); |
3613 | 3747 | ||
3614 | if (sample_type & PERF_SAMPLE_IP) { | 3748 | __perf_event_header__init_id(header, data, event); |
3615 | data->ip = perf_instruction_pointer(regs); | ||
3616 | |||
3617 | header->size += sizeof(data->ip); | ||
3618 | } | ||
3619 | |||
3620 | if (sample_type & PERF_SAMPLE_TID) { | ||
3621 | /* namespace issues */ | ||
3622 | data->tid_entry.pid = perf_event_pid(event, current); | ||
3623 | data->tid_entry.tid = perf_event_tid(event, current); | ||
3624 | |||
3625 | header->size += sizeof(data->tid_entry); | ||
3626 | } | ||
3627 | |||
3628 | if (sample_type & PERF_SAMPLE_TIME) { | ||
3629 | data->time = perf_clock(); | ||
3630 | |||
3631 | header->size += sizeof(data->time); | ||
3632 | } | ||
3633 | |||
3634 | if (sample_type & PERF_SAMPLE_ADDR) | ||
3635 | header->size += sizeof(data->addr); | ||
3636 | |||
3637 | if (sample_type & PERF_SAMPLE_ID) { | ||
3638 | data->id = primary_event_id(event); | ||
3639 | |||
3640 | header->size += sizeof(data->id); | ||
3641 | } | ||
3642 | |||
3643 | if (sample_type & PERF_SAMPLE_STREAM_ID) { | ||
3644 | data->stream_id = event->id; | ||
3645 | |||
3646 | header->size += sizeof(data->stream_id); | ||
3647 | } | ||
3648 | |||
3649 | if (sample_type & PERF_SAMPLE_CPU) { | ||
3650 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
3651 | data->cpu_entry.reserved = 0; | ||
3652 | |||
3653 | header->size += sizeof(data->cpu_entry); | ||
3654 | } | ||
3655 | |||
3656 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
3657 | header->size += sizeof(data->period); | ||
3658 | 3749 | ||
3659 | if (sample_type & PERF_SAMPLE_READ) | 3750 | if (sample_type & PERF_SAMPLE_IP) |
3660 | header->size += perf_event_read_size(event); | 3751 | data->ip = perf_instruction_pointer(regs); |
3661 | 3752 | ||
3662 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { | 3753 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
3663 | int size = 1; | 3754 | int size = 1; |
@@ -3722,23 +3813,26 @@ perf_event_read_event(struct perf_event *event, | |||
3722 | struct task_struct *task) | 3813 | struct task_struct *task) |
3723 | { | 3814 | { |
3724 | struct perf_output_handle handle; | 3815 | struct perf_output_handle handle; |
3816 | struct perf_sample_data sample; | ||
3725 | struct perf_read_event read_event = { | 3817 | struct perf_read_event read_event = { |
3726 | .header = { | 3818 | .header = { |
3727 | .type = PERF_RECORD_READ, | 3819 | .type = PERF_RECORD_READ, |
3728 | .misc = 0, | 3820 | .misc = 0, |
3729 | .size = sizeof(read_event) + perf_event_read_size(event), | 3821 | .size = sizeof(read_event) + event->read_size, |
3730 | }, | 3822 | }, |
3731 | .pid = perf_event_pid(event, task), | 3823 | .pid = perf_event_pid(event, task), |
3732 | .tid = perf_event_tid(event, task), | 3824 | .tid = perf_event_tid(event, task), |
3733 | }; | 3825 | }; |
3734 | int ret; | 3826 | int ret; |
3735 | 3827 | ||
3828 | perf_event_header__init_id(&read_event.header, &sample, event); | ||
3736 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); | 3829 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); |
3737 | if (ret) | 3830 | if (ret) |
3738 | return; | 3831 | return; |
3739 | 3832 | ||
3740 | perf_output_put(&handle, read_event); | 3833 | perf_output_put(&handle, read_event); |
3741 | perf_output_read(&handle, event); | 3834 | perf_output_read(&handle, event); |
3835 | perf_event__output_id_sample(event, &handle, &sample); | ||
3742 | 3836 | ||
3743 | perf_output_end(&handle); | 3837 | perf_output_end(&handle); |
3744 | } | 3838 | } |
@@ -3768,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event, | |||
3768 | struct perf_task_event *task_event) | 3862 | struct perf_task_event *task_event) |
3769 | { | 3863 | { |
3770 | struct perf_output_handle handle; | 3864 | struct perf_output_handle handle; |
3865 | struct perf_sample_data sample; | ||
3771 | struct task_struct *task = task_event->task; | 3866 | struct task_struct *task = task_event->task; |
3772 | int size, ret; | 3867 | int ret, size = task_event->event_id.header.size; |
3773 | 3868 | ||
3774 | size = task_event->event_id.header.size; | 3869 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
3775 | ret = perf_output_begin(&handle, event, size, 0, 0); | ||
3776 | 3870 | ||
3871 | ret = perf_output_begin(&handle, event, | ||
3872 | task_event->event_id.header.size, 0, 0); | ||
3777 | if (ret) | 3873 | if (ret) |
3778 | return; | 3874 | goto out; |
3779 | 3875 | ||
3780 | task_event->event_id.pid = perf_event_pid(event, task); | 3876 | task_event->event_id.pid = perf_event_pid(event, task); |
3781 | task_event->event_id.ppid = perf_event_pid(event, current); | 3877 | task_event->event_id.ppid = perf_event_pid(event, current); |
@@ -3785,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event, | |||
3785 | 3881 | ||
3786 | perf_output_put(&handle, task_event->event_id); | 3882 | perf_output_put(&handle, task_event->event_id); |
3787 | 3883 | ||
3884 | perf_event__output_id_sample(event, &handle, &sample); | ||
3885 | |||
3788 | perf_output_end(&handle); | 3886 | perf_output_end(&handle); |
3887 | out: | ||
3888 | task_event->event_id.header.size = size; | ||
3789 | } | 3889 | } |
3790 | 3890 | ||
3791 | static int perf_event_task_match(struct perf_event *event) | 3891 | static int perf_event_task_match(struct perf_event *event) |
@@ -3900,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3900 | struct perf_comm_event *comm_event) | 4000 | struct perf_comm_event *comm_event) |
3901 | { | 4001 | { |
3902 | struct perf_output_handle handle; | 4002 | struct perf_output_handle handle; |
4003 | struct perf_sample_data sample; | ||
3903 | int size = comm_event->event_id.header.size; | 4004 | int size = comm_event->event_id.header.size; |
3904 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4005 | int ret; |
4006 | |||
4007 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | ||
4008 | ret = perf_output_begin(&handle, event, | ||
4009 | comm_event->event_id.header.size, 0, 0); | ||
3905 | 4010 | ||
3906 | if (ret) | 4011 | if (ret) |
3907 | return; | 4012 | goto out; |
3908 | 4013 | ||
3909 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); | 4014 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); |
3910 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); | 4015 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
@@ -3912,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3912 | perf_output_put(&handle, comm_event->event_id); | 4017 | perf_output_put(&handle, comm_event->event_id); |
3913 | perf_output_copy(&handle, comm_event->comm, | 4018 | perf_output_copy(&handle, comm_event->comm, |
3914 | comm_event->comm_size); | 4019 | comm_event->comm_size); |
4020 | |||
4021 | perf_event__output_id_sample(event, &handle, &sample); | ||
4022 | |||
3915 | perf_output_end(&handle); | 4023 | perf_output_end(&handle); |
4024 | out: | ||
4025 | comm_event->event_id.header.size = size; | ||
3916 | } | 4026 | } |
3917 | 4027 | ||
3918 | static int perf_event_comm_match(struct perf_event *event) | 4028 | static int perf_event_comm_match(struct perf_event *event) |
@@ -3957,7 +4067,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3957 | comm_event->comm_size = size; | 4067 | comm_event->comm_size = size; |
3958 | 4068 | ||
3959 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4069 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3960 | |||
3961 | rcu_read_lock(); | 4070 | rcu_read_lock(); |
3962 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4071 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3963 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4072 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
@@ -4038,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4038 | struct perf_mmap_event *mmap_event) | 4147 | struct perf_mmap_event *mmap_event) |
4039 | { | 4148 | { |
4040 | struct perf_output_handle handle; | 4149 | struct perf_output_handle handle; |
4150 | struct perf_sample_data sample; | ||
4041 | int size = mmap_event->event_id.header.size; | 4151 | int size = mmap_event->event_id.header.size; |
4042 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4152 | int ret; |
4043 | 4153 | ||
4154 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | ||
4155 | ret = perf_output_begin(&handle, event, | ||
4156 | mmap_event->event_id.header.size, 0, 0); | ||
4044 | if (ret) | 4157 | if (ret) |
4045 | return; | 4158 | goto out; |
4046 | 4159 | ||
4047 | mmap_event->event_id.pid = perf_event_pid(event, current); | 4160 | mmap_event->event_id.pid = perf_event_pid(event, current); |
4048 | mmap_event->event_id.tid = perf_event_tid(event, current); | 4161 | mmap_event->event_id.tid = perf_event_tid(event, current); |
@@ -4050,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4050 | perf_output_put(&handle, mmap_event->event_id); | 4163 | perf_output_put(&handle, mmap_event->event_id); |
4051 | perf_output_copy(&handle, mmap_event->file_name, | 4164 | perf_output_copy(&handle, mmap_event->file_name, |
4052 | mmap_event->file_size); | 4165 | mmap_event->file_size); |
4166 | |||
4167 | perf_event__output_id_sample(event, &handle, &sample); | ||
4168 | |||
4053 | perf_output_end(&handle); | 4169 | perf_output_end(&handle); |
4170 | out: | ||
4171 | mmap_event->event_id.header.size = size; | ||
4054 | } | 4172 | } |
4055 | 4173 | ||
4056 | static int perf_event_mmap_match(struct perf_event *event, | 4174 | static int perf_event_mmap_match(struct perf_event *event, |
@@ -4205,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
4205 | static void perf_log_throttle(struct perf_event *event, int enable) | 4323 | static void perf_log_throttle(struct perf_event *event, int enable) |
4206 | { | 4324 | { |
4207 | struct perf_output_handle handle; | 4325 | struct perf_output_handle handle; |
4326 | struct perf_sample_data sample; | ||
4208 | int ret; | 4327 | int ret; |
4209 | 4328 | ||
4210 | struct { | 4329 | struct { |
@@ -4226,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
4226 | if (enable) | 4345 | if (enable) |
4227 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; | 4346 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; |
4228 | 4347 | ||
4229 | ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); | 4348 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
4349 | |||
4350 | ret = perf_output_begin(&handle, event, | ||
4351 | throttle_event.header.size, 1, 0); | ||
4230 | if (ret) | 4352 | if (ret) |
4231 | return; | 4353 | return; |
4232 | 4354 | ||
4233 | perf_output_put(&handle, throttle_event); | 4355 | perf_output_put(&handle, throttle_event); |
4356 | perf_event__output_id_sample(event, &handle, &sample); | ||
4234 | perf_output_end(&handle); | 4357 | perf_output_end(&handle); |
4235 | } | 4358 | } |
4236 | 4359 | ||
@@ -4246,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4246 | struct hw_perf_event *hwc = &event->hw; | 4369 | struct hw_perf_event *hwc = &event->hw; |
4247 | int ret = 0; | 4370 | int ret = 0; |
4248 | 4371 | ||
4372 | /* | ||
4373 | * Non-sampling counters might still use the PMI to fold short | ||
4374 | * hardware counters, ignore those. | ||
4375 | */ | ||
4376 | if (unlikely(!is_sampling_event(event))) | ||
4377 | return 0; | ||
4378 | |||
4249 | if (!throttle) { | 4379 | if (!throttle) { |
4250 | hwc->interrupts++; | 4380 | hwc->interrupts++; |
4251 | } else { | 4381 | } else { |
@@ -4391,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
4391 | if (!regs) | 4521 | if (!regs) |
4392 | return; | 4522 | return; |
4393 | 4523 | ||
4394 | if (!hwc->sample_period) | 4524 | if (!is_sampling_event(event)) |
4395 | return; | 4525 | return; |
4396 | 4526 | ||
4397 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4527 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
@@ -4554,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
4554 | struct hw_perf_event *hwc = &event->hw; | 4684 | struct hw_perf_event *hwc = &event->hw; |
4555 | struct hlist_head *head; | 4685 | struct hlist_head *head; |
4556 | 4686 | ||
4557 | if (hwc->sample_period) { | 4687 | if (is_sampling_event(event)) { |
4558 | hwc->last_period = hwc->sample_period; | 4688 | hwc->last_period = hwc->sample_period; |
4559 | perf_swevent_set_period(event); | 4689 | perf_swevent_set_period(event); |
4560 | } | 4690 | } |
@@ -4811,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event) | |||
4811 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 4941 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
4812 | return -ENOENT; | 4942 | return -ENOENT; |
4813 | 4943 | ||
4814 | /* | ||
4815 | * Raw tracepoint data is a severe data leak, only allow root to | ||
4816 | * have these. | ||
4817 | */ | ||
4818 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | ||
4819 | perf_paranoid_tracepoint_raw() && | ||
4820 | !capable(CAP_SYS_ADMIN)) | ||
4821 | return -EPERM; | ||
4822 | |||
4823 | err = perf_trace_init(event); | 4944 | err = perf_trace_init(event); |
4824 | if (err) | 4945 | if (err) |
4825 | return err; | 4946 | return err; |
@@ -4842,7 +4963,7 @@ static struct pmu perf_tracepoint = { | |||
4842 | 4963 | ||
4843 | static inline void perf_tp_register(void) | 4964 | static inline void perf_tp_register(void) |
4844 | { | 4965 | { |
4845 | perf_pmu_register(&perf_tracepoint); | 4966 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); |
4846 | } | 4967 | } |
4847 | 4968 | ||
4848 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4969 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4932,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
4932 | static void perf_swevent_start_hrtimer(struct perf_event *event) | 5053 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4933 | { | 5054 | { |
4934 | struct hw_perf_event *hwc = &event->hw; | 5055 | struct hw_perf_event *hwc = &event->hw; |
5056 | s64 period; | ||
5057 | |||
5058 | if (!is_sampling_event(event)) | ||
5059 | return; | ||
4935 | 5060 | ||
4936 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 5061 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4937 | hwc->hrtimer.function = perf_swevent_hrtimer; | 5062 | hwc->hrtimer.function = perf_swevent_hrtimer; |
4938 | if (hwc->sample_period) { | ||
4939 | s64 period = local64_read(&hwc->period_left); | ||
4940 | 5063 | ||
4941 | if (period) { | 5064 | period = local64_read(&hwc->period_left); |
4942 | if (period < 0) | 5065 | if (period) { |
4943 | period = 10000; | 5066 | if (period < 0) |
5067 | period = 10000; | ||
4944 | 5068 | ||
4945 | local64_set(&hwc->period_left, 0); | 5069 | local64_set(&hwc->period_left, 0); |
4946 | } else { | 5070 | } else { |
4947 | period = max_t(u64, 10000, hwc->sample_period); | 5071 | period = max_t(u64, 10000, hwc->sample_period); |
4948 | } | 5072 | } |
4949 | __hrtimer_start_range_ns(&hwc->hrtimer, | 5073 | __hrtimer_start_range_ns(&hwc->hrtimer, |
4950 | ns_to_ktime(period), 0, | 5074 | ns_to_ktime(period), 0, |
4951 | HRTIMER_MODE_REL_PINNED, 0); | 5075 | HRTIMER_MODE_REL_PINNED, 0); |
4952 | } | ||
4953 | } | 5076 | } |
4954 | 5077 | ||
4955 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | 5078 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
4956 | { | 5079 | { |
4957 | struct hw_perf_event *hwc = &event->hw; | 5080 | struct hw_perf_event *hwc = &event->hw; |
4958 | 5081 | ||
4959 | if (hwc->sample_period) { | 5082 | if (is_sampling_event(event)) { |
4960 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | 5083 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); |
4961 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | 5084 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); |
4962 | 5085 | ||
@@ -5184,8 +5307,61 @@ static void free_pmu_context(struct pmu *pmu) | |||
5184 | out: | 5307 | out: |
5185 | mutex_unlock(&pmus_lock); | 5308 | mutex_unlock(&pmus_lock); |
5186 | } | 5309 | } |
5310 | static struct idr pmu_idr; | ||
5311 | |||
5312 | static ssize_t | ||
5313 | type_show(struct device *dev, struct device_attribute *attr, char *page) | ||
5314 | { | ||
5315 | struct pmu *pmu = dev_get_drvdata(dev); | ||
5316 | |||
5317 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | ||
5318 | } | ||
5319 | |||
5320 | static struct device_attribute pmu_dev_attrs[] = { | ||
5321 | __ATTR_RO(type), | ||
5322 | __ATTR_NULL, | ||
5323 | }; | ||
5324 | |||
5325 | static int pmu_bus_running; | ||
5326 | static struct bus_type pmu_bus = { | ||
5327 | .name = "event_source", | ||
5328 | .dev_attrs = pmu_dev_attrs, | ||
5329 | }; | ||
5330 | |||
5331 | static void pmu_dev_release(struct device *dev) | ||
5332 | { | ||
5333 | kfree(dev); | ||
5334 | } | ||
5335 | |||
5336 | static int pmu_dev_alloc(struct pmu *pmu) | ||
5337 | { | ||
5338 | int ret = -ENOMEM; | ||
5339 | |||
5340 | pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); | ||
5341 | if (!pmu->dev) | ||
5342 | goto out; | ||
5343 | |||
5344 | device_initialize(pmu->dev); | ||
5345 | ret = dev_set_name(pmu->dev, "%s", pmu->name); | ||
5346 | if (ret) | ||
5347 | goto free_dev; | ||
5348 | |||
5349 | dev_set_drvdata(pmu->dev, pmu); | ||
5350 | pmu->dev->bus = &pmu_bus; | ||
5351 | pmu->dev->release = pmu_dev_release; | ||
5352 | ret = device_add(pmu->dev); | ||
5353 | if (ret) | ||
5354 | goto free_dev; | ||
5355 | |||
5356 | out: | ||
5357 | return ret; | ||
5358 | |||
5359 | free_dev: | ||
5360 | put_device(pmu->dev); | ||
5361 | goto out; | ||
5362 | } | ||
5187 | 5363 | ||
5188 | int perf_pmu_register(struct pmu *pmu) | 5364 | int perf_pmu_register(struct pmu *pmu, char *name, int type) |
5189 | { | 5365 | { |
5190 | int cpu, ret; | 5366 | int cpu, ret; |
5191 | 5367 | ||
@@ -5195,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu) | |||
5195 | if (!pmu->pmu_disable_count) | 5371 | if (!pmu->pmu_disable_count) |
5196 | goto unlock; | 5372 | goto unlock; |
5197 | 5373 | ||
5374 | pmu->type = -1; | ||
5375 | if (!name) | ||
5376 | goto skip_type; | ||
5377 | pmu->name = name; | ||
5378 | |||
5379 | if (type < 0) { | ||
5380 | int err = idr_pre_get(&pmu_idr, GFP_KERNEL); | ||
5381 | if (!err) | ||
5382 | goto free_pdc; | ||
5383 | |||
5384 | err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); | ||
5385 | if (err) { | ||
5386 | ret = err; | ||
5387 | goto free_pdc; | ||
5388 | } | ||
5389 | } | ||
5390 | pmu->type = type; | ||
5391 | |||
5392 | if (pmu_bus_running) { | ||
5393 | ret = pmu_dev_alloc(pmu); | ||
5394 | if (ret) | ||
5395 | goto free_idr; | ||
5396 | } | ||
5397 | |||
5398 | skip_type: | ||
5198 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | 5399 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); |
5199 | if (pmu->pmu_cpu_context) | 5400 | if (pmu->pmu_cpu_context) |
5200 | goto got_cpu_context; | 5401 | goto got_cpu_context; |
5201 | 5402 | ||
5202 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | 5403 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); |
5203 | if (!pmu->pmu_cpu_context) | 5404 | if (!pmu->pmu_cpu_context) |
5204 | goto free_pdc; | 5405 | goto free_dev; |
5205 | 5406 | ||
5206 | for_each_possible_cpu(cpu) { | 5407 | for_each_possible_cpu(cpu) { |
5207 | struct perf_cpu_context *cpuctx; | 5408 | struct perf_cpu_context *cpuctx; |
@@ -5245,6 +5446,14 @@ unlock: | |||
5245 | 5446 | ||
5246 | return ret; | 5447 | return ret; |
5247 | 5448 | ||
5449 | free_dev: | ||
5450 | device_del(pmu->dev); | ||
5451 | put_device(pmu->dev); | ||
5452 | |||
5453 | free_idr: | ||
5454 | if (pmu->type >= PERF_TYPE_MAX) | ||
5455 | idr_remove(&pmu_idr, pmu->type); | ||
5456 | |||
5248 | free_pdc: | 5457 | free_pdc: |
5249 | free_percpu(pmu->pmu_disable_count); | 5458 | free_percpu(pmu->pmu_disable_count); |
5250 | goto unlock; | 5459 | goto unlock; |
@@ -5264,6 +5473,10 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
5264 | synchronize_rcu(); | 5473 | synchronize_rcu(); |
5265 | 5474 | ||
5266 | free_percpu(pmu->pmu_disable_count); | 5475 | free_percpu(pmu->pmu_disable_count); |
5476 | if (pmu->type >= PERF_TYPE_MAX) | ||
5477 | idr_remove(&pmu_idr, pmu->type); | ||
5478 | device_del(pmu->dev); | ||
5479 | put_device(pmu->dev); | ||
5267 | free_pmu_context(pmu); | 5480 | free_pmu_context(pmu); |
5268 | } | 5481 | } |
5269 | 5482 | ||
@@ -5273,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5273 | int idx; | 5486 | int idx; |
5274 | 5487 | ||
5275 | idx = srcu_read_lock(&pmus_srcu); | 5488 | idx = srcu_read_lock(&pmus_srcu); |
5489 | |||
5490 | rcu_read_lock(); | ||
5491 | pmu = idr_find(&pmu_idr, event->attr.type); | ||
5492 | rcu_read_unlock(); | ||
5493 | if (pmu) | ||
5494 | goto unlock; | ||
5495 | |||
5276 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 5496 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5277 | int ret = pmu->event_init(event); | 5497 | int ret = pmu->event_init(event); |
5278 | if (!ret) | 5498 | if (!ret) |
@@ -5738,6 +5958,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5738 | mutex_unlock(&current->perf_event_mutex); | 5958 | mutex_unlock(&current->perf_event_mutex); |
5739 | 5959 | ||
5740 | /* | 5960 | /* |
5961 | * Precalculate sample_data sizes | ||
5962 | */ | ||
5963 | perf_event__header_size(event); | ||
5964 | perf_event__id_header_size(event); | ||
5965 | |||
5966 | /* | ||
5741 | * Drop the reference on the group_event after placing the | 5967 | * Drop the reference on the group_event after placing the |
5742 | * new event on the sibling_list. This ensures destruction | 5968 | * new event on the sibling_list. This ensures destruction |
5743 | * of the group leader will find the pointer to itself in | 5969 | * of the group leader will find the pointer to itself in |
@@ -6090,6 +6316,12 @@ inherit_event(struct perf_event *parent_event, | |||
6090 | child_event->overflow_handler = parent_event->overflow_handler; | 6316 | child_event->overflow_handler = parent_event->overflow_handler; |
6091 | 6317 | ||
6092 | /* | 6318 | /* |
6319 | * Precalculate sample_data sizes | ||
6320 | */ | ||
6321 | perf_event__header_size(child_event); | ||
6322 | perf_event__id_header_size(child_event); | ||
6323 | |||
6324 | /* | ||
6093 | * Link it up in the child's context: | 6325 | * Link it up in the child's context: |
6094 | */ | 6326 | */ |
6095 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | 6327 | raw_spin_lock_irqsave(&child_ctx->lock, flags); |
@@ -6320,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) | |||
6320 | mutex_unlock(&swhash->hlist_mutex); | 6552 | mutex_unlock(&swhash->hlist_mutex); |
6321 | } | 6553 | } |
6322 | 6554 | ||
6323 | #ifdef CONFIG_HOTPLUG_CPU | 6555 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC |
6324 | static void perf_pmu_rotate_stop(struct pmu *pmu) | 6556 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
6325 | { | 6557 | { |
6326 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 6558 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
@@ -6374,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu) | |||
6374 | static inline void perf_event_exit_cpu(int cpu) { } | 6606 | static inline void perf_event_exit_cpu(int cpu) { } |
6375 | #endif | 6607 | #endif |
6376 | 6608 | ||
6609 | static int | ||
6610 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) | ||
6611 | { | ||
6612 | int cpu; | ||
6613 | |||
6614 | for_each_online_cpu(cpu) | ||
6615 | perf_event_exit_cpu(cpu); | ||
6616 | |||
6617 | return NOTIFY_OK; | ||
6618 | } | ||
6619 | |||
6620 | /* | ||
6621 | * Run the perf reboot notifier at the very last possible moment so that | ||
6622 | * the generic watchdog code runs as long as possible. | ||
6623 | */ | ||
6624 | static struct notifier_block perf_reboot_notifier = { | ||
6625 | .notifier_call = perf_reboot, | ||
6626 | .priority = INT_MIN, | ||
6627 | }; | ||
6628 | |||
6377 | static int __cpuinit | 6629 | static int __cpuinit |
6378 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | 6630 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) |
6379 | { | 6631 | { |
@@ -6402,14 +6654,45 @@ void __init perf_event_init(void) | |||
6402 | { | 6654 | { |
6403 | int ret; | 6655 | int ret; |
6404 | 6656 | ||
6657 | idr_init(&pmu_idr); | ||
6658 | |||
6405 | perf_event_init_all_cpus(); | 6659 | perf_event_init_all_cpus(); |
6406 | init_srcu_struct(&pmus_srcu); | 6660 | init_srcu_struct(&pmus_srcu); |
6407 | perf_pmu_register(&perf_swevent); | 6661 | perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); |
6408 | perf_pmu_register(&perf_cpu_clock); | 6662 | perf_pmu_register(&perf_cpu_clock, NULL, -1); |
6409 | perf_pmu_register(&perf_task_clock); | 6663 | perf_pmu_register(&perf_task_clock, NULL, -1); |
6410 | perf_tp_register(); | 6664 | perf_tp_register(); |
6411 | perf_cpu_notifier(perf_cpu_notify); | 6665 | perf_cpu_notifier(perf_cpu_notify); |
6666 | register_reboot_notifier(&perf_reboot_notifier); | ||
6412 | 6667 | ||
6413 | ret = init_hw_breakpoint(); | 6668 | ret = init_hw_breakpoint(); |
6414 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | 6669 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
6415 | } | 6670 | } |
6671 | |||
6672 | static int __init perf_event_sysfs_init(void) | ||
6673 | { | ||
6674 | struct pmu *pmu; | ||
6675 | int ret; | ||
6676 | |||
6677 | mutex_lock(&pmus_lock); | ||
6678 | |||
6679 | ret = bus_register(&pmu_bus); | ||
6680 | if (ret) | ||
6681 | goto unlock; | ||
6682 | |||
6683 | list_for_each_entry(pmu, &pmus, entry) { | ||
6684 | if (!pmu->name || pmu->type < 0) | ||
6685 | continue; | ||
6686 | |||
6687 | ret = pmu_dev_alloc(pmu); | ||
6688 | WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); | ||
6689 | } | ||
6690 | pmu_bus_running = 1; | ||
6691 | ret = 0; | ||
6692 | |||
6693 | unlock: | ||
6694 | mutex_unlock(&pmus_lock); | ||
6695 | |||
6696 | return ret; | ||
6697 | } | ||
6698 | device_initcall(perf_event_sysfs_init); | ||
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ecf770509d0d..031d5e3a6197 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/suspend.h> | 24 | #include <linux/suspend.h> |
25 | #include <trace/events/power.h> | ||
25 | 26 | ||
26 | #include "power.h" | 27 | #include "power.h" |
27 | 28 | ||
@@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
201 | if (!suspend_ops) | 202 | if (!suspend_ops) |
202 | return -ENOSYS; | 203 | return -ENOSYS; |
203 | 204 | ||
205 | trace_machine_suspend(state); | ||
204 | if (suspend_ops->begin) { | 206 | if (suspend_ops->begin) { |
205 | error = suspend_ops->begin(state); | 207 | error = suspend_ops->begin(state); |
206 | if (error) | 208 | if (error) |
@@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
229 | Close: | 231 | Close: |
230 | if (suspend_ops->end) | 232 | if (suspend_ops->end) |
231 | suspend_ops->end(); | 233 | suspend_ops->end(); |
234 | trace_machine_suspend(PWR_EVENT_EXIT); | ||
232 | return error; | 235 | return error; |
233 | 236 | ||
234 | Recover_platform: | 237 | Recover_platform: |
diff --git a/kernel/sched.c b/kernel/sched.c index e6f8f1254319..260132961a99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -8293,8 +8293,6 @@ void __init sched_init(void) | |||
8293 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8293 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8294 | #endif /* SMP */ | 8294 | #endif /* SMP */ |
8295 | 8295 | ||
8296 | perf_event_init(); | ||
8297 | |||
8298 | scheduler_running = 1; | 8296 | scheduler_running = 1; |
8299 | } | 8297 | } |
8300 | 8298 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5abfa1518554..46404414d8a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -745,21 +745,21 @@ static struct ctl_table kern_table[] = { | |||
745 | .extra1 = &zero, | 745 | .extra1 = &zero, |
746 | .extra2 = &one, | 746 | .extra2 = &one, |
747 | }, | 747 | }, |
748 | #endif | ||
749 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
750 | { | 748 | { |
751 | .procname = "unknown_nmi_panic", | 749 | .procname = "nmi_watchdog", |
752 | .data = &unknown_nmi_panic, | 750 | .data = &watchdog_enabled, |
753 | .maxlen = sizeof (int), | 751 | .maxlen = sizeof (int), |
754 | .mode = 0644, | 752 | .mode = 0644, |
755 | .proc_handler = proc_dointvec, | 753 | .proc_handler = proc_dowatchdog_enabled, |
756 | }, | 754 | }, |
755 | #endif | ||
756 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
757 | { | 757 | { |
758 | .procname = "nmi_watchdog", | 758 | .procname = "unknown_nmi_panic", |
759 | .data = &nmi_watchdog_enabled, | 759 | .data = &unknown_nmi_panic, |
760 | .maxlen = sizeof (int), | 760 | .maxlen = sizeof (int), |
761 | .mode = 0644, | 761 | .mode = 0644, |
762 | .proc_handler = proc_nmi_enabled, | 762 | .proc_handler = proc_dointvec, |
763 | }, | 763 | }, |
764 | #endif | 764 | #endif |
765 | #if defined(CONFIG_X86) | 765 | #if defined(CONFIG_X86) |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c5786064..4b2545a136ff 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = { | |||
136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, | 136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, |
137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
139 | { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, | ||
140 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
141 | {} | 140 | {} |
142 | }; | 141 | }; |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ea37e2ff4164..14674dce77a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -69,6 +69,21 @@ config EVENT_TRACING | |||
69 | select CONTEXT_SWITCH_TRACER | 69 | select CONTEXT_SWITCH_TRACER |
70 | bool | 70 | bool |
71 | 71 | ||
72 | config EVENT_POWER_TRACING_DEPRECATED | ||
73 | depends on EVENT_TRACING | ||
74 | bool "Deprecated power event trace API, to be removed" | ||
75 | default y | ||
76 | help | ||
77 | Provides old power event types: | ||
78 | C-state/idle accounting events: | ||
79 | power:power_start | ||
80 | power:power_end | ||
81 | and old cpufreq accounting event: | ||
82 | power:power_frequency | ||
83 | This is for userspace compatibility | ||
84 | and will vanish after 5 kernel iterations, | ||
85 | namely 2.6.41. | ||
86 | |||
72 | config CONTEXT_SWITCH_TRACER | 87 | config CONTEXT_SWITCH_TRACER |
73 | bool | 88 | bool |
74 | 89 | ||
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a06161..f55fcf61b223 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
@@ -13,5 +13,8 @@ | |||
13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
15 | 15 | ||
16 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); | 16 | #ifdef EVENT_POWER_TRACING_DEPRECATED |
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); | ||
18 | #endif | ||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | ||
17 | 20 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 39c059ca670e..19a359d5e6d5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) | |||
21 | /* Count the events in use (per event id, not per instance) */ | 21 | /* Count the events in use (per event id, not per instance) */ |
22 | static int total_ref_count; | 22 | static int total_ref_count; |
23 | 23 | ||
24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | ||
25 | struct perf_event *p_event) | ||
26 | { | ||
27 | /* No tracing, just counting, so no obvious leak */ | ||
28 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) | ||
29 | return 0; | ||
30 | |||
31 | /* Some events are ok to be traced by non-root users... */ | ||
32 | if (p_event->attach_state == PERF_ATTACH_TASK) { | ||
33 | if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | /* | ||
38 | * ...otherwise raw tracepoint data can be a severe data leak, | ||
39 | * only allow root to have these. | ||
40 | */ | ||
41 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | ||
42 | return -EPERM; | ||
43 | |||
44 | return 0; | ||
45 | } | ||
46 | |||
24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 47 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 48 | struct perf_event *p_event) |
26 | { | 49 | { |
27 | struct hlist_head __percpu *list; | 50 | struct hlist_head __percpu *list; |
28 | int ret = -ENOMEM; | 51 | int ret; |
29 | int cpu; | 52 | int cpu; |
30 | 53 | ||
54 | ret = perf_trace_event_perm(tp_event, p_event); | ||
55 | if (ret) | ||
56 | return ret; | ||
57 | |||
31 | p_event->tp_event = tp_event; | 58 | p_event->tp_event = tp_event; |
32 | if (tp_event->perf_refcount++ > 0) | 59 | if (tp_event->perf_refcount++ > 0) |
33 | return 0; | 60 | return 0; |
34 | 61 | ||
62 | ret = -ENOMEM; | ||
63 | |||
35 | list = alloc_percpu(struct hlist_head); | 64 | list = alloc_percpu(struct hlist_head); |
36 | if (!list) | 65 | if (!list) |
37 | goto fail; | 66 | goto fail; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0725eeab1937..35fde09b81de 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -27,6 +27,12 @@ | |||
27 | 27 | ||
28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
29 | 29 | ||
30 | DEFINE_MUTEX(event_storage_mutex); | ||
31 | EXPORT_SYMBOL_GPL(event_storage_mutex); | ||
32 | |||
33 | char event_storage[EVENT_STORAGE_SIZE]; | ||
34 | EXPORT_SYMBOL_GPL(event_storage); | ||
35 | |||
30 | LIST_HEAD(ftrace_events); | 36 | LIST_HEAD(ftrace_events); |
31 | LIST_HEAD(ftrace_common_fields); | 37 | LIST_HEAD(ftrace_common_fields); |
32 | 38 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac25..4b74d71705c0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
83 | 83 | ||
84 | #undef __array | 84 | #undef __array |
85 | #define __array(type, item, len) \ | 85 | #define __array(type, item, len) \ |
86 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ | 86 | do { \ |
87 | ret = trace_define_field(event_call, #type "[" #len "]", #item, \ | 87 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ |
88 | mutex_lock(&event_storage_mutex); \ | ||
89 | snprintf(event_storage, sizeof(event_storage), \ | ||
90 | "%s[%d]", #type, len); \ | ||
91 | ret = trace_define_field(event_call, event_storage, #item, \ | ||
88 | offsetof(typeof(field), item), \ | 92 | offsetof(typeof(field), item), \ |
89 | sizeof(field.item), \ | 93 | sizeof(field.item), \ |
90 | is_signed_type(type), FILTER_OTHER); \ | 94 | is_signed_type(type), FILTER_OTHER); \ |
91 | if (ret) \ | 95 | mutex_unlock(&event_storage_mutex); \ |
92 | return ret; | 96 | if (ret) \ |
97 | return ret; \ | ||
98 | } while (0); | ||
93 | 99 | ||
94 | #undef __array_desc | 100 | #undef __array_desc |
95 | #define __array_desc(type, container, item, len) \ | 101 | #define __array_desc(type, container, item, len) \ |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5b082156cd21..aaa8dae08236 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str) | |||
57 | { | 57 | { |
58 | if (!strncmp(str, "panic", 5)) | 58 | if (!strncmp(str, "panic", 5)) |
59 | hardlockup_panic = 1; | 59 | hardlockup_panic = 1; |
60 | else if (!strncmp(str, "0", 1)) | ||
61 | no_watchdog = 1; | ||
60 | return 1; | 62 | return 1; |
61 | } | 63 | } |
62 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 64 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
@@ -548,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = { | |||
548 | .notifier_call = cpu_callback | 550 | .notifier_call = cpu_callback |
549 | }; | 551 | }; |
550 | 552 | ||
551 | static int __init spawn_watchdog_task(void) | 553 | void __init lockup_detector_init(void) |
552 | { | 554 | { |
553 | void *cpu = (void *)(long)smp_processor_id(); | 555 | void *cpu = (void *)(long)smp_processor_id(); |
554 | int err; | 556 | int err; |
555 | 557 | ||
556 | if (no_watchdog) | 558 | if (no_watchdog) |
557 | return 0; | 559 | return; |
558 | 560 | ||
559 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 561 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
560 | WARN_ON(notifier_to_errno(err)); | 562 | WARN_ON(notifier_to_errno(err)); |
@@ -562,6 +564,5 @@ static int __init spawn_watchdog_task(void) | |||
562 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 564 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
563 | register_cpu_notifier(&cpu_nfb); | 565 | register_cpu_notifier(&cpu_nfb); |
564 | 566 | ||
565 | return 0; | 567 | return; |
566 | } | 568 | } |
567 | early_initcall(spawn_watchdog_task); | ||