author     Paul Mackerras <paulus@samba.org>    2009-02-09 06:42:47 -0500
committer  Ingo Molnar <mingo@elte.hu>          2009-02-09 06:47:16 -0500
commit     23a185ca8abbeef64b6ffc33059b1d630e43ec10
tree       c5eb9454ff969377adb40532119240f6fc893fcb
parent     82aa9a1829199233f9bdaf26e2ee271114f4701e
perf_counters: make software counters work as per-cpu counters
Impact: kernel crash fix
Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software
counter as a per-cpu counter would reliably crash the system, because
it ends up calling __task_delta_exec() with a NULL task pointer. The
page fault, context switch and cpu migration counters also do not work
correctly as per-cpu counters, since they reference the current task.
This fixes the problem by redirecting the task_clock counter to the
cpu_clock counter when used as a per-cpu counter, and by implementing
per-cpu page fault, context switch and cpu migration counters.
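The core of the fix is a NULL-task check in each software counter's read
path: a counter bound to a task has ctx->task set, while a per-cpu counter
has ctx->task == NULL and must read per-cpu state instead. The sketch below
is a minimal, self-contained user-space model of that dispatch under stated
assumptions; it is not kernel code, and the struct and variable names are
illustrative stand-ins for perf_counter_context, task_struct and the
per-cpu vm event counts.

```c
/* Simplified model of the NULL-task fallback used by the software counters. */
#include <stdio.h>
#include <stdint.h>

struct task {                       /* stand-in for struct task_struct */
	uint64_t maj_flt, min_flt;
};

struct context {                    /* stand-in for perf_counter_context */
	struct task *task;          /* NULL for a per-cpu counter */
	int cpu;
};

static uint64_t cpu_page_faults[4]; /* stand-in for per-cpu vm_event_states */

static uint64_t get_page_faults(const struct context *ctx)
{
	if (ctx->task)                      /* per-task counter: task fields */
		return ctx->task->maj_flt + ctx->task->min_flt;
	return cpu_page_faults[ctx->cpu];   /* per-cpu counter: cpu total */
}

int main(void)
{
	struct task t = { .maj_flt = 3, .min_flt = 40 };
	struct context per_task = { .task = &t,   .cpu = 0 };
	struct context per_cpu  = { .task = NULL, .cpu = 1 };

	cpu_page_faults[1] = 1234;

	printf("per-task faults: %llu\n",
	       (unsigned long long)get_page_faults(&per_task));  /* 43 */
	printf("per-cpu  faults: %llu\n",
	       (unsigned long long)get_page_faults(&per_cpu));   /* 1234 */
	return 0;
}
```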
Along the way, this patch also:
- Initializes counter->ctx earlier, in perf_counter_alloc, so that
sw_perf_counter_init can use it
- Adds code to kernel/sched.c to count task migrations into each
cpu, in rq->nr_migrations_in
- Exports the per-cpu context switch and task migration counts
via new functions added to kernel/sched.c
- Makes sure that if sw_perf_counter_init fails, we don't try to
initialize the counter as a hardware counter. Since the user has
passed a negative, non-raw event type, they clearly don't intend
for it to be interpreted as a hardware event.
Reported-by: "Zhang Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
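The diff below also changes each software counter's enable callback to
snapshot the current raw value into hw.prev_count, replacing the old
"per-task value already, nothing to clear" comments: a per-cpu source such
as a CPU's total page-fault or context-switch count is already non-zero when
the counter is enabled, so the counter has to report deltas against a
baseline. Here is a minimal, self-contained model of that delta scheme
(illustrative names only, not the kernel's types):

```c
/* Simplified model of the prev_count/delta accounting for software counters. */
#include <stdio.h>
#include <stdint.h>

struct counter {
	uint64_t prev_count;   /* last raw value seen (like hw.prev_count) */
	uint64_t count;        /* accumulated delta reported to the user */
};

/* Monotonically increasing raw source, e.g. a per-cpu event total. */
static uint64_t raw_events;

static void counter_enable(struct counter *c)
{
	/* Snapshot the baseline so events that happened before enable
	 * are not charged to this counter. */
	c->prev_count = raw_events;
}

static void counter_update(struct counter *c)
{
	uint64_t now = raw_events;

	c->count += now - c->prev_count;
	c->prev_count = now;
}

int main(void)
{
	struct counter c = { 0, 0 };

	raw_events = 1000;         /* events before the counter was enabled */
	counter_enable(&c);

	raw_events += 25;          /* events while the counter is enabled */
	counter_update(&c);

	printf("counted %llu events\n", (unsigned long long)c.count); /* 25 */
	return 0;
}
```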
 include/linux/sched.h |  2
 kernel/perf_counter.c | 78
 kernel/sched.c        | 17
 3 files changed, 64 insertions(+), 33 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b85b10abf770..1e5f70062a9c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,6 +137,8 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern u64 cpu_nr_switches(int cpu);
+extern u64 cpu_nr_migrations(int cpu);
 
 struct seq_file;
 struct cfs_rq;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f27a7e9f3c41..544193cbc478 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -20,6 +20,8 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
 
 /*
  * Each CPU has a list of per CPU counters:
@@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
-	counter->ctx = ctx;
 	if (!task) {
 		/*
 		 * Per cpu counters are installed via an smp call and
@@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read = task_clock_perf_counter_read,
 };
 
-static u64 get_page_faults(void)
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
+#else
+#define cpu_page_faults()	0
+#endif
+
+static u64 get_page_faults(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->maj_flt + curr->min_flt;
+	if (curr)
+		return curr->maj_flt + curr->min_flt;
+	return cpu_page_faults();
 }
 
 static void page_faults_perf_counter_update(struct perf_counter *counter)
@@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_page_faults();
+	now = get_page_faults(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 
 static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * page-faults is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
 	return 0;
 }
 
@@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = {
 	.read = page_faults_perf_counter_read,
 };
 
-static u64 get_context_switches(void)
+static u64 get_context_switches(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->nvcsw + curr->nivcsw;
+	if (curr)
+		return curr->nvcsw + curr->nivcsw;
+	return cpu_nr_switches(smp_processor_id());
 }
 
 static void context_switches_perf_counter_update(struct perf_counter *counter)
@@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_context_switches();
+	now = get_context_switches(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 
 static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * ->nvcsw + curr->nivcsw is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
 	return 0;
 }
 
@@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.read = context_switches_perf_counter_read,
 };
 
-static inline u64 get_cpu_migrations(void)
+static inline u64 get_cpu_migrations(struct perf_counter *counter)
 {
-	return current->se.nr_migrations;
+	struct task_struct *curr = counter->ctx->task;
+
+	if (curr)
+		return curr->se.nr_migrations;
+	return cpu_nr_migrations(smp_processor_id());
 }
 
 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
@@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_cpu_migrations();
+	now = get_cpu_migrations(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 
 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * se.nr_migrations is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
 	return 0;
 }
 
@@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter)
 		hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
-		hw_ops = &perf_ops_task_clock;
+		/*
+		 * If the user instantiates this as a per-cpu counter,
+		 * use the cpu_clock counter instead.
+		 */
+		if (counter->ctx->task)
+			hw_ops = &perf_ops_task_clock;
+		else
+			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 		hw_ops = &perf_ops_page_faults;
@@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
+		   struct perf_counter_context *ctx,
 		   struct perf_counter *group_leader,
 		   gfp_t gfpflags)
 {
@@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->wakeup_pending = 0;
 	counter->group_leader = group_leader;
 	counter->hw_ops = NULL;
+	counter->ctx = ctx;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
@@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-	if (!hw_ops)
+	else
 		hw_ops = hw_perf_counter_init(counter);
 
 	if (!hw_ops) {
@@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	}
 
 	ret = -EINVAL;
-	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
+	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+				     GFP_KERNEL);
 	if (!counter)
 		goto err_put_context;
 
@@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter,
 		parent_counter = parent_counter->parent;
 
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
-					   parent_counter->cpu, group_leader,
-					   GFP_KERNEL);
+					   parent_counter->cpu, child_ctx,
+					   group_leader, GFP_KERNEL);
 	if (!child_counter)
 		return NULL;
 
 	/*
 	 * Link it up in the child's context:
 	 */
-	child_counter->ctx = child_ctx;
 	child_counter->task = child;
 	list_add_counter(child_counter, child_ctx);
 	child_ctx->nr_counters++;
diff --git a/kernel/sched.c b/kernel/sched.c
index 8db1a4cf2082..173768f142ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -558,6 +558,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2811,6 +2813,21 @@ unsigned long nr_active(void)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+	return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
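
As a rough illustration of the scheduler side of the patch: a migration is
charged to the destination runqueue's nr_migrations_in when a task changes
CPU, and the new accessors simply read the per-cpu totals. The sketch below
mirrors that accounting with plain arrays in user space; the names are
hypothetical stand-ins, not the kernel's struct rq or cpu_rq().

```c
/* Simplified model of per-cpu migration accounting and its accessor. */
#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 4

struct runqueue {                 /* stand-in for struct rq */
	uint64_t nr_switches;
	uint64_t nr_migrations_in;
};

static struct runqueue runqueues[NR_CPUS];

/* Mirrors the accounting added to set_task_cpu(): the destination
 * runqueue counts the migration, and only real migrations count. */
static void migrate_task(int old_cpu, int new_cpu)
{
	if (old_cpu != new_cpu)
		runqueues[new_cpu].nr_migrations_in++;
}

/* Mirrors the new cpu_nr_migrations() accessor. */
static uint64_t cpu_nr_migrations(int cpu)
{
	return runqueues[cpu].nr_migrations_in;
}

int main(void)
{
	migrate_task(0, 2);
	migrate_task(1, 2);
	migrate_task(2, 2);   /* same cpu: not a migration */

	printf("migrations into cpu 2: %llu\n",
	       (unsigned long long)cpu_nr_migrations(2));   /* 2 */
	return 0;
}
```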