author     Paul Mackerras <paulus@samba.org>    2009-02-09 06:42:47 -0500
committer  Ingo Molnar <mingo@elte.hu>          2009-02-09 06:47:16 -0500
commit     23a185ca8abbeef64b6ffc33059b1d630e43ec10 (patch)
tree       c5eb9454ff969377adb40532119240f6fc893fcb
parent     82aa9a1829199233f9bdaf26e2ee271114f4701e (diff)
perf_counters: make software counters work as per-cpu counters
Impact: kernel crash fix

Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software
counter as a per-cpu counter would reliably crash the system, because
it calls __task_delta_exec with a null pointer.  The page fault,
context switch and cpu migration counters also won't function
correctly as per-cpu counters since they reference the current task.

This fixes the problem by redirecting the task_clock counter to the
cpu_clock counter when used as a per-cpu counter, and by implementing
per-cpu page fault, context switch and cpu migration counters.

Along the way, this:

- Initializes counter->ctx earlier, in perf_counter_alloc, so that
  sw_perf_counter_init can use it

- Adds code to kernel/sched.c to count task migrations into each cpu,
  in rq->nr_migrations_in

- Exports the per-cpu context switch and task migration counts via
  new functions added to kernel/sched.c

- Makes sure that if sw_perf_counter_init fails, we don't try to
  initialize the counter as a hardware counter.  Since the user has
  passed a negative, non-raw event type, they clearly don't intend
  for it to be interpreted as a hardware event.

Reported-by: "Zhang Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  include/linux/sched.h  |  2
-rw-r--r--  kernel/perf_counter.c  | 78
-rw-r--r--  kernel/sched.c         | 17
3 files changed, 64 insertions(+), 33 deletions(-)
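For context, the crash scenario described in the commit message corresponds to user space opening a software counter with pid == -1 and an explicit cpu. The sketch below is a hypothetical reproducer, not part of the patch: the syscall argument order (hw_event, pid, cpu, group_fd, flags) and the availability of __NR_perf_counter_open in the exported headers are assumptions about this era's ABI, so check include/linux/perf_counter.h in your tree. Before this change such a counter oopsed in __task_delta_exec(); with it, the counter is transparently backed by the cpu_clock ops.

/*
 * Hypothetical reproducer: open PERF_COUNT_TASK_CLOCK as a per-cpu
 * counter (pid == -1, cpu == 0).  Argument order and the syscall
 * number macro are assumptions based on the 2009-era perf_counter ABI.
 */
#include <linux/perf_counter.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	struct perf_counter_hw_event hw_event;
	long fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = PERF_COUNT_TASK_CLOCK;	/* negative, non-raw sw event */

	/* pid = -1 (per-cpu), cpu = 0, no group leader, no flags */
	fd = syscall(__NR_perf_counter_open, &hw_event, -1, 0, -1, 0);
	if (fd < 0)
		perror("perf_counter_open");
	else
		printf("per-cpu task-clock counter fd = %ld\n", fd);
	return 0;
}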
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b85b10abf770..1e5f70062a9c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,6 +137,8 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern u64 cpu_nr_switches(int cpu);
+extern u64 cpu_nr_migrations(int cpu);
 
 struct seq_file;
 struct cfs_rq;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f27a7e9f3c41..544193cbc478 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -20,6 +20,8 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
 
 /*
  * Each CPU has a list of per CPU counters:
@@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
-	counter->ctx = ctx;
 	if (!task) {
 		/*
 		 * Per cpu counters are installed via an smp call and
@@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read = task_clock_perf_counter_read,
 };
 
-static u64 get_page_faults(void)
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
+#else
+#define cpu_page_faults()	0
+#endif
+
+static u64 get_page_faults(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->maj_flt + curr->min_flt;
+	if (curr)
+		return curr->maj_flt + curr->min_flt;
+	return cpu_page_faults();
 }
 
 static void page_faults_perf_counter_update(struct perf_counter *counter)
@@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_page_faults();
+	now = get_page_faults(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 
 static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * page-faults is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
 	return 0;
 }
 
@@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = {
 	.read = page_faults_perf_counter_read,
 };
 
-static u64 get_context_switches(void)
+static u64 get_context_switches(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->nvcsw + curr->nivcsw;
+	if (curr)
+		return curr->nvcsw + curr->nivcsw;
+	return cpu_nr_switches(smp_processor_id());
 }
 
 static void context_switches_perf_counter_update(struct perf_counter *counter)
@@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_context_switches();
+	now = get_context_switches(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 
 static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * ->nvcsw + curr->nivcsw is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
 	return 0;
 }
 
@@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.read = context_switches_perf_counter_read,
 };
 
-static inline u64 get_cpu_migrations(void)
+static inline u64 get_cpu_migrations(struct perf_counter *counter)
 {
-	return current->se.nr_migrations;
+	struct task_struct *curr = counter->ctx->task;
+
+	if (curr)
+		return curr->se.nr_migrations;
+	return cpu_nr_migrations(smp_processor_id());
 }
 
 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
@@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_cpu_migrations();
+	now = get_cpu_migrations(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 
 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * se.nr_migrations is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
 	return 0;
 }
 
@@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter)
 		hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
-		hw_ops = &perf_ops_task_clock;
+		/*
+		 * If the user instantiates this as a per-cpu counter,
+		 * use the cpu_clock counter instead.
+		 */
+		if (counter->ctx->task)
+			hw_ops = &perf_ops_task_clock;
+		else
+			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 		hw_ops = &perf_ops_page_faults;
@@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
+		   struct perf_counter_context *ctx,
 		   struct perf_counter *group_leader,
 		   gfp_t gfpflags)
 {
@@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->wakeup_pending = 0;
 	counter->group_leader = group_leader;
 	counter->hw_ops = NULL;
+	counter->ctx = ctx;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
@@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-	if (!hw_ops)
+	else
 		hw_ops = hw_perf_counter_init(counter);
 
 	if (!hw_ops) {
@@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	}
 
 	ret = -EINVAL;
-	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
+	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+				     GFP_KERNEL);
 	if (!counter)
 		goto err_put_context;
 
@@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter,
 		parent_counter = parent_counter->parent;
 
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
-					   parent_counter->cpu, group_leader,
-					   GFP_KERNEL);
+					   parent_counter->cpu, child_ctx,
+					   group_leader, GFP_KERNEL);
 	if (!child_counter)
 		return NULL;
 
 	/*
 	 * Link it up in the child's context:
 	 */
-	child_counter->ctx = child_ctx;
 	child_counter->task = child;
 	list_add_counter(child_counter, child_ctx);
 	child_ctx->nr_counters++;
diff --git a/kernel/sched.c b/kernel/sched.c
index 8db1a4cf2082..173768f142ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -558,6 +558,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2811,6 +2813,21 @@ unsigned long nr_active(void)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+	return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
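A note on the enable-path change repeated above for the page fault, context switch and cpu migration counters: the per-cpu sources (cpu_page_faults(), cpu_nr_switches(), cpu_nr_migrations()) return lifetime totals for the cpu, so the enable callbacks now snapshot the current total into hw.prev_count and the update callbacks accumulate only the delta since that snapshot. The stand-alone sketch below mirrors that prev_count/delta bookkeeping in user space; the names and types here are illustrative, not kernel API.

/*
 * Userspace analogue of the prev_count/delta pattern used by the
 * software counters in this patch (illustrative only).
 */
#include <stdint.h>
#include <stdio.h>

struct sw_counter {
	uint64_t prev_count;	/* last raw total we saw */
	int64_t count;		/* events accumulated since enable */
};

static void sw_counter_enable(struct sw_counter *c, uint64_t raw_now)
{
	c->prev_count = raw_now;	/* mirrors the new *_enable() callbacks */
	c->count = 0;
}

static void sw_counter_update(struct sw_counter *c, uint64_t raw_now)
{
	int64_t delta = (int64_t)(raw_now - c->prev_count);

	c->prev_count = raw_now;
	c->count += delta;		/* mirrors *_perf_counter_update() */
}

int main(void)
{
	struct sw_counter c;

	sw_counter_enable(&c, 1000);	/* the cpu already saw 1000 events */
	sw_counter_update(&c, 1010);
	sw_counter_update(&c, 1025);
	printf("events since enable: %lld\n", (long long)c.count);	/* 25 */
	return 0;
}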