author		Paul Mackerras <paulus@samba.org>	2009-01-14 05:00:30 -0500
committer	Paul Mackerras <paulus@samba.org>	2009-01-14 05:00:30 -0500
commit		3b6f9e5cb21964b7ce12bf81076f830885563ec8 (patch)
tree		e9d5ecffafa66cc3aeb259ade15a2611ad795327	/kernel/perf_counter.c
parent		01d0287f068de2934109ba9b989d8807526cccc2 (diff)
perf_counter: Add support for pinned and exclusive counter groups
Impact: New perf_counter features

A pinned counter group is one that the user wants to have on the CPU
whenever possible, i.e. whenever the associated task is running, for a
per-task group, or always for a per-cpu group. If the system cannot
satisfy that, it puts the group into an error state where it is not
scheduled any more and reads from it return EOF (i.e. 0 bytes read).
The group can be released from error state and made readable again
using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained
enable/disable controls on counters we'll be able to reset the error
state on individual groups.

An exclusive group is one that the user wants to be the only group
using the CPU performance monitor hardware whenever it is on. The
counter group scheduler will not schedule an exclusive group if there
are already other groups on the CPU and will not schedule other groups
onto the CPU if there is an exclusive group scheduled (that statement
does not apply to groups containing only software counters, which can
always go on and which do not prevent an exclusive group from going
on). With an exclusive group, we will be able to let users program PMU
registers at a low level without the concern that those settings will
perturb other measurements.

Along the way this reorganizes things a little:

- is_software_counter() is moved to perf_counter.h.
- cpuctx->active_oncpu now records the number of hardware counters on
  the CPU, i.e. it now excludes software counters. Nothing was reading
  cpuctx->active_oncpu before, so this change is harmless.
- A new cpuctx->exclusive field records whether we currently have an
  exclusive group on the CPU.
- counter_sched_out moves higher up in perf_counter.c and gets called
  from __perf_counter_remove_from_context and __perf_counter_exit_task,
  where we used to have essentially the same code.
- __perf_counter_sched_in now goes through the counter list twice, doing
  the pinned counters in the first loop and the non-pinned counters in
  the second loop, in order to give the pinned counters the best chance
  to be scheduled in.

Note that only a group leader can be exclusive or pinned, and that
attribute applies to the whole group. This avoids some awkwardness in
some corner cases (e.g. where a group leader is closed and the other
group members get added to the context list). If we want to relax that
restriction later, we can, and it is easier to relax a restriction than
to apply a new one.

This doesn't yet handle the case where a pinned counter is inherited
and goes into error state in the child - the error state is not
propagated up to the parent when the child exits, and arguably it
should.

Signed-off-by: Paul Mackerras <paulus@samba.org>
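For illustration only (not part of this commit): a minimal userspace sketch of the pinned-group semantics described above. It assumes the counter file descriptor was obtained from the perf_counter_open syscall of this era with hw_event.pinned set on the group leader, and that PR_TASK_PERF_COUNTERS_ENABLE carries the prctl value used by this patch series; both are assumptions, not definitions taken from this diff.

/*
 * Sketch only: how a user of a pinned counter group would observe the
 * new error-state behaviour.  read() returning 0 (EOF) means the pinned
 * group was put into PERF_COUNTER_STATE_ERROR; prctl with
 * PR_TASK_PERF_COUNTERS_ENABLE makes the task's counters schedulable
 * and readable again.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_TASK_PERF_COUNTERS_ENABLE
#define PR_TASK_PERF_COUNTERS_ENABLE	32	/* assumed value for this era */
#endif

/*
 * 'fd' is assumed to come from the perf_counter_open syscall with
 * hw_event.pinned = 1 on the group leader.
 */
static int read_pinned_counter(int fd, uint64_t *value)
{
	ssize_t n = read(fd, value, sizeof(*value));

	if (n == 0) {
		/* EOF: the pinned group went into error state. */
		if (prctl(PR_TASK_PERF_COUNTERS_ENABLE) < 0)
			return -1;
		n = read(fd, value, sizeof(*value));
	}
	return (n == (ssize_t)sizeof(*value)) ? 0 : -1;
}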
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	226
1 file changed, 154 insertions(+), 72 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52f2f526248e..faf671b29566 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	}
 }
 
+static void
+counter_sched_out(struct perf_counter *counter,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_counter_context *ctx)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->hw_ops->disable(counter);
+	counter->oncpu = -1;
+
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu--;
+	ctx->nr_active--;
+	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+		cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info)
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
 
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->hw_ops->disable(counter);
-		ctx->nr_active--;
-		cpuctx->active_oncpu--;
-		counter->task = NULL;
-		counter->oncpu = -1;
-	}
+	counter_sched_out(counter, cpuctx, ctx);
+
+	counter->task = NULL;
 	ctx->nr_counters--;
 
 	/*
@@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state <= PERF_COUNTER_STATE_OFF)
 		return 0;
 
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
@@ -223,13 +237,64 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
-	cpuctx->active_oncpu++;
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu++;
 	ctx->nr_active++;
 
+	if (counter->hw_event.exclusive)
+		cpuctx->exclusive = 1;
+
 	return 0;
 }
 
 /*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	if (!is_software_counter(leader))
+		return 0;
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		if (!is_software_counter(counter))
+			return 0;
+	return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+			   struct perf_cpu_context *cpuctx,
+			   int can_add_hw)
+{
+	/*
+	 * Groups consisting entirely of software counters can always go on.
+	 */
+	if (is_software_only_group(counter))
+		return 1;
+	/*
+	 * If an exclusive group is already on, no other hardware
+	 * counters can go on.
+	 */
+	if (cpuctx->exclusive)
+		return 0;
+	/*
+	 * If this group is exclusive and there are already
+	 * counters on the CPU, it can't go on.
+	 */
+	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+		return 0;
+	/*
+	 * Otherwise, try to add it if all previous groups were able
+	 * to go on.
+	 */
+	return can_add_hw;
+}
+
+/*
  * Cross CPU call to install and enable a performance counter
  */
 static void __perf_install_in_context(void *info)
@@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info)
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	u64 perf_flags;
+	int err;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info)
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 
-	counter_sched_in(counter, cpuctx, ctx, cpu);
+	/*
+	 * An exclusive counter can't go on if there are already active
+	 * hardware counters, and no hardware counter can go on if there
+	 * is already an exclusive counter on.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE &&
+	    !group_can_go_on(counter, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+	if (err && counter->hw_event.pinned)
+		counter->state = PERF_COUNTER_STATE_ERROR;
 
-	if (!ctx->task && cpuctx->max_pertask)
+	if (!err && !ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
 	hw_perf_restore(perf_flags);
@@ -327,22 +405,6 @@ retry:
 }
 
 static void
-counter_sched_out(struct perf_counter *counter,
-		  struct perf_cpu_context *cpuctx,
-		  struct perf_counter_context *ctx)
-{
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return;
-
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->hw_ops->disable(counter);
-	counter->oncpu = -1;
-
-	cpuctx->active_oncpu--;
-	ctx->nr_active--;
-}
-
-static void
 group_sched_out(struct perf_counter *group_counter,
 		struct perf_cpu_context *cpuctx,
 		struct perf_counter_context *ctx)
@@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter,
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 		counter_sched_out(counter, cpuctx, ctx);
+
+	if (group_counter->hw_event.exclusive)
+		cpuctx->exclusive = 0;
 }
 
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
@@ -455,30 +520,6 @@ group_error:
 	return -EAGAIN;
 }
 
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
-}
-
-/*
- * Return 1 for a group consisting entirely of software counters,
- * 0 if the group contains any hardware counters.
- */
-static int is_software_only_group(struct perf_counter *leader)
-{
-	struct perf_counter *counter;
-
-	if (!is_software_counter(leader))
-		return 0;
-	list_for_each_entry(counter, &leader->sibling_list, list_entry)
-		if (!is_software_counter(counter))
-			return 0;
-	return 1;
-}
-
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
@@ -492,22 +533,49 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    !counter->hw_event.pinned)
+			continue;
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		if (group_can_go_on(counter, cpuctx, 1))
+			group_sched_in(counter, cpuctx, ctx, cpu);
+
+		/*
+		 * If this pinned group hasn't been scheduled,
+		 * put it in error state.
+		 */
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			counter->state = PERF_COUNTER_STATE_ERROR;
+	}
+
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		/*
+		 * Ignore counters in OFF or ERROR state, and
+		 * ignore pinned counters since we did them already.
+		 */
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    counter->hw_event.pinned)
+			continue;
+
+		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of counters:
 		 */
		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		/*
-		 * If we scheduled in a group atomically and exclusively,
-		 * or if this group can't go on, don't add any more
-		 * hardware counters.
-		 */
-		if (can_add_hw || is_software_only_group(counter))
+		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 			if (group_sched_in(counter, cpuctx, ctx, cpu))
 				can_add_hw = 0;
+		}
 	}
 	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
@@ -567,8 +635,10 @@ int perf_counter_task_disable(void)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry)
-		counter->state = PERF_COUNTER_STATE_OFF;
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state != PERF_COUNTER_STATE_ERROR)
+			counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	hw_perf_restore(perf_flags);
 
@@ -607,7 +677,7 @@ int perf_counter_task_enable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_OFF)
+		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->hw_event.disabled = 0;
@@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (count != sizeof(cntval))
 		return -EINVAL;
 
+	/*
+	 * Return end-of-file for a read on a counter that is in
+	 * error state (i.e. because it was pinned but it couldn't be
+	 * scheduled on to the CPU at some point).
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		return 0;
+
 	mutex_lock(&counter->mutex);
 	cntval = perf_counter_read(counter);
 	mutex_unlock(&counter->mutex);
@@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter *counter,
 {
 	struct perf_data *irqdata, *usrdata;
 	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res;
+	ssize_t res, res2;
 
 	irqdata = counter->irqdata;
 	usrdata = counter->usrdata;
@@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter *counter,
 		if (signal_pending(current))
 			break;
 
+		if (counter->state == PERF_COUNTER_STATE_ERROR)
+			break;
+
 		spin_unlock_irq(&counter->waitq.lock);
 		schedule();
 		spin_lock_irq(&counter->waitq.lock);
@@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter *counter,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock_irq(&counter->waitq.lock);
 
-	if (usrdata->len + irqdata->len < count)
+	if (usrdata->len + irqdata->len < count &&
+	    counter->state != PERF_COUNTER_STATE_ERROR)
 		return -ERESTARTSYS;
 read_pending:
 	mutex_lock(&counter->mutex);
@@ -925,11 +1007,12 @@ read_pending:
 
 	/* Switch irq buffer: */
 	usrdata = perf_switch_irq_data(counter);
-	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
+	if (res2 < 0) {
 		if (!res)
 			res = -EFAULT;
 	} else {
-		res = count;
+		res += res2;
 	}
 out:
 	mutex_unlock(&counter->mutex);
@@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 		 */
 		if (group_leader->ctx != ctx)
 			goto err_put_context;
+		/*
+		 * Only a group leader can be exclusive or pinned
+		 */
+		if (hw_event.exclusive || hw_event.pinned)
+			goto err_put_context;
 	}
 
 	ret = -EINVAL;
@@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 	cpuctx = &__get_cpu_var(perf_cpu_context);
 
-	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-		child_counter->hw_ops->disable(child_counter);
-		cpuctx->active_oncpu--;
-		child_ctx->nr_active--;
-		child_counter->oncpu = -1;
-	}
+	counter_sched_out(child_counter, cpuctx, child_ctx);
 
 	list_del_init(&child_counter->list_entry);
 