Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c  638
1 files changed, 603 insertions, 35 deletions
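
The patch below adds cgroup-aware monitoring (CONFIG_CGROUP_PERF) to the perf core: events can be constrained to a cgroup, per-cgroup time is accounted per CPU, and perf_event_open() gains a PERF_FLAG_PID_CGROUP mode in which the pid argument carries a file descriptor opened on a cgroup directory while the cpu argument must name a CPU (cgroup events only exist in per-cpu, system-wide mode). As an illustration of the user-facing side, the sketch below is not part of the patch; it assumes the perf_event cgroup subsystem is mounted at /sys/fs/cgroup/perf_event, that a cgroup named "test" already exists there, and that the caller is privileged enough for system-wide events. The fallback #define uses the flag value from the matching include/linux/perf_event.h change.

/*
 * Illustrative sketch (not from the patch): count CPU cycles for tasks
 * of one cgroup while they run on CPU 0, using PERF_FLAG_PID_CGROUP.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP	(1U << 2)	/* pid carries a cgroup fd */
#endif

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int cgrp_fd, ev_fd;

	/* in cgroup mode, "pid" is an fd to the cgroup directory */
	cgrp_fd = open("/sys/fs/cgroup/perf_event/test", O_RDONLY);
	if (cgrp_fd < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* cpu must be >= 0: the kernel rejects cpu == -1 for cgroup events */
	ev_fd = sys_perf_event_open(&attr, cgrp_fd, 0, -1, PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0)
		return 1;

	sleep(1);	/* let the cgroup's tasks run for a while */
	read(ev_fd, &count, sizeof(count));
	printf("cycles on cpu0 for cgroup: %llu\n", (unsigned long long)count);

	close(ev_fd);
	close(cgrp_fd);
	return 0;
}

Covering the whole machine this way means opening one such event per CPU; the example pins to CPU 0 only because the syscall refuses cpu == -1 in cgroup mode.
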
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d3f282fa50e..65dcdc76d709 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
111 return data.ret; 111 return data.ret;
112} 112}
113 113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
114enum event_type_t { 118enum event_type_t {
115 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
116 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
117 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
118}; 122};
119 123
120atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
121static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
122static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
123static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
148 enum event_type_t event_type); 158 enum event_type_t event_type);
149 159
150static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 160static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
151 enum event_type_t event_type); 161 enum event_type_t event_type,
162 struct task_struct *task);
163
164static void update_context_time(struct perf_event_context *ctx);
165static u64 perf_event_time(struct perf_event *event);
152 166
153void __weak perf_event_print_debug(void) { } 167void __weak perf_event_print_debug(void) { }
154 168
@@ -162,6 +176,338 @@ static inline u64 perf_clock(void)
162 return local_clock(); 176 return local_clock();
163} 177}
164 178
179static inline struct perf_cpu_context *
180__get_cpu_context(struct perf_event_context *ctx)
181{
182 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
183}
184
185#ifdef CONFIG_CGROUP_PERF
186
187static inline struct perf_cgroup *
188perf_cgroup_from_task(struct task_struct *task)
189{
190 return container_of(task_subsys_state(task, perf_subsys_id),
191 struct perf_cgroup, css);
192}
193
194static inline bool
195perf_cgroup_match(struct perf_event *event)
196{
197 struct perf_event_context *ctx = event->ctx;
198 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
199
200 return !event->cgrp || event->cgrp == cpuctx->cgrp;
201}
202
203static inline void perf_get_cgroup(struct perf_event *event)
204{
205 css_get(&event->cgrp->css);
206}
207
208static inline void perf_put_cgroup(struct perf_event *event)
209{
210 css_put(&event->cgrp->css);
211}
212
213static inline void perf_detach_cgroup(struct perf_event *event)
214{
215 perf_put_cgroup(event);
216 event->cgrp = NULL;
217}
218
219static inline int is_cgroup_event(struct perf_event *event)
220{
221 return event->cgrp != NULL;
222}
223
224static inline u64 perf_cgroup_event_time(struct perf_event *event)
225{
226 struct perf_cgroup_info *t;
227
228 t = per_cpu_ptr(event->cgrp->info, event->cpu);
229 return t->time;
230}
231
232static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
233{
234 struct perf_cgroup_info *info;
235 u64 now;
236
237 now = perf_clock();
238
239 info = this_cpu_ptr(cgrp->info);
240
241 info->time += now - info->timestamp;
242 info->timestamp = now;
243}
244
245static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
246{
247 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
248 if (cgrp_out)
249 __update_cgrp_time(cgrp_out);
250}
251
252static inline void update_cgrp_time_from_event(struct perf_event *event)
253{
254 struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
255 /*
256 * do not update time when cgroup is not active
257 */
258 if (!event->cgrp || cgrp != event->cgrp)
259 return;
260
261 __update_cgrp_time(event->cgrp);
262}
263
264static inline void
265perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
266{
267 struct perf_cgroup *cgrp;
268 struct perf_cgroup_info *info;
269
270 if (!task)
271 return;
272
273 cgrp = perf_cgroup_from_task(task);
274 info = this_cpu_ptr(cgrp->info);
275 info->timestamp = now;
276}
277
278#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
279#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
280
281/*
282 * reschedule events based on the cgroup constraint of the task.
283 *
284 * mode SWOUT : schedule out everything
285 * mode SWIN : schedule in based on the cgroup of the next task
286 */
287void perf_cgroup_switch(struct task_struct *task, int mode)
288{
289 struct perf_cpu_context *cpuctx;
290 struct pmu *pmu;
291 unsigned long flags;
292
293 /*
294 * disable interrupts to avoid getting nr_cgroup
295 * changes via __perf_event_disable(). Also
296 * avoids preemption.
297 */
298 local_irq_save(flags);
299
300 /*
301 * we reschedule only in the presence of cgroup
302 * constrained events.
303 */
304 rcu_read_lock();
305
306 list_for_each_entry_rcu(pmu, &pmus, entry) {
307
308 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
309
310 perf_pmu_disable(cpuctx->ctx.pmu);
311
312 /*
313 * perf_cgroup_events says at least one
314 * context on this CPU has cgroup events.
315 *
316 * ctx->nr_cgroups reports the number of cgroup
317 * events for a context.
318 */
319 if (cpuctx->ctx.nr_cgroups > 0) {
320
321 if (mode & PERF_CGROUP_SWOUT) {
322 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
323 /*
324 * must not be done before ctxswout due
325 * to event_filter_match() in event_sched_out()
326 */
327 cpuctx->cgrp = NULL;
328 }
329
330 if (mode & PERF_CGROUP_SWIN) {
331 /* set cgrp before ctxsw in to
332 * allow event_filter_match() to not
333 * have to pass task around
334 */
335 cpuctx->cgrp = perf_cgroup_from_task(task);
336 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
337 }
338 }
339
340 perf_pmu_enable(cpuctx->ctx.pmu);
341 }
342
343 rcu_read_unlock();
344
345 local_irq_restore(flags);
346}
347
348static inline void perf_cgroup_sched_out(struct task_struct *task)
349{
350 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
351}
352
353static inline void perf_cgroup_sched_in(struct task_struct *task)
354{
355 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
356}
357
358static inline int perf_cgroup_connect(int fd, struct perf_event *event,
359 struct perf_event_attr *attr,
360 struct perf_event *group_leader)
361{
362 struct perf_cgroup *cgrp;
363 struct cgroup_subsys_state *css;
364 struct file *file;
365 int ret = 0, fput_needed;
366
367 file = fget_light(fd, &fput_needed);
368 if (!file)
369 return -EBADF;
370
371 css = cgroup_css_from_dir(file, perf_subsys_id);
372 if (IS_ERR(css))
373 return PTR_ERR(css);
374
375 cgrp = container_of(css, struct perf_cgroup, css);
376 event->cgrp = cgrp;
377
378 /*
379 * all events in a group must monitor
380 * the same cgroup because a task belongs
381 * to only one perf cgroup at a time
382 */
383 if (group_leader && group_leader->cgrp != cgrp) {
384 perf_detach_cgroup(event);
385 ret = -EINVAL;
386 } else {
387 /* must be done before we fput() the file */
388 perf_get_cgroup(event);
389 }
390 fput_light(file, fput_needed);
391 return ret;
392}
393
394static inline void
395perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
396{
397 struct perf_cgroup_info *t;
398 t = per_cpu_ptr(event->cgrp->info, event->cpu);
399 event->shadow_ctx_time = now - t->timestamp;
400}
401
402static inline void
403perf_cgroup_defer_enabled(struct perf_event *event)
404{
405 /*
406 * when the current task's perf cgroup does not match
407 * the event's, we need to remember to call the
408 * perf_cgroup_mark_enabled() function the first time a task with
409 * a matching perf cgroup is scheduled in.
410 */
411 if (is_cgroup_event(event) && !perf_cgroup_match(event))
412 event->cgrp_defer_enabled = 1;
413}
414
415static inline void
416perf_cgroup_mark_enabled(struct perf_event *event,
417 struct perf_event_context *ctx)
418{
419 struct perf_event *sub;
420 u64 tstamp = perf_event_time(event);
421
422 if (!event->cgrp_defer_enabled)
423 return;
424
425 event->cgrp_defer_enabled = 0;
426
427 event->tstamp_enabled = tstamp - event->total_time_enabled;
428 list_for_each_entry(sub, &event->sibling_list, group_entry) {
429 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
430 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
431 sub->cgrp_defer_enabled = 0;
432 }
433 }
434}
435#else /* !CONFIG_CGROUP_PERF */
436
437static inline bool
438perf_cgroup_match(struct perf_event *event)
439{
440 return true;
441}
442
443static inline void perf_detach_cgroup(struct perf_event *event)
444{}
445
446static inline int is_cgroup_event(struct perf_event *event)
447{
448 return 0;
449}
450
451static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
452{
453 return 0;
454}
455
456static inline void update_cgrp_time_from_event(struct perf_event *event)
457{
458}
459
460static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
461{
462}
463
464static inline void perf_cgroup_sched_out(struct task_struct *task)
465{
466}
467
468static inline void perf_cgroup_sched_in(struct task_struct *task)
469{
470}
471
472static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
473 struct perf_event_attr *attr,
474 struct perf_event *group_leader)
475{
476 return -EINVAL;
477}
478
479static inline void
480perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
481{
482}
483
484void
485perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
486{
487}
488
489static inline void
490perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
491{
492}
493
494static inline u64 perf_cgroup_event_time(struct perf_event *event)
495{
496 return 0;
497}
498
499static inline void
500perf_cgroup_defer_enabled(struct perf_event *event)
501{
502}
503
504static inline void
505perf_cgroup_mark_enabled(struct perf_event *event,
506 struct perf_event_context *ctx)
507{
508}
509#endif
510
165void perf_pmu_disable(struct pmu *pmu) 511void perf_pmu_disable(struct pmu *pmu)
166{ 512{
167 int *count = this_cpu_ptr(pmu->pmu_disable_count); 513 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx)
343static u64 perf_event_time(struct perf_event *event) 689static u64 perf_event_time(struct perf_event *event)
344{ 690{
345 struct perf_event_context *ctx = event->ctx; 691 struct perf_event_context *ctx = event->ctx;
692
693 if (is_cgroup_event(event))
694 return perf_cgroup_event_time(event);
695
346 return ctx ? ctx->time : 0; 696 return ctx ? ctx->time : 0;
347} 697}
348 698
@@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event)
357 if (event->state < PERF_EVENT_STATE_INACTIVE || 707 if (event->state < PERF_EVENT_STATE_INACTIVE ||
358 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 708 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
359 return; 709 return;
360 710 /*
361 if (ctx->is_active) 711 * in cgroup mode, time_enabled represents
712 * the time the event was enabled AND active
713 * tasks were in the monitored cgroup. This is
714 * independent of the activity of the context as
715 * there may be a mix of cgroup and non-cgroup events.
716 *
717 * That is why we treat cgroup events differently
718 * here.
719 */
720 if (is_cgroup_event(event))
362 run_end = perf_event_time(event); 721 run_end = perf_event_time(event);
722 else if (ctx->is_active)
723 run_end = ctx->time;
363 else 724 else
364 run_end = event->tstamp_stopped; 725 run_end = event->tstamp_stopped;
365 726
@@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event)
371 run_end = perf_event_time(event); 732 run_end = perf_event_time(event);
372 733
373 event->total_time_running = run_end - event->tstamp_running; 734 event->total_time_running = run_end - event->tstamp_running;
735
374} 736}
375 737
376/* 738/*
@@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
419 list_add_tail(&event->group_entry, list); 781 list_add_tail(&event->group_entry, list);
420 } 782 }
421 783
784 if (is_cgroup_event(event)) {
785 ctx->nr_cgroups++;
786 /*
787 * one more event:
788 * - that has cgroup constraint on event->cpu
789 * - that may need work on context switch
790 */
791 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
792 jump_label_inc(&perf_sched_events);
793 }
794
422 list_add_rcu(&event->event_entry, &ctx->event_list); 795 list_add_rcu(&event->event_entry, &ctx->event_list);
423 if (!ctx->nr_events) 796 if (!ctx->nr_events)
424 perf_pmu_rotate_start(ctx->pmu); 797 perf_pmu_rotate_start(ctx->pmu);
@@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
545 918
546 event->attach_state &= ~PERF_ATTACH_CONTEXT; 919 event->attach_state &= ~PERF_ATTACH_CONTEXT;
547 920
921 if (is_cgroup_event(event)) {
922 ctx->nr_cgroups--;
923 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
924 jump_label_dec(&perf_sched_events);
925 }
926
548 ctx->nr_events--; 927 ctx->nr_events--;
549 if (event->attr.inherit_stat) 928 if (event->attr.inherit_stat)
550 ctx->nr_stat--; 929 ctx->nr_stat--;
@@ -616,7 +995,8 @@ out:
616static inline int 995static inline int
617event_filter_match(struct perf_event *event) 996event_filter_match(struct perf_event *event)
618{ 997{
619 return event->cpu == -1 || event->cpu == smp_processor_id(); 998 return (event->cpu == -1 || event->cpu == smp_processor_id())
999 && perf_cgroup_match(event);
620} 1000}
621 1001
622static void 1002static void
@@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event,
634 */ 1014 */
635 if (event->state == PERF_EVENT_STATE_INACTIVE 1015 if (event->state == PERF_EVENT_STATE_INACTIVE
636 && !event_filter_match(event)) { 1016 && !event_filter_match(event)) {
637 delta = ctx->time - event->tstamp_stopped; 1017 delta = tstamp - event->tstamp_stopped;
638 event->tstamp_running += delta; 1018 event->tstamp_running += delta;
639 event->tstamp_stopped = tstamp; 1019 event->tstamp_stopped = tstamp;
640 } 1020 }
@@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event,
678 cpuctx->exclusive = 0; 1058 cpuctx->exclusive = 0;
679} 1059}
680 1060
681static inline struct perf_cpu_context *
682__get_cpu_context(struct perf_event_context *ctx)
683{
684 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
685}
686
687/* 1061/*
688 * Cross CPU call to remove a performance event 1062 * Cross CPU call to remove a performance event
689 * 1063 *
@@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info)
783 */ 1157 */
784 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1158 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
785 update_context_time(ctx); 1159 update_context_time(ctx);
1160 update_cgrp_time_from_event(event);
786 update_group_times(event); 1161 update_group_times(event);
787 if (event == event->group_leader) 1162 if (event == event->group_leader)
788 group_sched_out(event, cpuctx, ctx); 1163 group_sched_out(event, cpuctx, ctx);
@@ -851,6 +1226,41 @@ retry:
851 raw_spin_unlock_irq(&ctx->lock); 1226 raw_spin_unlock_irq(&ctx->lock);
852} 1227}
853 1228
1229static void perf_set_shadow_time(struct perf_event *event,
1230 struct perf_event_context *ctx,
1231 u64 tstamp)
1232{
1233 /*
1234 * use the correct time source for the time snapshot
1235 *
1236 * We could get by without this by leveraging the
1237 * fact that to get to this function, the caller
1238 * has most likely already called update_context_time()
1239 * and update_cgrp_time_xx() and thus both timestamps
1240 * are identical (or very close). Given that tstamp is
1241 * already adjusted for cgroup, we could say that:
1242 * tstamp - ctx->timestamp
1243 * is equivalent to
1244 * tstamp - cgrp->timestamp.
1245 *
1246 * Then, in perf_output_read(), the calculation would
1247 * work with no changes because:
1248 * - event is guaranteed scheduled in
1249 * - it is not scheduled out in between
1250 * - thus the timestamp would be the same
1251 *
1252 * But this is a bit hairy.
1253 *
1254 * So instead, we have an explicit cgroup call to remain
1255 * within the same time source all along. We believe it
1256 * is cleaner and simpler to understand.
1257 */
1258 if (is_cgroup_event(event))
1259 perf_cgroup_set_shadow_time(event, tstamp);
1260 else
1261 event->shadow_ctx_time = tstamp - ctx->timestamp;
1262}
1263
854#define MAX_INTERRUPTS (~0ULL) 1264#define MAX_INTERRUPTS (~0ULL)
855 1265
856static void perf_log_throttle(struct perf_event *event, int enable); 1266static void perf_log_throttle(struct perf_event *event, int enable);
@@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event,
891 1301
892 event->tstamp_running += tstamp - event->tstamp_stopped; 1302 event->tstamp_running += tstamp - event->tstamp_stopped;
893 1303
894 event->shadow_ctx_time = tstamp - ctx->timestamp; 1304 perf_set_shadow_time(event, ctx, tstamp);
895 1305
896 if (!is_software_event(event)) 1306 if (!is_software_event(event))
897 cpuctx->active_oncpu++; 1307 cpuctx->active_oncpu++;
@@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event,
1012 event->tstamp_stopped = tstamp; 1422 event->tstamp_stopped = tstamp;
1013} 1423}
1014 1424
1015static void perf_event_context_sched_in(struct perf_event_context *ctx); 1425static void perf_event_context_sched_in(struct perf_event_context *ctx,
1426 struct task_struct *tsk);
1016 1427
1017/* 1428/*
1018 * Cross CPU call to install and enable a performance event 1429 * Cross CPU call to install and enable a performance event
@@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info)
1033 * which do context switches with IRQs enabled. 1444 * which do context switches with IRQs enabled.
1034 */ 1445 */
1035 if (ctx->task && !cpuctx->task_ctx) 1446 if (ctx->task && !cpuctx->task_ctx)
1036 perf_event_context_sched_in(ctx); 1447 perf_event_context_sched_in(ctx, ctx->task);
1037 1448
1038 raw_spin_lock(&ctx->lock); 1449 raw_spin_lock(&ctx->lock);
1039 ctx->is_active = 1; 1450 ctx->is_active = 1;
1040 update_context_time(ctx); 1451 update_context_time(ctx);
1452 /*
1453 * update cgrp time only if current cgrp
1454 * matches event->cgrp. Must be done before
1455 * calling add_event_to_ctx()
1456 */
1457 update_cgrp_time_from_event(event);
1041 1458
1042 add_event_to_ctx(event, ctx); 1459 add_event_to_ctx(event, ctx);
1043 1460
@@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info)
1175 1592
1176 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1593 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1177 goto unlock; 1594 goto unlock;
1595
1596 /*
1597 * set current task's cgroup time reference point
1598 */
1599 perf_cgroup_set_timestamp(current, perf_clock());
1600
1178 __perf_event_mark_enabled(event, ctx); 1601 __perf_event_mark_enabled(event, ctx);
1179 1602
1180 if (!event_filter_match(event)) 1603 if (!event_filter_match(event)) {
1604 if (is_cgroup_event(event))
1605 perf_cgroup_defer_enabled(event);
1181 goto unlock; 1606 goto unlock;
1607 }
1182 1608
1183 /* 1609 /*
1184 * If the event is in a group and isn't the group leader, 1610 * If the event is in a group and isn't the group leader,
@@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1307 if (likely(!ctx->nr_events)) 1733 if (likely(!ctx->nr_events))
1308 goto out; 1734 goto out;
1309 update_context_time(ctx); 1735 update_context_time(ctx);
1736 update_cgrp_time_from_cpuctx(cpuctx);
1310 1737
1311 if (!ctx->nr_active) 1738 if (!ctx->nr_active)
1312 goto out; 1739 goto out;
@@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1496 1923
1497 for_each_task_context_nr(ctxn) 1924 for_each_task_context_nr(ctxn)
1498 perf_event_context_sched_out(task, ctxn, next); 1925 perf_event_context_sched_out(task, ctxn, next);
1926
1927 /*
1928 * if cgroup events exist on this CPU, then we need
1929 * to check if we have to switch out PMU state.
1930 * cgroup events are system-wide mode only
1931 */
1932 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1933 perf_cgroup_sched_out(task);
1499} 1934}
1500 1935
1501static void task_ctx_sched_out(struct perf_event_context *ctx, 1936static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1534 if (!event_filter_match(event)) 1969 if (!event_filter_match(event))
1535 continue; 1970 continue;
1536 1971
1972 /* may need to reset tstamp_enabled */
1973 if (is_cgroup_event(event))
1974 perf_cgroup_mark_enabled(event, ctx);
1975
1537 if (group_can_go_on(event, cpuctx, 1)) 1976 if (group_can_go_on(event, cpuctx, 1))
1538 group_sched_in(event, cpuctx, ctx); 1977 group_sched_in(event, cpuctx, ctx);
1539 1978
@@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1566 if (!event_filter_match(event)) 2005 if (!event_filter_match(event))
1567 continue; 2006 continue;
1568 2007
2008 /* may need to reset tstamp_enabled */
2009 if (is_cgroup_event(event))
2010 perf_cgroup_mark_enabled(event, ctx);
2011
1569 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2012 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1570 if (group_sched_in(event, cpuctx, ctx)) 2013 if (group_sched_in(event, cpuctx, ctx))
1571 can_add_hw = 0; 2014 can_add_hw = 0;
@@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1576static void 2019static void
1577ctx_sched_in(struct perf_event_context *ctx, 2020ctx_sched_in(struct perf_event_context *ctx,
1578 struct perf_cpu_context *cpuctx, 2021 struct perf_cpu_context *cpuctx,
1579 enum event_type_t event_type) 2022 enum event_type_t event_type,
2023 struct task_struct *task)
1580{ 2024{
2025 u64 now;
2026
1581 raw_spin_lock(&ctx->lock); 2027 raw_spin_lock(&ctx->lock);
1582 ctx->is_active = 1; 2028 ctx->is_active = 1;
1583 if (likely(!ctx->nr_events)) 2029 if (likely(!ctx->nr_events))
1584 goto out; 2030 goto out;
1585 2031
1586 ctx->timestamp = perf_clock(); 2032 now = perf_clock();
1587 2033 ctx->timestamp = now;
2034 perf_cgroup_set_timestamp(task, now);
1588 /* 2035 /*
1589 * First go through the list and put on any pinned groups 2036 * First go through the list and put on any pinned groups
1590 * in order to give them the best chance of going on. 2037 * in order to give them the best chance of going on.
@@ -1601,11 +2048,12 @@ out:
1601} 2048}
1602 2049
1603static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2050static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1604 enum event_type_t event_type) 2051 enum event_type_t event_type,
2052 struct task_struct *task)
1605{ 2053{
1606 struct perf_event_context *ctx = &cpuctx->ctx; 2054 struct perf_event_context *ctx = &cpuctx->ctx;
1607 2055
1608 ctx_sched_in(ctx, cpuctx, event_type); 2056 ctx_sched_in(ctx, cpuctx, event_type, task);
1609} 2057}
1610 2058
1611static void task_ctx_sched_in(struct perf_event_context *ctx, 2059static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1617 if (cpuctx->task_ctx == ctx) 2065 if (cpuctx->task_ctx == ctx)
1618 return; 2066 return;
1619 2067
1620 ctx_sched_in(ctx, cpuctx, event_type); 2068 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1621 cpuctx->task_ctx = ctx; 2069 cpuctx->task_ctx = ctx;
1622} 2070}
1623 2071
1624static void perf_event_context_sched_in(struct perf_event_context *ctx) 2072static void perf_event_context_sched_in(struct perf_event_context *ctx,
2073 struct task_struct *task)
1625{ 2074{
1626 struct perf_cpu_context *cpuctx; 2075 struct perf_cpu_context *cpuctx;
1627 2076
@@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx)
1637 */ 2086 */
1638 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2087 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1639 2088
1640 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2089 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1641 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2090 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1642 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2091 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1643 2092
1644 cpuctx->task_ctx = ctx; 2093 cpuctx->task_ctx = ctx;
1645 2094
@@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
1672 if (likely(!ctx)) 2121 if (likely(!ctx))
1673 continue; 2122 continue;
1674 2123
1675 perf_event_context_sched_in(ctx); 2124 perf_event_context_sched_in(ctx, task);
1676 } 2125 }
2126 /*
2127 * if cgroup events exist on this CPU, then we need
2128 * to check if we have to switch in PMU state.
2129 * cgroup events are system-wide mode only
2130 */
2131 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2132 perf_cgroup_sched_in(task);
1677} 2133}
1678 2134
1679static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2135static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1873 if (ctx) 2329 if (ctx)
1874 rotate_ctx(ctx); 2330 rotate_ctx(ctx);
1875 2331
1876 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2332 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1877 if (ctx) 2333 if (ctx)
1878 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2334 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1879 2335
@@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1952 2408
1953 raw_spin_unlock(&ctx->lock); 2409 raw_spin_unlock(&ctx->lock);
1954 2410
1955 perf_event_context_sched_in(ctx); 2411 perf_event_context_sched_in(ctx, ctx->task);
1956out: 2412out:
1957 local_irq_restore(flags); 2413 local_irq_restore(flags);
1958} 2414}
@@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info)
1977 return; 2433 return;
1978 2434
1979 raw_spin_lock(&ctx->lock); 2435 raw_spin_lock(&ctx->lock);
1980 if (ctx->is_active) 2436 if (ctx->is_active) {
1981 update_context_time(ctx); 2437 update_context_time(ctx);
2438 update_cgrp_time_from_event(event);
2439 }
1982 update_event_times(event); 2440 update_event_times(event);
1983 if (event->state == PERF_EVENT_STATE_ACTIVE) 2441 if (event->state == PERF_EVENT_STATE_ACTIVE)
1984 event->pmu->read(event); 2442 event->pmu->read(event);
@@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event)
2009 * (e.g., thread is blocked), in that case 2467 * (e.g., thread is blocked), in that case
2010 * we cannot update context time 2468 * we cannot update context time
2011 */ 2469 */
2012 if (ctx->is_active) 2470 if (ctx->is_active) {
2013 update_context_time(ctx); 2471 update_context_time(ctx);
2472 update_cgrp_time_from_event(event);
2473 }
2014 update_event_times(event); 2474 update_event_times(event);
2015 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2475 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2016 } 2476 }
@@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event)
2395 2855
2396 if (!event->parent) { 2856 if (!event->parent) {
2397 if (event->attach_state & PERF_ATTACH_TASK) 2857 if (event->attach_state & PERF_ATTACH_TASK)
2398 jump_label_dec(&perf_task_events); 2858 jump_label_dec(&perf_sched_events);
2399 if (event->attr.mmap || event->attr.mmap_data) 2859 if (event->attr.mmap || event->attr.mmap_data)
2400 atomic_dec(&nr_mmap_events); 2860 atomic_dec(&nr_mmap_events);
2401 if (event->attr.comm) 2861 if (event->attr.comm)
@@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event)
2411 event->buffer = NULL; 2871 event->buffer = NULL;
2412 } 2872 }
2413 2873
2874 if (is_cgroup_event(event))
2875 perf_detach_cgroup(event);
2876
2414 if (event->destroy) 2877 if (event->destroy)
2415 event->destroy(event); 2878 event->destroy(event);
2416 2879
@@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event)
5300 5763
5301 if (!in_nmi()) { 5764 if (!in_nmi()) {
5302 update_context_time(event->ctx); 5765 update_context_time(event->ctx);
5766 update_cgrp_time_from_event(event);
5303 time = event->ctx->time; 5767 time = event->ctx->time;
5304 } else { 5768 } else {
5305 u64 now = perf_clock(); 5769 u64 now = perf_clock();
@@ -5725,7 +6189,7 @@ done:
5725 6189
5726 if (!event->parent) { 6190 if (!event->parent) {
5727 if (event->attach_state & PERF_ATTACH_TASK) 6191 if (event->attach_state & PERF_ATTACH_TASK)
5728 jump_label_inc(&perf_task_events); 6192 jump_label_inc(&perf_sched_events);
5729 if (event->attr.mmap || event->attr.mmap_data) 6193 if (event->attr.mmap || event->attr.mmap_data)
5730 atomic_inc(&nr_mmap_events); 6194 atomic_inc(&nr_mmap_events);
5731 if (event->attr.comm) 6195 if (event->attr.comm)
@@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open,
5900 int err; 6364 int err;
5901 6365
5902 /* for future expandability... */ 6366 /* for future expandability... */
5903 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6367 if (flags & ~PERF_FLAG_ALL)
5904 return -EINVAL; 6368 return -EINVAL;
5905 6369
5906 err = perf_copy_attr(attr_uptr, &attr); 6370 err = perf_copy_attr(attr_uptr, &attr);
@@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open,
5917 return -EINVAL; 6381 return -EINVAL;
5918 } 6382 }
5919 6383
6384 /*
6385 * In cgroup mode, the pid argument is used to pass the fd
6386 * opened to the cgroup directory in cgroupfs. The cpu argument
6387 * designates the cpu on which to monitor threads from that
6388 * cgroup.
6389 */
6390 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6391 return -EINVAL;
6392
5920 event_fd = get_unused_fd_flags(O_RDWR); 6393 event_fd = get_unused_fd_flags(O_RDWR);
5921 if (event_fd < 0) 6394 if (event_fd < 0)
5922 return event_fd; 6395 return event_fd;
@@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open,
5934 group_leader = NULL; 6407 group_leader = NULL;
5935 } 6408 }
5936 6409
5937 if (pid != -1) { 6410 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5938 task = find_lively_task_by_vpid(pid); 6411 task = find_lively_task_by_vpid(pid);
5939 if (IS_ERR(task)) { 6412 if (IS_ERR(task)) {
5940 err = PTR_ERR(task); 6413 err = PTR_ERR(task);
@@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open,
5948 goto err_task; 6421 goto err_task;
5949 } 6422 }
5950 6423
6424 if (flags & PERF_FLAG_PID_CGROUP) {
6425 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6426 if (err)
6427 goto err_alloc;
6428 }
6429
5951 /* 6430 /*
5952 * Special case software events and allow them to be part of 6431 * Special case software events and allow them to be part of
5953 * any hardware group. 6432 * any hardware group.
@@ -6808,3 +7287,92 @@ unlock:
6808 return ret; 7287 return ret;
6809} 7288}
6810device_initcall(perf_event_sysfs_init); 7289device_initcall(perf_event_sysfs_init);
7290
7291#ifdef CONFIG_CGROUP_PERF
7292static struct cgroup_subsys_state *perf_cgroup_create(
7293 struct cgroup_subsys *ss, struct cgroup *cont)
7294{
7295 struct perf_cgroup *jc;
7296 struct perf_cgroup_info *t;
7297 int c;
7298
7299 jc = kmalloc(sizeof(*jc), GFP_KERNEL);
7300 if (!jc)
7301 return ERR_PTR(-ENOMEM);
7302
7303 memset(jc, 0, sizeof(*jc));
7304
7305 jc->info = alloc_percpu(struct perf_cgroup_info);
7306 if (!jc->info) {
7307 kfree(jc);
7308 return ERR_PTR(-ENOMEM);
7309 }
7310
7311 for_each_possible_cpu(c) {
7312 t = per_cpu_ptr(jc->info, c);
7313 t->time = 0;
7314 t->timestamp = 0;
7315 }
7316 return &jc->css;
7317}
7318
7319static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7320 struct cgroup *cont)
7321{
7322 struct perf_cgroup *jc;
7323 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7324 struct perf_cgroup, css);
7325 free_percpu(jc->info);
7326 kfree(jc);
7327}
7328
7329static int __perf_cgroup_move(void *info)
7330{
7331 struct task_struct *task = info;
7332 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7333 return 0;
7334}
7335
7336static void perf_cgroup_move(struct task_struct *task)
7337{
7338 task_function_call(task, __perf_cgroup_move, task);
7339}
7340
7341static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7342 struct cgroup *old_cgrp, struct task_struct *task,
7343 bool threadgroup)
7344{
7345 perf_cgroup_move(task);
7346 if (threadgroup) {
7347 struct task_struct *c;
7348 rcu_read_lock();
7349 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7350 perf_cgroup_move(c);
7351 }
7352 rcu_read_unlock();
7353 }
7354}
7355
7356static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7357 struct cgroup *old_cgrp, struct task_struct *task)
7358{
7359 /*
7360 * cgroup_exit() is called in the copy_process() failure path.
7361 * Ignore this case since the task hasn't run yet; this avoids
7362 * trying to poke a half freed task state from generic code.
7363 */
7364 if (!(task->flags & PF_EXITING))
7365 return;
7366
7367 perf_cgroup_move(task);
7368}
7369
7370struct cgroup_subsys perf_subsys = {
7371 .name = "perf_event",
7372 .subsys_id = perf_subsys_id,
7373 .create = perf_cgroup_create,
7374 .destroy = perf_cgroup_destroy,
7375 .exit = perf_cgroup_exit,
7376 .attach = perf_cgroup_attach,
7377};
7378#endif /* CONFIG_CGROUP_PERF */
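
The per-cgroup timing added above keeps a per-cpu { time, timestamp } pair for each perf cgroup: perf_cgroup_set_timestamp() records a reference point for the incoming task's cgroup when a context is scheduled in, and __update_cgrp_time() folds now - timestamp into the accumulated time, so time_enabled for a cgroup event only advances while tasks of that cgroup actually run on the monitored CPU. The stand-alone C model below (not kernel code; the clock values are invented purely for illustration) walks through that accounting.

/*
 * User-space model of the per-cgroup time accounting above.
 * now_ns stands in for perf_clock(); the two helpers mirror
 * perf_cgroup_set_timestamp() and __update_cgrp_time().
 */
#include <stdio.h>

struct cgroup_time {
	unsigned long long time;	/* accumulated monitored time, ns */
	unsigned long long timestamp;	/* last reference point, ns */
};

static unsigned long long now_ns;	/* fake perf_clock() */

static void cgroup_sched_in(struct cgroup_time *t)
{
	t->timestamp = now_ns;			/* perf_cgroup_set_timestamp() */
}

static void cgroup_update_time(struct cgroup_time *t)
{
	t->time += now_ns - t->timestamp;	/* __update_cgrp_time() */
	t->timestamp = now_ns;
}

int main(void)
{
	struct cgroup_time cg = { 0, 0 };

	now_ns = 1000; cgroup_sched_in(&cg);	/* cgroup task scheduled in */
	now_ns = 4000; cgroup_update_time(&cg);	/* switched out: +3000 ns */

	now_ns = 9000; cgroup_sched_in(&cg);	/* 5000 ns off-cgroup gap is skipped */
	now_ns = 10000; cgroup_update_time(&cg);/* +1000 ns */

	printf("cgroup time: %llu ns\n", cg.time);	/* prints 4000 */
	return 0;
}

The 5000 ns stretch during which no task of the cgroup was on the CPU never enters the total; that is the property update_event_times() relies on when it takes perf_cgroup_event_time() instead of ctx->time for cgroup events.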