author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2009-03-23 13:22:10 -0400
committer  Ingo Molnar <mingo@elte.hu>                2009-04-06 03:30:27 -0400
commit     7b732a75047738e4f85438ed2f9cd34bf5f2a19a
tree       bae36de785ac819ceef6fa5e1b7884a4a421cc3c   /kernel/perf_counter.c
parent     b09d2501ed3d294619cbfbcf828ad39324d0e548
perf_counter: new output ABI - part 1
Impact: Rework the perfcounter output ABI
Use sys_read() only for instant data and provide mmap() output for all
async overflow data.
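As an illustration (not part of the patch), the instant-data side stays a
plain read() of the 64-bit counter value; the fd is assumed to come from
sys_perf_counter_open() and error handling is omitted:

	unsigned long long value;

	/* Illustrative fragment: perf_read_hw() now only requires count >= sizeof(u64). */
	if (read(fd, &value, sizeof(value)) < 0)
		perror("read");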
The first mmap() determines the size of the output buffer. The mmap()
size must be PAGE_SIZE * (1 + pages), where pages must be a power of 2
or 0. Further mmap()s of the same fd must have the same size. Once all
maps are gone, you can mmap() again with a new size.
In the case of 0 extra pages there is no data output and the first page
contains only meta data.
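A hedged user-space sketch of the sizing rule (a fragment, not a complete
program; page_size, data_pages and the error handling are assumptions, not
part of this patch):

	long   page_size  = sysconf(_SC_PAGESIZE);
	int    data_pages = 8;				/* power of 2 */
	size_t len        = (1 + data_pages) * page_size;

	/* Read-only, shared mapping of the counter fd; offset must be 0. */
	void *base = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		perror("mmap");		/* a wrong size or offset is rejected with -EINVAL */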
When there are data pages, a poll() event will be generated for each
full page of data. Furthermore, the output is circular. This means
that although 1 page is a valid configuration, it's useless, since
we'll start overwriting it the instant we report a full page.
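Continuing the sketch above, a reader would poll() the fd and then consume
data up to data_head from the first (meta-data) page. Field names follow the
perf_counter_mmap_page layout from the companion header change, and
process() is a hypothetical consumer of the [tail, head) byte range:

	struct perf_counter_mmap_page *meta = base;	/* page 0: meta data      */
	char *ring = (char *)base + page_size;		/* pages 1..n: circular   */
	size_t ring_size = data_pages * page_size;
	unsigned long long tail = 0;

	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	poll(&pfd, 1, -1);			/* wakes when a page boundary is crossed */

	unsigned long long head = meta->data_head;	/* byte offset written by the kernel */
	if (head > tail) {
		/* wrap reads with 'tail % ring_size'; entries are still raw u64s here */
		process(ring, ring_size, tail, head);
		tail = head;
	}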
Future work will focus on the output format (currently maintained),
where we'll likely want each entry denoted by a header that includes a
type and length.
Further future work will allow splicing the fd, so that it also carries
the async overflow data -- splice() would be mutually exclusive with
mmap() of the data.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.470536358@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	464
1 files changed, 245 insertions(+), 219 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d9cfd902140e..0dfe91094fd1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4,7 +4,8 @@
  * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  *
- * For licencing details see kernel-base/COPYING
+ *
+ * For licensing details see kernel-base/COPYING
  */

 #include <linux/fs.h>
@@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	return atomic64_read(&counter->count);
 }

-/*
- * Cross CPU call to switch performance data pointers
- */
-static void __perf_switch_irq_data(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter *counter = info;
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task) {
-		if (cpuctx->task_ctx != ctx)
-			return;
-		spin_lock(&ctx->lock);
-	}
-
-	/* Change the pointer NMI safe */
-	atomic_long_set((atomic_long_t *)&counter->irqdata,
-			(unsigned long) counter->usrdata);
-	counter->usrdata = oldirqdata;
-
-	if (ctx->task)
-		spin_unlock(&ctx->lock);
-}
-
-static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		smp_call_function_single(counter->cpu,
-					 __perf_switch_irq_data,
-					 counter, 1);
-		return counter->usrdata;
-	}
-
-retry:
-	spin_lock_irq(&ctx->lock);
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-		counter->irqdata = counter->usrdata;
-		counter->usrdata = oldirqdata;
-		spin_unlock_irq(&ctx->lock);
-		return oldirqdata;
-	}
-	spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
-	/* Might have failed, because task was scheduled out */
-	if (counter->irqdata == oldirqdata)
-		goto retry;
-
-	return counter->usrdata;
-}
-
 static void put_context(struct perf_counter_context *ctx)
 {
 	if (ctx->task)
@@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);

-	free_page(counter->user_page);
 	free_counter(counter);
 	put_context(ctx);

@@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
 	u64 cntval;

-	if (count != sizeof(cntval))
+	if (count < sizeof(cntval))
 		return -EINVAL;

 	/*
@@ -1211,121 +1151,20 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 }

 static ssize_t
-perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
-{
-	if (!usrdata->len)
-		return 0;
-
-	count = min(count, (size_t)usrdata->len);
-	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
-		return -EFAULT;
-
-	/* Adjust the counters */
-	usrdata->len -= count;
-	if (!usrdata->len)
-		usrdata->rd_idx = 0;
-	else
-		usrdata->rd_idx += count;
-
-	return count;
-}
-
-static ssize_t
-perf_read_irq_data(struct perf_counter *counter,
-		   char __user *buf,
-		   size_t count,
-		   int nonblocking)
-{
-	struct perf_data *irqdata, *usrdata;
-	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res, res2;
-
-	irqdata = counter->irqdata;
-	usrdata = counter->usrdata;
-
-	if (usrdata->len + irqdata->len >= count)
-		goto read_pending;
-
-	if (nonblocking)
-		return -EAGAIN;
-
-	spin_lock_irq(&counter->waitq.lock);
-	__add_wait_queue(&counter->waitq, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (usrdata->len + irqdata->len >= count)
-			break;
-
-		if (signal_pending(current))
-			break;
-
-		if (counter->state == PERF_COUNTER_STATE_ERROR)
-			break;
-
-		spin_unlock_irq(&counter->waitq.lock);
-		schedule();
-		spin_lock_irq(&counter->waitq.lock);
-	}
-	__remove_wait_queue(&counter->waitq, &wait);
-	__set_current_state(TASK_RUNNING);
-	spin_unlock_irq(&counter->waitq.lock);
-
-	if (usrdata->len + irqdata->len < count &&
-	    counter->state != PERF_COUNTER_STATE_ERROR)
-		return -ERESTARTSYS;
-read_pending:
-	mutex_lock(&counter->mutex);
-
-	/* Drain pending data first: */
-	res = perf_copy_usrdata(usrdata, buf, count);
-	if (res < 0 || res == count)
-		goto out;
-
-	/* Switch irq buffer: */
-	usrdata = perf_switch_irq_data(counter);
-	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
-	if (res2 < 0) {
-		if (!res)
-			res = -EFAULT;
-	} else {
-		res += res2;
-	}
-out:
-	mutex_unlock(&counter->mutex);
-
-	return res;
-}
-
-static ssize_t
 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	struct perf_counter *counter = file->private_data;

-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		return perf_read_hw(counter, buf, count);
-
-	case PERF_RECORD_IRQ:
-	case PERF_RECORD_GROUP:
-		return perf_read_irq_data(counter, buf, count,
-					  file->f_flags & O_NONBLOCK);
-	}
-	return -EINVAL;
+	return perf_read_hw(counter, buf, count);
 }

 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned int events = 0;
-	unsigned long flags;
+	unsigned int events = POLLIN;

 	poll_wait(file, &counter->waitq, wait);

-	spin_lock_irqsave(&counter->waitq.lock, flags);
-	if (counter->usrdata->len || counter->irqdata->len)
-		events |= POLLIN;
-	spin_unlock_irqrestore(&counter->waitq.lock, flags);
-
 	return events;
 }

@@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }

-void perf_counter_update_userpage(struct perf_counter *counter)
+static void __perf_counter_update_userpage(struct perf_counter *counter,
+					   struct perf_mmap_data *data)
 {
-	struct perf_counter_mmap_page *userpg;
-
-	if (!counter->user_page)
-		return;
-	userpg = (struct perf_counter_mmap_page *) counter->user_page;
+	struct perf_counter_mmap_page *userpg = data->user_page;

+	/*
+	 * Disable preemption so as to not let the corresponding user-space
+	 * spin too long if we get preempted.
+	 */
+	preempt_disable();
 	++userpg->lock;
 	smp_wmb();
 	userpg->index = counter->hw.idx;
 	userpg->offset = atomic64_read(&counter->count);
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+	userpg->data_head = atomic_read(&data->head);
 	smp_wmb();
 	++userpg->lock;
+	preempt_enable();
+}
+
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data)
+		__perf_counter_update_userpage(counter, data);
+	rcu_read_unlock();
 }

 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
+	struct perf_mmap_data *data;
+	int ret = VM_FAULT_SIGBUS;

-	if (!counter->user_page)
-		return VM_FAULT_SIGBUS;
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	if (vmf->pgoff == 0) {
+		vmf->page = virt_to_page(data->user_page);
+	} else {
+		int nr = vmf->pgoff - 1;

-	vmf->page = virt_to_page(counter->user_page);
+		if ((unsigned)nr > data->nr_pages)
+			goto unlock;
+
+		vmf->page = virt_to_page(data->data_pages[nr]);
+	}
 	get_page(vmf->page);
+	ret = 0;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+	struct perf_mmap_data *data;
+	unsigned long size;
+	int i;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	size = sizeof(struct perf_mmap_data);
+	size += nr_pages * sizeof(void *);
+
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto fail;
+
+	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!data->user_page)
+		goto fail_user_page;
+
+	for (i = 0; i < nr_pages; i++) {
+		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!data->data_pages[i])
+			goto fail_data_pages;
+	}
+
+	data->nr_pages = nr_pages;
+
+	rcu_assign_pointer(counter->data, data);
+
 	return 0;
+
+fail_data_pages:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)data->data_pages[i]);
+
+	free_page((unsigned long)data->user_page);
+
+fail_user_page:
+	kfree(data);
+
+fail:
+	return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+	struct perf_mmap_data *data = container_of(rcu_head,
+			struct perf_mmap_data, rcu_head);
+	int i;
+
+	free_page((unsigned long)data->user_page);
+	for (i = 0; i < data->nr_pages; i++)
+		free_page((unsigned long)data->data_pages[i]);
+	kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data = counter->data;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	rcu_assign_pointer(counter->data, NULL);
+	call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
+				      &counter->mmap_mutex)) {
+		perf_mmap_data_free(counter);
+		mutex_unlock(&counter->mmap_mutex);
+	}
 }

 static struct vm_operations_struct perf_mmap_vmops = {
+	.open = perf_mmap_open,
+	.close = perf_mmap_close,
 	.fault = perf_mmap_fault,
 };

 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned long userpg;
+	unsigned long vma_size;
+	unsigned long nr_pages;
+	unsigned long locked, lock_limit;
+	int ret = 0;

 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+
+	vma_size = vma->vm_end - vma->vm_start;
+	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	if (nr_pages == 0 || !is_power_of_2(nr_pages))
 		return -EINVAL;

-	/*
-	 * For now, restrict to the case of a hardware counter
-	 * on the current task.
-	 */
-	if (is_software_counter(counter) || counter->task != current)
+	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 		return -EINVAL;

-	userpg = counter->user_page;
-	if (!userpg) {
-		userpg = get_zeroed_page(GFP_KERNEL);
-		mutex_lock(&counter->mutex);
-		if (counter->user_page) {
-			free_page(userpg);
-			userpg = counter->user_page;
-		} else {
-			counter->user_page = userpg;
-		}
-		mutex_unlock(&counter->mutex);
-		if (!userpg)
-			return -ENOMEM;
-	}
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	locked = vma_size >> PAGE_SHIFT;
+	locked += vma->vm_mm->locked_vm;

-	perf_counter_update_userpage(counter);
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+		return -EPERM;
+
+	mutex_lock(&counter->mmap_mutex);
+	if (atomic_inc_not_zero(&counter->mmap_count))
+		goto out;
+
+	WARN_ON(counter->data);
+	ret = perf_mmap_data_alloc(counter, nr_pages);
+	if (!ret)
+		atomic_set(&counter->mmap_count, 1);
+out:
+	mutex_unlock(&counter->mmap_mutex);

 	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
-	return 0;
+
+	return ret;
 }

 static const struct file_operations perf_fops = {
@@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = {
  * Output
  */

-static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+static int perf_output_write(struct perf_counter *counter, int nmi,
+			     void *buf, ssize_t size)
 {
-	struct perf_data *irqdata = counter->irqdata;
+	struct perf_mmap_data *data;
+	unsigned int offset, head, nr;
+	unsigned int len;
+	int ret, wakeup;

-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+	rcu_read_lock();
+	ret = -ENOSPC;
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto out;
+
+	if (!data->nr_pages)
+		goto out;
+
+	ret = -EINVAL;
+	if (size > PAGE_SIZE)
+		goto out;
+
+	do {
+		offset = head = atomic_read(&data->head);
+		head += sizeof(u64);
+	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
+
+	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);

-		*p = data;
-		irqdata->len += sizeof(u64);
+	nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
+	offset &= PAGE_SIZE - 1;
+
+	len = min_t(unsigned int, PAGE_SIZE - offset, size);
+	memcpy(data->data_pages[nr] + offset, buf, len);
+	size -= len;
+
+	if (size) {
+		nr = (nr + 1) & (data->nr_pages - 1);
+		memcpy(data->data_pages[nr], buf + len, size);
+	}
+
+	/*
+	 * generate a poll() wakeup for every page boundary crossed
+	 */
+	if (wakeup) {
+		__perf_counter_update_userpage(counter, data);
+		if (nmi) {
+			counter->wakeup_pending = 1;
+			set_perf_counter_pending();
+		} else
+			wake_up(&counter->waitq);
 	}
+	ret = 0;
+out:
+	rcu_read_unlock();
+
+	return ret;
 }

-static void perf_counter_handle_group(struct perf_counter *counter)
+static void perf_output_simple(struct perf_counter *counter,
+			       int nmi, struct pt_regs *regs)
+{
+	u64 entry;
+
+	entry = instruction_pointer(regs);
+
+	perf_output_write(counter, nmi, &entry, sizeof(entry));
+}
+
+struct group_entry {
+	u64 event;
+	u64 counter;
+};
+
+static void perf_output_group(struct perf_counter *counter, int nmi)
 {
 	struct perf_counter *leader, *sub;

 	leader = counter->group_leader;
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		struct group_entry entry;
+
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_counter_store_irq(counter, sub->hw_event.config);
-		perf_counter_store_irq(counter, atomic64_read(&sub->count));
+
+		entry.event = sub->hw_event.config;
+		entry.counter = atomic64_read(&sub->count);
+
+		perf_output_write(counter, nmi, &entry, sizeof(entry));
 	}
 }

@@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter,
 		return;

 	case PERF_RECORD_IRQ:
-		perf_counter_store_irq(counter, instruction_pointer(regs));
+		perf_output_simple(counter, nmi, regs);
 		break;

 	case PERF_RECORD_GROUP:
-		perf_counter_handle_group(counter);
+		perf_output_group(counter, nmi);
 		break;
 	}
-
-	if (nmi) {
-		counter->wakeup_pending = 1;
-		set_perf_counter_pending();
-	} else
-		wake_up(&counter->waitq);
 }

 /*
@@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);

+	mutex_init(&counter->mmap_mutex);
+
 	INIT_LIST_HEAD(&counter->child_list);

-	counter->irqdata = &counter->data[0];
-	counter->usrdata = &counter->data[1];
 	counter->cpu = cpu;
 	counter->hw_event = *hw_event;
 	counter->wakeup_pending = 0;