author		Carl Love <cel@us.ibm.com>	2008-10-14 19:37:01 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2008-10-21 00:17:48 -0400
commit		a5598ca0d49821912a5053c05f07fd650671eb6d (patch)
tree		fcfa402eeb45f76fbb03886708e5042fe5f1babb /arch
parent		bb5e6491cae4c5d6ddfa3e173e22efb35f595949 (diff)
powerpc/oprofile: Fix mutex locking for cell spu-oprofile
The issue is that the SPU code does not hold the kernel mutex lock while adding samples to the kernel buffer.

This patch creates per-SPU buffers to hold the data. Data is added to the buffers in interrupt context. The data is periodically pushed to the kernel buffer via a new OProfile function, oprofile_put_buff(). The oprofile_put_buff() function is called via a work queue, which allows it to acquire the mutex lock.

The existing user controls for adjusting the per-CPU buffer size are used to control the size of the per-SPU buffers. Similarly, overflows of the SPU buffers are reported by incrementing the per-CPU buffer stats. This eliminates the need for architecture-specific controls for the per-SPU buffers, which would not have been acceptable to the OProfile user tool maintainer. The export of the OProfile add_event_entry() is removed, as it is no longer needed with this patch.

Note: this patch does not address the issue of indexing arrays by the SPU number. That still needs to be fixed, since the SPU numbering is not guaranteed to be 0 to max_num_spus-1.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <maynardj@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
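To make the buffering scheme concrete, the following is a minimal, standalone userspace sketch of the circular-buffer convention the patch relies on (one slot is always kept free, so head == tail unambiguously means "empty"), together with the snapshot-the-head-then-drain pattern that sync_spu_buff() uses before handing the range to oprofile_put_buff(). It is illustrative only: the names MAX_SPU_BUFF, buff_add() and buff_flush() are placeholders, not the kernel code, and the real code protects head/tail with buffer_lock and runs the flush from a delayed workqueue.

#include <stdio.h>

#define MAX_SPU_BUFF 8		/* stand-in for oprofile_get_cpu_buffer_size() */

static unsigned long buff[MAX_SPU_BUFF];
static unsigned int head, tail;	/* head: next slot to write; tail: next to read */

/* Add one sample.  The buffer counts as full while one slot is still free,
 * so head can never catch up to tail; head == tail only ever means "empty".
 * Returns 0 on success, -1 on overflow (the kernel code increments the
 * sample_lost_overflow statistic instead of returning an error).
 */
static int buff_add(unsigned long value)
{
	if ((head + 1) % MAX_SPU_BUFF == tail)
		return -1;		/* full: writing would make head == tail */

	buff[head] = value;
	head = (head + 1) % MAX_SPU_BUFF;
	return 0;
}

/* Drain everything between tail and a snapshot of head, the way
 * sync_spu_buff() passes [tail, curr_head) to oprofile_put_buff();
 * producers may keep adding at head while the drain is in progress.
 */
static void buff_flush(void)
{
	unsigned int curr_head = head;	/* snapshot taken under the lock in the kernel */

	while (tail != curr_head) {
		printf("sample: %lu\n", buff[tail]);
		tail = (tail + 1) % MAX_SPU_BUFF;
	}
}

int main(void)
{
	unsigned long i;

	/* 10 adds into a 7-entry buffer: the last three report overflow. */
	for (i = 0; i < 10; i++)
		if (buff_add(i))
			printf("overflow at %lu\n", i);

	buff_flush();
	return 0;
}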
Diffstat (limited to 'arch')
-rw-r--r--  arch/powerpc/oprofile/cell/pr_util.h        |  13
-rw-r--r--  arch/powerpc/oprofile/cell/spu_profiler.c   |   4
-rw-r--r--  arch/powerpc/oprofile/cell/spu_task_sync.c  | 236
3 files changed, 225 insertions(+), 28 deletions(-)
diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
index 22e4e8d4eb2c..628009c01958 100644
--- a/arch/powerpc/oprofile/cell/pr_util.h
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
 #define SKIP_GENERIC_SYNC 0
 #define SYNC_START_ERROR -1
 #define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
+#define DEFAULT_TIMER_EXPIRE (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;
 
 struct spu_overlay_info {	/* map of sections within an SPU overlay */
 	unsigned int vma;	/* SPU virtual memory address from elf */
@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map { /* map of sections within an SPU program */
 
 };
 
+struct spu_buffer {
+	int last_guard_val;
+	int ctx_sw_seen;
+	unsigned long *buff;
+	unsigned int head, tail;
+};
+
+
 /* The three functions below are for maintaining and accessing
  * the vma-to-fileoffset map.
  */
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
index 380d7e217531..6edaebd5099a 100644
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -23,12 +23,11 @@
 
 static u32 *samples;
 
-static int spu_prof_running;
+int spu_prof_running;
 static unsigned int profiling_interval;
 
 #define NUM_SPU_BITS_TRBUF 16
 #define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8
 
 #define SPU_PC_MASK 0xFFFF
 
@@ -208,6 +207,7 @@ int start_spu_profiling(unsigned int cycles_reset)
 
 	spu_prof_running = 1;
 	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
 
 	return 0;
 }
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 2a9b4a049329..2949126d28d1 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
 static DEFINE_SPINLOCK(cache_lock);
 static int num_spu_nodes;
 int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+	/* spu buff is a circular buffer.  Add entries to the
+	 * head.  Head is the index to store the next value.
+	 * The buffer is full when there is one available entry
+	 * in the queue, i.e. head and tail can't be equal.
+	 * That way we can tell the difference between the
+	 * buffer being full versus empty.
+	 *
+	 * ASSUMPTION: the buffer_lock is held when this function
+	 * is called to lock the buffer, head and tail.
+	 */
+	int full = 1;
+
+	if (spu_buff[spu].head >= spu_buff[spu].tail) {
+		if ((spu_buff[spu].head - spu_buff[spu].tail)
+		    < (max_spu_buff - 1))
+			full = 0;
+
+	} else if (spu_buff[spu].tail > spu_buff[spu].head) {
+		if ((spu_buff[spu].tail - spu_buff[spu].head)
+		    > 1)
+			full = 0;
+	}
+
+	if (!full) {
+		spu_buff[spu].buff[spu_buff[spu].head] = value;
+		spu_buff[spu].head++;
+
+		if (spu_buff[spu].head >= max_spu_buff)
+			spu_buff[spu].head = 0;
+	} else {
+		/* From the user's perspective make the SPU buffer
+		 * size management/overflow look like we are using
+		 * per cpu buffers.  The user uses the same
+		 * per cpu parameter to adjust the SPU buffer size.
+		 * Increment the sample_lost_overflow to inform
+		 * the user the buffer size needs to be increased.
+		 */
+		oprofile_cpu_buffer_inc_smpl_lost();
+	}
+}
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+	int spu;
+	unsigned long flags;
+	int curr_head;
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* In case there was an issue and the buffer didn't
+		 * get created skip it.
+		 */
+		if (spu_buff[spu].buff == NULL)
+			continue;
+
+		/* Hold the lock to make sure the head/tail
+		 * doesn't change while spu_buff_add() is
+		 * deciding if the buffer is full or not.
+		 * Being a little paranoid.
+		 */
+		spin_lock_irqsave(&buffer_lock, flags);
+		curr_head = spu_buff[spu].head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+
+		/* Transfer the current contents to the kernel buffer.
+		 * data can still be added to the head of the buffer.
+		 */
+		oprofile_put_buff(spu_buff[spu].buff,
+				  spu_buff[spu].tail,
+				  curr_head, max_spu_buff);
+
+		spin_lock_irqsave(&buffer_lock, flags);
+		spu_buff[spu].tail = curr_head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+	}
+
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+	/* move data from spu buffers to kernel buffer */
+	sync_spu_buff();
+
+	/* only reschedule if profiling is not done */
+	if (spu_prof_running)
+		schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}
 
 /* Container for caching information about an active SPU task. */
 struct cached_info {
@@ -305,14 +400,21 @@ static int process_context_switch(struct spu *spu, unsigned long objectId)
 
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_CTX_SWITCH_CODE);
-	add_event_entry(spu->number);
-	add_event_entry(spu->pid);
-	add_event_entry(spu->tgid);
-	add_event_entry(app_dcookie);
-	add_event_entry(spu_cookie);
-	add_event_entry(offset);
+	spu_buff_add(ESCAPE_CODE, spu->number);
+	spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+	spu_buff_add(spu->number, spu->number);
+	spu_buff_add(spu->pid, spu->number);
+	spu_buff_add(spu->tgid, spu->number);
+	spu_buff_add(app_dcookie, spu->number);
+	spu_buff_add(spu_cookie, spu->number);
+	spu_buff_add(offset, spu->number);
+
+	/* Set flag to indicate SPU PC data can now be written out.  If
+	 * the SPU program counter data is seen before an SPU context
+	 * record is seen, the postprocessing will fail.
+	 */
+	spu_buff[spu->number].ctx_sw_seen = 1;
+
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();	/* insure spu event buffer updates are written */
 	/* don't want entries intermingled... */
@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
 	return nodes;
 }
 
+static int oprofile_spu_buff_create(void)
+{
+	int spu;
+
+	max_spu_buff = oprofile_get_cpu_buffer_size();
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* create circular buffers to store the data in.
+		 * use locks to manage accessing the buffers
+		 */
+		spu_buff[spu].head = 0;
+		spu_buff[spu].tail = 0;
+
+		/*
+		 * Create a buffer for each SPU.  Can't reliably
+		 * create a single buffer for all spus due to not
+		 * enough contiguous kernel memory.
+		 */
+
+		spu_buff[spu].buff = kzalloc((max_spu_buff
+					      * sizeof(unsigned long)),
+					     GFP_KERNEL);
+
+		if (!spu_buff[spu].buff) {
+			printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: oprofile_spu_buff_create "
+			       "failed to allocate spu buffer %d.\n",
+			       __func__, __LINE__, spu);
+
+			/* release the spu buffers that have been allocated */
+			while (spu >= 0) {
+				kfree(spu_buff[spu].buff);
+				spu_buff[spu].buff = 0;
+				spu--;
+			}
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
 /* The main purpose of this function is to synchronize
  * OProfile with SPUFS by registering to be notified of
  * SPU task switches.
@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
  */
 int spu_sync_start(void)
 {
-	int k;
+	int spu;
 	int ret = SKIP_GENERIC_SYNC;
 	int register_ret;
 	unsigned long flags = 0;
 
 	spu_prof_num_nodes = number_of_online_nodes();
 	num_spu_nodes = spu_prof_num_nodes * 8;
+	INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+	/* create buffer for storing the SPU data to put in
+	 * the kernel buffer.
+	 */
+	ret = oprofile_spu_buff_create();
+	if (ret)
+		goto out;
 
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_PROFILING_CODE);
-	add_event_entry(num_spu_nodes);
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff_add(ESCAPE_CODE, spu);
+		spu_buff_add(SPU_PROFILING_CODE, spu);
+		spu_buff_add(num_spu_nodes, spu);
+	}
 	spin_unlock_irqrestore(&buffer_lock, flags);
 
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff[spu].ctx_sw_seen = 0;
+		spu_buff[spu].last_guard_val = 0;
+	}
+
 	/* Register for SPU events */
 	register_ret = spu_switch_event_register(&spu_active);
 	if (register_ret) {
@@ -393,8 +551,6 @@ int spu_sync_start(void)
 		goto out;
 	}
 
-	for (k = 0; k < (MAX_NUMNODES * 8); k++)
-		last_guard_val[k] = 0;
 	pr_debug("spu_sync_start -- running.\n");
 out:
 	return ret;
@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsigned int *samples,
 		 * use.  We need to discard samples taken during the time
 		 * period which an overlay occurs (i.e., guard value changes).
 		 */
-		if (grd_val && grd_val != last_guard_val[spu_num]) {
-			last_guard_val[spu_num] = grd_val;
+		if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+			spu_buff[spu_num].last_guard_val = grd_val;
 			/* Drop the rest of the samples. */
 			break;
 		}
 
-		add_event_entry(file_offset | spu_num_shifted);
+		/* We must ensure that the SPU context switch has been written
+		 * out before samples for the SPU.  Otherwise, the SPU context
+		 * information is not available and the postprocessing of the
+		 * SPU PC will fail with no available anonymous map information.
+		 */
+		if (spu_buff[spu_num].ctx_sw_seen)
+			spu_buff_add((file_offset | spu_num_shifted),
+				     spu_num);
 	}
 	spin_unlock(&buffer_lock);
 out:
@@ -463,20 +626,41 @@ out:
 int spu_sync_stop(void)
 {
 	unsigned long flags = 0;
-	int ret = spu_switch_event_unregister(&spu_active);
-	if (ret) {
+	int ret;
+	int k;
+
+	ret = spu_switch_event_unregister(&spu_active);
+
+	if (ret)
 		printk(KERN_ERR "SPU_PROF: "
-		       "%s, line %d: spu_switch_event_unregister returned %d\n",
-		       __func__, __LINE__, ret);
-		goto out;
-	}
+		       "%s, line %d: spu_switch_event_unregister " \
+		       "returned %d\n",
+		       __func__, __LINE__, ret);
+
+	/* flush any remaining data in the per SPU buffers */
+	sync_spu_buff();
 
 	spin_lock_irqsave(&cache_lock, flags);
 	ret = release_cached_info(RELEASE_ALL);
 	spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+	/* remove scheduled work queue item rather than waiting
+	 * for every queued entry to execute.  Then flush pending
+	 * system wide buffer to event buffer.
+	 */
+	cancel_delayed_work(&spu_work);
+
+	for (k = 0; k < num_spu_nodes; k++) {
+		spu_buff[k].ctx_sw_seen = 0;
+
+		/*
+		 * spu_sys_buff will be null if there was a problem
+		 * allocating the buffer.  Only delete if it exists.
+		 */
+		kfree(spu_buff[k].buff);
+		spu_buff[k].buff = 0;
+	}
 	pr_debug("spu_sync_stop -- done.\n");
 	return ret;
 }
 
-