Diffstat (limited to 'arch/powerpc/oprofile/cell')
-rw-r--r--	arch/powerpc/oprofile/cell/pr_util.h		|  13
-rw-r--r--	arch/powerpc/oprofile/cell/spu_profiler.c	|   4
-rw-r--r--	arch/powerpc/oprofile/cell/spu_task_sync.c	| 236
3 files changed, 225 insertions(+), 28 deletions(-)
diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
index 22e4e8d4eb2c..628009c01958 100644
--- a/arch/powerpc/oprofile/cell/pr_util.h
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
 #define SKIP_GENERIC_SYNC 0
 #define SYNC_START_ERROR -1
 #define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
+#define DEFAULT_TIMER_EXPIRE (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;
 
 struct spu_overlay_info {	/* map of sections within an SPU overlay */
 	unsigned int vma;	/* SPU virtual memory address from elf */
@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map { /* map of sections within an SPU program */
 
 };
 
+struct spu_buffer {
+	int last_guard_val;
+	int ctx_sw_seen;
+	unsigned long *buff;
+	unsigned int head, tail;
+};
+
+
 /* The three functions below are for maintaining and accessing
  * the vma-to-fileoffset map.
  */
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
index 380d7e217531..6edaebd5099a 100644
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -23,12 +23,11 @@
 
 static u32 *samples;
 
-static int spu_prof_running;
+int spu_prof_running;
 static unsigned int profiling_interval;
 
 #define NUM_SPU_BITS_TRBUF 16
 #define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8
 
 #define SPU_PC_MASK 0xFFFF
 
@@ -208,6 +207,7 @@ int start_spu_profiling(unsigned int cycles_reset)
 
 	spu_prof_running = 1;
 	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
 
 	return 0;
 }
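
Note on the delayed-work plumbing used above and in the spu_task_sync.c changes that follow: spu_work is initialised once, kicked when profiling starts, reschedules itself every DEFAULT_TIMER_EXPIRE ticks while spu_prof_running is set, and is cancelled on shutdown. The minimal module sketch below is not part of the patch; the names demo_work, demo_running, demo_fn and DEMO_TIMER_EXPIRE are made up for illustration, and only the standard workqueue calls (INIT_DELAYED_WORK, schedule_delayed_work, cancel_delayed_work_sync) are real kernel API.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define DEMO_TIMER_EXPIRE (HZ / 10)	/* same period as DEFAULT_TIMER_EXPIRE */

static struct delayed_work demo_work;
static int demo_running;

static void demo_fn(struct work_struct *work)
{
	/* ...drain the per-SPU buffers here... */

	/* re-arm only while profiling runs, as wq_sync_spu_buff() does */
	if (demo_running)
		schedule_delayed_work(&demo_work, DEMO_TIMER_EXPIRE);
}

static int __init demo_init(void)
{
	INIT_DELAYED_WORK(&demo_work, demo_fn);
	demo_running = 1;
	schedule_delayed_work(&demo_work, DEMO_TIMER_EXPIRE);
	return 0;
}

static void __exit demo_exit(void)
{
	demo_running = 0;
	/* the patch itself uses cancel_delayed_work(); the _sync variant
	 * additionally waits for a running instance to finish */
	cancel_delayed_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
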
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 2a9b4a049329..2949126d28d1 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
 static DEFINE_SPINLOCK(cache_lock);
 static int num_spu_nodes;
 int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+	/* spu buff is a circular buffer.  Add entries to the
+	 * head.  Head is the index to store the next value.
+	 * The buffer is treated as full while one entry is still
+	 * free, so head and tail are only equal when the buffer
+	 * is empty.  That way we can tell the difference between
+	 * the buffer being full versus empty.
+	 *
+	 * ASSUMPTION: the buffer_lock is held when this function
+	 * is called to lock the buffer, head and tail.
+	 */
+	int full = 1;
+
+	if (spu_buff[spu].head >= spu_buff[spu].tail) {
+		if ((spu_buff[spu].head - spu_buff[spu].tail)
+		    < (max_spu_buff - 1))
+			full = 0;
+
+	} else if (spu_buff[spu].tail > spu_buff[spu].head) {
+		if ((spu_buff[spu].tail - spu_buff[spu].head)
+		    > 1)
+			full = 0;
+	}
+
+	if (!full) {
+		spu_buff[spu].buff[spu_buff[spu].head] = value;
+		spu_buff[spu].head++;
+
+		if (spu_buff[spu].head >= max_spu_buff)
+			spu_buff[spu].head = 0;
+	} else {
+		/* From the user's perspective make the SPU buffer
+		 * size management/overflow look like we are using
+		 * per cpu buffers.  The user uses the same
+		 * per cpu parameter to adjust the SPU buffer size.
+		 * Increment the sample_lost_overflow to inform
+		 * the user the buffer size needs to be increased.
+		 */
+		oprofile_cpu_buffer_inc_smpl_lost();
+	}
+}
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+	int spu;
+	unsigned long flags;
+	int curr_head;
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* In case there was an issue and the buffer didn't
+		 * get created, skip it.
+		 */
+		if (spu_buff[spu].buff == NULL)
+			continue;
+
+		/* Hold the lock to make sure the head/tail
+		 * doesn't change while spu_buff_add() is
+		 * deciding if the buffer is full or not.
+		 * Being a little paranoid.
+		 */
+		spin_lock_irqsave(&buffer_lock, flags);
+		curr_head = spu_buff[spu].head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+
+		/* Transfer the current contents to the kernel buffer.
+		 * Data can still be added to the head of the buffer.
+		 */
+		oprofile_put_buff(spu_buff[spu].buff,
+				  spu_buff[spu].tail,
+				  curr_head, max_spu_buff);
+
+		spin_lock_irqsave(&buffer_lock, flags);
+		spu_buff[spu].tail = curr_head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+	}
+
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+	/* move data from spu buffers to kernel buffer */
+	sync_spu_buff();
+
+	/* only reschedule if profiling is not done */
+	if (spu_prof_running)
+		schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}
 
 /* Container for caching information about an active SPU task. */
 struct cached_info {
@@ -305,14 +400,21 @@ static int process_context_switch(struct spu *spu, unsigned long objectId)
 
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_CTX_SWITCH_CODE);
-	add_event_entry(spu->number);
-	add_event_entry(spu->pid);
-	add_event_entry(spu->tgid);
-	add_event_entry(app_dcookie);
-	add_event_entry(spu_cookie);
-	add_event_entry(offset);
+	spu_buff_add(ESCAPE_CODE, spu->number);
+	spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+	spu_buff_add(spu->number, spu->number);
+	spu_buff_add(spu->pid, spu->number);
+	spu_buff_add(spu->tgid, spu->number);
+	spu_buff_add(app_dcookie, spu->number);
+	spu_buff_add(spu_cookie, spu->number);
+	spu_buff_add(offset, spu->number);
+
+	/* Set flag to indicate SPU PC data can now be written out.  If
+	 * the SPU program counter data is seen before an SPU context
+	 * record is seen, the postprocessing will fail.
+	 */
+	spu_buff[spu->number].ctx_sw_seen = 1;
+
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();	/* insure spu event buffer updates are written */
 	/* don't want entries intermingled... */
@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
 	return nodes;
 }
 
+static int oprofile_spu_buff_create(void)
+{
+	int spu;
+
+	max_spu_buff = oprofile_get_cpu_buffer_size();
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* create circular buffers to store the data in.
+		 * use locks to manage accessing the buffers
+		 */
+		spu_buff[spu].head = 0;
+		spu_buff[spu].tail = 0;
+
+		/*
+		 * Create a buffer for each SPU.  Can't reliably
+		 * create a single buffer for all spus due to not
+		 * enough contiguous kernel memory.
+		 */
+
+		spu_buff[spu].buff = kzalloc((max_spu_buff
+					      * sizeof(unsigned long)),
+					     GFP_KERNEL);
+
+		if (!spu_buff[spu].buff) {
+			printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: oprofile_spu_buff_create "
+			       "failed to allocate spu buffer %d.\n",
+			       __func__, __LINE__, spu);
+
+			/* release the spu buffers that have been allocated */
+			while (spu >= 0) {
+				kfree(spu_buff[spu].buff);
+				spu_buff[spu].buff = 0;
+				spu--;
+			}
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
 /* The main purpose of this function is to synchronize
  * OProfile with SPUFS by registering to be notified of
  * SPU task switches.
@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
  */
 int spu_sync_start(void)
 {
-	int k;
+	int spu;
 	int ret = SKIP_GENERIC_SYNC;
 	int register_ret;
 	unsigned long flags = 0;
 
 	spu_prof_num_nodes = number_of_online_nodes();
 	num_spu_nodes = spu_prof_num_nodes * 8;
+	INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+	/* create buffer for storing the SPU data to put in
+	 * the kernel buffer.
+	 */
+	ret = oprofile_spu_buff_create();
+	if (ret)
+		goto out;
 
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_PROFILING_CODE);
-	add_event_entry(num_spu_nodes);
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff_add(ESCAPE_CODE, spu);
+		spu_buff_add(SPU_PROFILING_CODE, spu);
+		spu_buff_add(num_spu_nodes, spu);
+	}
 	spin_unlock_irqrestore(&buffer_lock, flags);
 
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff[spu].ctx_sw_seen = 0;
+		spu_buff[spu].last_guard_val = 0;
+	}
+
 	/* Register for SPU events */
 	register_ret = spu_switch_event_register(&spu_active);
 	if (register_ret) {
@@ -393,8 +551,6 @@ int spu_sync_start(void)
 		goto out;
 	}
 
-	for (k = 0; k < (MAX_NUMNODES * 8); k++)
-		last_guard_val[k] = 0;
 	pr_debug("spu_sync_start -- running.\n");
 out:
 	return ret;
@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsigned int *samples,
 		 * use.  We need to discard samples taken during the time
 		 * period which an overlay occurs (i.e., guard value changes).
 		 */
-		if (grd_val && grd_val != last_guard_val[spu_num]) {
-			last_guard_val[spu_num] = grd_val;
+		if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+			spu_buff[spu_num].last_guard_val = grd_val;
 			/* Drop the rest of the samples. */
 			break;
 		}
 
-		add_event_entry(file_offset | spu_num_shifted);
+		/* We must ensure that the SPU context switch has been written
+		 * out before samples for the SPU.  Otherwise, the SPU context
+		 * information is not available and the postprocessing of the
+		 * SPU PC will fail with no available anonymous map information.
+		 */
+		if (spu_buff[spu_num].ctx_sw_seen)
+			spu_buff_add((file_offset | spu_num_shifted),
+				     spu_num);
 	}
 	spin_unlock(&buffer_lock);
 out:
@@ -463,20 +626,41 @@ out:
 int spu_sync_stop(void)
 {
 	unsigned long flags = 0;
-	int ret = spu_switch_event_unregister(&spu_active);
-	if (ret) {
+	int ret;
+	int k;
+
+	ret = spu_switch_event_unregister(&spu_active);
+
+	if (ret)
 		printk(KERN_ERR "SPU_PROF: "
-		       "%s, line %d: spu_switch_event_unregister returned %d\n",
-		       __func__, __LINE__, ret);
-		goto out;
-	}
+		       "%s, line %d: spu_switch_event_unregister " \
+		       "returned %d\n",
+		       __func__, __LINE__, ret);
+
+	/* flush any remaining data in the per SPU buffers */
+	sync_spu_buff();
 
 	spin_lock_irqsave(&cache_lock, flags);
 	ret = release_cached_info(RELEASE_ALL);
 	spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+	/* remove scheduled work queue item rather than waiting
+	 * for every queued entry to execute.  Then flush pending
+	 * system wide buffer to event buffer.
+	 */
+	cancel_delayed_work(&spu_work);
+
+	for (k = 0; k < num_spu_nodes; k++) {
+		spu_buff[k].ctx_sw_seen = 0;
+
+		/*
+		 * spu_buff[k].buff will be null if there was a problem
+		 * allocating the buffer.  Only delete if it exists.
+		 */
+		kfree(spu_buff[k].buff);
+		spu_buff[k].buff = 0;
+	}
 	pr_debug("spu_sync_stop -- done.\n");
 	return ret;
 }
 
-
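
A note on the head/tail convention adopted by the patch: each per-SPU ring buffer deliberately keeps one slot unused, so head == tail can only mean "empty".  The standalone userspace sketch below is not part of the patch; MAX_SPU_BUFF, buff_add() and buff_drain() are made-up stand-ins for max_spu_buff, spu_buff_add() and sync_spu_buff(), and the modular-arithmetic occupancy test is an equivalent but more compact form of the two-branch test in spu_buff_add().  It also mirrors the drain step: snapshot head, hand [tail, head) to the consumer, then advance tail.

#include <stdio.h>

#define MAX_SPU_BUFF 8	/* stands in for oprofile_get_cpu_buffer_size() */

struct spu_buffer {
	unsigned long buff[MAX_SPU_BUFF];
	unsigned int head, tail;	/* head: next write slot, tail: next read slot */
};

/* Mirrors the full/empty test in spu_buff_add(): at most MAX_SPU_BUFF - 1
 * entries are ever stored, so head == tail always means "empty".
 * Returns 0 on success, -1 if the entry is dropped (the kernel code bumps
 * the sample_lost_overflow statistic in that case).
 */
static int buff_add(struct spu_buffer *b, unsigned long value)
{
	unsigned int used = (b->head + MAX_SPU_BUFF - b->tail) % MAX_SPU_BUFF;

	if (used == MAX_SPU_BUFF - 1)
		return -1;

	b->buff[b->head] = value;
	b->head = (b->head + 1) % MAX_SPU_BUFF;
	return 0;
}

/* Mirrors sync_spu_buff(): snapshot head, flush [tail, head), advance tail.
 * In the patch the snapshot is taken under buffer_lock and the hand-off is
 * oprofile_put_buff(). */
static void buff_drain(struct spu_buffer *b)
{
	unsigned int curr_head = b->head;

	while (b->tail != curr_head) {
		printf("flushed %lu\n", b->buff[b->tail]);
		b->tail = (b->tail + 1) % MAX_SPU_BUFF;
	}
}

int main(void)
{
	struct spu_buffer b = { .head = 0, .tail = 0 };
	unsigned long v;

	for (v = 0; v < 10; v++)
		if (buff_add(&b, v))
			printf("dropped %lu (buffer full)\n", v);

	buff_drain(&b);
	return 0;
}

With MAX_SPU_BUFF set to 8 the sketch accepts seven samples, drops the rest, and then flushes the seven it kept, which is exactly the behaviour the per-SPU buffers present to the user via the per-cpu buffer size parameter.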