Diffstat (limited to 'arch/powerpc/oprofile/cell')
-rw-r--r--	arch/powerpc/oprofile/cell/pr_util.h		|  13
-rw-r--r--	arch/powerpc/oprofile/cell/spu_profiler.c	|   4
-rw-r--r--	arch/powerpc/oprofile/cell/spu_task_sync.c	| 236
3 files changed, 225 insertions(+), 28 deletions(-)
diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
index 22e4e8d4eb2c..628009c01958 100644
--- a/arch/powerpc/oprofile/cell/pr_util.h
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
 #define SKIP_GENERIC_SYNC 0
 #define SYNC_START_ERROR -1
 #define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
+#define DEFAULT_TIMER_EXPIRE (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;
 
 struct spu_overlay_info {	/* map of sections within an SPU overlay */
 	unsigned int vma;	/* SPU virtual memory address from elf */
@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map { /* map of sections within an SPU program */
 
 };
 
+struct spu_buffer {
+	int last_guard_val;
+	int ctx_sw_seen;
+	unsigned long *buff;
+	unsigned int head, tail;
+};
+
+
 /* The three functions below are for maintaining and accessing
  * the vma-to-fileoffset map.
  */
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
index 380d7e217531..6edaebd5099a 100644
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -23,12 +23,11 @@
 
 static u32 *samples;
 
-static int spu_prof_running;
+int spu_prof_running;
 static unsigned int profiling_interval;
 
 #define NUM_SPU_BITS_TRBUF 16
 #define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8
 
 #define SPU_PC_MASK 0xFFFF
 
@@ -208,6 +207,7 @@ int start_spu_profiling(unsigned int cycles_reset)
 
 	spu_prof_running = 1;
 	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
 
 	return 0;
 }
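
Note on the delayed-work plumbing used above and in the spu_task_sync.c changes that follow: spu_work is initialised once, kicked when profiling starts, reschedules itself every DEFAULT_TIMER_EXPIRE ticks while spu_prof_running is set, and is cancelled on shutdown. The minimal module sketch below is not part of the patch; the names demo_work, demo_running, demo_fn and DEMO_TIMER_EXPIRE are made up for illustration, and only the standard workqueue calls (INIT_DELAYED_WORK, schedule_delayed_work, cancel_delayed_work_sync) are real kernel API.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define DEMO_TIMER_EXPIRE (HZ / 10)	/* same period as DEFAULT_TIMER_EXPIRE */

static struct delayed_work demo_work;
static int demo_running;

static void demo_fn(struct work_struct *work)
{
	/* ...drain the per-SPU buffers here... */

	/* re-arm only while profiling runs, as wq_sync_spu_buff() does */
	if (demo_running)
		schedule_delayed_work(&demo_work, DEMO_TIMER_EXPIRE);
}

static int __init demo_init(void)
{
	INIT_DELAYED_WORK(&demo_work, demo_fn);
	demo_running = 1;
	schedule_delayed_work(&demo_work, DEMO_TIMER_EXPIRE);
	return 0;
}

static void __exit demo_exit(void)
{
	demo_running = 0;
	/* the patch itself uses cancel_delayed_work(); the _sync variant
	 * additionally waits for a running instance to finish */
	cancel_delayed_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
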
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 2a9b4a049329..2949126d28d1 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
 static DEFINE_SPINLOCK(cache_lock);
 static int num_spu_nodes;
 int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+	/* spu buff is a circular buffer.  Add entries to the
+	 * head.  Head is the index to store the next value.
+	 * The buffer is treated as full while one entry is still
+	 * free, so head and tail are only equal when the buffer
+	 * is empty.  That way we can tell the difference between
+	 * the buffer being full versus empty.
+	 *
+	 * ASSUMPTION: the buffer_lock is held when this function
+	 * is called to lock the buffer, head and tail.
+	 */
+	int full = 1;
+
+	if (spu_buff[spu].head >= spu_buff[spu].tail) {
+		if ((spu_buff[spu].head - spu_buff[spu].tail)
+		    < (max_spu_buff - 1))
+			full = 0;
+
+	} else if (spu_buff[spu].tail > spu_buff[spu].head) {
+		if ((spu_buff[spu].tail - spu_buff[spu].head)
+		    > 1)
+			full = 0;
+	}
+
+	if (!full) {
+		spu_buff[spu].buff[spu_buff[spu].head] = value;
+		spu_buff[spu].head++;
+
+		if (spu_buff[spu].head >= max_spu_buff)
+			spu_buff[spu].head = 0;
+	} else {
+		/* From the user's perspective make the SPU buffer
+		 * size management/overflow look like we are using
+		 * per cpu buffers.  The user uses the same
+		 * per cpu parameter to adjust the SPU buffer size.
+		 * Increment the sample_lost_overflow to inform
+		 * the user the buffer size needs to be increased.
+		 */
+		oprofile_cpu_buffer_inc_smpl_lost();
+	}
+}
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+	int spu;
+	unsigned long flags;
+	int curr_head;
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* In case there was an issue and the buffer didn't
+		 * get created, skip it.
+		 */
+		if (spu_buff[spu].buff == NULL)
+			continue;
+
+		/* Hold the lock to make sure the head/tail
+		 * doesn't change while spu_buff_add() is
+		 * deciding if the buffer is full or not.
+		 * Being a little paranoid.
+		 */
+		spin_lock_irqsave(&buffer_lock, flags);
+		curr_head = spu_buff[spu].head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+
+		/* Transfer the current contents to the kernel buffer.
+		 * Data can still be added to the head of the buffer.
+		 */
+		oprofile_put_buff(spu_buff[spu].buff,
+				  spu_buff[spu].tail,
+				  curr_head, max_spu_buff);
+
+		spin_lock_irqsave(&buffer_lock, flags);
+		spu_buff[spu].tail = curr_head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+	}
+
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+	/* move data from spu buffers to kernel buffer */
+	sync_spu_buff();
+
+	/* only reschedule if profiling is not done */
+	if (spu_prof_running)
+		schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}
 
 /* Container for caching information about an active SPU task. */
 struct cached_info {
@@ -305,14 +400,21 @@ static int process_context_switch(struct spu *spu, unsigned long objectId)
 
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_CTX_SWITCH_CODE);
-	add_event_entry(spu->number);
-	add_event_entry(spu->pid);
-	add_event_entry(spu->tgid);
-	add_event_entry(app_dcookie);
-	add_event_entry(spu_cookie);
-	add_event_entry(offset);
+	spu_buff_add(ESCAPE_CODE, spu->number);
+	spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+	spu_buff_add(spu->number, spu->number);
+	spu_buff_add(spu->pid, spu->number);
+	spu_buff_add(spu->tgid, spu->number);
+	spu_buff_add(app_dcookie, spu->number);
+	spu_buff_add(spu_cookie, spu->number);
+	spu_buff_add(offset, spu->number);
+
+	/* Set flag to indicate SPU PC data can now be written out.  If
+	 * the SPU program counter data is seen before an SPU context
+	 * record is seen, the postprocessing will fail.
+	 */
+	spu_buff[spu->number].ctx_sw_seen = 1;
+
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();	/* insure spu event buffer updates are written */
 	/* don't want entries intermingled... */
@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
 	return nodes;
 }
 
+static int oprofile_spu_buff_create(void)
+{
+	int spu;
+
+	max_spu_buff = oprofile_get_cpu_buffer_size();
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* create circular buffers to store the data in.
+		 * use locks to manage accessing the buffers
+		 */
+		spu_buff[spu].head = 0;
+		spu_buff[spu].tail = 0;
+
+		/*
+		 * Create a buffer for each SPU.  Can't reliably
+		 * create a single buffer for all spus due to not
+		 * enough contiguous kernel memory.
+		 */
+
+		spu_buff[spu].buff = kzalloc((max_spu_buff
+					      * sizeof(unsigned long)),
+					     GFP_KERNEL);
+
+		if (!spu_buff[spu].buff) {
+			printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: oprofile_spu_buff_create "
+			       "failed to allocate spu buffer %d.\n",
+			       __func__, __LINE__, spu);
+
+			/* release the spu buffers that have been allocated */
+			while (spu >= 0) {
+				kfree(spu_buff[spu].buff);
+				spu_buff[spu].buff = 0;
+				spu--;
+			}
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
 /* The main purpose of this function is to synchronize
  * OProfile with SPUFS by registering to be notified of
  * SPU task switches.
@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
  */
 int spu_sync_start(void)
 {
-	int k;
+	int spu;
 	int ret = SKIP_GENERIC_SYNC;
 	int register_ret;
 	unsigned long flags = 0;
 
 	spu_prof_num_nodes = number_of_online_nodes();
 	num_spu_nodes = spu_prof_num_nodes * 8;
+	INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+	/* create buffer for storing the SPU data to put in
+	 * the kernel buffer.
+	 */
+	ret = oprofile_spu_buff_create();
+	if (ret)
+		goto out;
 
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_PROFILING_CODE);
-	add_event_entry(num_spu_nodes);
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff_add(ESCAPE_CODE, spu);
+		spu_buff_add(SPU_PROFILING_CODE, spu);
+		spu_buff_add(num_spu_nodes, spu);
+	}
 	spin_unlock_irqrestore(&buffer_lock, flags);
 
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff[spu].ctx_sw_seen = 0;
+		spu_buff[spu].last_guard_val = 0;
+	}
+
 	/* Register for SPU events */
 	register_ret = spu_switch_event_register(&spu_active);
 	if (register_ret) {
@@ -393,8 +551,6 @@ int spu_sync_start(void)
 		goto out;
 	}
 
-	for (k = 0; k < (MAX_NUMNODES * 8); k++)
-		last_guard_val[k] = 0;
 	pr_debug("spu_sync_start -- running.\n");
 out:
 	return ret;
@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsigned int *samples,
 		 * use.  We need to discard samples taken during the time
 		 * period which an overlay occurs (i.e., guard value changes).
 		 */
-		if (grd_val && grd_val != last_guard_val[spu_num]) {
-			last_guard_val[spu_num] = grd_val;
+		if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+			spu_buff[spu_num].last_guard_val = grd_val;
 			/* Drop the rest of the samples. */
 			break;
 		}
 
-		add_event_entry(file_offset | spu_num_shifted);
+		/* We must ensure that the SPU context switch has been written
+		 * out before samples for the SPU.  Otherwise, the SPU context
+		 * information is not available and the postprocessing of the
+		 * SPU PC will fail with no available anonymous map information.
+		 */
+		if (spu_buff[spu_num].ctx_sw_seen)
+			spu_buff_add((file_offset | spu_num_shifted),
+				     spu_num);
 	}
 	spin_unlock(&buffer_lock);
 out:
@@ -463,20 +626,41 @@ out:
 int spu_sync_stop(void)
 {
 	unsigned long flags = 0;
-	int ret = spu_switch_event_unregister(&spu_active);
-	if (ret) {
+	int ret;
+	int k;
+
+	ret = spu_switch_event_unregister(&spu_active);
+
+	if (ret)
 		printk(KERN_ERR "SPU_PROF: "
-		       "%s, line %d: spu_switch_event_unregister returned %d\n",
-		       __func__, __LINE__, ret);
-		goto out;
-	}
+		       "%s, line %d: spu_switch_event_unregister " \
+		       "returned %d\n",
+		       __func__, __LINE__, ret);
+
+	/* flush any remaining data in the per SPU buffers */
+	sync_spu_buff();
 
 	spin_lock_irqsave(&cache_lock, flags);
 	ret = release_cached_info(RELEASE_ALL);
 	spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+	/* remove scheduled work queue item rather than waiting
+	 * for every queued entry to execute.  Then flush pending
+	 * system wide buffer to event buffer.
+	 */
+	cancel_delayed_work(&spu_work);
+
+	for (k = 0; k < num_spu_nodes; k++) {
+		spu_buff[k].ctx_sw_seen = 0;
+
+		/*
+		 * spu_buff[k].buff will be null if there was a problem
+		 * allocating the buffer.  Only delete if it exists.
+		 */
+		kfree(spu_buff[k].buff);
+		spu_buff[k].buff = 0;
+	}
 	pr_debug("spu_sync_stop -- done.\n");
 	return ret;
 }
 
-
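
A note on the head/tail convention adopted by the patch: each per-SPU ring buffer deliberately keeps one slot unused, so head == tail can only mean "empty".  The standalone userspace sketch below is not part of the patch; MAX_SPU_BUFF, buff_add() and buff_drain() are made-up stand-ins for max_spu_buff, spu_buff_add() and sync_spu_buff(), and the modular-arithmetic occupancy test is an equivalent but more compact form of the two-branch test in spu_buff_add().  It also mirrors the drain step: snapshot head, hand [tail, head) to the consumer, then advance tail.

#include <stdio.h>

#define MAX_SPU_BUFF 8	/* stands in for oprofile_get_cpu_buffer_size() */

struct spu_buffer {
	unsigned long buff[MAX_SPU_BUFF];
	unsigned int head, tail;	/* head: next write slot, tail: next read slot */
};

/* Mirrors the full/empty test in spu_buff_add(): at most MAX_SPU_BUFF - 1
 * entries are ever stored, so head == tail always means "empty".
 * Returns 0 on success, -1 if the entry is dropped (the kernel code bumps
 * the sample_lost_overflow statistic in that case).
 */
static int buff_add(struct spu_buffer *b, unsigned long value)
{
	unsigned int used = (b->head + MAX_SPU_BUFF - b->tail) % MAX_SPU_BUFF;

	if (used == MAX_SPU_BUFF - 1)
		return -1;

	b->buff[b->head] = value;
	b->head = (b->head + 1) % MAX_SPU_BUFF;
	return 0;
}

/* Mirrors sync_spu_buff(): snapshot head, flush [tail, head), advance tail.
 * In the patch the snapshot is taken under buffer_lock and the hand-off is
 * oprofile_put_buff(). */
static void buff_drain(struct spu_buffer *b)
{
	unsigned int curr_head = b->head;

	while (b->tail != curr_head) {
		printf("flushed %lu\n", b->buff[b->tail]);
		b->tail = (b->tail + 1) % MAX_SPU_BUFF;
	}
}

int main(void)
{
	struct spu_buffer b = { .head = 0, .tail = 0 };
	unsigned long v;

	for (v = 0; v < 10; v++)
		if (buff_add(&b, v))
			printf("dropped %lu (buffer full)\n", v);

	buff_drain(&b);
	return 0;
}

With MAX_SPU_BUFF set to 8 the sketch accepts seven samples, drops the rest, and then flushes the seven it kept, which is exactly the behaviour the per-SPU buffers present to the user via the per-cpu buffer size parameter.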