author     Ingo Molnar <mingo@elte.hu>    2008-10-27 07:38:02 -0400
committer  Ingo Molnar <mingo@elte.hu>    2008-10-27 07:38:02 -0400
commit     5292ae11babca23c3ff82593630d2d7eebc350a9 (patch)
tree       30a6c8123b35686098f306ea39398b7621f42054 /arch/powerpc/oprofile
parent     b0f209898f1a177bd503d49215b8c6628797a81c (diff)
parent     0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/uv
Diffstat (limited to 'arch/powerpc/oprofile')
-rw-r--r--  arch/powerpc/oprofile/cell/pr_util.h       |  13
-rw-r--r--  arch/powerpc/oprofile/cell/spu_profiler.c  |   6
-rw-r--r--  arch/powerpc/oprofile/cell/spu_task_sync.c | 236
-rw-r--r--  arch/powerpc/oprofile/op_model_cell.c      |   2

4 files changed, 227 insertions, 30 deletions
diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
index 22e4e8d4eb2c..628009c01958 100644
--- a/arch/powerpc/oprofile/cell/pr_util.h
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
 #define SKIP_GENERIC_SYNC 0
 #define SYNC_START_ERROR -1
 #define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
+#define DEFAULT_TIMER_EXPIRE (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;
 
 struct spu_overlay_info { /* map of sections within an SPU overlay */
         unsigned int vma; /* SPU virtual memory address from elf */
@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map { /* map of sections within an SPU program */
 
 };
 
+struct spu_buffer {
+        int last_guard_val;
+        int ctx_sw_seen;
+        unsigned long *buff;
+        unsigned int head, tail;
+};
+
+
 /* The three functions below are for maintaining and accessing
  * the vma-to-fileoffset map.
  */
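
Note on the additions above (an editorial aside, not part of the patch text): DEFAULT_TIMER_EXPIRE is HZ / 10, so the new spu_work delayed-work item flushes the per-SPU buffers roughly every 100 ms while spu_prof_running is set. As a reading aid only, here is a minimal, self-contained sketch of that delayed-work pattern; the demo_* names are invented for illustration and do not appear in the kernel.

/* Illustrative only -- not part of this commit. A minimal delayed-work
 * pattern like the one the patch uses for spu_work: the handler does its
 * work and re-arms itself until a "running" flag is cleared.
 */
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define DEMO_TIMER_EXPIRE (HZ / 10)      /* flush roughly every 100 ms */

static struct delayed_work demo_work;    /* hypothetical counterpart of spu_work */
static int demo_running;                 /* counterpart of spu_prof_running */

static void demo_flush(struct work_struct *work)
{
        /* ... copy per-SPU buffers to the event buffer here ... */

        if (demo_running)                /* re-arm while profiling is active */
                schedule_delayed_work(&demo_work, DEMO_TIMER_EXPIRE);
}

static void demo_start(void)
{
        INIT_DELAYED_WORK(&demo_work, demo_flush);
        demo_running = 1;
        schedule_delayed_work(&demo_work, DEMO_TIMER_EXPIRE);
}

static void demo_stop(void)
{
        demo_running = 0;
        cancel_delayed_work(&demo_work);
}
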
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
index 380d7e217531..dd499c3e9da7 100644
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -23,12 +23,11 @@
 
 static u32 *samples;
 
-static int spu_prof_running;
+int spu_prof_running;
 static unsigned int profiling_interval;
 
 #define NUM_SPU_BITS_TRBUF 16
 #define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8
 
 #define SPU_PC_MASK 0xFFFF
 
@@ -196,7 +195,7 @@ int start_spu_profiling(unsigned int cycles_reset)
         pr_debug("timer resolution: %lu\n", TICK_NSEC);
         kt = ktime_set(0, profiling_interval);
         hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        timer.expires = kt;
+        hrtimer_set_expires(&timer, kt);
         timer.function = profile_spus;
 
         /* Allocate arrays for collecting SPU PC samples */
@@ -208,6 +207,7 @@ int start_spu_profiling(unsigned int cycles_reset)
 
         spu_prof_running = 1;
         hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+        schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
 
         return 0;
 }
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 2a9b4a049329..2949126d28d1 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
 static DEFINE_SPINLOCK(cache_lock);
 static int num_spu_nodes;
 int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+        /* spu buff is a circular buffer.  Add entries to the
+         * head.  Head is the index to store the next value.
+         * The buffer is full when there is one available entry
+         * in the queue, i.e. head and tail can't be equal.
+         * That way we can tell the difference between the
+         * buffer being full versus empty.
+         *
+         *  ASSUMPTION: the buffer_lock is held when this function
+         *  is called to lock the buffer, head and tail.
+         */
+        int full = 1;
+
+        if (spu_buff[spu].head >= spu_buff[spu].tail) {
+                if ((spu_buff[spu].head - spu_buff[spu].tail)
+                    < (max_spu_buff - 1))
+                        full = 0;
+
+        } else if (spu_buff[spu].tail > spu_buff[spu].head) {
+                if ((spu_buff[spu].tail - spu_buff[spu].head)
+                    > 1)
+                        full = 0;
+        }
+
+        if (!full) {
+                spu_buff[spu].buff[spu_buff[spu].head] = value;
+                spu_buff[spu].head++;
+
+                if (spu_buff[spu].head >= max_spu_buff)
+                        spu_buff[spu].head = 0;
+        } else {
+                /* From the user's perspective make the SPU buffer
+                 * size management/overflow look like we are using
+                 * per cpu buffers.  The user uses the same
+                 * per cpu parameter to adjust the SPU buffer size.
+                 * Increment the sample_lost_overflow to inform
+                 * the user the buffer size needs to be increased.
+                 */
+                oprofile_cpu_buffer_inc_smpl_lost();
+        }
+}
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+        int spu;
+        unsigned long flags;
+        int curr_head;
+
+        for (spu = 0; spu < num_spu_nodes; spu++) {
+                /* In case there was an issue and the buffer didn't
+                 * get created skip it.
+                 */
+                if (spu_buff[spu].buff == NULL)
+                        continue;
+
+                /* Hold the lock to make sure the head/tail
+                 * doesn't change while spu_buff_add() is
+                 * deciding if the buffer is full or not.
+                 * Being a little paranoid.
+                 */
+                spin_lock_irqsave(&buffer_lock, flags);
+                curr_head = spu_buff[spu].head;
+                spin_unlock_irqrestore(&buffer_lock, flags);
+
+                /* Transfer the current contents to the kernel buffer.
+                 * data can still be added to the head of the buffer.
+                 */
+                oprofile_put_buff(spu_buff[spu].buff,
+                                  spu_buff[spu].tail,
+                                  curr_head, max_spu_buff);
+
+                spin_lock_irqsave(&buffer_lock, flags);
+                spu_buff[spu].tail = curr_head;
+                spin_unlock_irqrestore(&buffer_lock, flags);
+        }
+
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+        /* move data from spu buffers to kernel buffer */
+        sync_spu_buff();
+
+        /* only reschedule if profiling is not done */
+        if (spu_prof_running)
+                schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}
 
 /* Container for caching information about an active SPU task. */
 struct cached_info {
@@ -305,14 +400,21 @@ static int process_context_switch(struct spu *spu, unsigned long objectId)
 
         /* Record context info in event buffer */
         spin_lock_irqsave(&buffer_lock, flags);
-        add_event_entry(ESCAPE_CODE);
-        add_event_entry(SPU_CTX_SWITCH_CODE);
-        add_event_entry(spu->number);
-        add_event_entry(spu->pid);
-        add_event_entry(spu->tgid);
-        add_event_entry(app_dcookie);
-        add_event_entry(spu_cookie);
-        add_event_entry(offset);
+        spu_buff_add(ESCAPE_CODE, spu->number);
+        spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+        spu_buff_add(spu->number, spu->number);
+        spu_buff_add(spu->pid, spu->number);
+        spu_buff_add(spu->tgid, spu->number);
+        spu_buff_add(app_dcookie, spu->number);
+        spu_buff_add(spu_cookie, spu->number);
+        spu_buff_add(offset, spu->number);
+
+        /* Set flag to indicate SPU PC data can now be written out.  If
+         * the SPU program counter data is seen before an SPU context
+         * record is seen, the postprocessing will fail.
+         */
+        spu_buff[spu->number].ctx_sw_seen = 1;
+
         spin_unlock_irqrestore(&buffer_lock, flags);
         smp_wmb(); /* insure spu event buffer updates are written */
         /* don't want entries intermingled... */
@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
         return nodes;
 }
 
+static int oprofile_spu_buff_create(void)
+{
+        int spu;
+
+        max_spu_buff = oprofile_get_cpu_buffer_size();
+
+        for (spu = 0; spu < num_spu_nodes; spu++) {
+                /* create circular buffers to store the data in.
+                 * use locks to manage accessing the buffers
+                 */
+                spu_buff[spu].head = 0;
+                spu_buff[spu].tail = 0;
+
+                /*
+                 * Create a buffer for each SPU.  Can't reliably
+                 * create a single buffer for all spus due to not
+                 * enough contiguous kernel memory.
+                 */
+
+                spu_buff[spu].buff = kzalloc((max_spu_buff
+                                              * sizeof(unsigned long)),
+                                             GFP_KERNEL);
+
+                if (!spu_buff[spu].buff) {
+                        printk(KERN_ERR "SPU_PROF: "
+                               "%s, line %d: oprofile_spu_buff_create "
+                               "failed to allocate spu buffer %d.\n",
+                               __func__, __LINE__, spu);
+
+                        /* release the spu buffers that have been allocated */
+                        while (spu >= 0) {
+                                kfree(spu_buff[spu].buff);
+                                spu_buff[spu].buff = 0;
+                                spu--;
+                        }
+                        return -ENOMEM;
+                }
+        }
+        return 0;
+}
+
 /* The main purpose of this function is to synchronize
  * OProfile with SPUFS by registering to be notified of
  * SPU task switches.
@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
  */
 int spu_sync_start(void)
 {
-        int k;
+        int spu;
         int ret = SKIP_GENERIC_SYNC;
         int register_ret;
         unsigned long flags = 0;
 
         spu_prof_num_nodes = number_of_online_nodes();
         num_spu_nodes = spu_prof_num_nodes * 8;
+        INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+        /* create buffer for storing the SPU data to put in
+         * the kernel buffer.
+         */
+        ret = oprofile_spu_buff_create();
+        if (ret)
+                goto out;
 
         spin_lock_irqsave(&buffer_lock, flags);
-        add_event_entry(ESCAPE_CODE);
-        add_event_entry(SPU_PROFILING_CODE);
-        add_event_entry(num_spu_nodes);
+        for (spu = 0; spu < num_spu_nodes; spu++) {
+                spu_buff_add(ESCAPE_CODE, spu);
+                spu_buff_add(SPU_PROFILING_CODE, spu);
+                spu_buff_add(num_spu_nodes, spu);
+        }
         spin_unlock_irqrestore(&buffer_lock, flags);
 
+        for (spu = 0; spu < num_spu_nodes; spu++) {
+                spu_buff[spu].ctx_sw_seen = 0;
+                spu_buff[spu].last_guard_val = 0;
+        }
+
         /* Register for SPU events */
         register_ret = spu_switch_event_register(&spu_active);
         if (register_ret) {
@@ -393,8 +551,6 @@ int spu_sync_start(void)
                 goto out;
         }
 
-        for (k = 0; k < (MAX_NUMNODES * 8); k++)
-                last_guard_val[k] = 0;
         pr_debug("spu_sync_start -- running.\n");
 out:
         return ret;
@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsigned int *samples,
                  * use.  We need to discard samples taken during the time
                  * period which an overlay occurs (i.e., guard value changes).
                  */
-                if (grd_val && grd_val != last_guard_val[spu_num]) {
-                        last_guard_val[spu_num] = grd_val;
+                if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+                        spu_buff[spu_num].last_guard_val = grd_val;
                         /* Drop the rest of the samples. */
                         break;
                 }
 
-                add_event_entry(file_offset | spu_num_shifted);
+                /* We must ensure that the SPU context switch has been written
+                 * out before samples for the SPU.  Otherwise, the SPU context
+                 * information is not available and the postprocessing of the
+                 * SPU PC will fail with no available anonymous map information.
+                 */
+                if (spu_buff[spu_num].ctx_sw_seen)
+                        spu_buff_add((file_offset | spu_num_shifted),
+                                     spu_num);
         }
         spin_unlock(&buffer_lock);
 out:
@@ -463,20 +626,41 @@ out:
 int spu_sync_stop(void)
 {
         unsigned long flags = 0;
-        int ret = spu_switch_event_unregister(&spu_active);
-        if (ret) {
+        int ret;
+        int k;
+
+        ret = spu_switch_event_unregister(&spu_active);
+
+        if (ret)
                 printk(KERN_ERR "SPU_PROF: "
-                       "%s, line %d: spu_switch_event_unregister returned %d\n",
-                       __func__, __LINE__, ret);
-                goto out;
-        }
+                       "%s, line %d: spu_switch_event_unregister " \
+                       "returned %d\n",
+                       __func__, __LINE__, ret);
+
+        /* flush any remaining data in the per SPU buffers */
+        sync_spu_buff();
 
         spin_lock_irqsave(&cache_lock, flags);
         ret = release_cached_info(RELEASE_ALL);
         spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+        /* remove scheduled work queue item rather than waiting
+         * for every queued entry to execute.  Then flush pending
+         * system wide buffer to event buffer.
+         */
+        cancel_delayed_work(&spu_work);
+
+        for (k = 0; k < num_spu_nodes; k++) {
+                spu_buff[k].ctx_sw_seen = 0;
+
+                /*
+                 * spu_sys_buff will be null if there was a problem
+                 * allocating the buffer.  Only delete if it exists.
+                 */
+                kfree(spu_buff[k].buff);
+                spu_buff[k].buff = 0;
+        }
 
         pr_debug("spu_sync_stop -- done.\n");
         return ret;
 }
 
-
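
Aside (not part of the commit): the heart of the spu_task_sync.c change is the per-SPU circular buffer. head is where spu_buff_add() writes the next sample, tail is where sync_spu_buff() hands data to oprofile_put_buff(), and one slot is always kept free so that head == tail can unambiguously mean empty. The sketch below shows the same full/empty convention in standalone C, expressed with modulo arithmetic rather than the two-branch comparison used in the patch; the ring_* names are invented.

/* Illustrative only -- the "keep one slot free" ring-buffer convention. */
#include <stdio.h>

#define RING_SIZE 8                      /* capacity is RING_SIZE - 1 entries */

struct ring {
        unsigned long buf[RING_SIZE];
        unsigned int head;               /* next slot to write */
        unsigned int tail;               /* next slot to read */
};

static int ring_full(const struct ring *r)
{
        return (r->head + 1) % RING_SIZE == r->tail;
}

static int ring_empty(const struct ring *r)
{
        return r->head == r->tail;
}

static int ring_add(struct ring *r, unsigned long value)
{
        if (ring_full(r))
                return -1;               /* caller counts this as a lost sample */
        r->buf[r->head] = value;
        r->head = (r->head + 1) % RING_SIZE;
        return 0;
}

int main(void)
{
        struct ring r = { .head = 0, .tail = 0 };
        int i;

        for (i = 0; i < 10; i++)
                if (ring_add(&r, i) < 0)
                        printf("sample %d dropped (buffer full)\n", i);
        printf("empty=%d full=%d\n", ring_empty(&r), ring_full(&r));
        return 0;
}
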
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index 5ff4de3eb3be..35141a8bc3d9 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -404,7 +404,7 @@ set_count_mode(u32 kernel, u32 user)
         }
 }
 
-static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl)
+static inline void enable_ctr(u32 cpu, u32 ctr, u32 *pm07_cntrl)
 {
 
         pm07_cntrl[ctr] |= CBE_PM_CTR_ENABLE;