author:    Ingo Molnar <mingo@elte.hu>  2008-12-12 07:49:45 -0500
committer: Ingo Molnar <mingo@elte.hu>  2008-12-14 14:30:49 -0500
commit:    9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6
tree:      f7b3482ae284c214119efe309e356fb84de126bb /kernel
parent:    ee06094f8279e1312fc0a31591320cc7b6f0ab1e
perfcounters: implement "counter inheritance"
Impact: implement new performance feature
Counter inheritance can be used to run performance counters in a workload
transparently, and to pipe the counter results back to the parent counter.
Inheritance for performance counters works the following way: when creating
a counter it can be marked with the .inherit=1 flag. Such counters are then
'inherited' by all child tasks (be they fork()-ed or clone()-ed). These
counters get inherited through exec() boundaries as well (except through
setuid boundaries).
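As a rough illustration, here is how a timec-like tool might request such an
inherited counter. This sketch is not part of the patch: the .inherit and .type
fields and the sys_perf_counter_open() entry point do appear in the diff below,
but the prototype had no glibc wrapper, so the raw syscall invocation and the
(hw_event, pid, cpu, group_fd) argument order are assumptions.

    /*
     * Hedged sketch: open one counter on the current task with inheritance
     * enabled.  __NR_perf_counter_open and the argument order are assumed;
     * <linux/perf_counter.h> is the prototype header from this tree.
     */
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>

    static int open_inherited_counter(int type)
    {
            struct perf_counter_hw_event hw_event;

            memset(&hw_event, 0, sizeof(hw_event));
            hw_event.type    = type;   /* e.g. the sample output below maps -e 1 to instructions */
            hw_event.inherit = 1;      /* clone this counter into every child task */

            /* pid 0 = current task, cpu -1 = any CPU, -1 = no group leader fd */
            return syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
    }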
The counter values get added back to the parent counter(s) when the child
task(s) exit - much like stime/utime statistics are gathered. So inherited
counters are ideal to gather summary statistics about an application's
behavior via shell commands, without having to modify that application.
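The collection side is simple enough to sketch as well: fork the workload, wait
for it, then read the parent's own counter fd, which by then holds the child's
accumulated value. That read() returns the count as a single u64 is an
assumption about the prototype ABI rather than something this patch shows; the
helper below is purely illustrative.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    /* Run argv[] under an already-opened inherited counter and print its total. */
    static void run_and_report(int counter_fd, char **argv)
    {
            uint64_t count;
            pid_t pid = fork();

            if (pid == 0) {
                    execvp(argv[0], argv);   /* counter survives the exec() */
                    _exit(127);
            }
            waitpid(pid, NULL, 0);           /* child's value is folded back in at exit */

            if (read(counter_fd, &count, sizeof(count)) == sizeof(count))
                    printf("%12llu events\n", (unsigned long long)count);
    }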
The timec.c command utilizes counter inheritance:
http://redhat.com/~mingo/perfcounters/timec.c
Sample output:
$ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
Performance counter stats for 'ls':
163516953 instructions
2295 cache-misses
2855182 branch-misses
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')

 -rw-r--r--  kernel/exit.c         |   7
 -rw-r--r--  kernel/perf_counter.c | 248
 2 files changed, 210 insertions, 45 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..d336c90a5f13 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
         mpol_put(tsk->mempolicy);
         tsk->mempolicy = NULL;
 #endif
-#ifdef CONFIG_FUTEX
         /*
-         * This must happen late, after the PID is not
-         * hashed anymore:
+         * These must happen late, after the PID is not
+         * hashed anymore, but still at a point that may sleep:
          */
+        perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
         if (unlikely(!list_empty(&tsk->pi_state_list)))
                 exit_pi_state_list(tsk);
         if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861ce8b27..f5e81dd193d1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
                 list_del_init(&sibling->list_entry);
                 list_add_tail(&sibling->list_entry, &ctx->counter_list);
-                WARN_ON_ONCE(!sibling->group_leader);
-                WARN_ON_ONCE(sibling->group_leader == sibling);
                 sibling->group_leader = sibling;
         }
 }
@@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info)
         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_counter *counter = info;
         struct perf_counter_context *ctx = counter->ctx;
+        unsigned long flags;
         u64 perf_flags;
 
         /*
@@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info)
         if (ctx->task && cpuctx->task_ctx != ctx)
                 return;
 
-        spin_lock(&ctx->lock);
+        spin_lock_irqsave(&ctx->lock, flags);
 
         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
                 counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info)
                         perf_max_counters - perf_reserved_percpu);
         }
 
-        spin_unlock(&ctx->lock);
+        spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 
@@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info)
         struct perf_counter *counter = info;
         struct perf_counter_context *ctx = counter->ctx;
         int cpu = smp_processor_id();
+        unsigned long flags;
         u64 perf_flags;
 
         /*
@@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info)
         if (ctx->task && cpuctx->task_ctx != ctx)
                 return;
 
-        spin_lock(&ctx->lock);
+        spin_lock_irqsave(&ctx->lock, flags);
 
         /*
          * Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info)
         if (!ctx->task && cpuctx->max_pertask)
                 cpuctx->max_pertask--;
 
-        spin_unlock(&ctx->lock);
+        spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -446,10 +446,9 @@ int perf_counter_task_disable(void)
          */
         perf_flags = hw_perf_save_disable();
 
-        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-                WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+        list_for_each_entry(counter, &ctx->counter_list, list_entry)
                 counter->state = PERF_COUNTER_STATE_OFF;
-        }
+
         hw_perf_restore(perf_flags);
 
         spin_unlock(&ctx->lock);
@@ -526,26 +525,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 }
 
 /*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-                            struct task_struct *task)
-{
-        spin_lock_init(&ctx->lock);
-        INIT_LIST_HEAD(&ctx->counter_list);
-        ctx->nr_counters = 0;
-        ctx->task = task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
-        __perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
-/*
  * Cross CPU call to read the hardware counter
  */
 static void __hw_perf_counter_read(void *info)
@@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
                 cpuctx = &per_cpu(perf_cpu_context, cpu);
                 ctx = &cpuctx->ctx;
 
-                WARN_ON_ONCE(ctx->task);
                 return ctx;
         }
 
@@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
                    int cpu,
-                   struct perf_counter *group_leader)
+                   struct perf_counter *group_leader,
+                   gfp_t gfpflags)
 {
         const struct hw_perf_counter_ops *hw_ops;
         struct perf_counter *counter;
 
-        counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+        counter = kzalloc(sizeof(*counter), gfpflags);
         if (!counter)
                 return NULL;
 
@@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
         hw_ops = NULL;
         if (!hw_event->raw && hw_event->type < 0)
                 hw_ops = sw_perf_counter_init(counter);
-        if (!hw_ops) {
+        if (!hw_ops)
                 hw_ops = hw_perf_counter_init(counter);
-        }
 
         if (!hw_ops) {
                 kfree(counter);
@@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
         struct perf_counter *counter, *group_leader;
         struct perf_counter_hw_event hw_event;
         struct perf_counter_context *ctx;
+        struct file *counter_file = NULL;
         struct file *group_file = NULL;
         int fput_needed = 0;
+        int fput_needed2 = 0;
         int ret;
 
         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
         }
 
         ret = -EINVAL;
-        counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+        counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
         if (!counter)
                 goto err_put_context;
 
-        perf_install_in_context(ctx, counter, cpu);
-
         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
         if (ret < 0)
-                goto err_remove_free_put_context;
+                goto err_free_put_context;
+
+        counter_file = fget_light(ret, &fput_needed2);
+        if (!counter_file)
+                goto err_free_put_context;
+
+        counter->filp = counter_file;
+        perf_install_in_context(ctx, counter, cpu);
+
+        fput_light(counter_file, fput_needed2);
 
 out_fput:
         fput_light(group_file, fput_needed);
 
         return ret;
 
-err_remove_free_put_context:
-        mutex_lock(&counter->mutex);
-        perf_counter_remove_from_context(counter);
-        mutex_unlock(&counter->mutex);
+err_free_put_context:
         kfree(counter);
 
 err_put_context:
@@ -1044,6 +1028,186 @@ err_put_context:
         goto out_fput;
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+                            struct task_struct *task)
+{
+        memset(ctx, 0, sizeof(*ctx));
+        spin_lock_init(&ctx->lock);
+        INIT_LIST_HEAD(&ctx->counter_list);
+        ctx->task = task;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+                struct task_struct *parent,
+                struct perf_counter_context *parent_ctx,
+                struct task_struct *child,
+                struct perf_counter_context *child_ctx)
+{
+        struct perf_counter *child_counter;
+
+        child_counter = perf_counter_alloc(&parent_counter->hw_event,
+                                           parent_counter->cpu, NULL,
+                                           GFP_ATOMIC);
+        if (!child_counter)
+                return -ENOMEM;
+
+        /*
+         * Link it up in the child's context:
+         */
+        child_counter->ctx = child_ctx;
+        child_counter->task = child;
+        list_add_counter(child_counter, child_ctx);
+        child_ctx->nr_counters++;
+
+        child_counter->parent = parent_counter;
+        parent_counter->nr_inherited++;
+        /*
+         * inherit into child's child as well:
+         */
+        child_counter->hw_event.inherit = 1;
+
+        /*
+         * Get a reference to the parent filp - we will fput it
+         * when the child counter exits. This is safe to do because
+         * we are in the parent and we know that the filp still
+         * exists and has a nonzero count:
+         */
+        atomic_long_inc(&parent_counter->filp->f_count);
+
+        return 0;
+}
+
+static void
+__perf_counter_exit_task(struct task_struct *child,
+                         struct perf_counter *child_counter,
+                         struct perf_counter_context *child_ctx)
+{
+        struct perf_counter *parent_counter;
+        u64 parent_val, child_val;
+        u64 perf_flags;
+
+        /*
+         * Disable and unlink this counter.
+         *
+         * Be careful about zapping the list - IRQ/NMI context
+         * could still be processing it:
+         */
+        local_irq_disable();
+        perf_flags = hw_perf_save_disable();
+
+        if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+                child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+        list_del_init(&child_counter->list_entry);
+
+        hw_perf_restore(perf_flags);
+        local_irq_enable();
+
+        parent_counter = child_counter->parent;
+        /*
+         * It can happen that parent exits first, and has counters
+         * that are still around due to the child reference. These
+         * counters need to be zapped - but otherwise linger.
+         */
+        if (!parent_counter)
+                return;
+
+        parent_val = atomic64_read(&parent_counter->count);
+        child_val = atomic64_read(&child_counter->count);
+
+        /*
+         * Add back the child's count to the parent's count:
+         */
+        atomic64_add(child_val, &parent_counter->count);
+
+        fput(parent_counter->filp);
+
+        kfree(child_counter);
+}
+
+/*
+ * When a child task exist, feed back counter values to parent counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+        struct perf_counter *child_counter, *tmp;
+        struct perf_counter_context *child_ctx;
+
+        child_ctx = &child->perf_counter_ctx;
+
+        if (likely(!child_ctx->nr_counters))
+                return;
+
+        list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+                                 list_entry)
+                __perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+        struct perf_counter_context *child_ctx, *parent_ctx;
+        struct perf_counter *counter, *parent_counter;
+        struct task_struct *parent = current;
+        unsigned long flags;
+
+        child_ctx = &child->perf_counter_ctx;
+        parent_ctx = &parent->perf_counter_ctx;
+
+        __perf_counter_init_context(child_ctx, child);
+
+        /*
+         * This is executed from the parent task context, so inherit
+         * counters that have been marked for cloning:
+         */
+
+        if (likely(!parent_ctx->nr_counters))
+                return;
+
+        /*
+         * Lock the parent list. No need to lock the child - not PID
+         * hashed yet and not running, so nobody can access it.
+         */
+        spin_lock_irqsave(&parent_ctx->lock, flags);
+
+        /*
+         * We dont have to disable NMIs - we are only looking at
+         * the list, not manipulating it:
+         */
+        list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+                if (!counter->hw_event.inherit || counter->group_leader != counter)
+                        continue;
+
+                /*
+                 * Instead of creating recursive hierarchies of counters,
+                 * we link inheritd counters back to the original parent,
+                 * which has a filp for sure, which we use as the reference
+                 * count:
+                 */
+                parent_counter = counter;
+                if (counter->parent)
+                        parent_counter = counter->parent;
+
+                if (inherit_counter(parent_counter, parent,
+                                    parent_ctx, child, child_ctx))
+                        break;
+        }
+
+        spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
 static void __cpuinit perf_counter_init_cpu(int cpu)
 {
         struct perf_cpu_context *cpuctx;