path: root/kernel
author	Ingo Molnar <mingo@elte.hu>	2008-12-12 07:49:45 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-12-14 14:30:49 -0500
commit	9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 (patch)
tree	f7b3482ae284c214119efe309e356fb84de126bb /kernel
parent	ee06094f8279e1312fc0a31591320cc7b6f0ab1e (diff)
perfcounters: implement "counter inheritance"
Impact: implement new performance feature

Counter inheritance can be used to run performance counters in a workload,
transparently - and pipe back the counter results to the parent counter.

Inheritance for performance counters works the following way: when creating
a counter it can be marked with the .inherit=1 flag. Such counters are then
'inherited' by all child tasks (be they fork()-ed or clone()-ed). These
counters get inherited through exec() boundaries as well (except through
setuid boundaries).

The counter values get added back to the parent counter(s) when the child
task(s) exit - much like stime/utime statistics are gathered. So inherited
counters are ideal to gather summary statistics about an application's
behavior via shell commands, without having to modify that application.

The timec.c command utilizes counter inheritance:

  http://redhat.com/~mingo/perfcounters/timec.c

Sample output:

  $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

  Performance counter stats for 'ls':

          163516953 instructions
               2295 cache-misses
            2855182 branch-misses

Signed-off-by: Ingo Molnar <mingo@elte.hu>
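Editor's note - to make the usage concrete, below is a minimal user-space sketch of how a tool along the lines of timec.c could drive this feature: open a counter on the current task with .inherit=1, fork()+exec() the workload, and read the accumulated value once the child has exited. Only the .inherit flag and the "child counts are added back to the parent on exit" semantics come from this patch; the syscall number, the perf_counter_open() wrapper, the pid/cpu argument conventions and the exact perf_counter_hw_event layout used here are assumptions made for illustration, not something this commit defines.

/*
 * Hedged sketch only: count instructions executed by a child workload
 * via an inherited counter.  Struct layout, syscall number and argument
 * conventions are assumptions for illustration.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/wait.h>

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open 333	/* placeholder - the real number is arch specific */
#endif

struct perf_counter_hw_event {		/* layout assumed for this sketch */
	int64_t		type;		/* 1 == instructions (cf. "-e 1" above) */
	uint64_t	irq_period;
	uint32_t	record_type;
	uint32_t	disabled   :  1,
			nmi        :  1,
			raw        :  1,
			inherit    :  1,	/* the flag this patch acts on */
			__reserved : 28;
	uint64_t	__reserved_1;
};

/* hypothetical wrapper around the (then new) perf counter syscall */
static int perf_counter_open(struct perf_counter_hw_event *hw_event,
			     pid_t pid, int cpu, int group_fd)
{
	return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd, 0);
}

int main(int argc, char **argv)
{
	struct perf_counter_hw_event hw_event;
	uint64_t count;
	pid_t child;
	int fd;

	if (argc < 2)
		return 1;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type    = 1;	/* instructions */
	hw_event.inherit = 1;	/* child tasks inherit this counter */

	/* pid 0 assumed to mean "current task", cpu -1 "any cpu" */
	fd = perf_counter_open(&hw_event, 0, -1, -1);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	child = fork();
	if (child == 0) {
		/* the inherited counter survives the exec() boundary */
		execvp(argv[1], argv + 1);
		_exit(127);
	}
	waitpid(child, NULL, 0);	/* child's count is added back on exit */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("%20llu instructions\n", (unsigned long long)count);

	return 0;
}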
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/exit.c	7
-rw-r--r--	kernel/perf_counter.c	248
2 files changed, 210 insertions, 45 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..d336c90a5f13 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
         mpol_put(tsk->mempolicy);
         tsk->mempolicy = NULL;
 #endif
-#ifdef CONFIG_FUTEX
         /*
-         * This must happen late, after the PID is not
-         * hashed anymore:
+         * These must happen late, after the PID is not
+         * hashed anymore, but still at a point that may sleep:
          */
+        perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
         if (unlikely(!list_empty(&tsk->pi_state_list)))
                 exit_pi_state_list(tsk);
         if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861ce8b27..f5e81dd193d1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
                 list_del_init(&sibling->list_entry);
                 list_add_tail(&sibling->list_entry, &ctx->counter_list);
-                WARN_ON_ONCE(!sibling->group_leader);
-                WARN_ON_ONCE(sibling->group_leader == sibling);
                 sibling->group_leader = sibling;
         }
 }
@@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info)
         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_counter *counter = info;
         struct perf_counter_context *ctx = counter->ctx;
+        unsigned long flags;
         u64 perf_flags;
 
         /*
@@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info)
         if (ctx->task && cpuctx->task_ctx != ctx)
                 return;
 
-        spin_lock(&ctx->lock);
+        spin_lock_irqsave(&ctx->lock, flags);
 
         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
                 counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info)
                                 perf_max_counters - perf_reserved_percpu);
         }
 
-        spin_unlock(&ctx->lock);
+        spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 
@@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info)
         struct perf_counter *counter = info;
         struct perf_counter_context *ctx = counter->ctx;
         int cpu = smp_processor_id();
+        unsigned long flags;
         u64 perf_flags;
 
         /*
@@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info)
         if (ctx->task && cpuctx->task_ctx != ctx)
                 return;
 
-        spin_lock(&ctx->lock);
+        spin_lock_irqsave(&ctx->lock, flags);
 
         /*
          * Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info)
         if (!ctx->task && cpuctx->max_pertask)
                 cpuctx->max_pertask--;
 
-        spin_unlock(&ctx->lock);
+        spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -446,10 +446,9 @@ int perf_counter_task_disable(void)
          */
         perf_flags = hw_perf_save_disable();
 
-        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-                WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+        list_for_each_entry(counter, &ctx->counter_list, list_entry)
                 counter->state = PERF_COUNTER_STATE_OFF;
-        }
+
         hw_perf_restore(perf_flags);
 
         spin_unlock(&ctx->lock);
@@ -526,26 +525,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 }
 
 /*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-                        struct task_struct *task)
-{
-        spin_lock_init(&ctx->lock);
-        INIT_LIST_HEAD(&ctx->counter_list);
-        ctx->nr_counters = 0;
-        ctx->task = task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
-        __perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
-/*
  * Cross CPU call to read the hardware counter
  */
 static void __hw_perf_counter_read(void *info)
@@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
                 cpuctx = &per_cpu(perf_cpu_context, cpu);
                 ctx = &cpuctx->ctx;
 
-                WARN_ON_ONCE(ctx->task);
                 return ctx;
         }
 
@@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
                    int cpu,
-                   struct perf_counter *group_leader)
+                   struct perf_counter *group_leader,
+                   gfp_t gfpflags)
 {
         const struct hw_perf_counter_ops *hw_ops;
         struct perf_counter *counter;
 
-        counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+        counter = kzalloc(sizeof(*counter), gfpflags);
         if (!counter)
                 return NULL;
 
@@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
         hw_ops = NULL;
         if (!hw_event->raw && hw_event->type < 0)
                 hw_ops = sw_perf_counter_init(counter);
-        if (!hw_ops) {
+        if (!hw_ops)
                 hw_ops = hw_perf_counter_init(counter);
-        }
 
         if (!hw_ops) {
                 kfree(counter);
@@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
         struct perf_counter *counter, *group_leader;
         struct perf_counter_hw_event hw_event;
         struct perf_counter_context *ctx;
+        struct file *counter_file = NULL;
         struct file *group_file = NULL;
         int fput_needed = 0;
+        int fput_needed2 = 0;
         int ret;
 
         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
         }
 
         ret = -EINVAL;
-        counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+        counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
         if (!counter)
                 goto err_put_context;
 
-        perf_install_in_context(ctx, counter, cpu);
-
         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
         if (ret < 0)
-                goto err_remove_free_put_context;
+                goto err_free_put_context;
+
+        counter_file = fget_light(ret, &fput_needed2);
+        if (!counter_file)
+                goto err_free_put_context;
+
+        counter->filp = counter_file;
+        perf_install_in_context(ctx, counter, cpu);
+
+        fput_light(counter_file, fput_needed2);
 
 out_fput:
         fput_light(group_file, fput_needed);
 
         return ret;
 
-err_remove_free_put_context:
-        mutex_lock(&counter->mutex);
-        perf_counter_remove_from_context(counter);
-        mutex_unlock(&counter->mutex);
+err_free_put_context:
         kfree(counter);
 
 err_put_context:
@@ -1044,6 +1028,186 @@ err_put_context:
         goto out_fput;
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+                        struct task_struct *task)
+{
+        memset(ctx, 0, sizeof(*ctx));
+        spin_lock_init(&ctx->lock);
+        INIT_LIST_HEAD(&ctx->counter_list);
+        ctx->task = task;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+              struct task_struct *parent,
+              struct perf_counter_context *parent_ctx,
+              struct task_struct *child,
+              struct perf_counter_context *child_ctx)
+{
+        struct perf_counter *child_counter;
+
+        child_counter = perf_counter_alloc(&parent_counter->hw_event,
+                                            parent_counter->cpu, NULL,
+                                            GFP_ATOMIC);
+        if (!child_counter)
+                return -ENOMEM;
+
+        /*
+         * Link it up in the child's context:
+         */
+        child_counter->ctx = child_ctx;
+        child_counter->task = child;
+        list_add_counter(child_counter, child_ctx);
+        child_ctx->nr_counters++;
+
+        child_counter->parent = parent_counter;
+        parent_counter->nr_inherited++;
+        /*
+         * inherit into child's child as well:
+         */
+        child_counter->hw_event.inherit = 1;
+
+        /*
+         * Get a reference to the parent filp - we will fput it
+         * when the child counter exits. This is safe to do because
+         * we are in the parent and we know that the filp still
+         * exists and has a nonzero count:
+         */
+        atomic_long_inc(&parent_counter->filp->f_count);
+
+        return 0;
+}
+
+static void
+__perf_counter_exit_task(struct task_struct *child,
+                         struct perf_counter *child_counter,
+                         struct perf_counter_context *child_ctx)
+{
+        struct perf_counter *parent_counter;
+        u64 parent_val, child_val;
+        u64 perf_flags;
+
+        /*
+         * Disable and unlink this counter.
+         *
+         * Be careful about zapping the list - IRQ/NMI context
+         * could still be processing it:
+         */
+        local_irq_disable();
+        perf_flags = hw_perf_save_disable();
+
+        if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+                child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+        list_del_init(&child_counter->list_entry);
+
+        hw_perf_restore(perf_flags);
+        local_irq_enable();
+
+        parent_counter = child_counter->parent;
+        /*
+         * It can happen that parent exits first, and has counters
+         * that are still around due to the child reference. These
+         * counters need to be zapped - but otherwise linger.
+         */
+        if (!parent_counter)
+                return;
+
+        parent_val = atomic64_read(&parent_counter->count);
+        child_val = atomic64_read(&child_counter->count);
+
+        /*
+         * Add back the child's count to the parent's count:
+         */
+        atomic64_add(child_val, &parent_counter->count);
+
+        fput(parent_counter->filp);
+
+        kfree(child_counter);
+}
+
+/*
+ * When a child task exits, feed back counter values to parent counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+        struct perf_counter *child_counter, *tmp;
+        struct perf_counter_context *child_ctx;
+
+        child_ctx = &child->perf_counter_ctx;
+
+        if (likely(!child_ctx->nr_counters))
+                return;
+
+        list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+                                 list_entry)
+                __perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+        struct perf_counter_context *child_ctx, *parent_ctx;
+        struct perf_counter *counter, *parent_counter;
+        struct task_struct *parent = current;
+        unsigned long flags;
+
+        child_ctx  = &child->perf_counter_ctx;
+        parent_ctx = &parent->perf_counter_ctx;
+
+        __perf_counter_init_context(child_ctx, child);
+
+        /*
+         * This is executed from the parent task context, so inherit
+         * counters that have been marked for cloning:
+         */
+
+        if (likely(!parent_ctx->nr_counters))
+                return;
+
+        /*
+         * Lock the parent list. No need to lock the child - not PID
+         * hashed yet and not running, so nobody can access it.
+         */
+        spin_lock_irqsave(&parent_ctx->lock, flags);
+
+        /*
+         * We don't have to disable NMIs - we are only looking at
+         * the list, not manipulating it:
+         */
+        list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+                if (!counter->hw_event.inherit || counter->group_leader != counter)
+                        continue;
+
+                /*
+                 * Instead of creating recursive hierarchies of counters,
+                 * we link inherited counters back to the original parent,
+                 * which has a filp for sure, which we use as the reference
+                 * count:
+                 */
+                parent_counter = counter;
+                if (counter->parent)
+                        parent_counter = counter->parent;
+
+                if (inherit_counter(parent_counter, parent,
+                                  parent_ctx, child, child_ctx))
+                        break;
+        }
+
+        spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
 static void __cpuinit perf_counter_init_cpu(int cpu)
 {
         struct perf_cpu_context *cpuctx;