-rw-r--r--   include/linux/perf_counter.h |  24
-rw-r--r--   kernel/exit.c                |   7
-rw-r--r--   kernel/perf_counter.c        | 248
3 files changed, 228 insertions(+), 51 deletions(-)
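In short, this patch adds counter inheritance: the new `inherit` bit in `struct perf_counter_hw_event` marks a counter to be cloned into child tasks by `perf_counter_init_task()` at fork time, and `perf_counter_exit_task()` folds each child's count back into the parent counter at exit. A minimal user-space sketch of setting the bit follows; it is not part of this patch, only the struct fields visible in this diff are used, and the `.type` value plus the argument order of the prototype `sys_perf_counter_open()` call (pid, cpu, group_fd, flags) are assumptions made for illustration.

/* Hypothetical usage sketch (not part of this patch). */
int open_inherited_counter(void)
{
	struct perf_counter_hw_event hw_event = {
		.type		= 0,	/* assumed: some hardware event type (raw = 0) */
		.irq_period	= 0,	/* no sampling interrupts */
		.record_type	= 0,
		.disabled	= 0,	/* start counting right away */
		.nmi		= 0,
		.raw		= 0,
		.inherit	= 1,	/* new bit: clone into children at fork,
					 * fold child counts back in at exit */
	};

	/*
	 * Assumed argument order (pid, cpu, group_fd, flags): count the
	 * current task on any CPU, with no group leader.  Once children
	 * fork and exit, reads of this fd include their counts as well.
	 */
	return sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
}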
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 72460289c654..e5d25bf8f74e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -75,10 +75,11 @@ struct perf_counter_hw_event {
 	u64			irq_period;
 	u32			record_type;
 
 	u32			disabled     :  1, /* off by default      */
 				nmi	     :  1, /* NMI sampling        */
 				raw	     :  1, /* raw event type      */
-				__reserved_1 : 29;
+				inherit	     :  1, /* children inherit it */
+				__reserved_1 : 28;
 
 	u64			__reserved_2;
 };
@@ -138,6 +139,8 @@ enum perf_counter_active_state {
 	PERF_COUNTER_STATE_ACTIVE	= 1,
 };
 
+struct file;
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -156,7 +159,10 @@ struct perf_counter {
 
 	struct perf_counter_context	*ctx;
 	struct task_struct		*task;
+	struct file			*filp;
 
+	unsigned int			nr_inherited;
+	struct perf_counter		*parent;
 	/*
 	 * Protect attach/detach:
 	 */
@@ -210,13 +216,16 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
+extern void
+perf_counter_show(struct perf_counter *counter, char *str, int trace);
 extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern u64 hw_perf_save_disable(void);
@@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void);
 
 #else
 static inline void
+perf_counter_show(struct perf_counter *counter, char *str, int trace)	{ }
+static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline void perf_counter_init_task(struct task_struct *task)	{ }
+static inline void perf_counter_init_task(struct task_struct *child)	{ }
+static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)				{ }
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..d336c90a5f13 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-#ifdef CONFIG_FUTEX
 	/*
-	 * This must happen late, after the PID is not
-	 * hashed anymore:
+	 * These must happen late, after the PID is not
+	 * hashed anymore, but still at a point that may sleep:
 	 */
+	perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
 	if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861ce8b27..f5e81dd193d1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 		list_del_init(&sibling->list_entry);
 		list_add_tail(&sibling->list_entry, &ctx->counter_list);
-		WARN_ON_ONCE(!sibling->group_leader);
-		WARN_ON_ONCE(sibling->group_leader == sibling);
 		sibling->group_leader = sibling;
 	}
 }
@@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info)
 			perf_max_counters - perf_reserved_percpu);
 	}
 
-	spin_unlock(&ctx->lock);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 
@@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	int cpu = smp_processor_id();
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info)
 	if (!ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
-	spin_unlock(&ctx->lock);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -446,10 +446,9 @@ int perf_counter_task_disable(void)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry)
 		counter->state = PERF_COUNTER_STATE_OFF;
-	}
+
 	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
@@ -526,26 +525,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 }
 
 /*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-			    struct task_struct *task)
-{
-	spin_lock_init(&ctx->lock);
-	INIT_LIST_HEAD(&ctx->counter_list);
-	ctx->nr_counters = 0;
-	ctx->task = task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
-	__perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
-/*
  * Cross CPU call to read the hardware counter
  */
 static void __hw_perf_counter_read(void *info)
@@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
 
-		WARN_ON_ONCE(ctx->task);
 		return ctx;
 	}
 
@@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
-		   struct perf_counter *group_leader)
+		   struct perf_counter *group_leader,
+		   gfp_t gfpflags)
 {
 	const struct hw_perf_counter_ops *hw_ops;
 	struct perf_counter *counter;
 
-	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	counter = kzalloc(sizeof(*counter), gfpflags);
 	if (!counter)
 		return NULL;
 
@@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-	if (!hw_ops) {
+	if (!hw_ops)
 		hw_ops = hw_perf_counter_init(counter);
-	}
 
 	if (!hw_ops) {
 		kfree(counter);
@@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
 	struct perf_counter_context *ctx;
+	struct file *counter_file = NULL;
 	struct file *group_file = NULL;
 	int fput_needed = 0;
+	int fput_needed2 = 0;
 	int ret;
 
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	}
 
 	ret = -EINVAL;
-	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
 	if (!counter)
 		goto err_put_context;
 
-	perf_install_in_context(ctx, counter, cpu);
-
 	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
 	if (ret < 0)
-		goto err_remove_free_put_context;
+		goto err_free_put_context;
+
+	counter_file = fget_light(ret, &fput_needed2);
+	if (!counter_file)
+		goto err_free_put_context;
+
+	counter->filp = counter_file;
+	perf_install_in_context(ctx, counter, cpu);
+
+	fput_light(counter_file, fput_needed2);
 
 out_fput:
 	fput_light(group_file, fput_needed);
 
 	return ret;
 
-err_remove_free_put_context:
-	mutex_lock(&counter->mutex);
-	perf_counter_remove_from_context(counter);
-	mutex_unlock(&counter->mutex);
+err_free_put_context:
 	kfree(counter);
 
 err_put_context:
@@ -1044,6 +1028,186 @@ err_put_context:
 	goto out_fput;
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->lock);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	ctx->task = task;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *child_counter;
+
+	child_counter = perf_counter_alloc(&parent_counter->hw_event,
+					    parent_counter->cpu, NULL,
+					    GFP_ATOMIC);
+	if (!child_counter)
+		return -ENOMEM;
+
+	/*
+	 * Link it up in the child's context:
+	 */
+	child_counter->ctx = child_ctx;
+	child_counter->task = child;
+	list_add_counter(child_counter, child_ctx);
+	child_ctx->nr_counters++;
+
+	child_counter->parent = parent_counter;
+	parent_counter->nr_inherited++;
+	/*
+	 * inherit into child's child as well:
+	 */
+	child_counter->hw_event.inherit = 1;
+
+	/*
+	 * Get a reference to the parent filp - we will fput it
+	 * when the child counter exits. This is safe to do because
+	 * we are in the parent and we know that the filp still
+	 * exists and has a nonzero count:
+	 */
+	atomic_long_inc(&parent_counter->filp->f_count);
+
+	return 0;
+}
+
+static void
+__perf_counter_exit_task(struct task_struct *child,
+			 struct perf_counter *child_counter,
+			 struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *parent_counter;
+	u64 parent_val, child_val;
+	u64 perf_flags;
+
+	/*
+	 * Disable and unlink this counter.
+	 *
+	 * Be careful about zapping the list - IRQ/NMI context
+	 * could still be processing it:
+	 */
+	local_irq_disable();
+	perf_flags = hw_perf_save_disable();
+
+	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+	list_del_init(&child_counter->list_entry);
+
+	hw_perf_restore(perf_flags);
+	local_irq_enable();
+
+	parent_counter = child_counter->parent;
+	/*
+	 * It can happen that parent exits first, and has counters
+	 * that are still around due to the child reference. These
+	 * counters need to be zapped - but otherwise linger.
+	 */
+	if (!parent_counter)
+		return;
+
+	parent_val = atomic64_read(&parent_counter->count);
+	child_val = atomic64_read(&child_counter->count);
+
+	/*
+	 * Add back the child's count to the parent's count:
+	 */
+	atomic64_add(child_val, &parent_counter->count);
+
+	fput(parent_counter->filp);
+
+	kfree(child_counter);
+}
+
+/*
+ * When a child task exits, feed back counter values to parent counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+	struct perf_counter *child_counter, *tmp;
+	struct perf_counter_context *child_ctx;
+
+	child_ctx = &child->perf_counter_ctx;
+
+	if (likely(!child_ctx->nr_counters))
+		return;
+
+	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+				 list_entry)
+		__perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter *counter, *parent_counter;
+	struct task_struct *parent = current;
+	unsigned long flags;
+
+	child_ctx  = &child->perf_counter_ctx;
+	parent_ctx = &parent->perf_counter_ctx;
+
+	__perf_counter_init_context(child_ctx, child);
+
+	/*
+	 * This is executed from the parent task context, so inherit
+	 * counters that have been marked for cloning:
+	 */
+
+	if (likely(!parent_ctx->nr_counters))
+		return;
+
+	/*
+	 * Lock the parent list. No need to lock the child - not PID
+	 * hashed yet and not running, so nobody can access it.
+	 */
+	spin_lock_irqsave(&parent_ctx->lock, flags);
+
+	/*
+	 * We don't have to disable NMIs - we are only looking at
+	 * the list, not manipulating it:
+	 */
+	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+		if (!counter->hw_event.inherit || counter->group_leader != counter)
+			continue;
+
+		/*
+		 * Instead of creating recursive hierarchies of counters,
+		 * we link inherited counters back to the original parent,
+		 * which has a filp for sure, which we use as the reference
+		 * count:
+		 */
+		parent_counter = counter;
+		if (counter->parent)
+			parent_counter = counter->parent;
+
+		if (inherit_counter(parent_counter, parent,
+				    parent_ctx, child, child_ctx))
+			break;
+	}
+
+	spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
 static void __cpuinit perf_counter_init_cpu(int cpu)
 {
 	struct perf_cpu_context *cpuctx;