Diffstat (limited to 'kernel/events')
-rw-r--r--  kernel/events/core.c        | 173
-rw-r--r--  kernel/events/internal.h    |  35
-rw-r--r--  kernel/events/ring_buffer.c | 126
-rw-r--r--  kernel/events/uprobes.c     | 223
4 files changed, 348 insertions(+), 209 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d49a9d29334c..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
| @@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | |||
| 175 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | 175 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); |
| 176 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | 176 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; |
| 177 | 177 | ||
| 178 | static atomic_t perf_sample_allowed_ns __read_mostly = | 178 | static int perf_sample_allowed_ns __read_mostly = |
| 179 | ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); | 179 | DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; |
| 180 | 180 | ||
| 181 | void update_perf_cpu_limits(void) | 181 | void update_perf_cpu_limits(void) |
| 182 | { | 182 | { |
| @@ -184,7 +184,7 @@ void update_perf_cpu_limits(void) | |||
| 184 | 184 | ||
| 185 | tmp *= sysctl_perf_cpu_time_max_percent; | 185 | tmp *= sysctl_perf_cpu_time_max_percent; |
| 186 | do_div(tmp, 100); | 186 | do_div(tmp, 100); |
| 187 | atomic_set(&perf_sample_allowed_ns, tmp); | 187 | ACCESS_ONCE(perf_sample_allowed_ns) = tmp; |
| 188 | } | 188 | } |
| 189 | 189 | ||
| 190 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | 190 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); |
| @@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
| 193 | void __user *buffer, size_t *lenp, | 193 | void __user *buffer, size_t *lenp, |
| 194 | loff_t *ppos) | 194 | loff_t *ppos) |
| 195 | { | 195 | { |
| 196 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | 196 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 197 | 197 | ||
| 198 | if (ret || !write) | 198 | if (ret || !write) |
| 199 | return ret; | 199 | return ret; |
| @@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | |||
| 228 | * we detect that events are taking too long. | 228 | * we detect that events are taking too long. |
| 229 | */ | 229 | */ |
| 230 | #define NR_ACCUMULATED_SAMPLES 128 | 230 | #define NR_ACCUMULATED_SAMPLES 128 |
| 231 | DEFINE_PER_CPU(u64, running_sample_length); | 231 | static DEFINE_PER_CPU(u64, running_sample_length); |
| 232 | 232 | ||
| 233 | void perf_sample_event_took(u64 sample_len_ns) | 233 | void perf_sample_event_took(u64 sample_len_ns) |
| 234 | { | 234 | { |
| 235 | u64 avg_local_sample_len; | 235 | u64 avg_local_sample_len; |
| 236 | u64 local_samples_len; | 236 | u64 local_samples_len; |
| 237 | u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); | ||
| 237 | 238 | ||
| 238 | if (atomic_read(&perf_sample_allowed_ns) == 0) | 239 | if (allowed_ns == 0) |
| 239 | return; | 240 | return; |
| 240 | 241 | ||
| 241 | /* decay the counter by 1 average sample */ | 242 | /* decay the counter by 1 average sample */ |
| @@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 251 | */ | 252 | */ |
| 252 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | 253 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; |
| 253 | 254 | ||
| 254 | if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) | 255 | if (avg_local_sample_len <= allowed_ns) |
| 255 | return; | 256 | return; |
| 256 | 257 | ||
| 257 | if (max_samples_per_tick <= 1) | 258 | if (max_samples_per_tick <= 1) |
| @@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 262 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | 263 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; |
| 263 | 264 | ||
| 264 | printk_ratelimited(KERN_WARNING | 265 | printk_ratelimited(KERN_WARNING |
| 265 | "perf samples too long (%lld > %d), lowering " | 266 | "perf samples too long (%lld > %lld), lowering " |
| 266 | "kernel.perf_event_max_sample_rate to %d\n", | 267 | "kernel.perf_event_max_sample_rate to %d\n", |
| 267 | avg_local_sample_len, | 268 | avg_local_sample_len, allowed_ns, |
| 268 | atomic_read(&perf_sample_allowed_ns), | ||
| 269 | sysctl_perf_event_sample_rate); | 269 | sysctl_perf_event_sample_rate); |
| 270 | 270 | ||
| 271 | update_perf_cpu_limits(); | 271 | update_perf_cpu_limits(); |
| @@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
| 899 | put_ctx(ctx->parent_ctx); | 899 | put_ctx(ctx->parent_ctx); |
| 900 | ctx->parent_ctx = NULL; | 900 | ctx->parent_ctx = NULL; |
| 901 | } | 901 | } |
| 902 | ctx->generation++; | ||
| 902 | } | 903 | } |
| 903 | 904 | ||
| 904 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | 905 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) |
| @@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1136 | ctx->nr_events++; | 1137 | ctx->nr_events++; |
| 1137 | if (event->attr.inherit_stat) | 1138 | if (event->attr.inherit_stat) |
| 1138 | ctx->nr_stat++; | 1139 | ctx->nr_stat++; |
| 1140 | |||
| 1141 | ctx->generation++; | ||
| 1139 | } | 1142 | } |
| 1140 | 1143 | ||
| 1141 | /* | 1144 | /* |
| @@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event) | |||
| 1201 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 1204 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
| 1202 | size += sizeof(data->data_src.val); | 1205 | size += sizeof(data->data_src.val); |
| 1203 | 1206 | ||
| 1207 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
| 1208 | size += sizeof(data->txn); | ||
| 1209 | |||
| 1204 | event->header_size = size; | 1210 | event->header_size = size; |
| 1205 | } | 1211 | } |
| 1206 | 1212 | ||
| @@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1310 | */ | 1316 | */ |
| 1311 | if (event->state > PERF_EVENT_STATE_OFF) | 1317 | if (event->state > PERF_EVENT_STATE_OFF) |
| 1312 | event->state = PERF_EVENT_STATE_OFF; | 1318 | event->state = PERF_EVENT_STATE_OFF; |
| 1319 | |||
| 1320 | ctx->generation++; | ||
| 1313 | } | 1321 | } |
| 1314 | 1322 | ||
| 1315 | static void perf_group_detach(struct perf_event *event) | 1323 | static void perf_group_detach(struct perf_event *event) |
| @@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 2146 | } | 2154 | } |
| 2147 | 2155 | ||
| 2148 | /* | 2156 | /* |
| 2149 | * Test whether two contexts are equivalent, i.e. whether they | 2157 | * Test whether two contexts are equivalent, i.e. whether they have both been |
| 2150 | * have both been cloned from the same version of the same context | 2158 | * cloned from the same version of the same context. |
| 2151 | * and they both have the same number of enabled events. | 2159 | * |
| 2152 | * If the number of enabled events is the same, then the set | 2160 | * Equivalence is measured using a generation number in the context that is |
| 2153 | * of enabled events should be the same, because these are both | 2161 | * incremented on each modification to it; see unclone_ctx(), list_add_event() |
| 2154 | * inherited contexts, therefore we can't access individual events | 2162 | * and list_del_event(). |
| 2155 | * in them directly with an fd; we can only enable/disable all | ||
| 2156 | * events via prctl, or enable/disable all events in a family | ||
| 2157 | * via ioctl, which will have the same effect on both contexts. | ||
| 2158 | */ | 2163 | */ |
| 2159 | static int context_equiv(struct perf_event_context *ctx1, | 2164 | static int context_equiv(struct perf_event_context *ctx1, |
| 2160 | struct perf_event_context *ctx2) | 2165 | struct perf_event_context *ctx2) |
| 2161 | { | 2166 | { |
| 2162 | return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx | 2167 | /* Pinning disables the swap optimization */ |
| 2163 | && ctx1->parent_gen == ctx2->parent_gen | 2168 | if (ctx1->pin_count || ctx2->pin_count) |
| 2164 | && !ctx1->pin_count && !ctx2->pin_count; | 2169 | return 0; |
| 2170 | |||
| 2171 | /* If ctx1 is the parent of ctx2 */ | ||
| 2172 | if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) | ||
| 2173 | return 1; | ||
| 2174 | |||
| 2175 | /* If ctx2 is the parent of ctx1 */ | ||
| 2176 | if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) | ||
| 2177 | return 1; | ||
| 2178 | |||
| 2179 | /* | ||
| 2180 | * If ctx1 and ctx2 have the same parent; we flatten the parent | ||
| 2181 | * hierarchy, see perf_event_init_context(). | ||
| 2182 | */ | ||
| 2183 | if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && | ||
| 2184 | ctx1->parent_gen == ctx2->parent_gen) | ||
| 2185 | return 1; | ||
| 2186 | |||
| 2187 | /* Unmatched */ | ||
| 2188 | return 0; | ||
| 2165 | } | 2189 | } |
| 2166 | 2190 | ||
| 2167 | static void __perf_event_sync_stat(struct perf_event *event, | 2191 | static void __perf_event_sync_stat(struct perf_event *event, |
| @@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2244 | { | 2268 | { |
| 2245 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 2269 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
| 2246 | struct perf_event_context *next_ctx; | 2270 | struct perf_event_context *next_ctx; |
| 2247 | struct perf_event_context *parent; | 2271 | struct perf_event_context *parent, *next_parent; |
| 2248 | struct perf_cpu_context *cpuctx; | 2272 | struct perf_cpu_context *cpuctx; |
| 2249 | int do_switch = 1; | 2273 | int do_switch = 1; |
| 2250 | 2274 | ||
| @@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2256 | return; | 2280 | return; |
| 2257 | 2281 | ||
| 2258 | rcu_read_lock(); | 2282 | rcu_read_lock(); |
| 2259 | parent = rcu_dereference(ctx->parent_ctx); | ||
| 2260 | next_ctx = next->perf_event_ctxp[ctxn]; | 2283 | next_ctx = next->perf_event_ctxp[ctxn]; |
| 2261 | if (parent && next_ctx && | 2284 | if (!next_ctx) |
| 2262 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 2285 | goto unlock; |
| 2286 | |||
| 2287 | parent = rcu_dereference(ctx->parent_ctx); | ||
| 2288 | next_parent = rcu_dereference(next_ctx->parent_ctx); | ||
| 2289 | |||
| 2290 | /* If neither context have a parent context; they cannot be clones. */ | ||
| 2291 | if (!parent && !next_parent) | ||
| 2292 | goto unlock; | ||
| 2293 | |||
| 2294 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | ||
| 2263 | /* | 2295 | /* |
| 2264 | * Looks like the two contexts are clones, so we might be | 2296 | * Looks like the two contexts are clones, so we might be |
| 2265 | * able to optimize the context switch. We lock both | 2297 | * able to optimize the context switch. We lock both |
| @@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2287 | raw_spin_unlock(&next_ctx->lock); | 2319 | raw_spin_unlock(&next_ctx->lock); |
| 2288 | raw_spin_unlock(&ctx->lock); | 2320 | raw_spin_unlock(&ctx->lock); |
| 2289 | } | 2321 | } |
| 2322 | unlock: | ||
| 2290 | rcu_read_unlock(); | 2323 | rcu_read_unlock(); |
| 2291 | 2324 | ||
| 2292 | if (do_switch) { | 2325 | if (do_switch) { |
| @@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4572 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 4605 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
| 4573 | perf_output_put(handle, data->data_src.val); | 4606 | perf_output_put(handle, data->data_src.val); |
| 4574 | 4607 | ||
| 4608 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
| 4609 | perf_output_put(handle, data->txn); | ||
| 4610 | |||
| 4575 | if (!event->attr.watermark) { | 4611 | if (!event->attr.watermark) { |
| 4576 | int wakeup_events = event->attr.wakeup_events; | 4612 | int wakeup_events = event->attr.wakeup_events; |
| 4577 | 4613 | ||
| @@ -5100,27 +5136,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
| 5100 | unsigned int size; | 5136 | unsigned int size; |
| 5101 | char tmp[16]; | 5137 | char tmp[16]; |
| 5102 | char *buf = NULL; | 5138 | char *buf = NULL; |
| 5103 | const char *name; | 5139 | char *name; |
| 5104 | |||
| 5105 | memset(tmp, 0, sizeof(tmp)); | ||
| 5106 | 5140 | ||
| 5107 | if (file) { | 5141 | if (file) { |
| 5108 | struct inode *inode; | 5142 | struct inode *inode; |
| 5109 | dev_t dev; | 5143 | dev_t dev; |
| 5144 | |||
| 5145 | buf = kmalloc(PATH_MAX, GFP_KERNEL); | ||
| 5146 | if (!buf) { | ||
| 5147 | name = "//enomem"; | ||
| 5148 | goto cpy_name; | ||
| 5149 | } | ||
| 5110 | /* | 5150 | /* |
| 5111 | * d_path works from the end of the rb backwards, so we | 5151 | * d_path() works from the end of the rb backwards, so we |
| 5112 | * need to add enough zero bytes after the string to handle | 5152 | * need to add enough zero bytes after the string to handle |
| 5113 | * the 64bit alignment we do later. | 5153 | * the 64bit alignment we do later. |
| 5114 | */ | 5154 | */ |
| 5115 | buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); | 5155 | name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); |
| 5116 | if (!buf) { | ||
| 5117 | name = strncpy(tmp, "//enomem", sizeof(tmp)); | ||
| 5118 | goto got_name; | ||
| 5119 | } | ||
| 5120 | name = d_path(&file->f_path, buf, PATH_MAX); | ||
| 5121 | if (IS_ERR(name)) { | 5156 | if (IS_ERR(name)) { |
| 5122 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | 5157 | name = "//toolong"; |
| 5123 | goto got_name; | 5158 | goto cpy_name; |
| 5124 | } | 5159 | } |
| 5125 | inode = file_inode(vma->vm_file); | 5160 | inode = file_inode(vma->vm_file); |
| 5126 | dev = inode->i_sb->s_dev; | 5161 | dev = inode->i_sb->s_dev; |
| @@ -5128,34 +5163,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
| 5128 | gen = inode->i_generation; | 5163 | gen = inode->i_generation; |
| 5129 | maj = MAJOR(dev); | 5164 | maj = MAJOR(dev); |
| 5130 | min = MINOR(dev); | 5165 | min = MINOR(dev); |
| 5131 | 5166 | goto got_name; | |
| 5132 | } else { | 5167 | } else { |
| 5133 | if (arch_vma_name(mmap_event->vma)) { | 5168 | name = (char *)arch_vma_name(vma); |
| 5134 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 5169 | if (name) |
| 5135 | sizeof(tmp) - 1); | 5170 | goto cpy_name; |
| 5136 | tmp[sizeof(tmp) - 1] = '\0'; | ||
| 5137 | goto got_name; | ||
| 5138 | } | ||
| 5139 | 5171 | ||
| 5140 | if (!vma->vm_mm) { | 5172 | if (vma->vm_start <= vma->vm_mm->start_brk && |
| 5141 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); | ||
| 5142 | goto got_name; | ||
| 5143 | } else if (vma->vm_start <= vma->vm_mm->start_brk && | ||
| 5144 | vma->vm_end >= vma->vm_mm->brk) { | 5173 | vma->vm_end >= vma->vm_mm->brk) { |
| 5145 | name = strncpy(tmp, "[heap]", sizeof(tmp)); | 5174 | name = "[heap]"; |
| 5146 | goto got_name; | 5175 | goto cpy_name; |
| 5147 | } else if (vma->vm_start <= vma->vm_mm->start_stack && | 5176 | } |
| 5177 | if (vma->vm_start <= vma->vm_mm->start_stack && | ||
| 5148 | vma->vm_end >= vma->vm_mm->start_stack) { | 5178 | vma->vm_end >= vma->vm_mm->start_stack) { |
| 5149 | name = strncpy(tmp, "[stack]", sizeof(tmp)); | 5179 | name = "[stack]"; |
| 5150 | goto got_name; | 5180 | goto cpy_name; |
| 5151 | } | 5181 | } |
| 5152 | 5182 | ||
| 5153 | name = strncpy(tmp, "//anon", sizeof(tmp)); | 5183 | name = "//anon"; |
| 5154 | goto got_name; | 5184 | goto cpy_name; |
| 5155 | } | 5185 | } |
| 5156 | 5186 | ||
| 5187 | cpy_name: | ||
| 5188 | strlcpy(tmp, name, sizeof(tmp)); | ||
| 5189 | name = tmp; | ||
| 5157 | got_name: | 5190 | got_name: |
| 5158 | size = ALIGN(strlen(name)+1, sizeof(u64)); | 5191 | /* |
| 5192 | * Since our buffer works in 8 byte units we need to align our string | ||
| 5193 | * size to a multiple of 8. However, we must guarantee the tail end is | ||
| 5194 | * zero'd out to avoid leaking random bits to userspace. | ||
| 5195 | */ | ||
| 5196 | size = strlen(name)+1; | ||
| 5197 | while (!IS_ALIGNED(size, sizeof(u64))) | ||
| 5198 | name[size++] = '\0'; | ||
| 5159 | 5199 | ||
| 5160 | mmap_event->file_name = name; | 5200 | mmap_event->file_name = name; |
| 5161 | mmap_event->file_size = size; | 5201 | mmap_event->file_size = size; |
| @@ -6292,6 +6332,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) | |||
| 6292 | 6332 | ||
| 6293 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | 6333 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
| 6294 | } | 6334 | } |
| 6335 | static DEVICE_ATTR_RO(type); | ||
| 6295 | 6336 | ||
| 6296 | static ssize_t | 6337 | static ssize_t |
| 6297 | perf_event_mux_interval_ms_show(struct device *dev, | 6338 | perf_event_mux_interval_ms_show(struct device *dev, |
| @@ -6336,17 +6377,19 @@ perf_event_mux_interval_ms_store(struct device *dev, | |||
| 6336 | 6377 | ||
| 6337 | return count; | 6378 | return count; |
| 6338 | } | 6379 | } |
| 6380 | static DEVICE_ATTR_RW(perf_event_mux_interval_ms); | ||
| 6339 | 6381 | ||
| 6340 | static struct device_attribute pmu_dev_attrs[] = { | 6382 | static struct attribute *pmu_dev_attrs[] = { |
| 6341 | __ATTR_RO(type), | 6383 | &dev_attr_type.attr, |
| 6342 | __ATTR_RW(perf_event_mux_interval_ms), | 6384 | &dev_attr_perf_event_mux_interval_ms.attr, |
| 6343 | __ATTR_NULL, | 6385 | NULL, |
| 6344 | }; | 6386 | }; |
| 6387 | ATTRIBUTE_GROUPS(pmu_dev); | ||
| 6345 | 6388 | ||
| 6346 | static int pmu_bus_running; | 6389 | static int pmu_bus_running; |
| 6347 | static struct bus_type pmu_bus = { | 6390 | static struct bus_type pmu_bus = { |
| 6348 | .name = "event_source", | 6391 | .name = "event_source", |
| 6349 | .dev_attrs = pmu_dev_attrs, | 6392 | .dev_groups = pmu_dev_groups, |
| 6350 | }; | 6393 | }; |
| 6351 | 6394 | ||
| 6352 | static void pmu_dev_release(struct device *dev) | 6395 | static void pmu_dev_release(struct device *dev) |
| @@ -6767,6 +6810,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 6767 | if (ret) | 6810 | if (ret) |
| 6768 | return -EFAULT; | 6811 | return -EFAULT; |
| 6769 | 6812 | ||
| 6813 | /* disabled for now */ | ||
| 6814 | if (attr->mmap2) | ||
| 6815 | return -EINVAL; | ||
| 6816 | |||
| 6770 | if (attr->__reserved_1) | 6817 | if (attr->__reserved_1) |
| 6771 | return -EINVAL; | 6818 | return -EINVAL; |
| 6772 | 6819 | ||
| @@ -7122,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7122 | } | 7169 | } |
| 7123 | 7170 | ||
| 7124 | perf_install_in_context(ctx, event, event->cpu); | 7171 | perf_install_in_context(ctx, event, event->cpu); |
| 7125 | ++ctx->generation; | ||
| 7126 | perf_unpin_context(ctx); | 7172 | perf_unpin_context(ctx); |
| 7127 | mutex_unlock(&ctx->mutex); | 7173 | mutex_unlock(&ctx->mutex); |
| 7128 | 7174 | ||
| @@ -7205,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7205 | WARN_ON_ONCE(ctx->parent_ctx); | 7251 | WARN_ON_ONCE(ctx->parent_ctx); |
| 7206 | mutex_lock(&ctx->mutex); | 7252 | mutex_lock(&ctx->mutex); |
| 7207 | perf_install_in_context(ctx, event, cpu); | 7253 | perf_install_in_context(ctx, event, cpu); |
| 7208 | ++ctx->generation; | ||
| 7209 | perf_unpin_context(ctx); | 7254 | perf_unpin_context(ctx); |
| 7210 | mutex_unlock(&ctx->mutex); | 7255 | mutex_unlock(&ctx->mutex); |
| 7211 | 7256 | ||
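The core.c hunks above replace the old parent_gen comparison in context_equiv() with a generation counter that is bumped on every modification of a context (unclone_ctx(), list_add_event(), list_del_event()), which is also why the explicit ++ctx->generation after perf_install_in_context() in perf_event_open() and perf_event_create_kernel_counter() can go away. A minimal userspace sketch of the same idea follows; the struct and function names are invented for illustration and are not the kernel's:

#include <stdio.h>
#include <stdlib.h>

struct ctx {
	struct ctx *parent;             /* NULL for an original context */
	unsigned long long gen;         /* bumped on every modification */
	unsigned long long parent_gen;  /* parent->gen at clone time */
	int nr_events;
};

/* Stand-in for list_add_event()/list_del_event(): any change bumps gen. */
static void ctx_modify(struct ctx *c)
{
	c->nr_events++;
	c->gen++;
}

static struct ctx *ctx_clone(struct ctx *parent)
{
	struct ctx *c = calloc(1, sizeof(*c));

	if (!c) {
		perror("calloc");
		exit(1);
	}
	c->parent = parent;
	c->parent_gen = parent->gen;    /* snapshot the parent's generation */
	c->nr_events = parent->nr_events;
	return c;
}

/* Equivalent only if cloned from the same version of the same parent. */
static int ctx_equiv(const struct ctx *a, const struct ctx *b)
{
	return a->parent && a->parent == b->parent &&
	       a->parent_gen == b->parent_gen;
}

int main(void)
{
	struct ctx parent = { .gen = 1 };
	struct ctx *c1 = ctx_clone(&parent);
	struct ctx *c2 = ctx_clone(&parent);
	struct ctx *c3;

	printf("clones of unmodified parent: %d\n", ctx_equiv(c1, c2)); /* 1 */

	ctx_modify(&parent);
	c3 = ctx_clone(&parent);
	printf("clone after modification:    %d\n", ctx_equiv(c1, c3)); /* 0 */

	free(c1); free(c2); free(c3);
	return 0;
}

In the kernel version the clone's own generation also participates, so a parent/child pair can be compared directly; the sketch only covers the clone-versus-clone case.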
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
| @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ |
| 85 | static inline unsigned int \ | 85 | static inline unsigned long \ |
| 86 | func_name(struct perf_output_handle *handle, \ | 86 | func_name(struct perf_output_handle *handle, \ |
| 87 | const void *buf, unsigned int len) \ | 87 | const void *buf, unsigned long len) \ |
| 88 | { \ | 88 | { \ |
| 89 | unsigned long size, written; \ | 89 | unsigned long size, written; \ |
| 90 | \ | 90 | \ |
| 91 | do { \ | 91 | do { \ |
| 92 | size = min_t(unsigned long, handle->size, len); \ | 92 | size = min(handle->size, len); \ |
| 93 | \ | ||
| 94 | written = memcpy_func(handle->addr, buf, size); \ | 93 | written = memcpy_func(handle->addr, buf, size); \ |
| 94 | written = size - written; \ | ||
| 95 | \ | 95 | \ |
| 96 | len -= written; \ | 96 | len -= written; \ |
| 97 | handle->addr += written; \ | 97 | handle->addr += written; \ |
| @@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \ | |||
| 110 | return len; \ | 110 | return len; \ |
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | static inline int memcpy_common(void *dst, const void *src, size_t n) | 113 | static inline unsigned long |
| 114 | memcpy_common(void *dst, const void *src, unsigned long n) | ||
| 114 | { | 115 | { |
| 115 | memcpy(dst, src, n); | 116 | memcpy(dst, src, n); |
| 116 | return n; | 117 | return 0; |
| 117 | } | 118 | } |
| 118 | 119 | ||
| 119 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) | 120 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) |
| 120 | 121 | ||
| 121 | #define MEMCPY_SKIP(dst, src, n) (n) | 122 | static inline unsigned long |
| 123 | memcpy_skip(void *dst, const void *src, unsigned long n) | ||
| 124 | { | ||
| 125 | return 0; | ||
| 126 | } | ||
| 122 | 127 | ||
| 123 | DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) | 128 | DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) |
| 124 | 129 | ||
| 125 | #ifndef arch_perf_out_copy_user | 130 | #ifndef arch_perf_out_copy_user |
| 126 | #define arch_perf_out_copy_user __copy_from_user_inatomic | 131 | #define arch_perf_out_copy_user arch_perf_out_copy_user |
| 132 | |||
| 133 | static inline unsigned long | ||
| 134 | arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) | ||
| 135 | { | ||
| 136 | unsigned long ret; | ||
| 137 | |||
| 138 | pagefault_disable(); | ||
| 139 | ret = __copy_from_user_inatomic(dst, src, n); | ||
| 140 | pagefault_enable(); | ||
| 141 | |||
| 142 | return ret; | ||
| 143 | } | ||
| 127 | #endif | 144 | #endif |
| 128 | 145 | ||
| 129 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) | 146 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) |
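The internal.h changes standardize every copy helper on the __copy_from_user() return convention: the low-level memcpy_func reports how many bytes it could not copy, and DEFINE_OUTPUT_COPY() converts that into the number of bytes actually written before advancing the handle. A rough userspace sketch of that convention, with invented names and a flat destination buffer instead of the paged ring buffer:

#include <stdio.h>
#include <string.h>

/*
 * Copy helper using the __copy_from_user() convention: return the number
 * of bytes that could NOT be copied, 0 on complete success.
 */
static unsigned long copy_common(void *dst, const void *src, unsigned long n)
{
	memcpy(dst, src, n);
	return 0;                       /* plain memcpy never fails partially */
}

/* Caller turns "bytes left" into "bytes written", as the macro body now does. */
static unsigned long output_copy(char *dst, unsigned long dst_size,
				 const char *src, unsigned long len)
{
	unsigned long off = 0;

	while (len && off < dst_size) {
		unsigned long size = (dst_size - off < len) ? dst_size - off : len;
		unsigned long left = copy_common(dst + off, src + off, size);
		unsigned long written = size - left;

		len -= written;
		off += written;
		if (written < size)
			break;          /* partial copy: report what remains */
	}
	return len;                     /* bytes that were not written */
}

int main(void)
{
	char buf[16];
	const char msg[] = "perf ring buffer";   /* 17 bytes with the NUL */

	printf("left uncopied: %lu\n",
	       output_copy(buf, sizeof(buf), msg, sizeof(msg)));
	return 0;
}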
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
| @@ -12,40 +12,10 @@ | |||
| 12 | #include <linux/perf_event.h> | 12 | #include <linux/perf_event.h> |
| 13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
| 14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| 15 | #include <linux/circ_buf.h> | ||
| 15 | 16 | ||
| 16 | #include "internal.h" | 17 | #include "internal.h" |
| 17 | 18 | ||
| 18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | ||
| 19 | unsigned long offset, unsigned long head) | ||
| 20 | { | ||
| 21 | unsigned long sz = perf_data_size(rb); | ||
| 22 | unsigned long mask = sz - 1; | ||
| 23 | |||
| 24 | /* | ||
| 25 | * check if user-writable | ||
| 26 | * overwrite : over-write its own tail | ||
| 27 | * !overwrite: buffer possibly drops events. | ||
| 28 | */ | ||
| 29 | if (rb->overwrite) | ||
| 30 | return true; | ||
| 31 | |||
| 32 | /* | ||
| 33 | * verify that payload is not bigger than buffer | ||
| 34 | * otherwise masking logic may fail to detect | ||
| 35 | * the "not enough space" condition | ||
| 36 | */ | ||
| 37 | if ((head - offset) > sz) | ||
| 38 | return false; | ||
| 39 | |||
| 40 | offset = (offset - tail) & mask; | ||
| 41 | head = (head - tail) & mask; | ||
| 42 | |||
| 43 | if ((int)(head - offset) < 0) | ||
| 44 | return false; | ||
| 45 | |||
| 46 | return true; | ||
| 47 | } | ||
| 48 | |||
| 49 | static void perf_output_wakeup(struct perf_output_handle *handle) | 19 | static void perf_output_wakeup(struct perf_output_handle *handle) |
| 50 | { | 20 | { |
| 51 | atomic_set(&handle->rb->poll, POLL_IN); | 21 | atomic_set(&handle->rb->poll, POLL_IN); |
| @@ -87,15 +57,36 @@ again: | |||
| 87 | goto out; | 57 | goto out; |
| 88 | 58 | ||
| 89 | /* | 59 | /* |
| 90 | * Publish the known good head. Rely on the full barrier implied | 60 | * Since the mmap() consumer (userspace) can run on a different CPU: |
| 91 | * by atomic_dec_and_test() order the rb->head read and this | 61 | * |
| 92 | * write. | 62 | * kernel user |
| 63 | * | ||
| 64 | * READ ->data_tail READ ->data_head | ||
| 65 | * smp_mb() (A) smp_rmb() (C) | ||
| 66 | * WRITE $data READ $data | ||
| 67 | * smp_wmb() (B) smp_mb() (D) | ||
| 68 | * STORE ->data_head WRITE ->data_tail | ||
| 69 | * | ||
| 70 | * Where A pairs with D, and B pairs with C. | ||
| 71 | * | ||
| 72 | * I don't think A needs to be a full barrier because we won't in fact | ||
| 73 | * write data until we see the store from userspace. So we simply don't | ||
| 74 | * issue the data WRITE until we observe it. Be conservative for now. | ||
| 75 | * | ||
| 76 | * OTOH, D needs to be a full barrier since it separates the data READ | ||
| 77 | * from the tail WRITE. | ||
| 78 | * | ||
| 79 | * For B a WMB is sufficient since it separates two WRITEs, and for C | ||
| 80 | * an RMB is sufficient since it separates two READs. | ||
| 81 | * | ||
| 82 | * See perf_output_begin(). | ||
| 93 | */ | 83 | */ |
| 84 | smp_wmb(); | ||
| 94 | rb->user_page->data_head = head; | 85 | rb->user_page->data_head = head; |
| 95 | 86 | ||
| 96 | /* | 87 | /* |
| 97 | * Now check if we missed an update, rely on the (compiler) | 88 | * Now check if we missed an update -- rely on previous implied |
| 98 | * barrier in atomic_dec_and_test() to re-read rb->head. | 89 | * compiler barriers to force a re-read. |
| 99 | */ | 90 | */ |
| 100 | if (unlikely(head != local_read(&rb->head))) { | 91 | if (unlikely(head != local_read(&rb->head))) { |
| 101 | local_inc(&rb->nest); | 92 | local_inc(&rb->nest); |
| @@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 114 | { | 105 | { |
| 115 | struct ring_buffer *rb; | 106 | struct ring_buffer *rb; |
| 116 | unsigned long tail, offset, head; | 107 | unsigned long tail, offset, head; |
| 117 | int have_lost; | 108 | int have_lost, page_shift; |
| 118 | struct perf_sample_data sample_data; | ||
| 119 | struct { | 109 | struct { |
| 120 | struct perf_event_header header; | 110 | struct perf_event_header header; |
| 121 | u64 id; | 111 | u64 id; |
| @@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 130 | event = event->parent; | 120 | event = event->parent; |
| 131 | 121 | ||
| 132 | rb = rcu_dereference(event->rb); | 122 | rb = rcu_dereference(event->rb); |
| 133 | if (!rb) | 123 | if (unlikely(!rb)) |
| 134 | goto out; | 124 | goto out; |
| 135 | 125 | ||
| 136 | handle->rb = rb; | 126 | if (unlikely(!rb->nr_pages)) |
| 137 | handle->event = event; | ||
| 138 | |||
| 139 | if (!rb->nr_pages) | ||
| 140 | goto out; | 127 | goto out; |
| 141 | 128 | ||
| 129 | handle->rb = rb; | ||
| 130 | handle->event = event; | ||
| 131 | |||
| 142 | have_lost = local_read(&rb->lost); | 132 | have_lost = local_read(&rb->lost); |
| 143 | if (have_lost) { | 133 | if (unlikely(have_lost)) { |
| 144 | lost_event.header.size = sizeof(lost_event); | 134 | size += sizeof(lost_event); |
| 145 | perf_event_header__init_id(&lost_event.header, &sample_data, | 135 | if (event->attr.sample_id_all) |
| 146 | event); | 136 | size += event->id_header_size; |
| 147 | size += lost_event.header.size; | ||
| 148 | } | 137 | } |
| 149 | 138 | ||
| 150 | perf_output_get_handle(handle); | 139 | perf_output_get_handle(handle); |
| 151 | 140 | ||
| 152 | do { | 141 | do { |
| 153 | /* | ||
| 154 | * Userspace could choose to issue a mb() before updating the | ||
| 155 | * tail pointer. So that all reads will be completed before the | ||
| 156 | * write is issued. | ||
| 157 | */ | ||
| 158 | tail = ACCESS_ONCE(rb->user_page->data_tail); | 142 | tail = ACCESS_ONCE(rb->user_page->data_tail); |
| 159 | smp_rmb(); | ||
| 160 | offset = head = local_read(&rb->head); | 143 | offset = head = local_read(&rb->head); |
| 161 | head += size; | 144 | if (!rb->overwrite && |
| 162 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | 145 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) |
| 163 | goto fail; | 146 | goto fail; |
| 147 | head += size; | ||
| 164 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | 148 | } while (local_cmpxchg(&rb->head, offset, head) != offset); |
| 165 | 149 | ||
| 166 | if (head - local_read(&rb->wakeup) > rb->watermark) | 150 | /* |
| 151 | * Separate the userpage->tail read from the data stores below. | ||
| 152 | * Matches the MB userspace SHOULD issue after reading the data | ||
| 153 | * and before storing the new tail position. | ||
| 154 | * | ||
| 155 | * See perf_output_put_handle(). | ||
| 156 | */ | ||
| 157 | smp_mb(); | ||
| 158 | |||
| 159 | if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) | ||
| 167 | local_add(rb->watermark, &rb->wakeup); | 160 | local_add(rb->watermark, &rb->wakeup); |
| 168 | 161 | ||
| 169 | handle->page = offset >> (PAGE_SHIFT + page_order(rb)); | 162 | page_shift = PAGE_SHIFT + page_order(rb); |
| 170 | handle->page &= rb->nr_pages - 1; | ||
| 171 | handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); | ||
| 172 | handle->addr = rb->data_pages[handle->page]; | ||
| 173 | handle->addr += handle->size; | ||
| 174 | handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; | ||
| 175 | 163 | ||
| 176 | if (have_lost) { | 164 | handle->page = (offset >> page_shift) & (rb->nr_pages - 1); |
| 165 | offset &= (1UL << page_shift) - 1; | ||
| 166 | handle->addr = rb->data_pages[handle->page] + offset; | ||
| 167 | handle->size = (1UL << page_shift) - offset; | ||
| 168 | |||
| 169 | if (unlikely(have_lost)) { | ||
| 170 | struct perf_sample_data sample_data; | ||
| 171 | |||
| 172 | lost_event.header.size = sizeof(lost_event); | ||
| 177 | lost_event.header.type = PERF_RECORD_LOST; | 173 | lost_event.header.type = PERF_RECORD_LOST; |
| 178 | lost_event.header.misc = 0; | 174 | lost_event.header.misc = 0; |
| 179 | lost_event.id = event->id; | 175 | lost_event.id = event->id; |
| 180 | lost_event.lost = local_xchg(&rb->lost, 0); | 176 | lost_event.lost = local_xchg(&rb->lost, 0); |
| 181 | 177 | ||
| 178 | perf_event_header__init_id(&lost_event.header, | ||
| 179 | &sample_data, event); | ||
| 182 | perf_output_put(handle, lost_event); | 180 | perf_output_put(handle, lost_event); |
| 183 | perf_event__output_id_sample(event, handle, &sample_data); | 181 | perf_event__output_id_sample(event, handle, &sample_data); |
| 184 | } | 182 | } |
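The ring_buffer.c rework drops the open-coded perf_output_space() in favour of CIRC_SPACE() from <linux/circ_buf.h>, applied to the free-running head and tail counters, and documents the data_head/data_tail ordering rules. The circular-buffer arithmetic itself reduces to the sketch below; the two macros are reimplemented here only so the demo builds in userspace, and the buffer size must be a power of two:

#include <stdio.h>

/* Same arithmetic as CIRC_CNT()/CIRC_SPACE() in <linux/circ_buf.h>. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
	unsigned long size = 16;        /* data area in bytes, power of two */
	unsigned long head = 0, tail = 0;
	unsigned long record = 12;      /* size of the record we want to write */

	/* Producer: refuse the record if it does not fit in the free space. */
	if (CIRC_SPACE(head, tail, size) >= record) {
		head += record;         /* head and tail are free-running */
		printf("wrote %lu bytes, %lu free\n",
		       record, CIRC_SPACE(head, tail, size));
	}

	/* Consumer catches up and publishes the new tail. */
	tail = head;
	printf("after consume, %lu free\n", CIRC_SPACE(head, tail, size));
	return 0;
}

CIRC_SPACE() leaves one byte unused so that a completely full buffer is distinguishable from an empty one (head == tail means empty), which is why only 3 bytes rather than 4 remain free after the 12-byte write in the demo.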
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70e..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/kdebug.h> /* notifier mechanism */ | 35 | #include <linux/kdebug.h> /* notifier mechanism */ |
| 36 | #include "../../mm/internal.h" /* munlock_vma_page */ | 36 | #include "../../mm/internal.h" /* munlock_vma_page */ |
| 37 | #include <linux/percpu-rwsem.h> | 37 | #include <linux/percpu-rwsem.h> |
| 38 | #include <linux/task_work.h> | ||
| 38 | 39 | ||
| 39 | #include <linux/uprobes.h> | 40 | #include <linux/uprobes.h> |
| 40 | 41 | ||
| @@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
| 244 | * the architecture. If an arch has variable length instruction and the | 245 | * the architecture. If an arch has variable length instruction and the |
| 245 | * breakpoint instruction is not of the smallest length instruction | 246 | * breakpoint instruction is not of the smallest length instruction |
| 246 | * supported by that architecture then we need to modify is_trap_at_addr and | 247 | * supported by that architecture then we need to modify is_trap_at_addr and |
| 247 | * write_opcode accordingly. This would never be a problem for archs that | 248 | * uprobe_write_opcode accordingly. This would never be a problem for archs |
| 248 | * have fixed length instructions. | 249 | * that have fixed length instructions. |
| 249 | */ | 250 | */ |
| 250 | 251 | ||
| 251 | /* | 252 | /* |
| 252 | * write_opcode - write the opcode at a given virtual address. | 253 | * uprobe_write_opcode - write the opcode at a given virtual address. |
| 253 | * @mm: the probed process address space. | 254 | * @mm: the probed process address space. |
| 254 | * @vaddr: the virtual address to store the opcode. | 255 | * @vaddr: the virtual address to store the opcode. |
| 255 | * @opcode: opcode to be written at @vaddr. | 256 | * @opcode: opcode to be written at @vaddr. |
| @@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
| 260 | * For mm @mm, write the opcode at @vaddr. | 261 | * For mm @mm, write the opcode at @vaddr. |
| 261 | * Return 0 (success) or a negative errno. | 262 | * Return 0 (success) or a negative errno. |
| 262 | */ | 263 | */ |
| 263 | static int write_opcode(struct mm_struct *mm, unsigned long vaddr, | 264 | int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, |
| 264 | uprobe_opcode_t opcode) | 265 | uprobe_opcode_t opcode) |
| 265 | { | 266 | { |
| 266 | struct page *old_page, *new_page; | 267 | struct page *old_page, *new_page; |
| @@ -314,7 +315,7 @@ put_old: | |||
| 314 | */ | 315 | */ |
| 315 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 316 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 316 | { | 317 | { |
| 317 | return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); | 318 | return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); |
| 318 | } | 319 | } |
| 319 | 320 | ||
| 320 | /** | 321 | /** |
| @@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
| 329 | int __weak | 330 | int __weak |
| 330 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 331 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 331 | { | 332 | { |
| 332 | return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | 333 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); |
| 333 | } | 334 | } |
| 334 | 335 | ||
| 335 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | 336 | static int match_uprobe(struct uprobe *l, struct uprobe *r) |
| @@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | |||
| 503 | return ret; | 504 | return ret; |
| 504 | } | 505 | } |
| 505 | 506 | ||
| 506 | static int | 507 | static int __copy_insn(struct address_space *mapping, struct file *filp, |
| 507 | __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | 508 | void *insn, int nbytes, loff_t offset) |
| 508 | unsigned long nbytes, loff_t offset) | ||
| 509 | { | 509 | { |
| 510 | struct page *page; | 510 | struct page *page; |
| 511 | 511 | ||
| @@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | |||
| 527 | 527 | ||
| 528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) | 528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) |
| 529 | { | 529 | { |
| 530 | struct address_space *mapping; | 530 | struct address_space *mapping = uprobe->inode->i_mapping; |
| 531 | unsigned long nbytes; | 531 | loff_t offs = uprobe->offset; |
| 532 | int bytes; | 532 | void *insn = uprobe->arch.insn; |
| 533 | 533 | int size = MAX_UINSN_BYTES; | |
| 534 | nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); | 534 | int len, err = -EIO; |
| 535 | mapping = uprobe->inode->i_mapping; | ||
| 536 | 535 | ||
| 537 | /* Instruction at end of binary; copy only available bytes */ | 536 | /* Copy only available bytes, -EIO if nothing was read */ |
| 538 | if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) | 537 | do { |
| 539 | bytes = uprobe->inode->i_size - uprobe->offset; | 538 | if (offs >= i_size_read(uprobe->inode)) |
| 540 | else | 539 | break; |
| 541 | bytes = MAX_UINSN_BYTES; | ||
| 542 | 540 | ||
| 543 | /* Instruction at the page-boundary; copy bytes in second page */ | 541 | len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); |
| 544 | if (nbytes < bytes) { | 542 | err = __copy_insn(mapping, filp, insn, len, offs); |
| 545 | int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, | ||
| 546 | bytes - nbytes, uprobe->offset + nbytes); | ||
| 547 | if (err) | 543 | if (err) |
| 548 | return err; | 544 | break; |
| 549 | bytes = nbytes; | 545 | |
| 550 | } | 546 | insn += len; |
| 551 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); | 547 | offs += len; |
| 548 | size -= len; | ||
| 549 | } while (size); | ||
| 550 | |||
| 551 | return err; | ||
| 552 | } | 552 | } |
| 553 | 553 | ||
| 554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | 554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, |
| @@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
| 576 | if (ret) | 576 | if (ret) |
| 577 | goto out; | 577 | goto out; |
| 578 | 578 | ||
| 579 | /* write_opcode() assumes we don't cross page boundary */ | 579 | /* uprobe_write_opcode() assumes we don't cross page boundary */ |
| 580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | 580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + |
| 581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | 581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); |
| 582 | 582 | ||
| @@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
| 1096 | } | 1096 | } |
| 1097 | 1097 | ||
| 1098 | /* Slot allocation for XOL */ | 1098 | /* Slot allocation for XOL */ |
| 1099 | static int xol_add_vma(struct xol_area *area) | 1099 | static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) |
| 1100 | { | 1100 | { |
| 1101 | struct mm_struct *mm = current->mm; | ||
| 1102 | int ret = -EALREADY; | 1101 | int ret = -EALREADY; |
| 1103 | 1102 | ||
| 1104 | down_write(&mm->mmap_sem); | 1103 | down_write(&mm->mmap_sem); |
| 1105 | if (mm->uprobes_state.xol_area) | 1104 | if (mm->uprobes_state.xol_area) |
| 1106 | goto fail; | 1105 | goto fail; |
| 1107 | 1106 | ||
| 1108 | ret = -ENOMEM; | 1107 | if (!area->vaddr) { |
| 1109 | /* Try to map as high as possible, this is only a hint. */ | 1108 | /* Try to map as high as possible, this is only a hint. */ |
| 1110 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | 1109 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, |
| 1111 | if (area->vaddr & ~PAGE_MASK) { | 1110 | PAGE_SIZE, 0, 0); |
| 1112 | ret = area->vaddr; | 1111 | if (area->vaddr & ~PAGE_MASK) { |
| 1113 | goto fail; | 1112 | ret = area->vaddr; |
| 1113 | goto fail; | ||
| 1114 | } | ||
| 1114 | } | 1115 | } |
| 1115 | 1116 | ||
| 1116 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | 1117 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, |
| @@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area) | |||
| 1120 | 1121 | ||
| 1121 | smp_wmb(); /* pairs with get_xol_area() */ | 1122 | smp_wmb(); /* pairs with get_xol_area() */ |
| 1122 | mm->uprobes_state.xol_area = area; | 1123 | mm->uprobes_state.xol_area = area; |
| 1123 | ret = 0; | ||
| 1124 | fail: | 1124 | fail: |
| 1125 | up_write(&mm->mmap_sem); | 1125 | up_write(&mm->mmap_sem); |
| 1126 | 1126 | ||
| 1127 | return ret; | 1127 | return ret; |
| 1128 | } | 1128 | } |
| 1129 | 1129 | ||
| 1130 | /* | 1130 | static struct xol_area *__create_xol_area(unsigned long vaddr) |
| 1131 | * get_xol_area - Allocate process's xol_area if necessary. | ||
| 1132 | * This area will be used for storing instructions for execution out of line. | ||
| 1133 | * | ||
| 1134 | * Returns the allocated area or NULL. | ||
| 1135 | */ | ||
| 1136 | static struct xol_area *get_xol_area(void) | ||
| 1137 | { | 1131 | { |
| 1138 | struct mm_struct *mm = current->mm; | 1132 | struct mm_struct *mm = current->mm; |
| 1139 | struct xol_area *area; | ||
| 1140 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; | 1133 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; |
| 1134 | struct xol_area *area; | ||
| 1141 | 1135 | ||
| 1142 | area = mm->uprobes_state.xol_area; | 1136 | area = kmalloc(sizeof(*area), GFP_KERNEL); |
| 1143 | if (area) | ||
| 1144 | goto ret; | ||
| 1145 | |||
| 1146 | area = kzalloc(sizeof(*area), GFP_KERNEL); | ||
| 1147 | if (unlikely(!area)) | 1137 | if (unlikely(!area)) |
| 1148 | goto out; | 1138 | goto out; |
| 1149 | 1139 | ||
| @@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void) | |||
| 1155 | if (!area->page) | 1145 | if (!area->page) |
| 1156 | goto free_bitmap; | 1146 | goto free_bitmap; |
| 1157 | 1147 | ||
| 1158 | /* allocate first slot of task's xol_area for the return probes */ | 1148 | area->vaddr = vaddr; |
| 1149 | init_waitqueue_head(&area->wq); | ||
| 1150 | /* Reserve the 1st slot for get_trampoline_vaddr() */ | ||
| 1159 | set_bit(0, area->bitmap); | 1151 | set_bit(0, area->bitmap); |
| 1160 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); | ||
| 1161 | atomic_set(&area->slot_count, 1); | 1152 | atomic_set(&area->slot_count, 1); |
| 1162 | init_waitqueue_head(&area->wq); | 1153 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); |
| 1163 | 1154 | ||
| 1164 | if (!xol_add_vma(area)) | 1155 | if (!xol_add_vma(mm, area)) |
| 1165 | return area; | 1156 | return area; |
| 1166 | 1157 | ||
| 1167 | __free_page(area->page); | 1158 | __free_page(area->page); |
| @@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void) | |||
| 1170 | free_area: | 1161 | free_area: |
| 1171 | kfree(area); | 1162 | kfree(area); |
| 1172 | out: | 1163 | out: |
| 1164 | return NULL; | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | /* | ||
| 1168 | * get_xol_area - Allocate process's xol_area if necessary. | ||
| 1169 | * This area will be used for storing instructions for execution out of line. | ||
| 1170 | * | ||
| 1171 | * Returns the allocated area or NULL. | ||
| 1172 | */ | ||
| 1173 | static struct xol_area *get_xol_area(void) | ||
| 1174 | { | ||
| 1175 | struct mm_struct *mm = current->mm; | ||
| 1176 | struct xol_area *area; | ||
| 1177 | |||
| 1178 | if (!mm->uprobes_state.xol_area) | ||
| 1179 | __create_xol_area(0); | ||
| 1180 | |||
| 1173 | area = mm->uprobes_state.xol_area; | 1181 | area = mm->uprobes_state.xol_area; |
| 1174 | ret: | 1182 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ |
| 1175 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
| 1176 | return area; | 1183 | return area; |
| 1177 | } | 1184 | } |
| 1178 | 1185 | ||
| @@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
| 1256 | return 0; | 1263 | return 0; |
| 1257 | 1264 | ||
| 1258 | /* Initialize the slot */ | 1265 | /* Initialize the slot */ |
| 1259 | copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); | 1266 | copy_to_page(area->page, xol_vaddr, |
| 1267 | uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | ||
| 1260 | /* | 1268 | /* |
| 1261 | * We probably need flush_icache_user_range() but it needs vma. | 1269 | * We probably need flush_icache_user_range() but it needs vma. |
| 1262 | * This should work on supported architectures too. | 1270 | * This should work on supported architectures too. |
| @@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
| 1345 | } | 1353 | } |
| 1346 | 1354 | ||
| 1347 | /* | 1355 | /* |
| 1348 | * Called in context of a new clone/fork from copy_process. | ||
| 1349 | */ | ||
| 1350 | void uprobe_copy_process(struct task_struct *t) | ||
| 1351 | { | ||
| 1352 | t->utask = NULL; | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | /* | ||
| 1356 | * Allocate a uprobe_task object for the task if if necessary. | 1356 | * Allocate a uprobe_task object for the task if if necessary. |
| 1357 | * Called when the thread hits a breakpoint. | 1357 | * Called when the thread hits a breakpoint. |
| 1358 | * | 1358 | * |
| @@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void) | |||
| 1367 | return current->utask; | 1367 | return current->utask; |
| 1368 | } | 1368 | } |
| 1369 | 1369 | ||
| 1370 | static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) | ||
| 1371 | { | ||
| 1372 | struct uprobe_task *n_utask; | ||
| 1373 | struct return_instance **p, *o, *n; | ||
| 1374 | |||
| 1375 | n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); | ||
| 1376 | if (!n_utask) | ||
| 1377 | return -ENOMEM; | ||
| 1378 | t->utask = n_utask; | ||
| 1379 | |||
| 1380 | p = &n_utask->return_instances; | ||
| 1381 | for (o = o_utask->return_instances; o; o = o->next) { | ||
| 1382 | n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); | ||
| 1383 | if (!n) | ||
| 1384 | return -ENOMEM; | ||
| 1385 | |||
| 1386 | *n = *o; | ||
| 1387 | atomic_inc(&n->uprobe->ref); | ||
| 1388 | n->next = NULL; | ||
| 1389 | |||
| 1390 | *p = n; | ||
| 1391 | p = &n->next; | ||
| 1392 | n_utask->depth++; | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | return 0; | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | static void uprobe_warn(struct task_struct *t, const char *msg) | ||
| 1399 | { | ||
| 1400 | pr_warn("uprobe: %s:%d failed to %s\n", | ||
| 1401 | current->comm, current->pid, msg); | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | static void dup_xol_work(struct callback_head *work) | ||
| 1405 | { | ||
| 1406 | kfree(work); | ||
| 1407 | |||
| 1408 | if (current->flags & PF_EXITING) | ||
| 1409 | return; | ||
| 1410 | |||
| 1411 | if (!__create_xol_area(current->utask->vaddr)) | ||
| 1412 | uprobe_warn(current, "dup xol area"); | ||
| 1413 | } | ||
| 1414 | |||
| 1415 | /* | ||
| 1416 | * Called in context of a new clone/fork from copy_process. | ||
| 1417 | */ | ||
| 1418 | void uprobe_copy_process(struct task_struct *t, unsigned long flags) | ||
| 1419 | { | ||
| 1420 | struct uprobe_task *utask = current->utask; | ||
| 1421 | struct mm_struct *mm = current->mm; | ||
| 1422 | struct callback_head *work; | ||
| 1423 | struct xol_area *area; | ||
| 1424 | |||
| 1425 | t->utask = NULL; | ||
| 1426 | |||
| 1427 | if (!utask || !utask->return_instances) | ||
| 1428 | return; | ||
| 1429 | |||
| 1430 | if (mm == t->mm && !(flags & CLONE_VFORK)) | ||
| 1431 | return; | ||
| 1432 | |||
| 1433 | if (dup_utask(t, utask)) | ||
| 1434 | return uprobe_warn(t, "dup ret instances"); | ||
| 1435 | |||
| 1436 | /* The task can fork() after dup_xol_work() fails */ | ||
| 1437 | area = mm->uprobes_state.xol_area; | ||
| 1438 | if (!area) | ||
| 1439 | return uprobe_warn(t, "dup xol area"); | ||
| 1440 | |||
| 1441 | if (mm == t->mm) | ||
| 1442 | return; | ||
| 1443 | |||
| 1444 | /* TODO: move it into the union in uprobe_task */ | ||
| 1445 | work = kmalloc(sizeof(*work), GFP_KERNEL); | ||
| 1446 | if (!work) | ||
| 1447 | return uprobe_warn(t, "dup xol area"); | ||
| 1448 | |||
| 1449 | t->utask->vaddr = area->vaddr; | ||
| 1450 | init_task_work(work, dup_xol_work); | ||
| 1451 | task_work_add(t, work, true); | ||
| 1452 | } | ||
| 1453 | |||
| 1370 | /* | 1454 | /* |
| 1371 | * Current area->vaddr notion assume the trampoline address is always | 1455 | * Current area->vaddr notion assume the trampoline address is always |
| 1372 | * equal area->vaddr. | 1456 | * equal area->vaddr. |
| @@ -1857,9 +1941,4 @@ static int __init init_uprobes(void) | |||
| 1857 | 1941 | ||
| 1858 | return register_die_notifier(&uprobe_exception_nb); | 1942 | return register_die_notifier(&uprobe_exception_nb); |
| 1859 | } | 1943 | } |
| 1860 | module_init(init_uprobes); | 1944 | __initcall(init_uprobes); |
| 1861 | |||
| 1862 | static void __exit exit_uprobes(void) | ||
| 1863 | { | ||
| 1864 | } | ||
| 1865 | module_exit(exit_uprobes); | ||
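In the uprobes.c part, dup_utask() copies the parent's return_instance chain in order by walking a pointer to the tail link of the new list, while the XOL area for a child with its own mm is created later, in the child's own context, via task_work_add(). The list-duplication shape, reduced to a standalone userspace sketch with a hypothetical node type:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/*
 * Duplicate a singly linked list, preserving order, by keeping a pointer
 * to the tail link of the new list -- the same shape as dup_utask()
 * walking the return_instances chain.
 */
static struct node *dup_list(const struct node *old)
{
	struct node *head = NULL;
	struct node **p = &head;
	const struct node *o;

	for (o = old; o; o = o->next) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			break;          /* partial copy on allocation failure */
		n->val = o->val;
		n->next = NULL;
		*p = n;                 /* link at the current tail */
		p = &n->next;           /* advance the tail link */
	}
	return head;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *copy = dup_list(&a), *n;

	for (n = copy; n; n = n->next)
		printf("%d ", n->val);  /* prints: 1 2 3 */
	printf("\n");

	while (copy) {
		n = copy->next;
		free(copy);
		copy = n;
	}
	return 0;
}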
