diff options
Diffstat (limited to 'kernel')
126 files changed, 12447 insertions, 6117 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index ab4f1090f437..b3097bde4e9c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore | |||
@@ -4,3 +4,4 @@ | |||
4 | config_data.h | 4 | config_data.h |
5 | config_data.gz | 5 | config_data.gz |
6 | timeconst.h | 6 | timeconst.h |
7 | hz.bc | ||
diff --git a/kernel/Makefile b/kernel/Makefile index bbde5f1a4486..271fd3119af9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -24,6 +24,7 @@ endif | |||
24 | 24 | ||
25 | obj-y += sched/ | 25 | obj-y += sched/ |
26 | obj-y += power/ | 26 | obj-y += power/ |
27 | obj-y += cpu/ | ||
27 | 28 | ||
28 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o | 29 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o |
29 | obj-$(CONFIG_FREEZER) += freezer.o | 30 | obj-$(CONFIG_FREEZER) += freezer.o |
@@ -175,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey | |||
175 | openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ | 176 | openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ |
176 | -batch -x509 -config x509.genkey \ | 177 | -batch -x509 -config x509.genkey \ |
177 | -outform DER -out signing_key.x509 \ | 178 | -outform DER -out signing_key.x509 \ |
178 | -keyout signing_key.priv | 179 | -keyout signing_key.priv 2>&1 |
179 | @echo "###" | 180 | @echo "###" |
180 | @echo "### Key pair generated." | 181 | @echo "### Key pair generated." |
181 | @echo "###" | 182 | @echo "###" |
diff --git a/kernel/acct.c b/kernel/acct.c index b9bd7f098ee5..8d6e145138bb 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
540 | ac.ac_swaps = encode_comp_t(0); | 540 | ac.ac_swaps = encode_comp_t(0); |
541 | 541 | ||
542 | /* | 542 | /* |
543 | * Get freeze protection. If the fs is frozen, just skip the write | ||
544 | * as we could deadlock the system otherwise. | ||
545 | */ | ||
546 | if (!file_start_write_trylock(file)) | ||
547 | goto out; | ||
548 | /* | ||
543 | * Kernel segment override to datasegment and write it | 549 | * Kernel segment override to datasegment and write it |
544 | * to the accounting file. | 550 | * to the accounting file. |
545 | */ | 551 | */ |
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
554 | sizeof(acct_t), &file->f_pos); | 560 | sizeof(acct_t), &file->f_pos); |
555 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; | 561 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; |
556 | set_fs(fs); | 562 | set_fs(fs); |
563 | file_end_write(file); | ||
557 | out: | 564 | out: |
558 | revert_creds(orig_cred); | 565 | revert_creds(orig_cred); |
559 | } | 566 | } |
diff --git a/kernel/async.c b/kernel/async.c index 8ddee2c3e5b0..61f023ce0228 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
@@ -73,7 +73,7 @@ struct async_entry { | |||
73 | struct list_head global_list; | 73 | struct list_head global_list; |
74 | struct work_struct work; | 74 | struct work_struct work; |
75 | async_cookie_t cookie; | 75 | async_cookie_t cookie; |
76 | async_func_ptr *func; | 76 | async_func_t func; |
77 | void *data; | 77 | void *data; |
78 | struct async_domain *domain; | 78 | struct async_domain *domain; |
79 | }; | 79 | }; |
@@ -84,24 +84,20 @@ static atomic_t entry_count; | |||
84 | 84 | ||
85 | static async_cookie_t lowest_in_progress(struct async_domain *domain) | 85 | static async_cookie_t lowest_in_progress(struct async_domain *domain) |
86 | { | 86 | { |
87 | struct async_entry *first = NULL; | 87 | struct list_head *pending; |
88 | async_cookie_t ret = ASYNC_COOKIE_MAX; | 88 | async_cookie_t ret = ASYNC_COOKIE_MAX; |
89 | unsigned long flags; | 89 | unsigned long flags; |
90 | 90 | ||
91 | spin_lock_irqsave(&async_lock, flags); | 91 | spin_lock_irqsave(&async_lock, flags); |
92 | 92 | ||
93 | if (domain) { | 93 | if (domain) |
94 | if (!list_empty(&domain->pending)) | 94 | pending = &domain->pending; |
95 | first = list_first_entry(&domain->pending, | 95 | else |
96 | struct async_entry, domain_list); | 96 | pending = &async_global_pending; |
97 | } else { | ||
98 | if (!list_empty(&async_global_pending)) | ||
99 | first = list_first_entry(&async_global_pending, | ||
100 | struct async_entry, global_list); | ||
101 | } | ||
102 | 97 | ||
103 | if (first) | 98 | if (!list_empty(pending)) |
104 | ret = first->cookie; | 99 | ret = list_first_entry(pending, struct async_entry, |
100 | domain_list)->cookie; | ||
105 | 101 | ||
106 | spin_unlock_irqrestore(&async_lock, flags); | 102 | spin_unlock_irqrestore(&async_lock, flags); |
107 | return ret; | 103 | return ret; |
@@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
149 | wake_up(&async_done); | 145 | wake_up(&async_done); |
150 | } | 146 | } |
151 | 147 | ||
152 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) | 148 | static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) |
153 | { | 149 | { |
154 | struct async_entry *entry; | 150 | struct async_entry *entry; |
155 | unsigned long flags; | 151 | unsigned long flags; |
@@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a | |||
169 | spin_unlock_irqrestore(&async_lock, flags); | 165 | spin_unlock_irqrestore(&async_lock, flags); |
170 | 166 | ||
171 | /* low on memory.. run synchronously */ | 167 | /* low on memory.. run synchronously */ |
172 | ptr(data, newcookie); | 168 | func(data, newcookie); |
173 | return newcookie; | 169 | return newcookie; |
174 | } | 170 | } |
175 | INIT_LIST_HEAD(&entry->domain_list); | 171 | INIT_LIST_HEAD(&entry->domain_list); |
176 | INIT_LIST_HEAD(&entry->global_list); | 172 | INIT_LIST_HEAD(&entry->global_list); |
177 | INIT_WORK(&entry->work, async_run_entry_fn); | 173 | INIT_WORK(&entry->work, async_run_entry_fn); |
178 | entry->func = ptr; | 174 | entry->func = func; |
179 | entry->data = data; | 175 | entry->data = data; |
180 | entry->domain = domain; | 176 | entry->domain = domain; |
181 | 177 | ||
@@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a | |||
202 | 198 | ||
203 | /** | 199 | /** |
204 | * async_schedule - schedule a function for asynchronous execution | 200 | * async_schedule - schedule a function for asynchronous execution |
205 | * @ptr: function to execute asynchronously | 201 | * @func: function to execute asynchronously |
206 | * @data: data pointer to pass to the function | 202 | * @data: data pointer to pass to the function |
207 | * | 203 | * |
208 | * Returns an async_cookie_t that may be used for checkpointing later. | 204 | * Returns an async_cookie_t that may be used for checkpointing later. |
209 | * Note: This function may be called from atomic or non-atomic contexts. | 205 | * Note: This function may be called from atomic or non-atomic contexts. |
210 | */ | 206 | */ |
211 | async_cookie_t async_schedule(async_func_ptr *ptr, void *data) | 207 | async_cookie_t async_schedule(async_func_t func, void *data) |
212 | { | 208 | { |
213 | return __async_schedule(ptr, data, &async_dfl_domain); | 209 | return __async_schedule(func, data, &async_dfl_domain); |
214 | } | 210 | } |
215 | EXPORT_SYMBOL_GPL(async_schedule); | 211 | EXPORT_SYMBOL_GPL(async_schedule); |
216 | 212 | ||
217 | /** | 213 | /** |
218 | * async_schedule_domain - schedule a function for asynchronous execution within a certain domain | 214 | * async_schedule_domain - schedule a function for asynchronous execution within a certain domain |
219 | * @ptr: function to execute asynchronously | 215 | * @func: function to execute asynchronously |
220 | * @data: data pointer to pass to the function | 216 | * @data: data pointer to pass to the function |
221 | * @domain: the domain | 217 | * @domain: the domain |
222 | * | 218 | * |
@@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule); | |||
226 | * synchronization domain is specified via @domain. Note: This function | 222 | * synchronization domain is specified via @domain. Note: This function |
227 | * may be called from atomic or non-atomic contexts. | 223 | * may be called from atomic or non-atomic contexts. |
228 | */ | 224 | */ |
229 | async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, | 225 | async_cookie_t async_schedule_domain(async_func_t func, void *data, |
230 | struct async_domain *domain) | 226 | struct async_domain *domain) |
231 | { | 227 | { |
232 | return __async_schedule(ptr, data, domain); | 228 | return __async_schedule(func, data, domain); |
233 | } | 229 | } |
234 | EXPORT_SYMBOL_GPL(async_schedule_domain); | 230 | EXPORT_SYMBOL_GPL(async_schedule_domain); |
235 | 231 | ||
diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f15..21c7fa615bd3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -49,6 +49,8 @@ | |||
49 | #include <linux/slab.h> | 49 | #include <linux/slab.h> |
50 | #include <linux/err.h> | 50 | #include <linux/err.h> |
51 | #include <linux/kthread.h> | 51 | #include <linux/kthread.h> |
52 | #include <linux/kernel.h> | ||
53 | #include <linux/syscalls.h> | ||
52 | 54 | ||
53 | #include <linux/audit.h> | 55 | #include <linux/audit.h> |
54 | 56 | ||
@@ -58,7 +60,7 @@ | |||
58 | #ifdef CONFIG_SECURITY | 60 | #ifdef CONFIG_SECURITY |
59 | #include <linux/security.h> | 61 | #include <linux/security.h> |
60 | #endif | 62 | #endif |
61 | #include <linux/netlink.h> | 63 | #include <net/netlink.h> |
62 | #include <linux/freezer.h> | 64 | #include <linux/freezer.h> |
63 | #include <linux/tty.h> | 65 | #include <linux/tty.h> |
64 | #include <linux/pid_namespace.h> | 66 | #include <linux/pid_namespace.h> |
@@ -265,7 +267,6 @@ void audit_log_lost(const char *message) | |||
265 | } | 267 | } |
266 | 268 | ||
267 | static int audit_log_config_change(char *function_name, int new, int old, | 269 | static int audit_log_config_change(char *function_name, int new, int old, |
268 | kuid_t loginuid, u32 sessionid, u32 sid, | ||
269 | int allow_changes) | 270 | int allow_changes) |
270 | { | 271 | { |
271 | struct audit_buffer *ab; | 272 | struct audit_buffer *ab; |
@@ -274,29 +275,17 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
274 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 275 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
275 | if (unlikely(!ab)) | 276 | if (unlikely(!ab)) |
276 | return rc; | 277 | return rc; |
277 | audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, | 278 | audit_log_format(ab, "%s=%d old=%d", function_name, new, old); |
278 | old, from_kuid(&init_user_ns, loginuid), sessionid); | 279 | audit_log_session_info(ab); |
279 | if (sid) { | 280 | rc = audit_log_task_context(ab); |
280 | char *ctx = NULL; | 281 | if (rc) |
281 | u32 len; | 282 | allow_changes = 0; /* Something weird, deny request */ |
282 | |||
283 | rc = security_secid_to_secctx(sid, &ctx, &len); | ||
284 | if (rc) { | ||
285 | audit_log_format(ab, " sid=%u", sid); | ||
286 | allow_changes = 0; /* Something weird, deny request */ | ||
287 | } else { | ||
288 | audit_log_format(ab, " subj=%s", ctx); | ||
289 | security_release_secctx(ctx, len); | ||
290 | } | ||
291 | } | ||
292 | audit_log_format(ab, " res=%d", allow_changes); | 283 | audit_log_format(ab, " res=%d", allow_changes); |
293 | audit_log_end(ab); | 284 | audit_log_end(ab); |
294 | return rc; | 285 | return rc; |
295 | } | 286 | } |
296 | 287 | ||
297 | static int audit_do_config_change(char *function_name, int *to_change, | 288 | static int audit_do_config_change(char *function_name, int *to_change, int new) |
298 | int new, kuid_t loginuid, u32 sessionid, | ||
299 | u32 sid) | ||
300 | { | 289 | { |
301 | int allow_changes, rc = 0, old = *to_change; | 290 | int allow_changes, rc = 0, old = *to_change; |
302 | 291 | ||
@@ -307,8 +296,7 @@ static int audit_do_config_change(char *function_name, int *to_change, | |||
307 | allow_changes = 1; | 296 | allow_changes = 1; |
308 | 297 | ||
309 | if (audit_enabled != AUDIT_OFF) { | 298 | if (audit_enabled != AUDIT_OFF) { |
310 | rc = audit_log_config_change(function_name, new, old, loginuid, | 299 | rc = audit_log_config_change(function_name, new, old, allow_changes); |
311 | sessionid, sid, allow_changes); | ||
312 | if (rc) | 300 | if (rc) |
313 | allow_changes = 0; | 301 | allow_changes = 0; |
314 | } | 302 | } |
@@ -322,44 +310,37 @@ static int audit_do_config_change(char *function_name, int *to_change, | |||
322 | return rc; | 310 | return rc; |
323 | } | 311 | } |
324 | 312 | ||
325 | static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, | 313 | static int audit_set_rate_limit(int limit) |
326 | u32 sid) | ||
327 | { | 314 | { |
328 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, | 315 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); |
329 | limit, loginuid, sessionid, sid); | ||
330 | } | 316 | } |
331 | 317 | ||
332 | static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, | 318 | static int audit_set_backlog_limit(int limit) |
333 | u32 sid) | ||
334 | { | 319 | { |
335 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, | 320 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); |
336 | limit, loginuid, sessionid, sid); | ||
337 | } | 321 | } |
338 | 322 | ||
339 | static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) | 323 | static int audit_set_enabled(int state) |
340 | { | 324 | { |
341 | int rc; | 325 | int rc; |
342 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) | 326 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) |
343 | return -EINVAL; | 327 | return -EINVAL; |
344 | 328 | ||
345 | rc = audit_do_config_change("audit_enabled", &audit_enabled, state, | 329 | rc = audit_do_config_change("audit_enabled", &audit_enabled, state); |
346 | loginuid, sessionid, sid); | ||
347 | |||
348 | if (!rc) | 330 | if (!rc) |
349 | audit_ever_enabled |= !!state; | 331 | audit_ever_enabled |= !!state; |
350 | 332 | ||
351 | return rc; | 333 | return rc; |
352 | } | 334 | } |
353 | 335 | ||
354 | static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) | 336 | static int audit_set_failure(int state) |
355 | { | 337 | { |
356 | if (state != AUDIT_FAIL_SILENT | 338 | if (state != AUDIT_FAIL_SILENT |
357 | && state != AUDIT_FAIL_PRINTK | 339 | && state != AUDIT_FAIL_PRINTK |
358 | && state != AUDIT_FAIL_PANIC) | 340 | && state != AUDIT_FAIL_PANIC) |
359 | return -EINVAL; | 341 | return -EINVAL; |
360 | 342 | ||
361 | return audit_do_config_change("audit_failure", &audit_failure, state, | 343 | return audit_do_config_change("audit_failure", &audit_failure, state); |
362 | loginuid, sessionid, sid); | ||
363 | } | 344 | } |
364 | 345 | ||
365 | /* | 346 | /* |
@@ -417,34 +398,53 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
417 | consume_skb(skb); | 398 | consume_skb(skb); |
418 | } | 399 | } |
419 | 400 | ||
420 | static int kauditd_thread(void *dummy) | 401 | /* |
402 | * flush_hold_queue - empty the hold queue if auditd appears | ||
403 | * | ||
404 | * If auditd just started, drain the queue of messages already | ||
405 | * sent to syslog/printk. Remember loss here is ok. We already | ||
406 | * called audit_log_lost() if it didn't go out normally. so the | ||
407 | * race between the skb_dequeue and the next check for audit_pid | ||
408 | * doesn't matter. | ||
409 | * | ||
410 | * If you ever find kauditd to be too slow we can get a perf win | ||
411 | * by doing our own locking and keeping better track if there | ||
412 | * are messages in this queue. I don't see the need now, but | ||
413 | * in 5 years when I want to play with this again I'll see this | ||
414 | * note and still have no friggin idea what i'm thinking today. | ||
415 | */ | ||
416 | static void flush_hold_queue(void) | ||
421 | { | 417 | { |
422 | struct sk_buff *skb; | 418 | struct sk_buff *skb; |
423 | 419 | ||
420 | if (!audit_default || !audit_pid) | ||
421 | return; | ||
422 | |||
423 | skb = skb_dequeue(&audit_skb_hold_queue); | ||
424 | if (likely(!skb)) | ||
425 | return; | ||
426 | |||
427 | while (skb && audit_pid) { | ||
428 | kauditd_send_skb(skb); | ||
429 | skb = skb_dequeue(&audit_skb_hold_queue); | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * if auditd just disappeared but we | ||
434 | * dequeued an skb we need to drop ref | ||
435 | */ | ||
436 | if (skb) | ||
437 | consume_skb(skb); | ||
438 | } | ||
439 | |||
440 | static int kauditd_thread(void *dummy) | ||
441 | { | ||
424 | set_freezable(); | 442 | set_freezable(); |
425 | while (!kthread_should_stop()) { | 443 | while (!kthread_should_stop()) { |
426 | /* | 444 | struct sk_buff *skb; |
427 | * if auditd just started drain the queue of messages already | 445 | DECLARE_WAITQUEUE(wait, current); |
428 | * sent to syslog/printk. remember loss here is ok. we already | 446 | |
429 | * called audit_log_lost() if it didn't go out normally. so the | 447 | flush_hold_queue(); |
430 | * race between the skb_dequeue and the next check for audit_pid | ||
431 | * doesn't matter. | ||
432 | * | ||
433 | * if you ever find kauditd to be too slow we can get a perf win | ||
434 | * by doing our own locking and keeping better track if there | ||
435 | * are messages in this queue. I don't see the need now, but | ||
436 | * in 5 years when I want to play with this again I'll see this | ||
437 | * note and still have no friggin idea what i'm thinking today. | ||
438 | */ | ||
439 | if (audit_default && audit_pid) { | ||
440 | skb = skb_dequeue(&audit_skb_hold_queue); | ||
441 | if (unlikely(skb)) { | ||
442 | while (skb && audit_pid) { | ||
443 | kauditd_send_skb(skb); | ||
444 | skb = skb_dequeue(&audit_skb_hold_queue); | ||
445 | } | ||
446 | } | ||
447 | } | ||
448 | 448 | ||
449 | skb = skb_dequeue(&audit_skb_queue); | 449 | skb = skb_dequeue(&audit_skb_queue); |
450 | wake_up(&audit_backlog_wait); | 450 | wake_up(&audit_backlog_wait); |
@@ -453,19 +453,18 @@ static int kauditd_thread(void *dummy) | |||
453 | kauditd_send_skb(skb); | 453 | kauditd_send_skb(skb); |
454 | else | 454 | else |
455 | audit_printk_skb(skb); | 455 | audit_printk_skb(skb); |
456 | } else { | 456 | continue; |
457 | DECLARE_WAITQUEUE(wait, current); | 457 | } |
458 | set_current_state(TASK_INTERRUPTIBLE); | 458 | set_current_state(TASK_INTERRUPTIBLE); |
459 | add_wait_queue(&kauditd_wait, &wait); | 459 | add_wait_queue(&kauditd_wait, &wait); |
460 | |||
461 | if (!skb_queue_len(&audit_skb_queue)) { | ||
462 | try_to_freeze(); | ||
463 | schedule(); | ||
464 | } | ||
465 | 460 | ||
466 | __set_current_state(TASK_RUNNING); | 461 | if (!skb_queue_len(&audit_skb_queue)) { |
467 | remove_wait_queue(&kauditd_wait, &wait); | 462 | try_to_freeze(); |
463 | schedule(); | ||
468 | } | 464 | } |
465 | |||
466 | __set_current_state(TASK_RUNNING); | ||
467 | remove_wait_queue(&kauditd_wait, &wait); | ||
469 | } | 468 | } |
470 | return 0; | 469 | return 0; |
471 | } | 470 | } |
@@ -579,13 +578,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
579 | return -EPERM; | 578 | return -EPERM; |
580 | 579 | ||
581 | switch (msg_type) { | 580 | switch (msg_type) { |
582 | case AUDIT_GET: | ||
583 | case AUDIT_LIST: | 581 | case AUDIT_LIST: |
584 | case AUDIT_LIST_RULES: | ||
585 | case AUDIT_SET: | ||
586 | case AUDIT_ADD: | 582 | case AUDIT_ADD: |
587 | case AUDIT_ADD_RULE: | ||
588 | case AUDIT_DEL: | 583 | case AUDIT_DEL: |
584 | return -EOPNOTSUPP; | ||
585 | case AUDIT_GET: | ||
586 | case AUDIT_SET: | ||
587 | case AUDIT_LIST_RULES: | ||
588 | case AUDIT_ADD_RULE: | ||
589 | case AUDIT_DEL_RULE: | 589 | case AUDIT_DEL_RULE: |
590 | case AUDIT_SIGNAL_INFO: | 590 | case AUDIT_SIGNAL_INFO: |
591 | case AUDIT_TTY_GET: | 591 | case AUDIT_TTY_GET: |
@@ -608,12 +608,10 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
608 | return err; | 608 | return err; |
609 | } | 609 | } |
610 | 610 | ||
611 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | 611 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) |
612 | kuid_t auid, u32 ses, u32 sid) | ||
613 | { | 612 | { |
614 | int rc = 0; | 613 | int rc = 0; |
615 | char *ctx = NULL; | 614 | uid_t uid = from_kuid(&init_user_ns, current_uid()); |
616 | u32 len; | ||
617 | 615 | ||
618 | if (!audit_enabled) { | 616 | if (!audit_enabled) { |
619 | *ab = NULL; | 617 | *ab = NULL; |
@@ -623,33 +621,21 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
623 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 621 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
624 | if (unlikely(!*ab)) | 622 | if (unlikely(!*ab)) |
625 | return rc; | 623 | return rc; |
626 | audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", | 624 | audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); |
627 | task_tgid_vnr(current), | 625 | audit_log_session_info(*ab); |
628 | from_kuid(&init_user_ns, current_uid()), | 626 | audit_log_task_context(*ab); |
629 | from_kuid(&init_user_ns, auid), ses); | ||
630 | if (sid) { | ||
631 | rc = security_secid_to_secctx(sid, &ctx, &len); | ||
632 | if (rc) | ||
633 | audit_log_format(*ab, " ssid=%u", sid); | ||
634 | else { | ||
635 | audit_log_format(*ab, " subj=%s", ctx); | ||
636 | security_release_secctx(ctx, len); | ||
637 | } | ||
638 | } | ||
639 | 627 | ||
640 | return rc; | 628 | return rc; |
641 | } | 629 | } |
642 | 630 | ||
643 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 631 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
644 | { | 632 | { |
645 | u32 seq, sid; | 633 | u32 seq; |
646 | void *data; | 634 | void *data; |
647 | struct audit_status *status_get, status_set; | 635 | struct audit_status *status_get, status_set; |
648 | int err; | 636 | int err; |
649 | struct audit_buffer *ab; | 637 | struct audit_buffer *ab; |
650 | u16 msg_type = nlh->nlmsg_type; | 638 | u16 msg_type = nlh->nlmsg_type; |
651 | kuid_t loginuid; /* loginuid of sender */ | ||
652 | u32 sessionid; | ||
653 | struct audit_sig_info *sig_data; | 639 | struct audit_sig_info *sig_data; |
654 | char *ctx = NULL; | 640 | char *ctx = NULL; |
655 | u32 len; | 641 | u32 len; |
@@ -660,17 +646,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
660 | 646 | ||
661 | /* As soon as there's any sign of userspace auditd, | 647 | /* As soon as there's any sign of userspace auditd, |
662 | * start kauditd to talk to it */ | 648 | * start kauditd to talk to it */ |
663 | if (!kauditd_task) | 649 | if (!kauditd_task) { |
664 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); | 650 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); |
665 | if (IS_ERR(kauditd_task)) { | 651 | if (IS_ERR(kauditd_task)) { |
666 | err = PTR_ERR(kauditd_task); | 652 | err = PTR_ERR(kauditd_task); |
667 | kauditd_task = NULL; | 653 | kauditd_task = NULL; |
668 | return err; | 654 | return err; |
655 | } | ||
669 | } | 656 | } |
670 | |||
671 | loginuid = audit_get_loginuid(current); | ||
672 | sessionid = audit_get_sessionid(current); | ||
673 | security_task_getsecid(current, &sid); | ||
674 | seq = nlh->nlmsg_seq; | 657 | seq = nlh->nlmsg_seq; |
675 | data = nlmsg_data(nlh); | 658 | data = nlmsg_data(nlh); |
676 | 659 | ||
@@ -691,14 +674,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
691 | return -EINVAL; | 674 | return -EINVAL; |
692 | status_get = (struct audit_status *)data; | 675 | status_get = (struct audit_status *)data; |
693 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | 676 | if (status_get->mask & AUDIT_STATUS_ENABLED) { |
694 | err = audit_set_enabled(status_get->enabled, | 677 | err = audit_set_enabled(status_get->enabled); |
695 | loginuid, sessionid, sid); | ||
696 | if (err < 0) | 678 | if (err < 0) |
697 | return err; | 679 | return err; |
698 | } | 680 | } |
699 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | 681 | if (status_get->mask & AUDIT_STATUS_FAILURE) { |
700 | err = audit_set_failure(status_get->failure, | 682 | err = audit_set_failure(status_get->failure); |
701 | loginuid, sessionid, sid); | ||
702 | if (err < 0) | 683 | if (err < 0) |
703 | return err; | 684 | return err; |
704 | } | 685 | } |
@@ -706,22 +687,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
706 | int new_pid = status_get->pid; | 687 | int new_pid = status_get->pid; |
707 | 688 | ||
708 | if (audit_enabled != AUDIT_OFF) | 689 | if (audit_enabled != AUDIT_OFF) |
709 | audit_log_config_change("audit_pid", new_pid, | 690 | audit_log_config_change("audit_pid", new_pid, audit_pid, 1); |
710 | audit_pid, loginuid, | ||
711 | sessionid, sid, 1); | ||
712 | |||
713 | audit_pid = new_pid; | 691 | audit_pid = new_pid; |
714 | audit_nlk_portid = NETLINK_CB(skb).portid; | 692 | audit_nlk_portid = NETLINK_CB(skb).portid; |
715 | } | 693 | } |
716 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { | 694 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { |
717 | err = audit_set_rate_limit(status_get->rate_limit, | 695 | err = audit_set_rate_limit(status_get->rate_limit); |
718 | loginuid, sessionid, sid); | ||
719 | if (err < 0) | 696 | if (err < 0) |
720 | return err; | 697 | return err; |
721 | } | 698 | } |
722 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | 699 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) |
723 | err = audit_set_backlog_limit(status_get->backlog_limit, | 700 | err = audit_set_backlog_limit(status_get->backlog_limit); |
724 | loginuid, sessionid, sid); | ||
725 | break; | 701 | break; |
726 | case AUDIT_USER: | 702 | case AUDIT_USER: |
727 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: | 703 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: |
@@ -729,25 +705,22 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
729 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 705 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
730 | return 0; | 706 | return 0; |
731 | 707 | ||
732 | err = audit_filter_user(); | 708 | err = audit_filter_user(msg_type); |
733 | if (err == 1) { | 709 | if (err == 1) { |
734 | err = 0; | 710 | err = 0; |
735 | if (msg_type == AUDIT_USER_TTY) { | 711 | if (msg_type == AUDIT_USER_TTY) { |
736 | err = tty_audit_push_task(current, loginuid, | 712 | err = tty_audit_push_current(); |
737 | sessionid); | ||
738 | if (err) | 713 | if (err) |
739 | break; | 714 | break; |
740 | } | 715 | } |
741 | audit_log_common_recv_msg(&ab, msg_type, | 716 | audit_log_common_recv_msg(&ab, msg_type); |
742 | loginuid, sessionid, sid); | ||
743 | |||
744 | if (msg_type != AUDIT_USER_TTY) | 717 | if (msg_type != AUDIT_USER_TTY) |
745 | audit_log_format(ab, " msg='%.1024s'", | 718 | audit_log_format(ab, " msg='%.1024s'", |
746 | (char *)data); | 719 | (char *)data); |
747 | else { | 720 | else { |
748 | int size; | 721 | int size; |
749 | 722 | ||
750 | audit_log_format(ab, " msg="); | 723 | audit_log_format(ab, " data="); |
751 | size = nlmsg_len(nlh); | 724 | size = nlmsg_len(nlh); |
752 | if (size > 0 && | 725 | if (size > 0 && |
753 | ((unsigned char *)data)[size - 1] == '\0') | 726 | ((unsigned char *)data)[size - 1] == '\0') |
@@ -758,50 +731,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
758 | audit_log_end(ab); | 731 | audit_log_end(ab); |
759 | } | 732 | } |
760 | break; | 733 | break; |
761 | case AUDIT_ADD: | ||
762 | case AUDIT_DEL: | ||
763 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) | ||
764 | return -EINVAL; | ||
765 | if (audit_enabled == AUDIT_LOCKED) { | ||
766 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, | ||
767 | loginuid, sessionid, sid); | ||
768 | |||
769 | audit_log_format(ab, " audit_enabled=%d res=0", | ||
770 | audit_enabled); | ||
771 | audit_log_end(ab); | ||
772 | return -EPERM; | ||
773 | } | ||
774 | /* fallthrough */ | ||
775 | case AUDIT_LIST: | ||
776 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, | ||
777 | seq, data, nlmsg_len(nlh), | ||
778 | loginuid, sessionid, sid); | ||
779 | break; | ||
780 | case AUDIT_ADD_RULE: | 734 | case AUDIT_ADD_RULE: |
781 | case AUDIT_DEL_RULE: | 735 | case AUDIT_DEL_RULE: |
782 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) | 736 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) |
783 | return -EINVAL; | 737 | return -EINVAL; |
784 | if (audit_enabled == AUDIT_LOCKED) { | 738 | if (audit_enabled == AUDIT_LOCKED) { |
785 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, | 739 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); |
786 | loginuid, sessionid, sid); | 740 | audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); |
787 | |||
788 | audit_log_format(ab, " audit_enabled=%d res=0", | ||
789 | audit_enabled); | ||
790 | audit_log_end(ab); | 741 | audit_log_end(ab); |
791 | return -EPERM; | 742 | return -EPERM; |
792 | } | 743 | } |
793 | /* fallthrough */ | 744 | /* fallthrough */ |
794 | case AUDIT_LIST_RULES: | 745 | case AUDIT_LIST_RULES: |
795 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, | 746 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, |
796 | seq, data, nlmsg_len(nlh), | 747 | seq, data, nlmsg_len(nlh)); |
797 | loginuid, sessionid, sid); | ||
798 | break; | 748 | break; |
799 | case AUDIT_TRIM: | 749 | case AUDIT_TRIM: |
800 | audit_trim_trees(); | 750 | audit_trim_trees(); |
801 | 751 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); | |
802 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, | ||
803 | loginuid, sessionid, sid); | ||
804 | |||
805 | audit_log_format(ab, " op=trim res=1"); | 752 | audit_log_format(ab, " op=trim res=1"); |
806 | audit_log_end(ab); | 753 | audit_log_end(ab); |
807 | break; | 754 | break; |
@@ -831,8 +778,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
831 | /* OK, here comes... */ | 778 | /* OK, here comes... */ |
832 | err = audit_tag_tree(old, new); | 779 | err = audit_tag_tree(old, new); |
833 | 780 | ||
834 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, | 781 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); |
835 | loginuid, sessionid, sid); | ||
836 | 782 | ||
837 | audit_log_format(ab, " op=make_equiv old="); | 783 | audit_log_format(ab, " op=make_equiv old="); |
838 | audit_log_untrustedstring(ab, old); | 784 | audit_log_untrustedstring(ab, old); |
@@ -871,27 +817,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
871 | struct audit_tty_status s; | 817 | struct audit_tty_status s; |
872 | struct task_struct *tsk = current; | 818 | struct task_struct *tsk = current; |
873 | 819 | ||
874 | spin_lock_irq(&tsk->sighand->siglock); | 820 | spin_lock(&tsk->sighand->siglock); |
875 | s.enabled = tsk->signal->audit_tty != 0; | 821 | s.enabled = tsk->signal->audit_tty != 0; |
876 | spin_unlock_irq(&tsk->sighand->siglock); | 822 | s.log_passwd = tsk->signal->audit_tty_log_passwd; |
823 | spin_unlock(&tsk->sighand->siglock); | ||
877 | 824 | ||
878 | audit_send_reply(NETLINK_CB(skb).portid, seq, | 825 | audit_send_reply(NETLINK_CB(skb).portid, seq, |
879 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); | 826 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); |
880 | break; | 827 | break; |
881 | } | 828 | } |
882 | case AUDIT_TTY_SET: { | 829 | case AUDIT_TTY_SET: { |
883 | struct audit_tty_status *s; | 830 | struct audit_tty_status s; |
884 | struct task_struct *tsk = current; | 831 | struct task_struct *tsk = current; |
885 | 832 | ||
886 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) | 833 | memset(&s, 0, sizeof(s)); |
887 | return -EINVAL; | 834 | /* guard against past and future API changes */ |
888 | s = data; | 835 | memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); |
889 | if (s->enabled != 0 && s->enabled != 1) | 836 | if ((s.enabled != 0 && s.enabled != 1) || |
837 | (s.log_passwd != 0 && s.log_passwd != 1)) | ||
890 | return -EINVAL; | 838 | return -EINVAL; |
891 | 839 | ||
892 | spin_lock_irq(&tsk->sighand->siglock); | 840 | spin_lock(&tsk->sighand->siglock); |
893 | tsk->signal->audit_tty = s->enabled != 0; | 841 | tsk->signal->audit_tty = s.enabled; |
894 | spin_unlock_irq(&tsk->sighand->siglock); | 842 | tsk->signal->audit_tty_log_passwd = s.log_passwd; |
843 | spin_unlock(&tsk->sighand->siglock); | ||
895 | break; | 844 | break; |
896 | } | 845 | } |
897 | default: | 846 | default: |
@@ -910,7 +859,7 @@ static void audit_receive_skb(struct sk_buff *skb) | |||
910 | { | 859 | { |
911 | struct nlmsghdr *nlh; | 860 | struct nlmsghdr *nlh; |
912 | /* | 861 | /* |
913 | * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 | 862 | * len MUST be signed for nlmsg_next to be able to dec it below 0 |
914 | * if the nlmsg_len was not aligned | 863 | * if the nlmsg_len was not aligned |
915 | */ | 864 | */ |
916 | int len; | 865 | int len; |
@@ -919,13 +868,13 @@ static void audit_receive_skb(struct sk_buff *skb) | |||
919 | nlh = nlmsg_hdr(skb); | 868 | nlh = nlmsg_hdr(skb); |
920 | len = skb->len; | 869 | len = skb->len; |
921 | 870 | ||
922 | while (NLMSG_OK(nlh, len)) { | 871 | while (nlmsg_ok(nlh, len)) { |
923 | err = audit_receive_msg(skb, nlh); | 872 | err = audit_receive_msg(skb, nlh); |
924 | /* if err or if this message says it wants a response */ | 873 | /* if err or if this message says it wants a response */ |
925 | if (err || (nlh->nlmsg_flags & NLM_F_ACK)) | 874 | if (err || (nlh->nlmsg_flags & NLM_F_ACK)) |
926 | netlink_ack(skb, nlh, err); | 875 | netlink_ack(skb, nlh, err); |
927 | 876 | ||
928 | nlh = NLMSG_NEXT(nlh, len); | 877 | nlh = nlmsg_next(nlh, &len); |
929 | } | 878 | } |
930 | } | 879 | } |
931 | 880 | ||
@@ -1434,6 +1383,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | |||
1434 | kfree(pathname); | 1383 | kfree(pathname); |
1435 | } | 1384 | } |
1436 | 1385 | ||
1386 | void audit_log_session_info(struct audit_buffer *ab) | ||
1387 | { | ||
1388 | u32 sessionid = audit_get_sessionid(current); | ||
1389 | uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); | ||
1390 | |||
1391 | audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); | ||
1392 | } | ||
1393 | |||
1437 | void audit_log_key(struct audit_buffer *ab, char *key) | 1394 | void audit_log_key(struct audit_buffer *ab, char *key) |
1438 | { | 1395 | { |
1439 | audit_log_format(ab, " key="); | 1396 | audit_log_format(ab, " key="); |
@@ -1443,6 +1400,224 @@ void audit_log_key(struct audit_buffer *ab, char *key) | |||
1443 | audit_log_format(ab, "(null)"); | 1400 | audit_log_format(ab, "(null)"); |
1444 | } | 1401 | } |
1445 | 1402 | ||
1403 | void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) | ||
1404 | { | ||
1405 | int i; | ||
1406 | |||
1407 | audit_log_format(ab, " %s=", prefix); | ||
1408 | CAP_FOR_EACH_U32(i) { | ||
1409 | audit_log_format(ab, "%08x", | ||
1410 | cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); | ||
1411 | } | ||
1412 | } | ||
1413 | |||
1414 | void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) | ||
1415 | { | ||
1416 | kernel_cap_t *perm = &name->fcap.permitted; | ||
1417 | kernel_cap_t *inh = &name->fcap.inheritable; | ||
1418 | int log = 0; | ||
1419 | |||
1420 | if (!cap_isclear(*perm)) { | ||
1421 | audit_log_cap(ab, "cap_fp", perm); | ||
1422 | log = 1; | ||
1423 | } | ||
1424 | if (!cap_isclear(*inh)) { | ||
1425 | audit_log_cap(ab, "cap_fi", inh); | ||
1426 | log = 1; | ||
1427 | } | ||
1428 | |||
1429 | if (log) | ||
1430 | audit_log_format(ab, " cap_fe=%d cap_fver=%x", | ||
1431 | name->fcap.fE, name->fcap_ver); | ||
1432 | } | ||
1433 | |||
1434 | static inline int audit_copy_fcaps(struct audit_names *name, | ||
1435 | const struct dentry *dentry) | ||
1436 | { | ||
1437 | struct cpu_vfs_cap_data caps; | ||
1438 | int rc; | ||
1439 | |||
1440 | if (!dentry) | ||
1441 | return 0; | ||
1442 | |||
1443 | rc = get_vfs_caps_from_disk(dentry, &caps); | ||
1444 | if (rc) | ||
1445 | return rc; | ||
1446 | |||
1447 | name->fcap.permitted = caps.permitted; | ||
1448 | name->fcap.inheritable = caps.inheritable; | ||
1449 | name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); | ||
1450 | name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> | ||
1451 | VFS_CAP_REVISION_SHIFT; | ||
1452 | |||
1453 | return 0; | ||
1454 | } | ||
1455 | |||
1456 | /* Copy inode data into an audit_names. */ | ||
1457 | void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, | ||
1458 | const struct inode *inode) | ||
1459 | { | ||
1460 | name->ino = inode->i_ino; | ||
1461 | name->dev = inode->i_sb->s_dev; | ||
1462 | name->mode = inode->i_mode; | ||
1463 | name->uid = inode->i_uid; | ||
1464 | name->gid = inode->i_gid; | ||
1465 | name->rdev = inode->i_rdev; | ||
1466 | security_inode_getsecid(inode, &name->osid); | ||
1467 | audit_copy_fcaps(name, dentry); | ||
1468 | } | ||
1469 | |||
1470 | /** | ||
1471 | * audit_log_name - produce AUDIT_PATH record from struct audit_names | ||
1472 | * @context: audit_context for the task | ||
1473 | * @n: audit_names structure with reportable details | ||
1474 | * @path: optional path to report instead of audit_names->name | ||
1475 | * @record_num: record number to report when handling a list of names | ||
1476 | * @call_panic: optional pointer to int that will be updated if secid fails | ||
1477 | */ | ||
1478 | void audit_log_name(struct audit_context *context, struct audit_names *n, | ||
1479 | struct path *path, int record_num, int *call_panic) | ||
1480 | { | ||
1481 | struct audit_buffer *ab; | ||
1482 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | ||
1483 | if (!ab) | ||
1484 | return; | ||
1485 | |||
1486 | audit_log_format(ab, "item=%d", record_num); | ||
1487 | |||
1488 | if (path) | ||
1489 | audit_log_d_path(ab, " name=", path); | ||
1490 | else if (n->name) { | ||
1491 | switch (n->name_len) { | ||
1492 | case AUDIT_NAME_FULL: | ||
1493 | /* log the full path */ | ||
1494 | audit_log_format(ab, " name="); | ||
1495 | audit_log_untrustedstring(ab, n->name->name); | ||
1496 | break; | ||
1497 | case 0: | ||
1498 | /* name was specified as a relative path and the | ||
1499 | * directory component is the cwd */ | ||
1500 | audit_log_d_path(ab, " name=", &context->pwd); | ||
1501 | break; | ||
1502 | default: | ||
1503 | /* log the name's directory component */ | ||
1504 | audit_log_format(ab, " name="); | ||
1505 | audit_log_n_untrustedstring(ab, n->name->name, | ||
1506 | n->name_len); | ||
1507 | } | ||
1508 | } else | ||
1509 | audit_log_format(ab, " name=(null)"); | ||
1510 | |||
1511 | if (n->ino != (unsigned long)-1) { | ||
1512 | audit_log_format(ab, " inode=%lu" | ||
1513 | " dev=%02x:%02x mode=%#ho" | ||
1514 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
1515 | n->ino, | ||
1516 | MAJOR(n->dev), | ||
1517 | MINOR(n->dev), | ||
1518 | n->mode, | ||
1519 | from_kuid(&init_user_ns, n->uid), | ||
1520 | from_kgid(&init_user_ns, n->gid), | ||
1521 | MAJOR(n->rdev), | ||
1522 | MINOR(n->rdev)); | ||
1523 | } | ||
1524 | if (n->osid != 0) { | ||
1525 | char *ctx = NULL; | ||
1526 | u32 len; | ||
1527 | if (security_secid_to_secctx( | ||
1528 | n->osid, &ctx, &len)) { | ||
1529 | audit_log_format(ab, " osid=%u", n->osid); | ||
1530 | if (call_panic) | ||
1531 | *call_panic = 2; | ||
1532 | } else { | ||
1533 | audit_log_format(ab, " obj=%s", ctx); | ||
1534 | security_release_secctx(ctx, len); | ||
1535 | } | ||
1536 | } | ||
1537 | |||
1538 | audit_log_fcaps(ab, n); | ||
1539 | audit_log_end(ab); | ||
1540 | } | ||
1541 | |||
1542 | int audit_log_task_context(struct audit_buffer *ab) | ||
1543 | { | ||
1544 | char *ctx = NULL; | ||
1545 | unsigned len; | ||
1546 | int error; | ||
1547 | u32 sid; | ||
1548 | |||
1549 | security_task_getsecid(current, &sid); | ||
1550 | if (!sid) | ||
1551 | return 0; | ||
1552 | |||
1553 | error = security_secid_to_secctx(sid, &ctx, &len); | ||
1554 | if (error) { | ||
1555 | if (error != -EINVAL) | ||
1556 | goto error_path; | ||
1557 | return 0; | ||
1558 | } | ||
1559 | |||
1560 | audit_log_format(ab, " subj=%s", ctx); | ||
1561 | security_release_secctx(ctx, len); | ||
1562 | return 0; | ||
1563 | |||
1564 | error_path: | ||
1565 | audit_panic("error in audit_log_task_context"); | ||
1566 | return error; | ||
1567 | } | ||
1568 | EXPORT_SYMBOL(audit_log_task_context); | ||
1569 | |||
1570 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | ||
1571 | { | ||
1572 | const struct cred *cred; | ||
1573 | char name[sizeof(tsk->comm)]; | ||
1574 | struct mm_struct *mm = tsk->mm; | ||
1575 | char *tty; | ||
1576 | |||
1577 | if (!ab) | ||
1578 | return; | ||
1579 | |||
1580 | /* tsk == current */ | ||
1581 | cred = current_cred(); | ||
1582 | |||
1583 | spin_lock_irq(&tsk->sighand->siglock); | ||
1584 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | ||
1585 | tty = tsk->signal->tty->name; | ||
1586 | else | ||
1587 | tty = "(none)"; | ||
1588 | spin_unlock_irq(&tsk->sighand->siglock); | ||
1589 | |||
1590 | audit_log_format(ab, | ||
1591 | " ppid=%ld pid=%d auid=%u uid=%u gid=%u" | ||
1592 | " euid=%u suid=%u fsuid=%u" | ||
1593 | " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", | ||
1594 | sys_getppid(), | ||
1595 | tsk->pid, | ||
1596 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), | ||
1597 | from_kuid(&init_user_ns, cred->uid), | ||
1598 | from_kgid(&init_user_ns, cred->gid), | ||
1599 | from_kuid(&init_user_ns, cred->euid), | ||
1600 | from_kuid(&init_user_ns, cred->suid), | ||
1601 | from_kuid(&init_user_ns, cred->fsuid), | ||
1602 | from_kgid(&init_user_ns, cred->egid), | ||
1603 | from_kgid(&init_user_ns, cred->sgid), | ||
1604 | from_kgid(&init_user_ns, cred->fsgid), | ||
1605 | audit_get_sessionid(tsk), tty); | ||
1606 | |||
1607 | get_task_comm(name, tsk); | ||
1608 | audit_log_format(ab, " comm="); | ||
1609 | audit_log_untrustedstring(ab, name); | ||
1610 | |||
1611 | if (mm) { | ||
1612 | down_read(&mm->mmap_sem); | ||
1613 | if (mm->exe_file) | ||
1614 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); | ||
1615 | up_read(&mm->mmap_sem); | ||
1616 | } | ||
1617 | audit_log_task_context(ab); | ||
1618 | } | ||
1619 | EXPORT_SYMBOL(audit_log_task_info); | ||
1620 | |||
1446 | /** | 1621 | /** |
1447 | * audit_log_link_denied - report a link restriction denial | 1622 | * audit_log_link_denied - report a link restriction denial |
1448 | * @operation: specific link opreation | 1623 | * @operation: specific link opreation |
@@ -1451,19 +1626,28 @@ void audit_log_key(struct audit_buffer *ab, char *key) | |||
1451 | void audit_log_link_denied(const char *operation, struct path *link) | 1626 | void audit_log_link_denied(const char *operation, struct path *link) |
1452 | { | 1627 | { |
1453 | struct audit_buffer *ab; | 1628 | struct audit_buffer *ab; |
1629 | struct audit_names *name; | ||
1630 | |||
1631 | name = kzalloc(sizeof(*name), GFP_NOFS); | ||
1632 | if (!name) | ||
1633 | return; | ||
1454 | 1634 | ||
1635 | /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ | ||
1455 | ab = audit_log_start(current->audit_context, GFP_KERNEL, | 1636 | ab = audit_log_start(current->audit_context, GFP_KERNEL, |
1456 | AUDIT_ANOM_LINK); | 1637 | AUDIT_ANOM_LINK); |
1457 | if (!ab) | 1638 | if (!ab) |
1458 | return; | 1639 | goto out; |
1459 | audit_log_format(ab, "op=%s action=denied", operation); | 1640 | audit_log_format(ab, "op=%s", operation); |
1460 | audit_log_format(ab, " pid=%d comm=", current->pid); | 1641 | audit_log_task_info(ab, current); |
1461 | audit_log_untrustedstring(ab, current->comm); | 1642 | audit_log_format(ab, " res=0"); |
1462 | audit_log_d_path(ab, " path=", link); | ||
1463 | audit_log_format(ab, " dev="); | ||
1464 | audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); | ||
1465 | audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); | ||
1466 | audit_log_end(ab); | 1643 | audit_log_end(ab); |
1644 | |||
1645 | /* Generate AUDIT_PATH record with object. */ | ||
1646 | name->type = AUDIT_TYPE_NORMAL; | ||
1647 | audit_copy_inode(name, link->dentry, link->dentry->d_inode); | ||
1648 | audit_log_name(current->audit_context, name, link, 0, NULL); | ||
1649 | out: | ||
1650 | kfree(name); | ||
1467 | } | 1651 | } |
1468 | 1652 | ||
1469 | /** | 1653 | /** |
@@ -1483,7 +1667,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
1483 | audit_log_lost("rate limit exceeded"); | 1667 | audit_log_lost("rate limit exceeded"); |
1484 | } else { | 1668 | } else { |
1485 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 1669 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
1486 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); | 1670 | nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; |
1487 | 1671 | ||
1488 | if (audit_pid) { | 1672 | if (audit_pid) { |
1489 | skb_queue_tail(&audit_skb_queue, ab->skb); | 1673 | skb_queue_tail(&audit_skb_queue, ab->skb); |
diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1b..1c95131ef760 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
23 | #include <linux/audit.h> | 23 | #include <linux/audit.h> |
24 | #include <linux/skbuff.h> | 24 | #include <linux/skbuff.h> |
25 | #include <uapi/linux/mqueue.h> | ||
25 | 26 | ||
26 | /* 0 = no checking | 27 | /* 0 = no checking |
27 | 1 = put_count checking | 28 | 1 = put_count checking |
@@ -29,6 +30,11 @@ | |||
29 | */ | 30 | */ |
30 | #define AUDIT_DEBUG 0 | 31 | #define AUDIT_DEBUG 0 |
31 | 32 | ||
33 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | ||
34 | * for saving names from getname(). If we get more names we will allocate | ||
35 | * a name dynamically and also add those to the list anchored by names_list. */ | ||
36 | #define AUDIT_NAMES 5 | ||
37 | |||
32 | /* At task start time, the audit_state is set in the audit_context using | 38 | /* At task start time, the audit_state is set in the audit_context using |
33 | a per-task filter. At syscall entry, the audit_state is augmented by | 39 | a per-task filter. At syscall entry, the audit_state is augmented by |
34 | the syscall filter. */ | 40 | the syscall filter. */ |
@@ -59,10 +65,157 @@ struct audit_entry { | |||
59 | struct audit_krule rule; | 65 | struct audit_krule rule; |
60 | }; | 66 | }; |
61 | 67 | ||
62 | #ifdef CONFIG_AUDIT | 68 | struct audit_cap_data { |
63 | extern int audit_enabled; | 69 | kernel_cap_t permitted; |
64 | extern int audit_ever_enabled; | 70 | kernel_cap_t inheritable; |
71 | union { | ||
72 | unsigned int fE; /* effective bit of file cap */ | ||
73 | kernel_cap_t effective; /* effective set of process */ | ||
74 | }; | ||
75 | }; | ||
76 | |||
77 | /* When fs/namei.c:getname() is called, we store the pointer in name and | ||
78 | * we don't let putname() free it (instead we free all of the saved | ||
79 | * pointers at syscall exit time). | ||
80 | * | ||
81 | * Further, in fs/namei.c:path_lookup() we store the inode and device. | ||
82 | */ | ||
83 | struct audit_names { | ||
84 | struct list_head list; /* audit_context->names_list */ | ||
85 | |||
86 | struct filename *name; | ||
87 | int name_len; /* number of chars to log */ | ||
88 | bool name_put; /* call __putname()? */ | ||
89 | |||
90 | unsigned long ino; | ||
91 | dev_t dev; | ||
92 | umode_t mode; | ||
93 | kuid_t uid; | ||
94 | kgid_t gid; | ||
95 | dev_t rdev; | ||
96 | u32 osid; | ||
97 | struct audit_cap_data fcap; | ||
98 | unsigned int fcap_ver; | ||
99 | unsigned char type; /* record type */ | ||
100 | /* | ||
101 | * This was an allocated audit_names and not from the array of | ||
102 | * names allocated in the task audit context. Thus this name | ||
103 | * should be freed on syscall exit. | ||
104 | */ | ||
105 | bool should_free; | ||
106 | }; | ||
107 | |||
108 | /* The per-task audit context. */ | ||
109 | struct audit_context { | ||
110 | int dummy; /* must be the first element */ | ||
111 | int in_syscall; /* 1 if task is in a syscall */ | ||
112 | enum audit_state state, current_state; | ||
113 | unsigned int serial; /* serial number for record */ | ||
114 | int major; /* syscall number */ | ||
115 | struct timespec ctime; /* time of syscall entry */ | ||
116 | unsigned long argv[4]; /* syscall arguments */ | ||
117 | long return_code;/* syscall return code */ | ||
118 | u64 prio; | ||
119 | int return_valid; /* return code is valid */ | ||
120 | /* | ||
121 | * The names_list is the list of all audit_names collected during this | ||
122 | * syscall. The first AUDIT_NAMES entries in the names_list will | ||
123 | * actually be from the preallocated_names array for performance | ||
124 | * reasons. Except during allocation they should never be referenced | ||
125 | * through the preallocated_names array and should only be found/used | ||
126 | * by running the names_list. | ||
127 | */ | ||
128 | struct audit_names preallocated_names[AUDIT_NAMES]; | ||
129 | int name_count; /* total records in names_list */ | ||
130 | struct list_head names_list; /* struct audit_names->list anchor */ | ||
131 | char *filterkey; /* key for rule that triggered record */ | ||
132 | struct path pwd; | ||
133 | struct audit_aux_data *aux; | ||
134 | struct audit_aux_data *aux_pids; | ||
135 | struct sockaddr_storage *sockaddr; | ||
136 | size_t sockaddr_len; | ||
137 | /* Save things to print about task_struct */ | ||
138 | pid_t pid, ppid; | ||
139 | kuid_t uid, euid, suid, fsuid; | ||
140 | kgid_t gid, egid, sgid, fsgid; | ||
141 | unsigned long personality; | ||
142 | int arch; | ||
143 | |||
144 | pid_t target_pid; | ||
145 | kuid_t target_auid; | ||
146 | kuid_t target_uid; | ||
147 | unsigned int target_sessionid; | ||
148 | u32 target_sid; | ||
149 | char target_comm[TASK_COMM_LEN]; | ||
150 | |||
151 | struct audit_tree_refs *trees, *first_trees; | ||
152 | struct list_head killed_trees; | ||
153 | int tree_count; | ||
154 | |||
155 | int type; | ||
156 | union { | ||
157 | struct { | ||
158 | int nargs; | ||
159 | long args[6]; | ||
160 | } socketcall; | ||
161 | struct { | ||
162 | kuid_t uid; | ||
163 | kgid_t gid; | ||
164 | umode_t mode; | ||
165 | u32 osid; | ||
166 | int has_perm; | ||
167 | uid_t perm_uid; | ||
168 | gid_t perm_gid; | ||
169 | umode_t perm_mode; | ||
170 | unsigned long qbytes; | ||
171 | } ipc; | ||
172 | struct { | ||
173 | mqd_t mqdes; | ||
174 | struct mq_attr mqstat; | ||
175 | } mq_getsetattr; | ||
176 | struct { | ||
177 | mqd_t mqdes; | ||
178 | int sigev_signo; | ||
179 | } mq_notify; | ||
180 | struct { | ||
181 | mqd_t mqdes; | ||
182 | size_t msg_len; | ||
183 | unsigned int msg_prio; | ||
184 | struct timespec abs_timeout; | ||
185 | } mq_sendrecv; | ||
186 | struct { | ||
187 | int oflag; | ||
188 | umode_t mode; | ||
189 | struct mq_attr attr; | ||
190 | } mq_open; | ||
191 | struct { | ||
192 | pid_t pid; | ||
193 | struct audit_cap_data cap; | ||
194 | } capset; | ||
195 | struct { | ||
196 | int fd; | ||
197 | int flags; | ||
198 | } mmap; | ||
199 | }; | ||
200 | int fds[2]; | ||
201 | |||
202 | #if AUDIT_DEBUG | ||
203 | int put_count; | ||
204 | int ino_count; | ||
65 | #endif | 205 | #endif |
206 | }; | ||
207 | |||
208 | extern int audit_ever_enabled; | ||
209 | |||
210 | extern void audit_copy_inode(struct audit_names *name, | ||
211 | const struct dentry *dentry, | ||
212 | const struct inode *inode); | ||
213 | extern void audit_log_cap(struct audit_buffer *ab, char *prefix, | ||
214 | kernel_cap_t *cap); | ||
215 | extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); | ||
216 | extern void audit_log_name(struct audit_context *context, | ||
217 | struct audit_names *n, struct path *path, | ||
218 | int record_num, int *call_panic); | ||
66 | 219 | ||
67 | extern int audit_pid; | 220 | extern int audit_pid; |
68 | 221 | ||
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d6..a291aa23fb3f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
@@ -617,9 +617,9 @@ void audit_trim_trees(void) | |||
617 | } | 617 | } |
618 | spin_unlock(&hash_lock); | 618 | spin_unlock(&hash_lock); |
619 | trim_marked(tree); | 619 | trim_marked(tree); |
620 | put_tree(tree); | ||
621 | drop_collected_mounts(root_mnt); | 620 | drop_collected_mounts(root_mnt); |
622 | skip_it: | 621 | skip_it: |
622 | put_tree(tree); | ||
623 | mutex_lock(&audit_filter_mutex); | 623 | mutex_lock(&audit_filter_mutex); |
624 | } | 624 | } |
625 | list_del(&cursor); | 625 | list_del(&cursor); |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06f..6bd4a90d1991 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
@@ -310,121 +310,83 @@ static u32 audit_to_op(u32 op) | |||
310 | return n; | 310 | return n; |
311 | } | 311 | } |
312 | 312 | ||
313 | 313 | /* check if an audit field is valid */ | |
314 | /* Translate struct audit_rule to kernel's rule respresentation. | 314 | static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) |
315 | * Exists for backward compatibility with userspace. */ | ||
316 | static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | ||
317 | { | 315 | { |
318 | struct audit_entry *entry; | 316 | switch(f->type) { |
319 | int err = 0; | 317 | case AUDIT_MSGTYPE: |
320 | int i; | 318 | if (entry->rule.listnr != AUDIT_FILTER_TYPE && |
321 | 319 | entry->rule.listnr != AUDIT_FILTER_USER) | |
322 | entry = audit_to_entry_common(rule); | 320 | return -EINVAL; |
323 | if (IS_ERR(entry)) | 321 | break; |
324 | goto exit_nofree; | 322 | }; |
325 | |||
326 | for (i = 0; i < rule->field_count; i++) { | ||
327 | struct audit_field *f = &entry->rule.fields[i]; | ||
328 | u32 n; | ||
329 | |||
330 | n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); | ||
331 | |||
332 | /* Support for legacy operators where | ||
333 | * AUDIT_NEGATE bit signifies != and otherwise assumes == */ | ||
334 | if (n & AUDIT_NEGATE) | ||
335 | f->op = Audit_not_equal; | ||
336 | else if (!n) | ||
337 | f->op = Audit_equal; | ||
338 | else | ||
339 | f->op = audit_to_op(n); | ||
340 | |||
341 | entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1; | ||
342 | |||
343 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); | ||
344 | f->val = rule->values[i]; | ||
345 | f->uid = INVALID_UID; | ||
346 | f->gid = INVALID_GID; | ||
347 | |||
348 | err = -EINVAL; | ||
349 | if (f->op == Audit_bad) | ||
350 | goto exit_free; | ||
351 | |||
352 | switch(f->type) { | ||
353 | default: | ||
354 | goto exit_free; | ||
355 | case AUDIT_UID: | ||
356 | case AUDIT_EUID: | ||
357 | case AUDIT_SUID: | ||
358 | case AUDIT_FSUID: | ||
359 | case AUDIT_LOGINUID: | ||
360 | /* bit ops not implemented for uid comparisons */ | ||
361 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
362 | goto exit_free; | ||
363 | |||
364 | f->uid = make_kuid(current_user_ns(), f->val); | ||
365 | if (!uid_valid(f->uid)) | ||
366 | goto exit_free; | ||
367 | break; | ||
368 | case AUDIT_GID: | ||
369 | case AUDIT_EGID: | ||
370 | case AUDIT_SGID: | ||
371 | case AUDIT_FSGID: | ||
372 | /* bit ops not implemented for gid comparisons */ | ||
373 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
374 | goto exit_free; | ||
375 | |||
376 | f->gid = make_kgid(current_user_ns(), f->val); | ||
377 | if (!gid_valid(f->gid)) | ||
378 | goto exit_free; | ||
379 | break; | ||
380 | case AUDIT_PID: | ||
381 | case AUDIT_PERS: | ||
382 | case AUDIT_MSGTYPE: | ||
383 | case AUDIT_PPID: | ||
384 | case AUDIT_DEVMAJOR: | ||
385 | case AUDIT_DEVMINOR: | ||
386 | case AUDIT_EXIT: | ||
387 | case AUDIT_SUCCESS: | ||
388 | /* bit ops are only useful on syscall args */ | ||
389 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
390 | goto exit_free; | ||
391 | break; | ||
392 | case AUDIT_ARG0: | ||
393 | case AUDIT_ARG1: | ||
394 | case AUDIT_ARG2: | ||
395 | case AUDIT_ARG3: | ||
396 | break; | ||
397 | /* arch is only allowed to be = or != */ | ||
398 | case AUDIT_ARCH: | ||
399 | if (f->op != Audit_not_equal && f->op != Audit_equal) | ||
400 | goto exit_free; | ||
401 | entry->rule.arch_f = f; | ||
402 | break; | ||
403 | case AUDIT_PERM: | ||
404 | if (f->val & ~15) | ||
405 | goto exit_free; | ||
406 | break; | ||
407 | case AUDIT_FILETYPE: | ||
408 | if (f->val & ~S_IFMT) | ||
409 | goto exit_free; | ||
410 | break; | ||
411 | case AUDIT_INODE: | ||
412 | err = audit_to_inode(&entry->rule, f); | ||
413 | if (err) | ||
414 | goto exit_free; | ||
415 | break; | ||
416 | } | ||
417 | } | ||
418 | |||
419 | if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) | ||
420 | entry->rule.inode_f = NULL; | ||
421 | |||
422 | exit_nofree: | ||
423 | return entry; | ||
424 | 323 | ||
425 | exit_free: | 324 | switch(f->type) { |
426 | audit_free_rule(entry); | 325 | default: |
427 | return ERR_PTR(err); | 326 | return -EINVAL; |
327 | case AUDIT_UID: | ||
328 | case AUDIT_EUID: | ||
329 | case AUDIT_SUID: | ||
330 | case AUDIT_FSUID: | ||
331 | case AUDIT_LOGINUID: | ||
332 | case AUDIT_OBJ_UID: | ||
333 | case AUDIT_GID: | ||
334 | case AUDIT_EGID: | ||
335 | case AUDIT_SGID: | ||
336 | case AUDIT_FSGID: | ||
337 | case AUDIT_OBJ_GID: | ||
338 | case AUDIT_PID: | ||
339 | case AUDIT_PERS: | ||
340 | case AUDIT_MSGTYPE: | ||
341 | case AUDIT_PPID: | ||
342 | case AUDIT_DEVMAJOR: | ||
343 | case AUDIT_DEVMINOR: | ||
344 | case AUDIT_EXIT: | ||
345 | case AUDIT_SUCCESS: | ||
346 | /* bit ops are only useful on syscall args */ | ||
347 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
348 | return -EINVAL; | ||
349 | break; | ||
350 | case AUDIT_ARG0: | ||
351 | case AUDIT_ARG1: | ||
352 | case AUDIT_ARG2: | ||
353 | case AUDIT_ARG3: | ||
354 | case AUDIT_SUBJ_USER: | ||
355 | case AUDIT_SUBJ_ROLE: | ||
356 | case AUDIT_SUBJ_TYPE: | ||
357 | case AUDIT_SUBJ_SEN: | ||
358 | case AUDIT_SUBJ_CLR: | ||
359 | case AUDIT_OBJ_USER: | ||
360 | case AUDIT_OBJ_ROLE: | ||
361 | case AUDIT_OBJ_TYPE: | ||
362 | case AUDIT_OBJ_LEV_LOW: | ||
363 | case AUDIT_OBJ_LEV_HIGH: | ||
364 | case AUDIT_WATCH: | ||
365 | case AUDIT_DIR: | ||
366 | case AUDIT_FILTERKEY: | ||
367 | break; | ||
368 | case AUDIT_LOGINUID_SET: | ||
369 | if ((f->val != 0) && (f->val != 1)) | ||
370 | return -EINVAL; | ||
371 | /* FALL THROUGH */ | ||
372 | case AUDIT_ARCH: | ||
373 | if (f->op != Audit_not_equal && f->op != Audit_equal) | ||
374 | return -EINVAL; | ||
375 | break; | ||
376 | case AUDIT_PERM: | ||
377 | if (f->val & ~15) | ||
378 | return -EINVAL; | ||
379 | break; | ||
380 | case AUDIT_FILETYPE: | ||
381 | if (f->val & ~S_IFMT) | ||
382 | return -EINVAL; | ||
383 | break; | ||
384 | case AUDIT_FIELD_COMPARE: | ||
385 | if (f->val > AUDIT_MAX_FIELD_COMPARE) | ||
386 | return -EINVAL; | ||
387 | break; | ||
388 | }; | ||
389 | return 0; | ||
428 | } | 390 | } |
429 | 391 | ||
430 | /* Translate struct audit_rule_data to kernel's rule respresentation. */ | 392 | /* Translate struct audit_rule_data to kernel's rule respresentation. */ |
@@ -459,17 +421,25 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
459 | f->gid = INVALID_GID; | 421 | f->gid = INVALID_GID; |
460 | f->lsm_str = NULL; | 422 | f->lsm_str = NULL; |
461 | f->lsm_rule = NULL; | 423 | f->lsm_rule = NULL; |
462 | switch(f->type) { | 424 | |
425 | /* Support legacy tests for a valid loginuid */ | ||
426 | if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { | ||
427 | f->type = AUDIT_LOGINUID_SET; | ||
428 | f->val = 0; | ||
429 | } | ||
430 | |||
431 | err = audit_field_valid(entry, f); | ||
432 | if (err) | ||
433 | goto exit_free; | ||
434 | |||
435 | err = -EINVAL; | ||
436 | switch (f->type) { | ||
437 | case AUDIT_LOGINUID: | ||
463 | case AUDIT_UID: | 438 | case AUDIT_UID: |
464 | case AUDIT_EUID: | 439 | case AUDIT_EUID: |
465 | case AUDIT_SUID: | 440 | case AUDIT_SUID: |
466 | case AUDIT_FSUID: | 441 | case AUDIT_FSUID: |
467 | case AUDIT_LOGINUID: | ||
468 | case AUDIT_OBJ_UID: | 442 | case AUDIT_OBJ_UID: |
469 | /* bit ops not implemented for uid comparisons */ | ||
470 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
471 | goto exit_free; | ||
472 | |||
473 | f->uid = make_kuid(current_user_ns(), f->val); | 443 | f->uid = make_kuid(current_user_ns(), f->val); |
474 | if (!uid_valid(f->uid)) | 444 | if (!uid_valid(f->uid)) |
475 | goto exit_free; | 445 | goto exit_free; |
@@ -479,27 +449,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
479 | case AUDIT_SGID: | 449 | case AUDIT_SGID: |
480 | case AUDIT_FSGID: | 450 | case AUDIT_FSGID: |
481 | case AUDIT_OBJ_GID: | 451 | case AUDIT_OBJ_GID: |
482 | /* bit ops not implemented for gid comparisons */ | ||
483 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
484 | goto exit_free; | ||
485 | |||
486 | f->gid = make_kgid(current_user_ns(), f->val); | 452 | f->gid = make_kgid(current_user_ns(), f->val); |
487 | if (!gid_valid(f->gid)) | 453 | if (!gid_valid(f->gid)) |
488 | goto exit_free; | 454 | goto exit_free; |
489 | break; | 455 | break; |
490 | case AUDIT_PID: | ||
491 | case AUDIT_PERS: | ||
492 | case AUDIT_MSGTYPE: | ||
493 | case AUDIT_PPID: | ||
494 | case AUDIT_DEVMAJOR: | ||
495 | case AUDIT_DEVMINOR: | ||
496 | case AUDIT_EXIT: | ||
497 | case AUDIT_SUCCESS: | ||
498 | case AUDIT_ARG0: | ||
499 | case AUDIT_ARG1: | ||
500 | case AUDIT_ARG2: | ||
501 | case AUDIT_ARG3: | ||
502 | break; | ||
503 | case AUDIT_ARCH: | 456 | case AUDIT_ARCH: |
504 | entry->rule.arch_f = f; | 457 | entry->rule.arch_f = f; |
505 | break; | 458 | break; |
@@ -570,20 +523,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
570 | entry->rule.buflen += f->val; | 523 | entry->rule.buflen += f->val; |
571 | entry->rule.filterkey = str; | 524 | entry->rule.filterkey = str; |
572 | break; | 525 | break; |
573 | case AUDIT_PERM: | ||
574 | if (f->val & ~15) | ||
575 | goto exit_free; | ||
576 | break; | ||
577 | case AUDIT_FILETYPE: | ||
578 | if (f->val & ~S_IFMT) | ||
579 | goto exit_free; | ||
580 | break; | ||
581 | case AUDIT_FIELD_COMPARE: | ||
582 | if (f->val > AUDIT_MAX_FIELD_COMPARE) | ||
583 | goto exit_free; | ||
584 | break; | ||
585 | default: | ||
586 | goto exit_free; | ||
587 | } | 526 | } |
588 | } | 527 | } |
589 | 528 | ||
@@ -594,6 +533,10 @@ exit_nofree: | |||
594 | return entry; | 533 | return entry; |
595 | 534 | ||
596 | exit_free: | 535 | exit_free: |
536 | if (entry->rule.watch) | ||
537 | audit_put_watch(entry->rule.watch); /* matches initial get */ | ||
538 | if (entry->rule.tree) | ||
539 | audit_put_tree(entry->rule.tree); /* that's the temporary one */ | ||
597 | audit_free_rule(entry); | 540 | audit_free_rule(entry); |
598 | return ERR_PTR(err); | 541 | return ERR_PTR(err); |
599 | } | 542 | } |
@@ -609,36 +552,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str) | |||
609 | return len; | 552 | return len; |
610 | } | 553 | } |
611 | 554 | ||
612 | /* Translate kernel rule respresentation to struct audit_rule. | ||
613 | * Exists for backward compatibility with userspace. */ | ||
614 | static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) | ||
615 | { | ||
616 | struct audit_rule *rule; | ||
617 | int i; | ||
618 | |||
619 | rule = kzalloc(sizeof(*rule), GFP_KERNEL); | ||
620 | if (unlikely(!rule)) | ||
621 | return NULL; | ||
622 | |||
623 | rule->flags = krule->flags | krule->listnr; | ||
624 | rule->action = krule->action; | ||
625 | rule->field_count = krule->field_count; | ||
626 | for (i = 0; i < rule->field_count; i++) { | ||
627 | rule->values[i] = krule->fields[i].val; | ||
628 | rule->fields[i] = krule->fields[i].type; | ||
629 | |||
630 | if (krule->vers_ops == 1) { | ||
631 | if (krule->fields[i].op == Audit_not_equal) | ||
632 | rule->fields[i] |= AUDIT_NEGATE; | ||
633 | } else { | ||
634 | rule->fields[i] |= audit_ops[krule->fields[i].op]; | ||
635 | } | ||
636 | } | ||
637 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; | ||
638 | |||
639 | return rule; | ||
640 | } | ||
641 | |||
642 | /* Translate kernel rule respresentation to struct audit_rule_data. */ | 555 | /* Translate kernel rule respresentation to struct audit_rule_data. */ |
643 | static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | 556 | static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) |
644 | { | 557 | { |
@@ -1051,35 +964,6 @@ out: | |||
1051 | return ret; | 964 | return ret; |
1052 | } | 965 | } |
1053 | 966 | ||
1054 | /* List rules using struct audit_rule. Exists for backward | ||
1055 | * compatibility with userspace. */ | ||
1056 | static void audit_list(int pid, int seq, struct sk_buff_head *q) | ||
1057 | { | ||
1058 | struct sk_buff *skb; | ||
1059 | struct audit_krule *r; | ||
1060 | int i; | ||
1061 | |||
1062 | /* This is a blocking read, so use audit_filter_mutex instead of rcu | ||
1063 | * iterator to sync with list writers. */ | ||
1064 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | ||
1065 | list_for_each_entry(r, &audit_rules_list[i], list) { | ||
1066 | struct audit_rule *rule; | ||
1067 | |||
1068 | rule = audit_krule_to_rule(r); | ||
1069 | if (unlikely(!rule)) | ||
1070 | break; | ||
1071 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
1072 | rule, sizeof(*rule)); | ||
1073 | if (skb) | ||
1074 | skb_queue_tail(q, skb); | ||
1075 | kfree(rule); | ||
1076 | } | ||
1077 | } | ||
1078 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | ||
1079 | if (skb) | ||
1080 | skb_queue_tail(q, skb); | ||
1081 | } | ||
1082 | |||
1083 | /* List rules using struct audit_rule_data. */ | 967 | /* List rules using struct audit_rule_data. */ |
1084 | static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | 968 | static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) |
1085 | { | 969 | { |
@@ -1109,11 +993,11 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | |||
1109 | } | 993 | } |
1110 | 994 | ||
1111 | /* Log rule additions and removals */ | 995 | /* Log rule additions and removals */ |
1112 | static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, | 996 | static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) |
1113 | char *action, struct audit_krule *rule, | ||
1114 | int res) | ||
1115 | { | 997 | { |
1116 | struct audit_buffer *ab; | 998 | struct audit_buffer *ab; |
999 | uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); | ||
1000 | u32 sessionid = audit_get_sessionid(current); | ||
1117 | 1001 | ||
1118 | if (!audit_enabled) | 1002 | if (!audit_enabled) |
1119 | return; | 1003 | return; |
@@ -1121,18 +1005,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, | |||
1121 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 1005 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
1122 | if (!ab) | 1006 | if (!ab) |
1123 | return; | 1007 | return; |
1124 | audit_log_format(ab, "auid=%u ses=%u", | 1008 | audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); |
1125 | from_kuid(&init_user_ns, loginuid), sessionid); | 1009 | audit_log_task_context(ab); |
1126 | if (sid) { | ||
1127 | char *ctx = NULL; | ||
1128 | u32 len; | ||
1129 | if (security_secid_to_secctx(sid, &ctx, &len)) | ||
1130 | audit_log_format(ab, " ssid=%u", sid); | ||
1131 | else { | ||
1132 | audit_log_format(ab, " subj=%s", ctx); | ||
1133 | security_release_secctx(ctx, len); | ||
1134 | } | ||
1135 | } | ||
1136 | audit_log_format(ab, " op="); | 1010 | audit_log_format(ab, " op="); |
1137 | audit_log_string(ab, action); | 1011 | audit_log_string(ab, action); |
1138 | audit_log_key(ab, rule->filterkey); | 1012 | audit_log_key(ab, rule->filterkey); |
@@ -1147,12 +1021,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, | |||
1147 | * @seq: netlink audit message sequence (serial) number | 1021 | * @seq: netlink audit message sequence (serial) number |
1148 | * @data: payload data | 1022 | * @data: payload data |
1149 | * @datasz: size of payload data | 1023 | * @datasz: size of payload data |
1150 | * @loginuid: loginuid of sender | ||
1151 | * @sessionid: sessionid for netlink audit message | ||
1152 | * @sid: SE Linux Security ID of sender | ||
1153 | */ | 1024 | */ |
1154 | int audit_receive_filter(int type, int pid, int seq, void *data, | 1025 | int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) |
1155 | size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) | ||
1156 | { | 1026 | { |
1157 | struct task_struct *tsk; | 1027 | struct task_struct *tsk; |
1158 | struct audit_netlink_list *dest; | 1028 | struct audit_netlink_list *dest; |
@@ -1160,7 +1030,6 @@ int audit_receive_filter(int type, int pid, int seq, void *data, | |||
1160 | struct audit_entry *entry; | 1030 | struct audit_entry *entry; |
1161 | 1031 | ||
1162 | switch (type) { | 1032 | switch (type) { |
1163 | case AUDIT_LIST: | ||
1164 | case AUDIT_LIST_RULES: | 1033 | case AUDIT_LIST_RULES: |
1165 | /* We can't just spew out the rules here because we might fill | 1034 | /* We can't just spew out the rules here because we might fill |
1166 | * the available socket buffer space and deadlock waiting for | 1035 | * the available socket buffer space and deadlock waiting for |
@@ -1175,10 +1044,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data, | |||
1175 | skb_queue_head_init(&dest->q); | 1044 | skb_queue_head_init(&dest->q); |
1176 | 1045 | ||
1177 | mutex_lock(&audit_filter_mutex); | 1046 | mutex_lock(&audit_filter_mutex); |
1178 | if (type == AUDIT_LIST) | 1047 | audit_list_rules(pid, seq, &dest->q); |
1179 | audit_list(pid, seq, &dest->q); | ||
1180 | else | ||
1181 | audit_list_rules(pid, seq, &dest->q); | ||
1182 | mutex_unlock(&audit_filter_mutex); | 1048 | mutex_unlock(&audit_filter_mutex); |
1183 | 1049 | ||
1184 | tsk = kthread_run(audit_send_list, dest, "audit_send_list"); | 1050 | tsk = kthread_run(audit_send_list, dest, "audit_send_list"); |
@@ -1188,35 +1054,23 @@ int audit_receive_filter(int type, int pid, int seq, void *data, | |||
1188 | err = PTR_ERR(tsk); | 1054 | err = PTR_ERR(tsk); |
1189 | } | 1055 | } |
1190 | break; | 1056 | break; |
1191 | case AUDIT_ADD: | ||
1192 | case AUDIT_ADD_RULE: | 1057 | case AUDIT_ADD_RULE: |
1193 | if (type == AUDIT_ADD) | 1058 | entry = audit_data_to_entry(data, datasz); |
1194 | entry = audit_rule_to_entry(data); | ||
1195 | else | ||
1196 | entry = audit_data_to_entry(data, datasz); | ||
1197 | if (IS_ERR(entry)) | 1059 | if (IS_ERR(entry)) |
1198 | return PTR_ERR(entry); | 1060 | return PTR_ERR(entry); |
1199 | 1061 | ||
1200 | err = audit_add_rule(entry); | 1062 | err = audit_add_rule(entry); |
1201 | audit_log_rule_change(loginuid, sessionid, sid, "add rule", | 1063 | audit_log_rule_change("add rule", &entry->rule, !err); |
1202 | &entry->rule, !err); | ||
1203 | |||
1204 | if (err) | 1064 | if (err) |
1205 | audit_free_rule(entry); | 1065 | audit_free_rule(entry); |
1206 | break; | 1066 | break; |
1207 | case AUDIT_DEL: | ||
1208 | case AUDIT_DEL_RULE: | 1067 | case AUDIT_DEL_RULE: |
1209 | if (type == AUDIT_DEL) | 1068 | entry = audit_data_to_entry(data, datasz); |
1210 | entry = audit_rule_to_entry(data); | ||
1211 | else | ||
1212 | entry = audit_data_to_entry(data, datasz); | ||
1213 | if (IS_ERR(entry)) | 1069 | if (IS_ERR(entry)) |
1214 | return PTR_ERR(entry); | 1070 | return PTR_ERR(entry); |
1215 | 1071 | ||
1216 | err = audit_del_rule(entry); | 1072 | err = audit_del_rule(entry); |
1217 | audit_log_rule_change(loginuid, sessionid, sid, "remove rule", | 1073 | audit_log_rule_change("remove rule", &entry->rule, !err); |
1218 | &entry->rule, !err); | ||
1219 | |||
1220 | audit_free_rule(entry); | 1074 | audit_free_rule(entry); |
1221 | break; | 1075 | break; |
1222 | default: | 1076 | default: |
@@ -1354,7 +1208,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) | |||
1354 | return strncmp(p, dname, dlen); | 1208 | return strncmp(p, dname, dlen); |
1355 | } | 1209 | } |
1356 | 1210 | ||
1357 | static int audit_filter_user_rules(struct audit_krule *rule, | 1211 | static int audit_filter_user_rules(struct audit_krule *rule, int type, |
1358 | enum audit_state *state) | 1212 | enum audit_state *state) |
1359 | { | 1213 | { |
1360 | int i; | 1214 | int i; |
@@ -1378,6 +1232,13 @@ static int audit_filter_user_rules(struct audit_krule *rule, | |||
1378 | result = audit_uid_comparator(audit_get_loginuid(current), | 1232 | result = audit_uid_comparator(audit_get_loginuid(current), |
1379 | f->op, f->uid); | 1233 | f->op, f->uid); |
1380 | break; | 1234 | break; |
1235 | case AUDIT_LOGINUID_SET: | ||
1236 | result = audit_comparator(audit_loginuid_set(current), | ||
1237 | f->op, f->val); | ||
1238 | break; | ||
1239 | case AUDIT_MSGTYPE: | ||
1240 | result = audit_comparator(type, f->op, f->val); | ||
1241 | break; | ||
1381 | case AUDIT_SUBJ_USER: | 1242 | case AUDIT_SUBJ_USER: |
1382 | case AUDIT_SUBJ_ROLE: | 1243 | case AUDIT_SUBJ_ROLE: |
1383 | case AUDIT_SUBJ_TYPE: | 1244 | case AUDIT_SUBJ_TYPE: |
@@ -1404,7 +1265,7 @@ static int audit_filter_user_rules(struct audit_krule *rule, | |||
1404 | return 1; | 1265 | return 1; |
1405 | } | 1266 | } |
1406 | 1267 | ||
1407 | int audit_filter_user(void) | 1268 | int audit_filter_user(int type) |
1408 | { | 1269 | { |
1409 | enum audit_state state = AUDIT_DISABLED; | 1270 | enum audit_state state = AUDIT_DISABLED; |
1410 | struct audit_entry *e; | 1271 | struct audit_entry *e; |
@@ -1412,7 +1273,7 @@ int audit_filter_user(void) | |||
1412 | 1273 | ||
1413 | rcu_read_lock(); | 1274 | rcu_read_lock(); |
1414 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { | 1275 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { |
1415 | if (audit_filter_user_rules(&e->rule, &state)) { | 1276 | if (audit_filter_user_rules(&e->rule, type, &state)) { |
1416 | if (state == AUDIT_DISABLED) | 1277 | if (state == AUDIT_DISABLED) |
1417 | ret = 0; | 1278 | ret = 0; |
1418 | break; | 1279 | break; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a9..3c8a601324a2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -76,11 +76,6 @@ | |||
76 | #define AUDITSC_SUCCESS 1 | 76 | #define AUDITSC_SUCCESS 1 |
77 | #define AUDITSC_FAILURE 2 | 77 | #define AUDITSC_FAILURE 2 |
78 | 78 | ||
79 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | ||
80 | * for saving names from getname(). If we get more names we will allocate | ||
81 | * a name dynamically and also add those to the list anchored by names_list. */ | ||
82 | #define AUDIT_NAMES 5 | ||
83 | |||
84 | /* no execve audit message should be longer than this (userspace limits) */ | 79 | /* no execve audit message should be longer than this (userspace limits) */ |
85 | #define MAX_EXECVE_AUDIT_LEN 7500 | 80 | #define MAX_EXECVE_AUDIT_LEN 7500 |
86 | 81 | ||
@@ -90,44 +85,6 @@ int audit_n_rules; | |||
90 | /* determines whether we collect data for signals sent */ | 85 | /* determines whether we collect data for signals sent */ |
91 | int audit_signals; | 86 | int audit_signals; |
92 | 87 | ||
93 | struct audit_cap_data { | ||
94 | kernel_cap_t permitted; | ||
95 | kernel_cap_t inheritable; | ||
96 | union { | ||
97 | unsigned int fE; /* effective bit of a file capability */ | ||
98 | kernel_cap_t effective; /* effective set of a process */ | ||
99 | }; | ||
100 | }; | ||
101 | |||
102 | /* When fs/namei.c:getname() is called, we store the pointer in name and | ||
103 | * we don't let putname() free it (instead we free all of the saved | ||
104 | * pointers at syscall exit time). | ||
105 | * | ||
106 | * Further, in fs/namei.c:path_lookup() we store the inode and device. | ||
107 | */ | ||
108 | struct audit_names { | ||
109 | struct list_head list; /* audit_context->names_list */ | ||
110 | struct filename *name; | ||
111 | unsigned long ino; | ||
112 | dev_t dev; | ||
113 | umode_t mode; | ||
114 | kuid_t uid; | ||
115 | kgid_t gid; | ||
116 | dev_t rdev; | ||
117 | u32 osid; | ||
118 | struct audit_cap_data fcap; | ||
119 | unsigned int fcap_ver; | ||
120 | int name_len; /* number of name's characters to log */ | ||
121 | unsigned char type; /* record type */ | ||
122 | bool name_put; /* call __putname() for this name */ | ||
123 | /* | ||
124 | * This was an allocated audit_names and not from the array of | ||
125 | * names allocated in the task audit context. Thus this name | ||
126 | * should be freed on syscall exit | ||
127 | */ | ||
128 | bool should_free; | ||
129 | }; | ||
130 | |||
131 | struct audit_aux_data { | 88 | struct audit_aux_data { |
132 | struct audit_aux_data *next; | 89 | struct audit_aux_data *next; |
133 | int type; | 90 | int type; |
@@ -175,106 +132,6 @@ struct audit_tree_refs { | |||
175 | struct audit_chunk *c[31]; | 132 | struct audit_chunk *c[31]; |
176 | }; | 133 | }; |
177 | 134 | ||
178 | /* The per-task audit context. */ | ||
179 | struct audit_context { | ||
180 | int dummy; /* must be the first element */ | ||
181 | int in_syscall; /* 1 if task is in a syscall */ | ||
182 | enum audit_state state, current_state; | ||
183 | unsigned int serial; /* serial number for record */ | ||
184 | int major; /* syscall number */ | ||
185 | struct timespec ctime; /* time of syscall entry */ | ||
186 | unsigned long argv[4]; /* syscall arguments */ | ||
187 | long return_code;/* syscall return code */ | ||
188 | u64 prio; | ||
189 | int return_valid; /* return code is valid */ | ||
190 | /* | ||
191 | * The names_list is the list of all audit_names collected during this | ||
192 | * syscall. The first AUDIT_NAMES entries in the names_list will | ||
193 | * actually be from the preallocated_names array for performance | ||
194 | * reasons. Except during allocation they should never be referenced | ||
195 | * through the preallocated_names array and should only be found/used | ||
196 | * by running the names_list. | ||
197 | */ | ||
198 | struct audit_names preallocated_names[AUDIT_NAMES]; | ||
199 | int name_count; /* total records in names_list */ | ||
200 | struct list_head names_list; /* anchor for struct audit_names->list */ | ||
201 | char * filterkey; /* key for rule that triggered record */ | ||
202 | struct path pwd; | ||
203 | struct audit_aux_data *aux; | ||
204 | struct audit_aux_data *aux_pids; | ||
205 | struct sockaddr_storage *sockaddr; | ||
206 | size_t sockaddr_len; | ||
207 | /* Save things to print about task_struct */ | ||
208 | pid_t pid, ppid; | ||
209 | kuid_t uid, euid, suid, fsuid; | ||
210 | kgid_t gid, egid, sgid, fsgid; | ||
211 | unsigned long personality; | ||
212 | int arch; | ||
213 | |||
214 | pid_t target_pid; | ||
215 | kuid_t target_auid; | ||
216 | kuid_t target_uid; | ||
217 | unsigned int target_sessionid; | ||
218 | u32 target_sid; | ||
219 | char target_comm[TASK_COMM_LEN]; | ||
220 | |||
221 | struct audit_tree_refs *trees, *first_trees; | ||
222 | struct list_head killed_trees; | ||
223 | int tree_count; | ||
224 | |||
225 | int type; | ||
226 | union { | ||
227 | struct { | ||
228 | int nargs; | ||
229 | long args[6]; | ||
230 | } socketcall; | ||
231 | struct { | ||
232 | kuid_t uid; | ||
233 | kgid_t gid; | ||
234 | umode_t mode; | ||
235 | u32 osid; | ||
236 | int has_perm; | ||
237 | uid_t perm_uid; | ||
238 | gid_t perm_gid; | ||
239 | umode_t perm_mode; | ||
240 | unsigned long qbytes; | ||
241 | } ipc; | ||
242 | struct { | ||
243 | mqd_t mqdes; | ||
244 | struct mq_attr mqstat; | ||
245 | } mq_getsetattr; | ||
246 | struct { | ||
247 | mqd_t mqdes; | ||
248 | int sigev_signo; | ||
249 | } mq_notify; | ||
250 | struct { | ||
251 | mqd_t mqdes; | ||
252 | size_t msg_len; | ||
253 | unsigned int msg_prio; | ||
254 | struct timespec abs_timeout; | ||
255 | } mq_sendrecv; | ||
256 | struct { | ||
257 | int oflag; | ||
258 | umode_t mode; | ||
259 | struct mq_attr attr; | ||
260 | } mq_open; | ||
261 | struct { | ||
262 | pid_t pid; | ||
263 | struct audit_cap_data cap; | ||
264 | } capset; | ||
265 | struct { | ||
266 | int fd; | ||
267 | int flags; | ||
268 | } mmap; | ||
269 | }; | ||
270 | int fds[2]; | ||
271 | |||
272 | #if AUDIT_DEBUG | ||
273 | int put_count; | ||
274 | int ino_count; | ||
275 | #endif | ||
276 | }; | ||
277 | |||
278 | static inline int open_arg(int flags, int mask) | 135 | static inline int open_arg(int flags, int mask) |
279 | { | 136 | { |
280 | int n = ACC_MODE(flags); | 137 | int n = ACC_MODE(flags); |
@@ -633,9 +490,23 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
633 | break; | 490 | break; |
634 | case AUDIT_GID: | 491 | case AUDIT_GID: |
635 | result = audit_gid_comparator(cred->gid, f->op, f->gid); | 492 | result = audit_gid_comparator(cred->gid, f->op, f->gid); |
493 | if (f->op == Audit_equal) { | ||
494 | if (!result) | ||
495 | result = in_group_p(f->gid); | ||
496 | } else if (f->op == Audit_not_equal) { | ||
497 | if (result) | ||
498 | result = !in_group_p(f->gid); | ||
499 | } | ||
636 | break; | 500 | break; |
637 | case AUDIT_EGID: | 501 | case AUDIT_EGID: |
638 | result = audit_gid_comparator(cred->egid, f->op, f->gid); | 502 | result = audit_gid_comparator(cred->egid, f->op, f->gid); |
503 | if (f->op == Audit_equal) { | ||
504 | if (!result) | ||
505 | result = in_egroup_p(f->gid); | ||
506 | } else if (f->op == Audit_not_equal) { | ||
507 | if (result) | ||
508 | result = !in_egroup_p(f->gid); | ||
509 | } | ||
639 | break; | 510 | break; |
640 | case AUDIT_SGID: | 511 | case AUDIT_SGID: |
641 | result = audit_gid_comparator(cred->sgid, f->op, f->gid); | 512 | result = audit_gid_comparator(cred->sgid, f->op, f->gid); |
@@ -742,6 +613,9 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
742 | if (ctx) | 613 | if (ctx) |
743 | result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); | 614 | result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); |
744 | break; | 615 | break; |
616 | case AUDIT_LOGINUID_SET: | ||
617 | result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); | ||
618 | break; | ||
745 | case AUDIT_SUBJ_USER: | 619 | case AUDIT_SUBJ_USER: |
746 | case AUDIT_SUBJ_ROLE: | 620 | case AUDIT_SUBJ_ROLE: |
747 | case AUDIT_SUBJ_TYPE: | 621 | case AUDIT_SUBJ_TYPE: |
@@ -987,6 +861,8 @@ static inline void audit_free_names(struct audit_context *context) | |||
987 | 861 | ||
988 | #if AUDIT_DEBUG == 2 | 862 | #if AUDIT_DEBUG == 2 |
989 | if (context->put_count + context->ino_count != context->name_count) { | 863 | if (context->put_count + context->ino_count != context->name_count) { |
864 | int i = 0; | ||
865 | |||
990 | printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" | 866 | printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" |
991 | " name_count=%d put_count=%d" | 867 | " name_count=%d put_count=%d" |
992 | " ino_count=%d [NOT freeing]\n", | 868 | " ino_count=%d [NOT freeing]\n", |
@@ -995,7 +871,7 @@ static inline void audit_free_names(struct audit_context *context) | |||
995 | context->name_count, context->put_count, | 871 | context->name_count, context->put_count, |
996 | context->ino_count); | 872 | context->ino_count); |
997 | list_for_each_entry(n, &context->names_list, list) { | 873 | list_for_each_entry(n, &context->names_list, list) { |
998 | printk(KERN_ERR "names[%d] = %p = %s\n", i, | 874 | printk(KERN_ERR "names[%d] = %p = %s\n", i++, |
999 | n->name, n->name->name ?: "(null)"); | 875 | n->name, n->name->name ?: "(null)"); |
1000 | } | 876 | } |
1001 | dump_stack(); | 877 | dump_stack(); |
@@ -1010,7 +886,7 @@ static inline void audit_free_names(struct audit_context *context) | |||
1010 | list_for_each_entry_safe(n, next, &context->names_list, list) { | 886 | list_for_each_entry_safe(n, next, &context->names_list, list) { |
1011 | list_del(&n->list); | 887 | list_del(&n->list); |
1012 | if (n->name && n->name_put) | 888 | if (n->name && n->name_put) |
1013 | __putname(n->name); | 889 | final_putname(n->name); |
1014 | if (n->should_free) | 890 | if (n->should_free) |
1015 | kfree(n); | 891 | kfree(n); |
1016 | } | 892 | } |
@@ -1034,21 +910,15 @@ static inline void audit_free_aux(struct audit_context *context) | |||
1034 | } | 910 | } |
1035 | } | 911 | } |
1036 | 912 | ||
1037 | static inline void audit_zero_context(struct audit_context *context, | ||
1038 | enum audit_state state) | ||
1039 | { | ||
1040 | memset(context, 0, sizeof(*context)); | ||
1041 | context->state = state; | ||
1042 | context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; | ||
1043 | } | ||
1044 | |||
1045 | static inline struct audit_context *audit_alloc_context(enum audit_state state) | 913 | static inline struct audit_context *audit_alloc_context(enum audit_state state) |
1046 | { | 914 | { |
1047 | struct audit_context *context; | 915 | struct audit_context *context; |
1048 | 916 | ||
1049 | if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) | 917 | context = kzalloc(sizeof(*context), GFP_KERNEL); |
918 | if (!context) | ||
1050 | return NULL; | 919 | return NULL; |
1051 | audit_zero_context(context, state); | 920 | context->state = state; |
921 | context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; | ||
1052 | INIT_LIST_HEAD(&context->killed_trees); | 922 | INIT_LIST_HEAD(&context->killed_trees); |
1053 | INIT_LIST_HEAD(&context->names_list); | 923 | INIT_LIST_HEAD(&context->names_list); |
1054 | return context; | 924 | return context; |
@@ -1099,88 +969,6 @@ static inline void audit_free_context(struct audit_context *context) | |||
1099 | kfree(context); | 969 | kfree(context); |
1100 | } | 970 | } |
1101 | 971 | ||
1102 | void audit_log_task_context(struct audit_buffer *ab) | ||
1103 | { | ||
1104 | char *ctx = NULL; | ||
1105 | unsigned len; | ||
1106 | int error; | ||
1107 | u32 sid; | ||
1108 | |||
1109 | security_task_getsecid(current, &sid); | ||
1110 | if (!sid) | ||
1111 | return; | ||
1112 | |||
1113 | error = security_secid_to_secctx(sid, &ctx, &len); | ||
1114 | if (error) { | ||
1115 | if (error != -EINVAL) | ||
1116 | goto error_path; | ||
1117 | return; | ||
1118 | } | ||
1119 | |||
1120 | audit_log_format(ab, " subj=%s", ctx); | ||
1121 | security_release_secctx(ctx, len); | ||
1122 | return; | ||
1123 | |||
1124 | error_path: | ||
1125 | audit_panic("error in audit_log_task_context"); | ||
1126 | return; | ||
1127 | } | ||
1128 | |||
1129 | EXPORT_SYMBOL(audit_log_task_context); | ||
1130 | |||
1131 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | ||
1132 | { | ||
1133 | const struct cred *cred; | ||
1134 | char name[sizeof(tsk->comm)]; | ||
1135 | struct mm_struct *mm = tsk->mm; | ||
1136 | char *tty; | ||
1137 | |||
1138 | if (!ab) | ||
1139 | return; | ||
1140 | |||
1141 | /* tsk == current */ | ||
1142 | cred = current_cred(); | ||
1143 | |||
1144 | spin_lock_irq(&tsk->sighand->siglock); | ||
1145 | if (tsk->signal && tsk->signal->tty) | ||
1146 | tty = tsk->signal->tty->name; | ||
1147 | else | ||
1148 | tty = "(none)"; | ||
1149 | spin_unlock_irq(&tsk->sighand->siglock); | ||
1150 | |||
1151 | |||
1152 | audit_log_format(ab, | ||
1153 | " ppid=%ld pid=%d auid=%u uid=%u gid=%u" | ||
1154 | " euid=%u suid=%u fsuid=%u" | ||
1155 | " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", | ||
1156 | sys_getppid(), | ||
1157 | tsk->pid, | ||
1158 | from_kuid(&init_user_ns, tsk->loginuid), | ||
1159 | from_kuid(&init_user_ns, cred->uid), | ||
1160 | from_kgid(&init_user_ns, cred->gid), | ||
1161 | from_kuid(&init_user_ns, cred->euid), | ||
1162 | from_kuid(&init_user_ns, cred->suid), | ||
1163 | from_kuid(&init_user_ns, cred->fsuid), | ||
1164 | from_kgid(&init_user_ns, cred->egid), | ||
1165 | from_kgid(&init_user_ns, cred->sgid), | ||
1166 | from_kgid(&init_user_ns, cred->fsgid), | ||
1167 | tsk->sessionid, tty); | ||
1168 | |||
1169 | get_task_comm(name, tsk); | ||
1170 | audit_log_format(ab, " comm="); | ||
1171 | audit_log_untrustedstring(ab, name); | ||
1172 | |||
1173 | if (mm) { | ||
1174 | down_read(&mm->mmap_sem); | ||
1175 | if (mm->exe_file) | ||
1176 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); | ||
1177 | up_read(&mm->mmap_sem); | ||
1178 | } | ||
1179 | audit_log_task_context(ab); | ||
1180 | } | ||
1181 | |||
1182 | EXPORT_SYMBOL(audit_log_task_info); | ||
1183 | |||
1184 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, | 972 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, |
1185 | kuid_t auid, kuid_t uid, unsigned int sessionid, | 973 | kuid_t auid, kuid_t uid, unsigned int sessionid, |
1186 | u32 sid, char *comm) | 974 | u32 sid, char *comm) |
@@ -1197,12 +985,14 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
1197 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, | 985 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, |
1198 | from_kuid(&init_user_ns, auid), | 986 | from_kuid(&init_user_ns, auid), |
1199 | from_kuid(&init_user_ns, uid), sessionid); | 987 | from_kuid(&init_user_ns, uid), sessionid); |
1200 | if (security_secid_to_secctx(sid, &ctx, &len)) { | 988 | if (sid) { |
1201 | audit_log_format(ab, " obj=(none)"); | 989 | if (security_secid_to_secctx(sid, &ctx, &len)) { |
1202 | rc = 1; | 990 | audit_log_format(ab, " obj=(none)"); |
1203 | } else { | 991 | rc = 1; |
1204 | audit_log_format(ab, " obj=%s", ctx); | 992 | } else { |
1205 | security_release_secctx(ctx, len); | 993 | audit_log_format(ab, " obj=%s", ctx); |
994 | security_release_secctx(ctx, len); | ||
995 | } | ||
1206 | } | 996 | } |
1207 | audit_log_format(ab, " ocomm="); | 997 | audit_log_format(ab, " ocomm="); |
1208 | audit_log_untrustedstring(ab, comm); | 998 | audit_log_untrustedstring(ab, comm); |
@@ -1396,35 +1186,6 @@ static void audit_log_execve_info(struct audit_context *context, | |||
1396 | kfree(buf); | 1186 | kfree(buf); |
1397 | } | 1187 | } |
1398 | 1188 | ||
1399 | static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) | ||
1400 | { | ||
1401 | int i; | ||
1402 | |||
1403 | audit_log_format(ab, " %s=", prefix); | ||
1404 | CAP_FOR_EACH_U32(i) { | ||
1405 | audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); | ||
1406 | } | ||
1407 | } | ||
1408 | |||
1409 | static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) | ||
1410 | { | ||
1411 | kernel_cap_t *perm = &name->fcap.permitted; | ||
1412 | kernel_cap_t *inh = &name->fcap.inheritable; | ||
1413 | int log = 0; | ||
1414 | |||
1415 | if (!cap_isclear(*perm)) { | ||
1416 | audit_log_cap(ab, "cap_fp", perm); | ||
1417 | log = 1; | ||
1418 | } | ||
1419 | if (!cap_isclear(*inh)) { | ||
1420 | audit_log_cap(ab, "cap_fi", inh); | ||
1421 | log = 1; | ||
1422 | } | ||
1423 | |||
1424 | if (log) | ||
1425 | audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); | ||
1426 | } | ||
1427 | |||
1428 | static void show_special(struct audit_context *context, int *call_panic) | 1189 | static void show_special(struct audit_context *context, int *call_panic) |
1429 | { | 1190 | { |
1430 | struct audit_buffer *ab; | 1191 | struct audit_buffer *ab; |
@@ -1522,68 +1283,6 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1522 | audit_log_end(ab); | 1283 | audit_log_end(ab); |
1523 | } | 1284 | } |
1524 | 1285 | ||
1525 | static void audit_log_name(struct audit_context *context, struct audit_names *n, | ||
1526 | int record_num, int *call_panic) | ||
1527 | { | ||
1528 | struct audit_buffer *ab; | ||
1529 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | ||
1530 | if (!ab) | ||
1531 | return; /* audit_panic has been called */ | ||
1532 | |||
1533 | audit_log_format(ab, "item=%d", record_num); | ||
1534 | |||
1535 | if (n->name) { | ||
1536 | switch (n->name_len) { | ||
1537 | case AUDIT_NAME_FULL: | ||
1538 | /* log the full path */ | ||
1539 | audit_log_format(ab, " name="); | ||
1540 | audit_log_untrustedstring(ab, n->name->name); | ||
1541 | break; | ||
1542 | case 0: | ||
1543 | /* name was specified as a relative path and the | ||
1544 | * directory component is the cwd */ | ||
1545 | audit_log_d_path(ab, " name=", &context->pwd); | ||
1546 | break; | ||
1547 | default: | ||
1548 | /* log the name's directory component */ | ||
1549 | audit_log_format(ab, " name="); | ||
1550 | audit_log_n_untrustedstring(ab, n->name->name, | ||
1551 | n->name_len); | ||
1552 | } | ||
1553 | } else | ||
1554 | audit_log_format(ab, " name=(null)"); | ||
1555 | |||
1556 | if (n->ino != (unsigned long)-1) { | ||
1557 | audit_log_format(ab, " inode=%lu" | ||
1558 | " dev=%02x:%02x mode=%#ho" | ||
1559 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
1560 | n->ino, | ||
1561 | MAJOR(n->dev), | ||
1562 | MINOR(n->dev), | ||
1563 | n->mode, | ||
1564 | from_kuid(&init_user_ns, n->uid), | ||
1565 | from_kgid(&init_user_ns, n->gid), | ||
1566 | MAJOR(n->rdev), | ||
1567 | MINOR(n->rdev)); | ||
1568 | } | ||
1569 | if (n->osid != 0) { | ||
1570 | char *ctx = NULL; | ||
1571 | u32 len; | ||
1572 | if (security_secid_to_secctx( | ||
1573 | n->osid, &ctx, &len)) { | ||
1574 | audit_log_format(ab, " osid=%u", n->osid); | ||
1575 | *call_panic = 2; | ||
1576 | } else { | ||
1577 | audit_log_format(ab, " obj=%s", ctx); | ||
1578 | security_release_secctx(ctx, len); | ||
1579 | } | ||
1580 | } | ||
1581 | |||
1582 | audit_log_fcaps(ab, n); | ||
1583 | |||
1584 | audit_log_end(ab); | ||
1585 | } | ||
1586 | |||
1587 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 1286 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
1588 | { | 1287 | { |
1589 | int i, call_panic = 0; | 1288 | int i, call_panic = 0; |
@@ -1701,7 +1400,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1701 | 1400 | ||
1702 | i = 0; | 1401 | i = 0; |
1703 | list_for_each_entry(n, &context->names_list, list) | 1402 | list_for_each_entry(n, &context->names_list, list) |
1704 | audit_log_name(context, n, i++, &call_panic); | 1403 | audit_log_name(context, n, NULL, i++, &call_panic); |
1705 | 1404 | ||
1706 | /* Send end of event record to help user space know we are finished */ | 1405 | /* Send end of event record to help user space know we are finished */ |
1707 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); | 1406 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); |
@@ -2036,18 +1735,18 @@ void audit_putname(struct filename *name) | |||
2036 | BUG_ON(!context); | 1735 | BUG_ON(!context); |
2037 | if (!context->in_syscall) { | 1736 | if (!context->in_syscall) { |
2038 | #if AUDIT_DEBUG == 2 | 1737 | #if AUDIT_DEBUG == 2 |
2039 | printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", | 1738 | printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", |
2040 | __FILE__, __LINE__, context->serial, name); | 1739 | __FILE__, __LINE__, context->serial, name); |
2041 | if (context->name_count) { | 1740 | if (context->name_count) { |
2042 | struct audit_names *n; | 1741 | struct audit_names *n; |
2043 | int i; | 1742 | int i = 0; |
2044 | 1743 | ||
2045 | list_for_each_entry(n, &context->names_list, list) | 1744 | list_for_each_entry(n, &context->names_list, list) |
2046 | printk(KERN_ERR "name[%d] = %p = %s\n", i, | 1745 | printk(KERN_ERR "name[%d] = %p = %s\n", i++, |
2047 | n->name, n->name->name ?: "(null)"); | 1746 | n->name, n->name->name ?: "(null)"); |
2048 | } | 1747 | } |
2049 | #endif | 1748 | #endif |
2050 | __putname(name); | 1749 | final_putname(name); |
2051 | } | 1750 | } |
2052 | #if AUDIT_DEBUG | 1751 | #if AUDIT_DEBUG |
2053 | else { | 1752 | else { |
@@ -2066,41 +1765,6 @@ void audit_putname(struct filename *name) | |||
2066 | #endif | 1765 | #endif |
2067 | } | 1766 | } |
2068 | 1767 | ||
2069 | static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) | ||
2070 | { | ||
2071 | struct cpu_vfs_cap_data caps; | ||
2072 | int rc; | ||
2073 | |||
2074 | if (!dentry) | ||
2075 | return 0; | ||
2076 | |||
2077 | rc = get_vfs_caps_from_disk(dentry, &caps); | ||
2078 | if (rc) | ||
2079 | return rc; | ||
2080 | |||
2081 | name->fcap.permitted = caps.permitted; | ||
2082 | name->fcap.inheritable = caps.inheritable; | ||
2083 | name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); | ||
2084 | name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; | ||
2085 | |||
2086 | return 0; | ||
2087 | } | ||
2088 | |||
2089 | |||
2090 | /* Copy inode data into an audit_names. */ | ||
2091 | static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, | ||
2092 | const struct inode *inode) | ||
2093 | { | ||
2094 | name->ino = inode->i_ino; | ||
2095 | name->dev = inode->i_sb->s_dev; | ||
2096 | name->mode = inode->i_mode; | ||
2097 | name->uid = inode->i_uid; | ||
2098 | name->gid = inode->i_gid; | ||
2099 | name->rdev = inode->i_rdev; | ||
2100 | security_inode_getsecid(inode, &name->osid); | ||
2101 | audit_copy_fcaps(name, dentry); | ||
2102 | } | ||
2103 | |||
2104 | /** | 1768 | /** |
2105 | * __audit_inode - store the inode and device from a lookup | 1769 | * __audit_inode - store the inode and device from a lookup |
2106 | * @name: name being audited | 1770 | * @name: name being audited |
@@ -2309,7 +1973,7 @@ int audit_set_loginuid(kuid_t loginuid) | |||
2309 | unsigned int sessionid; | 1973 | unsigned int sessionid; |
2310 | 1974 | ||
2311 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE | 1975 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE |
2312 | if (uid_valid(task->loginuid)) | 1976 | if (audit_loginuid_set(task)) |
2313 | return -EPERM; | 1977 | return -EPERM; |
2314 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ | 1978 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ |
2315 | if (!capable(CAP_AUDIT_CONTROL)) | 1979 | if (!capable(CAP_AUDIT_CONTROL)) |
@@ -2477,17 +2141,20 @@ int __audit_bprm(struct linux_binprm *bprm) | |||
2477 | 2141 | ||
2478 | /** | 2142 | /** |
2479 | * audit_socketcall - record audit data for sys_socketcall | 2143 | * audit_socketcall - record audit data for sys_socketcall |
2480 | * @nargs: number of args | 2144 | * @nargs: number of args, which should not be more than AUDITSC_ARGS. |
2481 | * @args: args array | 2145 | * @args: args array |
2482 | * | 2146 | * |
2483 | */ | 2147 | */ |
2484 | void __audit_socketcall(int nargs, unsigned long *args) | 2148 | int __audit_socketcall(int nargs, unsigned long *args) |
2485 | { | 2149 | { |
2486 | struct audit_context *context = current->audit_context; | 2150 | struct audit_context *context = current->audit_context; |
2487 | 2151 | ||
2152 | if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) | ||
2153 | return -EINVAL; | ||
2488 | context->type = AUDIT_SOCKETCALL; | 2154 | context->type = AUDIT_SOCKETCALL; |
2489 | context->socketcall.nargs = nargs; | 2155 | context->socketcall.nargs = nargs; |
2490 | memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); | 2156 | memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); |
2157 | return 0; | ||
2491 | } | 2158 | } |
2492 | 2159 | ||
2493 | /** | 2160 | /** |
diff --git a/kernel/capability.c b/kernel/capability.c index 493d97259484..f6c2ce5701e1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap) | |||
393 | EXPORT_SYMBOL(ns_capable); | 393 | EXPORT_SYMBOL(ns_capable); |
394 | 394 | ||
395 | /** | 395 | /** |
396 | * file_ns_capable - Determine if the file's opener had a capability in effect | ||
397 | * @file: The file we want to check | ||
398 | * @ns: The usernamespace we want the capability in | ||
399 | * @cap: The capability to be tested for | ||
400 | * | ||
401 | * Return true if task that opened the file had a capability in effect | ||
402 | * when the file was opened. | ||
403 | * | ||
404 | * This does not set PF_SUPERPRIV because the caller may not | ||
405 | * actually be privileged. | ||
406 | */ | ||
407 | bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) | ||
408 | { | ||
409 | if (WARN_ON_ONCE(!cap_valid(cap))) | ||
410 | return false; | ||
411 | |||
412 | if (security_capable(file->f_cred, ns, cap) == 0) | ||
413 | return true; | ||
414 | |||
415 | return false; | ||
416 | } | ||
417 | EXPORT_SYMBOL(file_ns_capable); | ||
418 | |||
419 | /** | ||
396 | * capable - Determine if the current task has a superior capability in effect | 420 | * capable - Determine if the current task has a superior capability in effect |
397 | * @cap: The capability to be tested for | 421 | * @cap: The capability to be tested for |
398 | * | 422 | * |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..a7c9e6ddb979 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/cred.h> | 30 | #include <linux/cred.h> |
31 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> |
32 | #include <linux/errno.h> | 32 | #include <linux/errno.h> |
33 | #include <linux/fs.h> | ||
34 | #include <linux/init_task.h> | 33 | #include <linux/init_task.h> |
35 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
36 | #include <linux/list.h> | 35 | #include <linux/list.h> |
@@ -59,7 +58,7 @@ | |||
59 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
60 | #include <linux/eventfd.h> | 59 | #include <linux/eventfd.h> |
61 | #include <linux/poll.h> | 60 | #include <linux/poll.h> |
62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | 61 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ |
63 | #include <linux/kthread.h> | 62 | #include <linux/kthread.h> |
64 | 63 | ||
65 | #include <linux/atomic.h> | 64 | #include <linux/atomic.h> |
@@ -83,7 +82,13 @@ | |||
83 | * B happens only through cgroup_show_options() and using cgroup_root_mutex | 82 | * B happens only through cgroup_show_options() and using cgroup_root_mutex |
84 | * breaks it. | 83 | * breaks it. |
85 | */ | 84 | */ |
85 | #ifdef CONFIG_PROVE_RCU | ||
86 | DEFINE_MUTEX(cgroup_mutex); | ||
87 | EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ | ||
88 | #else | ||
86 | static DEFINE_MUTEX(cgroup_mutex); | 89 | static DEFINE_MUTEX(cgroup_mutex); |
90 | #endif | ||
91 | |||
87 | static DEFINE_MUTEX(cgroup_root_mutex); | 92 | static DEFINE_MUTEX(cgroup_root_mutex); |
88 | 93 | ||
89 | /* | 94 | /* |
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { | |||
98 | #include <linux/cgroup_subsys.h> | 103 | #include <linux/cgroup_subsys.h> |
99 | }; | 104 | }; |
100 | 105 | ||
101 | #define MAX_CGROUP_ROOT_NAMELEN 64 | ||
102 | |||
103 | /* | ||
104 | * A cgroupfs_root represents the root of a cgroup hierarchy, | ||
105 | * and may be associated with a superblock to form an active | ||
106 | * hierarchy | ||
107 | */ | ||
108 | struct cgroupfs_root { | ||
109 | struct super_block *sb; | ||
110 | |||
111 | /* | ||
112 | * The bitmask of subsystems intended to be attached to this | ||
113 | * hierarchy | ||
114 | */ | ||
115 | unsigned long subsys_mask; | ||
116 | |||
117 | /* Unique id for this hierarchy. */ | ||
118 | int hierarchy_id; | ||
119 | |||
120 | /* The bitmask of subsystems currently attached to this hierarchy */ | ||
121 | unsigned long actual_subsys_mask; | ||
122 | |||
123 | /* A list running through the attached subsystems */ | ||
124 | struct list_head subsys_list; | ||
125 | |||
126 | /* The root cgroup for this hierarchy */ | ||
127 | struct cgroup top_cgroup; | ||
128 | |||
129 | /* Tracks how many cgroups are currently defined in hierarchy.*/ | ||
130 | int number_of_cgroups; | ||
131 | |||
132 | /* A list running through the active hierarchies */ | ||
133 | struct list_head root_list; | ||
134 | |||
135 | /* All cgroups on this root, cgroup_mutex protected */ | ||
136 | struct list_head allcg_list; | ||
137 | |||
138 | /* Hierarchy-specific flags */ | ||
139 | unsigned long flags; | ||
140 | |||
141 | /* IDs for cgroups in this hierarchy */ | ||
142 | struct ida cgroup_ida; | ||
143 | |||
144 | /* The path to use for release notifications. */ | ||
145 | char release_agent_path[PATH_MAX]; | ||
146 | |||
147 | /* The name for this hierarchy - may be empty */ | ||
148 | char name[MAX_CGROUP_ROOT_NAMELEN]; | ||
149 | }; | ||
150 | |||
151 | /* | 106 | /* |
152 | * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the | 107 | * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the |
153 | * subsystems that are otherwise unattached - it never has more than a | 108 | * subsystems that are otherwise unattached - it never has more than a |
@@ -162,6 +117,9 @@ struct cfent { | |||
162 | struct list_head node; | 117 | struct list_head node; |
163 | struct dentry *dentry; | 118 | struct dentry *dentry; |
164 | struct cftype *type; | 119 | struct cftype *type; |
120 | |||
121 | /* file xattrs */ | ||
122 | struct simple_xattrs xattrs; | ||
165 | }; | 123 | }; |
166 | 124 | ||
167 | /* | 125 | /* |
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
238 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ | 196 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ |
239 | #define dummytop (&rootnode.top_cgroup) | 197 | #define dummytop (&rootnode.top_cgroup) |
240 | 198 | ||
199 | static struct cgroup_name root_cgroup_name = { .name = "/" }; | ||
200 | |||
241 | /* This flag indicates whether tasks in the fork and exit paths should | 201 | /* This flag indicates whether tasks in the fork and exit paths should |
242 | * check for fork/exit handlers to call. This avoids us having to do | 202 | * check for fork/exit handlers to call. This avoids us having to do |
243 | * extra work in the fork/exit path if none of the subsystems need to | 203 | * extra work in the fork/exit path if none of the subsystems need to |
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); | |||
249 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 209 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
250 | struct cftype cfts[], bool is_add); | 210 | struct cftype cfts[], bool is_add); |
251 | 211 | ||
252 | #ifdef CONFIG_PROVE_LOCKING | ||
253 | int cgroup_lock_is_held(void) | ||
254 | { | ||
255 | return lockdep_is_held(&cgroup_mutex); | ||
256 | } | ||
257 | #else /* #ifdef CONFIG_PROVE_LOCKING */ | ||
258 | int cgroup_lock_is_held(void) | ||
259 | { | ||
260 | return mutex_is_locked(&cgroup_mutex); | ||
261 | } | ||
262 | #endif /* #else #ifdef CONFIG_PROVE_LOCKING */ | ||
263 | |||
264 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | ||
265 | |||
266 | static int css_unbias_refcnt(int refcnt) | 212 | static int css_unbias_refcnt(int refcnt) |
267 | { | 213 | { |
268 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; | 214 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; |
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) | |||
282 | return test_bit(CGRP_REMOVED, &cgrp->flags); | 228 | return test_bit(CGRP_REMOVED, &cgrp->flags); |
283 | } | 229 | } |
284 | 230 | ||
285 | /* bits in struct cgroupfs_root flags field */ | 231 | /** |
286 | enum { | 232 | * cgroup_is_descendant - test ancestry |
287 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ | 233 | * @cgrp: the cgroup to be tested |
288 | ROOT_XATTR, /* supports extended attributes */ | 234 | * @ancestor: possible ancestor of @cgrp |
289 | }; | 235 | * |
236 | * Test whether @cgrp is a descendant of @ancestor. It also returns %true | ||
237 | * if @cgrp == @ancestor. This function is safe to call as long as @cgrp | ||
238 | * and @ancestor are accessible. | ||
239 | */ | ||
240 | bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) | ||
241 | { | ||
242 | while (cgrp) { | ||
243 | if (cgrp == ancestor) | ||
244 | return true; | ||
245 | cgrp = cgrp->parent; | ||
246 | } | ||
247 | return false; | ||
248 | } | ||
249 | EXPORT_SYMBOL_GPL(cgroup_is_descendant); | ||
290 | 250 | ||
291 | static int cgroup_is_releasable(const struct cgroup *cgrp) | 251 | static int cgroup_is_releasable(const struct cgroup *cgrp) |
292 | { | 252 | { |
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
327 | return __d_cfe(dentry)->type; | 287 | return __d_cfe(dentry)->type; |
328 | } | 288 | } |
329 | 289 | ||
290 | /** | ||
291 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. | ||
292 | * @cgrp: the cgroup to be checked for liveness | ||
293 | * | ||
294 | * On success, returns true; the mutex should be later unlocked. On | ||
295 | * failure returns false with no lock held. | ||
296 | */ | ||
297 | static bool cgroup_lock_live_group(struct cgroup *cgrp) | ||
298 | { | ||
299 | mutex_lock(&cgroup_mutex); | ||
300 | if (cgroup_is_removed(cgrp)) { | ||
301 | mutex_unlock(&cgroup_mutex); | ||
302 | return false; | ||
303 | } | ||
304 | return true; | ||
305 | } | ||
306 | |||
330 | /* the list of cgroups eligible for automatic release. Protected by | 307 | /* the list of cgroups eligible for automatic release. Protected by |
331 | * release_list_lock */ | 308 | * release_list_lock */ |
332 | static LIST_HEAD(release_list); | 309 | static LIST_HEAD(release_list); |
@@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
800 | * update of a tasks cgroup pointer by cgroup_attach_task() | 777 | * update of a tasks cgroup pointer by cgroup_attach_task() |
801 | */ | 778 | */ |
802 | 779 | ||
803 | /** | ||
804 | * cgroup_lock - lock out any changes to cgroup structures | ||
805 | * | ||
806 | */ | ||
807 | void cgroup_lock(void) | ||
808 | { | ||
809 | mutex_lock(&cgroup_mutex); | ||
810 | } | ||
811 | EXPORT_SYMBOL_GPL(cgroup_lock); | ||
812 | |||
813 | /** | ||
814 | * cgroup_unlock - release lock on cgroup changes | ||
815 | * | ||
816 | * Undo the lock taken in a previous cgroup_lock() call. | ||
817 | */ | ||
818 | void cgroup_unlock(void) | ||
819 | { | ||
820 | mutex_unlock(&cgroup_mutex); | ||
821 | } | ||
822 | EXPORT_SYMBOL_GPL(cgroup_unlock); | ||
823 | |||
824 | /* | 780 | /* |
825 | * A couple of forward declarations required, due to cyclic reference loop: | 781 | * A couple of forward declarations required, due to cyclic reference loop: |
826 | * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> | 782 | * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> |
@@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
859 | return inode; | 815 | return inode; |
860 | } | 816 | } |
861 | 817 | ||
818 | static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) | ||
819 | { | ||
820 | struct cgroup_name *name; | ||
821 | |||
822 | name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); | ||
823 | if (!name) | ||
824 | return NULL; | ||
825 | strcpy(name->name, dentry->d_name.name); | ||
826 | return name; | ||
827 | } | ||
828 | |||
862 | static void cgroup_free_fn(struct work_struct *work) | 829 | static void cgroup_free_fn(struct work_struct *work) |
863 | { | 830 | { |
864 | struct cgroup *cgrp = container_of(work, struct cgroup, free_work); | 831 | struct cgroup *cgrp = container_of(work, struct cgroup, free_work); |
@@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work) | |||
875 | mutex_unlock(&cgroup_mutex); | 842 | mutex_unlock(&cgroup_mutex); |
876 | 843 | ||
877 | /* | 844 | /* |
845 | * We get a ref to the parent's dentry, and put the ref when | ||
846 | * this cgroup is being freed, so it's guaranteed that the | ||
847 | * parent won't be destroyed before its children. | ||
848 | */ | ||
849 | dput(cgrp->parent->dentry); | ||
850 | |||
851 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
852 | |||
853 | /* | ||
878 | * Drop the active superblock reference that we took when we | 854 | * Drop the active superblock reference that we took when we |
879 | * created the cgroup | 855 | * created the cgroup. This will free cgrp->root, if we are |
856 | * holding the last reference to @sb. | ||
880 | */ | 857 | */ |
881 | deactivate_super(cgrp->root->sb); | 858 | deactivate_super(cgrp->root->sb); |
882 | 859 | ||
@@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work) | |||
888 | 865 | ||
889 | simple_xattrs_free(&cgrp->xattrs); | 866 | simple_xattrs_free(&cgrp->xattrs); |
890 | 867 | ||
891 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | 868 | kfree(rcu_dereference_raw(cgrp->name)); |
892 | kfree(cgrp); | 869 | kfree(cgrp); |
893 | } | 870 | } |
894 | 871 | ||
@@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
910 | } else { | 887 | } else { |
911 | struct cfent *cfe = __d_cfe(dentry); | 888 | struct cfent *cfe = __d_cfe(dentry); |
912 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | 889 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; |
913 | struct cftype *cft = cfe->type; | ||
914 | 890 | ||
915 | WARN_ONCE(!list_empty(&cfe->node) && | 891 | WARN_ONCE(!list_empty(&cfe->node) && |
916 | cgrp != &cgrp->root->top_cgroup, | 892 | cgrp != &cgrp->root->top_cgroup, |
917 | "cfe still linked for %s\n", cfe->type->name); | 893 | "cfe still linked for %s\n", cfe->type->name); |
894 | simple_xattrs_free(&cfe->xattrs); | ||
918 | kfree(cfe); | 895 | kfree(cfe); |
919 | simple_xattrs_free(&cft->xattrs); | ||
920 | } | 896 | } |
921 | iput(inode); | 897 | iput(inode); |
922 | } | 898 | } |
@@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
1108 | mutex_lock(&cgroup_root_mutex); | 1084 | mutex_lock(&cgroup_root_mutex); |
1109 | for_each_subsys(root, ss) | 1085 | for_each_subsys(root, ss) |
1110 | seq_printf(seq, ",%s", ss->name); | 1086 | seq_printf(seq, ",%s", ss->name); |
1111 | if (test_bit(ROOT_NOPREFIX, &root->flags)) | 1087 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) |
1088 | seq_puts(seq, ",sane_behavior"); | ||
1089 | if (root->flags & CGRP_ROOT_NOPREFIX) | ||
1112 | seq_puts(seq, ",noprefix"); | 1090 | seq_puts(seq, ",noprefix"); |
1113 | if (test_bit(ROOT_XATTR, &root->flags)) | 1091 | if (root->flags & CGRP_ROOT_XATTR) |
1114 | seq_puts(seq, ",xattr"); | 1092 | seq_puts(seq, ",xattr"); |
1115 | if (strlen(root->release_agent_path)) | 1093 | if (strlen(root->release_agent_path)) |
1116 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1094 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
@@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1172 | all_ss = true; | 1150 | all_ss = true; |
1173 | continue; | 1151 | continue; |
1174 | } | 1152 | } |
1153 | if (!strcmp(token, "__DEVEL__sane_behavior")) { | ||
1154 | opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; | ||
1155 | continue; | ||
1156 | } | ||
1175 | if (!strcmp(token, "noprefix")) { | 1157 | if (!strcmp(token, "noprefix")) { |
1176 | set_bit(ROOT_NOPREFIX, &opts->flags); | 1158 | opts->flags |= CGRP_ROOT_NOPREFIX; |
1177 | continue; | 1159 | continue; |
1178 | } | 1160 | } |
1179 | if (!strcmp(token, "clone_children")) { | 1161 | if (!strcmp(token, "clone_children")) { |
@@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1181 | continue; | 1163 | continue; |
1182 | } | 1164 | } |
1183 | if (!strcmp(token, "xattr")) { | 1165 | if (!strcmp(token, "xattr")) { |
1184 | set_bit(ROOT_XATTR, &opts->flags); | 1166 | opts->flags |= CGRP_ROOT_XATTR; |
1185 | continue; | 1167 | continue; |
1186 | } | 1168 | } |
1187 | if (!strncmp(token, "release_agent=", 14)) { | 1169 | if (!strncmp(token, "release_agent=", 14)) { |
@@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1259 | 1241 | ||
1260 | /* Consistency checks */ | 1242 | /* Consistency checks */ |
1261 | 1243 | ||
1244 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { | ||
1245 | pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); | ||
1246 | |||
1247 | if (opts->flags & CGRP_ROOT_NOPREFIX) { | ||
1248 | pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); | ||
1249 | return -EINVAL; | ||
1250 | } | ||
1251 | |||
1252 | if (opts->cpuset_clone_children) { | ||
1253 | pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); | ||
1254 | return -EINVAL; | ||
1255 | } | ||
1256 | } | ||
1257 | |||
1262 | /* | 1258 | /* |
1263 | * Option noprefix was introduced just for backward compatibility | 1259 | * Option noprefix was introduced just for backward compatibility |
1264 | * with the old cpuset, so we allow noprefix only if mounting just | 1260 | * with the old cpuset, so we allow noprefix only if mounting just |
1265 | * the cpuset subsystem. | 1261 | * the cpuset subsystem. |
1266 | */ | 1262 | */ |
1267 | if (test_bit(ROOT_NOPREFIX, &opts->flags) && | 1263 | if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) |
1268 | (opts->subsys_mask & mask)) | ||
1269 | return -EINVAL; | 1264 | return -EINVAL; |
1270 | 1265 | ||
1271 | 1266 | ||
@@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1336 | struct cgroup_sb_opts opts; | 1331 | struct cgroup_sb_opts opts; |
1337 | unsigned long added_mask, removed_mask; | 1332 | unsigned long added_mask, removed_mask; |
1338 | 1333 | ||
1334 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { | ||
1335 | pr_err("cgroup: sane_behavior: remount is not allowed\n"); | ||
1336 | return -EINVAL; | ||
1337 | } | ||
1338 | |||
1339 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1339 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1340 | mutex_lock(&cgroup_mutex); | 1340 | mutex_lock(&cgroup_mutex); |
1341 | mutex_lock(&cgroup_root_mutex); | 1341 | mutex_lock(&cgroup_root_mutex); |
@@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1421 | INIT_LIST_HEAD(&root->allcg_list); | 1421 | INIT_LIST_HEAD(&root->allcg_list); |
1422 | root->number_of_cgroups = 1; | 1422 | root->number_of_cgroups = 1; |
1423 | cgrp->root = root; | 1423 | cgrp->root = root; |
1424 | cgrp->top_cgroup = cgrp; | 1424 | cgrp->name = &root_cgroup_name; |
1425 | init_cgroup_housekeeping(cgrp); | 1425 | init_cgroup_housekeeping(cgrp); |
1426 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 1426 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); |
1427 | } | 1427 | } |
@@ -1685,6 +1685,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1685 | * any) is not needed | 1685 | * any) is not needed |
1686 | */ | 1686 | */ |
1687 | cgroup_drop_root(opts.new_root); | 1687 | cgroup_drop_root(opts.new_root); |
1688 | |||
1689 | if (root->flags != opts.flags) { | ||
1690 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { | ||
1691 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); | ||
1692 | ret = -EINVAL; | ||
1693 | goto drop_new_super; | ||
1694 | } else { | ||
1695 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | ||
1696 | } | ||
1697 | } | ||
1698 | |||
1688 | /* no subsys rebinding, so refcounts don't change */ | 1699 | /* no subsys rebinding, so refcounts don't change */ |
1689 | drop_parsed_module_refcounts(opts.subsys_mask); | 1700 | drop_parsed_module_refcounts(opts.subsys_mask); |
1690 | } | 1701 | } |
@@ -1769,49 +1780,48 @@ static struct kobject *cgroup_kobj; | |||
1769 | * @buf: the buffer to write the path into | 1780 | * @buf: the buffer to write the path into |
1770 | * @buflen: the length of the buffer | 1781 | * @buflen: the length of the buffer |
1771 | * | 1782 | * |
1772 | * Called with cgroup_mutex held or else with an RCU-protected cgroup | 1783 | * Writes path of cgroup into buf. Returns 0 on success, -errno on error. |
1773 | * reference. Writes path of cgroup into buf. Returns 0 on success, | 1784 | * |
1774 | * -errno on error. | 1785 | * We can't generate cgroup path using dentry->d_name, as accessing |
1786 | * dentry->name must be protected by irq-unsafe dentry->d_lock or parent | ||
1787 | * inode's i_mutex, while on the other hand cgroup_path() can be called | ||
1788 | * with some irq-safe spinlocks held. | ||
1775 | */ | 1789 | */ |
1776 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1790 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1777 | { | 1791 | { |
1778 | struct dentry *dentry = cgrp->dentry; | 1792 | int ret = -ENAMETOOLONG; |
1779 | char *start; | 1793 | char *start; |
1780 | 1794 | ||
1781 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), | 1795 | if (!cgrp->parent) { |
1782 | "cgroup_path() called without proper locking"); | 1796 | if (strlcpy(buf, "/", buflen) >= buflen) |
1783 | 1797 | return -ENAMETOOLONG; | |
1784 | if (cgrp == dummytop) { | ||
1785 | /* | ||
1786 | * Inactive subsystems have no dentry for their root | ||
1787 | * cgroup | ||
1788 | */ | ||
1789 | strcpy(buf, "/"); | ||
1790 | return 0; | 1798 | return 0; |
1791 | } | 1799 | } |
1792 | 1800 | ||
1793 | start = buf + buflen - 1; | 1801 | start = buf + buflen - 1; |
1794 | |||
1795 | *start = '\0'; | 1802 | *start = '\0'; |
1796 | for (;;) { | ||
1797 | int len = dentry->d_name.len; | ||
1798 | 1803 | ||
1804 | rcu_read_lock(); | ||
1805 | do { | ||
1806 | const char *name = cgroup_name(cgrp); | ||
1807 | int len; | ||
1808 | |||
1809 | len = strlen(name); | ||
1799 | if ((start -= len) < buf) | 1810 | if ((start -= len) < buf) |
1800 | return -ENAMETOOLONG; | 1811 | goto out; |
1801 | memcpy(start, dentry->d_name.name, len); | 1812 | memcpy(start, name, len); |
1802 | cgrp = cgrp->parent; | ||
1803 | if (!cgrp) | ||
1804 | break; | ||
1805 | 1813 | ||
1806 | dentry = cgrp->dentry; | ||
1807 | if (!cgrp->parent) | ||
1808 | continue; | ||
1809 | if (--start < buf) | 1814 | if (--start < buf) |
1810 | return -ENAMETOOLONG; | 1815 | goto out; |
1811 | *start = '/'; | 1816 | *start = '/'; |
1812 | } | 1817 | |
1818 | cgrp = cgrp->parent; | ||
1819 | } while (cgrp->parent); | ||
1820 | ret = 0; | ||
1813 | memmove(buf, start, buf + buflen - start); | 1821 | memmove(buf, start, buf + buflen - start); |
1814 | return 0; | 1822 | out: |
1823 | rcu_read_unlock(); | ||
1824 | return ret; | ||
1815 | } | 1825 | } |
1816 | EXPORT_SYMBOL_GPL(cgroup_path); | 1826 | EXPORT_SYMBOL_GPL(cgroup_path); |
1817 | 1827 | ||
@@ -1900,7 +1910,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1900 | * | 1910 | * |
1901 | * Must be called with cgroup_mutex and threadgroup locked. | 1911 | * Must be called with cgroup_mutex and threadgroup locked. |
1902 | */ | 1912 | */ |
1903 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1913 | static void cgroup_task_migrate(struct cgroup *oldcgrp, |
1904 | struct task_struct *tsk, struct css_set *newcg) | 1914 | struct task_struct *tsk, struct css_set *newcg) |
1905 | { | 1915 | { |
1906 | struct css_set *oldcg; | 1916 | struct css_set *oldcg; |
@@ -1933,121 +1943,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1933 | } | 1943 | } |
1934 | 1944 | ||
1935 | /** | 1945 | /** |
1936 | * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' | 1946 | * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup |
1937 | * @cgrp: the cgroup the task is attaching to | ||
1938 | * @tsk: the task to be attached | ||
1939 | * | ||
1940 | * Call with cgroup_mutex and threadgroup locked. May take task_lock of | ||
1941 | * @tsk during call. | ||
1942 | */ | ||
1943 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | ||
1944 | { | ||
1945 | int retval = 0; | ||
1946 | struct cgroup_subsys *ss, *failed_ss = NULL; | ||
1947 | struct cgroup *oldcgrp; | ||
1948 | struct cgroupfs_root *root = cgrp->root; | ||
1949 | struct cgroup_taskset tset = { }; | ||
1950 | struct css_set *newcg; | ||
1951 | |||
1952 | /* @tsk either already exited or can't exit until the end */ | ||
1953 | if (tsk->flags & PF_EXITING) | ||
1954 | return -ESRCH; | ||
1955 | |||
1956 | /* Nothing to do if the task is already in that cgroup */ | ||
1957 | oldcgrp = task_cgroup_from_root(tsk, root); | ||
1958 | if (cgrp == oldcgrp) | ||
1959 | return 0; | ||
1960 | |||
1961 | tset.single.task = tsk; | ||
1962 | tset.single.cgrp = oldcgrp; | ||
1963 | |||
1964 | for_each_subsys(root, ss) { | ||
1965 | if (ss->can_attach) { | ||
1966 | retval = ss->can_attach(cgrp, &tset); | ||
1967 | if (retval) { | ||
1968 | /* | ||
1969 | * Remember on which subsystem the can_attach() | ||
1970 | * failed, so that we only call cancel_attach() | ||
1971 | * against the subsystems whose can_attach() | ||
1972 | * succeeded. (See below) | ||
1973 | */ | ||
1974 | failed_ss = ss; | ||
1975 | goto out; | ||
1976 | } | ||
1977 | } | ||
1978 | } | ||
1979 | |||
1980 | newcg = find_css_set(tsk->cgroups, cgrp); | ||
1981 | if (!newcg) { | ||
1982 | retval = -ENOMEM; | ||
1983 | goto out; | ||
1984 | } | ||
1985 | |||
1986 | cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); | ||
1987 | |||
1988 | for_each_subsys(root, ss) { | ||
1989 | if (ss->attach) | ||
1990 | ss->attach(cgrp, &tset); | ||
1991 | } | ||
1992 | |||
1993 | out: | ||
1994 | if (retval) { | ||
1995 | for_each_subsys(root, ss) { | ||
1996 | if (ss == failed_ss) | ||
1997 | /* | ||
1998 | * This subsystem was the one that failed the | ||
1999 | * can_attach() check earlier, so we don't need | ||
2000 | * to call cancel_attach() against it or any | ||
2001 | * remaining subsystems. | ||
2002 | */ | ||
2003 | break; | ||
2004 | if (ss->cancel_attach) | ||
2005 | ss->cancel_attach(cgrp, &tset); | ||
2006 | } | ||
2007 | } | ||
2008 | return retval; | ||
2009 | } | ||
2010 | |||
2011 | /** | ||
2012 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' | ||
2013 | * @from: attach to all cgroups of a given task | ||
2014 | * @tsk: the task to be attached | ||
2015 | */ | ||
2016 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | ||
2017 | { | ||
2018 | struct cgroupfs_root *root; | ||
2019 | int retval = 0; | ||
2020 | |||
2021 | cgroup_lock(); | ||
2022 | for_each_active_root(root) { | ||
2023 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | ||
2024 | |||
2025 | retval = cgroup_attach_task(from_cg, tsk); | ||
2026 | if (retval) | ||
2027 | break; | ||
2028 | } | ||
2029 | cgroup_unlock(); | ||
2030 | |||
2031 | return retval; | ||
2032 | } | ||
2033 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | ||
2034 | |||
2035 | /** | ||
2036 | * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup | ||
2037 | * @cgrp: the cgroup to attach to | 1947 | * @cgrp: the cgroup to attach to |
2038 | * @leader: the threadgroup leader task_struct of the group to be attached | 1948 | * @tsk: the task or the leader of the threadgroup to be attached |
1949 | * @threadgroup: attach the whole threadgroup? | ||
2039 | * | 1950 | * |
2040 | * Call holding cgroup_mutex and the group_rwsem of the leader. Will take | 1951 | * Call holding cgroup_mutex and the group_rwsem of the leader. Will take |
2041 | * task_lock of each thread in leader's threadgroup individually in turn. | 1952 | * task_lock of @tsk or each thread in the threadgroup individually in turn. |
2042 | */ | 1953 | */ |
2043 | static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | 1954 | static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, |
1955 | bool threadgroup) | ||
2044 | { | 1956 | { |
2045 | int retval, i, group_size; | 1957 | int retval, i, group_size; |
2046 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1958 | struct cgroup_subsys *ss, *failed_ss = NULL; |
2047 | /* guaranteed to be initialized later, but the compiler needs this */ | ||
2048 | struct cgroupfs_root *root = cgrp->root; | 1959 | struct cgroupfs_root *root = cgrp->root; |
2049 | /* threadgroup list cursor and array */ | 1960 | /* threadgroup list cursor and array */ |
2050 | struct task_struct *tsk; | 1961 | struct task_struct *leader = tsk; |
2051 | struct task_and_cgroup *tc; | 1962 | struct task_and_cgroup *tc; |
2052 | struct flex_array *group; | 1963 | struct flex_array *group; |
2053 | struct cgroup_taskset tset = { }; | 1964 | struct cgroup_taskset tset = { }; |
@@ -2059,17 +1970,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2059 | * group - group_rwsem prevents new threads from appearing, and if | 1970 | * group - group_rwsem prevents new threads from appearing, and if |
2060 | * threads exit, this will just be an over-estimate. | 1971 | * threads exit, this will just be an over-estimate. |
2061 | */ | 1972 | */ |
2062 | group_size = get_nr_threads(leader); | 1973 | if (threadgroup) |
1974 | group_size = get_nr_threads(tsk); | ||
1975 | else | ||
1976 | group_size = 1; | ||
2063 | /* flex_array supports very large thread-groups better than kmalloc. */ | 1977 | /* flex_array supports very large thread-groups better than kmalloc. */ |
2064 | group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); | 1978 | group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); |
2065 | if (!group) | 1979 | if (!group) |
2066 | return -ENOMEM; | 1980 | return -ENOMEM; |
2067 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ | 1981 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ |
2068 | retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); | 1982 | retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); |
2069 | if (retval) | 1983 | if (retval) |
2070 | goto out_free_group_list; | 1984 | goto out_free_group_list; |
2071 | 1985 | ||
2072 | tsk = leader; | ||
2073 | i = 0; | 1986 | i = 0; |
2074 | /* | 1987 | /* |
2075 | * Prevent freeing of tasks while we take a snapshot. Tasks that are | 1988 | * Prevent freeing of tasks while we take a snapshot. Tasks that are |
@@ -2098,6 +2011,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2098 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); | 2011 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); |
2099 | BUG_ON(retval != 0); | 2012 | BUG_ON(retval != 0); |
2100 | i++; | 2013 | i++; |
2014 | |||
2015 | if (!threadgroup) | ||
2016 | break; | ||
2101 | } while_each_thread(leader, tsk); | 2017 | } while_each_thread(leader, tsk); |
2102 | rcu_read_unlock(); | 2018 | rcu_read_unlock(); |
2103 | /* remember the number of threads in the array for later. */ | 2019 | /* remember the number of threads in the array for later. */ |
@@ -2143,7 +2059,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2143 | */ | 2059 | */ |
2144 | for (i = 0; i < group_size; i++) { | 2060 | for (i = 0; i < group_size; i++) { |
2145 | tc = flex_array_get(group, i); | 2061 | tc = flex_array_get(group, i); |
2146 | cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); | 2062 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); |
2147 | } | 2063 | } |
2148 | /* nothing is sensitive to fork() after this point. */ | 2064 | /* nothing is sensitive to fork() after this point. */ |
2149 | 2065 | ||
@@ -2224,11 +2140,11 @@ retry_find_task: | |||
2224 | tsk = tsk->group_leader; | 2140 | tsk = tsk->group_leader; |
2225 | 2141 | ||
2226 | /* | 2142 | /* |
2227 | * Workqueue threads may acquire PF_THREAD_BOUND and become | 2143 | * Workqueue threads may acquire PF_NO_SETAFFINITY and become |
2228 | * trapped in a cpuset, or RT worker may be born in a cgroup | 2144 | * trapped in a cpuset, or RT worker may be born in a cgroup |
2229 | * with no rt_runtime allocated. Just say no. | 2145 | * with no rt_runtime allocated. Just say no. |
2230 | */ | 2146 | */ |
2231 | if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { | 2147 | if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { |
2232 | ret = -EINVAL; | 2148 | ret = -EINVAL; |
2233 | rcu_read_unlock(); | 2149 | rcu_read_unlock(); |
2234 | goto out_unlock_cgroup; | 2150 | goto out_unlock_cgroup; |
@@ -2251,17 +2167,42 @@ retry_find_task: | |||
2251 | put_task_struct(tsk); | 2167 | put_task_struct(tsk); |
2252 | goto retry_find_task; | 2168 | goto retry_find_task; |
2253 | } | 2169 | } |
2254 | ret = cgroup_attach_proc(cgrp, tsk); | 2170 | } |
2255 | } else | 2171 | |
2256 | ret = cgroup_attach_task(cgrp, tsk); | 2172 | ret = cgroup_attach_task(cgrp, tsk, threadgroup); |
2173 | |||
2257 | threadgroup_unlock(tsk); | 2174 | threadgroup_unlock(tsk); |
2258 | 2175 | ||
2259 | put_task_struct(tsk); | 2176 | put_task_struct(tsk); |
2260 | out_unlock_cgroup: | 2177 | out_unlock_cgroup: |
2261 | cgroup_unlock(); | 2178 | mutex_unlock(&cgroup_mutex); |
2262 | return ret; | 2179 | return ret; |
2263 | } | 2180 | } |
2264 | 2181 | ||
2182 | /** | ||
2183 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' | ||
2184 | * @from: attach to all cgroups of a given task | ||
2185 | * @tsk: the task to be attached | ||
2186 | */ | ||
2187 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | ||
2188 | { | ||
2189 | struct cgroupfs_root *root; | ||
2190 | int retval = 0; | ||
2191 | |||
2192 | mutex_lock(&cgroup_mutex); | ||
2193 | for_each_active_root(root) { | ||
2194 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | ||
2195 | |||
2196 | retval = cgroup_attach_task(from_cg, tsk, false); | ||
2197 | if (retval) | ||
2198 | break; | ||
2199 | } | ||
2200 | mutex_unlock(&cgroup_mutex); | ||
2201 | |||
2202 | return retval; | ||
2203 | } | ||
2204 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | ||
2205 | |||
2265 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2206 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) |
2266 | { | 2207 | { |
2267 | return attach_task_by_pid(cgrp, pid, false); | 2208 | return attach_task_by_pid(cgrp, pid, false); |
@@ -2272,24 +2213,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | |||
2272 | return attach_task_by_pid(cgrp, tgid, true); | 2213 | return attach_task_by_pid(cgrp, tgid, true); |
2273 | } | 2214 | } |
2274 | 2215 | ||
2275 | /** | ||
2276 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. | ||
2277 | * @cgrp: the cgroup to be checked for liveness | ||
2278 | * | ||
2279 | * On success, returns true; the lock should be later released with | ||
2280 | * cgroup_unlock(). On failure returns false with no lock held. | ||
2281 | */ | ||
2282 | bool cgroup_lock_live_group(struct cgroup *cgrp) | ||
2283 | { | ||
2284 | mutex_lock(&cgroup_mutex); | ||
2285 | if (cgroup_is_removed(cgrp)) { | ||
2286 | mutex_unlock(&cgroup_mutex); | ||
2287 | return false; | ||
2288 | } | ||
2289 | return true; | ||
2290 | } | ||
2291 | EXPORT_SYMBOL_GPL(cgroup_lock_live_group); | ||
2292 | |||
2293 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | 2216 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, |
2294 | const char *buffer) | 2217 | const char *buffer) |
2295 | { | 2218 | { |
@@ -2301,7 +2224,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | |||
2301 | mutex_lock(&cgroup_root_mutex); | 2224 | mutex_lock(&cgroup_root_mutex); |
2302 | strcpy(cgrp->root->release_agent_path, buffer); | 2225 | strcpy(cgrp->root->release_agent_path, buffer); |
2303 | mutex_unlock(&cgroup_root_mutex); | 2226 | mutex_unlock(&cgroup_root_mutex); |
2304 | cgroup_unlock(); | 2227 | mutex_unlock(&cgroup_mutex); |
2305 | return 0; | 2228 | return 0; |
2306 | } | 2229 | } |
2307 | 2230 | ||
@@ -2312,7 +2235,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | |||
2312 | return -ENODEV; | 2235 | return -ENODEV; |
2313 | seq_puts(seq, cgrp->root->release_agent_path); | 2236 | seq_puts(seq, cgrp->root->release_agent_path); |
2314 | seq_putc(seq, '\n'); | 2237 | seq_putc(seq, '\n'); |
2315 | cgroup_unlock(); | 2238 | mutex_unlock(&cgroup_mutex); |
2239 | return 0; | ||
2240 | } | ||
2241 | |||
2242 | static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, | ||
2243 | struct seq_file *seq) | ||
2244 | { | ||
2245 | seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); | ||
2316 | return 0; | 2246 | return 0; |
2317 | } | 2247 | } |
2318 | 2248 | ||
@@ -2537,13 +2467,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) | |||
2537 | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | 2467 | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, |
2538 | struct inode *new_dir, struct dentry *new_dentry) | 2468 | struct inode *new_dir, struct dentry *new_dentry) |
2539 | { | 2469 | { |
2470 | int ret; | ||
2471 | struct cgroup_name *name, *old_name; | ||
2472 | struct cgroup *cgrp; | ||
2473 | |||
2474 | /* | ||
2475 | * It's convenient to use parent dir's i_mutex to protect | ||
2476 | * cgrp->name. | ||
2477 | */ | ||
2478 | lockdep_assert_held(&old_dir->i_mutex); | ||
2479 | |||
2540 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | 2480 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) |
2541 | return -ENOTDIR; | 2481 | return -ENOTDIR; |
2542 | if (new_dentry->d_inode) | 2482 | if (new_dentry->d_inode) |
2543 | return -EEXIST; | 2483 | return -EEXIST; |
2544 | if (old_dir != new_dir) | 2484 | if (old_dir != new_dir) |
2545 | return -EIO; | 2485 | return -EIO; |
2546 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 2486 | |
2487 | cgrp = __d_cgrp(old_dentry); | ||
2488 | |||
2489 | name = cgroup_alloc_name(new_dentry); | ||
2490 | if (!name) | ||
2491 | return -ENOMEM; | ||
2492 | |||
2493 | ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
2494 | if (ret) { | ||
2495 | kfree(name); | ||
2496 | return ret; | ||
2497 | } | ||
2498 | |||
2499 | old_name = cgrp->name; | ||
2500 | rcu_assign_pointer(cgrp->name, name); | ||
2501 | |||
2502 | kfree_rcu(old_name, rcu_head); | ||
2503 | return 0; | ||
2547 | } | 2504 | } |
2548 | 2505 | ||
2549 | static struct simple_xattrs *__d_xattrs(struct dentry *dentry) | 2506 | static struct simple_xattrs *__d_xattrs(struct dentry *dentry) |
@@ -2551,13 +2508,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry) | |||
2551 | if (S_ISDIR(dentry->d_inode->i_mode)) | 2508 | if (S_ISDIR(dentry->d_inode->i_mode)) |
2552 | return &__d_cgrp(dentry)->xattrs; | 2509 | return &__d_cgrp(dentry)->xattrs; |
2553 | else | 2510 | else |
2554 | return &__d_cft(dentry)->xattrs; | 2511 | return &__d_cfe(dentry)->xattrs; |
2555 | } | 2512 | } |
2556 | 2513 | ||
2557 | static inline int xattr_enabled(struct dentry *dentry) | 2514 | static inline int xattr_enabled(struct dentry *dentry) |
2558 | { | 2515 | { |
2559 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | 2516 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; |
2560 | return test_bit(ROOT_XATTR, &root->flags); | 2517 | return root->flags & CGRP_ROOT_XATTR; |
2561 | } | 2518 | } |
2562 | 2519 | ||
2563 | static bool is_valid_xattr(const char *name) | 2520 | static bool is_valid_xattr(const char *name) |
@@ -2727,9 +2684,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2727 | umode_t mode; | 2684 | umode_t mode; |
2728 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2685 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2729 | 2686 | ||
2730 | simple_xattrs_init(&cft->xattrs); | 2687 | if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { |
2731 | |||
2732 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | ||
2733 | strcpy(name, subsys->name); | 2688 | strcpy(name, subsys->name); |
2734 | strcat(name, "."); | 2689 | strcat(name, "."); |
2735 | } | 2690 | } |
@@ -2747,12 +2702,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2747 | goto out; | 2702 | goto out; |
2748 | } | 2703 | } |
2749 | 2704 | ||
2705 | cfe->type = (void *)cft; | ||
2706 | cfe->dentry = dentry; | ||
2707 | dentry->d_fsdata = cfe; | ||
2708 | simple_xattrs_init(&cfe->xattrs); | ||
2709 | |||
2750 | mode = cgroup_file_mode(cft); | 2710 | mode = cgroup_file_mode(cft); |
2751 | error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); | 2711 | error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); |
2752 | if (!error) { | 2712 | if (!error) { |
2753 | cfe->type = (void *)cft; | ||
2754 | cfe->dentry = dentry; | ||
2755 | dentry->d_fsdata = cfe; | ||
2756 | list_add_tail(&cfe->node, &parent->files); | 2713 | list_add_tail(&cfe->node, &parent->files); |
2757 | cfe = NULL; | 2714 | cfe = NULL; |
2758 | } | 2715 | } |
@@ -2770,6 +2727,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2770 | 2727 | ||
2771 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2728 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2772 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2729 | /* does cft->flags tell us to skip this file on @cgrp? */ |
2730 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) | ||
2731 | continue; | ||
2773 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | 2732 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) |
2774 | continue; | 2733 | continue; |
2775 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | 2734 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) |
@@ -2998,11 +2957,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | |||
2998 | WARN_ON_ONCE(!rcu_read_lock_held()); | 2957 | WARN_ON_ONCE(!rcu_read_lock_held()); |
2999 | 2958 | ||
3000 | /* if first iteration, pretend we just visited @cgroup */ | 2959 | /* if first iteration, pretend we just visited @cgroup */ |
3001 | if (!pos) { | 2960 | if (!pos) |
3002 | if (list_empty(&cgroup->children)) | ||
3003 | return NULL; | ||
3004 | pos = cgroup; | 2961 | pos = cgroup; |
3005 | } | ||
3006 | 2962 | ||
3007 | /* visit the first child if exists */ | 2963 | /* visit the first child if exists */ |
3008 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | 2964 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); |
@@ -3010,14 +2966,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | |||
3010 | return next; | 2966 | return next; |
3011 | 2967 | ||
3012 | /* no child, visit my or the closest ancestor's next sibling */ | 2968 | /* no child, visit my or the closest ancestor's next sibling */ |
3013 | do { | 2969 | while (pos != cgroup) { |
3014 | next = list_entry_rcu(pos->sibling.next, struct cgroup, | 2970 | next = list_entry_rcu(pos->sibling.next, struct cgroup, |
3015 | sibling); | 2971 | sibling); |
3016 | if (&next->sibling != &pos->parent->children) | 2972 | if (&next->sibling != &pos->parent->children) |
3017 | return next; | 2973 | return next; |
3018 | 2974 | ||
3019 | pos = pos->parent; | 2975 | pos = pos->parent; |
3020 | } while (pos != cgroup); | 2976 | } |
3021 | 2977 | ||
3022 | return NULL; | 2978 | return NULL; |
3023 | } | 2979 | } |
@@ -3300,6 +3256,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3300 | return 0; | 3256 | return 0; |
3301 | } | 3257 | } |
3302 | 3258 | ||
3259 | static void cgroup_transfer_one_task(struct task_struct *task, | ||
3260 | struct cgroup_scanner *scan) | ||
3261 | { | ||
3262 | struct cgroup *new_cgroup = scan->data; | ||
3263 | |||
3264 | mutex_lock(&cgroup_mutex); | ||
3265 | cgroup_attach_task(new_cgroup, task, false); | ||
3266 | mutex_unlock(&cgroup_mutex); | ||
3267 | } | ||
3268 | |||
3269 | /** | ||
3270 | * cgroup_transfer_tasks - move tasks from one cgroup to another | ||
3271 | * @to: cgroup to which the tasks will be moved | ||
3272 | * @from: cgroup in which the tasks currently reside | ||
3273 | */ | ||
3274 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | ||
3275 | { | ||
3276 | struct cgroup_scanner scan; | ||
3277 | |||
3278 | scan.cg = from; | ||
3279 | scan.test_task = NULL; /* select all tasks in cgroup */ | ||
3280 | scan.process_task = cgroup_transfer_one_task; | ||
3281 | scan.heap = NULL; | ||
3282 | scan.data = to; | ||
3283 | |||
3284 | return cgroup_scan_tasks(&scan); | ||
3285 | } | ||
3286 | |||
3303 | /* | 3287 | /* |
3304 | * Stuff for reading the 'tasks'/'procs' files. | 3288 | * Stuff for reading the 'tasks'/'procs' files. |
3305 | * | 3289 | * |
@@ -3362,35 +3346,14 @@ static void pidlist_free(void *p) | |||
3362 | else | 3346 | else |
3363 | kfree(p); | 3347 | kfree(p); |
3364 | } | 3348 | } |
3365 | static void *pidlist_resize(void *p, int newcount) | ||
3366 | { | ||
3367 | void *newlist; | ||
3368 | /* note: if new alloc fails, old p will still be valid either way */ | ||
3369 | if (is_vmalloc_addr(p)) { | ||
3370 | newlist = vmalloc(newcount * sizeof(pid_t)); | ||
3371 | if (!newlist) | ||
3372 | return NULL; | ||
3373 | memcpy(newlist, p, newcount * sizeof(pid_t)); | ||
3374 | vfree(p); | ||
3375 | } else { | ||
3376 | newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); | ||
3377 | } | ||
3378 | return newlist; | ||
3379 | } | ||
3380 | 3349 | ||
3381 | /* | 3350 | /* |
3382 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries | 3351 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries |
3383 | * If the new stripped list is sufficiently smaller and there's enough memory | 3352 | * Returns the number of unique elements. |
3384 | * to allocate a new buffer, will let go of the unneeded memory. Returns the | ||
3385 | * number of unique elements. | ||
3386 | */ | 3353 | */ |
3387 | /* is the size difference enough that we should re-allocate the array? */ | 3354 | static int pidlist_uniq(pid_t *list, int length) |
3388 | #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) | ||
3389 | static int pidlist_uniq(pid_t **p, int length) | ||
3390 | { | 3355 | { |
3391 | int src, dest = 1; | 3356 | int src, dest = 1; |
3392 | pid_t *list = *p; | ||
3393 | pid_t *newlist; | ||
3394 | 3357 | ||
3395 | /* | 3358 | /* |
3396 | * we presume the 0th element is unique, so i starts at 1. trivial | 3359 | * we presume the 0th element is unique, so i starts at 1. trivial |
@@ -3411,16 +3374,6 @@ static int pidlist_uniq(pid_t **p, int length) | |||
3411 | dest++; | 3374 | dest++; |
3412 | } | 3375 | } |
3413 | after: | 3376 | after: |
3414 | /* | ||
3415 | * if the length difference is large enough, we want to allocate a | ||
3416 | * smaller buffer to save memory. if this fails due to out of memory, | ||
3417 | * we'll just stay with what we've got. | ||
3418 | */ | ||
3419 | if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { | ||
3420 | newlist = pidlist_resize(list, dest); | ||
3421 | if (newlist) | ||
3422 | *p = newlist; | ||
3423 | } | ||
3424 | return dest; | 3377 | return dest; |
3425 | } | 3378 | } |
3426 | 3379 | ||
@@ -3516,7 +3469,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3516 | /* now sort & (if procs) strip out duplicates */ | 3469 | /* now sort & (if procs) strip out duplicates */ |
3517 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3470 | sort(array, length, sizeof(pid_t), cmppid, NULL); |
3518 | if (type == CGROUP_FILE_PROCS) | 3471 | if (type == CGROUP_FILE_PROCS) |
3519 | length = pidlist_uniq(&array, length); | 3472 | length = pidlist_uniq(array, length); |
3520 | l = cgroup_pidlist_find(cgrp, type); | 3473 | l = cgroup_pidlist_find(cgrp, type); |
3521 | if (!l) { | 3474 | if (!l) { |
3522 | pidlist_free(array); | 3475 | pidlist_free(array); |
@@ -3930,11 +3883,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3930 | if (ret) | 3883 | if (ret) |
3931 | goto fail; | 3884 | goto fail; |
3932 | 3885 | ||
3933 | if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { | 3886 | efile->f_op->poll(efile, &event->pt); |
3934 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | ||
3935 | ret = 0; | ||
3936 | goto fail; | ||
3937 | } | ||
3938 | 3887 | ||
3939 | /* | 3888 | /* |
3940 | * Events should be removed after rmdir of cgroup directory, but before | 3889 | * Events should be removed after rmdir of cgroup directory, but before |
@@ -4016,10 +3965,16 @@ static struct cftype files[] = { | |||
4016 | }, | 3965 | }, |
4017 | { | 3966 | { |
4018 | .name = "cgroup.clone_children", | 3967 | .name = "cgroup.clone_children", |
3968 | .flags = CFTYPE_INSANE, | ||
4019 | .read_u64 = cgroup_clone_children_read, | 3969 | .read_u64 = cgroup_clone_children_read, |
4020 | .write_u64 = cgroup_clone_children_write, | 3970 | .write_u64 = cgroup_clone_children_write, |
4021 | }, | 3971 | }, |
4022 | { | 3972 | { |
3973 | .name = "cgroup.sane_behavior", | ||
3974 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
3975 | .read_seq_string = cgroup_sane_behavior_show, | ||
3976 | }, | ||
3977 | { | ||
4023 | .name = "release_agent", | 3978 | .name = "release_agent", |
4024 | .flags = CFTYPE_ONLY_ON_ROOT, | 3979 | .flags = CFTYPE_ONLY_ON_ROOT, |
4025 | .read_seq_string = cgroup_release_agent_show, | 3980 | .read_seq_string = cgroup_release_agent_show, |
@@ -4131,17 +4086,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
4131 | if (!(css->flags & CSS_ONLINE)) | 4086 | if (!(css->flags & CSS_ONLINE)) |
4132 | return; | 4087 | return; |
4133 | 4088 | ||
4134 | /* | 4089 | if (ss->css_offline) |
4135 | * css_offline() should be called with cgroup_mutex unlocked. See | ||
4136 | * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for | ||
4137 | * details. This temporary unlocking should go away once | ||
4138 | * cgroup_mutex is unexported from controllers. | ||
4139 | */ | ||
4140 | if (ss->css_offline) { | ||
4141 | mutex_unlock(&cgroup_mutex); | ||
4142 | ss->css_offline(cgrp); | 4090 | ss->css_offline(cgrp); |
4143 | mutex_lock(&cgroup_mutex); | ||
4144 | } | ||
4145 | 4091 | ||
4146 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | 4092 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; |
4147 | } | 4093 | } |
@@ -4158,6 +4104,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4158 | umode_t mode) | 4104 | umode_t mode) |
4159 | { | 4105 | { |
4160 | struct cgroup *cgrp; | 4106 | struct cgroup *cgrp; |
4107 | struct cgroup_name *name; | ||
4161 | struct cgroupfs_root *root = parent->root; | 4108 | struct cgroupfs_root *root = parent->root; |
4162 | int err = 0; | 4109 | int err = 0; |
4163 | struct cgroup_subsys *ss; | 4110 | struct cgroup_subsys *ss; |
@@ -4168,9 +4115,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4168 | if (!cgrp) | 4115 | if (!cgrp) |
4169 | return -ENOMEM; | 4116 | return -ENOMEM; |
4170 | 4117 | ||
4118 | name = cgroup_alloc_name(dentry); | ||
4119 | if (!name) | ||
4120 | goto err_free_cgrp; | ||
4121 | rcu_assign_pointer(cgrp->name, name); | ||
4122 | |||
4171 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | 4123 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); |
4172 | if (cgrp->id < 0) | 4124 | if (cgrp->id < 0) |
4173 | goto err_free_cgrp; | 4125 | goto err_free_name; |
4174 | 4126 | ||
4175 | /* | 4127 | /* |
4176 | * Only live parents can have children. Note that the liveliness | 4128 | * Only live parents can have children. Note that the liveliness |
@@ -4198,7 +4150,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4198 | 4150 | ||
4199 | cgrp->parent = parent; | 4151 | cgrp->parent = parent; |
4200 | cgrp->root = parent->root; | 4152 | cgrp->root = parent->root; |
4201 | cgrp->top_cgroup = parent->top_cgroup; | ||
4202 | 4153 | ||
4203 | if (notify_on_release(parent)) | 4154 | if (notify_on_release(parent)) |
4204 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4155 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
@@ -4241,6 +4192,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4241 | for_each_subsys(root, ss) | 4192 | for_each_subsys(root, ss) |
4242 | dget(dentry); | 4193 | dget(dentry); |
4243 | 4194 | ||
4195 | /* hold a ref to the parent's dentry */ | ||
4196 | dget(parent->dentry); | ||
4197 | |||
4244 | /* creation succeeded, notify subsystems */ | 4198 | /* creation succeeded, notify subsystems */ |
4245 | for_each_subsys(root, ss) { | 4199 | for_each_subsys(root, ss) { |
4246 | err = online_css(ss, cgrp); | 4200 | err = online_css(ss, cgrp); |
@@ -4276,6 +4230,8 @@ err_free_all: | |||
4276 | deactivate_super(sb); | 4230 | deactivate_super(sb); |
4277 | err_free_id: | 4231 | err_free_id: |
4278 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | 4232 | ida_simple_remove(&root->cgroup_ida, cgrp->id); |
4233 | err_free_name: | ||
4234 | kfree(rcu_dereference_raw(cgrp->name)); | ||
4279 | err_free_cgrp: | 4235 | err_free_cgrp: |
4280 | kfree(cgrp); | 4236 | kfree(cgrp); |
4281 | return err; | 4237 | return err; |
@@ -4295,56 +4251,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4295 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4251 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
4296 | } | 4252 | } |
4297 | 4253 | ||
4298 | /* | ||
4299 | * Check the reference count on each subsystem. Since we already | ||
4300 | * established that there are no tasks in the cgroup, if the css refcount | ||
4301 | * is also 1, then there should be no outstanding references, so the | ||
4302 | * subsystem is safe to destroy. We scan across all subsystems rather than | ||
4303 | * using the per-hierarchy linked list of mounted subsystems since we can | ||
4304 | * be called via check_for_release() with no synchronization other than | ||
4305 | * RCU, and the subsystem linked list isn't RCU-safe. | ||
4306 | */ | ||
4307 | static int cgroup_has_css_refs(struct cgroup *cgrp) | ||
4308 | { | ||
4309 | int i; | ||
4310 | |||
4311 | /* | ||
4312 | * We won't need to lock the subsys array, because the subsystems | ||
4313 | * we're concerned about aren't going anywhere since our cgroup root | ||
4314 | * has a reference on them. | ||
4315 | */ | ||
4316 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4317 | struct cgroup_subsys *ss = subsys[i]; | ||
4318 | struct cgroup_subsys_state *css; | ||
4319 | |||
4320 | /* Skip subsystems not present or not in this hierarchy */ | ||
4321 | if (ss == NULL || ss->root != cgrp->root) | ||
4322 | continue; | ||
4323 | |||
4324 | css = cgrp->subsys[ss->subsys_id]; | ||
4325 | /* | ||
4326 | * When called from check_for_release() it's possible | ||
4327 | * that by this point the cgroup has been removed | ||
4328 | * and the css deleted. But a false-positive doesn't | ||
4329 | * matter, since it can only happen if the cgroup | ||
4330 | * has been deleted and hence no longer needs the | ||
4331 | * release agent to be called anyway. | ||
4332 | */ | ||
4333 | if (css && css_refcnt(css) > 1) | ||
4334 | return 1; | ||
4335 | } | ||
4336 | return 0; | ||
4337 | } | ||
4338 | |||
4339 | static int cgroup_destroy_locked(struct cgroup *cgrp) | 4254 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4340 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4255 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4341 | { | 4256 | { |
4342 | struct dentry *d = cgrp->dentry; | 4257 | struct dentry *d = cgrp->dentry; |
4343 | struct cgroup *parent = cgrp->parent; | 4258 | struct cgroup *parent = cgrp->parent; |
4344 | DEFINE_WAIT(wait); | ||
4345 | struct cgroup_event *event, *tmp; | 4259 | struct cgroup_event *event, *tmp; |
4346 | struct cgroup_subsys *ss; | 4260 | struct cgroup_subsys *ss; |
4347 | LIST_HEAD(tmp_list); | ||
4348 | 4261 | ||
4349 | lockdep_assert_held(&d->d_inode->i_mutex); | 4262 | lockdep_assert_held(&d->d_inode->i_mutex); |
4350 | lockdep_assert_held(&cgroup_mutex); | 4263 | lockdep_assert_held(&cgroup_mutex); |
@@ -4468,7 +4381,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4468 | * need to invoke fork callbacks here. */ | 4381 | * need to invoke fork callbacks here. */ |
4469 | BUG_ON(!list_empty(&init_task.tasks)); | 4382 | BUG_ON(!list_empty(&init_task.tasks)); |
4470 | 4383 | ||
4471 | ss->active = 1; | ||
4472 | BUG_ON(online_css(ss, dummytop)); | 4384 | BUG_ON(online_css(ss, dummytop)); |
4473 | 4385 | ||
4474 | mutex_unlock(&cgroup_mutex); | 4386 | mutex_unlock(&cgroup_mutex); |
@@ -4573,7 +4485,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4573 | } | 4485 | } |
4574 | write_unlock(&css_set_lock); | 4486 | write_unlock(&css_set_lock); |
4575 | 4487 | ||
4576 | ss->active = 1; | ||
4577 | ret = online_css(ss, dummytop); | 4488 | ret = online_css(ss, dummytop); |
4578 | if (ret) | 4489 | if (ret) |
4579 | goto err_unload; | 4490 | goto err_unload; |
@@ -4614,7 +4525,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4614 | mutex_lock(&cgroup_mutex); | 4525 | mutex_lock(&cgroup_mutex); |
4615 | 4526 | ||
4616 | offline_css(ss, dummytop); | 4527 | offline_css(ss, dummytop); |
4617 | ss->active = 0; | ||
4618 | 4528 | ||
4619 | if (ss->use_id) | 4529 | if (ss->use_id) |
4620 | idr_destroy(&ss->idr); | 4530 | idr_destroy(&ss->idr); |
@@ -4769,7 +4679,7 @@ out: | |||
4769 | */ | 4679 | */ |
4770 | 4680 | ||
4771 | /* TODO: Use a proper seq_file iterator */ | 4681 | /* TODO: Use a proper seq_file iterator */ |
4772 | static int proc_cgroup_show(struct seq_file *m, void *v) | 4682 | int proc_cgroup_show(struct seq_file *m, void *v) |
4773 | { | 4683 | { |
4774 | struct pid *pid; | 4684 | struct pid *pid; |
4775 | struct task_struct *tsk; | 4685 | struct task_struct *tsk; |
@@ -4821,19 +4731,6 @@ out: | |||
4821 | return retval; | 4731 | return retval; |
4822 | } | 4732 | } |
4823 | 4733 | ||
4824 | static int cgroup_open(struct inode *inode, struct file *file) | ||
4825 | { | ||
4826 | struct pid *pid = PROC_I(inode)->pid; | ||
4827 | return single_open(file, proc_cgroup_show, pid); | ||
4828 | } | ||
4829 | |||
4830 | const struct file_operations proc_cgroup_operations = { | ||
4831 | .open = cgroup_open, | ||
4832 | .read = seq_read, | ||
4833 | .llseek = seq_lseek, | ||
4834 | .release = single_release, | ||
4835 | }; | ||
4836 | |||
4837 | /* Display information about each subsystem and each hierarchy */ | 4734 | /* Display information about each subsystem and each hierarchy */ |
4838 | static int proc_cgroupstats_show(struct seq_file *m, void *v) | 4735 | static int proc_cgroupstats_show(struct seq_file *m, void *v) |
4839 | { | 4736 | { |
@@ -4935,17 +4832,17 @@ void cgroup_post_fork(struct task_struct *child) | |||
4935 | * and addition to css_set. | 4832 | * and addition to css_set. |
4936 | */ | 4833 | */ |
4937 | if (need_forkexit_callback) { | 4834 | if (need_forkexit_callback) { |
4938 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4835 | /* |
4836 | * fork/exit callbacks are supported only for builtin | ||
4837 | * subsystems, and the builtin section of the subsys | ||
4838 | * array is immutable, so we don't need to lock the | ||
4839 | * subsys array here. On the other hand, modular section | ||
4840 | * of the array can be freed at module unload, so we | ||
4841 | * can't touch that. | ||
4842 | */ | ||
4843 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4939 | struct cgroup_subsys *ss = subsys[i]; | 4844 | struct cgroup_subsys *ss = subsys[i]; |
4940 | 4845 | ||
4941 | /* | ||
4942 | * fork/exit callbacks are supported only for | ||
4943 | * builtin subsystems and we don't need further | ||
4944 | * synchronization as they never go away. | ||
4945 | */ | ||
4946 | if (!ss || ss->module) | ||
4947 | continue; | ||
4948 | |||
4949 | if (ss->fork) | 4846 | if (ss->fork) |
4950 | ss->fork(child); | 4847 | ss->fork(child); |
4951 | } | 4848 | } |
@@ -5010,13 +4907,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
5010 | tsk->cgroups = &init_css_set; | 4907 | tsk->cgroups = &init_css_set; |
5011 | 4908 | ||
5012 | if (run_callbacks && need_forkexit_callback) { | 4909 | if (run_callbacks && need_forkexit_callback) { |
5013 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4910 | /* |
4911 | * fork/exit callbacks are supported only for builtin | ||
4912 | * subsystems, see cgroup_post_fork() for details. | ||
4913 | */ | ||
4914 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
5014 | struct cgroup_subsys *ss = subsys[i]; | 4915 | struct cgroup_subsys *ss = subsys[i]; |
5015 | 4916 | ||
5016 | /* modular subsystems can't use callbacks */ | ||
5017 | if (!ss || ss->module) | ||
5018 | continue; | ||
5019 | |||
5020 | if (ss->exit) { | 4917 | if (ss->exit) { |
5021 | struct cgroup *old_cgrp = | 4918 | struct cgroup *old_cgrp = |
5022 | rcu_dereference_raw(cg->subsys[i])->cgroup; | 4919 | rcu_dereference_raw(cg->subsys[i])->cgroup; |
@@ -5030,44 +4927,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
5030 | put_css_set_taskexit(cg); | 4927 | put_css_set_taskexit(cg); |
5031 | } | 4928 | } |
5032 | 4929 | ||
5033 | /** | ||
5034 | * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp | ||
5035 | * @cgrp: the cgroup in question | ||
5036 | * @task: the task in question | ||
5037 | * | ||
5038 | * See if @cgrp is a descendant of @task's cgroup in the appropriate | ||
5039 | * hierarchy. | ||
5040 | * | ||
5041 | * If we are sending in dummytop, then presumably we are creating | ||
5042 | * the top cgroup in the subsystem. | ||
5043 | * | ||
5044 | * Called only by the ns (nsproxy) cgroup. | ||
5045 | */ | ||
5046 | int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) | ||
5047 | { | ||
5048 | int ret; | ||
5049 | struct cgroup *target; | ||
5050 | |||
5051 | if (cgrp == dummytop) | ||
5052 | return 1; | ||
5053 | |||
5054 | target = task_cgroup_from_root(task, cgrp->root); | ||
5055 | while (cgrp != target && cgrp!= cgrp->top_cgroup) | ||
5056 | cgrp = cgrp->parent; | ||
5057 | ret = (cgrp == target); | ||
5058 | return ret; | ||
5059 | } | ||
5060 | |||
5061 | static void check_for_release(struct cgroup *cgrp) | 4930 | static void check_for_release(struct cgroup *cgrp) |
5062 | { | 4931 | { |
5063 | /* All of these checks rely on RCU to keep the cgroup | 4932 | /* All of these checks rely on RCU to keep the cgroup |
5064 | * structure alive */ | 4933 | * structure alive */ |
5065 | if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) | 4934 | if (cgroup_is_releasable(cgrp) && |
5066 | && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { | 4935 | !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { |
5067 | /* Control Group is currently removeable. If it's not | 4936 | /* |
4937 | * Control Group is currently removeable. If it's not | ||
5068 | * already queued for a userspace notification, queue | 4938 | * already queued for a userspace notification, queue |
5069 | * it now */ | 4939 | * it now |
4940 | */ | ||
5070 | int need_schedule_work = 0; | 4941 | int need_schedule_work = 0; |
4942 | |||
5071 | raw_spin_lock(&release_list_lock); | 4943 | raw_spin_lock(&release_list_lock); |
5072 | if (!cgroup_is_removed(cgrp) && | 4944 | if (!cgroup_is_removed(cgrp) && |
5073 | list_empty(&cgrp->release_list)) { | 4945 | list_empty(&cgrp->release_list)) { |
@@ -5100,24 +4972,11 @@ EXPORT_SYMBOL_GPL(__css_tryget); | |||
5100 | /* Caller must verify that the css is not for root cgroup */ | 4972 | /* Caller must verify that the css is not for root cgroup */ |
5101 | void __css_put(struct cgroup_subsys_state *css) | 4973 | void __css_put(struct cgroup_subsys_state *css) |
5102 | { | 4974 | { |
5103 | struct cgroup *cgrp = css->cgroup; | ||
5104 | int v; | 4975 | int v; |
5105 | 4976 | ||
5106 | rcu_read_lock(); | ||
5107 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); | 4977 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); |
5108 | 4978 | if (v == 0) | |
5109 | switch (v) { | ||
5110 | case 1: | ||
5111 | if (notify_on_release(cgrp)) { | ||
5112 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
5113 | check_for_release(cgrp); | ||
5114 | } | ||
5115 | break; | ||
5116 | case 0: | ||
5117 | schedule_work(&css->dput_work); | 4979 | schedule_work(&css->dput_work); |
5118 | break; | ||
5119 | } | ||
5120 | rcu_read_unlock(); | ||
5121 | } | 4980 | } |
5122 | EXPORT_SYMBOL_GPL(__css_put); | 4981 | EXPORT_SYMBOL_GPL(__css_put); |
5123 | 4982 | ||
@@ -5416,55 +5275,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | |||
5416 | } | 5275 | } |
5417 | EXPORT_SYMBOL_GPL(css_lookup); | 5276 | EXPORT_SYMBOL_GPL(css_lookup); |
5418 | 5277 | ||
5419 | /** | ||
5420 | * css_get_next - lookup next cgroup under specified hierarchy. | ||
5421 | * @ss: pointer to subsystem | ||
5422 | * @id: current position of iteration. | ||
5423 | * @root: pointer to css. search tree under this. | ||
5424 | * @foundid: position of found object. | ||
5425 | * | ||
5426 | * Search next css under the specified hierarchy of rootid. Calling under | ||
5427 | * rcu_read_lock() is necessary. Returns NULL if it reaches the end. | ||
5428 | */ | ||
5429 | struct cgroup_subsys_state * | ||
5430 | css_get_next(struct cgroup_subsys *ss, int id, | ||
5431 | struct cgroup_subsys_state *root, int *foundid) | ||
5432 | { | ||
5433 | struct cgroup_subsys_state *ret = NULL; | ||
5434 | struct css_id *tmp; | ||
5435 | int tmpid; | ||
5436 | int rootid = css_id(root); | ||
5437 | int depth = css_depth(root); | ||
5438 | |||
5439 | if (!rootid) | ||
5440 | return NULL; | ||
5441 | |||
5442 | BUG_ON(!ss->use_id); | ||
5443 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
5444 | |||
5445 | /* fill start point for scan */ | ||
5446 | tmpid = id; | ||
5447 | while (1) { | ||
5448 | /* | ||
5449 | * scan next entry from bitmap(tree), tmpid is updated after | ||
5450 | * idr_get_next(). | ||
5451 | */ | ||
5452 | tmp = idr_get_next(&ss->idr, &tmpid); | ||
5453 | if (!tmp) | ||
5454 | break; | ||
5455 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { | ||
5456 | ret = rcu_dereference(tmp->css); | ||
5457 | if (ret) { | ||
5458 | *foundid = tmpid; | ||
5459 | break; | ||
5460 | } | ||
5461 | } | ||
5462 | /* continue to scan from next id */ | ||
5463 | tmpid = tmpid + 1; | ||
5464 | } | ||
5465 | return ret; | ||
5466 | } | ||
5467 | |||
5468 | /* | 5278 | /* |
5469 | * get corresponding css from file open on cgroupfs directory | 5279 | * get corresponding css from file open on cgroupfs directory |
5470 | */ | 5280 | */ |
diff --git a/kernel/compat.c b/kernel/compat.c index 19971d8c7299..0a09e481b70b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) | |||
516 | return 0; | 516 | return 0; |
517 | } | 517 | } |
518 | 518 | ||
519 | asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) | ||
520 | { | ||
521 | struct rusage r; | ||
522 | int ret; | ||
523 | mm_segment_t old_fs = get_fs(); | ||
524 | |||
525 | set_fs(KERNEL_DS); | ||
526 | ret = sys_getrusage(who, (struct rusage __user *) &r); | ||
527 | set_fs(old_fs); | ||
528 | |||
529 | if (ret) | ||
530 | return ret; | ||
531 | |||
532 | if (put_compat_rusage(&r, ru)) | ||
533 | return -EFAULT; | ||
534 | |||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | COMPAT_SYSCALL_DEFINE4(wait4, | 519 | COMPAT_SYSCALL_DEFINE4(wait4, |
539 | compat_pid_t, pid, | 520 | compat_pid_t, pid, |
540 | compat_uint_t __user *, stat_addr, | 521 | compat_uint_t __user *, stat_addr, |
@@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, | |||
1138 | } | 1119 | } |
1139 | #endif | 1120 | #endif |
1140 | 1121 | ||
1141 | struct compat_sysinfo { | ||
1142 | s32 uptime; | ||
1143 | u32 loads[3]; | ||
1144 | u32 totalram; | ||
1145 | u32 freeram; | ||
1146 | u32 sharedram; | ||
1147 | u32 bufferram; | ||
1148 | u32 totalswap; | ||
1149 | u32 freeswap; | ||
1150 | u16 procs; | ||
1151 | u16 pad; | ||
1152 | u32 totalhigh; | ||
1153 | u32 freehigh; | ||
1154 | u32 mem_unit; | ||
1155 | char _f[20-2*sizeof(u32)-sizeof(int)]; | ||
1156 | }; | ||
1157 | |||
1158 | asmlinkage long | ||
1159 | compat_sys_sysinfo(struct compat_sysinfo __user *info) | ||
1160 | { | ||
1161 | struct sysinfo s; | ||
1162 | |||
1163 | do_sysinfo(&s); | ||
1164 | |||
1165 | /* Check to see if any memory value is too large for 32-bit and scale | ||
1166 | * down if needed | ||
1167 | */ | ||
1168 | if ((s.totalram >> 32) || (s.totalswap >> 32)) { | ||
1169 | int bitcount = 0; | ||
1170 | |||
1171 | while (s.mem_unit < PAGE_SIZE) { | ||
1172 | s.mem_unit <<= 1; | ||
1173 | bitcount++; | ||
1174 | } | ||
1175 | |||
1176 | s.totalram >>= bitcount; | ||
1177 | s.freeram >>= bitcount; | ||
1178 | s.sharedram >>= bitcount; | ||
1179 | s.bufferram >>= bitcount; | ||
1180 | s.totalswap >>= bitcount; | ||
1181 | s.freeswap >>= bitcount; | ||
1182 | s.totalhigh >>= bitcount; | ||
1183 | s.freehigh >>= bitcount; | ||
1184 | } | ||
1185 | |||
1186 | if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || | ||
1187 | __put_user (s.uptime, &info->uptime) || | ||
1188 | __put_user (s.loads[0], &info->loads[0]) || | ||
1189 | __put_user (s.loads[1], &info->loads[1]) || | ||
1190 | __put_user (s.loads[2], &info->loads[2]) || | ||
1191 | __put_user (s.totalram, &info->totalram) || | ||
1192 | __put_user (s.freeram, &info->freeram) || | ||
1193 | __put_user (s.sharedram, &info->sharedram) || | ||
1194 | __put_user (s.bufferram, &info->bufferram) || | ||
1195 | __put_user (s.totalswap, &info->totalswap) || | ||
1196 | __put_user (s.freeswap, &info->freeswap) || | ||
1197 | __put_user (s.procs, &info->procs) || | ||
1198 | __put_user (s.totalhigh, &info->totalhigh) || | ||
1199 | __put_user (s.freehigh, &info->freehigh) || | ||
1200 | __put_user (s.mem_unit, &info->mem_unit)) | ||
1201 | return -EFAULT; | ||
1202 | |||
1203 | return 0; | ||
1204 | } | ||
1205 | |||
1206 | COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, | 1122 | COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, |
1207 | compat_pid_t, pid, | 1123 | compat_pid_t, pid, |
1208 | struct compat_timespec __user *, interval) | 1124 | struct compat_timespec __user *, interval) |
diff --git a/kernel/configs.c b/kernel/configs.c index 42e8fa075eed..c18b1f1ae515 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void) | |||
79 | if (!entry) | 79 | if (!entry) |
80 | return -ENOMEM; | 80 | return -ENOMEM; |
81 | 81 | ||
82 | entry->size = kernel_config_data_size; | 82 | proc_set_size(entry, kernel_config_data_size); |
83 | 83 | ||
84 | return 0; | 84 | return 0; |
85 | } | 85 | } |
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile new file mode 100644 index 000000000000..59ab052ef7a0 --- /dev/null +++ b/kernel/cpu/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-y = idle.o | |||
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c new file mode 100644 index 000000000000..d5585f5e038e --- /dev/null +++ b/kernel/cpu/idle.c | |||
@@ -0,0 +1,118 @@ | |||
1 | /* | ||
2 | * Generic entry point for the idle threads | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/cpu.h> | ||
6 | #include <linux/tick.h> | ||
7 | #include <linux/mm.h> | ||
8 | |||
9 | #include <asm/tlb.h> | ||
10 | |||
11 | #include <trace/events/power.h> | ||
12 | |||
13 | static int __read_mostly cpu_idle_force_poll; | ||
14 | |||
15 | void cpu_idle_poll_ctrl(bool enable) | ||
16 | { | ||
17 | if (enable) { | ||
18 | cpu_idle_force_poll++; | ||
19 | } else { | ||
20 | cpu_idle_force_poll--; | ||
21 | WARN_ON_ONCE(cpu_idle_force_poll < 0); | ||
22 | } | ||
23 | } | ||
24 | |||
25 | #ifdef CONFIG_GENERIC_IDLE_POLL_SETUP | ||
26 | static int __init cpu_idle_poll_setup(char *__unused) | ||
27 | { | ||
28 | cpu_idle_force_poll = 1; | ||
29 | return 1; | ||
30 | } | ||
31 | __setup("nohlt", cpu_idle_poll_setup); | ||
32 | |||
33 | static int __init cpu_idle_nopoll_setup(char *__unused) | ||
34 | { | ||
35 | cpu_idle_force_poll = 0; | ||
36 | return 1; | ||
37 | } | ||
38 | __setup("hlt", cpu_idle_nopoll_setup); | ||
39 | #endif | ||
40 | |||
41 | static inline int cpu_idle_poll(void) | ||
42 | { | ||
43 | rcu_idle_enter(); | ||
44 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | ||
45 | local_irq_enable(); | ||
46 | while (!need_resched()) | ||
47 | cpu_relax(); | ||
48 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | ||
49 | rcu_idle_exit(); | ||
50 | return 1; | ||
51 | } | ||
52 | |||
53 | /* Weak implementations for optional arch specific functions */ | ||
54 | void __weak arch_cpu_idle_prepare(void) { } | ||
55 | void __weak arch_cpu_idle_enter(void) { } | ||
56 | void __weak arch_cpu_idle_exit(void) { } | ||
57 | void __weak arch_cpu_idle_dead(void) { } | ||
58 | void __weak arch_cpu_idle(void) | ||
59 | { | ||
60 | cpu_idle_force_poll = 1; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Generic idle loop implementation | ||
65 | */ | ||
66 | static void cpu_idle_loop(void) | ||
67 | { | ||
68 | while (1) { | ||
69 | tick_nohz_idle_enter(); | ||
70 | |||
71 | while (!need_resched()) { | ||
72 | check_pgt_cache(); | ||
73 | rmb(); | ||
74 | |||
75 | if (cpu_is_offline(smp_processor_id())) | ||
76 | arch_cpu_idle_dead(); | ||
77 | |||
78 | local_irq_disable(); | ||
79 | arch_cpu_idle_enter(); | ||
80 | |||
81 | /* | ||
82 | * In poll mode we reenable interrupts and spin. | ||
83 | * | ||
84 | * Also if we detected in the wakeup from idle | ||
85 | * path that the tick broadcast device expired | ||
86 | * for us, we don't want to go deep idle as we | ||
87 | * know that the IPI is going to arrive right | ||
88 | * away | ||
89 | */ | ||
90 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { | ||
91 | cpu_idle_poll(); | ||
92 | } else { | ||
93 | current_clr_polling(); | ||
94 | if (!need_resched()) { | ||
95 | stop_critical_timings(); | ||
96 | rcu_idle_enter(); | ||
97 | arch_cpu_idle(); | ||
98 | WARN_ON_ONCE(irqs_disabled()); | ||
99 | rcu_idle_exit(); | ||
100 | start_critical_timings(); | ||
101 | } else { | ||
102 | local_irq_enable(); | ||
103 | } | ||
104 | current_set_polling(); | ||
105 | } | ||
106 | arch_cpu_idle_exit(); | ||
107 | } | ||
108 | tick_nohz_idle_exit(); | ||
109 | schedule_preempt_disabled(); | ||
110 | } | ||
111 | } | ||
112 | |||
113 | void cpu_startup_entry(enum cpuhp_state state) | ||
114 | { | ||
115 | current_set_polling(); | ||
116 | arch_cpu_idle_prepare(); | ||
117 | cpu_idle_loop(); | ||
118 | } | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..64b3f791bbe5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex); | |||
265 | static DEFINE_MUTEX(callback_mutex); | 265 | static DEFINE_MUTEX(callback_mutex); |
266 | 266 | ||
267 | /* | 267 | /* |
268 | * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist | ||
269 | * buffers. They are statically allocated to prevent using excess stack | ||
270 | * when calling cpuset_print_task_mems_allowed(). | ||
271 | */ | ||
272 | #define CPUSET_NAME_LEN (128) | ||
273 | #define CPUSET_NODELIST_LEN (256) | ||
274 | static char cpuset_name[CPUSET_NAME_LEN]; | ||
275 | static char cpuset_nodelist[CPUSET_NODELIST_LEN]; | ||
276 | static DEFINE_SPINLOCK(cpuset_buffer_lock); | ||
277 | |||
278 | /* | ||
279 | * CPU / memory hotplug is handled asynchronously. | 268 | * CPU / memory hotplug is handled asynchronously. |
280 | */ | 269 | */ |
281 | static struct workqueue_struct *cpuset_propagate_hotplug_wq; | 270 | static struct workqueue_struct *cpuset_propagate_hotplug_wq; |
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void) | |||
780 | lockdep_assert_held(&cpuset_mutex); | 769 | lockdep_assert_held(&cpuset_mutex); |
781 | get_online_cpus(); | 770 | get_online_cpus(); |
782 | 771 | ||
772 | /* | ||
773 | * We have raced with CPU hotplug. Don't do anything to avoid | ||
774 | * passing doms with offlined cpu to partition_sched_domains(). | ||
775 | * Anyways, hotplug work item will rebuild sched domains. | ||
776 | */ | ||
777 | if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) | ||
778 | goto out; | ||
779 | |||
783 | /* Generate domain masks and attrs */ | 780 | /* Generate domain masks and attrs */ |
784 | ndoms = generate_sched_domains(&doms, &attr); | 781 | ndoms = generate_sched_domains(&doms, &attr); |
785 | 782 | ||
786 | /* Have scheduler rebuild the domains */ | 783 | /* Have scheduler rebuild the domains */ |
787 | partition_sched_domains(ndoms, doms, attr); | 784 | partition_sched_domains(ndoms, doms, attr); |
788 | 785 | out: | |
789 | put_online_cpus(); | 786 | put_online_cpus(); |
790 | } | 787 | } |
791 | #else /* !CONFIG_SMP */ | 788 | #else /* !CONFIG_SMP */ |
792 | static void rebuild_sched_domains_locked(void) | 789 | static void rebuild_sched_domains_locked(void) |
793 | { | 790 | { |
794 | } | 791 | } |
795 | |||
796 | static int generate_sched_domains(cpumask_var_t **domains, | ||
797 | struct sched_domain_attr **attributes) | ||
798 | { | ||
799 | *domains = NULL; | ||
800 | return 1; | ||
801 | } | ||
802 | #endif /* CONFIG_SMP */ | 792 | #endif /* CONFIG_SMP */ |
803 | 793 | ||
804 | void rebuild_sched_domains(void) | 794 | void rebuild_sched_domains(void) |
@@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1388 | 1378 | ||
1389 | cgroup_taskset_for_each(task, cgrp, tset) { | 1379 | cgroup_taskset_for_each(task, cgrp, tset) { |
1390 | /* | 1380 | /* |
1391 | * Kthreads bound to specific cpus cannot be moved to a new | 1381 | * Kthreads which disallow setaffinity shouldn't be moved |
1392 | * cpuset; we cannot change their cpu affinity and | 1382 | * to a new cpuset; we don't want to change their cpu |
1393 | * isolating such threads by their set of allowed nodes is | 1383 | * affinity and isolating such threads by their set of |
1394 | * unnecessary. Thus, cpusets are not applicable for such | 1384 | * allowed nodes is unnecessary. Thus, cpusets are not |
1395 | * threads. This prevents checking for success of | 1385 | * applicable for such threads. This prevents checking for |
1396 | * set_cpus_allowed_ptr() on all attached tasks before | 1386 | * success of set_cpus_allowed_ptr() on all attached tasks |
1397 | * cpus_allowed may be changed. | 1387 | * before cpus_allowed may be changed. |
1398 | */ | 1388 | */ |
1399 | ret = -EINVAL; | 1389 | ret = -EINVAL; |
1400 | if (task->flags & PF_THREAD_BOUND) | 1390 | if (task->flags & PF_NO_SETAFFINITY) |
1401 | goto out_unlock; | 1391 | goto out_unlock; |
1402 | ret = security_task_setscheduler(task); | 1392 | ret = security_task_setscheduler(task); |
1403 | if (ret) | 1393 | if (ret) |
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void) | |||
2005 | return 0; | 1995 | return 0; |
2006 | } | 1996 | } |
2007 | 1997 | ||
2008 | /** | ||
2009 | * cpuset_do_move_task - move a given task to another cpuset | ||
2010 | * @tsk: pointer to task_struct the task to move | ||
2011 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | ||
2012 | * | ||
2013 | * Called by cgroup_scan_tasks() for each task in a cgroup. | ||
2014 | * Return nonzero to stop the walk through the tasks. | ||
2015 | */ | ||
2016 | static void cpuset_do_move_task(struct task_struct *tsk, | ||
2017 | struct cgroup_scanner *scan) | ||
2018 | { | ||
2019 | struct cgroup *new_cgroup = scan->data; | ||
2020 | |||
2021 | cgroup_lock(); | ||
2022 | cgroup_attach_task(new_cgroup, tsk); | ||
2023 | cgroup_unlock(); | ||
2024 | } | ||
2025 | |||
2026 | /** | ||
2027 | * move_member_tasks_to_cpuset - move tasks from one cpuset to another | ||
2028 | * @from: cpuset in which the tasks currently reside | ||
2029 | * @to: cpuset to which the tasks will be moved | ||
2030 | * | ||
2031 | * Called with cpuset_mutex held | ||
2032 | * callback_mutex must not be held, as cpuset_attach() will take it. | ||
2033 | * | ||
2034 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | ||
2035 | * calling callback functions for each. | ||
2036 | */ | ||
2037 | static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | ||
2038 | { | ||
2039 | struct cgroup_scanner scan; | ||
2040 | |||
2041 | scan.cg = from->css.cgroup; | ||
2042 | scan.test_task = NULL; /* select all tasks in cgroup */ | ||
2043 | scan.process_task = cpuset_do_move_task; | ||
2044 | scan.heap = NULL; | ||
2045 | scan.data = to->css.cgroup; | ||
2046 | |||
2047 | if (cgroup_scan_tasks(&scan)) | ||
2048 | printk(KERN_ERR "move_member_tasks_to_cpuset: " | ||
2049 | "cgroup_scan_tasks failed\n"); | ||
2050 | } | ||
2051 | |||
2052 | /* | 1998 | /* |
2053 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs | 1999 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
2054 | * or memory nodes, we need to walk over the cpuset hierarchy, | 2000 | * or memory nodes, we need to walk over the cpuset hierarchy, |
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
2069 | nodes_empty(parent->mems_allowed)) | 2015 | nodes_empty(parent->mems_allowed)) |
2070 | parent = parent_cs(parent); | 2016 | parent = parent_cs(parent); |
2071 | 2017 | ||
2072 | move_member_tasks_to_cpuset(cs, parent); | 2018 | if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { |
2019 | rcu_read_lock(); | ||
2020 | printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", | ||
2021 | cgroup_name(cs->css.cgroup)); | ||
2022 | rcu_read_unlock(); | ||
2023 | } | ||
2073 | } | 2024 | } |
2074 | 2025 | ||
2075 | /** | 2026 | /** |
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2222 | flush_workqueue(cpuset_propagate_hotplug_wq); | 2173 | flush_workqueue(cpuset_propagate_hotplug_wq); |
2223 | 2174 | ||
2224 | /* rebuild sched domains if cpus_allowed has changed */ | 2175 | /* rebuild sched domains if cpus_allowed has changed */ |
2225 | if (cpus_updated) { | 2176 | if (cpus_updated) |
2226 | struct sched_domain_attr *attr; | 2177 | rebuild_sched_domains(); |
2227 | cpumask_var_t *doms; | ||
2228 | int ndoms; | ||
2229 | |||
2230 | mutex_lock(&cpuset_mutex); | ||
2231 | ndoms = generate_sched_domains(&doms, &attr); | ||
2232 | mutex_unlock(&cpuset_mutex); | ||
2233 | |||
2234 | partition_sched_domains(ndoms, doms, attr); | ||
2235 | } | ||
2236 | } | 2178 | } |
2237 | 2179 | ||
2238 | void cpuset_update_active_cpus(bool cpu_online) | 2180 | void cpuset_update_active_cpus(bool cpu_online) |
@@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
2251 | schedule_work(&cpuset_hotplug_work); | 2193 | schedule_work(&cpuset_hotplug_work); |
2252 | } | 2194 | } |
2253 | 2195 | ||
2254 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
2255 | /* | 2196 | /* |
2256 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. | 2197 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. |
2257 | * Call this routine anytime after node_states[N_MEMORY] changes. | 2198 | * Call this routine anytime after node_states[N_MEMORY] changes. |
@@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2263 | schedule_work(&cpuset_hotplug_work); | 2204 | schedule_work(&cpuset_hotplug_work); |
2264 | return NOTIFY_OK; | 2205 | return NOTIFY_OK; |
2265 | } | 2206 | } |
2266 | #endif | 2207 | |
2208 | static struct notifier_block cpuset_track_online_nodes_nb = { | ||
2209 | .notifier_call = cpuset_track_online_nodes, | ||
2210 | .priority = 10, /* ??! */ | ||
2211 | }; | ||
2267 | 2212 | ||
2268 | /** | 2213 | /** |
2269 | * cpuset_init_smp - initialize cpus_allowed | 2214 | * cpuset_init_smp - initialize cpus_allowed |
2270 | * | 2215 | * |
2271 | * Description: Finish top cpuset after cpu, node maps are initialized | 2216 | * Description: Finish top cpuset after cpu, node maps are initialized |
2272 | **/ | 2217 | */ |
2273 | |||
2274 | void __init cpuset_init_smp(void) | 2218 | void __init cpuset_init_smp(void) |
2275 | { | 2219 | { |
2276 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2220 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2277 | top_cpuset.mems_allowed = node_states[N_MEMORY]; | 2221 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2278 | 2222 | ||
2279 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2223 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); |
2280 | 2224 | ||
2281 | cpuset_propagate_hotplug_wq = | 2225 | cpuset_propagate_hotplug_wq = |
2282 | alloc_ordered_workqueue("cpuset_hotplug", 0); | 2226 | alloc_ordered_workqueue("cpuset_hotplug", 0); |
@@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
2592 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); | 2536 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); |
2593 | } | 2537 | } |
2594 | 2538 | ||
2539 | #define CPUSET_NODELIST_LEN (256) | ||
2540 | |||
2595 | /** | 2541 | /** |
2596 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed | 2542 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed |
2597 | * @task: pointer to task_struct of some task. | 2543 | * @task: pointer to task_struct of some task. |
@@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
2602 | */ | 2548 | */ |
2603 | void cpuset_print_task_mems_allowed(struct task_struct *tsk) | 2549 | void cpuset_print_task_mems_allowed(struct task_struct *tsk) |
2604 | { | 2550 | { |
2605 | struct dentry *dentry; | 2551 | /* Statically allocated to prevent using excess stack. */ |
2552 | static char cpuset_nodelist[CPUSET_NODELIST_LEN]; | ||
2553 | static DEFINE_SPINLOCK(cpuset_buffer_lock); | ||
2606 | 2554 | ||
2607 | dentry = task_cs(tsk)->css.cgroup->dentry; | 2555 | struct cgroup *cgrp = task_cs(tsk)->css.cgroup; |
2608 | spin_lock(&cpuset_buffer_lock); | ||
2609 | 2556 | ||
2610 | if (!dentry) { | 2557 | rcu_read_lock(); |
2611 | strcpy(cpuset_name, "/"); | 2558 | spin_lock(&cpuset_buffer_lock); |
2612 | } else { | ||
2613 | spin_lock(&dentry->d_lock); | ||
2614 | strlcpy(cpuset_name, (const char *)dentry->d_name.name, | ||
2615 | CPUSET_NAME_LEN); | ||
2616 | spin_unlock(&dentry->d_lock); | ||
2617 | } | ||
2618 | 2559 | ||
2619 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, | 2560 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, |
2620 | tsk->mems_allowed); | 2561 | tsk->mems_allowed); |
2621 | printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", | 2562 | printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", |
2622 | tsk->comm, cpuset_name, cpuset_nodelist); | 2563 | tsk->comm, cgroup_name(cgrp), cpuset_nodelist); |
2564 | |||
2623 | spin_unlock(&cpuset_buffer_lock); | 2565 | spin_unlock(&cpuset_buffer_lock); |
2566 | rcu_read_unlock(); | ||
2624 | } | 2567 | } |
2625 | 2568 | ||
2626 | /* | 2569 | /* |
@@ -2666,7 +2609,7 @@ void __cpuset_memory_pressure_bump(void) | |||
2666 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it | 2609 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it |
2667 | * anyway. | 2610 | * anyway. |
2668 | */ | 2611 | */ |
2669 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2612 | int proc_cpuset_show(struct seq_file *m, void *unused_v) |
2670 | { | 2613 | { |
2671 | struct pid *pid; | 2614 | struct pid *pid; |
2672 | struct task_struct *tsk; | 2615 | struct task_struct *tsk; |
@@ -2700,19 +2643,6 @@ out_free: | |||
2700 | out: | 2643 | out: |
2701 | return retval; | 2644 | return retval; |
2702 | } | 2645 | } |
2703 | |||
2704 | static int cpuset_open(struct inode *inode, struct file *file) | ||
2705 | { | ||
2706 | struct pid *pid = PROC_I(inode)->pid; | ||
2707 | return single_open(file, proc_cpuset_show, pid); | ||
2708 | } | ||
2709 | |||
2710 | const struct file_operations proc_cpuset_operations = { | ||
2711 | .open = cpuset_open, | ||
2712 | .read = seq_read, | ||
2713 | .llseek = seq_lseek, | ||
2714 | .release = single_release, | ||
2715 | }; | ||
2716 | #endif /* CONFIG_PROC_PID_CPUSET */ | 2646 | #endif /* CONFIG_PROC_PID_CPUSET */ |
2717 | 2647 | ||
2718 | /* Display task mems_allowed in /proc/<pid>/status file. */ | 2648 | /* Display task mems_allowed in /proc/<pid>/status file. */ |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index c26278fd4851..0506d447aed2 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key) | |||
775 | 775 | ||
776 | static struct sysrq_key_op sysrq_dbg_op = { | 776 | static struct sysrq_key_op sysrq_dbg_op = { |
777 | .handler = sysrq_handle_dbg, | 777 | .handler = sysrq_handle_dbg, |
778 | .help_msg = "debug(G)", | 778 | .help_msg = "debug(g)", |
779 | .action_msg = "DEBUG", | 779 | .action_msg = "DEBUG", |
780 | }; | 780 | }; |
781 | #endif | 781 | #endif |
diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c30..9dc297faf7c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/poll.h> | 18 | #include <linux/poll.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/hash.h> | 20 | #include <linux/hash.h> |
21 | #include <linux/tick.h> | ||
21 | #include <linux/sysfs.h> | 22 | #include <linux/sysfs.h> |
22 | #include <linux/dcache.h> | 23 | #include <linux/dcache.h> |
23 | #include <linux/percpu.h> | 24 | #include <linux/percpu.h> |
@@ -37,6 +38,7 @@ | |||
37 | #include <linux/ftrace_event.h> | 38 | #include <linux/ftrace_event.h> |
38 | #include <linux/hw_breakpoint.h> | 39 | #include <linux/hw_breakpoint.h> |
39 | #include <linux/mm_types.h> | 40 | #include <linux/mm_types.h> |
41 | #include <linux/cgroup.h> | ||
40 | 42 | ||
41 | #include "internal.h" | 43 | #include "internal.h" |
42 | 44 | ||
@@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | |||
234 | #ifdef CONFIG_CGROUP_PERF | 236 | #ifdef CONFIG_CGROUP_PERF |
235 | 237 | ||
236 | /* | 238 | /* |
239 | * perf_cgroup_info keeps track of time_enabled for a cgroup. | ||
240 | * This is a per-cpu dynamically allocated data structure. | ||
241 | */ | ||
242 | struct perf_cgroup_info { | ||
243 | u64 time; | ||
244 | u64 timestamp; | ||
245 | }; | ||
246 | |||
247 | struct perf_cgroup { | ||
248 | struct cgroup_subsys_state css; | ||
249 | struct perf_cgroup_info __percpu *info; | ||
250 | }; | ||
251 | |||
252 | /* | ||
237 | * Must ensure cgroup is pinned (css_get) before calling | 253 | * Must ensure cgroup is pinned (css_get) before calling |
238 | * this function. In other words, we cannot call this function | 254 | * this function. In other words, we cannot call this function |
239 | * if there is no cgroup event for the current CPU context. | 255 | * if there is no cgroup event for the current CPU context. |
@@ -251,7 +267,22 @@ perf_cgroup_match(struct perf_event *event) | |||
251 | struct perf_event_context *ctx = event->ctx; | 267 | struct perf_event_context *ctx = event->ctx; |
252 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 268 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
253 | 269 | ||
254 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | 270 | /* @event doesn't care about cgroup */ |
271 | if (!event->cgrp) | ||
272 | return true; | ||
273 | |||
274 | /* wants specific cgroup scope but @cpuctx isn't associated with any */ | ||
275 | if (!cpuctx->cgrp) | ||
276 | return false; | ||
277 | |||
278 | /* | ||
279 | * Cgroup scoping is recursive. An event enabled for a cgroup is | ||
280 | * also enabled for all its descendant cgroups. If @cpuctx's | ||
281 | * cgroup is a descendant of @event's (the test covers identity | ||
282 | * case), it's a match. | ||
283 | */ | ||
284 | return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, | ||
285 | event->cgrp->css.cgroup); | ||
255 | } | 286 | } |
256 | 287 | ||
257 | static inline bool perf_tryget_cgroup(struct perf_event *event) | 288 | static inline bool perf_tryget_cgroup(struct perf_event *event) |
@@ -655,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu) | |||
655 | 686 | ||
656 | WARN_ON(!irqs_disabled()); | 687 | WARN_ON(!irqs_disabled()); |
657 | 688 | ||
658 | if (list_empty(&cpuctx->rotation_list)) | 689 | if (list_empty(&cpuctx->rotation_list)) { |
690 | int was_empty = list_empty(head); | ||
659 | list_add(&cpuctx->rotation_list, head); | 691 | list_add(&cpuctx->rotation_list, head); |
692 | if (was_empty) | ||
693 | tick_nohz_full_kick(); | ||
694 | } | ||
660 | } | 695 | } |
661 | 696 | ||
662 | static void get_ctx(struct perf_event_context *ctx) | 697 | static void get_ctx(struct perf_event_context *ctx) |
@@ -961,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event) | |||
961 | if (sample_type & PERF_SAMPLE_PERIOD) | 996 | if (sample_type & PERF_SAMPLE_PERIOD) |
962 | size += sizeof(data->period); | 997 | size += sizeof(data->period); |
963 | 998 | ||
999 | if (sample_type & PERF_SAMPLE_WEIGHT) | ||
1000 | size += sizeof(data->weight); | ||
1001 | |||
964 | if (sample_type & PERF_SAMPLE_READ) | 1002 | if (sample_type & PERF_SAMPLE_READ) |
965 | size += event->read_size; | 1003 | size += event->read_size; |
966 | 1004 | ||
1005 | if (sample_type & PERF_SAMPLE_DATA_SRC) | ||
1006 | size += sizeof(data->data_src.val); | ||
1007 | |||
967 | event->header_size = size; | 1008 | event->header_size = size; |
968 | } | 1009 | } |
969 | 1010 | ||
@@ -2555,6 +2596,16 @@ done: | |||
2555 | list_del_init(&cpuctx->rotation_list); | 2596 | list_del_init(&cpuctx->rotation_list); |
2556 | } | 2597 | } |
2557 | 2598 | ||
2599 | #ifdef CONFIG_NO_HZ_FULL | ||
2600 | bool perf_event_can_stop_tick(void) | ||
2601 | { | ||
2602 | if (list_empty(&__get_cpu_var(rotation_list))) | ||
2603 | return true; | ||
2604 | else | ||
2605 | return false; | ||
2606 | } | ||
2607 | #endif | ||
2608 | |||
2558 | void perf_event_task_tick(void) | 2609 | void perf_event_task_tick(void) |
2559 | { | 2610 | { |
2560 | struct list_head *head = &__get_cpu_var(rotation_list); | 2611 | struct list_head *head = &__get_cpu_var(rotation_list); |
@@ -4178,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4178 | perf_output_sample_ustack(handle, | 4229 | perf_output_sample_ustack(handle, |
4179 | data->stack_user_size, | 4230 | data->stack_user_size, |
4180 | data->regs_user.regs); | 4231 | data->regs_user.regs); |
4232 | |||
4233 | if (sample_type & PERF_SAMPLE_WEIGHT) | ||
4234 | perf_output_put(handle, data->weight); | ||
4235 | |||
4236 | if (sample_type & PERF_SAMPLE_DATA_SRC) | ||
4237 | perf_output_put(handle, data->data_src.val); | ||
4181 | } | 4238 | } |
4182 | 4239 | ||
4183 | void perf_prepare_sample(struct perf_event_header *header, | 4240 | void perf_prepare_sample(struct perf_event_header *header, |
@@ -4337,6 +4394,64 @@ perf_event_read_event(struct perf_event *event, | |||
4337 | perf_output_end(&handle); | 4394 | perf_output_end(&handle); |
4338 | } | 4395 | } |
4339 | 4396 | ||
4397 | typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); | ||
4398 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); | ||
4399 | |||
4400 | static void | ||
4401 | perf_event_aux_ctx(struct perf_event_context *ctx, | ||
4402 | perf_event_aux_match_cb match, | ||
4403 | perf_event_aux_output_cb output, | ||
4404 | void *data) | ||
4405 | { | ||
4406 | struct perf_event *event; | ||
4407 | |||
4408 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | ||
4409 | if (event->state < PERF_EVENT_STATE_INACTIVE) | ||
4410 | continue; | ||
4411 | if (!event_filter_match(event)) | ||
4412 | continue; | ||
4413 | if (match(event, data)) | ||
4414 | output(event, data); | ||
4415 | } | ||
4416 | } | ||
4417 | |||
4418 | static void | ||
4419 | perf_event_aux(perf_event_aux_match_cb match, | ||
4420 | perf_event_aux_output_cb output, | ||
4421 | void *data, | ||
4422 | struct perf_event_context *task_ctx) | ||
4423 | { | ||
4424 | struct perf_cpu_context *cpuctx; | ||
4425 | struct perf_event_context *ctx; | ||
4426 | struct pmu *pmu; | ||
4427 | int ctxn; | ||
4428 | |||
4429 | rcu_read_lock(); | ||
4430 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
4431 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | ||
4432 | if (cpuctx->unique_pmu != pmu) | ||
4433 | goto next; | ||
4434 | perf_event_aux_ctx(&cpuctx->ctx, match, output, data); | ||
4435 | if (task_ctx) | ||
4436 | goto next; | ||
4437 | ctxn = pmu->task_ctx_nr; | ||
4438 | if (ctxn < 0) | ||
4439 | goto next; | ||
4440 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4441 | if (ctx) | ||
4442 | perf_event_aux_ctx(ctx, match, output, data); | ||
4443 | next: | ||
4444 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4445 | } | ||
4446 | |||
4447 | if (task_ctx) { | ||
4448 | preempt_disable(); | ||
4449 | perf_event_aux_ctx(task_ctx, match, output, data); | ||
4450 | preempt_enable(); | ||
4451 | } | ||
4452 | rcu_read_unlock(); | ||
4453 | } | ||
4454 | |||
4340 | /* | 4455 | /* |
4341 | * task tracking -- fork/exit | 4456 | * task tracking -- fork/exit |
4342 | * | 4457 | * |
@@ -4359,8 +4474,9 @@ struct perf_task_event { | |||
4359 | }; | 4474 | }; |
4360 | 4475 | ||
4361 | static void perf_event_task_output(struct perf_event *event, | 4476 | static void perf_event_task_output(struct perf_event *event, |
4362 | struct perf_task_event *task_event) | 4477 | void *data) |
4363 | { | 4478 | { |
4479 | struct perf_task_event *task_event = data; | ||
4364 | struct perf_output_handle handle; | 4480 | struct perf_output_handle handle; |
4365 | struct perf_sample_data sample; | 4481 | struct perf_sample_data sample; |
4366 | struct task_struct *task = task_event->task; | 4482 | struct task_struct *task = task_event->task; |
@@ -4388,59 +4504,11 @@ out: | |||
4388 | task_event->event_id.header.size = size; | 4504 | task_event->event_id.header.size = size; |
4389 | } | 4505 | } |
4390 | 4506 | ||
4391 | static int perf_event_task_match(struct perf_event *event) | 4507 | static int perf_event_task_match(struct perf_event *event, |
4392 | { | 4508 | void *data __maybe_unused) |
4393 | if (event->state < PERF_EVENT_STATE_INACTIVE) | ||
4394 | return 0; | ||
4395 | |||
4396 | if (!event_filter_match(event)) | ||
4397 | return 0; | ||
4398 | |||
4399 | if (event->attr.comm || event->attr.mmap || | ||
4400 | event->attr.mmap_data || event->attr.task) | ||
4401 | return 1; | ||
4402 | |||
4403 | return 0; | ||
4404 | } | ||
4405 | |||
4406 | static void perf_event_task_ctx(struct perf_event_context *ctx, | ||
4407 | struct perf_task_event *task_event) | ||
4408 | { | 4509 | { |
4409 | struct perf_event *event; | 4510 | return event->attr.comm || event->attr.mmap || |
4410 | 4511 | event->attr.mmap_data || event->attr.task; | |
4411 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | ||
4412 | if (perf_event_task_match(event)) | ||
4413 | perf_event_task_output(event, task_event); | ||
4414 | } | ||
4415 | } | ||
4416 | |||
4417 | static void perf_event_task_event(struct perf_task_event *task_event) | ||
4418 | { | ||
4419 | struct perf_cpu_context *cpuctx; | ||
4420 | struct perf_event_context *ctx; | ||
4421 | struct pmu *pmu; | ||
4422 | int ctxn; | ||
4423 | |||
4424 | rcu_read_lock(); | ||
4425 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
4426 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | ||
4427 | if (cpuctx->unique_pmu != pmu) | ||
4428 | goto next; | ||
4429 | perf_event_task_ctx(&cpuctx->ctx, task_event); | ||
4430 | |||
4431 | ctx = task_event->task_ctx; | ||
4432 | if (!ctx) { | ||
4433 | ctxn = pmu->task_ctx_nr; | ||
4434 | if (ctxn < 0) | ||
4435 | goto next; | ||
4436 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4437 | } | ||
4438 | if (ctx) | ||
4439 | perf_event_task_ctx(ctx, task_event); | ||
4440 | next: | ||
4441 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4442 | } | ||
4443 | rcu_read_unlock(); | ||
4444 | } | 4512 | } |
4445 | 4513 | ||
4446 | static void perf_event_task(struct task_struct *task, | 4514 | static void perf_event_task(struct task_struct *task, |
@@ -4471,7 +4539,10 @@ static void perf_event_task(struct task_struct *task, | |||
4471 | }, | 4539 | }, |
4472 | }; | 4540 | }; |
4473 | 4541 | ||
4474 | perf_event_task_event(&task_event); | 4542 | perf_event_aux(perf_event_task_match, |
4543 | perf_event_task_output, | ||
4544 | &task_event, | ||
4545 | task_ctx); | ||
4475 | } | 4546 | } |
4476 | 4547 | ||
4477 | void perf_event_fork(struct task_struct *task) | 4548 | void perf_event_fork(struct task_struct *task) |
@@ -4497,8 +4568,9 @@ struct perf_comm_event { | |||
4497 | }; | 4568 | }; |
4498 | 4569 | ||
4499 | static void perf_event_comm_output(struct perf_event *event, | 4570 | static void perf_event_comm_output(struct perf_event *event, |
4500 | struct perf_comm_event *comm_event) | 4571 | void *data) |
4501 | { | 4572 | { |
4573 | struct perf_comm_event *comm_event = data; | ||
4502 | struct perf_output_handle handle; | 4574 | struct perf_output_handle handle; |
4503 | struct perf_sample_data sample; | 4575 | struct perf_sample_data sample; |
4504 | int size = comm_event->event_id.header.size; | 4576 | int size = comm_event->event_id.header.size; |
@@ -4525,39 +4597,16 @@ out: | |||
4525 | comm_event->event_id.header.size = size; | 4597 | comm_event->event_id.header.size = size; |
4526 | } | 4598 | } |
4527 | 4599 | ||
4528 | static int perf_event_comm_match(struct perf_event *event) | 4600 | static int perf_event_comm_match(struct perf_event *event, |
4601 | void *data __maybe_unused) | ||
4529 | { | 4602 | { |
4530 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4603 | return event->attr.comm; |
4531 | return 0; | ||
4532 | |||
4533 | if (!event_filter_match(event)) | ||
4534 | return 0; | ||
4535 | |||
4536 | if (event->attr.comm) | ||
4537 | return 1; | ||
4538 | |||
4539 | return 0; | ||
4540 | } | ||
4541 | |||
4542 | static void perf_event_comm_ctx(struct perf_event_context *ctx, | ||
4543 | struct perf_comm_event *comm_event) | ||
4544 | { | ||
4545 | struct perf_event *event; | ||
4546 | |||
4547 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | ||
4548 | if (perf_event_comm_match(event)) | ||
4549 | perf_event_comm_output(event, comm_event); | ||
4550 | } | ||
4551 | } | 4604 | } |
4552 | 4605 | ||
4553 | static void perf_event_comm_event(struct perf_comm_event *comm_event) | 4606 | static void perf_event_comm_event(struct perf_comm_event *comm_event) |
4554 | { | 4607 | { |
4555 | struct perf_cpu_context *cpuctx; | ||
4556 | struct perf_event_context *ctx; | ||
4557 | char comm[TASK_COMM_LEN]; | 4608 | char comm[TASK_COMM_LEN]; |
4558 | unsigned int size; | 4609 | unsigned int size; |
4559 | struct pmu *pmu; | ||
4560 | int ctxn; | ||
4561 | 4610 | ||
4562 | memset(comm, 0, sizeof(comm)); | 4611 | memset(comm, 0, sizeof(comm)); |
4563 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); | 4612 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
@@ -4567,24 +4616,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
4567 | comm_event->comm_size = size; | 4616 | comm_event->comm_size = size; |
4568 | 4617 | ||
4569 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4618 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
4570 | rcu_read_lock(); | ||
4571 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
4572 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | ||
4573 | if (cpuctx->unique_pmu != pmu) | ||
4574 | goto next; | ||
4575 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | ||
4576 | 4619 | ||
4577 | ctxn = pmu->task_ctx_nr; | 4620 | perf_event_aux(perf_event_comm_match, |
4578 | if (ctxn < 0) | 4621 | perf_event_comm_output, |
4579 | goto next; | 4622 | comm_event, |
4580 | 4623 | NULL); | |
4581 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4582 | if (ctx) | ||
4583 | perf_event_comm_ctx(ctx, comm_event); | ||
4584 | next: | ||
4585 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4586 | } | ||
4587 | rcu_read_unlock(); | ||
4588 | } | 4624 | } |
4589 | 4625 | ||
4590 | void perf_event_comm(struct task_struct *task) | 4626 | void perf_event_comm(struct task_struct *task) |
@@ -4593,6 +4629,7 @@ void perf_event_comm(struct task_struct *task) | |||
4593 | struct perf_event_context *ctx; | 4629 | struct perf_event_context *ctx; |
4594 | int ctxn; | 4630 | int ctxn; |
4595 | 4631 | ||
4632 | rcu_read_lock(); | ||
4596 | for_each_task_context_nr(ctxn) { | 4633 | for_each_task_context_nr(ctxn) { |
4597 | ctx = task->perf_event_ctxp[ctxn]; | 4634 | ctx = task->perf_event_ctxp[ctxn]; |
4598 | if (!ctx) | 4635 | if (!ctx) |
@@ -4600,6 +4637,7 @@ void perf_event_comm(struct task_struct *task) | |||
4600 | 4637 | ||
4601 | perf_event_enable_on_exec(ctx); | 4638 | perf_event_enable_on_exec(ctx); |
4602 | } | 4639 | } |
4640 | rcu_read_unlock(); | ||
4603 | 4641 | ||
4604 | if (!atomic_read(&nr_comm_events)) | 4642 | if (!atomic_read(&nr_comm_events)) |
4605 | return; | 4643 | return; |
@@ -4644,8 +4682,9 @@ struct perf_mmap_event { | |||
4644 | }; | 4682 | }; |
4645 | 4683 | ||
4646 | static void perf_event_mmap_output(struct perf_event *event, | 4684 | static void perf_event_mmap_output(struct perf_event *event, |
4647 | struct perf_mmap_event *mmap_event) | 4685 | void *data) |
4648 | { | 4686 | { |
4687 | struct perf_mmap_event *mmap_event = data; | ||
4649 | struct perf_output_handle handle; | 4688 | struct perf_output_handle handle; |
4650 | struct perf_sample_data sample; | 4689 | struct perf_sample_data sample; |
4651 | int size = mmap_event->event_id.header.size; | 4690 | int size = mmap_event->event_id.header.size; |
@@ -4672,46 +4711,24 @@ out: | |||
4672 | } | 4711 | } |
4673 | 4712 | ||
4674 | static int perf_event_mmap_match(struct perf_event *event, | 4713 | static int perf_event_mmap_match(struct perf_event *event, |
4675 | struct perf_mmap_event *mmap_event, | 4714 | void *data) |
4676 | int executable) | ||
4677 | { | ||
4678 | if (event->state < PERF_EVENT_STATE_INACTIVE) | ||
4679 | return 0; | ||
4680 | |||
4681 | if (!event_filter_match(event)) | ||
4682 | return 0; | ||
4683 | |||
4684 | if ((!executable && event->attr.mmap_data) || | ||
4685 | (executable && event->attr.mmap)) | ||
4686 | return 1; | ||
4687 | |||
4688 | return 0; | ||
4689 | } | ||
4690 | |||
4691 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, | ||
4692 | struct perf_mmap_event *mmap_event, | ||
4693 | int executable) | ||
4694 | { | 4715 | { |
4695 | struct perf_event *event; | 4716 | struct perf_mmap_event *mmap_event = data; |
4717 | struct vm_area_struct *vma = mmap_event->vma; | ||
4718 | int executable = vma->vm_flags & VM_EXEC; | ||
4696 | 4719 | ||
4697 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 4720 | return (!executable && event->attr.mmap_data) || |
4698 | if (perf_event_mmap_match(event, mmap_event, executable)) | 4721 | (executable && event->attr.mmap); |
4699 | perf_event_mmap_output(event, mmap_event); | ||
4700 | } | ||
4701 | } | 4722 | } |
4702 | 4723 | ||
4703 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | 4724 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) |
4704 | { | 4725 | { |
4705 | struct perf_cpu_context *cpuctx; | ||
4706 | struct perf_event_context *ctx; | ||
4707 | struct vm_area_struct *vma = mmap_event->vma; | 4726 | struct vm_area_struct *vma = mmap_event->vma; |
4708 | struct file *file = vma->vm_file; | 4727 | struct file *file = vma->vm_file; |
4709 | unsigned int size; | 4728 | unsigned int size; |
4710 | char tmp[16]; | 4729 | char tmp[16]; |
4711 | char *buf = NULL; | 4730 | char *buf = NULL; |
4712 | const char *name; | 4731 | const char *name; |
4713 | struct pmu *pmu; | ||
4714 | int ctxn; | ||
4715 | 4732 | ||
4716 | memset(tmp, 0, sizeof(tmp)); | 4733 | memset(tmp, 0, sizeof(tmp)); |
4717 | 4734 | ||
@@ -4734,7 +4751,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
4734 | } else { | 4751 | } else { |
4735 | if (arch_vma_name(mmap_event->vma)) { | 4752 | if (arch_vma_name(mmap_event->vma)) { |
4736 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 4753 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), |
4737 | sizeof(tmp)); | 4754 | sizeof(tmp) - 1); |
4755 | tmp[sizeof(tmp) - 1] = '\0'; | ||
4738 | goto got_name; | 4756 | goto got_name; |
4739 | } | 4757 | } |
4740 | 4758 | ||
@@ -4761,29 +4779,15 @@ got_name: | |||
4761 | mmap_event->file_name = name; | 4779 | mmap_event->file_name = name; |
4762 | mmap_event->file_size = size; | 4780 | mmap_event->file_size = size; |
4763 | 4781 | ||
4764 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 4782 | if (!(vma->vm_flags & VM_EXEC)) |
4765 | 4783 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; | |
4766 | rcu_read_lock(); | ||
4767 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
4768 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | ||
4769 | if (cpuctx->unique_pmu != pmu) | ||
4770 | goto next; | ||
4771 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, | ||
4772 | vma->vm_flags & VM_EXEC); | ||
4773 | 4784 | ||
4774 | ctxn = pmu->task_ctx_nr; | 4785 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
4775 | if (ctxn < 0) | ||
4776 | goto next; | ||
4777 | 4786 | ||
4778 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 4787 | perf_event_aux(perf_event_mmap_match, |
4779 | if (ctx) { | 4788 | perf_event_mmap_output, |
4780 | perf_event_mmap_ctx(ctx, mmap_event, | 4789 | mmap_event, |
4781 | vma->vm_flags & VM_EXEC); | 4790 | NULL); |
4782 | } | ||
4783 | next: | ||
4784 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4785 | } | ||
4786 | rcu_read_unlock(); | ||
4787 | 4791 | ||
4788 | kfree(buf); | 4792 | kfree(buf); |
4789 | } | 4793 | } |
@@ -5327,7 +5331,7 @@ static void sw_perf_event_destroy(struct perf_event *event) | |||
5327 | 5331 | ||
5328 | static int perf_swevent_init(struct perf_event *event) | 5332 | static int perf_swevent_init(struct perf_event *event) |
5329 | { | 5333 | { |
5330 | int event_id = event->attr.config; | 5334 | u64 event_id = event->attr.config; |
5331 | 5335 | ||
5332 | if (event->attr.type != PERF_TYPE_SOFTWARE) | 5336 | if (event->attr.type != PERF_TYPE_SOFTWARE) |
5333 | return -ENOENT; | 5337 | return -ENOENT; |
@@ -5647,6 +5651,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) | |||
5647 | event->attr.sample_period = NSEC_PER_SEC / freq; | 5651 | event->attr.sample_period = NSEC_PER_SEC / freq; |
5648 | hwc->sample_period = event->attr.sample_period; | 5652 | hwc->sample_period = event->attr.sample_period; |
5649 | local64_set(&hwc->period_left, hwc->sample_period); | 5653 | local64_set(&hwc->period_left, hwc->sample_period); |
5654 | hwc->last_period = hwc->sample_period; | ||
5650 | event->attr.freq = 0; | 5655 | event->attr.freq = 0; |
5651 | } | 5656 | } |
5652 | } | 5657 | } |
@@ -5982,6 +5987,7 @@ skip_type: | |||
5982 | if (pmu->pmu_cpu_context) | 5987 | if (pmu->pmu_cpu_context) |
5983 | goto got_cpu_context; | 5988 | goto got_cpu_context; |
5984 | 5989 | ||
5990 | ret = -ENOMEM; | ||
5985 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | 5991 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); |
5986 | if (!pmu->pmu_cpu_context) | 5992 | if (!pmu->pmu_cpu_context) |
5987 | goto free_dev; | 5993 | goto free_dev; |
@@ -7509,12 +7515,5 @@ struct cgroup_subsys perf_subsys = { | |||
7509 | .css_free = perf_cgroup_css_free, | 7515 | .css_free = perf_cgroup_css_free, |
7510 | .exit = perf_cgroup_exit, | 7516 | .exit = perf_cgroup_exit, |
7511 | .attach = perf_cgroup_attach, | 7517 | .attach = perf_cgroup_attach, |
7512 | |||
7513 | /* | ||
7514 | * perf_event cgroup doesn't handle nesting correctly. | ||
7515 | * ctx->nr_cgroups adjustments should be propagated through the | ||
7516 | * cgroup hierarchy. Fix it and remove the following. | ||
7517 | */ | ||
7518 | .broken_hierarchy = true, | ||
7519 | }; | 7518 | }; |
7520 | #endif /* CONFIG_CGROUP_PERF */ | 7519 | #endif /* CONFIG_CGROUP_PERF */ |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8b..eb675c4d59df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -16,7 +16,7 @@ struct ring_buffer { | |||
16 | int page_order; /* allocation order */ | 16 | int page_order; /* allocation order */ |
17 | #endif | 17 | #endif |
18 | int nr_pages; /* nr of data pages */ | 18 | int nr_pages; /* nr of data pages */ |
19 | int writable; /* are we writable */ | 19 | int overwrite; /* can overwrite itself */ |
20 | 20 | ||
21 | atomic_t poll; /* POLL_ for wakeups */ | 21 | atomic_t poll; /* POLL_ for wakeups */ |
22 | 22 | ||
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff3973..cd55144270b5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -18,12 +18,24 @@ | |||
18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | 18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, |
19 | unsigned long offset, unsigned long head) | 19 | unsigned long offset, unsigned long head) |
20 | { | 20 | { |
21 | unsigned long mask; | 21 | unsigned long sz = perf_data_size(rb); |
22 | unsigned long mask = sz - 1; | ||
22 | 23 | ||
23 | if (!rb->writable) | 24 | /* |
25 | * check if user-writable | ||
26 | * overwrite : over-write its own tail | ||
27 | * !overwrite: buffer possibly drops events. | ||
28 | */ | ||
29 | if (rb->overwrite) | ||
24 | return true; | 30 | return true; |
25 | 31 | ||
26 | mask = perf_data_size(rb) - 1; | 32 | /* |
33 | * verify that payload is not bigger than buffer | ||
34 | * otherwise masking logic may fail to detect | ||
35 | * the "not enough space" condition | ||
36 | */ | ||
37 | if ((head - offset) > sz) | ||
38 | return false; | ||
27 | 39 | ||
28 | offset = (offset - tail) & mask; | 40 | offset = (offset - tail) & mask; |
29 | head = (head - tail) & mask; | 41 | head = (head - tail) & mask; |
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | |||
212 | rb->watermark = max_size / 2; | 224 | rb->watermark = max_size / 2; |
213 | 225 | ||
214 | if (flags & RING_BUFFER_WRITABLE) | 226 | if (flags & RING_BUFFER_WRITABLE) |
215 | rb->writable = 1; | 227 | rb->overwrite = 0; |
228 | else | ||
229 | rb->overwrite = 1; | ||
216 | 230 | ||
217 | atomic_set(&rb->refcount, 1); | 231 | atomic_set(&rb->refcount, 1); |
218 | 232 | ||
@@ -312,11 +326,16 @@ void rb_free(struct ring_buffer *rb) | |||
312 | } | 326 | } |
313 | 327 | ||
314 | #else | 328 | #else |
329 | static int data_page_nr(struct ring_buffer *rb) | ||
330 | { | ||
331 | return rb->nr_pages << page_order(rb); | ||
332 | } | ||
315 | 333 | ||
316 | struct page * | 334 | struct page * |
317 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 335 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) |
318 | { | 336 | { |
319 | if (pgoff > (1UL << page_order(rb))) | 337 | /* The '>' counts in the user page. */ |
338 | if (pgoff > data_page_nr(rb)) | ||
320 | return NULL; | 339 | return NULL; |
321 | 340 | ||
322 | return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); | 341 | return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); |
@@ -336,10 +355,11 @@ static void rb_free_work(struct work_struct *work) | |||
336 | int i, nr; | 355 | int i, nr; |
337 | 356 | ||
338 | rb = container_of(work, struct ring_buffer, work); | 357 | rb = container_of(work, struct ring_buffer, work); |
339 | nr = 1 << page_order(rb); | 358 | nr = data_page_nr(rb); |
340 | 359 | ||
341 | base = rb->user_page; | 360 | base = rb->user_page; |
342 | for (i = 0; i < nr + 1; i++) | 361 | /* The '<=' counts in the user page. */ |
362 | for (i = 0; i <= nr; i++) | ||
343 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | 363 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); |
344 | 364 | ||
345 | vfree(base); | 365 | vfree(base); |
@@ -373,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | |||
373 | rb->user_page = all_buf; | 393 | rb->user_page = all_buf; |
374 | rb->data_pages[0] = all_buf + PAGE_SIZE; | 394 | rb->data_pages[0] = all_buf + PAGE_SIZE; |
375 | rb->page_order = ilog2(nr_pages); | 395 | rb->page_order = ilog2(nr_pages); |
376 | rb->nr_pages = 1; | 396 | rb->nr_pages = !!nr_pages; |
377 | 397 | ||
378 | ring_buffer_init(rb, watermark, flags); | 398 | ring_buffer_init(rb, watermark, flags); |
379 | 399 | ||
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a567c8c7ef31..f3569747d629 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -75,6 +75,15 @@ struct uprobe { | |||
75 | struct arch_uprobe arch; | 75 | struct arch_uprobe arch; |
76 | }; | 76 | }; |
77 | 77 | ||
78 | struct return_instance { | ||
79 | struct uprobe *uprobe; | ||
80 | unsigned long func; | ||
81 | unsigned long orig_ret_vaddr; /* original return address */ | ||
82 | bool chained; /* true, if instance is nested */ | ||
83 | |||
84 | struct return_instance *next; /* keep as stack */ | ||
85 | }; | ||
86 | |||
78 | /* | 87 | /* |
79 | * valid_vma: Verify if the specified vma is an executable vma | 88 | * valid_vma: Verify if the specified vma is an executable vma |
80 | * Relax restrictions while unregistering: vm_flags might have | 89 | * Relax restrictions while unregistering: vm_flags might have |
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) | |||
173 | return *insn == UPROBE_SWBP_INSN; | 182 | return *insn == UPROBE_SWBP_INSN; |
174 | } | 183 | } |
175 | 184 | ||
176 | static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) | 185 | /** |
186 | * is_trap_insn - check if instruction is breakpoint instruction. | ||
187 | * @insn: instruction to be checked. | ||
188 | * Default implementation of is_trap_insn | ||
189 | * Returns true if @insn is a breakpoint instruction. | ||
190 | * | ||
191 | * This function is needed for the case where an architecture has multiple | ||
192 | * trap instructions (like powerpc). | ||
193 | */ | ||
194 | bool __weak is_trap_insn(uprobe_opcode_t *insn) | ||
195 | { | ||
196 | return is_swbp_insn(insn); | ||
197 | } | ||
198 | |||
199 | static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) | ||
177 | { | 200 | { |
178 | void *kaddr = kmap_atomic(page); | 201 | void *kaddr = kmap_atomic(page); |
179 | memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); | 202 | memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); |
203 | kunmap_atomic(kaddr); | ||
204 | } | ||
205 | |||
206 | static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) | ||
207 | { | ||
208 | void *kaddr = kmap_atomic(page); | ||
209 | memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); | ||
180 | kunmap_atomic(kaddr); | 210 | kunmap_atomic(kaddr); |
181 | } | 211 | } |
182 | 212 | ||
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
185 | uprobe_opcode_t old_opcode; | 215 | uprobe_opcode_t old_opcode; |
186 | bool is_swbp; | 216 | bool is_swbp; |
187 | 217 | ||
188 | copy_opcode(page, vaddr, &old_opcode); | 218 | /* |
219 | * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. | ||
220 | * We do not check if it is any other 'trap variant' which could | ||
221 | * be conditional trap instruction such as the one powerpc supports. | ||
222 | * | ||
223 | * The logic is that we do not care if the underlying instruction | ||
224 | * is a trap variant; uprobes always wins over any other (gdb) | ||
225 | * breakpoint. | ||
226 | */ | ||
227 | copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); | ||
189 | is_swbp = is_swbp_insn(&old_opcode); | 228 | is_swbp = is_swbp_insn(&old_opcode); |
190 | 229 | ||
191 | if (is_swbp_insn(new_opcode)) { | 230 | if (is_swbp_insn(new_opcode)) { |
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
204 | * Expect the breakpoint instruction to be the smallest size instruction for | 243 | * Expect the breakpoint instruction to be the smallest size instruction for |
205 | * the architecture. If an arch has variable length instruction and the | 244 | * the architecture. If an arch has variable length instruction and the |
206 | * breakpoint instruction is not of the smallest length instruction | 245 | * breakpoint instruction is not of the smallest length instruction |
207 | * supported by that architecture then we need to modify is_swbp_at_addr and | 246 | * supported by that architecture then we need to modify is_trap_at_addr and |
208 | * write_opcode accordingly. This would never be a problem for archs that | 247 | * write_opcode accordingly. This would never be a problem for archs that |
209 | * have fixed length instructions. | 248 | * have fixed length instructions. |
210 | */ | 249 | */ |
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr, | |||
225 | uprobe_opcode_t opcode) | 264 | uprobe_opcode_t opcode) |
226 | { | 265 | { |
227 | struct page *old_page, *new_page; | 266 | struct page *old_page, *new_page; |
228 | void *vaddr_old, *vaddr_new; | ||
229 | struct vm_area_struct *vma; | 267 | struct vm_area_struct *vma; |
230 | int ret; | 268 | int ret; |
231 | 269 | ||
@@ -246,15 +284,8 @@ retry: | |||
246 | 284 | ||
247 | __SetPageUptodate(new_page); | 285 | __SetPageUptodate(new_page); |
248 | 286 | ||
249 | /* copy the page now that we've got it stable */ | 287 | copy_highpage(new_page, old_page); |
250 | vaddr_old = kmap_atomic(old_page); | 288 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); |
251 | vaddr_new = kmap_atomic(new_page); | ||
252 | |||
253 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); | ||
254 | memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); | ||
255 | |||
256 | kunmap_atomic(vaddr_new); | ||
257 | kunmap_atomic(vaddr_old); | ||
258 | 289 | ||
259 | ret = anon_vma_prepare(vma); | 290 | ret = anon_vma_prepare(vma); |
260 | if (ret) | 291 | if (ret) |
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | |||
477 | unsigned long nbytes, loff_t offset) | 508 | unsigned long nbytes, loff_t offset) |
478 | { | 509 | { |
479 | struct page *page; | 510 | struct page *page; |
480 | void *vaddr; | ||
481 | unsigned long off; | ||
482 | pgoff_t idx; | ||
483 | |||
484 | if (!filp) | ||
485 | return -EINVAL; | ||
486 | 511 | ||
487 | if (!mapping->a_ops->readpage) | 512 | if (!mapping->a_ops->readpage) |
488 | return -EIO; | 513 | return -EIO; |
489 | |||
490 | idx = offset >> PAGE_CACHE_SHIFT; | ||
491 | off = offset & ~PAGE_MASK; | ||
492 | |||
493 | /* | 514 | /* |
494 | * Ensure that the page that has the original instruction is | 515 | * Ensure that the page that has the original instruction is |
495 | * populated and in page-cache. | 516 | * populated and in page-cache. |
496 | */ | 517 | */ |
497 | page = read_mapping_page(mapping, idx, filp); | 518 | page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); |
498 | if (IS_ERR(page)) | 519 | if (IS_ERR(page)) |
499 | return PTR_ERR(page); | 520 | return PTR_ERR(page); |
500 | 521 | ||
501 | vaddr = kmap_atomic(page); | 522 | copy_from_page(page, offset, insn, nbytes); |
502 | memcpy(insn, vaddr + off, nbytes); | ||
503 | kunmap_atomic(vaddr); | ||
504 | page_cache_release(page); | 523 | page_cache_release(page); |
505 | 524 | ||
506 | return 0; | 525 | return 0; |
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
550 | goto out; | 569 | goto out; |
551 | 570 | ||
552 | ret = -ENOTSUPP; | 571 | ret = -ENOTSUPP; |
553 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) | 572 | if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) |
554 | goto out; | 573 | goto out; |
555 | 574 | ||
556 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); | 575 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); |
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) | |||
758 | down_write(&mm->mmap_sem); | 777 | down_write(&mm->mmap_sem); |
759 | vma = find_vma(mm, info->vaddr); | 778 | vma = find_vma(mm, info->vaddr); |
760 | if (!vma || !valid_vma(vma, is_register) || | 779 | if (!vma || !valid_vma(vma, is_register) || |
761 | vma->vm_file->f_mapping->host != uprobe->inode) | 780 | file_inode(vma->vm_file) != uprobe->inode) |
762 | goto unlock; | 781 | goto unlock; |
763 | 782 | ||
764 | if (vma->vm_start > info->vaddr || | 783 | if (vma->vm_start > info->vaddr || |
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * | |||
828 | struct uprobe *uprobe; | 847 | struct uprobe *uprobe; |
829 | int ret; | 848 | int ret; |
830 | 849 | ||
850 | /* Uprobe must have at least one set consumer */ | ||
851 | if (!uc->handler && !uc->ret_handler) | ||
852 | return -EINVAL; | ||
853 | |||
831 | /* Racy, just to catch the obvious mistakes */ | 854 | /* Racy, just to catch the obvious mistakes */ |
832 | if (offset > i_size_read(inode)) | 855 | if (offset > i_size_read(inode)) |
833 | return -EINVAL; | 856 | return -EINVAL; |
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) | |||
917 | loff_t offset; | 940 | loff_t offset; |
918 | 941 | ||
919 | if (!valid_vma(vma, false) || | 942 | if (!valid_vma(vma, false) || |
920 | vma->vm_file->f_mapping->host != uprobe->inode) | 943 | file_inode(vma->vm_file) != uprobe->inode) |
921 | continue; | 944 | continue; |
922 | 945 | ||
923 | offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; | 946 | offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; |
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
1010 | if (no_uprobe_events() || !valid_vma(vma, true)) | 1033 | if (no_uprobe_events() || !valid_vma(vma, true)) |
1011 | return 0; | 1034 | return 0; |
1012 | 1035 | ||
1013 | inode = vma->vm_file->f_mapping->host; | 1036 | inode = file_inode(vma->vm_file); |
1014 | if (!inode) | 1037 | if (!inode) |
1015 | return 0; | 1038 | return 0; |
1016 | 1039 | ||
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e | |||
1041 | struct inode *inode; | 1064 | struct inode *inode; |
1042 | struct rb_node *n; | 1065 | struct rb_node *n; |
1043 | 1066 | ||
1044 | inode = vma->vm_file->f_mapping->host; | 1067 | inode = file_inode(vma->vm_file); |
1045 | 1068 | ||
1046 | min = vaddr_to_offset(vma, start); | 1069 | min = vaddr_to_offset(vma, start); |
1047 | max = min + (end - start) - 1; | 1070 | max = min + (end - start) - 1; |
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void) | |||
1114 | { | 1137 | { |
1115 | struct mm_struct *mm = current->mm; | 1138 | struct mm_struct *mm = current->mm; |
1116 | struct xol_area *area; | 1139 | struct xol_area *area; |
1140 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; | ||
1117 | 1141 | ||
1118 | area = mm->uprobes_state.xol_area; | 1142 | area = mm->uprobes_state.xol_area; |
1119 | if (area) | 1143 | if (area) |
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void) | |||
1131 | if (!area->page) | 1155 | if (!area->page) |
1132 | goto free_bitmap; | 1156 | goto free_bitmap; |
1133 | 1157 | ||
1158 | /* allocate first slot of task's xol_area for the return probes */ | ||
1159 | set_bit(0, area->bitmap); | ||
1160 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); | ||
1161 | atomic_set(&area->slot_count, 1); | ||
1134 | init_waitqueue_head(&area->wq); | 1162 | init_waitqueue_head(&area->wq); |
1163 | |||
1135 | if (!xol_add_vma(area)) | 1164 | if (!xol_add_vma(area)) |
1136 | return area; | 1165 | return area; |
1137 | 1166 | ||
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) | |||
1216 | static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | 1245 | static unsigned long xol_get_insn_slot(struct uprobe *uprobe) |
1217 | { | 1246 | { |
1218 | struct xol_area *area; | 1247 | struct xol_area *area; |
1219 | unsigned long offset; | ||
1220 | unsigned long xol_vaddr; | 1248 | unsigned long xol_vaddr; |
1221 | void *vaddr; | ||
1222 | 1249 | ||
1223 | area = get_xol_area(); | 1250 | area = get_xol_area(); |
1224 | if (!area) | 1251 | if (!area) |
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
1229 | return 0; | 1256 | return 0; |
1230 | 1257 | ||
1231 | /* Initialize the slot */ | 1258 | /* Initialize the slot */ |
1232 | offset = xol_vaddr & ~PAGE_MASK; | 1259 | copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); |
1233 | vaddr = kmap_atomic(area->page); | ||
1234 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | ||
1235 | kunmap_atomic(vaddr); | ||
1236 | /* | 1260 | /* |
1237 | * We probably need flush_icache_user_range() but it needs vma. | 1261 | * We probably need flush_icache_user_range() but it needs vma. |
1238 | * This should work on supported architectures too. | 1262 | * This should work on supported architectures too. |
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) | |||
1298 | void uprobe_free_utask(struct task_struct *t) | 1322 | void uprobe_free_utask(struct task_struct *t) |
1299 | { | 1323 | { |
1300 | struct uprobe_task *utask = t->utask; | 1324 | struct uprobe_task *utask = t->utask; |
1325 | struct return_instance *ri, *tmp; | ||
1301 | 1326 | ||
1302 | if (!utask) | 1327 | if (!utask) |
1303 | return; | 1328 | return; |
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t) | |||
1305 | if (utask->active_uprobe) | 1330 | if (utask->active_uprobe) |
1306 | put_uprobe(utask->active_uprobe); | 1331 | put_uprobe(utask->active_uprobe); |
1307 | 1332 | ||
1333 | ri = utask->return_instances; | ||
1334 | while (ri) { | ||
1335 | tmp = ri; | ||
1336 | ri = ri->next; | ||
1337 | |||
1338 | put_uprobe(tmp->uprobe); | ||
1339 | kfree(tmp); | ||
1340 | } | ||
1341 | |||
1308 | xol_free_insn_slot(t); | 1342 | xol_free_insn_slot(t); |
1309 | kfree(utask); | 1343 | kfree(utask); |
1310 | t->utask = NULL; | 1344 | t->utask = NULL; |
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void) | |||
1333 | return current->utask; | 1367 | return current->utask; |
1334 | } | 1368 | } |
1335 | 1369 | ||
1370 | /* | ||
1371 | * Current area->vaddr notion assume the trampoline address is always | ||
1372 | * equal area->vaddr. | ||
1373 | * | ||
1374 | * Returns -1 in case the xol_area is not allocated. | ||
1375 | */ | ||
1376 | static unsigned long get_trampoline_vaddr(void) | ||
1377 | { | ||
1378 | struct xol_area *area; | ||
1379 | unsigned long trampoline_vaddr = -1; | ||
1380 | |||
1381 | area = current->mm->uprobes_state.xol_area; | ||
1382 | smp_read_barrier_depends(); | ||
1383 | if (area) | ||
1384 | trampoline_vaddr = area->vaddr; | ||
1385 | |||
1386 | return trampoline_vaddr; | ||
1387 | } | ||
1388 | |||
1389 | static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) | ||
1390 | { | ||
1391 | struct return_instance *ri; | ||
1392 | struct uprobe_task *utask; | ||
1393 | unsigned long orig_ret_vaddr, trampoline_vaddr; | ||
1394 | bool chained = false; | ||
1395 | |||
1396 | if (!get_xol_area()) | ||
1397 | return; | ||
1398 | |||
1399 | utask = get_utask(); | ||
1400 | if (!utask) | ||
1401 | return; | ||
1402 | |||
1403 | if (utask->depth >= MAX_URETPROBE_DEPTH) { | ||
1404 | printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" | ||
1405 | " nestedness limit pid/tgid=%d/%d\n", | ||
1406 | current->pid, current->tgid); | ||
1407 | return; | ||
1408 | } | ||
1409 | |||
1410 | ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); | ||
1411 | if (!ri) | ||
1412 | goto fail; | ||
1413 | |||
1414 | trampoline_vaddr = get_trampoline_vaddr(); | ||
1415 | orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); | ||
1416 | if (orig_ret_vaddr == -1) | ||
1417 | goto fail; | ||
1418 | |||
1419 | /* | ||
1420 | * We don't want to keep trampoline address in stack, rather keep the | ||
1421 | * original return address of first caller thru all the consequent | ||
1422 | * instances. This also makes breakpoint unwrapping easier. | ||
1423 | */ | ||
1424 | if (orig_ret_vaddr == trampoline_vaddr) { | ||
1425 | if (!utask->return_instances) { | ||
1426 | /* | ||
1427 | * This situation is not possible. Likely we have an | ||
1428 | * attack from user-space. | ||
1429 | */ | ||
1430 | pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", | ||
1431 | current->pid, current->tgid); | ||
1432 | goto fail; | ||
1433 | } | ||
1434 | |||
1435 | chained = true; | ||
1436 | orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; | ||
1437 | } | ||
1438 | |||
1439 | atomic_inc(&uprobe->ref); | ||
1440 | ri->uprobe = uprobe; | ||
1441 | ri->func = instruction_pointer(regs); | ||
1442 | ri->orig_ret_vaddr = orig_ret_vaddr; | ||
1443 | ri->chained = chained; | ||
1444 | |||
1445 | utask->depth++; | ||
1446 | |||
1447 | /* add instance to the stack */ | ||
1448 | ri->next = utask->return_instances; | ||
1449 | utask->return_instances = ri; | ||
1450 | |||
1451 | return; | ||
1452 | |||
1453 | fail: | ||
1454 | kfree(ri); | ||
1455 | } | ||
1456 | |||
1336 | /* Prepare to single-step probed instruction out of line. */ | 1457 | /* Prepare to single-step probed instruction out of line. */ |
1337 | static int | 1458 | static int |
1338 | pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) | 1459 | pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) |
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) | |||
1431 | clear_bit(MMF_HAS_UPROBES, &mm->flags); | 1552 | clear_bit(MMF_HAS_UPROBES, &mm->flags); |
1432 | } | 1553 | } |
1433 | 1554 | ||
1434 | static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | 1555 | static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) |
1435 | { | 1556 | { |
1436 | struct page *page; | 1557 | struct page *page; |
1437 | uprobe_opcode_t opcode; | 1558 | uprobe_opcode_t opcode; |
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
1449 | if (result < 0) | 1570 | if (result < 0) |
1450 | return result; | 1571 | return result; |
1451 | 1572 | ||
1452 | copy_opcode(page, vaddr, &opcode); | 1573 | copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); |
1453 | put_page(page); | 1574 | put_page(page); |
1454 | out: | 1575 | out: |
1455 | return is_swbp_insn(&opcode); | 1576 | /* This needs to return true for any variant of the trap insn */ |
1577 | return is_trap_insn(&opcode); | ||
1456 | } | 1578 | } |
1457 | 1579 | ||
1458 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | 1580 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) |
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | |||
1465 | vma = find_vma(mm, bp_vaddr); | 1587 | vma = find_vma(mm, bp_vaddr); |
1466 | if (vma && vma->vm_start <= bp_vaddr) { | 1588 | if (vma && vma->vm_start <= bp_vaddr) { |
1467 | if (valid_vma(vma, false)) { | 1589 | if (valid_vma(vma, false)) { |
1468 | struct inode *inode = vma->vm_file->f_mapping->host; | 1590 | struct inode *inode = file_inode(vma->vm_file); |
1469 | loff_t offset = vaddr_to_offset(vma, bp_vaddr); | 1591 | loff_t offset = vaddr_to_offset(vma, bp_vaddr); |
1470 | 1592 | ||
1471 | uprobe = find_uprobe(inode, offset); | 1593 | uprobe = find_uprobe(inode, offset); |
1472 | } | 1594 | } |
1473 | 1595 | ||
1474 | if (!uprobe) | 1596 | if (!uprobe) |
1475 | *is_swbp = is_swbp_at_addr(mm, bp_vaddr); | 1597 | *is_swbp = is_trap_at_addr(mm, bp_vaddr); |
1476 | } else { | 1598 | } else { |
1477 | *is_swbp = -EFAULT; | 1599 | *is_swbp = -EFAULT; |
1478 | } | 1600 | } |
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | |||
1488 | { | 1610 | { |
1489 | struct uprobe_consumer *uc; | 1611 | struct uprobe_consumer *uc; |
1490 | int remove = UPROBE_HANDLER_REMOVE; | 1612 | int remove = UPROBE_HANDLER_REMOVE; |
1613 | bool need_prep = false; /* prepare return uprobe, when needed */ | ||
1491 | 1614 | ||
1492 | down_read(&uprobe->register_rwsem); | 1615 | down_read(&uprobe->register_rwsem); |
1493 | for (uc = uprobe->consumers; uc; uc = uc->next) { | 1616 | for (uc = uprobe->consumers; uc; uc = uc->next) { |
1494 | int rc = uc->handler(uc, regs); | 1617 | int rc = 0; |
1618 | |||
1619 | if (uc->handler) { | ||
1620 | rc = uc->handler(uc, regs); | ||
1621 | WARN(rc & ~UPROBE_HANDLER_MASK, | ||
1622 | "bad rc=0x%x from %pf()\n", rc, uc->handler); | ||
1623 | } | ||
1624 | |||
1625 | if (uc->ret_handler) | ||
1626 | need_prep = true; | ||
1495 | 1627 | ||
1496 | WARN(rc & ~UPROBE_HANDLER_MASK, | ||
1497 | "bad rc=0x%x from %pf()\n", rc, uc->handler); | ||
1498 | remove &= rc; | 1628 | remove &= rc; |
1499 | } | 1629 | } |
1500 | 1630 | ||
1631 | if (need_prep && !remove) | ||
1632 | prepare_uretprobe(uprobe, regs); /* put bp at return */ | ||
1633 | |||
1501 | if (remove && uprobe->consumers) { | 1634 | if (remove && uprobe->consumers) { |
1502 | WARN_ON(!uprobe_is_active(uprobe)); | 1635 | WARN_ON(!uprobe_is_active(uprobe)); |
1503 | unapply_uprobe(uprobe, current->mm); | 1636 | unapply_uprobe(uprobe, current->mm); |
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | |||
1505 | up_read(&uprobe->register_rwsem); | 1638 | up_read(&uprobe->register_rwsem); |
1506 | } | 1639 | } |
1507 | 1640 | ||
1641 | static void | ||
1642 | handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) | ||
1643 | { | ||
1644 | struct uprobe *uprobe = ri->uprobe; | ||
1645 | struct uprobe_consumer *uc; | ||
1646 | |||
1647 | down_read(&uprobe->register_rwsem); | ||
1648 | for (uc = uprobe->consumers; uc; uc = uc->next) { | ||
1649 | if (uc->ret_handler) | ||
1650 | uc->ret_handler(uc, ri->func, regs); | ||
1651 | } | ||
1652 | up_read(&uprobe->register_rwsem); | ||
1653 | } | ||
1654 | |||
1655 | static bool handle_trampoline(struct pt_regs *regs) | ||
1656 | { | ||
1657 | struct uprobe_task *utask; | ||
1658 | struct return_instance *ri, *tmp; | ||
1659 | bool chained; | ||
1660 | |||
1661 | utask = current->utask; | ||
1662 | if (!utask) | ||
1663 | return false; | ||
1664 | |||
1665 | ri = utask->return_instances; | ||
1666 | if (!ri) | ||
1667 | return false; | ||
1668 | |||
1669 | /* | ||
1670 | * TODO: we should throw out return_instance's invalidated by | ||
1671 | * longjmp(), currently we assume that the probed function always | ||
1672 | * returns. | ||
1673 | */ | ||
1674 | instruction_pointer_set(regs, ri->orig_ret_vaddr); | ||
1675 | |||
1676 | for (;;) { | ||
1677 | handle_uretprobe_chain(ri, regs); | ||
1678 | |||
1679 | chained = ri->chained; | ||
1680 | put_uprobe(ri->uprobe); | ||
1681 | |||
1682 | tmp = ri; | ||
1683 | ri = ri->next; | ||
1684 | kfree(tmp); | ||
1685 | |||
1686 | if (!chained) | ||
1687 | break; | ||
1688 | |||
1689 | utask->depth--; | ||
1690 | |||
1691 | BUG_ON(!ri); | ||
1692 | } | ||
1693 | |||
1694 | utask->return_instances = ri; | ||
1695 | |||
1696 | return true; | ||
1697 | } | ||
1698 | |||
1508 | /* | 1699 | /* |
1509 | * Run handler and ask thread to singlestep. | 1700 | * Run handler and ask thread to singlestep. |
1510 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1701 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs) | |||
1516 | int uninitialized_var(is_swbp); | 1707 | int uninitialized_var(is_swbp); |
1517 | 1708 | ||
1518 | bp_vaddr = uprobe_get_swbp_addr(regs); | 1709 | bp_vaddr = uprobe_get_swbp_addr(regs); |
1519 | uprobe = find_active_uprobe(bp_vaddr, &is_swbp); | 1710 | if (bp_vaddr == get_trampoline_vaddr()) { |
1711 | if (handle_trampoline(regs)) | ||
1712 | return; | ||
1713 | |||
1714 | pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", | ||
1715 | current->pid, current->tgid); | ||
1716 | } | ||
1520 | 1717 | ||
1718 | uprobe = find_active_uprobe(bp_vaddr, &is_swbp); | ||
1521 | if (!uprobe) { | 1719 | if (!uprobe) { |
1522 | if (is_swbp > 0) { | 1720 | if (is_swbp > 0) { |
1523 | /* No matching uprobe; signal SIGTRAP. */ | 1721 | /* No matching uprobe; signal SIGTRAP. */ |
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs) | |||
1616 | */ | 1814 | */ |
1617 | int uprobe_pre_sstep_notifier(struct pt_regs *regs) | 1815 | int uprobe_pre_sstep_notifier(struct pt_regs *regs) |
1618 | { | 1816 | { |
1619 | if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) | 1817 | if (!current->mm) |
1818 | return 0; | ||
1819 | |||
1820 | if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && | ||
1821 | (!current->utask || !current->utask->return_instances)) | ||
1620 | return 0; | 1822 | return 0; |
1621 | 1823 | ||
1622 | set_thread_flag(TIF_UPROBE); | 1824 | set_thread_flag(TIF_UPROBE); |
diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca9935..af2eb3cbd499 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -835,7 +835,7 @@ void do_exit(long code) | |||
835 | /* | 835 | /* |
836 | * Make sure we are holding no locks: | 836 | * Make sure we are holding no locks: |
837 | */ | 837 | */ |
838 | debug_check_no_locks_held(); | 838 | debug_check_no_locks_held(tsk); |
839 | /* | 839 | /* |
840 | * We can do this unlocked here. The futex code uses this flag | 840 | * We can do this unlocked here. The futex code uses this flag |
841 | * just to verify whether the pi state cleanup has been done | 841 | * just to verify whether the pi state cleanup has been done |
@@ -847,7 +847,7 @@ void do_exit(long code) | |||
847 | exit_io_context(tsk); | 847 | exit_io_context(tsk); |
848 | 848 | ||
849 | if (tsk->splice_pipe) | 849 | if (tsk->splice_pipe) |
850 | __free_pipe_info(tsk->splice_pipe); | 850 | free_pipe_info(tsk->splice_pipe); |
851 | 851 | ||
852 | if (tsk->task_frag.page) | 852 | if (tsk->task_frag.page) |
853 | put_page(tsk->task_frag.page); | 853 | put_page(tsk->task_frag.page); |
@@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | |||
1629 | } | 1629 | } |
1630 | 1630 | ||
1631 | put_pid(pid); | 1631 | put_pid(pid); |
1632 | |||
1633 | /* avoid REGPARM breakage on x86: */ | ||
1634 | asmlinkage_protect(5, ret, which, upid, infop, options, ru); | ||
1635 | return ret; | 1632 | return ret; |
1636 | } | 1633 | } |
1637 | 1634 | ||
@@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | |||
1669 | ret = do_wait(&wo); | 1666 | ret = do_wait(&wo); |
1670 | put_pid(pid); | 1667 | put_pid(pid); |
1671 | 1668 | ||
1672 | /* avoid REGPARM breakage on x86: */ | ||
1673 | asmlinkage_protect(4, ret, upid, stat_addr, options, ru); | ||
1674 | return ret; | 1669 | return ret; |
1675 | } | 1670 | } |
1676 | 1671 | ||
diff --git a/kernel/extable.c b/kernel/extable.c index fe35a634bf76..67460b93b1a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1; | |||
41 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
42 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
43 | { | 43 | { |
44 | if (main_extable_sort_needed) | 44 | if (main_extable_sort_needed) { |
45 | pr_notice("Sorting __ex_table...\n"); | ||
45 | sort_extable(__start___ex_table, __stop___ex_table); | 46 | sort_extable(__start___ex_table, __stop___ex_table); |
46 | else | 47 | } |
47 | pr_notice("__ex_table already sorted, skipping sort\n"); | ||
48 | } | 48 | } |
49 | 49 | ||
50 | /* Given an address, look for it in the exception tables. */ | 50 | /* Given an address, look for it in the exception tables. */ |
diff --git a/kernel/fork.c b/kernel/fork.c index 1766d324d5e3..987b28a1f01b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -70,6 +70,7 @@ | |||
70 | #include <linux/khugepaged.h> | 70 | #include <linux/khugepaged.h> |
71 | #include <linux/signalfd.h> | 71 | #include <linux/signalfd.h> |
72 | #include <linux/uprobes.h> | 72 | #include <linux/uprobes.h> |
73 | #include <linux/aio.h> | ||
73 | 74 | ||
74 | #include <asm/pgtable.h> | 75 | #include <asm/pgtable.h> |
75 | #include <asm/pgalloc.h> | 76 | #include <asm/pgalloc.h> |
@@ -1233,7 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1233 | 1234 | ||
1234 | p->utime = p->stime = p->gtime = 0; | 1235 | p->utime = p->stime = p->gtime = 0; |
1235 | p->utimescaled = p->stimescaled = 0; | 1236 | p->utimescaled = p->stimescaled = 0; |
1236 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1237 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
1237 | p->prev_cputime.utime = p->prev_cputime.stime = 0; | 1238 | p->prev_cputime.utime = p->prev_cputime.stime = 0; |
1238 | #endif | 1239 | #endif |
1239 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 1240 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
@@ -1303,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1303 | p->memcg_batch.do_batch = 0; | 1304 | p->memcg_batch.do_batch = 0; |
1304 | p->memcg_batch.memcg = NULL; | 1305 | p->memcg_batch.memcg = NULL; |
1305 | #endif | 1306 | #endif |
1307 | #ifdef CONFIG_BCACHE | ||
1308 | p->sequential_io = 0; | ||
1309 | p->sequential_io_avg = 0; | ||
1310 | #endif | ||
1306 | 1311 | ||
1307 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1312 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1308 | sched_fork(p); | 1313 | sched_fork(p); |
@@ -1677,10 +1682,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | |||
1677 | int, tls_val) | 1682 | int, tls_val) |
1678 | #endif | 1683 | #endif |
1679 | { | 1684 | { |
1680 | long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); | 1685 | return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); |
1681 | asmlinkage_protect(5, ret, clone_flags, newsp, | ||
1682 | parent_tidptr, child_tidptr, tls_val); | ||
1683 | return ret; | ||
1684 | } | 1686 | } |
1685 | #endif | 1687 | #endif |
1686 | 1688 | ||
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..fd4b13b131f8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -63,6 +63,7 @@ | |||
63 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | 63 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = |
64 | { | 64 | { |
65 | 65 | ||
66 | .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), | ||
66 | .clock_base = | 67 | .clock_base = |
67 | { | 68 | { |
68 | { | 69 | { |
@@ -83,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |||
83 | .get_time = &ktime_get_boottime, | 84 | .get_time = &ktime_get_boottime, |
84 | .resolution = KTIME_LOW_RES, | 85 | .resolution = KTIME_LOW_RES, |
85 | }, | 86 | }, |
87 | { | ||
88 | .index = HRTIMER_BASE_TAI, | ||
89 | .clockid = CLOCK_TAI, | ||
90 | .get_time = &ktime_get_clocktai, | ||
91 | .resolution = KTIME_LOW_RES, | ||
92 | }, | ||
86 | } | 93 | } |
87 | }; | 94 | }; |
88 | 95 | ||
@@ -90,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { | |||
90 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, | 97 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, |
91 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, | 98 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, |
92 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, | 99 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, |
100 | [CLOCK_TAI] = HRTIMER_BASE_TAI, | ||
93 | }; | 101 | }; |
94 | 102 | ||
95 | static inline int hrtimer_clockid_to_base(clockid_t clock_id) | 103 | static inline int hrtimer_clockid_to_base(clockid_t clock_id) |
@@ -106,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | |||
106 | { | 114 | { |
107 | ktime_t xtim, mono, boot; | 115 | ktime_t xtim, mono, boot; |
108 | struct timespec xts, tom, slp; | 116 | struct timespec xts, tom, slp; |
117 | s32 tai_offset; | ||
109 | 118 | ||
110 | get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); | 119 | get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); |
120 | tai_offset = timekeeping_get_tai_offset(); | ||
111 | 121 | ||
112 | xtim = timespec_to_ktime(xts); | 122 | xtim = timespec_to_ktime(xts); |
113 | mono = ktime_add(xtim, timespec_to_ktime(tom)); | 123 | mono = ktime_add(xtim, timespec_to_ktime(tom)); |
@@ -115,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | |||
115 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; | 125 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; |
116 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; | 126 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; |
117 | base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; | 127 | base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; |
128 | base->clock_base[HRTIMER_BASE_TAI].softirq_time = | ||
129 | ktime_add(xtim, ktime_set(tai_offset, 0)); | ||
118 | } | 130 | } |
119 | 131 | ||
120 | /* | 132 | /* |
@@ -160,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, | |||
160 | */ | 172 | */ |
161 | static int hrtimer_get_target(int this_cpu, int pinned) | 173 | static int hrtimer_get_target(int this_cpu, int pinned) |
162 | { | 174 | { |
163 | #ifdef CONFIG_NO_HZ | 175 | #ifdef CONFIG_NO_HZ_COMMON |
164 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) | 176 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) |
165 | return get_nohz_timer_target(); | 177 | return get_nohz_timer_target(); |
166 | #endif | 178 | #endif |
@@ -275,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) | |||
275 | } else { | 287 | } else { |
276 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); | 288 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); |
277 | 289 | ||
290 | /* Make sure nsec fits into long */ | ||
291 | if (unlikely(nsec > KTIME_SEC_MAX)) | ||
292 | return (ktime_t){ .tv64 = KTIME_MAX }; | ||
293 | |||
278 | tmp = ktime_set((long)nsec, rem); | 294 | tmp = ktime_set((long)nsec, rem); |
279 | } | 295 | } |
280 | 296 | ||
@@ -651,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | |||
651 | { | 667 | { |
652 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | 668 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; |
653 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | 669 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; |
670 | ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; | ||
654 | 671 | ||
655 | return ktime_get_update_offsets(offs_real, offs_boot); | 672 | return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); |
656 | } | 673 | } |
657 | 674 | ||
658 | /* | 675 | /* |
@@ -1010,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
1010 | * @timer: the timer to be added | 1027 | * @timer: the timer to be added |
1011 | * @tim: expiry time | 1028 | * @tim: expiry time |
1012 | * @delta_ns: "slack" range for the timer | 1029 | * @delta_ns: "slack" range for the timer |
1013 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) | 1030 | * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or |
1031 | * relative (HRTIMER_MODE_REL) | ||
1014 | * | 1032 | * |
1015 | * Returns: | 1033 | * Returns: |
1016 | * 0 on success | 1034 | * 0 on success |
@@ -1027,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); | |||
1027 | * hrtimer_start - (re)start an hrtimer on the current CPU | 1045 | * hrtimer_start - (re)start an hrtimer on the current CPU |
1028 | * @timer: the timer to be added | 1046 | * @timer: the timer to be added |
1029 | * @tim: expiry time | 1047 | * @tim: expiry time |
1030 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) | 1048 | * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or |
1049 | * relative (HRTIMER_MODE_REL) | ||
1031 | * | 1050 | * |
1032 | * Returns: | 1051 | * Returns: |
1033 | * 0 on success | 1052 | * 0 on success |
@@ -1106,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | |||
1106 | } | 1125 | } |
1107 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | 1126 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); |
1108 | 1127 | ||
1109 | #ifdef CONFIG_NO_HZ | 1128 | #ifdef CONFIG_NO_HZ_COMMON |
1110 | /** | 1129 | /** |
1111 | * hrtimer_get_next_event - get the time until next expiry event | 1130 | * hrtimer_get_next_event - get the time until next expiry event |
1112 | * | 1131 | * |
@@ -1309,6 +1328,8 @@ retry: | |||
1309 | 1328 | ||
1310 | expires = ktime_sub(hrtimer_get_expires(timer), | 1329 | expires = ktime_sub(hrtimer_get_expires(timer), |
1311 | base->offset); | 1330 | base->offset); |
1331 | if (expires.tv64 < 0) | ||
1332 | expires.tv64 = KTIME_MAX; | ||
1312 | if (expires.tv64 < expires_next.tv64) | 1333 | if (expires.tv64 < expires_next.tv64) |
1313 | expires_next = expires; | 1334 | expires_next = expires; |
1314 | break; | 1335 | break; |
@@ -1642,8 +1663,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
1642 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 1663 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
1643 | int i; | 1664 | int i; |
1644 | 1665 | ||
1645 | raw_spin_lock_init(&cpu_base->lock); | ||
1646 | |||
1647 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1666 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1648 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1667 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1649 | timerqueue_init_head(&cpu_base->clock_base[i].active); | 1668 | timerqueue_init_head(&cpu_base->clock_base[i].active); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 96f3a1d9c379..54a4d5223238 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | |||
143 | * irq_domain_add_simple() - Allocate and register a simple irq_domain. | 143 | * irq_domain_add_simple() - Allocate and register a simple irq_domain. |
144 | * @of_node: pointer to interrupt controller's device tree node. | 144 | * @of_node: pointer to interrupt controller's device tree node. |
145 | * @size: total number of irqs in mapping | 145 | * @size: total number of irqs in mapping |
146 | * @first_irq: first number of irq block assigned to the domain | 146 | * @first_irq: first number of irq block assigned to the domain, |
147 | * pass zero to assign irqs on-the-fly. This will result in a | ||
148 | * linear IRQ domain so it is important to use irq_create_mapping() | ||
149 | * for each used IRQ, especially when SPARSE_IRQ is enabled. | ||
147 | * @ops: map/unmap domain callbacks | 150 | * @ops: map/unmap domain callbacks |
148 | * @host_data: Controller private data pointer | 151 | * @host_data: Controller private data pointer |
149 | * | 152 | * |
@@ -191,6 +194,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | |||
191 | /* A linear domain is the default */ | 194 | /* A linear domain is the default */ |
192 | return irq_domain_add_linear(of_node, size, ops, host_data); | 195 | return irq_domain_add_linear(of_node, size, ops, host_data); |
193 | } | 196 | } |
197 | EXPORT_SYMBOL_GPL(irq_domain_add_simple); | ||
194 | 198 | ||
195 | /** | 199 | /** |
196 | * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. | 200 | * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. |
@@ -397,11 +401,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, | |||
397 | while (count--) { | 401 | while (count--) { |
398 | int irq = irq_base + count; | 402 | int irq = irq_base + count; |
399 | struct irq_data *irq_data = irq_get_irq_data(irq); | 403 | struct irq_data *irq_data = irq_get_irq_data(irq); |
400 | irq_hw_number_t hwirq = irq_data->hwirq; | 404 | irq_hw_number_t hwirq; |
401 | 405 | ||
402 | if (WARN_ON(!irq_data || irq_data->domain != domain)) | 406 | if (WARN_ON(!irq_data || irq_data->domain != domain)) |
403 | continue; | 407 | continue; |
404 | 408 | ||
409 | hwirq = irq_data->hwirq; | ||
405 | irq_set_status_flags(irq, IRQ_NOREQUEST); | 410 | irq_set_status_flags(irq, IRQ_NOREQUEST); |
406 | 411 | ||
407 | /* remove chip and handler */ | 412 | /* remove chip and handler */ |
@@ -462,9 +467,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, | |||
462 | if (domain->ops->map) { | 467 | if (domain->ops->map) { |
463 | ret = domain->ops->map(domain, virq, hwirq); | 468 | ret = domain->ops->map(domain, virq, hwirq); |
464 | if (ret != 0) { | 469 | if (ret != 0) { |
465 | pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", | 470 | /* |
466 | virq, hwirq, ret); | 471 | * If map() returns -EPERM, this interrupt is protected |
467 | WARN_ON(1); | 472 | * by the firmware or some other service and shall not |
473 | * be mapped. | ||
474 | * | ||
475 | * Since on some platforms we blindly try to map everything | ||
476 | * we end up with a log full of backtraces. | ||
477 | * | ||
478 | * So instead, we silently fail on -EPERM, it is the | ||
479 | * responsibility of the PIC driver to display a relevant | ||
480 | * message if needed. | ||
481 | */ | ||
482 | if (ret != -EPERM) { | ||
483 | pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", | ||
484 | virq, hwirq, ret); | ||
485 | WARN_ON(1); | ||
486 | } | ||
468 | irq_data->domain = NULL; | 487 | irq_data->domain = NULL; |
469 | irq_data->hwirq = 0; | 488 | irq_data->hwirq = 0; |
470 | goto err_unmap; | 489 | goto err_unmap; |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 397db02209ed..19ed5c425c3b 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) | |||
76 | static ssize_t write_irq_affinity(int type, struct file *file, | 76 | static ssize_t write_irq_affinity(int type, struct file *file, |
77 | const char __user *buffer, size_t count, loff_t *pos) | 77 | const char __user *buffer, size_t count, loff_t *pos) |
78 | { | 78 | { |
79 | unsigned int irq = (int)(long)PDE(file_inode(file))->data; | 79 | unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); |
80 | cpumask_var_t new_value; | 80 | cpumask_var_t new_value; |
81 | int err; | 81 | int err; |
82 | 82 | ||
@@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, | |||
131 | 131 | ||
132 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) | 132 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) |
133 | { | 133 | { |
134 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); | 134 | return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); |
135 | } | 135 | } |
136 | 136 | ||
137 | static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) | 137 | static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) |
138 | { | 138 | { |
139 | return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); | 139 | return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); |
140 | } | 140 | } |
141 | 141 | ||
142 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) | 142 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) |
143 | { | 143 | { |
144 | return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); | 144 | return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode)); |
145 | } | 145 | } |
146 | 146 | ||
147 | static const struct file_operations irq_affinity_proc_fops = { | 147 | static const struct file_operations irq_affinity_proc_fops = { |
@@ -212,7 +212,7 @@ out: | |||
212 | 212 | ||
213 | static int default_affinity_open(struct inode *inode, struct file *file) | 213 | static int default_affinity_open(struct inode *inode, struct file *file) |
214 | { | 214 | { |
215 | return single_open(file, default_affinity_show, PDE(inode)->data); | 215 | return single_open(file, default_affinity_show, PDE_DATA(inode)); |
216 | } | 216 | } |
217 | 217 | ||
218 | static const struct file_operations default_affinity_proc_fops = { | 218 | static const struct file_operations default_affinity_proc_fops = { |
@@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) | |||
233 | 233 | ||
234 | static int irq_node_proc_open(struct inode *inode, struct file *file) | 234 | static int irq_node_proc_open(struct inode *inode, struct file *file) |
235 | { | 235 | { |
236 | return single_open(file, irq_node_proc_show, PDE(inode)->data); | 236 | return single_open(file, irq_node_proc_show, PDE_DATA(inode)); |
237 | } | 237 | } |
238 | 238 | ||
239 | static const struct file_operations irq_node_proc_fops = { | 239 | static const struct file_operations irq_node_proc_fops = { |
@@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) | |||
256 | 256 | ||
257 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) | 257 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) |
258 | { | 258 | { |
259 | return single_open(file, irq_spurious_proc_show, PDE(inode)->data); | 259 | return single_open(file, irq_spurious_proc_show, PDE_DATA(inode)); |
260 | } | 260 | } |
261 | 261 | ||
262 | static const struct file_operations irq_spurious_proc_fops = { | 262 | static const struct file_operations irq_spurious_proc_fops = { |
@@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
366 | 366 | ||
367 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 367 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
368 | { | 368 | { |
369 | if (action->dir) { | 369 | proc_remove(action->dir); |
370 | struct irq_desc *desc = irq_to_desc(irq); | ||
371 | |||
372 | remove_proc_entry(action->dir->name, desc->dir); | ||
373 | } | ||
374 | } | 370 | } |
375 | 371 | ||
376 | static void register_default_affinity_proc(void) | 372 | static void register_default_affinity_proc(void) |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2169feeba529..3127ad52cdb2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr) | |||
84 | 84 | ||
85 | /* | 85 | /* |
86 | * Expand a compressed symbol data into the resulting uncompressed string, | 86 | * Expand a compressed symbol data into the resulting uncompressed string, |
87 | * if uncompressed string is too long (>= maxlen), it will be truncated, | ||
87 | * given the offset to where the symbol is in the compressed stream. | 88 | * given the offset to where the symbol is in the compressed stream. |
88 | */ | 89 | */ |
89 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) | 90 | static unsigned int kallsyms_expand_symbol(unsigned int off, |
91 | char *result, size_t maxlen) | ||
90 | { | 92 | { |
91 | int len, skipped_first = 0; | 93 | int len, skipped_first = 0; |
92 | const u8 *tptr, *data; | 94 | const u8 *tptr, *data; |
@@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) | |||
113 | 115 | ||
114 | while (*tptr) { | 116 | while (*tptr) { |
115 | if (skipped_first) { | 117 | if (skipped_first) { |
118 | if (maxlen <= 1) | ||
119 | goto tail; | ||
116 | *result = *tptr; | 120 | *result = *tptr; |
117 | result++; | 121 | result++; |
122 | maxlen--; | ||
118 | } else | 123 | } else |
119 | skipped_first = 1; | 124 | skipped_first = 1; |
120 | tptr++; | 125 | tptr++; |
121 | } | 126 | } |
122 | } | 127 | } |
123 | 128 | ||
124 | *result = '\0'; | 129 | tail: |
130 | if (maxlen) | ||
131 | *result = '\0'; | ||
125 | 132 | ||
126 | /* Return the offset to the next symbol. */ | 133 | /* Return the offset to the next symbol. */ |
127 | return off; | 134 | return off; |
@@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name) | |||
176 | unsigned int off; | 183 | unsigned int off; |
177 | 184 | ||
178 | for (i = 0, off = 0; i < kallsyms_num_syms; i++) { | 185 | for (i = 0, off = 0; i < kallsyms_num_syms; i++) { |
179 | off = kallsyms_expand_symbol(off, namebuf); | 186 | off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); |
180 | 187 | ||
181 | if (strcmp(namebuf, name) == 0) | 188 | if (strcmp(namebuf, name) == 0) |
182 | return kallsyms_addresses[i]; | 189 | return kallsyms_addresses[i]; |
@@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, | |||
195 | int ret; | 202 | int ret; |
196 | 203 | ||
197 | for (i = 0, off = 0; i < kallsyms_num_syms; i++) { | 204 | for (i = 0, off = 0; i < kallsyms_num_syms; i++) { |
198 | off = kallsyms_expand_symbol(off, namebuf); | 205 | off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); |
199 | ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); | 206 | ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); |
200 | if (ret != 0) | 207 | if (ret != 0) |
201 | return ret; | 208 | return ret; |
@@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr, | |||
294 | 301 | ||
295 | pos = get_symbol_pos(addr, symbolsize, offset); | 302 | pos = get_symbol_pos(addr, symbolsize, offset); |
296 | /* Grab name */ | 303 | /* Grab name */ |
297 | kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); | 304 | kallsyms_expand_symbol(get_symbol_offset(pos), |
305 | namebuf, KSYM_NAME_LEN); | ||
298 | if (modname) | 306 | if (modname) |
299 | *modname = NULL; | 307 | *modname = NULL; |
300 | return namebuf; | 308 | return namebuf; |
@@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname) | |||
315 | 323 | ||
316 | pos = get_symbol_pos(addr, NULL, NULL); | 324 | pos = get_symbol_pos(addr, NULL, NULL); |
317 | /* Grab name */ | 325 | /* Grab name */ |
318 | kallsyms_expand_symbol(get_symbol_offset(pos), symname); | 326 | kallsyms_expand_symbol(get_symbol_offset(pos), |
327 | symname, KSYM_NAME_LEN); | ||
319 | return 0; | 328 | return 0; |
320 | } | 329 | } |
321 | /* See if it's in a module. */ | 330 | /* See if it's in a module. */ |
@@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | |||
333 | 342 | ||
334 | pos = get_symbol_pos(addr, size, offset); | 343 | pos = get_symbol_pos(addr, size, offset); |
335 | /* Grab name */ | 344 | /* Grab name */ |
336 | kallsyms_expand_symbol(get_symbol_offset(pos), name); | 345 | kallsyms_expand_symbol(get_symbol_offset(pos), |
346 | name, KSYM_NAME_LEN); | ||
337 | modname[0] = '\0'; | 347 | modname[0] = '\0'; |
338 | return 0; | 348 | return 0; |
339 | } | 349 | } |
@@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) | |||
463 | 473 | ||
464 | iter->type = kallsyms_get_symbol_type(off); | 474 | iter->type = kallsyms_get_symbol_type(off); |
465 | 475 | ||
466 | off = kallsyms_expand_symbol(off, iter->name); | 476 | off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); |
467 | 477 | ||
468 | return off - iter->nameoff; | 478 | return off - iter->nameoff; |
469 | } | 479 | } |
diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b6..59f7b55ba745 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -55,7 +55,7 @@ struct resource crashk_res = { | |||
55 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | 55 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM |
56 | }; | 56 | }; |
57 | struct resource crashk_low_res = { | 57 | struct resource crashk_low_res = { |
58 | .name = "Crash kernel low", | 58 | .name = "Crash kernel", |
59 | .start = 0, | 59 | .start = 0, |
60 | .end = 0, | 60 | .end = 0, |
61 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | 61 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM |
@@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
786 | struct kexec_segment *segment) | 786 | struct kexec_segment *segment) |
787 | { | 787 | { |
788 | unsigned long maddr; | 788 | unsigned long maddr; |
789 | unsigned long ubytes, mbytes; | 789 | size_t ubytes, mbytes; |
790 | int result; | 790 | int result; |
791 | unsigned char __user *buf; | 791 | unsigned char __user *buf; |
792 | 792 | ||
@@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
819 | /* Start with a clear page */ | 819 | /* Start with a clear page */ |
820 | clear_page(ptr); | 820 | clear_page(ptr); |
821 | ptr += maddr & ~PAGE_MASK; | 821 | ptr += maddr & ~PAGE_MASK; |
822 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | 822 | mchunk = min_t(size_t, mbytes, |
823 | if (mchunk > mbytes) | 823 | PAGE_SIZE - (maddr & ~PAGE_MASK)); |
824 | mchunk = mbytes; | 824 | uchunk = min(ubytes, mchunk); |
825 | |||
826 | uchunk = mchunk; | ||
827 | if (uchunk > ubytes) | ||
828 | uchunk = ubytes; | ||
829 | 825 | ||
830 | result = copy_from_user(ptr, buf, uchunk); | 826 | result = copy_from_user(ptr, buf, uchunk); |
831 | kunmap(page); | 827 | kunmap(page); |
@@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
850 | * We do things a page at a time for the sake of kmap. | 846 | * We do things a page at a time for the sake of kmap. |
851 | */ | 847 | */ |
852 | unsigned long maddr; | 848 | unsigned long maddr; |
853 | unsigned long ubytes, mbytes; | 849 | size_t ubytes, mbytes; |
854 | int result; | 850 | int result; |
855 | unsigned char __user *buf; | 851 | unsigned char __user *buf; |
856 | 852 | ||
@@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
871 | } | 867 | } |
872 | ptr = kmap(page); | 868 | ptr = kmap(page); |
873 | ptr += maddr & ~PAGE_MASK; | 869 | ptr += maddr & ~PAGE_MASK; |
874 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | 870 | mchunk = min_t(size_t, mbytes, |
875 | if (mchunk > mbytes) | 871 | PAGE_SIZE - (maddr & ~PAGE_MASK)); |
876 | mchunk = mbytes; | 872 | uchunk = min(ubytes, mchunk); |
877 | 873 | if (mchunk > uchunk) { | |
878 | uchunk = mchunk; | ||
879 | if (uchunk > ubytes) { | ||
880 | uchunk = ubytes; | ||
881 | /* Zero the trailing part of the page */ | 874 | /* Zero the trailing part of the page */ |
882 | memset(ptr + uchunk, 0, mchunk - uchunk); | 875 | memset(ptr + uchunk, 0, mchunk - uchunk); |
883 | } | 876 | } |
@@ -1118,12 +1111,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, | |||
1118 | { | 1111 | { |
1119 | unsigned long addr; | 1112 | unsigned long addr; |
1120 | 1113 | ||
1121 | for (addr = begin; addr < end; addr += PAGE_SIZE) { | 1114 | for (addr = begin; addr < end; addr += PAGE_SIZE) |
1122 | ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); | 1115 | free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); |
1123 | init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); | ||
1124 | free_page((unsigned long)__va(addr)); | ||
1125 | totalram_pages++; | ||
1126 | } | ||
1127 | } | 1116 | } |
1128 | 1117 | ||
1129 | int crash_shrink_memory(unsigned long new_size) | 1118 | int crash_shrink_memory(unsigned long new_size) |
@@ -1368,35 +1357,114 @@ static int __init parse_crashkernel_simple(char *cmdline, | |||
1368 | return 0; | 1357 | return 0; |
1369 | } | 1358 | } |
1370 | 1359 | ||
1360 | #define SUFFIX_HIGH 0 | ||
1361 | #define SUFFIX_LOW 1 | ||
1362 | #define SUFFIX_NULL 2 | ||
1363 | static __initdata char *suffix_tbl[] = { | ||
1364 | [SUFFIX_HIGH] = ",high", | ||
1365 | [SUFFIX_LOW] = ",low", | ||
1366 | [SUFFIX_NULL] = NULL, | ||
1367 | }; | ||
1368 | |||
1371 | /* | 1369 | /* |
1372 | * That function is the entry point for command line parsing and should be | 1370 | * That function parses "suffix" crashkernel command lines like |
1373 | * called from the arch-specific code. | 1371 | * |
1372 | * crashkernel=size,[high|low] | ||
1373 | * | ||
1374 | * It returns 0 on success and -EINVAL on failure. | ||
1374 | */ | 1375 | */ |
1376 | static int __init parse_crashkernel_suffix(char *cmdline, | ||
1377 | unsigned long long *crash_size, | ||
1378 | unsigned long long *crash_base, | ||
1379 | const char *suffix) | ||
1380 | { | ||
1381 | char *cur = cmdline; | ||
1382 | |||
1383 | *crash_size = memparse(cmdline, &cur); | ||
1384 | if (cmdline == cur) { | ||
1385 | pr_warn("crashkernel: memory value expected\n"); | ||
1386 | return -EINVAL; | ||
1387 | } | ||
1388 | |||
1389 | /* check with suffix */ | ||
1390 | if (strncmp(cur, suffix, strlen(suffix))) { | ||
1391 | pr_warn("crashkernel: unrecognized char\n"); | ||
1392 | return -EINVAL; | ||
1393 | } | ||
1394 | cur += strlen(suffix); | ||
1395 | if (*cur != ' ' && *cur != '\0') { | ||
1396 | pr_warn("crashkernel: unrecognized char\n"); | ||
1397 | return -EINVAL; | ||
1398 | } | ||
1399 | |||
1400 | return 0; | ||
1401 | } | ||
1402 | |||
1403 | static __init char *get_last_crashkernel(char *cmdline, | ||
1404 | const char *name, | ||
1405 | const char *suffix) | ||
1406 | { | ||
1407 | char *p = cmdline, *ck_cmdline = NULL; | ||
1408 | |||
1409 | /* find crashkernel and use the last one if there are more */ | ||
1410 | p = strstr(p, name); | ||
1411 | while (p) { | ||
1412 | char *end_p = strchr(p, ' '); | ||
1413 | char *q; | ||
1414 | |||
1415 | if (!end_p) | ||
1416 | end_p = p + strlen(p); | ||
1417 | |||
1418 | if (!suffix) { | ||
1419 | int i; | ||
1420 | |||
1421 | /* skip the one with any known suffix */ | ||
1422 | for (i = 0; suffix_tbl[i]; i++) { | ||
1423 | q = end_p - strlen(suffix_tbl[i]); | ||
1424 | if (!strncmp(q, suffix_tbl[i], | ||
1425 | strlen(suffix_tbl[i]))) | ||
1426 | goto next; | ||
1427 | } | ||
1428 | ck_cmdline = p; | ||
1429 | } else { | ||
1430 | q = end_p - strlen(suffix); | ||
1431 | if (!strncmp(q, suffix, strlen(suffix))) | ||
1432 | ck_cmdline = p; | ||
1433 | } | ||
1434 | next: | ||
1435 | p = strstr(p+1, name); | ||
1436 | } | ||
1437 | |||
1438 | if (!ck_cmdline) | ||
1439 | return NULL; | ||
1440 | |||
1441 | return ck_cmdline; | ||
1442 | } | ||
1443 | |||
1375 | static int __init __parse_crashkernel(char *cmdline, | 1444 | static int __init __parse_crashkernel(char *cmdline, |
1376 | unsigned long long system_ram, | 1445 | unsigned long long system_ram, |
1377 | unsigned long long *crash_size, | 1446 | unsigned long long *crash_size, |
1378 | unsigned long long *crash_base, | 1447 | unsigned long long *crash_base, |
1379 | const char *name) | 1448 | const char *name, |
1449 | const char *suffix) | ||
1380 | { | 1450 | { |
1381 | char *p = cmdline, *ck_cmdline = NULL; | ||
1382 | char *first_colon, *first_space; | 1451 | char *first_colon, *first_space; |
1452 | char *ck_cmdline; | ||
1383 | 1453 | ||
1384 | BUG_ON(!crash_size || !crash_base); | 1454 | BUG_ON(!crash_size || !crash_base); |
1385 | *crash_size = 0; | 1455 | *crash_size = 0; |
1386 | *crash_base = 0; | 1456 | *crash_base = 0; |
1387 | 1457 | ||
1388 | /* find crashkernel and use the last one if there are more */ | 1458 | ck_cmdline = get_last_crashkernel(cmdline, name, suffix); |
1389 | p = strstr(p, name); | ||
1390 | while (p) { | ||
1391 | ck_cmdline = p; | ||
1392 | p = strstr(p+1, name); | ||
1393 | } | ||
1394 | 1459 | ||
1395 | if (!ck_cmdline) | 1460 | if (!ck_cmdline) |
1396 | return -EINVAL; | 1461 | return -EINVAL; |
1397 | 1462 | ||
1398 | ck_cmdline += strlen(name); | 1463 | ck_cmdline += strlen(name); |
1399 | 1464 | ||
1465 | if (suffix) | ||
1466 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | ||
1467 | crash_base, suffix); | ||
1400 | /* | 1468 | /* |
1401 | * if the commandline contains a ':', then that's the extended | 1469 | * if the commandline contains a ':', then that's the extended |
1402 | * syntax -- if not, it must be the classic syntax | 1470 | * syntax -- if not, it must be the classic syntax |
@@ -1413,13 +1481,26 @@ static int __init __parse_crashkernel(char *cmdline, | |||
1413 | return 0; | 1481 | return 0; |
1414 | } | 1482 | } |
1415 | 1483 | ||
1484 | /* | ||
1485 | * That function is the entry point for command line parsing and should be | ||
1486 | * called from the arch-specific code. | ||
1487 | */ | ||
1416 | int __init parse_crashkernel(char *cmdline, | 1488 | int __init parse_crashkernel(char *cmdline, |
1417 | unsigned long long system_ram, | 1489 | unsigned long long system_ram, |
1418 | unsigned long long *crash_size, | 1490 | unsigned long long *crash_size, |
1419 | unsigned long long *crash_base) | 1491 | unsigned long long *crash_base) |
1420 | { | 1492 | { |
1421 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | 1493 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, |
1422 | "crashkernel="); | 1494 | "crashkernel=", NULL); |
1495 | } | ||
1496 | |||
1497 | int __init parse_crashkernel_high(char *cmdline, | ||
1498 | unsigned long long system_ram, | ||
1499 | unsigned long long *crash_size, | ||
1500 | unsigned long long *crash_base) | ||
1501 | { | ||
1502 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1503 | "crashkernel=", suffix_tbl[SUFFIX_HIGH]); | ||
1423 | } | 1504 | } |
1424 | 1505 | ||
1425 | int __init parse_crashkernel_low(char *cmdline, | 1506 | int __init parse_crashkernel_low(char *cmdline, |
@@ -1428,7 +1509,7 @@ int __init parse_crashkernel_low(char *cmdline, | |||
1428 | unsigned long long *crash_base) | 1509 | unsigned long long *crash_base) |
1429 | { | 1510 | { |
1430 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | 1511 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, |
1431 | "crashkernel_low="); | 1512 | "crashkernel=", suffix_tbl[SUFFIX_LOW]); |
1432 | } | 1513 | } |
1433 | 1514 | ||
1434 | static void update_vmcoreinfo_note(void) | 1515 | static void update_vmcoreinfo_note(void) |
@@ -1452,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...) | |||
1452 | { | 1533 | { |
1453 | va_list args; | 1534 | va_list args; |
1454 | char buf[0x50]; | 1535 | char buf[0x50]; |
1455 | int r; | 1536 | size_t r; |
1456 | 1537 | ||
1457 | va_start(args, fmt); | 1538 | va_start(args, fmt); |
1458 | r = vsnprintf(buf, sizeof(buf), fmt, args); | 1539 | r = vsnprintf(buf, sizeof(buf), fmt, args); |
1459 | va_end(args); | 1540 | va_end(args); |
1460 | 1541 | ||
1461 | if (r + vmcoreinfo_size > vmcoreinfo_max_size) | 1542 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); |
1462 | r = vmcoreinfo_max_size - vmcoreinfo_size; | ||
1463 | 1543 | ||
1464 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | 1544 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); |
1465 | 1545 | ||
@@ -1489,7 +1569,7 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1489 | VMCOREINFO_SYMBOL(swapper_pg_dir); | 1569 | VMCOREINFO_SYMBOL(swapper_pg_dir); |
1490 | #endif | 1570 | #endif |
1491 | VMCOREINFO_SYMBOL(_stext); | 1571 | VMCOREINFO_SYMBOL(_stext); |
1492 | VMCOREINFO_SYMBOL(vmlist); | 1572 | VMCOREINFO_SYMBOL(vmap_area_list); |
1493 | 1573 | ||
1494 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 1574 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
1495 | VMCOREINFO_SYMBOL(mem_map); | 1575 | VMCOREINFO_SYMBOL(mem_map); |
@@ -1527,7 +1607,8 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1527 | VMCOREINFO_OFFSET(free_area, free_list); | 1607 | VMCOREINFO_OFFSET(free_area, free_list); |
1528 | VMCOREINFO_OFFSET(list_head, next); | 1608 | VMCOREINFO_OFFSET(list_head, next); |
1529 | VMCOREINFO_OFFSET(list_head, prev); | 1609 | VMCOREINFO_OFFSET(list_head, prev); |
1530 | VMCOREINFO_OFFSET(vm_struct, addr); | 1610 | VMCOREINFO_OFFSET(vmap_area, va_start); |
1611 | VMCOREINFO_OFFSET(vmap_area, list); | ||
1531 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | 1612 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); |
1532 | log_buf_kexec_setup(); | 1613 | log_buf_kexec_setup(); |
1533 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | 1614 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 56dd34976d7b..8241906c4b61 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info) | |||
77 | 77 | ||
78 | static int call_modprobe(char *module_name, int wait) | 78 | static int call_modprobe(char *module_name, int wait) |
79 | { | 79 | { |
80 | struct subprocess_info *info; | ||
80 | static char *envp[] = { | 81 | static char *envp[] = { |
81 | "HOME=/", | 82 | "HOME=/", |
82 | "TERM=linux", | 83 | "TERM=linux", |
@@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait) | |||
98 | argv[3] = module_name; /* check free_modprobe_argv() */ | 99 | argv[3] = module_name; /* check free_modprobe_argv() */ |
99 | argv[4] = NULL; | 100 | argv[4] = NULL; |
100 | 101 | ||
101 | return call_usermodehelper_fns(modprobe_path, argv, envp, | 102 | info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, |
102 | wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); | 103 | NULL, free_modprobe_argv, NULL); |
104 | if (!info) | ||
105 | goto free_module_name; | ||
106 | |||
107 | return call_usermodehelper_exec(info, wait | UMH_KILLABLE); | ||
108 | |||
109 | free_module_name: | ||
110 | kfree(module_name); | ||
103 | free_argv: | 111 | free_argv: |
104 | kfree(argv); | 112 | kfree(argv); |
105 | out: | 113 | out: |
@@ -502,14 +510,28 @@ static void helper_unlock(void) | |||
502 | * @argv: arg vector for process | 510 | * @argv: arg vector for process |
503 | * @envp: environment for process | 511 | * @envp: environment for process |
504 | * @gfp_mask: gfp mask for memory allocation | 512 | * @gfp_mask: gfp mask for memory allocation |
513 | * @cleanup: a cleanup function | ||
514 | * @init: an init function | ||
515 | * @data: arbitrary context sensitive data | ||
505 | * | 516 | * |
506 | * Returns either %NULL on allocation failure, or a subprocess_info | 517 | * Returns either %NULL on allocation failure, or a subprocess_info |
507 | * structure. This should be passed to call_usermodehelper_exec to | 518 | * structure. This should be passed to call_usermodehelper_exec to |
508 | * exec the process and free the structure. | 519 | * exec the process and free the structure. |
520 | * | ||
521 | * The init function is used to customize the helper process prior to | ||
522 | * exec. A non-zero return code causes the process to error out, exit, | ||
523 | * and return the failure to the calling process | ||
524 | * | ||
525 | * The cleanup function is called just before the subprocess_info is about to | ||
526 | * be freed. This can be used for freeing the argv and envp. The | ||
527 | * Function must be runnable in either a process context or the | ||
528 | * context in which call_usermodehelper_exec is called. | ||
509 | */ | 529 | */ |
510 | static | ||
511 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | 530 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, |
512 | char **envp, gfp_t gfp_mask) | 531 | char **envp, gfp_t gfp_mask, |
532 | int (*init)(struct subprocess_info *info, struct cred *new), | ||
533 | void (*cleanup)(struct subprocess_info *info), | ||
534 | void *data) | ||
513 | { | 535 | { |
514 | struct subprocess_info *sub_info; | 536 | struct subprocess_info *sub_info; |
515 | sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); | 537 | sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); |
@@ -520,56 +542,38 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | |||
520 | sub_info->path = path; | 542 | sub_info->path = path; |
521 | sub_info->argv = argv; | 543 | sub_info->argv = argv; |
522 | sub_info->envp = envp; | 544 | sub_info->envp = envp; |
545 | |||
546 | sub_info->cleanup = cleanup; | ||
547 | sub_info->init = init; | ||
548 | sub_info->data = data; | ||
523 | out: | 549 | out: |
524 | return sub_info; | 550 | return sub_info; |
525 | } | 551 | } |
526 | 552 | EXPORT_SYMBOL(call_usermodehelper_setup); | |
527 | /** | ||
528 | * call_usermodehelper_setfns - set a cleanup/init function | ||
529 | * @info: a subprocess_info returned by call_usermodehelper_setup | ||
530 | * @cleanup: a cleanup function | ||
531 | * @init: an init function | ||
532 | * @data: arbitrary context sensitive data | ||
533 | * | ||
534 | * The init function is used to customize the helper process prior to | ||
535 | * exec. A non-zero return code causes the process to error out, exit, | ||
536 | * and return the failure to the calling process | ||
537 | * | ||
538 | * The cleanup function is just before ethe subprocess_info is about to | ||
539 | * be freed. This can be used for freeing the argv and envp. The | ||
540 | * Function must be runnable in either a process context or the | ||
541 | * context in which call_usermodehelper_exec is called. | ||
542 | */ | ||
543 | static | ||
544 | void call_usermodehelper_setfns(struct subprocess_info *info, | ||
545 | int (*init)(struct subprocess_info *info, struct cred *new), | ||
546 | void (*cleanup)(struct subprocess_info *info), | ||
547 | void *data) | ||
548 | { | ||
549 | info->cleanup = cleanup; | ||
550 | info->init = init; | ||
551 | info->data = data; | ||
552 | } | ||
553 | 553 | ||
554 | /** | 554 | /** |
555 | * call_usermodehelper_exec - start a usermode application | 555 | * call_usermodehelper_exec - start a usermode application |
556 | * @sub_info: information about the subprocess | 556 | * @sub_info: information about the subprocess |
557 | * @wait: wait for the application to finish and return status. | 557 | * @wait: wait for the application to finish and return status. |
558 | * when -1 don't wait at all, but you get no useful error back when | 558 | * when UMH_NO_WAIT don't wait at all, but you get no useful error back |
559 | * the program couldn't be exec'ed. This makes it safe to call | 559 | * when the program couldn't be exec'ed. This makes it safe to call |
560 | * from interrupt context. | 560 | * from interrupt context. |
561 | * | 561 | * |
562 | * Runs a user-space application. The application is started | 562 | * Runs a user-space application. The application is started |
563 | * asynchronously if wait is not set, and runs as a child of keventd. | 563 | * asynchronously if wait is not set, and runs as a child of keventd. |
564 | * (ie. it runs with full root capabilities). | 564 | * (ie. it runs with full root capabilities). |
565 | */ | 565 | */ |
566 | static | ||
567 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | 566 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) |
568 | { | 567 | { |
569 | DECLARE_COMPLETION_ONSTACK(done); | 568 | DECLARE_COMPLETION_ONSTACK(done); |
570 | int retval = 0; | 569 | int retval = 0; |
571 | 570 | ||
572 | helper_lock(); | 571 | helper_lock(); |
572 | if (!sub_info->path) { | ||
573 | retval = -EINVAL; | ||
574 | goto out; | ||
575 | } | ||
576 | |||
573 | if (sub_info->path[0] == '\0') | 577 | if (sub_info->path[0] == '\0') |
574 | goto out; | 578 | goto out; |
575 | 579 | ||
@@ -615,31 +619,34 @@ unlock: | |||
615 | helper_unlock(); | 619 | helper_unlock(); |
616 | return retval; | 620 | return retval; |
617 | } | 621 | } |
622 | EXPORT_SYMBOL(call_usermodehelper_exec); | ||
618 | 623 | ||
619 | /* | 624 | /** |
620 | * call_usermodehelper_fns() will not run the caller-provided cleanup function | 625 | * call_usermodehelper() - prepare and start a usermode application |
621 | * if a memory allocation failure is experienced. So the caller might need to | 626 | * @path: path to usermode executable |
622 | * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform | 627 | * @argv: arg vector for process |
623 | * the necessaary cleanup within the caller. | 628 | * @envp: environment for process |
629 | * @wait: wait for the application to finish and return status. | ||
630 | * when UMH_NO_WAIT don't wait at all, but you get no useful error back | ||
631 | * when the program couldn't be exec'ed. This makes it safe to call | ||
632 | * from interrupt context. | ||
633 | * | ||
634 | * This function is the equivalent to use call_usermodehelper_setup() and | ||
635 | * call_usermodehelper_exec(). | ||
624 | */ | 636 | */ |
625 | int call_usermodehelper_fns( | 637 | int call_usermodehelper(char *path, char **argv, char **envp, int wait) |
626 | char *path, char **argv, char **envp, int wait, | ||
627 | int (*init)(struct subprocess_info *info, struct cred *new), | ||
628 | void (*cleanup)(struct subprocess_info *), void *data) | ||
629 | { | 638 | { |
630 | struct subprocess_info *info; | 639 | struct subprocess_info *info; |
631 | gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; | 640 | gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; |
632 | 641 | ||
633 | info = call_usermodehelper_setup(path, argv, envp, gfp_mask); | 642 | info = call_usermodehelper_setup(path, argv, envp, gfp_mask, |
634 | 643 | NULL, NULL, NULL); | |
635 | if (info == NULL) | 644 | if (info == NULL) |
636 | return -ENOMEM; | 645 | return -ENOMEM; |
637 | 646 | ||
638 | call_usermodehelper_setfns(info, init, cleanup, data); | ||
639 | |||
640 | return call_usermodehelper_exec(info, wait); | 647 | return call_usermodehelper_exec(info, wait); |
641 | } | 648 | } |
642 | EXPORT_SYMBOL(call_usermodehelper_fns); | 649 | EXPORT_SYMBOL(call_usermodehelper); |
643 | 650 | ||
644 | static int proc_cap_handler(struct ctl_table *table, int write, | 651 | static int proc_cap_handler(struct ctl_table *table, int write, |
645 | void __user *buffer, size_t *lenp, loff_t *ppos) | 652 | void __user *buffer, size_t *lenp, loff_t *ppos) |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e35be53f6613..3fed7f0cbcdf 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -794,16 +794,16 @@ out: | |||
794 | } | 794 | } |
795 | 795 | ||
796 | #ifdef CONFIG_SYSCTL | 796 | #ifdef CONFIG_SYSCTL |
797 | /* This should be called with kprobe_mutex locked */ | ||
798 | static void __kprobes optimize_all_kprobes(void) | 797 | static void __kprobes optimize_all_kprobes(void) |
799 | { | 798 | { |
800 | struct hlist_head *head; | 799 | struct hlist_head *head; |
801 | struct kprobe *p; | 800 | struct kprobe *p; |
802 | unsigned int i; | 801 | unsigned int i; |
803 | 802 | ||
803 | mutex_lock(&kprobe_mutex); | ||
804 | /* If optimization is already allowed, just return */ | 804 | /* If optimization is already allowed, just return */ |
805 | if (kprobes_allow_optimization) | 805 | if (kprobes_allow_optimization) |
806 | return; | 806 | goto out; |
807 | 807 | ||
808 | kprobes_allow_optimization = true; | 808 | kprobes_allow_optimization = true; |
809 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 809 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void) | |||
813 | optimize_kprobe(p); | 813 | optimize_kprobe(p); |
814 | } | 814 | } |
815 | printk(KERN_INFO "Kprobes globally optimized\n"); | 815 | printk(KERN_INFO "Kprobes globally optimized\n"); |
816 | out: | ||
817 | mutex_unlock(&kprobe_mutex); | ||
816 | } | 818 | } |
817 | 819 | ||
818 | /* This should be called with kprobe_mutex locked */ | ||
819 | static void __kprobes unoptimize_all_kprobes(void) | 820 | static void __kprobes unoptimize_all_kprobes(void) |
820 | { | 821 | { |
821 | struct hlist_head *head; | 822 | struct hlist_head *head; |
822 | struct kprobe *p; | 823 | struct kprobe *p; |
823 | unsigned int i; | 824 | unsigned int i; |
824 | 825 | ||
826 | mutex_lock(&kprobe_mutex); | ||
825 | /* If optimization is already prohibited, just return */ | 827 | /* If optimization is already prohibited, just return */ |
826 | if (!kprobes_allow_optimization) | 828 | if (!kprobes_allow_optimization) { |
829 | mutex_unlock(&kprobe_mutex); | ||
827 | return; | 830 | return; |
831 | } | ||
828 | 832 | ||
829 | kprobes_allow_optimization = false; | 833 | kprobes_allow_optimization = false; |
830 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 834 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
834 | unoptimize_kprobe(p, false); | 838 | unoptimize_kprobe(p, false); |
835 | } | 839 | } |
836 | } | 840 | } |
841 | mutex_unlock(&kprobe_mutex); | ||
842 | |||
837 | /* Wait for unoptimizing completion */ | 843 | /* Wait for unoptimizing completion */ |
838 | wait_for_kprobe_optimizer(); | 844 | wait_for_kprobe_optimizer(); |
839 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | 845 | printk(KERN_INFO "Kprobes globally unoptimized\n"); |
840 | } | 846 | } |
841 | 847 | ||
848 | static DEFINE_MUTEX(kprobe_sysctl_mutex); | ||
842 | int sysctl_kprobes_optimization; | 849 | int sysctl_kprobes_optimization; |
843 | int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | 850 | int proc_kprobes_optimization_handler(struct ctl_table *table, int write, |
844 | void __user *buffer, size_t *length, | 851 | void __user *buffer, size_t *length, |
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
846 | { | 853 | { |
847 | int ret; | 854 | int ret; |
848 | 855 | ||
849 | mutex_lock(&kprobe_mutex); | 856 | mutex_lock(&kprobe_sysctl_mutex); |
850 | sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; | 857 | sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; |
851 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 858 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
852 | 859 | ||
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
854 | optimize_all_kprobes(); | 861 | optimize_all_kprobes(); |
855 | else | 862 | else |
856 | unoptimize_all_kprobes(); | 863 | unoptimize_all_kprobes(); |
857 | mutex_unlock(&kprobe_mutex); | 864 | mutex_unlock(&kprobe_sysctl_mutex); |
858 | 865 | ||
859 | return ret; | 866 | return ret; |
860 | } | 867 | } |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..760e86df8c20 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/freezer.h> | 18 | #include <linux/freezer.h> |
19 | #include <linux/ptrace.h> | 19 | #include <linux/ptrace.h> |
20 | #include <linux/uaccess.h> | ||
20 | #include <trace/events/sched.h> | 21 | #include <trace/events/sched.h> |
21 | 22 | ||
22 | static DEFINE_SPINLOCK(kthread_create_lock); | 23 | static DEFINE_SPINLOCK(kthread_create_lock); |
@@ -52,8 +53,21 @@ enum KTHREAD_BITS { | |||
52 | KTHREAD_IS_PARKED, | 53 | KTHREAD_IS_PARKED, |
53 | }; | 54 | }; |
54 | 55 | ||
55 | #define to_kthread(tsk) \ | 56 | #define __to_kthread(vfork) \ |
56 | container_of((tsk)->vfork_done, struct kthread, exited) | 57 | container_of(vfork, struct kthread, exited) |
58 | |||
59 | static inline struct kthread *to_kthread(struct task_struct *k) | ||
60 | { | ||
61 | return __to_kthread(k->vfork_done); | ||
62 | } | ||
63 | |||
64 | static struct kthread *to_live_kthread(struct task_struct *k) | ||
65 | { | ||
66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); | ||
67 | if (likely(vfork)) | ||
68 | return __to_kthread(vfork); | ||
69 | return NULL; | ||
70 | } | ||
57 | 71 | ||
58 | /** | 72 | /** |
59 | * kthread_should_stop - should this kthread return now? | 73 | * kthread_should_stop - should this kthread return now? |
@@ -122,14 +136,32 @@ void *kthread_data(struct task_struct *task) | |||
122 | return to_kthread(task)->data; | 136 | return to_kthread(task)->data; |
123 | } | 137 | } |
124 | 138 | ||
139 | /** | ||
140 | * probe_kthread_data - speculative version of kthread_data() | ||
141 | * @task: possible kthread task in question | ||
142 | * | ||
143 | * @task could be a kthread task. Return the data value specified when it | ||
144 | * was created if accessible. If @task isn't a kthread task or its data is | ||
145 | * inaccessible for any reason, %NULL is returned. This function requires | ||
146 | * that @task itself is safe to dereference. | ||
147 | */ | ||
148 | void *probe_kthread_data(struct task_struct *task) | ||
149 | { | ||
150 | struct kthread *kthread = to_kthread(task); | ||
151 | void *data = NULL; | ||
152 | |||
153 | probe_kernel_read(&data, &kthread->data, sizeof(data)); | ||
154 | return data; | ||
155 | } | ||
156 | |||
125 | static void __kthread_parkme(struct kthread *self) | 157 | static void __kthread_parkme(struct kthread *self) |
126 | { | 158 | { |
127 | __set_current_state(TASK_INTERRUPTIBLE); | 159 | __set_current_state(TASK_PARKED); |
128 | while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { | 160 | while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { |
129 | if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) | 161 | if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) |
130 | complete(&self->parked); | 162 | complete(&self->parked); |
131 | schedule(); | 163 | schedule(); |
132 | __set_current_state(TASK_INTERRUPTIBLE); | 164 | __set_current_state(TASK_PARKED); |
133 | } | 165 | } |
134 | clear_bit(KTHREAD_IS_PARKED, &self->flags); | 166 | clear_bit(KTHREAD_IS_PARKED, &self->flags); |
135 | __set_current_state(TASK_RUNNING); | 167 | __set_current_state(TASK_RUNNING); |
@@ -256,11 +288,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
256 | } | 288 | } |
257 | EXPORT_SYMBOL(kthread_create_on_node); | 289 | EXPORT_SYMBOL(kthread_create_on_node); |
258 | 290 | ||
259 | static void __kthread_bind(struct task_struct *p, unsigned int cpu) | 291 | static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) |
260 | { | 292 | { |
293 | /* Must have done schedule() in kthread() before we set_task_cpu */ | ||
294 | if (!wait_task_inactive(p, state)) { | ||
295 | WARN_ON(1); | ||
296 | return; | ||
297 | } | ||
261 | /* It's safe because the task is inactive. */ | 298 | /* It's safe because the task is inactive. */ |
262 | do_set_cpus_allowed(p, cpumask_of(cpu)); | 299 | do_set_cpus_allowed(p, cpumask_of(cpu)); |
263 | p->flags |= PF_THREAD_BOUND; | 300 | p->flags |= PF_NO_SETAFFINITY; |
264 | } | 301 | } |
265 | 302 | ||
266 | /** | 303 | /** |
@@ -274,12 +311,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) | |||
274 | */ | 311 | */ |
275 | void kthread_bind(struct task_struct *p, unsigned int cpu) | 312 | void kthread_bind(struct task_struct *p, unsigned int cpu) |
276 | { | 313 | { |
277 | /* Must have done schedule() in kthread() before we set_task_cpu */ | 314 | __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); |
278 | if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { | ||
279 | WARN_ON(1); | ||
280 | return; | ||
281 | } | ||
282 | __kthread_bind(p, cpu); | ||
283 | } | 315 | } |
284 | EXPORT_SYMBOL(kthread_bind); | 316 | EXPORT_SYMBOL(kthread_bind); |
285 | 317 | ||
@@ -311,17 +343,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
311 | return p; | 343 | return p; |
312 | } | 344 | } |
313 | 345 | ||
314 | static struct kthread *task_get_live_kthread(struct task_struct *k) | 346 | static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) |
315 | { | 347 | { |
316 | struct kthread *kthread; | 348 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
317 | 349 | /* | |
318 | get_task_struct(k); | 350 | * We clear the IS_PARKED bit here as we don't wait |
319 | kthread = to_kthread(k); | 351 | * until the task has left the park code. So if we'd |
320 | /* It might have exited */ | 352 | * park before that happens we'd see the IS_PARKED bit |
321 | barrier(); | 353 | * which might be about to be cleared. |
322 | if (k->vfork_done != NULL) | 354 | */ |
323 | return kthread; | 355 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { |
324 | return NULL; | 356 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) |
357 | __kthread_bind(k, kthread->cpu, TASK_PARKED); | ||
358 | wake_up_state(k, TASK_PARKED); | ||
359 | } | ||
325 | } | 360 | } |
326 | 361 | ||
327 | /** | 362 | /** |
@@ -334,23 +369,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k) | |||
334 | */ | 369 | */ |
335 | void kthread_unpark(struct task_struct *k) | 370 | void kthread_unpark(struct task_struct *k) |
336 | { | 371 | { |
337 | struct kthread *kthread = task_get_live_kthread(k); | 372 | struct kthread *kthread = to_live_kthread(k); |
338 | 373 | ||
339 | if (kthread) { | 374 | if (kthread) |
340 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 375 | __kthread_unpark(k, kthread); |
341 | /* | ||
342 | * We clear the IS_PARKED bit here as we don't wait | ||
343 | * until the task has left the park code. So if we'd | ||
344 | * park before that happens we'd see the IS_PARKED bit | ||
345 | * which might be about to be cleared. | ||
346 | */ | ||
347 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | ||
348 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) | ||
349 | __kthread_bind(k, kthread->cpu); | ||
350 | wake_up_process(k); | ||
351 | } | ||
352 | } | ||
353 | put_task_struct(k); | ||
354 | } | 376 | } |
355 | 377 | ||
356 | /** | 378 | /** |
@@ -367,7 +389,7 @@ void kthread_unpark(struct task_struct *k) | |||
367 | */ | 389 | */ |
368 | int kthread_park(struct task_struct *k) | 390 | int kthread_park(struct task_struct *k) |
369 | { | 391 | { |
370 | struct kthread *kthread = task_get_live_kthread(k); | 392 | struct kthread *kthread = to_live_kthread(k); |
371 | int ret = -ENOSYS; | 393 | int ret = -ENOSYS; |
372 | 394 | ||
373 | if (kthread) { | 395 | if (kthread) { |
@@ -380,7 +402,6 @@ int kthread_park(struct task_struct *k) | |||
380 | } | 402 | } |
381 | ret = 0; | 403 | ret = 0; |
382 | } | 404 | } |
383 | put_task_struct(k); | ||
384 | return ret; | 405 | return ret; |
385 | } | 406 | } |
386 | 407 | ||
@@ -401,21 +422,23 @@ int kthread_park(struct task_struct *k) | |||
401 | */ | 422 | */ |
402 | int kthread_stop(struct task_struct *k) | 423 | int kthread_stop(struct task_struct *k) |
403 | { | 424 | { |
404 | struct kthread *kthread = task_get_live_kthread(k); | 425 | struct kthread *kthread; |
405 | int ret; | 426 | int ret; |
406 | 427 | ||
407 | trace_sched_kthread_stop(k); | 428 | trace_sched_kthread_stop(k); |
429 | |||
430 | get_task_struct(k); | ||
431 | kthread = to_live_kthread(k); | ||
408 | if (kthread) { | 432 | if (kthread) { |
409 | set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); | 433 | set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); |
410 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 434 | __kthread_unpark(k, kthread); |
411 | wake_up_process(k); | 435 | wake_up_process(k); |
412 | wait_for_completion(&kthread->exited); | 436 | wait_for_completion(&kthread->exited); |
413 | } | 437 | } |
414 | ret = k->exit_code; | 438 | ret = k->exit_code; |
415 | |||
416 | put_task_struct(k); | 439 | put_task_struct(k); |
417 | trace_sched_kthread_stop_ret(ret); | ||
418 | 440 | ||
441 | trace_sched_kthread_stop_ret(ret); | ||
419 | return ret; | 442 | return ret; |
420 | } | 443 | } |
421 | EXPORT_SYMBOL(kthread_stop); | 444 | EXPORT_SYMBOL(kthread_stop); |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d9..1f3186b37fd5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class) | |||
380 | unsigned long nr_stack_trace_entries; | 380 | unsigned long nr_stack_trace_entries; |
381 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; | 381 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; |
382 | 382 | ||
383 | static void print_lockdep_off(const char *bug_msg) | ||
384 | { | ||
385 | printk(KERN_DEBUG "%s\n", bug_msg); | ||
386 | printk(KERN_DEBUG "turning off the locking correctness validator.\n"); | ||
387 | printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); | ||
388 | } | ||
389 | |||
383 | static int save_trace(struct stack_trace *trace) | 390 | static int save_trace(struct stack_trace *trace) |
384 | { | 391 | { |
385 | trace->nr_entries = 0; | 392 | trace->nr_entries = 0; |
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace) | |||
409 | if (!debug_locks_off_graph_unlock()) | 416 | if (!debug_locks_off_graph_unlock()) |
410 | return 0; | 417 | return 0; |
411 | 418 | ||
412 | printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); | 419 | print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); |
413 | printk("turning off the locking correctness validator.\n"); | ||
414 | dump_stack(); | 420 | dump_stack(); |
415 | 421 | ||
416 | return 0; | 422 | return 0; |
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
763 | } | 769 | } |
764 | raw_local_irq_restore(flags); | 770 | raw_local_irq_restore(flags); |
765 | 771 | ||
766 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | 772 | print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); |
767 | printk("turning off the locking correctness validator.\n"); | ||
768 | dump_stack(); | 773 | dump_stack(); |
769 | return NULL; | 774 | return NULL; |
770 | } | 775 | } |
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void) | |||
834 | if (!debug_locks_off_graph_unlock()) | 839 | if (!debug_locks_off_graph_unlock()) |
835 | return NULL; | 840 | return NULL; |
836 | 841 | ||
837 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | 842 | print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); |
838 | printk("turning off the locking correctness validator.\n"); | ||
839 | dump_stack(); | 843 | dump_stack(); |
840 | return NULL; | 844 | return NULL; |
841 | } | 845 | } |
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
2000 | struct lock_class *class = hlock_class(hlock); | 2004 | struct lock_class *class = hlock_class(hlock); |
2001 | struct list_head *hash_head = chainhashentry(chain_key); | 2005 | struct list_head *hash_head = chainhashentry(chain_key); |
2002 | struct lock_chain *chain; | 2006 | struct lock_chain *chain; |
2003 | struct held_lock *hlock_curr, *hlock_next; | 2007 | struct held_lock *hlock_curr; |
2004 | int i, j; | 2008 | int i, j; |
2005 | 2009 | ||
2006 | /* | 2010 | /* |
@@ -2048,8 +2052,7 @@ cache_hit: | |||
2048 | if (!debug_locks_off_graph_unlock()) | 2052 | if (!debug_locks_off_graph_unlock()) |
2049 | return 0; | 2053 | return 0; |
2050 | 2054 | ||
2051 | printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); | 2055 | print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); |
2052 | printk("turning off the locking correctness validator.\n"); | ||
2053 | dump_stack(); | 2056 | dump_stack(); |
2054 | return 0; | 2057 | return 0; |
2055 | } | 2058 | } |
@@ -2057,12 +2060,10 @@ cache_hit: | |||
2057 | chain->chain_key = chain_key; | 2060 | chain->chain_key = chain_key; |
2058 | chain->irq_context = hlock->irq_context; | 2061 | chain->irq_context = hlock->irq_context; |
2059 | /* Find the first held_lock of current chain */ | 2062 | /* Find the first held_lock of current chain */ |
2060 | hlock_next = hlock; | ||
2061 | for (i = curr->lockdep_depth - 1; i >= 0; i--) { | 2063 | for (i = curr->lockdep_depth - 1; i >= 0; i--) { |
2062 | hlock_curr = curr->held_locks + i; | 2064 | hlock_curr = curr->held_locks + i; |
2063 | if (hlock_curr->irq_context != hlock_next->irq_context) | 2065 | if (hlock_curr->irq_context != hlock->irq_context) |
2064 | break; | 2066 | break; |
2065 | hlock_next = hlock; | ||
2066 | } | 2067 | } |
2067 | i++; | 2068 | i++; |
2068 | chain->depth = curr->lockdep_depth + 1 - i; | 2069 | chain->depth = curr->lockdep_depth + 1 - i; |
@@ -2997,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2997 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 2998 | EXPORT_SYMBOL_GPL(lockdep_init_map); |
2998 | 2999 | ||
2999 | struct lock_class_key __lockdep_no_validate__; | 3000 | struct lock_class_key __lockdep_no_validate__; |
3001 | EXPORT_SYMBOL_GPL(__lockdep_no_validate__); | ||
3000 | 3002 | ||
3001 | static int | 3003 | static int |
3002 | print_lock_nested_lock_not_held(struct task_struct *curr, | 3004 | print_lock_nested_lock_not_held(struct task_struct *curr, |
@@ -3190,9 +3192,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3190 | #endif | 3192 | #endif |
3191 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { | 3193 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { |
3192 | debug_locks_off(); | 3194 | debug_locks_off(); |
3193 | printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", | 3195 | print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); |
3196 | printk(KERN_DEBUG "depth: %i max: %lu!\n", | ||
3194 | curr->lockdep_depth, MAX_LOCK_DEPTH); | 3197 | curr->lockdep_depth, MAX_LOCK_DEPTH); |
3195 | printk("turning off the locking correctness validator.\n"); | ||
3196 | 3198 | ||
3197 | lockdep_print_held_locks(current); | 3199 | lockdep_print_held_locks(current); |
3198 | debug_show_all_locks(); | 3200 | debug_show_all_locks(); |
@@ -4088,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | |||
4088 | } | 4090 | } |
4089 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); | 4091 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); |
4090 | 4092 | ||
4091 | static void print_held_locks_bug(void) | 4093 | static void print_held_locks_bug(struct task_struct *curr) |
4092 | { | 4094 | { |
4093 | if (!debug_locks_off()) | 4095 | if (!debug_locks_off()) |
4094 | return; | 4096 | return; |
@@ -4097,21 +4099,22 @@ static void print_held_locks_bug(void) | |||
4097 | 4099 | ||
4098 | printk("\n"); | 4100 | printk("\n"); |
4099 | printk("=====================================\n"); | 4101 | printk("=====================================\n"); |
4100 | printk("[ BUG: %s/%d still has locks held! ]\n", | 4102 | printk("[ BUG: lock held at task exit time! ]\n"); |
4101 | current->comm, task_pid_nr(current)); | ||
4102 | print_kernel_ident(); | 4103 | print_kernel_ident(); |
4103 | printk("-------------------------------------\n"); | 4104 | printk("-------------------------------------\n"); |
4104 | lockdep_print_held_locks(current); | 4105 | printk("%s/%d is exiting with locks still held!\n", |
4106 | curr->comm, task_pid_nr(curr)); | ||
4107 | lockdep_print_held_locks(curr); | ||
4108 | |||
4105 | printk("\nstack backtrace:\n"); | 4109 | printk("\nstack backtrace:\n"); |
4106 | dump_stack(); | 4110 | dump_stack(); |
4107 | } | 4111 | } |
4108 | 4112 | ||
4109 | void debug_check_no_locks_held(void) | 4113 | void debug_check_no_locks_held(struct task_struct *task) |
4110 | { | 4114 | { |
4111 | if (unlikely(current->lockdep_depth > 0)) | 4115 | if (unlikely(task->lockdep_depth > 0)) |
4112 | print_held_locks_bug(); | 4116 | print_held_locks_bug(task); |
4113 | } | 4117 | } |
4114 | EXPORT_SYMBOL_GPL(debug_check_no_locks_held); | ||
4115 | 4118 | ||
4116 | void debug_show_all_locks(void) | 4119 | void debug_show_all_locks(void) |
4117 | { | 4120 | { |
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S index 246b4c6e6135..4a9a86d12c8b 100644 --- a/kernel/modsign_certificate.S +++ b/kernel/modsign_certificate.S | |||
@@ -1,15 +1,8 @@ | |||
1 | /* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ | 1 | #include <linux/export.h> |
2 | #ifndef SYMBOL_PREFIX | ||
3 | #define ASM_SYMBOL(sym) sym | ||
4 | #else | ||
5 | #define PASTE2(x,y) x##y | ||
6 | #define PASTE(x,y) PASTE2(x,y) | ||
7 | #define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) | ||
8 | #endif | ||
9 | 2 | ||
10 | #define GLOBAL(name) \ | 3 | #define GLOBAL(name) \ |
11 | .globl ASM_SYMBOL(name); \ | 4 | .globl VMLINUX_SYMBOL(name); \ |
12 | ASM_SYMBOL(name): | 5 | VMLINUX_SYMBOL(name): |
13 | 6 | ||
14 | .section ".init.data","aw" | 7 | .section ".init.data","aw" |
15 | 8 | ||
diff --git a/kernel/module.c b/kernel/module.c index 0925c9a71975..cab4bce49c23 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, | |||
1209 | 1209 | ||
1210 | /* Since this should be found in kernel (which can't be removed), | 1210 | /* Since this should be found in kernel (which can't be removed), |
1211 | * no locking is necessary. */ | 1211 | * no locking is necessary. */ |
1212 | if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, | 1212 | if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, |
1213 | &crc, true, false)) | 1213 | &crc, true, false)) |
1214 | BUG(); | 1214 | BUG(); |
1215 | return check_version(sechdrs, versindex, "module_layout", mod, crc, | 1215 | return check_version(sechdrs, versindex, |
1216 | VMLINUX_SYMBOL_STR(module_layout), mod, crc, | ||
1216 | NULL); | 1217 | NULL); |
1217 | } | 1218 | } |
1218 | 1219 | ||
@@ -1861,12 +1862,12 @@ static void free_module(struct module *mod) | |||
1861 | { | 1862 | { |
1862 | trace_module_free(mod); | 1863 | trace_module_free(mod); |
1863 | 1864 | ||
1864 | /* Delete from various lists */ | ||
1865 | mutex_lock(&module_mutex); | ||
1866 | stop_machine(__unlink_module, mod, NULL); | ||
1867 | mutex_unlock(&module_mutex); | ||
1868 | mod_sysfs_teardown(mod); | 1865 | mod_sysfs_teardown(mod); |
1869 | 1866 | ||
1867 | /* We leave it in list to prevent duplicate loads, but make sure | ||
1868 | * that noone uses it while it's being deconstructed. */ | ||
1869 | mod->state = MODULE_STATE_UNFORMED; | ||
1870 | |||
1870 | /* Remove dynamic debug info */ | 1871 | /* Remove dynamic debug info */ |
1871 | ddebug_remove_module(mod->name); | 1872 | ddebug_remove_module(mod->name); |
1872 | 1873 | ||
@@ -1879,6 +1880,11 @@ static void free_module(struct module *mod) | |||
1879 | /* Free any allocated parameters. */ | 1880 | /* Free any allocated parameters. */ |
1880 | destroy_params(mod->kp, mod->num_kp); | 1881 | destroy_params(mod->kp, mod->num_kp); |
1881 | 1882 | ||
1883 | /* Now we can delete it from the lists */ | ||
1884 | mutex_lock(&module_mutex); | ||
1885 | stop_machine(__unlink_module, mod, NULL); | ||
1886 | mutex_unlock(&module_mutex); | ||
1887 | |||
1882 | /* This may be NULL, but that's OK */ | 1888 | /* This may be NULL, but that's OK */ |
1883 | unset_module_init_ro_nx(mod); | 1889 | unset_module_init_ro_nx(mod); |
1884 | module_free(mod, mod->module_init); | 1890 | module_free(mod, mod->module_init); |
@@ -2425,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod, | |||
2425 | kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); | 2431 | kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); |
2426 | 2432 | ||
2427 | for (i = 1; i < info->hdr->e_shnum; i++) { | 2433 | for (i = 1; i < info->hdr->e_shnum; i++) { |
2428 | const char *name = info->secstrings + info->sechdrs[i].sh_name; | 2434 | /* Scan all writable sections that's not executable */ |
2429 | if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) | 2435 | if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || |
2430 | continue; | 2436 | !(info->sechdrs[i].sh_flags & SHF_WRITE) || |
2431 | if (!strstarts(name, ".data") && !strstarts(name, ".bss")) | 2437 | (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) |
2432 | continue; | 2438 | continue; |
2433 | 2439 | ||
2434 | kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, | 2440 | kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, |
@@ -2763,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2763 | mod->trace_events = section_objs(info, "_ftrace_events", | 2769 | mod->trace_events = section_objs(info, "_ftrace_events", |
2764 | sizeof(*mod->trace_events), | 2770 | sizeof(*mod->trace_events), |
2765 | &mod->num_trace_events); | 2771 | &mod->num_trace_events); |
2766 | /* | ||
2767 | * This section contains pointers to allocated objects in the trace | ||
2768 | * code and not scanning it leads to false positives. | ||
2769 | */ | ||
2770 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | ||
2771 | mod->num_trace_events, GFP_KERNEL); | ||
2772 | #endif | 2772 | #endif |
2773 | #ifdef CONFIG_TRACING | 2773 | #ifdef CONFIG_TRACING |
2774 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 2774 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", |
2775 | sizeof(*mod->trace_bprintk_fmt_start), | 2775 | sizeof(*mod->trace_bprintk_fmt_start), |
2776 | &mod->num_trace_bprintk_fmt); | 2776 | &mod->num_trace_bprintk_fmt); |
2777 | /* | ||
2778 | * This section contains pointers to allocated objects in the trace | ||
2779 | * code and not scanning it leads to false positives. | ||
2780 | */ | ||
2781 | kmemleak_scan_area(mod->trace_bprintk_fmt_start, | ||
2782 | sizeof(*mod->trace_bprintk_fmt_start) * | ||
2783 | mod->num_trace_bprintk_fmt, GFP_KERNEL); | ||
2784 | #endif | 2777 | #endif |
2785 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | 2778 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD |
2786 | /* sechdrs[0].sh_size is always zero */ | 2779 | /* sechdrs[0].sh_size is always zero */ |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e0..ad53a664f113 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -37,6 +37,12 @@ | |||
37 | # include <asm/mutex.h> | 37 | # include <asm/mutex.h> |
38 | #endif | 38 | #endif |
39 | 39 | ||
40 | /* | ||
41 | * A negative mutex count indicates that waiters are sleeping waiting for the | ||
42 | * mutex. | ||
43 | */ | ||
44 | #define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) | ||
45 | |||
40 | void | 46 | void |
41 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | 47 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) |
42 | { | 48 | { |
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | |||
44 | spin_lock_init(&lock->wait_lock); | 50 | spin_lock_init(&lock->wait_lock); |
45 | INIT_LIST_HEAD(&lock->wait_list); | 51 | INIT_LIST_HEAD(&lock->wait_list); |
46 | mutex_clear_owner(lock); | 52 | mutex_clear_owner(lock); |
53 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
54 | lock->spin_mlock = NULL; | ||
55 | #endif | ||
47 | 56 | ||
48 | debug_mutex_init(lock, name, key); | 57 | debug_mutex_init(lock, name, key); |
49 | } | 58 | } |
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock) | |||
95 | EXPORT_SYMBOL(mutex_lock); | 104 | EXPORT_SYMBOL(mutex_lock); |
96 | #endif | 105 | #endif |
97 | 106 | ||
107 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
108 | /* | ||
109 | * In order to avoid a stampede of mutex spinners from acquiring the mutex | ||
110 | * more or less simultaneously, the spinners need to acquire a MCS lock | ||
111 | * first before spinning on the owner field. | ||
112 | * | ||
113 | * We don't inline mspin_lock() so that perf can correctly account for the | ||
114 | * time spent in this lock function. | ||
115 | */ | ||
116 | struct mspin_node { | ||
117 | struct mspin_node *next ; | ||
118 | int locked; /* 1 if lock acquired */ | ||
119 | }; | ||
120 | #define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) | ||
121 | |||
122 | static noinline | ||
123 | void mspin_lock(struct mspin_node **lock, struct mspin_node *node) | ||
124 | { | ||
125 | struct mspin_node *prev; | ||
126 | |||
127 | /* Init node */ | ||
128 | node->locked = 0; | ||
129 | node->next = NULL; | ||
130 | |||
131 | prev = xchg(lock, node); | ||
132 | if (likely(prev == NULL)) { | ||
133 | /* Lock acquired */ | ||
134 | node->locked = 1; | ||
135 | return; | ||
136 | } | ||
137 | ACCESS_ONCE(prev->next) = node; | ||
138 | smp_wmb(); | ||
139 | /* Wait until the lock holder passes the lock down */ | ||
140 | while (!ACCESS_ONCE(node->locked)) | ||
141 | arch_mutex_cpu_relax(); | ||
142 | } | ||
143 | |||
144 | static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) | ||
145 | { | ||
146 | struct mspin_node *next = ACCESS_ONCE(node->next); | ||
147 | |||
148 | if (likely(!next)) { | ||
149 | /* | ||
150 | * Release the lock by setting it to NULL | ||
151 | */ | ||
152 | if (cmpxchg(lock, node, NULL) == node) | ||
153 | return; | ||
154 | /* Wait until the next pointer is set */ | ||
155 | while (!(next = ACCESS_ONCE(node->next))) | ||
156 | arch_mutex_cpu_relax(); | ||
157 | } | ||
158 | ACCESS_ONCE(next->locked) = 1; | ||
159 | smp_wmb(); | ||
160 | } | ||
161 | |||
162 | /* | ||
163 | * Mutex spinning code migrated from kernel/sched/core.c | ||
164 | */ | ||
165 | |||
166 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
167 | { | ||
168 | if (lock->owner != owner) | ||
169 | return false; | ||
170 | |||
171 | /* | ||
172 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
173 | * lock->owner still matches owner, if that fails, owner might | ||
174 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
175 | * ensures the memory stays valid. | ||
176 | */ | ||
177 | barrier(); | ||
178 | |||
179 | return owner->on_cpu; | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * Look out! "owner" is an entirely speculative pointer | ||
184 | * access and not reliable. | ||
185 | */ | ||
186 | static noinline | ||
187 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | ||
188 | { | ||
189 | rcu_read_lock(); | ||
190 | while (owner_running(lock, owner)) { | ||
191 | if (need_resched()) | ||
192 | break; | ||
193 | |||
194 | arch_mutex_cpu_relax(); | ||
195 | } | ||
196 | rcu_read_unlock(); | ||
197 | |||
198 | /* | ||
199 | * We break out the loop above on need_resched() and when the | ||
200 | * owner changed, which is a sign for heavy contention. Return | ||
201 | * success only when lock->owner is NULL. | ||
202 | */ | ||
203 | return lock->owner == NULL; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * Initial check for entering the mutex spinning loop | ||
208 | */ | ||
209 | static inline int mutex_can_spin_on_owner(struct mutex *lock) | ||
210 | { | ||
211 | int retval = 1; | ||
212 | |||
213 | rcu_read_lock(); | ||
214 | if (lock->owner) | ||
215 | retval = lock->owner->on_cpu; | ||
216 | rcu_read_unlock(); | ||
217 | /* | ||
218 | * if lock->owner is not set, the mutex owner may have just acquired | ||
219 | * it and not set the owner yet or the mutex has been released. | ||
220 | */ | ||
221 | return retval; | ||
222 | } | ||
223 | #endif | ||
224 | |||
98 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); | 225 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); |
99 | 226 | ||
100 | /** | 227 | /** |
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
158 | * | 285 | * |
159 | * We can't do this for DEBUG_MUTEXES because that relies on wait_lock | 286 | * We can't do this for DEBUG_MUTEXES because that relies on wait_lock |
160 | * to serialize everything. | 287 | * to serialize everything. |
288 | * | ||
289 | * The mutex spinners are queued up using MCS lock so that only one | ||
290 | * spinner can compete for the mutex. However, if mutex spinning isn't | ||
291 | * going to happen, there is no point in going through the lock/unlock | ||
292 | * overhead. | ||
161 | */ | 293 | */ |
294 | if (!mutex_can_spin_on_owner(lock)) | ||
295 | goto slowpath; | ||
162 | 296 | ||
163 | for (;;) { | 297 | for (;;) { |
164 | struct task_struct *owner; | 298 | struct task_struct *owner; |
299 | struct mspin_node node; | ||
165 | 300 | ||
166 | /* | 301 | /* |
167 | * If there's an owner, wait for it to either | 302 | * If there's an owner, wait for it to either |
168 | * release the lock or go to sleep. | 303 | * release the lock or go to sleep. |
169 | */ | 304 | */ |
305 | mspin_lock(MLOCK(lock), &node); | ||
170 | owner = ACCESS_ONCE(lock->owner); | 306 | owner = ACCESS_ONCE(lock->owner); |
171 | if (owner && !mutex_spin_on_owner(lock, owner)) | 307 | if (owner && !mutex_spin_on_owner(lock, owner)) { |
308 | mspin_unlock(MLOCK(lock), &node); | ||
172 | break; | 309 | break; |
310 | } | ||
173 | 311 | ||
174 | if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { | 312 | if ((atomic_read(&lock->count) == 1) && |
313 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | ||
175 | lock_acquired(&lock->dep_map, ip); | 314 | lock_acquired(&lock->dep_map, ip); |
176 | mutex_set_owner(lock); | 315 | mutex_set_owner(lock); |
316 | mspin_unlock(MLOCK(lock), &node); | ||
177 | preempt_enable(); | 317 | preempt_enable(); |
178 | return 0; | 318 | return 0; |
179 | } | 319 | } |
320 | mspin_unlock(MLOCK(lock), &node); | ||
180 | 321 | ||
181 | /* | 322 | /* |
182 | * When there's no owner, we might have preempted between the | 323 | * When there's no owner, we might have preempted between the |
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
195 | */ | 336 | */ |
196 | arch_mutex_cpu_relax(); | 337 | arch_mutex_cpu_relax(); |
197 | } | 338 | } |
339 | slowpath: | ||
198 | #endif | 340 | #endif |
199 | spin_lock_mutex(&lock->wait_lock, flags); | 341 | spin_lock_mutex(&lock->wait_lock, flags); |
200 | 342 | ||
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
205 | list_add_tail(&waiter.list, &lock->wait_list); | 347 | list_add_tail(&waiter.list, &lock->wait_list); |
206 | waiter.task = task; | 348 | waiter.task = task; |
207 | 349 | ||
208 | if (atomic_xchg(&lock->count, -1) == 1) | 350 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) |
209 | goto done; | 351 | goto done; |
210 | 352 | ||
211 | lock_contended(&lock->dep_map, ip); | 353 | lock_contended(&lock->dep_map, ip); |
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
220 | * that when we release the lock, we properly wake up the | 362 | * that when we release the lock, we properly wake up the |
221 | * other waiters: | 363 | * other waiters: |
222 | */ | 364 | */ |
223 | if (atomic_xchg(&lock->count, -1) == 1) | 365 | if (MUTEX_SHOW_NO_WAITER(lock) && |
366 | (atomic_xchg(&lock->count, -1) == 1)) | ||
224 | break; | 367 | break; |
225 | 368 | ||
226 | /* | 369 | /* |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index afc0456f227a..364ceab15f0c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | #include <net/net_namespace.h> | 23 | #include <net/net_namespace.h> |
24 | #include <linux/ipc_namespace.h> | 24 | #include <linux/ipc_namespace.h> |
25 | #include <linux/proc_fs.h> | 25 | #include <linux/proc_ns.h> |
26 | #include <linux/file.h> | 26 | #include <linux/file.h> |
27 | #include <linux/syscalls.h> | 27 | #include <linux/syscalls.h> |
28 | 28 | ||
@@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
241 | const struct proc_ns_operations *ops; | 241 | const struct proc_ns_operations *ops; |
242 | struct task_struct *tsk = current; | 242 | struct task_struct *tsk = current; |
243 | struct nsproxy *new_nsproxy; | 243 | struct nsproxy *new_nsproxy; |
244 | struct proc_inode *ei; | 244 | struct proc_ns *ei; |
245 | struct file *file; | 245 | struct file *file; |
246 | int err; | 246 | int err; |
247 | 247 | ||
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
250 | return PTR_ERR(file); | 250 | return PTR_ERR(file); |
251 | 251 | ||
252 | err = -EINVAL; | 252 | err = -EINVAL; |
253 | ei = PROC_I(file_inode(file)); | 253 | ei = get_proc_ns(file_inode(file)); |
254 | ops = ei->ns_ops; | 254 | ops = ei->ns_ops; |
255 | if (nstype && (ops->type != nstype)) | 255 | if (nstype && (ops->type != nstype)) |
256 | goto out; | 256 | goto out; |
diff --git a/kernel/panic.c b/kernel/panic.c index 7c57cc9eee2c..167ec097ce8b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -22,7 +22,6 @@ | |||
22 | #include <linux/sysrq.h> | 22 | #include <linux/sysrq.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/nmi.h> | 24 | #include <linux/nmi.h> |
25 | #include <linux/dmi.h> | ||
26 | 25 | ||
27 | #define PANIC_TIMER_STEP 100 | 26 | #define PANIC_TIMER_STEP 100 |
28 | #define PANIC_BLINK_SPD 18 | 27 | #define PANIC_BLINK_SPD 18 |
@@ -400,13 +399,8 @@ struct slowpath_args { | |||
400 | static void warn_slowpath_common(const char *file, int line, void *caller, | 399 | static void warn_slowpath_common(const char *file, int line, void *caller, |
401 | unsigned taint, struct slowpath_args *args) | 400 | unsigned taint, struct slowpath_args *args) |
402 | { | 401 | { |
403 | const char *board; | ||
404 | |||
405 | printk(KERN_WARNING "------------[ cut here ]------------\n"); | 402 | printk(KERN_WARNING "------------[ cut here ]------------\n"); |
406 | printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); | 403 | printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); |
407 | board = dmi_get_system_info(DMI_PRODUCT_NAME); | ||
408 | if (board) | ||
409 | printk(KERN_WARNING "Hardware name: %s\n", board); | ||
410 | 404 | ||
411 | if (args) | 405 | if (args) |
412 | vprintk(args->fmt, args->args); | 406 | vprintk(args->fmt, args->args); |
diff --git a/kernel/params.c b/kernel/params.c index ed35345be536..53b958fcd639 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, | |||
613 | sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), | 613 | sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), |
614 | GFP_KERNEL); | 614 | GFP_KERNEL); |
615 | if (!new) { | 615 | if (!new) { |
616 | kfree(mk->mp); | 616 | kfree(attrs); |
617 | err = -ENOMEM; | 617 | err = -ENOMEM; |
618 | goto fail; | 618 | goto fail; |
619 | } | 619 | } |
620 | /* Despite looking like the typical realloc() bug, this is safe. | ||
621 | * We *want* the old 'attrs' to be freed either way, and we'll store | ||
622 | * the new one in the success case. */ | ||
620 | attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); | 623 | attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); |
621 | if (!attrs) { | 624 | if (!attrs) { |
622 | err = -ENOMEM; | 625 | err = -ENOMEM; |
diff --git a/kernel/pid.c b/kernel/pid.c index 047dc6264638..0db3e791a06d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/pid_namespace.h> | 36 | #include <linux/pid_namespace.h> |
37 | #include <linux/init_task.h> | 37 | #include <linux/init_task.h> |
38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
39 | #include <linux/proc_ns.h> | ||
39 | #include <linux/proc_fs.h> | 40 | #include <linux/proc_fs.h> |
40 | 41 | ||
41 | #define pid_hashfn(nr, ns) \ | 42 | #define pid_hashfn(nr, ns) \ |
@@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT; | |||
51 | int pid_max_min = RESERVED_PIDS + 1; | 52 | int pid_max_min = RESERVED_PIDS + 1; |
52 | int pid_max_max = PID_MAX_LIMIT; | 53 | int pid_max_max = PID_MAX_LIMIT; |
53 | 54 | ||
54 | #define BITS_PER_PAGE (PAGE_SIZE*8) | ||
55 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) | ||
56 | |||
57 | static inline int mk_pid(struct pid_namespace *pid_ns, | 55 | static inline int mk_pid(struct pid_namespace *pid_ns, |
58 | struct pidmap *map, int off) | 56 | struct pidmap *map, int off) |
59 | { | 57 | { |
@@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
183 | break; | 181 | break; |
184 | } | 182 | } |
185 | if (likely(atomic_read(&map->nr_free))) { | 183 | if (likely(atomic_read(&map->nr_free))) { |
186 | do { | 184 | for ( ; ; ) { |
187 | if (!test_and_set_bit(offset, map->page)) { | 185 | if (!test_and_set_bit(offset, map->page)) { |
188 | atomic_dec(&map->nr_free); | 186 | atomic_dec(&map->nr_free); |
189 | set_last_pid(pid_ns, last, pid); | 187 | set_last_pid(pid_ns, last, pid); |
190 | return pid; | 188 | return pid; |
191 | } | 189 | } |
192 | offset = find_next_offset(map, offset); | 190 | offset = find_next_offset(map, offset); |
191 | if (offset >= BITS_PER_PAGE) | ||
192 | break; | ||
193 | pid = mk_pid(pid_ns, map, offset); | 193 | pid = mk_pid(pid_ns, map, offset); |
194 | } while (offset < BITS_PER_PAGE && pid < pid_max); | 194 | if (pid >= pid_max) |
195 | break; | ||
196 | } | ||
195 | } | 197 | } |
196 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 198 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
197 | ++map; | 199 | ++map; |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c6023..6917e8edb48e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -15,12 +15,10 @@ | |||
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/acct.h> | 16 | #include <linux/acct.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_ns.h> |
19 | #include <linux/reboot.h> | 19 | #include <linux/reboot.h> |
20 | #include <linux/export.h> | 20 | #include <linux/export.h> |
21 | 21 | ||
22 | #define BITS_PER_PAGE (PAGE_SIZE*8) | ||
23 | |||
24 | struct pid_cache { | 22 | struct pid_cache { |
25 | int nr_ids; | 23 | int nr_ids; |
26 | char name[16]; | 24 | char name[16]; |
@@ -181,6 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
181 | int nr; | 179 | int nr; |
182 | int rc; | 180 | int rc; |
183 | struct task_struct *task, *me = current; | 181 | struct task_struct *task, *me = current; |
182 | int init_pids = thread_group_leader(me) ? 1 : 2; | ||
184 | 183 | ||
185 | /* Don't allow any more processes into the pid namespace */ | 184 | /* Don't allow any more processes into the pid namespace */ |
186 | disable_pid_allocation(pid_ns); | 185 | disable_pid_allocation(pid_ns); |
@@ -230,7 +229,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
230 | */ | 229 | */ |
231 | for (;;) { | 230 | for (;;) { |
232 | set_current_state(TASK_UNINTERRUPTIBLE); | 231 | set_current_state(TASK_UNINTERRUPTIBLE); |
233 | if (pid_ns->nr_hashed == 1) | 232 | if (pid_ns->nr_hashed == init_pids) |
234 | break; | 233 | break; |
235 | schedule(); | 234 | schedule(); |
236 | } | 235 | } |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8fd709c9bb58..42670e9b44e0 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -10,6 +10,8 @@ | |||
10 | #include <linux/kernel_stat.h> | 10 | #include <linux/kernel_stat.h> |
11 | #include <trace/events/timer.h> | 11 | #include <trace/events/timer.h> |
12 | #include <linux/random.h> | 12 | #include <linux/random.h> |
13 | #include <linux/tick.h> | ||
14 | #include <linux/workqueue.h> | ||
13 | 15 | ||
14 | /* | 16 | /* |
15 | * Called after updating RLIMIT_CPU to run cpu timer and update | 17 | * Called after updating RLIMIT_CPU to run cpu timer and update |
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer, | |||
153 | } | 155 | } |
154 | } | 156 | } |
155 | 157 | ||
158 | /** | ||
159 | * task_cputime_zero - Check a task_cputime struct for all zero fields. | ||
160 | * | ||
161 | * @cputime: The struct to compare. | ||
162 | * | ||
163 | * Checks @cputime to see if all fields are zero. Returns true if all fields | ||
164 | * are zero, false if any field is nonzero. | ||
165 | */ | ||
166 | static inline int task_cputime_zero(const struct task_cputime *cputime) | ||
167 | { | ||
168 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) | ||
169 | return 1; | ||
170 | return 0; | ||
171 | } | ||
172 | |||
156 | static inline cputime_t prof_ticks(struct task_struct *p) | 173 | static inline cputime_t prof_ticks(struct task_struct *p) |
157 | { | 174 | { |
158 | cputime_t utime, stime; | 175 | cputime_t utime, stime; |
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
636 | return 0; | 653 | return 0; |
637 | } | 654 | } |
638 | 655 | ||
656 | #ifdef CONFIG_NO_HZ_FULL | ||
657 | static void nohz_kick_work_fn(struct work_struct *work) | ||
658 | { | ||
659 | tick_nohz_full_kick_all(); | ||
660 | } | ||
661 | |||
662 | static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); | ||
663 | |||
664 | /* | ||
665 | * We need the IPIs to be sent from sane process context. | ||
666 | * The posix cpu timers are always set with irqs disabled. | ||
667 | */ | ||
668 | static void posix_cpu_timer_kick_nohz(void) | ||
669 | { | ||
670 | schedule_work(&nohz_kick_work); | ||
671 | } | ||
672 | |||
673 | bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) | ||
674 | { | ||
675 | if (!task_cputime_zero(&tsk->cputime_expires)) | ||
676 | return false; | ||
677 | |||
678 | if (tsk->signal->cputimer.running) | ||
679 | return false; | ||
680 | |||
681 | return true; | ||
682 | } | ||
683 | #else | ||
684 | static inline void posix_cpu_timer_kick_nohz(void) { } | ||
685 | #endif | ||
686 | |||
639 | /* | 687 | /* |
640 | * Guts of sys_timer_settime for CPU timers. | 688 | * Guts of sys_timer_settime for CPU timers. |
641 | * This is called with the timer locked and interrupts disabled. | 689 | * This is called with the timer locked and interrupts disabled. |
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
794 | sample_to_timespec(timer->it_clock, | 842 | sample_to_timespec(timer->it_clock, |
795 | old_incr, &old->it_interval); | 843 | old_incr, &old->it_interval); |
796 | } | 844 | } |
845 | if (!ret) | ||
846 | posix_cpu_timer_kick_nohz(); | ||
797 | return ret; | 847 | return ret; |
798 | } | 848 | } |
799 | 849 | ||
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1008 | } | 1058 | } |
1009 | } | 1059 | } |
1010 | 1060 | ||
1011 | /** | ||
1012 | * task_cputime_zero - Check a task_cputime struct for all zero fields. | ||
1013 | * | ||
1014 | * @cputime: The struct to compare. | ||
1015 | * | ||
1016 | * Checks @cputime to see if all fields are zero. Returns true if all fields | ||
1017 | * are zero, false if any field is nonzero. | ||
1018 | */ | ||
1019 | static inline int task_cputime_zero(const struct task_cputime *cputime) | ||
1020 | { | ||
1021 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) | ||
1022 | return 1; | ||
1023 | return 0; | ||
1024 | } | ||
1025 | |||
1026 | /* | 1061 | /* |
1027 | * Check for any per-thread CPU timers that have fired and move them | 1062 | * Check for any per-thread CPU timers that have fired and move them |
1028 | * off the tsk->*_timers list onto the firing list. Per-thread timers | 1063 | * off the tsk->*_timers list onto the firing list. Per-thread timers |
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1336 | cpu_timer_fire(timer); | 1371 | cpu_timer_fire(timer); |
1337 | spin_unlock(&timer->it_lock); | 1372 | spin_unlock(&timer->it_lock); |
1338 | } | 1373 | } |
1374 | |||
1375 | /* | ||
1376 | * In case some timers were rescheduled after the queue got emptied, | ||
1377 | * wake up full dynticks CPUs. | ||
1378 | */ | ||
1379 | if (tsk->signal->cputimer.running) | ||
1380 | posix_cpu_timer_kick_nohz(); | ||
1339 | } | 1381 | } |
1340 | 1382 | ||
1341 | /* | 1383 | /* |
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1366 | } | 1408 | } |
1367 | 1409 | ||
1368 | if (!*newval) | 1410 | if (!*newval) |
1369 | return; | 1411 | goto out; |
1370 | *newval += now.cpu; | 1412 | *newval += now.cpu; |
1371 | } | 1413 | } |
1372 | 1414 | ||
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1384 | tsk->signal->cputime_expires.virt_exp = *newval; | 1426 | tsk->signal->cputime_expires.virt_exp = *newval; |
1385 | break; | 1427 | break; |
1386 | } | 1428 | } |
1429 | out: | ||
1430 | posix_cpu_timer_kick_nohz(); | ||
1387 | } | 1431 | } |
1388 | 1432 | ||
1389 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | 1433 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6edbb2c55c22..424c2d4265c9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -40,38 +40,31 @@ | |||
40 | #include <linux/list.h> | 40 | #include <linux/list.h> |
41 | #include <linux/init.h> | 41 | #include <linux/init.h> |
42 | #include <linux/compiler.h> | 42 | #include <linux/compiler.h> |
43 | #include <linux/idr.h> | 43 | #include <linux/hash.h> |
44 | #include <linux/posix-clock.h> | 44 | #include <linux/posix-clock.h> |
45 | #include <linux/posix-timers.h> | 45 | #include <linux/posix-timers.h> |
46 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
47 | #include <linux/wait.h> | 47 | #include <linux/wait.h> |
48 | #include <linux/workqueue.h> | 48 | #include <linux/workqueue.h> |
49 | #include <linux/export.h> | 49 | #include <linux/export.h> |
50 | #include <linux/hashtable.h> | ||
50 | 51 | ||
51 | /* | 52 | /* |
52 | * Management arrays for POSIX timers. Timers are kept in slab memory | 53 | * Management arrays for POSIX timers. Timers are now kept in static hash table |
53 | * Timer ids are allocated by an external routine that keeps track of the | 54 | * with 512 entries. |
54 | * id and the timer. The external interface is: | 55 | * Timer ids are allocated by local routine, which selects proper hash head by |
55 | * | 56 | * key, constructed from current->signal address and per signal struct counter. |
56 | * void *idr_find(struct idr *idp, int id); to find timer_id <id> | 57 | * This keeps timer ids unique per process, but now they can intersect between |
57 | * int idr_get_new(struct idr *idp, void *ptr); to get a new id and | 58 | * processes. |
58 | * related it to <ptr> | ||
59 | * void idr_remove(struct idr *idp, int id); to release <id> | ||
60 | * void idr_init(struct idr *idp); to initialize <idp> | ||
61 | * which we supply. | ||
62 | * The idr_get_new *may* call slab for more memory so it must not be | ||
63 | * called under a spin lock. Likewise idr_remore may release memory | ||
64 | * (but it may be ok to do this under a lock...). | ||
65 | * idr_find is just a memory look up and is quite fast. A -1 return | ||
66 | * indicates that the requested id does not exist. | ||
67 | */ | 59 | */ |
68 | 60 | ||
69 | /* | 61 | /* |
70 | * Lets keep our timers in a slab cache :-) | 62 | * Lets keep our timers in a slab cache :-) |
71 | */ | 63 | */ |
72 | static struct kmem_cache *posix_timers_cache; | 64 | static struct kmem_cache *posix_timers_cache; |
73 | static struct idr posix_timers_id; | 65 | |
74 | static DEFINE_SPINLOCK(idr_lock); | 66 | static DEFINE_HASHTABLE(posix_timers_hashtable, 9); |
67 | static DEFINE_SPINLOCK(hash_lock); | ||
75 | 68 | ||
76 | /* | 69 | /* |
77 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other | 70 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other |
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); | |||
152 | __timr; \ | 145 | __timr; \ |
153 | }) | 146 | }) |
154 | 147 | ||
148 | static int hash(struct signal_struct *sig, unsigned int nr) | ||
149 | { | ||
150 | return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); | ||
151 | } | ||
152 | |||
153 | static struct k_itimer *__posix_timers_find(struct hlist_head *head, | ||
154 | struct signal_struct *sig, | ||
155 | timer_t id) | ||
156 | { | ||
157 | struct k_itimer *timer; | ||
158 | |||
159 | hlist_for_each_entry_rcu(timer, head, t_hash) { | ||
160 | if ((timer->it_signal == sig) && (timer->it_id == id)) | ||
161 | return timer; | ||
162 | } | ||
163 | return NULL; | ||
164 | } | ||
165 | |||
166 | static struct k_itimer *posix_timer_by_id(timer_t id) | ||
167 | { | ||
168 | struct signal_struct *sig = current->signal; | ||
169 | struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; | ||
170 | |||
171 | return __posix_timers_find(head, sig, id); | ||
172 | } | ||
173 | |||
174 | static int posix_timer_add(struct k_itimer *timer) | ||
175 | { | ||
176 | struct signal_struct *sig = current->signal; | ||
177 | int first_free_id = sig->posix_timer_id; | ||
178 | struct hlist_head *head; | ||
179 | int ret = -ENOENT; | ||
180 | |||
181 | do { | ||
182 | spin_lock(&hash_lock); | ||
183 | head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; | ||
184 | if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { | ||
185 | hlist_add_head_rcu(&timer->t_hash, head); | ||
186 | ret = sig->posix_timer_id; | ||
187 | } | ||
188 | if (++sig->posix_timer_id < 0) | ||
189 | sig->posix_timer_id = 0; | ||
190 | if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) | ||
191 | /* Loop over all possible ids completed */ | ||
192 | ret = -EAGAIN; | ||
193 | spin_unlock(&hash_lock); | ||
194 | } while (ret == -ENOENT); | ||
195 | return ret; | ||
196 | } | ||
197 | |||
155 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | 198 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) |
156 | { | 199 | { |
157 | spin_unlock_irqrestore(&timr->it_lock, flags); | 200 | spin_unlock_irqrestore(&timr->it_lock, flags); |
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) | |||
221 | return 0; | 264 | return 0; |
222 | } | 265 | } |
223 | 266 | ||
267 | static int posix_get_tai(clockid_t which_clock, struct timespec *tp) | ||
268 | { | ||
269 | timekeeping_clocktai(tp); | ||
270 | return 0; | ||
271 | } | ||
224 | 272 | ||
225 | /* | 273 | /* |
226 | * Initialize everything, well, just everything in Posix clocks/timers ;) | 274 | * Initialize everything, well, just everything in Posix clocks/timers ;) |
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void) | |||
261 | .clock_getres = posix_get_coarse_res, | 309 | .clock_getres = posix_get_coarse_res, |
262 | .clock_get = posix_get_monotonic_coarse, | 310 | .clock_get = posix_get_monotonic_coarse, |
263 | }; | 311 | }; |
312 | struct k_clock clock_tai = { | ||
313 | .clock_getres = hrtimer_get_res, | ||
314 | .clock_get = posix_get_tai, | ||
315 | .nsleep = common_nsleep, | ||
316 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
317 | .timer_create = common_timer_create, | ||
318 | .timer_set = common_timer_set, | ||
319 | .timer_get = common_timer_get, | ||
320 | .timer_del = common_timer_del, | ||
321 | }; | ||
264 | struct k_clock clock_boottime = { | 322 | struct k_clock clock_boottime = { |
265 | .clock_getres = hrtimer_get_res, | 323 | .clock_getres = hrtimer_get_res, |
266 | .clock_get = posix_get_boottime, | 324 | .clock_get = posix_get_boottime, |
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void) | |||
278 | posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); | 336 | posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); |
279 | posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); | 337 | posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); |
280 | posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); | 338 | posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); |
339 | posix_timers_register_clock(CLOCK_TAI, &clock_tai); | ||
281 | 340 | ||
282 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | 341 | posix_timers_cache = kmem_cache_create("posix_timers_cache", |
283 | sizeof (struct k_itimer), 0, SLAB_PANIC, | 342 | sizeof (struct k_itimer), 0, SLAB_PANIC, |
284 | NULL); | 343 | NULL); |
285 | idr_init(&posix_timers_id); | ||
286 | return 0; | 344 | return 0; |
287 | } | 345 | } |
288 | 346 | ||
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
504 | { | 562 | { |
505 | if (it_id_set) { | 563 | if (it_id_set) { |
506 | unsigned long flags; | 564 | unsigned long flags; |
507 | spin_lock_irqsave(&idr_lock, flags); | 565 | spin_lock_irqsave(&hash_lock, flags); |
508 | idr_remove(&posix_timers_id, tmr->it_id); | 566 | hlist_del_rcu(&tmr->t_hash); |
509 | spin_unlock_irqrestore(&idr_lock, flags); | 567 | spin_unlock_irqrestore(&hash_lock, flags); |
510 | } | 568 | } |
511 | put_pid(tmr->it_pid); | 569 | put_pid(tmr->it_pid); |
512 | sigqueue_free(tmr->sigq); | 570 | sigqueue_free(tmr->sigq); |
@@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
552 | return -EAGAIN; | 610 | return -EAGAIN; |
553 | 611 | ||
554 | spin_lock_init(&new_timer->it_lock); | 612 | spin_lock_init(&new_timer->it_lock); |
555 | 613 | new_timer_id = posix_timer_add(new_timer); | |
556 | idr_preload(GFP_KERNEL); | 614 | if (new_timer_id < 0) { |
557 | spin_lock_irq(&idr_lock); | 615 | error = new_timer_id; |
558 | error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); | ||
559 | spin_unlock_irq(&idr_lock); | ||
560 | idr_preload_end(); | ||
561 | if (error < 0) { | ||
562 | /* | ||
563 | * Weird looking, but we return EAGAIN if the IDR is | ||
564 | * full (proper POSIX return value for this) | ||
565 | */ | ||
566 | if (error == -ENOSPC) | ||
567 | error = -EAGAIN; | ||
568 | goto out; | 616 | goto out; |
569 | } | 617 | } |
570 | new_timer_id = error; | ||
571 | 618 | ||
572 | it_id_set = IT_ID_SET; | 619 | it_id_set = IT_ID_SET; |
573 | new_timer->it_id = (timer_t) new_timer_id; | 620 | new_timer->it_id = (timer_t) new_timer_id; |
@@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | |||
645 | return NULL; | 692 | return NULL; |
646 | 693 | ||
647 | rcu_read_lock(); | 694 | rcu_read_lock(); |
648 | timr = idr_find(&posix_timers_id, (int)timer_id); | 695 | timr = posix_timer_by_id(timer_id); |
649 | if (timr) { | 696 | if (timr) { |
650 | spin_lock_irqsave(&timr->it_lock, *flags); | 697 | spin_lock_irqsave(&timr->it_lock, *flags); |
651 | if (timr->it_signal == current->signal) { | 698 | if (timr->it_signal == current->signal) { |
diff --git a/kernel/power/console.c b/kernel/power/console.c index b1dc456474b5..463aa6736751 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
@@ -4,6 +4,7 @@ | |||
4 | * Originally from swsusp. | 4 | * Originally from swsusp. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/console.h> | ||
7 | #include <linux/vt_kern.h> | 8 | #include <linux/vt_kern.h> |
8 | #include <linux/kbd_kern.h> | 9 | #include <linux/kbd_kern.h> |
9 | #include <linux/vt.h> | 10 | #include <linux/vt.h> |
@@ -14,8 +15,120 @@ | |||
14 | 15 | ||
15 | static int orig_fgconsole, orig_kmsg; | 16 | static int orig_fgconsole, orig_kmsg; |
16 | 17 | ||
18 | static DEFINE_MUTEX(vt_switch_mutex); | ||
19 | |||
20 | struct pm_vt_switch { | ||
21 | struct list_head head; | ||
22 | struct device *dev; | ||
23 | bool required; | ||
24 | }; | ||
25 | |||
26 | static LIST_HEAD(pm_vt_switch_list); | ||
27 | |||
28 | |||
29 | /** | ||
30 | * pm_vt_switch_required - indicate VT switch at suspend requirements | ||
31 | * @dev: device | ||
32 | * @required: if true, caller needs VT switch at suspend/resume time | ||
33 | * | ||
34 | * The different console drivers may or may not require VT switches across | ||
35 | * suspend/resume, depending on how they handle restoring video state and | ||
36 | * what may be running. | ||
37 | * | ||
38 | * Drivers can indicate support for switchless suspend/resume, which can | ||
39 | * save time and flicker, by using this routine and passing 'false' as | ||
40 | * the argument. If any loaded driver needs VT switching, or the | ||
41 | * no_console_suspend argument has been passed on the command line, VT | ||
42 | * switches will occur. | ||
43 | */ | ||
44 | void pm_vt_switch_required(struct device *dev, bool required) | ||
45 | { | ||
46 | struct pm_vt_switch *entry, *tmp; | ||
47 | |||
48 | mutex_lock(&vt_switch_mutex); | ||
49 | list_for_each_entry(tmp, &pm_vt_switch_list, head) { | ||
50 | if (tmp->dev == dev) { | ||
51 | /* already registered, update requirement */ | ||
52 | tmp->required = required; | ||
53 | goto out; | ||
54 | } | ||
55 | } | ||
56 | |||
57 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
58 | if (!entry) | ||
59 | goto out; | ||
60 | |||
61 | entry->required = required; | ||
62 | entry->dev = dev; | ||
63 | |||
64 | list_add(&entry->head, &pm_vt_switch_list); | ||
65 | out: | ||
66 | mutex_unlock(&vt_switch_mutex); | ||
67 | } | ||
68 | EXPORT_SYMBOL(pm_vt_switch_required); | ||
69 | |||
70 | /** | ||
71 | * pm_vt_switch_unregister - stop tracking a device's VT switching needs | ||
72 | * @dev: device | ||
73 | * | ||
74 | * Remove @dev from the vt switch list. | ||
75 | */ | ||
76 | void pm_vt_switch_unregister(struct device *dev) | ||
77 | { | ||
78 | struct pm_vt_switch *tmp; | ||
79 | |||
80 | mutex_lock(&vt_switch_mutex); | ||
81 | list_for_each_entry(tmp, &pm_vt_switch_list, head) { | ||
82 | if (tmp->dev == dev) { | ||
83 | list_del(&tmp->head); | ||
84 | break; | ||
85 | } | ||
86 | } | ||
87 | mutex_unlock(&vt_switch_mutex); | ||
88 | } | ||
89 | EXPORT_SYMBOL(pm_vt_switch_unregister); | ||
90 | |||
91 | /* | ||
92 | * There are three cases when a VT switch on suspend/resume are required: | ||
93 | * 1) no driver has indicated a requirement one way or another, so preserve | ||
94 | * the old behavior | ||
95 | * 2) console suspend is disabled, we want to see debug messages across | ||
96 | * suspend/resume | ||
97 | * 3) any registered driver indicates it needs a VT switch | ||
98 | * | ||
99 | * If none of these conditions is present, meaning we have at least one driver | ||
100 | * that doesn't need the switch, and none that do, we can avoid it to make | ||
101 | * resume look a little prettier (and suspend too, but that's usually hidden, | ||
102 | * e.g. when closing the lid on a laptop). | ||
103 | */ | ||
104 | static bool pm_vt_switch(void) | ||
105 | { | ||
106 | struct pm_vt_switch *entry; | ||
107 | bool ret = true; | ||
108 | |||
109 | mutex_lock(&vt_switch_mutex); | ||
110 | if (list_empty(&pm_vt_switch_list)) | ||
111 | goto out; | ||
112 | |||
113 | if (!console_suspend_enabled) | ||
114 | goto out; | ||
115 | |||
116 | list_for_each_entry(entry, &pm_vt_switch_list, head) { | ||
117 | if (entry->required) | ||
118 | goto out; | ||
119 | } | ||
120 | |||
121 | ret = false; | ||
122 | out: | ||
123 | mutex_unlock(&vt_switch_mutex); | ||
124 | return ret; | ||
125 | } | ||
126 | |||
17 | int pm_prepare_console(void) | 127 | int pm_prepare_console(void) |
18 | { | 128 | { |
129 | if (!pm_vt_switch()) | ||
130 | return 0; | ||
131 | |||
19 | orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); | 132 | orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); |
20 | if (orig_fgconsole < 0) | 133 | if (orig_fgconsole < 0) |
21 | return 1; | 134 | return 1; |
@@ -26,6 +139,9 @@ int pm_prepare_console(void) | |||
26 | 139 | ||
27 | void pm_restore_console(void) | 140 | void pm_restore_console(void) |
28 | { | 141 | { |
142 | if (!pm_vt_switch()) | ||
143 | return; | ||
144 | |||
29 | if (orig_fgconsole >= 0) { | 145 | if (orig_fgconsole >= 0) { |
30 | vt_move_to_console(orig_fgconsole, 0); | 146 | vt_move_to_console(orig_fgconsole, 0); |
31 | vt_kmsg_redirect(orig_kmsg); | 147 | vt_kmsg_redirect(orig_kmsg); |
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 68197a4e8fc9..7ef6866b521d 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
@@ -32,7 +32,7 @@ static void handle_poweroff(int key) | |||
32 | 32 | ||
33 | static struct sysrq_key_op sysrq_poweroff_op = { | 33 | static struct sysrq_key_op sysrq_poweroff_op = { |
34 | .handler = handle_poweroff, | 34 | .handler = handle_poweroff, |
35 | .help_msg = "powerOff", | 35 | .help_msg = "poweroff(o)", |
36 | .action_msg = "Power Off", | 36 | .action_msg = "Power Off", |
37 | .enable_mask = SYSRQ_ENABLE_BOOT, | 37 | .enable_mask = SYSRQ_ENABLE_BOOT, |
38 | }; | 38 | }; |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d4feda084a3a..bef86d121eb2 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); | |||
76 | 76 | ||
77 | bool valid_state(suspend_state_t state) | 77 | bool valid_state(suspend_state_t state) |
78 | { | 78 | { |
79 | if (state == PM_SUSPEND_FREEZE) | 79 | if (state == PM_SUSPEND_FREEZE) { |
80 | return true; | 80 | #ifdef CONFIG_PM_DEBUG |
81 | if (pm_test_level != TEST_NONE && | ||
82 | pm_test_level != TEST_FREEZER && | ||
83 | pm_test_level != TEST_DEVICES && | ||
84 | pm_test_level != TEST_PLATFORM) { | ||
85 | printk(KERN_WARNING "Unsupported pm_test mode for " | ||
86 | "freeze state, please choose " | ||
87 | "none/freezer/devices/platform.\n"); | ||
88 | return false; | ||
89 | } | ||
90 | #endif | ||
91 | return true; | ||
92 | } | ||
81 | /* | 93 | /* |
82 | * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel | 94 | * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel |
83 | * support and need to be valid to the lowlevel | 95 | * support and need to be valid to the lowlevel |
@@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
184 | goto Platform_wake; | 196 | goto Platform_wake; |
185 | } | 197 | } |
186 | 198 | ||
199 | if (suspend_test(TEST_PLATFORM)) | ||
200 | goto Platform_wake; | ||
201 | |||
187 | /* | 202 | /* |
188 | * PM_SUSPEND_FREEZE equals | 203 | * PM_SUSPEND_FREEZE equals |
189 | * frozen processes + suspended devices + idle processors. | 204 | * frozen processes + suspended devices + idle processors. |
@@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
195 | goto Platform_wake; | 210 | goto Platform_wake; |
196 | } | 211 | } |
197 | 212 | ||
198 | if (suspend_test(TEST_PLATFORM)) | ||
199 | goto Platform_wake; | ||
200 | |||
201 | error = disable_nonboot_cpus(); | 213 | error = disable_nonboot_cpus(); |
202 | if (error || suspend_test(TEST_CPUS)) | 214 | if (error || suspend_test(TEST_CPUS)) |
203 | goto Enable_cpus; | 215 | goto Enable_cpus; |
diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335a..fa36e1494420 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/memblock.h> | 34 | #include <linux/memblock.h> |
35 | #include <linux/aio.h> | ||
35 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
36 | #include <linux/kexec.h> | 37 | #include <linux/kexec.h> |
37 | #include <linux/kdb.h> | 38 | #include <linux/kdb.h> |
@@ -43,19 +44,13 @@ | |||
43 | #include <linux/rculist.h> | 44 | #include <linux/rculist.h> |
44 | #include <linux/poll.h> | 45 | #include <linux/poll.h> |
45 | #include <linux/irq_work.h> | 46 | #include <linux/irq_work.h> |
47 | #include <linux/utsname.h> | ||
46 | 48 | ||
47 | #include <asm/uaccess.h> | 49 | #include <asm/uaccess.h> |
48 | 50 | ||
49 | #define CREATE_TRACE_POINTS | 51 | #define CREATE_TRACE_POINTS |
50 | #include <trace/events/printk.h> | 52 | #include <trace/events/printk.h> |
51 | 53 | ||
52 | /* | ||
53 | * Architectures can override it: | ||
54 | */ | ||
55 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | ||
56 | { | ||
57 | } | ||
58 | |||
59 | /* printk's without a loglevel use this.. */ | 54 | /* printk's without a loglevel use this.. */ |
60 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 55 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
61 | 56 | ||
@@ -63,8 +58,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
63 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ | 58 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ |
64 | #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ | 59 | #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ |
65 | 60 | ||
66 | DECLARE_WAIT_QUEUE_HEAD(log_wait); | ||
67 | |||
68 | int console_printk[4] = { | 61 | int console_printk[4] = { |
69 | DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ | 62 | DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ |
70 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ | 63 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ |
@@ -224,6 +217,7 @@ struct log { | |||
224 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | 217 | static DEFINE_RAW_SPINLOCK(logbuf_lock); |
225 | 218 | ||
226 | #ifdef CONFIG_PRINTK | 219 | #ifdef CONFIG_PRINTK |
220 | DECLARE_WAIT_QUEUE_HEAD(log_wait); | ||
227 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | 221 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ |
228 | static u64 syslog_seq; | 222 | static u64 syslog_seq; |
229 | static u32 syslog_idx; | 223 | static u32 syslog_idx; |
@@ -609,7 +603,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | |||
609 | /* return error when data has vanished underneath us */ | 603 | /* return error when data has vanished underneath us */ |
610 | if (user->seq < log_first_seq) | 604 | if (user->seq < log_first_seq) |
611 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | 605 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; |
612 | ret = POLLIN|POLLRDNORM; | 606 | else |
607 | ret = POLLIN|POLLRDNORM; | ||
613 | } | 608 | } |
614 | raw_spin_unlock_irq(&logbuf_lock); | 609 | raw_spin_unlock_irq(&logbuf_lock); |
615 | 610 | ||
@@ -1266,7 +1261,7 @@ static void call_console_drivers(int level, const char *text, size_t len) | |||
1266 | { | 1261 | { |
1267 | struct console *con; | 1262 | struct console *con; |
1268 | 1263 | ||
1269 | trace_console(text, 0, len, len); | 1264 | trace_console(text, len); |
1270 | 1265 | ||
1271 | if (level >= console_loglevel && !ignore_loglevel) | 1266 | if (level >= console_loglevel && !ignore_loglevel) |
1272 | return; | 1267 | return; |
@@ -1724,6 +1719,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } | |||
1724 | 1719 | ||
1725 | #endif /* CONFIG_PRINTK */ | 1720 | #endif /* CONFIG_PRINTK */ |
1726 | 1721 | ||
1722 | #ifdef CONFIG_EARLY_PRINTK | ||
1723 | struct console *early_console; | ||
1724 | |||
1725 | void early_vprintk(const char *fmt, va_list ap) | ||
1726 | { | ||
1727 | if (early_console) { | ||
1728 | char buf[512]; | ||
1729 | int n = vscnprintf(buf, sizeof(buf), fmt, ap); | ||
1730 | |||
1731 | early_console->write(early_console, buf, n); | ||
1732 | } | ||
1733 | } | ||
1734 | |||
1735 | asmlinkage void early_printk(const char *fmt, ...) | ||
1736 | { | ||
1737 | va_list ap; | ||
1738 | |||
1739 | va_start(ap, fmt); | ||
1740 | early_vprintk(fmt, ap); | ||
1741 | va_end(ap); | ||
1742 | } | ||
1743 | #endif | ||
1744 | |||
1727 | static int __add_preferred_console(char *name, int idx, char *options, | 1745 | static int __add_preferred_console(char *name, int idx, char *options, |
1728 | char *brl_options) | 1746 | char *brl_options) |
1729 | { | 1747 | { |
@@ -1957,45 +1975,6 @@ int is_console_locked(void) | |||
1957 | return console_locked; | 1975 | return console_locked; |
1958 | } | 1976 | } |
1959 | 1977 | ||
1960 | /* | ||
1961 | * Delayed printk version, for scheduler-internal messages: | ||
1962 | */ | ||
1963 | #define PRINTK_BUF_SIZE 512 | ||
1964 | |||
1965 | #define PRINTK_PENDING_WAKEUP 0x01 | ||
1966 | #define PRINTK_PENDING_SCHED 0x02 | ||
1967 | |||
1968 | static DEFINE_PER_CPU(int, printk_pending); | ||
1969 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | ||
1970 | |||
1971 | static void wake_up_klogd_work_func(struct irq_work *irq_work) | ||
1972 | { | ||
1973 | int pending = __this_cpu_xchg(printk_pending, 0); | ||
1974 | |||
1975 | if (pending & PRINTK_PENDING_SCHED) { | ||
1976 | char *buf = __get_cpu_var(printk_sched_buf); | ||
1977 | printk(KERN_WARNING "[sched_delayed] %s", buf); | ||
1978 | } | ||
1979 | |||
1980 | if (pending & PRINTK_PENDING_WAKEUP) | ||
1981 | wake_up_interruptible(&log_wait); | ||
1982 | } | ||
1983 | |||
1984 | static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { | ||
1985 | .func = wake_up_klogd_work_func, | ||
1986 | .flags = IRQ_WORK_LAZY, | ||
1987 | }; | ||
1988 | |||
1989 | void wake_up_klogd(void) | ||
1990 | { | ||
1991 | preempt_disable(); | ||
1992 | if (waitqueue_active(&log_wait)) { | ||
1993 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | ||
1994 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | ||
1995 | } | ||
1996 | preempt_enable(); | ||
1997 | } | ||
1998 | |||
1999 | static void console_cont_flush(char *text, size_t size) | 1978 | static void console_cont_flush(char *text, size_t size) |
2000 | { | 1979 | { |
2001 | unsigned long flags; | 1980 | unsigned long flags; |
@@ -2458,6 +2437,44 @@ static int __init printk_late_init(void) | |||
2458 | late_initcall(printk_late_init); | 2437 | late_initcall(printk_late_init); |
2459 | 2438 | ||
2460 | #if defined CONFIG_PRINTK | 2439 | #if defined CONFIG_PRINTK |
2440 | /* | ||
2441 | * Delayed printk version, for scheduler-internal messages: | ||
2442 | */ | ||
2443 | #define PRINTK_BUF_SIZE 512 | ||
2444 | |||
2445 | #define PRINTK_PENDING_WAKEUP 0x01 | ||
2446 | #define PRINTK_PENDING_SCHED 0x02 | ||
2447 | |||
2448 | static DEFINE_PER_CPU(int, printk_pending); | ||
2449 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | ||
2450 | |||
2451 | static void wake_up_klogd_work_func(struct irq_work *irq_work) | ||
2452 | { | ||
2453 | int pending = __this_cpu_xchg(printk_pending, 0); | ||
2454 | |||
2455 | if (pending & PRINTK_PENDING_SCHED) { | ||
2456 | char *buf = __get_cpu_var(printk_sched_buf); | ||
2457 | printk(KERN_WARNING "[sched_delayed] %s", buf); | ||
2458 | } | ||
2459 | |||
2460 | if (pending & PRINTK_PENDING_WAKEUP) | ||
2461 | wake_up_interruptible(&log_wait); | ||
2462 | } | ||
2463 | |||
2464 | static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { | ||
2465 | .func = wake_up_klogd_work_func, | ||
2466 | .flags = IRQ_WORK_LAZY, | ||
2467 | }; | ||
2468 | |||
2469 | void wake_up_klogd(void) | ||
2470 | { | ||
2471 | preempt_disable(); | ||
2472 | if (waitqueue_active(&log_wait)) { | ||
2473 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | ||
2474 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | ||
2475 | } | ||
2476 | preempt_enable(); | ||
2477 | } | ||
2461 | 2478 | ||
2462 | int printk_sched(const char *fmt, ...) | 2479 | int printk_sched(const char *fmt, ...) |
2463 | { | 2480 | { |
@@ -2834,4 +2851,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) | |||
2834 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2851 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
2835 | } | 2852 | } |
2836 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); | 2853 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); |
2854 | |||
2855 | static char dump_stack_arch_desc_str[128]; | ||
2856 | |||
2857 | /** | ||
2858 | * dump_stack_set_arch_desc - set arch-specific str to show with task dumps | ||
2859 | * @fmt: printf-style format string | ||
2860 | * @...: arguments for the format string | ||
2861 | * | ||
2862 | * The configured string will be printed right after utsname during task | ||
2863 | * dumps. Usually used to add arch-specific system identifiers. If an | ||
2864 | * arch wants to make use of such an ID string, it should initialize this | ||
2865 | * as soon as possible during boot. | ||
2866 | */ | ||
2867 | void __init dump_stack_set_arch_desc(const char *fmt, ...) | ||
2868 | { | ||
2869 | va_list args; | ||
2870 | |||
2871 | va_start(args, fmt); | ||
2872 | vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), | ||
2873 | fmt, args); | ||
2874 | va_end(args); | ||
2875 | } | ||
2876 | |||
2877 | /** | ||
2878 | * dump_stack_print_info - print generic debug info for dump_stack() | ||
2879 | * @log_lvl: log level | ||
2880 | * | ||
2881 | * Arch-specific dump_stack() implementations can use this function to | ||
2882 | * print out the same debug information as the generic dump_stack(). | ||
2883 | */ | ||
2884 | void dump_stack_print_info(const char *log_lvl) | ||
2885 | { | ||
2886 | printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", | ||
2887 | log_lvl, raw_smp_processor_id(), current->pid, current->comm, | ||
2888 | print_tainted(), init_utsname()->release, | ||
2889 | (int)strcspn(init_utsname()->version, " "), | ||
2890 | init_utsname()->version); | ||
2891 | |||
2892 | if (dump_stack_arch_desc_str[0] != '\0') | ||
2893 | printk("%sHardware name: %s\n", | ||
2894 | log_lvl, dump_stack_arch_desc_str); | ||
2895 | |||
2896 | print_worker_info(log_lvl, current); | ||
2897 | } | ||
2898 | |||
2899 | /** | ||
2900 | * show_regs_print_info - print generic debug info for show_regs() | ||
2901 | * @log_lvl: log level | ||
2902 | * | ||
2903 | * show_regs() implementations can use this function to print out generic | ||
2904 | * debug information. | ||
2905 | */ | ||
2906 | void show_regs_print_info(const char *log_lvl) | ||
2907 | { | ||
2908 | dump_stack_print_info(log_lvl); | ||
2909 | |||
2910 | printk("%stask: %p ti: %p task.ti: %p\n", | ||
2911 | log_lvl, current, current_thread_info(), | ||
2912 | task_thread_info(current)); | ||
2913 | } | ||
2914 | |||
2837 | #endif | 2915 | #endif |
diff --git a/kernel/profile.c b/kernel/profile.c index dc3384ee874e..0bf400737660 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -462,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = { | |||
462 | .write = prof_cpu_mask_proc_write, | 462 | .write = prof_cpu_mask_proc_write, |
463 | }; | 463 | }; |
464 | 464 | ||
465 | void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | 465 | void create_prof_cpu_mask(void) |
466 | { | 466 | { |
467 | /* create /proc/irq/prof_cpu_mask */ | 467 | /* create /proc/irq/prof_cpu_mask */ |
468 | proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); | 468 | proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops); |
469 | } | 469 | } |
470 | 470 | ||
471 | /* | 471 | /* |
@@ -600,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ | |||
600 | NULL, &proc_profile_operations); | 600 | NULL, &proc_profile_operations); |
601 | if (!entry) | 601 | if (!entry) |
602 | return 0; | 602 | return 0; |
603 | entry->size = (1+prof_len) * sizeof(atomic_t); | 603 | proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); |
604 | hotcpu_notifier(profile_cpu_callback, 0); | 604 | hotcpu_notifier(profile_cpu_callback, 0); |
605 | return 0; | 605 | return 0; |
606 | } | 606 | } |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index acbd28424d81..aed981a3f69c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/ptrace.h> | 17 | #include <linux/ptrace.h> |
18 | #include <linux/security.h> | 18 | #include <linux/security.h> |
19 | #include <linux/signal.h> | 19 | #include <linux/signal.h> |
20 | #include <linux/uio.h> | ||
20 | #include <linux/audit.h> | 21 | #include <linux/audit.h> |
21 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
22 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
@@ -24,6 +25,7 @@ | |||
24 | #include <linux/regset.h> | 25 | #include <linux/regset.h> |
25 | #include <linux/hw_breakpoint.h> | 26 | #include <linux/hw_breakpoint.h> |
26 | #include <linux/cn_proc.h> | 27 | #include <linux/cn_proc.h> |
28 | #include <linux/compat.h> | ||
27 | 29 | ||
28 | 30 | ||
29 | static int ptrace_trapping_sleep_fn(void *flags) | 31 | static int ptrace_trapping_sleep_fn(void *flags) |
@@ -618,6 +620,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) | |||
618 | return error; | 620 | return error; |
619 | } | 621 | } |
620 | 622 | ||
623 | static int ptrace_peek_siginfo(struct task_struct *child, | ||
624 | unsigned long addr, | ||
625 | unsigned long data) | ||
626 | { | ||
627 | struct ptrace_peeksiginfo_args arg; | ||
628 | struct sigpending *pending; | ||
629 | struct sigqueue *q; | ||
630 | int ret, i; | ||
631 | |||
632 | ret = copy_from_user(&arg, (void __user *) addr, | ||
633 | sizeof(struct ptrace_peeksiginfo_args)); | ||
634 | if (ret) | ||
635 | return -EFAULT; | ||
636 | |||
637 | if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED) | ||
638 | return -EINVAL; /* unknown flags */ | ||
639 | |||
640 | if (arg.nr < 0) | ||
641 | return -EINVAL; | ||
642 | |||
643 | if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) | ||
644 | pending = &child->signal->shared_pending; | ||
645 | else | ||
646 | pending = &child->pending; | ||
647 | |||
648 | for (i = 0; i < arg.nr; ) { | ||
649 | siginfo_t info; | ||
650 | s32 off = arg.off + i; | ||
651 | |||
652 | spin_lock_irq(&child->sighand->siglock); | ||
653 | list_for_each_entry(q, &pending->list, list) { | ||
654 | if (!off--) { | ||
655 | copy_siginfo(&info, &q->info); | ||
656 | break; | ||
657 | } | ||
658 | } | ||
659 | spin_unlock_irq(&child->sighand->siglock); | ||
660 | |||
661 | if (off >= 0) /* beyond the end of the list */ | ||
662 | break; | ||
663 | |||
664 | #ifdef CONFIG_COMPAT | ||
665 | if (unlikely(is_compat_task())) { | ||
666 | compat_siginfo_t __user *uinfo = compat_ptr(data); | ||
667 | |||
668 | ret = copy_siginfo_to_user32(uinfo, &info); | ||
669 | ret |= __put_user(info.si_code, &uinfo->si_code); | ||
670 | } else | ||
671 | #endif | ||
672 | { | ||
673 | siginfo_t __user *uinfo = (siginfo_t __user *) data; | ||
674 | |||
675 | ret = copy_siginfo_to_user(uinfo, &info); | ||
676 | ret |= __put_user(info.si_code, &uinfo->si_code); | ||
677 | } | ||
678 | |||
679 | if (ret) { | ||
680 | ret = -EFAULT; | ||
681 | break; | ||
682 | } | ||
683 | |||
684 | data += sizeof(siginfo_t); | ||
685 | i++; | ||
686 | |||
687 | if (signal_pending(current)) | ||
688 | break; | ||
689 | |||
690 | cond_resched(); | ||
691 | } | ||
692 | |||
693 | if (i > 0) | ||
694 | return i; | ||
695 | |||
696 | return ret; | ||
697 | } | ||
621 | 698 | ||
622 | #ifdef PTRACE_SINGLESTEP | 699 | #ifdef PTRACE_SINGLESTEP |
623 | #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) | 700 | #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) |
@@ -748,6 +825,10 @@ int ptrace_request(struct task_struct *child, long request, | |||
748 | ret = put_user(child->ptrace_message, datalp); | 825 | ret = put_user(child->ptrace_message, datalp); |
749 | break; | 826 | break; |
750 | 827 | ||
828 | case PTRACE_PEEKSIGINFO: | ||
829 | ret = ptrace_peek_siginfo(child, addr, data); | ||
830 | break; | ||
831 | |||
751 | case PTRACE_GETSIGINFO: | 832 | case PTRACE_GETSIGINFO: |
752 | ret = ptrace_getsiginfo(child, &siginfo); | 833 | ret = ptrace_getsiginfo(child, &siginfo); |
753 | if (!ret) | 834 | if (!ret) |
diff --git a/kernel/range.c b/kernel/range.c index 9b8ae2d6ed68..eb911dbce267 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
@@ -48,9 +48,11 @@ int add_range_with_merge(struct range *range, int az, int nr_range, | |||
48 | final_start = min(range[i].start, start); | 48 | final_start = min(range[i].start, start); |
49 | final_end = max(range[i].end, end); | 49 | final_end = max(range[i].end, end); |
50 | 50 | ||
51 | range[i].start = final_start; | 51 | /* clear it and add it back for further merge */ |
52 | range[i].end = final_end; | 52 | range[i].start = 0; |
53 | return nr_range; | 53 | range[i].end = 0; |
54 | return add_range_with_merge(range, az, nr_range, | ||
55 | final_start, final_end); | ||
54 | } | 56 | } |
55 | 57 | ||
56 | /* Need to add it: */ | 58 | /* Need to add it: */ |
@@ -97,7 +99,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) | |||
97 | range[i].end = range[j].end; | 99 | range[i].end = range[j].end; |
98 | range[i].start = end; | 100 | range[i].start = end; |
99 | } else { | 101 | } else { |
100 | printk(KERN_ERR "run of slot in ranges\n"); | 102 | pr_err("%s: run out of slot in ranges\n", |
103 | __func__); | ||
101 | } | 104 | } |
102 | range[j].end = start; | 105 | range[j].end = start; |
103 | continue; | 106 | continue; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b8ad827fd86..16ea67925015 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -64,7 +64,7 @@ | |||
64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
66 | 66 | ||
67 | #define RCU_STATE_INITIALIZER(sname, cr) { \ | 67 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ |
68 | .level = { &sname##_state.node[0] }, \ | 68 | .level = { &sname##_state.node[0] }, \ |
69 | .call = cr, \ | 69 | .call = cr, \ |
70 | .fqs_state = RCU_GP_IDLE, \ | 70 | .fqs_state = RCU_GP_IDLE, \ |
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
78 | .name = #sname, \ | 78 | .name = #sname, \ |
79 | .abbr = sabbr, \ | ||
79 | } | 80 | } |
80 | 81 | ||
81 | struct rcu_state rcu_sched_state = | 82 | struct rcu_state rcu_sched_state = |
82 | RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); | 83 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
83 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | 84 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); |
84 | 85 | ||
85 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); | 86 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
86 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 87 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
87 | 88 | ||
88 | static struct rcu_state *rcu_state; | 89 | static struct rcu_state *rcu_state; |
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; | |||
223 | module_param(jiffies_till_first_fqs, ulong, 0644); | 224 | module_param(jiffies_till_first_fqs, ulong, 0644); |
224 | module_param(jiffies_till_next_fqs, ulong, 0644); | 225 | module_param(jiffies_till_next_fqs, ulong, 0644); |
225 | 226 | ||
227 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | ||
228 | struct rcu_data *rdp); | ||
226 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | 229 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); |
227 | static void force_quiescent_state(struct rcu_state *rsp); | 230 | static void force_quiescent_state(struct rcu_state *rsp); |
228 | static int rcu_pending(int cpu); | 231 | static int rcu_pending(int cpu); |
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
310 | 313 | ||
311 | if (rcu_gp_in_progress(rsp)) | 314 | if (rcu_gp_in_progress(rsp)) |
312 | return 0; /* No, a grace period is already in progress. */ | 315 | return 0; /* No, a grace period is already in progress. */ |
316 | if (rcu_nocb_needs_gp(rsp)) | ||
317 | return 1; /* Yes, a no-CBs CPU needs one. */ | ||
313 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 318 | if (!rdp->nxttail[RCU_NEXT_TAIL]) |
314 | return 0; /* No, this is a no-CBs (or offline) CPU. */ | 319 | return 0; /* No, this is a no-CBs (or offline) CPU. */ |
315 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | 320 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) |
@@ -794,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
794 | rdp->offline_fqs++; | 799 | rdp->offline_fqs++; |
795 | return 1; | 800 | return 1; |
796 | } | 801 | } |
802 | |||
803 | /* | ||
804 | * There is a possibility that a CPU in adaptive-ticks state | ||
805 | * might run in the kernel with the scheduling-clock tick disabled | ||
806 | * for an extended time period. Invoke rcu_kick_nohz_cpu() to | ||
807 | * force the CPU to restart the scheduling-clock tick in this | ||
808 | * CPU is in this state. | ||
809 | */ | ||
810 | rcu_kick_nohz_cpu(rdp->cpu); | ||
811 | |||
797 | return 0; | 812 | return 0; |
798 | } | 813 | } |
799 | 814 | ||
@@ -1035,10 +1050,11 @@ static void init_callback_list(struct rcu_data *rdp) | |||
1035 | { | 1050 | { |
1036 | int i; | 1051 | int i; |
1037 | 1052 | ||
1053 | if (init_nocb_callback_list(rdp)) | ||
1054 | return; | ||
1038 | rdp->nxtlist = NULL; | 1055 | rdp->nxtlist = NULL; |
1039 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1056 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1040 | rdp->nxttail[i] = &rdp->nxtlist; | 1057 | rdp->nxttail[i] = &rdp->nxtlist; |
1041 | init_nocb_callback_list(rdp); | ||
1042 | } | 1058 | } |
1043 | 1059 | ||
1044 | /* | 1060 | /* |
@@ -1071,6 +1087,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | |||
1071 | } | 1087 | } |
1072 | 1088 | ||
1073 | /* | 1089 | /* |
1090 | * Trace-event helper function for rcu_start_future_gp() and | ||
1091 | * rcu_nocb_wait_gp(). | ||
1092 | */ | ||
1093 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | ||
1094 | unsigned long c, char *s) | ||
1095 | { | ||
1096 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, | ||
1097 | rnp->completed, c, rnp->level, | ||
1098 | rnp->grplo, rnp->grphi, s); | ||
1099 | } | ||
1100 | |||
1101 | /* | ||
1102 | * Start some future grace period, as needed to handle newly arrived | ||
1103 | * callbacks. The required future grace periods are recorded in each | ||
1104 | * rcu_node structure's ->need_future_gp field. | ||
1105 | * | ||
1106 | * The caller must hold the specified rcu_node structure's ->lock. | ||
1107 | */ | ||
1108 | static unsigned long __maybe_unused | ||
1109 | rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | ||
1110 | { | ||
1111 | unsigned long c; | ||
1112 | int i; | ||
1113 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | ||
1114 | |||
1115 | /* | ||
1116 | * Pick up grace-period number for new callbacks. If this | ||
1117 | * grace period is already marked as needed, return to the caller. | ||
1118 | */ | ||
1119 | c = rcu_cbs_completed(rdp->rsp, rnp); | ||
1120 | trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); | ||
1121 | if (rnp->need_future_gp[c & 0x1]) { | ||
1122 | trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); | ||
1123 | return c; | ||
1124 | } | ||
1125 | |||
1126 | /* | ||
1127 | * If either this rcu_node structure or the root rcu_node structure | ||
1128 | * believes that a grace period is in progress, then we must wait | ||
1129 | * for the one following, which is in "c". Because our request | ||
1130 | * will be noticed at the end of the current grace period, we don't | ||
1131 | * need to explicitly start one. | ||
1132 | */ | ||
1133 | if (rnp->gpnum != rnp->completed || | ||
1134 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | ||
1135 | rnp->need_future_gp[c & 0x1]++; | ||
1136 | trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); | ||
1137 | return c; | ||
1138 | } | ||
1139 | |||
1140 | /* | ||
1141 | * There might be no grace period in progress. If we don't already | ||
1142 | * hold it, acquire the root rcu_node structure's lock in order to | ||
1143 | * start one (if needed). | ||
1144 | */ | ||
1145 | if (rnp != rnp_root) | ||
1146 | raw_spin_lock(&rnp_root->lock); | ||
1147 | |||
1148 | /* | ||
1149 | * Get a new grace-period number. If there really is no grace | ||
1150 | * period in progress, it will be smaller than the one we obtained | ||
1151 | * earlier. Adjust callbacks as needed. Note that even no-CBs | ||
1152 | * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. | ||
1153 | */ | ||
1154 | c = rcu_cbs_completed(rdp->rsp, rnp_root); | ||
1155 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) | ||
1156 | if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) | ||
1157 | rdp->nxtcompleted[i] = c; | ||
1158 | |||
1159 | /* | ||
1160 | * If the need for the required grace period is already | ||
1161 | * recorded, trace and leave. | ||
1162 | */ | ||
1163 | if (rnp_root->need_future_gp[c & 0x1]) { | ||
1164 | trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); | ||
1165 | goto unlock_out; | ||
1166 | } | ||
1167 | |||
1168 | /* Record the need for the future grace period. */ | ||
1169 | rnp_root->need_future_gp[c & 0x1]++; | ||
1170 | |||
1171 | /* If a grace period is not already in progress, start one. */ | ||
1172 | if (rnp_root->gpnum != rnp_root->completed) { | ||
1173 | trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); | ||
1174 | } else { | ||
1175 | trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); | ||
1176 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | ||
1177 | } | ||
1178 | unlock_out: | ||
1179 | if (rnp != rnp_root) | ||
1180 | raw_spin_unlock(&rnp_root->lock); | ||
1181 | return c; | ||
1182 | } | ||
1183 | |||
1184 | /* | ||
1185 | * Clean up any old requests for the just-ended grace period. Also return | ||
1186 | * whether any additional grace periods have been requested. Also invoke | ||
1187 | * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads | ||
1188 | * waiting for this grace period to complete. | ||
1189 | */ | ||
1190 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
1191 | { | ||
1192 | int c = rnp->completed; | ||
1193 | int needmore; | ||
1194 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1195 | |||
1196 | rcu_nocb_gp_cleanup(rsp, rnp); | ||
1197 | rnp->need_future_gp[c & 0x1] = 0; | ||
1198 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | ||
1199 | trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); | ||
1200 | return needmore; | ||
1201 | } | ||
1202 | |||
1203 | /* | ||
1074 | * If there is room, assign a ->completed number to any callbacks on | 1204 | * If there is room, assign a ->completed number to any callbacks on |
1075 | * this CPU that have not already been assigned. Also accelerate any | 1205 | * this CPU that have not already been assigned. Also accelerate any |
1076 | * callbacks that were previously assigned a ->completed number that has | 1206 | * callbacks that were previously assigned a ->completed number that has |
@@ -1129,6 +1259,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1129 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; | 1259 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; |
1130 | rdp->nxtcompleted[i] = c; | 1260 | rdp->nxtcompleted[i] = c; |
1131 | } | 1261 | } |
1262 | /* Record any needed additional grace periods. */ | ||
1263 | rcu_start_future_gp(rnp, rdp); | ||
1132 | 1264 | ||
1133 | /* Trace depending on how much we were able to accelerate. */ | 1265 | /* Trace depending on how much we were able to accelerate. */ |
1134 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1266 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
@@ -1308,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1308 | rdp = this_cpu_ptr(rsp->rda); | 1440 | rdp = this_cpu_ptr(rsp->rda); |
1309 | rcu_preempt_check_blocked_tasks(rnp); | 1441 | rcu_preempt_check_blocked_tasks(rnp); |
1310 | rnp->qsmask = rnp->qsmaskinit; | 1442 | rnp->qsmask = rnp->qsmaskinit; |
1311 | rnp->gpnum = rsp->gpnum; | 1443 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; |
1312 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1444 | WARN_ON_ONCE(rnp->completed != rsp->completed); |
1313 | rnp->completed = rsp->completed; | 1445 | ACCESS_ONCE(rnp->completed) = rsp->completed; |
1314 | if (rnp == rdp->mynode) | 1446 | if (rnp == rdp->mynode) |
1315 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 1447 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
1316 | rcu_preempt_boost_start_gp(rnp); | 1448 | rcu_preempt_boost_start_gp(rnp); |
@@ -1319,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1319 | rnp->grphi, rnp->qsmask); | 1451 | rnp->grphi, rnp->qsmask); |
1320 | raw_spin_unlock_irq(&rnp->lock); | 1452 | raw_spin_unlock_irq(&rnp->lock); |
1321 | #ifdef CONFIG_PROVE_RCU_DELAY | 1453 | #ifdef CONFIG_PROVE_RCU_DELAY |
1322 | if ((random32() % (rcu_num_nodes * 8)) == 0) | 1454 | if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && |
1455 | system_state == SYSTEM_RUNNING) | ||
1323 | schedule_timeout_uninterruptible(2); | 1456 | schedule_timeout_uninterruptible(2); |
1324 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | 1457 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ |
1325 | cond_resched(); | 1458 | cond_resched(); |
@@ -1361,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1361 | static void rcu_gp_cleanup(struct rcu_state *rsp) | 1494 | static void rcu_gp_cleanup(struct rcu_state *rsp) |
1362 | { | 1495 | { |
1363 | unsigned long gp_duration; | 1496 | unsigned long gp_duration; |
1497 | int nocb = 0; | ||
1364 | struct rcu_data *rdp; | 1498 | struct rcu_data *rdp; |
1365 | struct rcu_node *rnp = rcu_get_root(rsp); | 1499 | struct rcu_node *rnp = rcu_get_root(rsp); |
1366 | 1500 | ||
@@ -1390,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1390 | */ | 1524 | */ |
1391 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1525 | rcu_for_each_node_breadth_first(rsp, rnp) { |
1392 | raw_spin_lock_irq(&rnp->lock); | 1526 | raw_spin_lock_irq(&rnp->lock); |
1393 | rnp->completed = rsp->gpnum; | 1527 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; |
1528 | rdp = this_cpu_ptr(rsp->rda); | ||
1529 | if (rnp == rdp->mynode) | ||
1530 | __rcu_process_gp_end(rsp, rnp, rdp); | ||
1531 | nocb += rcu_future_gp_cleanup(rsp, rnp); | ||
1394 | raw_spin_unlock_irq(&rnp->lock); | 1532 | raw_spin_unlock_irq(&rnp->lock); |
1395 | cond_resched(); | 1533 | cond_resched(); |
1396 | } | 1534 | } |
1397 | rnp = rcu_get_root(rsp); | 1535 | rnp = rcu_get_root(rsp); |
1398 | raw_spin_lock_irq(&rnp->lock); | 1536 | raw_spin_lock_irq(&rnp->lock); |
1537 | rcu_nocb_gp_set(rnp, nocb); | ||
1399 | 1538 | ||
1400 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | 1539 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
1401 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1540 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); |
1402 | rsp->fqs_state = RCU_GP_IDLE; | 1541 | rsp->fqs_state = RCU_GP_IDLE; |
1403 | rdp = this_cpu_ptr(rsp->rda); | 1542 | rdp = this_cpu_ptr(rsp->rda); |
1543 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | ||
1404 | if (cpu_needs_another_gp(rsp, rdp)) | 1544 | if (cpu_needs_another_gp(rsp, rdp)) |
1405 | rsp->gp_flags = 1; | 1545 | rsp->gp_flags = 1; |
1406 | raw_spin_unlock_irq(&rnp->lock); | 1546 | raw_spin_unlock_irq(&rnp->lock); |
@@ -1476,57 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1476 | /* | 1616 | /* |
1477 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1617 | * Start a new RCU grace period if warranted, re-initializing the hierarchy |
1478 | * in preparation for detecting the next grace period. The caller must hold | 1618 | * in preparation for detecting the next grace period. The caller must hold |
1479 | * the root node's ->lock, which is released before return. Hard irqs must | 1619 | * the root node's ->lock and hard irqs must be disabled. |
1480 | * be disabled. | ||
1481 | * | 1620 | * |
1482 | * Note that it is legal for a dying CPU (which is marked as offline) to | 1621 | * Note that it is legal for a dying CPU (which is marked as offline) to |
1483 | * invoke this function. This can happen when the dying CPU reports its | 1622 | * invoke this function. This can happen when the dying CPU reports its |
1484 | * quiescent state. | 1623 | * quiescent state. |
1485 | */ | 1624 | */ |
1486 | static void | 1625 | static void |
1487 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | 1626 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
1488 | __releases(rcu_get_root(rsp)->lock) | 1627 | struct rcu_data *rdp) |
1489 | { | 1628 | { |
1490 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1629 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { |
1491 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1492 | |||
1493 | if (!rsp->gp_kthread || | ||
1494 | !cpu_needs_another_gp(rsp, rdp)) { | ||
1495 | /* | 1630 | /* |
1496 | * Either we have not yet spawned the grace-period | 1631 | * Either we have not yet spawned the grace-period |
1497 | * task, this CPU does not need another grace period, | 1632 | * task, this CPU does not need another grace period, |
1498 | * or a grace period is already in progress. | 1633 | * or a grace period is already in progress. |
1499 | * Either way, don't start a new grace period. | 1634 | * Either way, don't start a new grace period. |
1500 | */ | 1635 | */ |
1501 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1502 | return; | 1636 | return; |
1503 | } | 1637 | } |
1504 | |||
1505 | /* | ||
1506 | * Because there is no grace period in progress right now, | ||
1507 | * any callbacks we have up to this point will be satisfied | ||
1508 | * by the next grace period. So this is a good place to | ||
1509 | * assign a grace period number to recently posted callbacks. | ||
1510 | */ | ||
1511 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
1512 | |||
1513 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1638 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
1514 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | ||
1515 | |||
1516 | /* Ensure that CPU is aware of completion of last grace period. */ | ||
1517 | rcu_process_gp_end(rsp, rdp); | ||
1518 | local_irq_restore(flags); | ||
1519 | 1639 | ||
1520 | /* Wake up rcu_gp_kthread() to start the grace period. */ | 1640 | /* Wake up rcu_gp_kthread() to start the grace period. */ |
1521 | wake_up(&rsp->gp_wq); | 1641 | wake_up(&rsp->gp_wq); |
1522 | } | 1642 | } |
1523 | 1643 | ||
1524 | /* | 1644 | /* |
1645 | * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's | ||
1646 | * callbacks. Note that rcu_start_gp_advanced() cannot do this because it | ||
1647 | * is invoked indirectly from rcu_advance_cbs(), which would result in | ||
1648 | * endless recursion -- or would do so if it wasn't for the self-deadlock | ||
1649 | * that is encountered beforehand. | ||
1650 | */ | ||
1651 | static void | ||
1652 | rcu_start_gp(struct rcu_state *rsp) | ||
1653 | { | ||
1654 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1655 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1656 | |||
1657 | /* | ||
1658 | * If there is no grace period in progress right now, any | ||
1659 | * callbacks we have up to this point will be satisfied by the | ||
1660 | * next grace period. Also, advancing the callbacks reduces the | ||
1661 | * probability of false positives from cpu_needs_another_gp() | ||
1662 | * resulting in pointless grace periods. So, advance callbacks | ||
1663 | * then start the grace period! | ||
1664 | */ | ||
1665 | rcu_advance_cbs(rsp, rnp, rdp); | ||
1666 | rcu_start_gp_advanced(rsp, rnp, rdp); | ||
1667 | } | ||
1668 | |||
1669 | /* | ||
1525 | * Report a full set of quiescent states to the specified rcu_state | 1670 | * Report a full set of quiescent states to the specified rcu_state |
1526 | * data structure. This involves cleaning up after the prior grace | 1671 | * data structure. This involves cleaning up after the prior grace |
1527 | * period and letting rcu_start_gp() start up the next grace period | 1672 | * period and letting rcu_start_gp() start up the next grace period |
1528 | * if one is needed. Note that the caller must hold rnp->lock, as | 1673 | * if one is needed. Note that the caller must hold rnp->lock, which |
1529 | * required by rcu_start_gp(), which will release it. | 1674 | * is released before return. |
1530 | */ | 1675 | */ |
1531 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 1676 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
1532 | __releases(rcu_get_root(rsp)->lock) | 1677 | __releases(rcu_get_root(rsp)->lock) |
@@ -1685,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1685 | struct rcu_node *rnp, struct rcu_data *rdp) | 1830 | struct rcu_node *rnp, struct rcu_data *rdp) |
1686 | { | 1831 | { |
1687 | /* No-CBs CPUs do not have orphanable callbacks. */ | 1832 | /* No-CBs CPUs do not have orphanable callbacks. */ |
1688 | if (is_nocb_cpu(rdp->cpu)) | 1833 | if (rcu_is_nocb_cpu(rdp->cpu)) |
1689 | return; | 1834 | return; |
1690 | 1835 | ||
1691 | /* | 1836 | /* |
@@ -2124,7 +2269,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
2124 | local_irq_save(flags); | 2269 | local_irq_save(flags); |
2125 | if (cpu_needs_another_gp(rsp, rdp)) { | 2270 | if (cpu_needs_another_gp(rsp, rdp)) { |
2126 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ | 2271 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ |
2127 | rcu_start_gp(rsp, flags); /* releases above lock */ | 2272 | rcu_start_gp(rsp); |
2273 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | ||
2128 | } else { | 2274 | } else { |
2129 | local_irq_restore(flags); | 2275 | local_irq_restore(flags); |
2130 | } | 2276 | } |
@@ -2169,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2169 | 2315 | ||
2170 | static void invoke_rcu_core(void) | 2316 | static void invoke_rcu_core(void) |
2171 | { | 2317 | { |
2172 | raise_softirq(RCU_SOFTIRQ); | 2318 | if (cpu_online(smp_processor_id())) |
2319 | raise_softirq(RCU_SOFTIRQ); | ||
2173 | } | 2320 | } |
2174 | 2321 | ||
2175 | /* | 2322 | /* |
@@ -2204,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2204 | 2351 | ||
2205 | /* Start a new grace period if one not already started. */ | 2352 | /* Start a new grace period if one not already started. */ |
2206 | if (!rcu_gp_in_progress(rsp)) { | 2353 | if (!rcu_gp_in_progress(rsp)) { |
2207 | unsigned long nestflag; | ||
2208 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 2354 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
2209 | 2355 | ||
2210 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | 2356 | raw_spin_lock(&rnp_root->lock); |
2211 | rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ | 2357 | rcu_start_gp(rsp); |
2358 | raw_spin_unlock(&rnp_root->lock); | ||
2212 | } else { | 2359 | } else { |
2213 | /* Give the grace period a kick. */ | 2360 | /* Give the grace period a kick. */ |
2214 | rdp->blimit = LONG_MAX; | 2361 | rdp->blimit = LONG_MAX; |
@@ -2628,19 +2775,27 @@ static int rcu_pending(int cpu) | |||
2628 | } | 2775 | } |
2629 | 2776 | ||
2630 | /* | 2777 | /* |
2631 | * Check to see if any future RCU-related work will need to be done | 2778 | * Return true if the specified CPU has any callback. If all_lazy is |
2632 | * by the current CPU, even if none need be done immediately, returning | 2779 | * non-NULL, store an indication of whether all callbacks are lazy. |
2633 | * 1 if so. | 2780 | * (If there are no callbacks, all of them are deemed to be lazy.) |
2634 | */ | 2781 | */ |
2635 | static int rcu_cpu_has_callbacks(int cpu) | 2782 | static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) |
2636 | { | 2783 | { |
2784 | bool al = true; | ||
2785 | bool hc = false; | ||
2786 | struct rcu_data *rdp; | ||
2637 | struct rcu_state *rsp; | 2787 | struct rcu_state *rsp; |
2638 | 2788 | ||
2639 | /* RCU callbacks either ready or pending? */ | 2789 | for_each_rcu_flavor(rsp) { |
2640 | for_each_rcu_flavor(rsp) | 2790 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2641 | if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) | 2791 | if (rdp->qlen != rdp->qlen_lazy) |
2642 | return 1; | 2792 | al = false; |
2643 | return 0; | 2793 | if (rdp->nxtlist) |
2794 | hc = true; | ||
2795 | } | ||
2796 | if (all_lazy) | ||
2797 | *all_lazy = al; | ||
2798 | return hc; | ||
2644 | } | 2799 | } |
2645 | 2800 | ||
2646 | /* | 2801 | /* |
@@ -2747,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2747 | * corresponding CPU's preceding callbacks have been invoked. | 2902 | * corresponding CPU's preceding callbacks have been invoked. |
2748 | */ | 2903 | */ |
2749 | for_each_possible_cpu(cpu) { | 2904 | for_each_possible_cpu(cpu) { |
2750 | if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) | 2905 | if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) |
2751 | continue; | 2906 | continue; |
2752 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2907 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2753 | if (is_nocb_cpu(cpu)) { | 2908 | if (rcu_is_nocb_cpu(cpu)) { |
2754 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | 2909 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
2755 | rsp->n_barrier_done); | 2910 | rsp->n_barrier_done); |
2756 | atomic_inc(&rsp->barrier_cpu_count); | 2911 | atomic_inc(&rsp->barrier_cpu_count); |
@@ -2859,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2859 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3014 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
2860 | atomic_set(&rdp->dynticks->dynticks, | 3015 | atomic_set(&rdp->dynticks->dynticks, |
2861 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 3016 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
2862 | rcu_prepare_for_idle_init(cpu); | ||
2863 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 3017 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
2864 | 3018 | ||
2865 | /* Add CPU to rcu_node bitmasks. */ | 3019 | /* Add CPU to rcu_node bitmasks. */ |
@@ -2909,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2909 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 3063 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
2910 | struct rcu_node *rnp = rdp->mynode; | 3064 | struct rcu_node *rnp = rdp->mynode; |
2911 | struct rcu_state *rsp; | 3065 | struct rcu_state *rsp; |
2912 | int ret = NOTIFY_OK; | ||
2913 | 3066 | ||
2914 | trace_rcu_utilization("Start CPU hotplug"); | 3067 | trace_rcu_utilization("Start CPU hotplug"); |
2915 | switch (action) { | 3068 | switch (action) { |
@@ -2923,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2923 | rcu_boost_kthread_setaffinity(rnp, -1); | 3076 | rcu_boost_kthread_setaffinity(rnp, -1); |
2924 | break; | 3077 | break; |
2925 | case CPU_DOWN_PREPARE: | 3078 | case CPU_DOWN_PREPARE: |
2926 | if (nocb_cpu_expendable(cpu)) | 3079 | rcu_boost_kthread_setaffinity(rnp, cpu); |
2927 | rcu_boost_kthread_setaffinity(rnp, cpu); | ||
2928 | else | ||
2929 | ret = NOTIFY_BAD; | ||
2930 | break; | 3080 | break; |
2931 | case CPU_DYING: | 3081 | case CPU_DYING: |
2932 | case CPU_DYING_FROZEN: | 3082 | case CPU_DYING_FROZEN: |
2933 | /* | ||
2934 | * The whole machine is "stopped" except this CPU, so we can | ||
2935 | * touch any data without introducing corruption. We send the | ||
2936 | * dying CPU's callbacks to an arbitrarily chosen online CPU. | ||
2937 | */ | ||
2938 | for_each_rcu_flavor(rsp) | 3083 | for_each_rcu_flavor(rsp) |
2939 | rcu_cleanup_dying_cpu(rsp); | 3084 | rcu_cleanup_dying_cpu(rsp); |
2940 | rcu_cleanup_after_idle(cpu); | ||
2941 | break; | 3085 | break; |
2942 | case CPU_DEAD: | 3086 | case CPU_DEAD: |
2943 | case CPU_DEAD_FROZEN: | 3087 | case CPU_DEAD_FROZEN: |
@@ -2950,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2950 | break; | 3094 | break; |
2951 | } | 3095 | } |
2952 | trace_rcu_utilization("End CPU hotplug"); | 3096 | trace_rcu_utilization("End CPU hotplug"); |
2953 | return ret; | 3097 | return NOTIFY_OK; |
2954 | } | 3098 | } |
2955 | 3099 | ||
2956 | /* | 3100 | /* |
@@ -3085,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3085 | } | 3229 | } |
3086 | rnp->level = i; | 3230 | rnp->level = i; |
3087 | INIT_LIST_HEAD(&rnp->blkd_tasks); | 3231 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
3232 | rcu_init_one_nocb(rnp); | ||
3088 | } | 3233 | } |
3089 | } | 3234 | } |
3090 | 3235 | ||
@@ -3170,8 +3315,7 @@ void __init rcu_init(void) | |||
3170 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 3315 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
3171 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3316 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
3172 | __rcu_init_preempt(); | 3317 | __rcu_init_preempt(); |
3173 | rcu_init_nocb(); | 3318 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
3174 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
3175 | 3319 | ||
3176 | /* | 3320 | /* |
3177 | * We don't need protection against CPU-hotplug here because | 3321 | * We don't need protection against CPU-hotplug here because |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b5045d9d..da77a8f57ff9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -88,18 +88,13 @@ struct rcu_dynticks { | |||
88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
89 | atomic_t dynticks; /* Even value for idle, else odd. */ | 89 | atomic_t dynticks; /* Even value for idle, else odd. */ |
90 | #ifdef CONFIG_RCU_FAST_NO_HZ | 90 | #ifdef CONFIG_RCU_FAST_NO_HZ |
91 | int dyntick_drain; /* Prepare-for-idle state variable. */ | 91 | bool all_lazy; /* Are all CPU's CBs lazy? */ |
92 | unsigned long dyntick_holdoff; | ||
93 | /* No retries for the jiffy of failure. */ | ||
94 | struct timer_list idle_gp_timer; | ||
95 | /* Wake up CPU sleeping with callbacks. */ | ||
96 | unsigned long idle_gp_timer_expires; | ||
97 | /* When to wake up CPU (for repost). */ | ||
98 | bool idle_first_pass; /* First pass of attempt to go idle? */ | ||
99 | unsigned long nonlazy_posted; | 92 | unsigned long nonlazy_posted; |
100 | /* # times non-lazy CBs posted to CPU. */ | 93 | /* # times non-lazy CBs posted to CPU. */ |
101 | unsigned long nonlazy_posted_snap; | 94 | unsigned long nonlazy_posted_snap; |
102 | /* idle-period nonlazy_posted snapshot. */ | 95 | /* idle-period nonlazy_posted snapshot. */ |
96 | unsigned long last_accelerate; | ||
97 | /* Last jiffy CBs were accelerated. */ | ||
103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 98 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 99 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
105 | }; | 100 | }; |
@@ -134,9 +129,6 @@ struct rcu_node { | |||
134 | /* elements that need to drain to allow the */ | 129 | /* elements that need to drain to allow the */ |
135 | /* current expedited grace period to */ | 130 | /* current expedited grace period to */ |
136 | /* complete (only for TREE_PREEMPT_RCU). */ | 131 | /* complete (only for TREE_PREEMPT_RCU). */ |
137 | atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ | ||
138 | /* Since this has meaning only for leaf */ | ||
139 | /* rcu_node structures, 32 bits suffices. */ | ||
140 | unsigned long qsmaskinit; | 132 | unsigned long qsmaskinit; |
141 | /* Per-GP initial value for qsmask & expmask. */ | 133 | /* Per-GP initial value for qsmask & expmask. */ |
142 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 134 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
@@ -196,6 +188,12 @@ struct rcu_node { | |||
196 | /* Refused to boost: not sure why, though. */ | 188 | /* Refused to boost: not sure why, though. */ |
197 | /* This can happen due to race conditions. */ | 189 | /* This can happen due to race conditions. */ |
198 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 190 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
191 | #ifdef CONFIG_RCU_NOCB_CPU | ||
192 | wait_queue_head_t nocb_gp_wq[2]; | ||
193 | /* Place for rcu_nocb_kthread() to wait GP. */ | ||
194 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
195 | int need_future_gp[2]; | ||
196 | /* Counts of upcoming no-CB GP requests. */ | ||
199 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; | 197 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; |
200 | } ____cacheline_internodealigned_in_smp; | 198 | } ____cacheline_internodealigned_in_smp; |
201 | 199 | ||
@@ -328,6 +326,11 @@ struct rcu_data { | |||
328 | struct task_struct *nocb_kthread; | 326 | struct task_struct *nocb_kthread; |
329 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 327 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
330 | 328 | ||
329 | /* 8) RCU CPU stall data. */ | ||
330 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
331 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ | ||
332 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
333 | |||
331 | int cpu; | 334 | int cpu; |
332 | struct rcu_state *rsp; | 335 | struct rcu_state *rsp; |
333 | }; | 336 | }; |
@@ -375,12 +378,6 @@ struct rcu_state { | |||
375 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ | 378 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ |
376 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 379 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
377 | void (*func)(struct rcu_head *head)); | 380 | void (*func)(struct rcu_head *head)); |
378 | #ifdef CONFIG_RCU_NOCB_CPU | ||
379 | void (*call_remote)(struct rcu_head *head, | ||
380 | void (*func)(struct rcu_head *head)); | ||
381 | /* call_rcu() flavor, but for */ | ||
382 | /* placing on remote CPU. */ | ||
383 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
384 | 381 | ||
385 | /* The following fields are guarded by the root rcu_node's lock. */ | 382 | /* The following fields are guarded by the root rcu_node's lock. */ |
386 | 383 | ||
@@ -443,6 +440,7 @@ struct rcu_state { | |||
443 | unsigned long gp_max; /* Maximum GP duration in */ | 440 | unsigned long gp_max; /* Maximum GP duration in */ |
444 | /* jiffies. */ | 441 | /* jiffies. */ |
445 | char *name; /* Name of structure. */ | 442 | char *name; /* Name of structure. */ |
443 | char abbr; /* Abbreviated name. */ | ||
446 | struct list_head flavors; /* List of RCU flavors. */ | 444 | struct list_head flavors; /* List of RCU flavors. */ |
447 | }; | 445 | }; |
448 | 446 | ||
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
520 | struct rcu_node *rnp); | 518 | struct rcu_node *rnp); |
521 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 519 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
522 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 520 | static void __cpuinit rcu_prepare_kthreads(int cpu); |
523 | static void rcu_prepare_for_idle_init(int cpu); | ||
524 | static void rcu_cleanup_after_idle(int cpu); | 521 | static void rcu_cleanup_after_idle(int cpu); |
525 | static void rcu_prepare_for_idle(int cpu); | 522 | static void rcu_prepare_for_idle(int cpu); |
526 | static void rcu_idle_count_callbacks_posted(void); | 523 | static void rcu_idle_count_callbacks_posted(void); |
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
529 | static void print_cpu_stall_info_end(void); | 526 | static void print_cpu_stall_info_end(void); |
530 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 527 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
531 | static void increment_cpu_stall_ticks(void); | 528 | static void increment_cpu_stall_ticks(void); |
532 | static bool is_nocb_cpu(int cpu); | 529 | static int rcu_nocb_needs_gp(struct rcu_state *rsp); |
530 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | ||
531 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | ||
532 | static void rcu_init_one_nocb(struct rcu_node *rnp); | ||
533 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 533 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
534 | bool lazy); | 534 | bool lazy); |
535 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 535 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
536 | struct rcu_data *rdp); | 536 | struct rcu_data *rdp); |
537 | static bool nocb_cpu_expendable(int cpu); | ||
538 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 537 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
539 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 538 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
540 | static void init_nocb_callback_list(struct rcu_data *rdp); | 539 | static void rcu_kick_nohz_cpu(int cpu); |
541 | static void __init rcu_init_nocb(void); | 540 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
542 | 541 | ||
543 | #endif /* #ifndef RCU_TREE_NONCORE */ | 542 | #endif /* #ifndef RCU_TREE_NONCORE */ |
544 | 543 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9d..3db5a375d8dd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
31 | #include <linux/tick.h> | ||
31 | 32 | ||
32 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
33 | 34 | ||
@@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void) | |||
85 | if (nr_cpu_ids != NR_CPUS) | 86 | if (nr_cpu_ids != NR_CPUS) |
86 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 87 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
87 | #ifdef CONFIG_RCU_NOCB_CPU | 88 | #ifdef CONFIG_RCU_NOCB_CPU |
89 | #ifndef CONFIG_RCU_NOCB_CPU_NONE | ||
90 | if (!have_rcu_nocb_mask) { | ||
91 | zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); | ||
92 | have_rcu_nocb_mask = true; | ||
93 | } | ||
94 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
95 | pr_info("\tExperimental no-CBs CPU 0\n"); | ||
96 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
97 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
98 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
99 | pr_info("\tExperimental no-CBs for all CPUs\n"); | ||
100 | cpumask_setall(rcu_nocb_mask); | ||
101 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
102 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
88 | if (have_rcu_nocb_mask) { | 103 | if (have_rcu_nocb_mask) { |
89 | if (cpumask_test_cpu(0, rcu_nocb_mask)) { | ||
90 | cpumask_clear_cpu(0, rcu_nocb_mask); | ||
91 | pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); | ||
92 | } | ||
93 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | 104 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); |
94 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); | 105 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); |
95 | if (rcu_nocb_poll) | 106 | if (rcu_nocb_poll) |
@@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
101 | #ifdef CONFIG_TREE_PREEMPT_RCU | 112 | #ifdef CONFIG_TREE_PREEMPT_RCU |
102 | 113 | ||
103 | struct rcu_state rcu_preempt_state = | 114 | struct rcu_state rcu_preempt_state = |
104 | RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); | 115 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
105 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 116 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
106 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 117 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
107 | 118 | ||
@@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1533 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | 1544 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) |
1534 | { | 1545 | { |
1535 | *delta_jiffies = ULONG_MAX; | 1546 | *delta_jiffies = ULONG_MAX; |
1536 | return rcu_cpu_has_callbacks(cpu); | 1547 | return rcu_cpu_has_callbacks(cpu, NULL); |
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. | ||
1541 | */ | ||
1542 | static void rcu_prepare_for_idle_init(int cpu) | ||
1543 | { | ||
1544 | } | 1548 | } |
1545 | 1549 | ||
1546 | /* | 1550 | /* |
@@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void) | |||
1577 | * | 1581 | * |
1578 | * The following three proprocessor symbols control this state machine: | 1582 | * The following three proprocessor symbols control this state machine: |
1579 | * | 1583 | * |
1580 | * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt | ||
1581 | * to satisfy RCU. Beyond this point, it is better to incur a periodic | ||
1582 | * scheduling-clock interrupt than to loop through the state machine | ||
1583 | * at full power. | ||
1584 | * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are | ||
1585 | * optional if RCU does not need anything immediately from this | ||
1586 | * CPU, even if this CPU still has RCU callbacks queued. The first | ||
1587 | * times through the state machine are mandatory: we need to give | ||
1588 | * the state machine a chance to communicate a quiescent state | ||
1589 | * to the RCU core. | ||
1590 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted | 1584 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted |
1591 | * to sleep in dyntick-idle mode with RCU callbacks pending. This | 1585 | * to sleep in dyntick-idle mode with RCU callbacks pending. This |
1592 | * is sized to be roughly one RCU grace period. Those energy-efficiency | 1586 | * is sized to be roughly one RCU grace period. Those energy-efficiency |
@@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void) | |||
1602 | * adjustment, they can be converted into kernel config parameters, though | 1596 | * adjustment, they can be converted into kernel config parameters, though |
1603 | * making the state machine smarter might be a better option. | 1597 | * making the state machine smarter might be a better option. |
1604 | */ | 1598 | */ |
1605 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | ||
1606 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | ||
1607 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ | 1599 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ |
1608 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1600 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1609 | 1601 | ||
1610 | extern int tick_nohz_enabled; | 1602 | static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; |
1611 | 1603 | module_param(rcu_idle_gp_delay, int, 0644); | |
1612 | /* | 1604 | static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; |
1613 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1605 | module_param(rcu_idle_lazy_gp_delay, int, 0644); |
1614 | * the specified CPU? Both RCU flavor and CPU are specified by the | ||
1615 | * rcu_data structure. | ||
1616 | */ | ||
1617 | static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) | ||
1618 | { | ||
1619 | return rdp->qlen != rdp->qlen_lazy; | ||
1620 | } | ||
1621 | 1606 | ||
1622 | #ifdef CONFIG_TREE_PREEMPT_RCU | 1607 | extern int tick_nohz_enabled; |
1623 | 1608 | ||
1624 | /* | 1609 | /* |
1625 | * Are there non-lazy RCU-preempt callbacks? (There cannot be if there | 1610 | * Try to advance callbacks for all flavors of RCU on the current CPU. |
1626 | * is no RCU-preempt in the kernel.) | 1611 | * Afterwards, if there are any callbacks ready for immediate invocation, |
1612 | * return true. | ||
1627 | */ | 1613 | */ |
1628 | static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) | 1614 | static bool rcu_try_advance_all_cbs(void) |
1629 | { | 1615 | { |
1630 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 1616 | bool cbs_ready = false; |
1631 | 1617 | struct rcu_data *rdp; | |
1632 | return __rcu_cpu_has_nonlazy_callbacks(rdp); | 1618 | struct rcu_node *rnp; |
1633 | } | 1619 | struct rcu_state *rsp; |
1634 | |||
1635 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
1636 | 1620 | ||
1637 | static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) | 1621 | for_each_rcu_flavor(rsp) { |
1638 | { | 1622 | rdp = this_cpu_ptr(rsp->rda); |
1639 | return 0; | 1623 | rnp = rdp->mynode; |
1640 | } | ||
1641 | 1624 | ||
1642 | #endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1625 | /* |
1626 | * Don't bother checking unless a grace period has | ||
1627 | * completed since we last checked and there are | ||
1628 | * callbacks not yet ready to invoke. | ||
1629 | */ | ||
1630 | if (rdp->completed != rnp->completed && | ||
1631 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | ||
1632 | rcu_process_gp_end(rsp, rdp); | ||
1643 | 1633 | ||
1644 | /* | 1634 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1645 | * Does any flavor of RCU have non-lazy callbacks on the specified CPU? | 1635 | cbs_ready = true; |
1646 | */ | 1636 | } |
1647 | static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | 1637 | return cbs_ready; |
1648 | { | ||
1649 | return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || | ||
1650 | __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || | ||
1651 | rcu_preempt_cpu_has_nonlazy_callbacks(cpu); | ||
1652 | } | 1638 | } |
1653 | 1639 | ||
1654 | /* | 1640 | /* |
1655 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | 1641 | * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready |
1656 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | 1642 | * to invoke. If the CPU has callbacks, try to advance them. Tell the |
1657 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | 1643 | * caller to set the timeout based on whether or not there are non-lazy |
1658 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | 1644 | * callbacks. |
1659 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
1660 | * it is better to incur scheduling-clock interrupts than to spin | ||
1661 | * continuously for the same time duration! | ||
1662 | * | 1645 | * |
1663 | * The delta_jiffies argument is used to store the time when RCU is | 1646 | * The caller must have disabled interrupts. |
1664 | * going to need the CPU again if it still has callbacks. The reason | ||
1665 | * for this is that rcu_prepare_for_idle() might need to post a timer, | ||
1666 | * but if so, it will do so after tick_nohz_stop_sched_tick() has set | ||
1667 | * the wakeup time for this CPU. This means that RCU's timer can be | ||
1668 | * delayed until the wakeup time, which defeats the purpose of posting | ||
1669 | * a timer. | ||
1670 | */ | 1647 | */ |
1671 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | 1648 | int rcu_needs_cpu(int cpu, unsigned long *dj) |
1672 | { | 1649 | { |
1673 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1650 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
1674 | 1651 | ||
1675 | /* Flag a new idle sojourn to the idle-entry state machine. */ | 1652 | /* Snapshot to detect later posting of non-lazy callback. */ |
1676 | rdtp->idle_first_pass = 1; | 1653 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; |
1654 | |||
1677 | /* If no callbacks, RCU doesn't need the CPU. */ | 1655 | /* If no callbacks, RCU doesn't need the CPU. */ |
1678 | if (!rcu_cpu_has_callbacks(cpu)) { | 1656 | if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { |
1679 | *delta_jiffies = ULONG_MAX; | 1657 | *dj = ULONG_MAX; |
1680 | return 0; | 1658 | return 0; |
1681 | } | 1659 | } |
1682 | if (rdtp->dyntick_holdoff == jiffies) { | 1660 | |
1683 | /* RCU recently tried and failed, so don't try again. */ | 1661 | /* Attempt to advance callbacks. */ |
1684 | *delta_jiffies = 1; | 1662 | if (rcu_try_advance_all_cbs()) { |
1663 | /* Some ready to invoke, so initiate later invocation. */ | ||
1664 | invoke_rcu_core(); | ||
1685 | return 1; | 1665 | return 1; |
1686 | } | 1666 | } |
1687 | /* Set up for the possibility that RCU will post a timer. */ | 1667 | rdtp->last_accelerate = jiffies; |
1688 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | 1668 | |
1689 | *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, | 1669 | /* Request timer delay depending on laziness, and round. */ |
1690 | RCU_IDLE_GP_DELAY) - jiffies; | 1670 | if (!rdtp->all_lazy) { |
1671 | *dj = round_up(rcu_idle_gp_delay + jiffies, | ||
1672 | rcu_idle_gp_delay) - jiffies; | ||
1691 | } else { | 1673 | } else { |
1692 | *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; | 1674 | *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; |
1693 | *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; | ||
1694 | } | 1675 | } |
1695 | return 0; | 1676 | return 0; |
1696 | } | 1677 | } |
1697 | 1678 | ||
1698 | /* | 1679 | /* |
1699 | * Handler for smp_call_function_single(). The only point of this | 1680 | * Prepare a CPU for idle from an RCU perspective. The first major task |
1700 | * handler is to wake the CPU up, so the handler does only tracing. | 1681 | * is to sense whether nohz mode has been enabled or disabled via sysfs. |
1701 | */ | 1682 | * The second major task is to check to see if a non-lazy callback has |
1702 | void rcu_idle_demigrate(void *unused) | 1683 | * arrived at a CPU that previously had only lazy callbacks. The third |
1703 | { | 1684 | * major task is to accelerate (that is, assign grace-period numbers to) |
1704 | trace_rcu_prep_idle("Demigrate"); | 1685 | * any recently arrived callbacks. |
1705 | } | ||
1706 | |||
1707 | /* | ||
1708 | * Timer handler used to force CPU to start pushing its remaining RCU | ||
1709 | * callbacks in the case where it entered dyntick-idle mode with callbacks | ||
1710 | * pending. The hander doesn't really need to do anything because the | ||
1711 | * real work is done upon re-entry to idle, or by the next scheduling-clock | ||
1712 | * interrupt should idle not be re-entered. | ||
1713 | * | ||
1714 | * One special case: the timer gets migrated without awakening the CPU | ||
1715 | * on which the timer was scheduled on. In this case, we must wake up | ||
1716 | * that CPU. We do so with smp_call_function_single(). | ||
1717 | */ | ||
1718 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) | ||
1719 | { | ||
1720 | int cpu = (int)cpu_in; | ||
1721 | |||
1722 | trace_rcu_prep_idle("Timer"); | ||
1723 | if (cpu != smp_processor_id()) | ||
1724 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
1725 | else | ||
1726 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
1727 | } | ||
1728 | |||
1729 | /* | ||
1730 | * Initialize the timer used to pull CPUs out of dyntick-idle mode. | ||
1731 | */ | ||
1732 | static void rcu_prepare_for_idle_init(int cpu) | ||
1733 | { | ||
1734 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1735 | |||
1736 | rdtp->dyntick_holdoff = jiffies - 1; | ||
1737 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); | ||
1738 | rdtp->idle_gp_timer_expires = jiffies - 1; | ||
1739 | rdtp->idle_first_pass = 1; | ||
1740 | } | ||
1741 | |||
1742 | /* | ||
1743 | * Clean up for exit from idle. Because we are exiting from idle, there | ||
1744 | * is no longer any point to ->idle_gp_timer, so cancel it. This will | ||
1745 | * do nothing if this timer is not active, so just cancel it unconditionally. | ||
1746 | */ | ||
1747 | static void rcu_cleanup_after_idle(int cpu) | ||
1748 | { | ||
1749 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1750 | |||
1751 | del_timer(&rdtp->idle_gp_timer); | ||
1752 | trace_rcu_prep_idle("Cleanup after idle"); | ||
1753 | rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); | ||
1754 | } | ||
1755 | |||
1756 | /* | ||
1757 | * Check to see if any RCU-related work can be done by the current CPU, | ||
1758 | * and if so, schedule a softirq to get it done. This function is part | ||
1759 | * of the RCU implementation; it is -not- an exported member of the RCU API. | ||
1760 | * | ||
1761 | * The idea is for the current CPU to clear out all work required by the | ||
1762 | * RCU core for the current grace period, so that this CPU can be permitted | ||
1763 | * to enter dyntick-idle mode. In some cases, it will need to be awakened | ||
1764 | * at the end of the grace period by whatever CPU ends the grace period. | ||
1765 | * This allows CPUs to go dyntick-idle more quickly, and to reduce the | ||
1766 | * number of wakeups by a modest integer factor. | ||
1767 | * | ||
1768 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | ||
1769 | * disabled, we do one pass of force_quiescent_state(), then do a | ||
1770 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | ||
1771 | * later. The ->dyntick_drain field controls the sequencing. | ||
1772 | * | 1686 | * |
1773 | * The caller must have disabled interrupts. | 1687 | * The caller must have disabled interrupts. |
1774 | */ | 1688 | */ |
1775 | static void rcu_prepare_for_idle(int cpu) | 1689 | static void rcu_prepare_for_idle(int cpu) |
1776 | { | 1690 | { |
1777 | struct timer_list *tp; | 1691 | struct rcu_data *rdp; |
1778 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1692 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
1693 | struct rcu_node *rnp; | ||
1694 | struct rcu_state *rsp; | ||
1779 | int tne; | 1695 | int tne; |
1780 | 1696 | ||
1781 | /* Handle nohz enablement switches conservatively. */ | 1697 | /* Handle nohz enablement switches conservatively. */ |
1782 | tne = ACCESS_ONCE(tick_nohz_enabled); | 1698 | tne = ACCESS_ONCE(tick_nohz_enabled); |
1783 | if (tne != rdtp->tick_nohz_enabled_snap) { | 1699 | if (tne != rdtp->tick_nohz_enabled_snap) { |
1784 | if (rcu_cpu_has_callbacks(cpu)) | 1700 | if (rcu_cpu_has_callbacks(cpu, NULL)) |
1785 | invoke_rcu_core(); /* force nohz to see update. */ | 1701 | invoke_rcu_core(); /* force nohz to see update. */ |
1786 | rdtp->tick_nohz_enabled_snap = tne; | 1702 | rdtp->tick_nohz_enabled_snap = tne; |
1787 | return; | 1703 | return; |
@@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu) | |||
1789 | if (!tne) | 1705 | if (!tne) |
1790 | return; | 1706 | return; |
1791 | 1707 | ||
1792 | /* Adaptive-tick mode, where usermode execution is idle to RCU. */ | 1708 | /* If this is a no-CBs CPU, no callbacks, just return. */ |
1793 | if (!is_idle_task(current)) { | 1709 | if (rcu_is_nocb_cpu(cpu)) |
1794 | rdtp->dyntick_holdoff = jiffies - 1; | ||
1795 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | ||
1796 | trace_rcu_prep_idle("User dyntick with callbacks"); | ||
1797 | rdtp->idle_gp_timer_expires = | ||
1798 | round_up(jiffies + RCU_IDLE_GP_DELAY, | ||
1799 | RCU_IDLE_GP_DELAY); | ||
1800 | } else if (rcu_cpu_has_callbacks(cpu)) { | ||
1801 | rdtp->idle_gp_timer_expires = | ||
1802 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); | ||
1803 | trace_rcu_prep_idle("User dyntick with lazy callbacks"); | ||
1804 | } else { | ||
1805 | return; | ||
1806 | } | ||
1807 | tp = &rdtp->idle_gp_timer; | ||
1808 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
1809 | return; | 1710 | return; |
1810 | } | ||
1811 | 1711 | ||
1812 | /* | 1712 | /* |
1813 | * If this is an idle re-entry, for example, due to use of | 1713 | * If a non-lazy callback arrived at a CPU having only lazy |
1814 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 1714 | * callbacks, invoke RCU core for the side-effect of recalculating |
1815 | * loop, then don't take any state-machine actions, unless the | 1715 | * idle duration on re-entry to idle. |
1816 | * momentary exit from idle queued additional non-lazy callbacks. | ||
1817 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks | ||
1818 | * pending. | ||
1819 | */ | 1716 | */ |
1820 | if (!rdtp->idle_first_pass && | 1717 | if (rdtp->all_lazy && |
1821 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { | 1718 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { |
1822 | if (rcu_cpu_has_callbacks(cpu)) { | 1719 | invoke_rcu_core(); |
1823 | tp = &rdtp->idle_gp_timer; | ||
1824 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
1825 | } | ||
1826 | return; | 1720 | return; |
1827 | } | 1721 | } |
1828 | rdtp->idle_first_pass = 0; | ||
1829 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; | ||
1830 | 1722 | ||
1831 | /* | 1723 | /* |
1832 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 1724 | * If we have not yet accelerated this jiffy, accelerate all |
1833 | * Also reset state to avoid prejudicing later attempts. | 1725 | * callbacks on this CPU. |
1834 | */ | 1726 | */ |
1835 | if (!rcu_cpu_has_callbacks(cpu)) { | 1727 | if (rdtp->last_accelerate == jiffies) |
1836 | rdtp->dyntick_holdoff = jiffies - 1; | ||
1837 | rdtp->dyntick_drain = 0; | ||
1838 | trace_rcu_prep_idle("No callbacks"); | ||
1839 | return; | 1728 | return; |
1729 | rdtp->last_accelerate = jiffies; | ||
1730 | for_each_rcu_flavor(rsp) { | ||
1731 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
1732 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | ||
1733 | continue; | ||
1734 | rnp = rdp->mynode; | ||
1735 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1736 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
1737 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1840 | } | 1738 | } |
1739 | } | ||
1841 | 1740 | ||
1842 | /* | 1741 | /* |
1843 | * If in holdoff mode, just return. We will presumably have | 1742 | * Clean up for exit from idle. Attempt to advance callbacks based on |
1844 | * refrained from disabling the scheduling-clock tick. | 1743 | * any grace periods that elapsed while the CPU was idle, and if any |
1845 | */ | 1744 | * callbacks are now ready to invoke, initiate invocation. |
1846 | if (rdtp->dyntick_holdoff == jiffies) { | 1745 | */ |
1847 | trace_rcu_prep_idle("In holdoff"); | 1746 | static void rcu_cleanup_after_idle(int cpu) |
1848 | return; | 1747 | { |
1849 | } | 1748 | struct rcu_data *rdp; |
1749 | struct rcu_state *rsp; | ||
1850 | 1750 | ||
1851 | /* Check and update the ->dyntick_drain sequencing. */ | 1751 | if (rcu_is_nocb_cpu(cpu)) |
1852 | if (rdtp->dyntick_drain <= 0) { | ||
1853 | /* First time through, initialize the counter. */ | ||
1854 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; | ||
1855 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && | ||
1856 | !rcu_pending(cpu) && | ||
1857 | !local_softirq_pending()) { | ||
1858 | /* Can we go dyntick-idle despite still having callbacks? */ | ||
1859 | rdtp->dyntick_drain = 0; | ||
1860 | rdtp->dyntick_holdoff = jiffies; | ||
1861 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | ||
1862 | trace_rcu_prep_idle("Dyntick with callbacks"); | ||
1863 | rdtp->idle_gp_timer_expires = | ||
1864 | round_up(jiffies + RCU_IDLE_GP_DELAY, | ||
1865 | RCU_IDLE_GP_DELAY); | ||
1866 | } else { | ||
1867 | rdtp->idle_gp_timer_expires = | ||
1868 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); | ||
1869 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); | ||
1870 | } | ||
1871 | tp = &rdtp->idle_gp_timer; | ||
1872 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
1873 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
1874 | return; /* Nothing more to do immediately. */ | ||
1875 | } else if (--(rdtp->dyntick_drain) <= 0) { | ||
1876 | /* We have hit the limit, so time to give up. */ | ||
1877 | rdtp->dyntick_holdoff = jiffies; | ||
1878 | trace_rcu_prep_idle("Begin holdoff"); | ||
1879 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | ||
1880 | return; | 1752 | return; |
1881 | } | 1753 | rcu_try_advance_all_cbs(); |
1882 | 1754 | for_each_rcu_flavor(rsp) { | |
1883 | /* | 1755 | rdp = per_cpu_ptr(rsp->rda, cpu); |
1884 | * Do one step of pushing the remaining RCU callbacks through | 1756 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1885 | * the RCU core state machine. | 1757 | invoke_rcu_core(); |
1886 | */ | ||
1887 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
1888 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | ||
1889 | rcu_preempt_qs(cpu); | ||
1890 | force_quiescent_state(&rcu_preempt_state); | ||
1891 | } | ||
1892 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
1893 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | ||
1894 | rcu_sched_qs(cpu); | ||
1895 | force_quiescent_state(&rcu_sched_state); | ||
1896 | } | ||
1897 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | ||
1898 | rcu_bh_qs(cpu); | ||
1899 | force_quiescent_state(&rcu_bh_state); | ||
1900 | } | ||
1901 | |||
1902 | /* | ||
1903 | * If RCU callbacks are still pending, RCU still needs this CPU. | ||
1904 | * So try forcing the callbacks through the grace period. | ||
1905 | */ | ||
1906 | if (rcu_cpu_has_callbacks(cpu)) { | ||
1907 | trace_rcu_prep_idle("More callbacks"); | ||
1908 | invoke_rcu_core(); | ||
1909 | } else { | ||
1910 | trace_rcu_prep_idle("Callbacks drained"); | ||
1911 | } | 1758 | } |
1912 | } | 1759 | } |
1913 | 1760 | ||
@@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier); | |||
2015 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 1862 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2016 | { | 1863 | { |
2017 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1864 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2018 | struct timer_list *tltp = &rdtp->idle_gp_timer; | 1865 | unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; |
2019 | char c; | ||
2020 | 1866 | ||
2021 | c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; | 1867 | sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", |
2022 | if (timer_pending(tltp)) | 1868 | rdtp->last_accelerate & 0xffff, jiffies & 0xffff, |
2023 | sprintf(cp, "drain=%d %c timer=%lu", | 1869 | ulong2long(nlpd), |
2024 | rdtp->dyntick_drain, c, tltp->expires - jiffies); | 1870 | rdtp->all_lazy ? 'L' : '.', |
2025 | else | 1871 | rdtp->tick_nohz_enabled_snap ? '.' : 'D'); |
2026 | sprintf(cp, "drain=%d %c timer not pending", | ||
2027 | rdtp->dyntick_drain, c); | ||
2028 | } | 1872 | } |
2029 | 1873 | ||
2030 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 1874 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
@@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
2070 | ticks_value = rsp->gpnum - rdp->gpnum; | 1914 | ticks_value = rsp->gpnum - rdp->gpnum; |
2071 | } | 1915 | } |
2072 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | 1916 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); |
2073 | printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", | 1917 | printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", |
2074 | cpu, ticks_value, ticks_title, | 1918 | cpu, ticks_value, ticks_title, |
2075 | atomic_read(&rdtp->dynticks) & 0xfff, | 1919 | atomic_read(&rdtp->dynticks) & 0xfff, |
2076 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | 1920 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, |
1921 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | ||
2077 | fast_no_hz); | 1922 | fast_no_hz); |
2078 | } | 1923 | } |
2079 | 1924 | ||
@@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void) | |||
2087 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | 1932 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) |
2088 | { | 1933 | { |
2089 | rdp->ticks_this_gp = 0; | 1934 | rdp->ticks_this_gp = 0; |
1935 | rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); | ||
2090 | } | 1936 | } |
2091 | 1937 | ||
2092 | /* Increment ->ticks_this_gp for all flavors of RCU. */ | 1938 | /* Increment ->ticks_this_gp for all flavors of RCU. */ |
@@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg) | |||
2165 | } | 2011 | } |
2166 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 2012 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
2167 | 2013 | ||
2014 | /* | ||
2015 | * Do any no-CBs CPUs need another grace period? | ||
2016 | * | ||
2017 | * Interrupts must be disabled. If the caller does not hold the root | ||
2018 | * rnp_node structure's ->lock, the results are advisory only. | ||
2019 | */ | ||
2020 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2021 | { | ||
2022 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
2023 | |||
2024 | return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; | ||
2025 | } | ||
2026 | |||
2027 | /* | ||
2028 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | ||
2029 | * grace period. | ||
2030 | */ | ||
2031 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
2032 | { | ||
2033 | wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); | ||
2034 | } | ||
2035 | |||
2036 | /* | ||
2037 | * Set the root rcu_node structure's ->need_future_gp field | ||
2038 | * based on the sum of those of all rcu_node structures. This does | ||
2039 | * double-count the root rcu_node structure's requests, but this | ||
2040 | * is necessary to handle the possibility of a rcu_nocb_kthread() | ||
2041 | * having awakened during the time that the rcu_node structures | ||
2042 | * were being updated for the end of the previous grace period. | ||
2043 | */ | ||
2044 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | ||
2045 | { | ||
2046 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; | ||
2047 | } | ||
2048 | |||
2049 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
2050 | { | ||
2051 | init_waitqueue_head(&rnp->nocb_gp_wq[0]); | ||
2052 | init_waitqueue_head(&rnp->nocb_gp_wq[1]); | ||
2053 | } | ||
2054 | |||
2168 | /* Is the specified CPU a no-CPUs CPU? */ | 2055 | /* Is the specified CPU a no-CPUs CPU? */ |
2169 | static bool is_nocb_cpu(int cpu) | 2056 | bool rcu_is_nocb_cpu(int cpu) |
2170 | { | 2057 | { |
2171 | if (have_rcu_nocb_mask) | 2058 | if (have_rcu_nocb_mask) |
2172 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | 2059 | return cpumask_test_cpu(cpu, rcu_nocb_mask); |
@@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
2224 | bool lazy) | 2111 | bool lazy) |
2225 | { | 2112 | { |
2226 | 2113 | ||
2227 | if (!is_nocb_cpu(rdp->cpu)) | 2114 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
2228 | return 0; | 2115 | return 0; |
2229 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | 2116 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); |
2117 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | ||
2118 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | ||
2119 | (unsigned long)rhp->func, | ||
2120 | rdp->qlen_lazy, rdp->qlen); | ||
2121 | else | ||
2122 | trace_rcu_callback(rdp->rsp->name, rhp, | ||
2123 | rdp->qlen_lazy, rdp->qlen); | ||
2230 | return 1; | 2124 | return 1; |
2231 | } | 2125 | } |
2232 | 2126 | ||
@@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
2241 | long qll = rsp->qlen_lazy; | 2135 | long qll = rsp->qlen_lazy; |
2242 | 2136 | ||
2243 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | 2137 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ |
2244 | if (!is_nocb_cpu(smp_processor_id())) | 2138 | if (!rcu_is_nocb_cpu(smp_processor_id())) |
2245 | return 0; | 2139 | return 0; |
2246 | rsp->qlen = 0; | 2140 | rsp->qlen = 0; |
2247 | rsp->qlen_lazy = 0; | 2141 | rsp->qlen_lazy = 0; |
@@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
2265 | } | 2159 | } |
2266 | 2160 | ||
2267 | /* | 2161 | /* |
2268 | * There must be at least one non-no-CBs CPU in operation at any given | 2162 | * If necessary, kick off a new grace period, and either way wait |
2269 | * time, because no-CBs CPUs are not capable of initiating grace periods | 2163 | * for a subsequent grace period to complete. |
2270 | * independently. This function therefore complains if the specified | ||
2271 | * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to | ||
2272 | * avoid offlining the last such CPU. (Recursion is a wonderful thing, | ||
2273 | * but you have to have a base case!) | ||
2274 | */ | 2164 | */ |
2275 | static bool nocb_cpu_expendable(int cpu) | 2165 | static void rcu_nocb_wait_gp(struct rcu_data *rdp) |
2276 | { | 2166 | { |
2277 | cpumask_var_t non_nocb_cpus; | 2167 | unsigned long c; |
2278 | int ret; | 2168 | bool d; |
2169 | unsigned long flags; | ||
2170 | struct rcu_node *rnp = rdp->mynode; | ||
2171 | |||
2172 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
2173 | c = rcu_start_future_gp(rnp, rdp); | ||
2174 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2279 | 2175 | ||
2280 | /* | 2176 | /* |
2281 | * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, | 2177 | * Wait for the grace period. Do so interruptibly to avoid messing |
2282 | * then offlining this CPU is harmless. Let it happen. | 2178 | * up the load average. |
2283 | */ | 2179 | */ |
2284 | if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) | 2180 | trace_rcu_future_gp(rnp, rdp, c, "StartWait"); |
2285 | return 1; | 2181 | for (;;) { |
2286 | 2182 | wait_event_interruptible( | |
2287 | /* If no memory, play it safe and keep the CPU around. */ | 2183 | rnp->nocb_gp_wq[c & 0x1], |
2288 | if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) | 2184 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); |
2289 | return 0; | 2185 | if (likely(d)) |
2290 | cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); | 2186 | break; |
2291 | cpumask_clear_cpu(cpu, non_nocb_cpus); | 2187 | flush_signals(current); |
2292 | ret = !cpumask_empty(non_nocb_cpus); | 2188 | trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); |
2293 | free_cpumask_var(non_nocb_cpus); | 2189 | } |
2294 | return ret; | 2190 | trace_rcu_future_gp(rnp, rdp, c, "EndWait"); |
2295 | } | 2191 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ |
2296 | |||
2297 | /* | ||
2298 | * Helper structure for remote registry of RCU callbacks. | ||
2299 | * This is needed for when a no-CBs CPU needs to start a grace period. | ||
2300 | * If it just invokes call_rcu(), the resulting callback will be queued, | ||
2301 | * which can result in deadlock. | ||
2302 | */ | ||
2303 | struct rcu_head_remote { | ||
2304 | struct rcu_head *rhp; | ||
2305 | call_rcu_func_t *crf; | ||
2306 | void (*func)(struct rcu_head *rhp); | ||
2307 | }; | ||
2308 | |||
2309 | /* | ||
2310 | * Register a callback as specified by the rcu_head_remote struct. | ||
2311 | * This function is intended to be invoked via smp_call_function_single(). | ||
2312 | */ | ||
2313 | static void call_rcu_local(void *arg) | ||
2314 | { | ||
2315 | struct rcu_head_remote *rhrp = | ||
2316 | container_of(arg, struct rcu_head_remote, rhp); | ||
2317 | |||
2318 | rhrp->crf(rhrp->rhp, rhrp->func); | ||
2319 | } | ||
2320 | |||
2321 | /* | ||
2322 | * Set up an rcu_head_remote structure and the invoke call_rcu_local() | ||
2323 | * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via | ||
2324 | * smp_call_function_single(). | ||
2325 | */ | ||
2326 | static void invoke_crf_remote(struct rcu_head *rhp, | ||
2327 | void (*func)(struct rcu_head *rhp), | ||
2328 | call_rcu_func_t crf) | ||
2329 | { | ||
2330 | struct rcu_head_remote rhr; | ||
2331 | |||
2332 | rhr.rhp = rhp; | ||
2333 | rhr.crf = crf; | ||
2334 | rhr.func = func; | ||
2335 | smp_call_function_single(0, call_rcu_local, &rhr, 1); | ||
2336 | } | ||
2337 | |||
2338 | /* | ||
2339 | * Helper functions to be passed to wait_rcu_gp(), each of which | ||
2340 | * invokes invoke_crf_remote() to register a callback appropriately. | ||
2341 | */ | ||
2342 | static void __maybe_unused | ||
2343 | call_rcu_preempt_remote(struct rcu_head *rhp, | ||
2344 | void (*func)(struct rcu_head *rhp)) | ||
2345 | { | ||
2346 | invoke_crf_remote(rhp, func, call_rcu); | ||
2347 | } | ||
2348 | static void call_rcu_bh_remote(struct rcu_head *rhp, | ||
2349 | void (*func)(struct rcu_head *rhp)) | ||
2350 | { | ||
2351 | invoke_crf_remote(rhp, func, call_rcu_bh); | ||
2352 | } | ||
2353 | static void call_rcu_sched_remote(struct rcu_head *rhp, | ||
2354 | void (*func)(struct rcu_head *rhp)) | ||
2355 | { | ||
2356 | invoke_crf_remote(rhp, func, call_rcu_sched); | ||
2357 | } | 2192 | } |
2358 | 2193 | ||
2359 | /* | 2194 | /* |
@@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg) | |||
2390 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | 2225 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); |
2391 | ACCESS_ONCE(rdp->nocb_p_count) += c; | 2226 | ACCESS_ONCE(rdp->nocb_p_count) += c; |
2392 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | 2227 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; |
2393 | wait_rcu_gp(rdp->rsp->call_remote); | 2228 | rcu_nocb_wait_gp(rdp); |
2394 | 2229 | ||
2395 | /* Each pass through the following loop invokes a callback. */ | 2230 | /* Each pass through the following loop invokes a callback. */ |
2396 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | 2231 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); |
@@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
2436 | return; | 2271 | return; |
2437 | for_each_cpu(cpu, rcu_nocb_mask) { | 2272 | for_each_cpu(cpu, rcu_nocb_mask) { |
2438 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2273 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2439 | t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); | 2274 | t = kthread_run(rcu_nocb_kthread, rdp, |
2275 | "rcuo%c/%d", rsp->abbr, cpu); | ||
2440 | BUG_ON(IS_ERR(t)); | 2276 | BUG_ON(IS_ERR(t)); |
2441 | ACCESS_ONCE(rdp->nocb_kthread) = t; | 2277 | ACCESS_ONCE(rdp->nocb_kthread) = t; |
2442 | } | 2278 | } |
2443 | } | 2279 | } |
2444 | 2280 | ||
2445 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | 2281 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ |
2446 | static void init_nocb_callback_list(struct rcu_data *rdp) | 2282 | static bool init_nocb_callback_list(struct rcu_data *rdp) |
2447 | { | 2283 | { |
2448 | if (rcu_nocb_mask == NULL || | 2284 | if (rcu_nocb_mask == NULL || |
2449 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | 2285 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) |
2450 | return; | 2286 | return false; |
2451 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2287 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; |
2288 | return true; | ||
2452 | } | 2289 | } |
2453 | 2290 | ||
2454 | /* Initialize the ->call_remote fields in the rcu_state structures. */ | 2291 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
2455 | static void __init rcu_init_nocb(void) | 2292 | |
2293 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2456 | { | 2294 | { |
2457 | #ifdef CONFIG_PREEMPT_RCU | 2295 | return 0; |
2458 | rcu_preempt_state.call_remote = call_rcu_preempt_remote; | ||
2459 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2460 | rcu_bh_state.call_remote = call_rcu_bh_remote; | ||
2461 | rcu_sched_state.call_remote = call_rcu_sched_remote; | ||
2462 | } | 2296 | } |
2463 | 2297 | ||
2464 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 2298 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
2299 | { | ||
2300 | } | ||
2465 | 2301 | ||
2466 | static bool is_nocb_cpu(int cpu) | 2302 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) |
2303 | { | ||
2304 | } | ||
2305 | |||
2306 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
2467 | { | 2307 | { |
2468 | return false; | ||
2469 | } | 2308 | } |
2470 | 2309 | ||
2471 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 2310 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
@@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
2480 | return 0; | 2319 | return 0; |
2481 | } | 2320 | } |
2482 | 2321 | ||
2483 | static bool nocb_cpu_expendable(int cpu) | ||
2484 | { | ||
2485 | return 1; | ||
2486 | } | ||
2487 | |||
2488 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2322 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
2489 | { | 2323 | { |
2490 | } | 2324 | } |
@@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
2493 | { | 2327 | { |
2494 | } | 2328 | } |
2495 | 2329 | ||
2496 | static void init_nocb_callback_list(struct rcu_data *rdp) | 2330 | static bool init_nocb_callback_list(struct rcu_data *rdp) |
2497 | { | 2331 | { |
2332 | return false; | ||
2498 | } | 2333 | } |
2499 | 2334 | ||
2500 | static void __init rcu_init_nocb(void) | 2335 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ |
2336 | |||
2337 | /* | ||
2338 | * An adaptive-ticks CPU can potentially execute in kernel mode for an | ||
2339 | * arbitrarily long period of time with the scheduling-clock tick turned | ||
2340 | * off. RCU will be paying attention to this CPU because it is in the | ||
2341 | * kernel, but the CPU cannot be guaranteed to be executing the RCU state | ||
2342 | * machine because the scheduling-clock tick has been disabled. Therefore, | ||
2343 | * if an adaptive-ticks CPU is failing to respond to the current grace | ||
2344 | * period and has not been idle from an RCU perspective, kick it. | ||
2345 | */ | ||
2346 | static void rcu_kick_nohz_cpu(int cpu) | ||
2501 | { | 2347 | { |
2348 | #ifdef CONFIG_NO_HZ_FULL | ||
2349 | if (tick_nohz_full_cpu(cpu)) | ||
2350 | smp_send_reschedule(cpu); | ||
2351 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | ||
2502 | } | 2352 | } |
2503 | |||
2504 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dcaa670..cf6c17412932 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,8 +46,6 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | #define ulong2long(a) (*(long *)(&(a))) | ||
50 | |||
51 | static int r_open(struct inode *inode, struct file *file, | 49 | static int r_open(struct inode *inode, struct file *file, |
52 | const struct seq_operations *op) | 50 | const struct seq_operations *op) |
53 | { | 51 | { |
@@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = { | |||
97 | .open = rcubarrier_open, | 95 | .open = rcubarrier_open, |
98 | .read = seq_read, | 96 | .read = seq_read, |
99 | .llseek = no_llseek, | 97 | .llseek = no_llseek, |
100 | .release = seq_release, | 98 | .release = single_release, |
101 | }; | 99 | }; |
102 | 100 | ||
103 | #ifdef CONFIG_RCU_BOOST | 101 | #ifdef CONFIG_RCU_BOOST |
@@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = { | |||
208 | .open = rcuexp_open, | 206 | .open = rcuexp_open, |
209 | .read = seq_read, | 207 | .read = seq_read, |
210 | .llseek = no_llseek, | 208 | .llseek = no_llseek, |
211 | .release = seq_release, | 209 | .release = single_release, |
212 | }; | 210 | }; |
213 | 211 | ||
214 | #ifdef CONFIG_RCU_BOOST | 212 | #ifdef CONFIG_RCU_BOOST |
@@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = { | |||
308 | .open = rcuhier_open, | 306 | .open = rcuhier_open, |
309 | .read = seq_read, | 307 | .read = seq_read, |
310 | .llseek = no_llseek, | 308 | .llseek = no_llseek, |
311 | .release = seq_release, | 309 | .release = single_release, |
312 | }; | 310 | }; |
313 | 311 | ||
314 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | 312 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) |
@@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = { | |||
350 | .open = rcugp_open, | 348 | .open = rcugp_open, |
351 | .read = seq_read, | 349 | .read = seq_read, |
352 | .llseek = no_llseek, | 350 | .llseek = no_llseek, |
353 | .release = seq_release, | 351 | .release = single_release, |
354 | }; | 352 | }; |
355 | 353 | ||
356 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | 354 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) |
diff --git a/kernel/relay.c b/kernel/relay.c index 01ab081ac53a..b91488ba2e5a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf) | |||
234 | static void relay_remove_buf(struct kref *kref) | 234 | static void relay_remove_buf(struct kref *kref) |
235 | { | 235 | { |
236 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); | 236 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); |
237 | buf->chan->cb->remove_buf_file(buf->dentry); | ||
238 | relay_destroy_buf(buf); | 237 | relay_destroy_buf(buf); |
239 | } | 238 | } |
240 | 239 | ||
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf) | |||
484 | { | 483 | { |
485 | buf->finalized = 1; | 484 | buf->finalized = 1; |
486 | del_timer_sync(&buf->timer); | 485 | del_timer_sync(&buf->timer); |
486 | buf->chan->cb->remove_buf_file(buf->dentry); | ||
487 | kref_put(&buf->kref, relay_remove_buf); | 487 | kref_put(&buf->kref, relay_remove_buf); |
488 | } | 488 | } |
489 | 489 | ||
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename, | |||
588 | chan->version = RELAYFS_CHANNEL_VERSION; | 588 | chan->version = RELAYFS_CHANNEL_VERSION; |
589 | chan->n_subbufs = n_subbufs; | 589 | chan->n_subbufs = n_subbufs; |
590 | chan->subbuf_size = subbuf_size; | 590 | chan->subbuf_size = subbuf_size; |
591 | chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); | 591 | chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs); |
592 | chan->parent = parent; | 592 | chan->parent = parent; |
593 | chan->private_data = private_data; | 593 | chan->private_data = private_data; |
594 | if (base_filename) { | 594 | if (base_filename) { |
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, | |||
1099 | static int subbuf_read_actor(size_t read_start, | 1099 | static int subbuf_read_actor(size_t read_start, |
1100 | struct rchan_buf *buf, | 1100 | struct rchan_buf *buf, |
1101 | size_t avail, | 1101 | size_t avail, |
1102 | read_descriptor_t *desc, | 1102 | read_descriptor_t *desc) |
1103 | read_actor_t actor) | ||
1104 | { | 1103 | { |
1105 | void *from; | 1104 | void *from; |
1106 | int ret = 0; | 1105 | int ret = 0; |
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start, | |||
1121 | typedef int (*subbuf_actor_t) (size_t read_start, | 1120 | typedef int (*subbuf_actor_t) (size_t read_start, |
1122 | struct rchan_buf *buf, | 1121 | struct rchan_buf *buf, |
1123 | size_t avail, | 1122 | size_t avail, |
1124 | read_descriptor_t *desc, | 1123 | read_descriptor_t *desc); |
1125 | read_actor_t actor); | ||
1126 | 1124 | ||
1127 | /* | 1125 | /* |
1128 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | 1126 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries |
1129 | */ | 1127 | */ |
1130 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | 1128 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, |
1131 | subbuf_actor_t subbuf_actor, | 1129 | subbuf_actor_t subbuf_actor, |
1132 | read_actor_t actor, | ||
1133 | read_descriptor_t *desc) | 1130 | read_descriptor_t *desc) |
1134 | { | 1131 | { |
1135 | struct rchan_buf *buf = filp->private_data; | 1132 | struct rchan_buf *buf = filp->private_data; |
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
1150 | break; | 1147 | break; |
1151 | 1148 | ||
1152 | avail = min(desc->count, avail); | 1149 | avail = min(desc->count, avail); |
1153 | ret = subbuf_actor(read_start, buf, avail, desc, actor); | 1150 | ret = subbuf_actor(read_start, buf, avail, desc); |
1154 | if (desc->error < 0) | 1151 | if (desc->error < 0) |
1155 | break; | 1152 | break; |
1156 | 1153 | ||
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp, | |||
1174 | desc.count = count; | 1171 | desc.count = count; |
1175 | desc.arg.buf = buffer; | 1172 | desc.arg.buf = buffer; |
1176 | desc.error = 0; | 1173 | desc.error = 0; |
1177 | return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, | 1174 | return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); |
1178 | NULL, &desc); | ||
1179 | } | 1175 | } |
1180 | 1176 | ||
1181 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) | 1177 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) |
diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b9..d7386986e10e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
22 | #include <linux/device.h> | 22 | #include <linux/device.h> |
23 | #include <linux/pfn.h> | 23 | #include <linux/pfn.h> |
24 | #include <linux/mm.h> | ||
24 | #include <asm/io.h> | 25 | #include <asm/io.h> |
25 | 26 | ||
26 | 27 | ||
@@ -50,6 +51,14 @@ struct resource_constraint { | |||
50 | 51 | ||
51 | static DEFINE_RWLOCK(resource_lock); | 52 | static DEFINE_RWLOCK(resource_lock); |
52 | 53 | ||
54 | /* | ||
55 | * For memory hotplug, there is no way to free resource entries allocated | ||
56 | * by boot mem after the system is up. So for reusing the resource entry | ||
57 | * we need to remember the resource. | ||
58 | */ | ||
59 | static struct resource *bootmem_resource_free; | ||
60 | static DEFINE_SPINLOCK(bootmem_resource_lock); | ||
61 | |||
53 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 62 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
54 | { | 63 | { |
55 | struct resource *p = v; | 64 | struct resource *p = v; |
@@ -151,6 +160,40 @@ __initcall(ioresources_init); | |||
151 | 160 | ||
152 | #endif /* CONFIG_PROC_FS */ | 161 | #endif /* CONFIG_PROC_FS */ |
153 | 162 | ||
163 | static void free_resource(struct resource *res) | ||
164 | { | ||
165 | if (!res) | ||
166 | return; | ||
167 | |||
168 | if (!PageSlab(virt_to_head_page(res))) { | ||
169 | spin_lock(&bootmem_resource_lock); | ||
170 | res->sibling = bootmem_resource_free; | ||
171 | bootmem_resource_free = res; | ||
172 | spin_unlock(&bootmem_resource_lock); | ||
173 | } else { | ||
174 | kfree(res); | ||
175 | } | ||
176 | } | ||
177 | |||
178 | static struct resource *alloc_resource(gfp_t flags) | ||
179 | { | ||
180 | struct resource *res = NULL; | ||
181 | |||
182 | spin_lock(&bootmem_resource_lock); | ||
183 | if (bootmem_resource_free) { | ||
184 | res = bootmem_resource_free; | ||
185 | bootmem_resource_free = res->sibling; | ||
186 | } | ||
187 | spin_unlock(&bootmem_resource_lock); | ||
188 | |||
189 | if (res) | ||
190 | memset(res, 0, sizeof(struct resource)); | ||
191 | else | ||
192 | res = kzalloc(sizeof(struct resource), flags); | ||
193 | |||
194 | return res; | ||
195 | } | ||
196 | |||
154 | /* Return the conflict entry if you can't request it */ | 197 | /* Return the conflict entry if you can't request it */ |
155 | static struct resource * __request_resource(struct resource *root, struct resource *new) | 198 | static struct resource * __request_resource(struct resource *root, struct resource *new) |
156 | { | 199 | { |
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) | |||
706 | write_unlock(&resource_lock); | 749 | write_unlock(&resource_lock); |
707 | } | 750 | } |
708 | 751 | ||
709 | /** | 752 | static int __adjust_resource(struct resource *res, resource_size_t start, |
710 | * adjust_resource - modify a resource's start and size | 753 | resource_size_t size) |
711 | * @res: resource to modify | ||
712 | * @start: new start value | ||
713 | * @size: new size | ||
714 | * | ||
715 | * Given an existing resource, change its start and size to match the | ||
716 | * arguments. Returns 0 on success, -EBUSY if it can't fit. | ||
717 | * Existing children of the resource are assumed to be immutable. | ||
718 | */ | ||
719 | int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) | ||
720 | { | 754 | { |
721 | struct resource *tmp, *parent = res->parent; | 755 | struct resource *tmp, *parent = res->parent; |
722 | resource_size_t end = start + size - 1; | 756 | resource_size_t end = start + size - 1; |
723 | int result = -EBUSY; | 757 | int result = -EBUSY; |
724 | 758 | ||
725 | write_lock(&resource_lock); | ||
726 | |||
727 | if (!parent) | 759 | if (!parent) |
728 | goto skip; | 760 | goto skip; |
729 | 761 | ||
@@ -751,6 +783,26 @@ skip: | |||
751 | result = 0; | 783 | result = 0; |
752 | 784 | ||
753 | out: | 785 | out: |
786 | return result; | ||
787 | } | ||
788 | |||
789 | /** | ||
790 | * adjust_resource - modify a resource's start and size | ||
791 | * @res: resource to modify | ||
792 | * @start: new start value | ||
793 | * @size: new size | ||
794 | * | ||
795 | * Given an existing resource, change its start and size to match the | ||
796 | * arguments. Returns 0 on success, -EBUSY if it can't fit. | ||
797 | * Existing children of the resource are assumed to be immutable. | ||
798 | */ | ||
799 | int adjust_resource(struct resource *res, resource_size_t start, | ||
800 | resource_size_t size) | ||
801 | { | ||
802 | int result; | ||
803 | |||
804 | write_lock(&resource_lock); | ||
805 | result = __adjust_resource(res, start, size); | ||
754 | write_unlock(&resource_lock); | 806 | write_unlock(&resource_lock); |
755 | return result; | 807 | return result; |
756 | } | 808 | } |
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
762 | { | 814 | { |
763 | struct resource *parent = root; | 815 | struct resource *parent = root; |
764 | struct resource *conflict; | 816 | struct resource *conflict; |
765 | struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); | 817 | struct resource *res = alloc_resource(GFP_ATOMIC); |
766 | struct resource *next_res = NULL; | 818 | struct resource *next_res = NULL; |
767 | 819 | ||
768 | if (!res) | 820 | if (!res) |
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
787 | /* conflict covered whole area */ | 839 | /* conflict covered whole area */ |
788 | if (conflict->start <= res->start && | 840 | if (conflict->start <= res->start && |
789 | conflict->end >= res->end) { | 841 | conflict->end >= res->end) { |
790 | kfree(res); | 842 | free_resource(res); |
791 | WARN_ON(next_res); | 843 | WARN_ON(next_res); |
792 | break; | 844 | break; |
793 | } | 845 | } |
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
797 | end = res->end; | 849 | end = res->end; |
798 | res->end = conflict->start - 1; | 850 | res->end = conflict->start - 1; |
799 | if (conflict->end < end) { | 851 | if (conflict->end < end) { |
800 | next_res = kzalloc(sizeof(*next_res), | 852 | next_res = alloc_resource(GFP_ATOMIC); |
801 | GFP_ATOMIC); | ||
802 | if (!next_res) { | 853 | if (!next_res) { |
803 | kfree(res); | 854 | free_resource(res); |
804 | break; | 855 | break; |
805 | } | 856 | } |
806 | next_res->name = name; | 857 | next_res->name = name; |
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent, | |||
890 | const char *name, int flags) | 941 | const char *name, int flags) |
891 | { | 942 | { |
892 | DECLARE_WAITQUEUE(wait, current); | 943 | DECLARE_WAITQUEUE(wait, current); |
893 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); | 944 | struct resource *res = alloc_resource(GFP_KERNEL); |
894 | 945 | ||
895 | if (!res) | 946 | if (!res) |
896 | return NULL; | 947 | return NULL; |
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent, | |||
924 | continue; | 975 | continue; |
925 | } | 976 | } |
926 | /* Uhhuh, that didn't work out.. */ | 977 | /* Uhhuh, that didn't work out.. */ |
927 | kfree(res); | 978 | free_resource(res); |
928 | res = NULL; | 979 | res = NULL; |
929 | break; | 980 | break; |
930 | } | 981 | } |
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, | |||
958 | return -EBUSY; | 1009 | return -EBUSY; |
959 | 1010 | ||
960 | release_resource(res); | 1011 | release_resource(res); |
961 | kfree(res); | 1012 | free_resource(res); |
962 | return 0; | 1013 | return 0; |
963 | } | 1014 | } |
964 | EXPORT_SYMBOL(__check_region); | 1015 | EXPORT_SYMBOL(__check_region); |
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, | |||
998 | write_unlock(&resource_lock); | 1049 | write_unlock(&resource_lock); |
999 | if (res->flags & IORESOURCE_MUXED) | 1050 | if (res->flags & IORESOURCE_MUXED) |
1000 | wake_up(&muxed_resource_wait); | 1051 | wake_up(&muxed_resource_wait); |
1001 | kfree(res); | 1052 | free_resource(res); |
1002 | return; | 1053 | return; |
1003 | } | 1054 | } |
1004 | p = &res->sibling; | 1055 | p = &res->sibling; |
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start, | |||
1012 | } | 1063 | } |
1013 | EXPORT_SYMBOL(__release_region); | 1064 | EXPORT_SYMBOL(__release_region); |
1014 | 1065 | ||
1066 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
1067 | /** | ||
1068 | * release_mem_region_adjustable - release a previously reserved memory region | ||
1069 | * @parent: parent resource descriptor | ||
1070 | * @start: resource start address | ||
1071 | * @size: resource region size | ||
1072 | * | ||
1073 | * This interface is intended for memory hot-delete. The requested region | ||
1074 | * is released from a currently busy memory resource. The requested region | ||
1075 | * must either match exactly or fit into a single busy resource entry. In | ||
1076 | * the latter case, the remaining resource is adjusted accordingly. | ||
1077 | * Existing children of the busy memory resource must be immutable in the | ||
1078 | * request. | ||
1079 | * | ||
1080 | * Note: | ||
1081 | * - Additional release conditions, such as overlapping region, can be | ||
1082 | * supported after they are confirmed as valid cases. | ||
1083 | * - When a busy memory resource gets split into two entries, the code | ||
1084 | * assumes that all children remain in the lower address entry for | ||
1085 | * simplicity. Enhance this logic when necessary. | ||
1086 | */ | ||
1087 | int release_mem_region_adjustable(struct resource *parent, | ||
1088 | resource_size_t start, resource_size_t size) | ||
1089 | { | ||
1090 | struct resource **p; | ||
1091 | struct resource *res; | ||
1092 | struct resource *new_res; | ||
1093 | resource_size_t end; | ||
1094 | int ret = -EINVAL; | ||
1095 | |||
1096 | end = start + size - 1; | ||
1097 | if ((start < parent->start) || (end > parent->end)) | ||
1098 | return ret; | ||
1099 | |||
1100 | /* The alloc_resource() result gets checked later */ | ||
1101 | new_res = alloc_resource(GFP_KERNEL); | ||
1102 | |||
1103 | p = &parent->child; | ||
1104 | write_lock(&resource_lock); | ||
1105 | |||
1106 | while ((res = *p)) { | ||
1107 | if (res->start >= end) | ||
1108 | break; | ||
1109 | |||
1110 | /* look for the next resource if the range does not fit into this one */ | ||
1111 | if (res->start > start || res->end < end) { | ||
1112 | p = &res->sibling; | ||
1113 | continue; | ||
1114 | } | ||
1115 | |||
1116 | if (!(res->flags & IORESOURCE_MEM)) | ||
1117 | break; | ||
1118 | |||
1119 | if (!(res->flags & IORESOURCE_BUSY)) { | ||
1120 | p = &res->child; | ||
1121 | continue; | ||
1122 | } | ||
1123 | |||
1124 | /* found the target resource; let's adjust accordingly */ | ||
1125 | if (res->start == start && res->end == end) { | ||
1126 | /* free the whole entry */ | ||
1127 | *p = res->sibling; | ||
1128 | free_resource(res); | ||
1129 | ret = 0; | ||
1130 | } else if (res->start == start && res->end != end) { | ||
1131 | /* adjust the start */ | ||
1132 | ret = __adjust_resource(res, end + 1, | ||
1133 | res->end - end); | ||
1134 | } else if (res->start != start && res->end == end) { | ||
1135 | /* adjust the end */ | ||
1136 | ret = __adjust_resource(res, res->start, | ||
1137 | start - res->start); | ||
1138 | } else { | ||
1139 | /* split into two entries */ | ||
1140 | if (!new_res) { | ||
1141 | ret = -ENOMEM; | ||
1142 | break; | ||
1143 | } | ||
1144 | new_res->name = res->name; | ||
1145 | new_res->start = end + 1; | ||
1146 | new_res->end = res->end; | ||
1147 | new_res->flags = res->flags; | ||
1148 | new_res->parent = res->parent; | ||
1149 | new_res->sibling = res->sibling; | ||
1150 | new_res->child = NULL; | ||
1151 | |||
1152 | ret = __adjust_resource(res, res->start, | ||
1153 | start - res->start); | ||
1154 | if (ret) | ||
1155 | break; | ||
1156 | res->sibling = new_res; | ||
1157 | new_res = NULL; | ||
1158 | } | ||
1159 | |||
1160 | break; | ||
1161 | } | ||
1162 | |||
1163 | write_unlock(&resource_lock); | ||
1164 | free_resource(new_res); | ||
1165 | return ret; | ||
1166 | } | ||
1167 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
1168 | |||
1015 | /* | 1169 | /* |
1016 | * Managed region resource | 1170 | * Managed region resource |
1017 | */ | 1171 | */ |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 7890b10084a7..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
15 | #include <linux/timer.h> | 15 | #include <linux/timer.h> |
16 | #include <linux/freezer.h> | 16 | #include <linux/freezer.h> |
17 | #include <linux/stat.h> | ||
17 | 18 | ||
18 | #include "rtmutex.h" | 19 | #include "rtmutex.h" |
19 | 20 | ||
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at | |||
366 | return curr - buf; | 367 | return curr - buf; |
367 | } | 368 | } |
368 | 369 | ||
369 | static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); | 370 | static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); |
370 | static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); | 371 | static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); |
371 | 372 | ||
372 | static struct bus_type rttest_subsys = { | 373 | static struct bus_type rttest_subsys = { |
373 | .name = "rttest", | 374 | .name = "rttest", |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b3c6c3fcd847..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) | |||
126 | 126 | ||
127 | EXPORT_SYMBOL(_down_write_nest_lock); | 127 | EXPORT_SYMBOL(_down_write_nest_lock); |
128 | 128 | ||
129 | void down_read_non_owner(struct rw_semaphore *sem) | ||
130 | { | ||
131 | might_sleep(); | ||
132 | |||
133 | __down_read(sem); | ||
134 | } | ||
135 | |||
136 | EXPORT_SYMBOL(down_read_non_owner); | ||
137 | |||
129 | void down_write_nested(struct rw_semaphore *sem, int subclass) | 138 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
130 | { | 139 | { |
131 | might_sleep(); | 140 | might_sleep(); |
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
136 | 145 | ||
137 | EXPORT_SYMBOL(down_write_nested); | 146 | EXPORT_SYMBOL(down_write_nested); |
138 | 147 | ||
148 | void up_read_non_owner(struct rw_semaphore *sem) | ||
149 | { | ||
150 | __up_read(sem); | ||
151 | } | ||
152 | |||
153 | EXPORT_SYMBOL(up_read_non_owner); | ||
154 | |||
139 | #endif | 155 | #endif |
140 | 156 | ||
141 | 157 | ||
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o | ||
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492df..c3ae1446461c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) | |||
176 | u64 this_clock, remote_clock; | 176 | u64 this_clock, remote_clock; |
177 | u64 *ptr, old_val, val; | 177 | u64 *ptr, old_val, val; |
178 | 178 | ||
179 | #if BITS_PER_LONG != 64 | ||
180 | again: | ||
181 | /* | ||
182 | * Careful here: The local and the remote clock values need to | ||
183 | * be read out atomic as we need to compare the values and | ||
184 | * then update either the local or the remote side. So the | ||
185 | * cmpxchg64 below only protects one readout. | ||
186 | * | ||
187 | * We must reread via sched_clock_local() in the retry case on | ||
188 | * 32bit as an NMI could use sched_clock_local() via the | ||
189 | * tracer and hit between the readout of | ||
190 | * the low32bit and the high 32bit portion. | ||
191 | */ | ||
192 | this_clock = sched_clock_local(my_scd); | ||
193 | /* | ||
194 | * We must enforce atomic readout on 32bit, otherwise the | ||
195 | * update on the remote cpu can hit inbetween the readout of | ||
196 | * the low32bit and the high 32bit portion. | ||
197 | */ | ||
198 | remote_clock = cmpxchg64(&scd->clock, 0, 0); | ||
199 | #else | ||
200 | /* | ||
201 | * On 64bit the read of [my]scd->clock is atomic versus the | ||
202 | * update, so we can avoid the above 32bit dance. | ||
203 | */ | ||
179 | sched_clock_local(my_scd); | 204 | sched_clock_local(my_scd); |
180 | again: | 205 | again: |
181 | this_clock = my_scd->clock; | 206 | this_clock = my_scd->clock; |
182 | remote_clock = scd->clock; | 207 | remote_clock = scd->clock; |
208 | #endif | ||
183 | 209 | ||
184 | /* | 210 | /* |
185 | * Use the opportunity that we have both locks | 211 | * Use the opportunity that we have both locks |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..58453b8272fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -512,11 +512,6 @@ static inline void init_hrtick(void) | |||
512 | * the target CPU. | 512 | * the target CPU. |
513 | */ | 513 | */ |
514 | #ifdef CONFIG_SMP | 514 | #ifdef CONFIG_SMP |
515 | |||
516 | #ifndef tsk_is_polling | ||
517 | #define tsk_is_polling(t) 0 | ||
518 | #endif | ||
519 | |||
520 | void resched_task(struct task_struct *p) | 515 | void resched_task(struct task_struct *p) |
521 | { | 516 | { |
522 | int cpu; | 517 | int cpu; |
@@ -549,7 +544,7 @@ void resched_cpu(int cpu) | |||
549 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 544 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
550 | } | 545 | } |
551 | 546 | ||
552 | #ifdef CONFIG_NO_HZ | 547 | #ifdef CONFIG_NO_HZ_COMMON |
553 | /* | 548 | /* |
554 | * In the semi idle case, use the nearest busy cpu for migrating timers | 549 | * In the semi idle case, use the nearest busy cpu for migrating timers |
555 | * from an idle cpu. This is good for power-savings. | 550 | * from an idle cpu. This is good for power-savings. |
@@ -587,7 +582,7 @@ unlock: | |||
587 | * account when the CPU goes back to idle and evaluates the timer | 582 | * account when the CPU goes back to idle and evaluates the timer |
588 | * wheel for the next timer event. | 583 | * wheel for the next timer event. |
589 | */ | 584 | */ |
590 | void wake_up_idle_cpu(int cpu) | 585 | static void wake_up_idle_cpu(int cpu) |
591 | { | 586 | { |
592 | struct rq *rq = cpu_rq(cpu); | 587 | struct rq *rq = cpu_rq(cpu); |
593 | 588 | ||
@@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu) | |||
617 | smp_send_reschedule(cpu); | 612 | smp_send_reschedule(cpu); |
618 | } | 613 | } |
619 | 614 | ||
615 | static bool wake_up_full_nohz_cpu(int cpu) | ||
616 | { | ||
617 | if (tick_nohz_full_cpu(cpu)) { | ||
618 | if (cpu != smp_processor_id() || | ||
619 | tick_nohz_tick_stopped()) | ||
620 | smp_send_reschedule(cpu); | ||
621 | return true; | ||
622 | } | ||
623 | |||
624 | return false; | ||
625 | } | ||
626 | |||
627 | void wake_up_nohz_cpu(int cpu) | ||
628 | { | ||
629 | if (!wake_up_full_nohz_cpu(cpu)) | ||
630 | wake_up_idle_cpu(cpu); | ||
631 | } | ||
632 | |||
620 | static inline bool got_nohz_idle_kick(void) | 633 | static inline bool got_nohz_idle_kick(void) |
621 | { | 634 | { |
622 | int cpu = smp_processor_id(); | 635 | int cpu = smp_processor_id(); |
623 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | 636 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); |
624 | } | 637 | } |
625 | 638 | ||
626 | #else /* CONFIG_NO_HZ */ | 639 | #else /* CONFIG_NO_HZ_COMMON */ |
627 | 640 | ||
628 | static inline bool got_nohz_idle_kick(void) | 641 | static inline bool got_nohz_idle_kick(void) |
629 | { | 642 | { |
630 | return false; | 643 | return false; |
631 | } | 644 | } |
632 | 645 | ||
633 | #endif /* CONFIG_NO_HZ */ | 646 | #endif /* CONFIG_NO_HZ_COMMON */ |
647 | |||
648 | #ifdef CONFIG_NO_HZ_FULL | ||
649 | bool sched_can_stop_tick(void) | ||
650 | { | ||
651 | struct rq *rq; | ||
652 | |||
653 | rq = this_rq(); | ||
654 | |||
655 | /* Make sure rq->nr_running update is visible after the IPI */ | ||
656 | smp_rmb(); | ||
657 | |||
658 | /* More than one running task need preemption */ | ||
659 | if (rq->nr_running > 1) | ||
660 | return false; | ||
661 | |||
662 | return true; | ||
663 | } | ||
664 | #endif /* CONFIG_NO_HZ_FULL */ | ||
634 | 665 | ||
635 | void sched_avg_update(struct rq *rq) | 666 | void sched_avg_update(struct rq *rq) |
636 | { | 667 | { |
@@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | |||
1288 | static void | 1319 | static void |
1289 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | 1320 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) |
1290 | { | 1321 | { |
1291 | trace_sched_wakeup(p, true); | ||
1292 | check_preempt_curr(rq, p, wake_flags); | 1322 | check_preempt_curr(rq, p, wake_flags); |
1323 | trace_sched_wakeup(p, true); | ||
1293 | 1324 | ||
1294 | p->state = TASK_RUNNING; | 1325 | p->state = TASK_RUNNING; |
1295 | #ifdef CONFIG_SMP | 1326 | #ifdef CONFIG_SMP |
@@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void) | |||
1362 | 1393 | ||
1363 | void scheduler_ipi(void) | 1394 | void scheduler_ipi(void) |
1364 | { | 1395 | { |
1365 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) | 1396 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() |
1397 | && !tick_nohz_full_cpu(smp_processor_id())) | ||
1366 | return; | 1398 | return; |
1367 | 1399 | ||
1368 | /* | 1400 | /* |
@@ -1379,6 +1411,7 @@ void scheduler_ipi(void) | |||
1379 | * somewhat pessimize the simple resched case. | 1411 | * somewhat pessimize the simple resched case. |
1380 | */ | 1412 | */ |
1381 | irq_enter(); | 1413 | irq_enter(); |
1414 | tick_nohz_full_check(); | ||
1382 | sched_ttwu_pending(); | 1415 | sched_ttwu_pending(); |
1383 | 1416 | ||
1384 | /* | 1417 | /* |
@@ -1498,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
1498 | { | 1531 | { |
1499 | struct rq *rq = task_rq(p); | 1532 | struct rq *rq = task_rq(p); |
1500 | 1533 | ||
1501 | BUG_ON(rq != this_rq()); | 1534 | if (WARN_ON_ONCE(rq != this_rq()) || |
1502 | BUG_ON(p == current); | 1535 | WARN_ON_ONCE(p == current)) |
1536 | return; | ||
1537 | |||
1503 | lockdep_assert_held(&rq->lock); | 1538 | lockdep_assert_held(&rq->lock); |
1504 | 1539 | ||
1505 | if (!raw_spin_trylock(&p->pi_lock)) { | 1540 | if (!raw_spin_trylock(&p->pi_lock)) { |
@@ -1858,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1858 | kprobe_flush_task(prev); | 1893 | kprobe_flush_task(prev); |
1859 | put_task_struct(prev); | 1894 | put_task_struct(prev); |
1860 | } | 1895 | } |
1896 | |||
1897 | tick_nohz_task_switch(current); | ||
1861 | } | 1898 | } |
1862 | 1899 | ||
1863 | #ifdef CONFIG_SMP | 1900 | #ifdef CONFIG_SMP |
@@ -2121,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
2121 | return load >> FSHIFT; | 2158 | return load >> FSHIFT; |
2122 | } | 2159 | } |
2123 | 2160 | ||
2124 | #ifdef CONFIG_NO_HZ | 2161 | #ifdef CONFIG_NO_HZ_COMMON |
2125 | /* | 2162 | /* |
2126 | * Handle NO_HZ for the global load-average. | 2163 | * Handle NO_HZ for the global load-average. |
2127 | * | 2164 | * |
@@ -2347,12 +2384,12 @@ static void calc_global_nohz(void) | |||
2347 | smp_wmb(); | 2384 | smp_wmb(); |
2348 | calc_load_idx++; | 2385 | calc_load_idx++; |
2349 | } | 2386 | } |
2350 | #else /* !CONFIG_NO_HZ */ | 2387 | #else /* !CONFIG_NO_HZ_COMMON */ |
2351 | 2388 | ||
2352 | static inline long calc_load_fold_idle(void) { return 0; } | 2389 | static inline long calc_load_fold_idle(void) { return 0; } |
2353 | static inline void calc_global_nohz(void) { } | 2390 | static inline void calc_global_nohz(void) { } |
2354 | 2391 | ||
2355 | #endif /* CONFIG_NO_HZ */ | 2392 | #endif /* CONFIG_NO_HZ_COMMON */ |
2356 | 2393 | ||
2357 | /* | 2394 | /* |
2358 | * calc_load - update the avenrun load estimates 10 ticks after the | 2395 | * calc_load - update the avenrun load estimates 10 ticks after the |
@@ -2512,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
2512 | sched_avg_update(this_rq); | 2549 | sched_avg_update(this_rq); |
2513 | } | 2550 | } |
2514 | 2551 | ||
2515 | #ifdef CONFIG_NO_HZ | 2552 | #ifdef CONFIG_NO_HZ_COMMON |
2516 | /* | 2553 | /* |
2517 | * There is no sane way to deal with nohz on smp when using jiffies because the | 2554 | * There is no sane way to deal with nohz on smp when using jiffies because the |
2518 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 2555 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading |
@@ -2572,7 +2609,7 @@ void update_cpu_load_nohz(void) | |||
2572 | } | 2609 | } |
2573 | raw_spin_unlock(&this_rq->lock); | 2610 | raw_spin_unlock(&this_rq->lock); |
2574 | } | 2611 | } |
2575 | #endif /* CONFIG_NO_HZ */ | 2612 | #endif /* CONFIG_NO_HZ_COMMON */ |
2576 | 2613 | ||
2577 | /* | 2614 | /* |
2578 | * Called from scheduler_tick() | 2615 | * Called from scheduler_tick() |
@@ -2699,8 +2736,35 @@ void scheduler_tick(void) | |||
2699 | rq->idle_balance = idle_cpu(cpu); | 2736 | rq->idle_balance = idle_cpu(cpu); |
2700 | trigger_load_balance(rq, cpu); | 2737 | trigger_load_balance(rq, cpu); |
2701 | #endif | 2738 | #endif |
2739 | rq_last_tick_reset(rq); | ||
2702 | } | 2740 | } |
2703 | 2741 | ||
2742 | #ifdef CONFIG_NO_HZ_FULL | ||
2743 | /** | ||
2744 | * scheduler_tick_max_deferment | ||
2745 | * | ||
2746 | * Keep at least one tick per second when a single | ||
2747 | * active task is running because the scheduler doesn't | ||
2748 | * yet completely support full dynticks environment. | ||
2749 | * | ||
2750 | * This makes sure that uptime, CFS vruntime, load | ||
2751 | * balancing, etc... continue to move forward, even | ||
2752 | * with a very low granularity. | ||
2753 | */ | ||
2754 | u64 scheduler_tick_max_deferment(void) | ||
2755 | { | ||
2756 | struct rq *rq = this_rq(); | ||
2757 | unsigned long next, now = ACCESS_ONCE(jiffies); | ||
2758 | |||
2759 | next = rq->last_sched_tick + HZ; | ||
2760 | |||
2761 | if (time_before_eq(next, now)) | ||
2762 | return 0; | ||
2763 | |||
2764 | return jiffies_to_usecs(next - now) * NSEC_PER_USEC; | ||
2765 | } | ||
2766 | #endif | ||
2767 | |||
2704 | notrace unsigned long get_parent_ip(unsigned long addr) | 2768 | notrace unsigned long get_parent_ip(unsigned long addr) |
2705 | { | 2769 | { |
2706 | if (in_lock_functions(addr)) { | 2770 | if (in_lock_functions(addr)) { |
@@ -2997,51 +3061,6 @@ void __sched schedule_preempt_disabled(void) | |||
2997 | preempt_disable(); | 3061 | preempt_disable(); |
2998 | } | 3062 | } |
2999 | 3063 | ||
3000 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
3001 | |||
3002 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
3003 | { | ||
3004 | if (lock->owner != owner) | ||
3005 | return false; | ||
3006 | |||
3007 | /* | ||
3008 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
3009 | * lock->owner still matches owner, if that fails, owner might | ||
3010 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
3011 | * ensures the memory stays valid. | ||
3012 | */ | ||
3013 | barrier(); | ||
3014 | |||
3015 | return owner->on_cpu; | ||
3016 | } | ||
3017 | |||
3018 | /* | ||
3019 | * Look out! "owner" is an entirely speculative pointer | ||
3020 | * access and not reliable. | ||
3021 | */ | ||
3022 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | ||
3023 | { | ||
3024 | if (!sched_feat(OWNER_SPIN)) | ||
3025 | return 0; | ||
3026 | |||
3027 | rcu_read_lock(); | ||
3028 | while (owner_running(lock, owner)) { | ||
3029 | if (need_resched()) | ||
3030 | break; | ||
3031 | |||
3032 | arch_mutex_cpu_relax(); | ||
3033 | } | ||
3034 | rcu_read_unlock(); | ||
3035 | |||
3036 | /* | ||
3037 | * We break out the loop above on need_resched() and when the | ||
3038 | * owner changed, which is a sign for heavy contention. Return | ||
3039 | * success only when lock->owner is NULL. | ||
3040 | */ | ||
3041 | return lock->owner == NULL; | ||
3042 | } | ||
3043 | #endif | ||
3044 | |||
3045 | #ifdef CONFIG_PREEMPT | 3064 | #ifdef CONFIG_PREEMPT |
3046 | /* | 3065 | /* |
3047 | * this is the entry point to schedule() from in-kernel preemption | 3066 | * this is the entry point to schedule() from in-kernel preemption |
@@ -3082,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule); | |||
3082 | asmlinkage void __sched preempt_schedule_irq(void) | 3101 | asmlinkage void __sched preempt_schedule_irq(void) |
3083 | { | 3102 | { |
3084 | struct thread_info *ti = current_thread_info(); | 3103 | struct thread_info *ti = current_thread_info(); |
3104 | enum ctx_state prev_state; | ||
3085 | 3105 | ||
3086 | /* Catch callers which need to be fixed */ | 3106 | /* Catch callers which need to be fixed */ |
3087 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3107 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3088 | 3108 | ||
3089 | user_exit(); | 3109 | prev_state = exception_enter(); |
3110 | |||
3090 | do { | 3111 | do { |
3091 | add_preempt_count(PREEMPT_ACTIVE); | 3112 | add_preempt_count(PREEMPT_ACTIVE); |
3092 | local_irq_enable(); | 3113 | local_irq_enable(); |
@@ -3100,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3100 | */ | 3121 | */ |
3101 | barrier(); | 3122 | barrier(); |
3102 | } while (need_resched()); | 3123 | } while (need_resched()); |
3124 | |||
3125 | exception_exit(prev_state); | ||
3103 | } | 3126 | } |
3104 | 3127 | ||
3105 | #endif /* CONFIG_PREEMPT */ | 3128 | #endif /* CONFIG_PREEMPT */ |
@@ -4126,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4126 | get_task_struct(p); | 4149 | get_task_struct(p); |
4127 | rcu_read_unlock(); | 4150 | rcu_read_unlock(); |
4128 | 4151 | ||
4152 | if (p->flags & PF_NO_SETAFFINITY) { | ||
4153 | retval = -EINVAL; | ||
4154 | goto out_put_task; | ||
4155 | } | ||
4129 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | 4156 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { |
4130 | retval = -ENOMEM; | 4157 | retval = -ENOMEM; |
4131 | goto out_put_task; | 4158 | goto out_put_task; |
@@ -4626,6 +4653,7 @@ void sched_show_task(struct task_struct *p) | |||
4626 | task_pid_nr(p), ppid, | 4653 | task_pid_nr(p), ppid, |
4627 | (unsigned long)task_thread_info(p)->flags); | 4654 | (unsigned long)task_thread_info(p)->flags); |
4628 | 4655 | ||
4656 | print_worker_info(KERN_INFO, p); | ||
4629 | show_stack(p, NULL); | 4657 | show_stack(p, NULL); |
4630 | } | 4658 | } |
4631 | 4659 | ||
@@ -4773,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
4773 | goto out; | 4801 | goto out; |
4774 | } | 4802 | } |
4775 | 4803 | ||
4776 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { | ||
4777 | ret = -EINVAL; | ||
4778 | goto out; | ||
4779 | } | ||
4780 | |||
4781 | do_set_cpus_allowed(p, new_mask); | 4804 | do_set_cpus_allowed(p, new_mask); |
4782 | 4805 | ||
4783 | /* Can the task run on the task's current CPU? If so, we're done */ | 4806 | /* Can the task run on the task's current CPU? If so, we're done */ |
@@ -4999,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
4999 | } | 5022 | } |
5000 | 5023 | ||
5001 | static int min_load_idx = 0; | 5024 | static int min_load_idx = 0; |
5002 | static int max_load_idx = CPU_LOAD_IDX_MAX; | 5025 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; |
5003 | 5026 | ||
5004 | static void | 5027 | static void |
5005 | set_table_entry(struct ctl_table *entry, | 5028 | set_table_entry(struct ctl_table *entry, |
@@ -6248,7 +6271,7 @@ static void sched_init_numa(void) | |||
6248 | * 'level' contains the number of unique distances, excluding the | 6271 | * 'level' contains the number of unique distances, excluding the |
6249 | * identity distance node_distance(i,i). | 6272 | * identity distance node_distance(i,i). |
6250 | * | 6273 | * |
6251 | * The sched_domains_nume_distance[] array includes the actual distance | 6274 | * The sched_domains_numa_distance[] array includes the actual distance |
6252 | * numbers. | 6275 | * numbers. |
6253 | */ | 6276 | */ |
6254 | 6277 | ||
@@ -6861,11 +6884,15 @@ int in_sched_functions(unsigned long addr) | |||
6861 | } | 6884 | } |
6862 | 6885 | ||
6863 | #ifdef CONFIG_CGROUP_SCHED | 6886 | #ifdef CONFIG_CGROUP_SCHED |
6887 | /* | ||
6888 | * Default task group. | ||
6889 | * Every task in system belongs to this group at bootup. | ||
6890 | */ | ||
6864 | struct task_group root_task_group; | 6891 | struct task_group root_task_group; |
6865 | LIST_HEAD(task_groups); | 6892 | LIST_HEAD(task_groups); |
6866 | #endif | 6893 | #endif |
6867 | 6894 | ||
6868 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 6895 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
6869 | 6896 | ||
6870 | void __init sched_init(void) | 6897 | void __init sched_init(void) |
6871 | { | 6898 | { |
@@ -6902,7 +6929,7 @@ void __init sched_init(void) | |||
6902 | #endif /* CONFIG_RT_GROUP_SCHED */ | 6929 | #endif /* CONFIG_RT_GROUP_SCHED */ |
6903 | #ifdef CONFIG_CPUMASK_OFFSTACK | 6930 | #ifdef CONFIG_CPUMASK_OFFSTACK |
6904 | for_each_possible_cpu(i) { | 6931 | for_each_possible_cpu(i) { |
6905 | per_cpu(load_balance_tmpmask, i) = (void *)ptr; | 6932 | per_cpu(load_balance_mask, i) = (void *)ptr; |
6906 | ptr += cpumask_size(); | 6933 | ptr += cpumask_size(); |
6907 | } | 6934 | } |
6908 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6935 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
@@ -6928,12 +6955,6 @@ void __init sched_init(void) | |||
6928 | 6955 | ||
6929 | #endif /* CONFIG_CGROUP_SCHED */ | 6956 | #endif /* CONFIG_CGROUP_SCHED */ |
6930 | 6957 | ||
6931 | #ifdef CONFIG_CGROUP_CPUACCT | ||
6932 | root_cpuacct.cpustat = &kernel_cpustat; | ||
6933 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
6934 | /* Too early, not expected to fail */ | ||
6935 | BUG_ON(!root_cpuacct.cpuusage); | ||
6936 | #endif | ||
6937 | for_each_possible_cpu(i) { | 6958 | for_each_possible_cpu(i) { |
6938 | struct rq *rq; | 6959 | struct rq *rq; |
6939 | 6960 | ||
@@ -6997,9 +7018,12 @@ void __init sched_init(void) | |||
6997 | INIT_LIST_HEAD(&rq->cfs_tasks); | 7018 | INIT_LIST_HEAD(&rq->cfs_tasks); |
6998 | 7019 | ||
6999 | rq_attach_root(rq, &def_root_domain); | 7020 | rq_attach_root(rq, &def_root_domain); |
7000 | #ifdef CONFIG_NO_HZ | 7021 | #ifdef CONFIG_NO_HZ_COMMON |
7001 | rq->nohz_flags = 0; | 7022 | rq->nohz_flags = 0; |
7002 | #endif | 7023 | #endif |
7024 | #ifdef CONFIG_NO_HZ_FULL | ||
7025 | rq->last_sched_tick = 0; | ||
7026 | #endif | ||
7003 | #endif | 7027 | #endif |
7004 | init_rq_hrtick(rq); | 7028 | init_rq_hrtick(rq); |
7005 | atomic_set(&rq->nr_iowait, 0); | 7029 | atomic_set(&rq->nr_iowait, 0); |
@@ -7455,7 +7479,7 @@ unlock: | |||
7455 | return err; | 7479 | return err; |
7456 | } | 7480 | } |
7457 | 7481 | ||
7458 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 7482 | static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) |
7459 | { | 7483 | { |
7460 | u64 rt_runtime, rt_period; | 7484 | u64 rt_runtime, rt_period; |
7461 | 7485 | ||
@@ -7467,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
7467 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | 7491 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
7468 | } | 7492 | } |
7469 | 7493 | ||
7470 | long sched_group_rt_runtime(struct task_group *tg) | 7494 | static long sched_group_rt_runtime(struct task_group *tg) |
7471 | { | 7495 | { |
7472 | u64 rt_runtime_us; | 7496 | u64 rt_runtime_us; |
7473 | 7497 | ||
@@ -7479,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg) | |||
7479 | return rt_runtime_us; | 7503 | return rt_runtime_us; |
7480 | } | 7504 | } |
7481 | 7505 | ||
7482 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | 7506 | static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) |
7483 | { | 7507 | { |
7484 | u64 rt_runtime, rt_period; | 7508 | u64 rt_runtime, rt_period; |
7485 | 7509 | ||
@@ -7492,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
7492 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | 7516 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
7493 | } | 7517 | } |
7494 | 7518 | ||
7495 | long sched_group_rt_period(struct task_group *tg) | 7519 | static long sched_group_rt_period(struct task_group *tg) |
7496 | { | 7520 | { |
7497 | u64 rt_period_us; | 7521 | u64 rt_period_us; |
7498 | 7522 | ||
@@ -7527,7 +7551,7 @@ static int sched_rt_global_constraints(void) | |||
7527 | return ret; | 7551 | return ret; |
7528 | } | 7552 | } |
7529 | 7553 | ||
7530 | int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | 7554 | static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) |
7531 | { | 7555 | { |
7532 | /* Don't accept realtime tasks when there is no way for them to run */ | 7556 | /* Don't accept realtime tasks when there is no way for them to run */ |
7533 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) | 7557 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) |
@@ -8035,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8035 | 8059 | ||
8036 | #endif /* CONFIG_CGROUP_SCHED */ | 8060 | #endif /* CONFIG_CGROUP_SCHED */ |
8037 | 8061 | ||
8038 | #ifdef CONFIG_CGROUP_CPUACCT | ||
8039 | |||
8040 | /* | ||
8041 | * CPU accounting code for task groups. | ||
8042 | * | ||
8043 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
8044 | * (balbir@in.ibm.com). | ||
8045 | */ | ||
8046 | |||
8047 | struct cpuacct root_cpuacct; | ||
8048 | |||
8049 | /* create a new cpu accounting group */ | ||
8050 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
8051 | { | ||
8052 | struct cpuacct *ca; | ||
8053 | |||
8054 | if (!cgrp->parent) | ||
8055 | return &root_cpuacct.css; | ||
8056 | |||
8057 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
8058 | if (!ca) | ||
8059 | goto out; | ||
8060 | |||
8061 | ca->cpuusage = alloc_percpu(u64); | ||
8062 | if (!ca->cpuusage) | ||
8063 | goto out_free_ca; | ||
8064 | |||
8065 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
8066 | if (!ca->cpustat) | ||
8067 | goto out_free_cpuusage; | ||
8068 | |||
8069 | return &ca->css; | ||
8070 | |||
8071 | out_free_cpuusage: | ||
8072 | free_percpu(ca->cpuusage); | ||
8073 | out_free_ca: | ||
8074 | kfree(ca); | ||
8075 | out: | ||
8076 | return ERR_PTR(-ENOMEM); | ||
8077 | } | ||
8078 | |||
8079 | /* destroy an existing cpu accounting group */ | ||
8080 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
8081 | { | ||
8082 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8083 | |||
8084 | free_percpu(ca->cpustat); | ||
8085 | free_percpu(ca->cpuusage); | ||
8086 | kfree(ca); | ||
8087 | } | ||
8088 | |||
8089 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
8090 | { | ||
8091 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8092 | u64 data; | ||
8093 | |||
8094 | #ifndef CONFIG_64BIT | ||
8095 | /* | ||
8096 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
8097 | */ | ||
8098 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8099 | data = *cpuusage; | ||
8100 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8101 | #else | ||
8102 | data = *cpuusage; | ||
8103 | #endif | ||
8104 | |||
8105 | return data; | ||
8106 | } | ||
8107 | |||
8108 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
8109 | { | ||
8110 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8111 | |||
8112 | #ifndef CONFIG_64BIT | ||
8113 | /* | ||
8114 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
8115 | */ | ||
8116 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8117 | *cpuusage = val; | ||
8118 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8119 | #else | ||
8120 | *cpuusage = val; | ||
8121 | #endif | ||
8122 | } | ||
8123 | |||
8124 | /* return total cpu usage (in nanoseconds) of a group */ | ||
8125 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
8126 | { | ||
8127 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8128 | u64 totalcpuusage = 0; | ||
8129 | int i; | ||
8130 | |||
8131 | for_each_present_cpu(i) | ||
8132 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
8133 | |||
8134 | return totalcpuusage; | ||
8135 | } | ||
8136 | |||
8137 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
8138 | u64 reset) | ||
8139 | { | ||
8140 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8141 | int err = 0; | ||
8142 | int i; | ||
8143 | |||
8144 | if (reset) { | ||
8145 | err = -EINVAL; | ||
8146 | goto out; | ||
8147 | } | ||
8148 | |||
8149 | for_each_present_cpu(i) | ||
8150 | cpuacct_cpuusage_write(ca, i, 0); | ||
8151 | |||
8152 | out: | ||
8153 | return err; | ||
8154 | } | ||
8155 | |||
8156 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
8157 | struct seq_file *m) | ||
8158 | { | ||
8159 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
8160 | u64 percpu; | ||
8161 | int i; | ||
8162 | |||
8163 | for_each_present_cpu(i) { | ||
8164 | percpu = cpuacct_cpuusage_read(ca, i); | ||
8165 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
8166 | } | ||
8167 | seq_printf(m, "\n"); | ||
8168 | return 0; | ||
8169 | } | ||
8170 | |||
8171 | static const char *cpuacct_stat_desc[] = { | ||
8172 | [CPUACCT_STAT_USER] = "user", | ||
8173 | [CPUACCT_STAT_SYSTEM] = "system", | ||
8174 | }; | ||
8175 | |||
8176 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
8177 | struct cgroup_map_cb *cb) | ||
8178 | { | ||
8179 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8180 | int cpu; | ||
8181 | s64 val = 0; | ||
8182 | |||
8183 | for_each_online_cpu(cpu) { | ||
8184 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8185 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
8186 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
8187 | } | ||
8188 | val = cputime64_to_clock_t(val); | ||
8189 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
8190 | |||
8191 | val = 0; | ||
8192 | for_each_online_cpu(cpu) { | ||
8193 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8194 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
8195 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
8196 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
8197 | } | ||
8198 | |||
8199 | val = cputime64_to_clock_t(val); | ||
8200 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
8201 | |||
8202 | return 0; | ||
8203 | } | ||
8204 | |||
8205 | static struct cftype files[] = { | ||
8206 | { | ||
8207 | .name = "usage", | ||
8208 | .read_u64 = cpuusage_read, | ||
8209 | .write_u64 = cpuusage_write, | ||
8210 | }, | ||
8211 | { | ||
8212 | .name = "usage_percpu", | ||
8213 | .read_seq_string = cpuacct_percpu_seq_read, | ||
8214 | }, | ||
8215 | { | ||
8216 | .name = "stat", | ||
8217 | .read_map = cpuacct_stats_show, | ||
8218 | }, | ||
8219 | { } /* terminate */ | ||
8220 | }; | ||
8221 | |||
8222 | /* | ||
8223 | * charge this task's execution time to its accounting group. | ||
8224 | * | ||
8225 | * called with rq->lock held. | ||
8226 | */ | ||
8227 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
8228 | { | ||
8229 | struct cpuacct *ca; | ||
8230 | int cpu; | ||
8231 | |||
8232 | if (unlikely(!cpuacct_subsys.active)) | ||
8233 | return; | ||
8234 | |||
8235 | cpu = task_cpu(tsk); | ||
8236 | |||
8237 | rcu_read_lock(); | ||
8238 | |||
8239 | ca = task_ca(tsk); | ||
8240 | |||
8241 | for (; ca; ca = parent_ca(ca)) { | ||
8242 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8243 | *cpuusage += cputime; | ||
8244 | } | ||
8245 | |||
8246 | rcu_read_unlock(); | ||
8247 | } | ||
8248 | |||
8249 | struct cgroup_subsys cpuacct_subsys = { | ||
8250 | .name = "cpuacct", | ||
8251 | .css_alloc = cpuacct_css_alloc, | ||
8252 | .css_free = cpuacct_css_free, | ||
8253 | .subsys_id = cpuacct_subsys_id, | ||
8254 | .base_cftypes = files, | ||
8255 | }; | ||
8256 | #endif /* CONFIG_CGROUP_CPUACCT */ | ||
8257 | |||
8258 | void dump_cpu_task(int cpu) | 8062 | void dump_cpu_task(int cpu) |
8259 | { | 8063 | { |
8260 | pr_info("Task dump for CPU %d:\n", cpu); | 8064 | pr_info("Task dump for CPU %d:\n", cpu); |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..dbb7e2cd95eb --- /dev/null +++ b/kernel/sched/cpuacct.c | |||
@@ -0,0 +1,296 @@ | |||
1 | #include <linux/cgroup.h> | ||
2 | #include <linux/slab.h> | ||
3 | #include <linux/percpu.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/cpumask.h> | ||
6 | #include <linux/seq_file.h> | ||
7 | #include <linux/rcupdate.h> | ||
8 | #include <linux/kernel_stat.h> | ||
9 | #include <linux/err.h> | ||
10 | |||
11 | #include "sched.h" | ||
12 | |||
13 | /* | ||
14 | * CPU accounting code for task groups. | ||
15 | * | ||
16 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
17 | * (balbir@in.ibm.com). | ||
18 | */ | ||
19 | |||
/*
 * Indices of the values reported for a cpu accounting group: time spent by
 * the group's tasks executing in user mode and in kernel mode.
 * CPUACCT_STAT_NSTATS is the number of such values.
 */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER = 0,		/* user mode */
	CPUACCT_STAT_SYSTEM = 1,	/* kernel mode */

	CPUACCT_STAT_NSTATS = 2,
};
27 | |||
28 | /* track cpu usage of a group of tasks and its child groups */ | ||
29 | struct cpuacct { | ||
30 | struct cgroup_subsys_state css; | ||
31 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
32 | u64 __percpu *cpuusage; | ||
33 | struct kernel_cpustat __percpu *cpustat; | ||
34 | }; | ||
35 | |||
36 | /* return cpu accounting group corresponding to this container */ | ||
37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
38 | { | ||
39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
40 | struct cpuacct, css); | ||
41 | } | ||
42 | |||
43 | /* return cpu accounting group to which this task belongs */ | ||
44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
45 | { | ||
46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
47 | struct cpuacct, css); | ||
48 | } | ||
49 | |||
50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
51 | { | ||
52 | return cgroup_ca(ca->css.cgroup->parent); | ||
53 | } | ||
54 | |||
55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
56 | { | ||
57 | if (!ca->css.cgroup->parent) | ||
58 | return NULL; | ||
59 | return cgroup_ca(ca->css.cgroup->parent); | ||
60 | } | ||
61 | |||
62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | ||
63 | static struct cpuacct root_cpuacct = { | ||
64 | .cpustat = &kernel_cpustat, | ||
65 | .cpuusage = &root_cpuacct_cpuusage, | ||
66 | }; | ||
67 | |||
68 | /* create a new cpu accounting group */ | ||
69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
70 | { | ||
71 | struct cpuacct *ca; | ||
72 | |||
73 | if (!cgrp->parent) | ||
74 | return &root_cpuacct.css; | ||
75 | |||
76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
77 | if (!ca) | ||
78 | goto out; | ||
79 | |||
80 | ca->cpuusage = alloc_percpu(u64); | ||
81 | if (!ca->cpuusage) | ||
82 | goto out_free_ca; | ||
83 | |||
84 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
85 | if (!ca->cpustat) | ||
86 | goto out_free_cpuusage; | ||
87 | |||
88 | return &ca->css; | ||
89 | |||
90 | out_free_cpuusage: | ||
91 | free_percpu(ca->cpuusage); | ||
92 | out_free_ca: | ||
93 | kfree(ca); | ||
94 | out: | ||
95 | return ERR_PTR(-ENOMEM); | ||
96 | } | ||
97 | |||
98 | /* destroy an existing cpu accounting group */ | ||
99 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
100 | { | ||
101 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
102 | |||
103 | free_percpu(ca->cpustat); | ||
104 | free_percpu(ca->cpuusage); | ||
105 | kfree(ca); | ||
106 | } | ||
107 | |||
108 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
109 | { | ||
110 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
111 | u64 data; | ||
112 | |||
113 | #ifndef CONFIG_64BIT | ||
114 | /* | ||
115 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
116 | */ | ||
117 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
118 | data = *cpuusage; | ||
119 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
120 | #else | ||
121 | data = *cpuusage; | ||
122 | #endif | ||
123 | |||
124 | return data; | ||
125 | } | ||
126 | |||
127 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
128 | { | ||
129 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
130 | |||
131 | #ifndef CONFIG_64BIT | ||
132 | /* | ||
133 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
134 | */ | ||
135 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
136 | *cpuusage = val; | ||
137 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
138 | #else | ||
139 | *cpuusage = val; | ||
140 | #endif | ||
141 | } | ||
142 | |||
143 | /* return total cpu usage (in nanoseconds) of a group */ | ||
144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
145 | { | ||
146 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
147 | u64 totalcpuusage = 0; | ||
148 | int i; | ||
149 | |||
150 | for_each_present_cpu(i) | ||
151 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
152 | |||
153 | return totalcpuusage; | ||
154 | } | ||
155 | |||
156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
157 | u64 reset) | ||
158 | { | ||
159 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
160 | int err = 0; | ||
161 | int i; | ||
162 | |||
163 | if (reset) { | ||
164 | err = -EINVAL; | ||
165 | goto out; | ||
166 | } | ||
167 | |||
168 | for_each_present_cpu(i) | ||
169 | cpuacct_cpuusage_write(ca, i, 0); | ||
170 | |||
171 | out: | ||
172 | return err; | ||
173 | } | ||
174 | |||
175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
176 | struct seq_file *m) | ||
177 | { | ||
178 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
179 | u64 percpu; | ||
180 | int i; | ||
181 | |||
182 | for_each_present_cpu(i) { | ||
183 | percpu = cpuacct_cpuusage_read(ca, i); | ||
184 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
185 | } | ||
186 | seq_printf(m, "\n"); | ||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static const char * const cpuacct_stat_desc[] = { | ||
191 | [CPUACCT_STAT_USER] = "user", | ||
192 | [CPUACCT_STAT_SYSTEM] = "system", | ||
193 | }; | ||
194 | |||
195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
196 | struct cgroup_map_cb *cb) | ||
197 | { | ||
198 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
199 | int cpu; | ||
200 | s64 val = 0; | ||
201 | |||
202 | for_each_online_cpu(cpu) { | ||
203 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
204 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
205 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
206 | } | ||
207 | val = cputime64_to_clock_t(val); | ||
208 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
209 | |||
210 | val = 0; | ||
211 | for_each_online_cpu(cpu) { | ||
212 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
213 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
214 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
215 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
216 | } | ||
217 | |||
218 | val = cputime64_to_clock_t(val); | ||
219 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
220 | |||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static struct cftype files[] = { | ||
225 | { | ||
226 | .name = "usage", | ||
227 | .read_u64 = cpuusage_read, | ||
228 | .write_u64 = cpuusage_write, | ||
229 | }, | ||
230 | { | ||
231 | .name = "usage_percpu", | ||
232 | .read_seq_string = cpuacct_percpu_seq_read, | ||
233 | }, | ||
234 | { | ||
235 | .name = "stat", | ||
236 | .read_map = cpuacct_stats_show, | ||
237 | }, | ||
238 | { } /* terminate */ | ||
239 | }; | ||
240 | |||
241 | /* | ||
242 | * charge this task's execution time to its accounting group. | ||
243 | * | ||
244 | * called with rq->lock held. | ||
245 | */ | ||
246 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
247 | { | ||
248 | struct cpuacct *ca; | ||
249 | int cpu; | ||
250 | |||
251 | cpu = task_cpu(tsk); | ||
252 | |||
253 | rcu_read_lock(); | ||
254 | |||
255 | ca = task_ca(tsk); | ||
256 | |||
257 | while (true) { | ||
258 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
259 | *cpuusage += cputime; | ||
260 | |||
261 | ca = parent_ca(ca); | ||
262 | if (!ca) | ||
263 | break; | ||
264 | } | ||
265 | |||
266 | rcu_read_unlock(); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Add user/system time to cpuacct. | ||
271 | * | ||
272 | * Note: it's the caller that updates the account of the root cgroup. | ||
273 | */ | ||
274 | void cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
275 | { | ||
276 | struct kernel_cpustat *kcpustat; | ||
277 | struct cpuacct *ca; | ||
278 | |||
279 | rcu_read_lock(); | ||
280 | ca = task_ca(p); | ||
281 | while (ca != &root_cpuacct) { | ||
282 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
283 | kcpustat->cpustat[index] += val; | ||
284 | ca = __parent_ca(ca); | ||
285 | } | ||
286 | rcu_read_unlock(); | ||
287 | } | ||
288 | |||
289 | struct cgroup_subsys cpuacct_subsys = { | ||
290 | .name = "cpuacct", | ||
291 | .css_alloc = cpuacct_css_alloc, | ||
292 | .css_free = cpuacct_css_free, | ||
293 | .subsys_id = cpuacct_subsys_id, | ||
294 | .base_cftypes = files, | ||
295 | .early_init = 1, | ||
296 | }; | ||
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000000..ed605624a5e7 --- /dev/null +++ b/kernel/sched/cpuacct.h | |||
@@ -0,0 +1,17 @@ | |||
1 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2 | |||
3 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
4 | extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); | ||
5 | |||
6 | #else | ||
7 | |||
8 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
9 | { | ||
10 | } | ||
11 | |||
12 | static inline void | ||
13 | cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
14 | { | ||
15 | } | ||
16 | |||
17 | #endif | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f4..cc2dc3eea8a3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) | |||
115 | static inline void task_group_account_field(struct task_struct *p, int index, | 115 | static inline void task_group_account_field(struct task_struct *p, int index, |
116 | u64 tmp) | 116 | u64 tmp) |
117 | { | 117 | { |
118 | #ifdef CONFIG_CGROUP_CPUACCT | ||
119 | struct kernel_cpustat *kcpustat; | ||
120 | struct cpuacct *ca; | ||
121 | #endif | ||
122 | /* | 118 | /* |
123 | * Since all updates are sure to touch the root cgroup, we | 119 | * Since all updates are sure to touch the root cgroup, we |
124 | * get ourselves ahead and touch it first. If the root cgroup | 120 | * get ourselves ahead and touch it first. If the root cgroup |
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
127 | */ | 123 | */ |
128 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; |
129 | 125 | ||
130 | #ifdef CONFIG_CGROUP_CPUACCT | 126 | cpuacct_account_field(p, index, tmp); |
131 | if (unlikely(!cpuacct_subsys.active)) | ||
132 | return; | ||
133 | |||
134 | rcu_read_lock(); | ||
135 | ca = task_ca(p); | ||
136 | while (ca && (ca != &root_cpuacct)) { | ||
137 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
138 | kcpustat->cpustat[index] += tmp; | ||
139 | ca = parent_ca(ca); | ||
140 | } | ||
141 | rcu_read_unlock(); | ||
142 | #endif | ||
143 | } | 127 | } |
144 | 128 | ||
145 | /* | 129 | /* |
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
310 | 294 | ||
311 | t = tsk; | 295 | t = tsk; |
312 | do { | 296 | do { |
313 | task_cputime(tsk, &utime, &stime); | 297 | task_cputime(t, &utime, &stime); |
314 | times->utime += utime; | 298 | times->utime += utime; |
315 | times->stime += stime; | 299 | times->stime += stime; |
316 | times->sum_exec_runtime += task_sched_runtime(t); | 300 | times->sum_exec_runtime += task_sched_runtime(t); |
@@ -388,7 +372,84 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ | |||
388 | struct rq *rq) {} | 372 | struct rq *rq) {} |
389 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 373 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
390 | 374 | ||
391 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 375 | /* |
376 | * Use precise platform statistics if available: | ||
377 | */ | ||
378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
379 | |||
380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
/*
 * Account the time of the outgoing task @prev on a context switch.
 *
 * Nothing is done unless vtime accounting is enabled. The time is
 * accounted as idle time when @prev is the idle task and as system time
 * otherwise; with CONFIG_VIRT_CPU_ACCOUNTING_NATIVE user time is accounted
 * as well. The arch hook arch_vtime_task_switch() runs last.
 */
void vtime_task_switch(struct task_struct *prev)
{
	if (vtime_accounting_enabled()) {
		if (!is_idle_task(prev))
			vtime_account_system(prev);
		else
			vtime_account_idle(prev);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
		vtime_account_user(prev);
#endif
		arch_vtime_task_switch(prev);
	}
}
396 | #endif | ||
397 | |||
398 | /* | ||
399 | * Archs that account the whole time spent in the idle task | ||
400 | * (outside irq) as idle time can rely on this and just implement | ||
401 | * vtime_account_system() and vtime_account_idle(). Archs that | ||
402 | * have other meaning of the idle time (s390 only includes the | ||
403 | * time spent by the CPU when it's in low power mode) must override | ||
404 | * vtime_account(). | ||
405 | */ | ||
406 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | ||
/*
 * Account the time of @tsk when vtime accounting is enabled.
 *
 * If in_interrupt() is already true the time is accounted as system time.
 * Otherwise it is accounted as user time when the context tracking says we
 * were in user mode, as idle time when @tsk is the idle task, and as system
 * time in every remaining case.
 */
void vtime_account_irq_enter(struct task_struct *tsk)
{
	if (!vtime_accounting_enabled())
		return;

	if (in_interrupt()) {
		vtime_account_system(tsk);
		return;
	}

	/*
	 * Context tracking does not hook into irq entry/exit, so if the
	 * interrupt hit user mode context_tracking_in_user() still reports
	 * 1. That tells us user time must be flushed on kernel entry.
	 */
	if (context_tracking_in_user())
		vtime_account_user(tsk);
	else if (is_idle_task(tsk))
		vtime_account_idle(tsk);
	else
		vtime_account_system(tsk);
}
431 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | ||
432 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | ||
433 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | ||
434 | |||
435 | |||
436 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
437 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
438 | { | ||
439 | *ut = p->utime; | ||
440 | *st = p->stime; | ||
441 | } | ||
442 | |||
443 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
444 | { | ||
445 | struct task_cputime cputime; | ||
446 | |||
447 | thread_group_cputime(p, &cputime); | ||
448 | |||
449 | *ut = cputime.utime; | ||
450 | *st = cputime.stime; | ||
451 | } | ||
452 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | ||
392 | /* | 453 | /* |
393 | * Account a single tick of cpu time. | 454 | * Account a single tick of cpu time. |
394 | * @p: the process that the cpu time gets accounted to | 455 | * @p: the process that the cpu time gets accounted to |
@@ -443,96 +504,50 @@ void account_idle_ticks(unsigned long ticks) | |||
443 | 504 | ||
444 | account_idle_time(jiffies_to_cputime(ticks)); | 505 | account_idle_time(jiffies_to_cputime(ticks)); |
445 | } | 506 | } |
446 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | ||
447 | |||
448 | /* | ||
449 | * Use precise platform statistics if available: | ||
450 | */ | ||
451 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
452 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
453 | { | ||
454 | *ut = p->utime; | ||
455 | *st = p->stime; | ||
456 | } | ||
457 | |||
458 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
459 | { | ||
460 | struct task_cputime cputime; | ||
461 | |||
462 | thread_group_cputime(p, &cputime); | ||
463 | |||
464 | *ut = cputime.utime; | ||
465 | *st = cputime.stime; | ||
466 | } | ||
467 | |||
468 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
469 | void vtime_task_switch(struct task_struct *prev) | ||
470 | { | ||
471 | if (!vtime_accounting_enabled()) | ||
472 | return; | ||
473 | |||
474 | if (is_idle_task(prev)) | ||
475 | vtime_account_idle(prev); | ||
476 | else | ||
477 | vtime_account_system(prev); | ||
478 | |||
479 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
480 | vtime_account_user(prev); | ||
481 | #endif | ||
482 | arch_vtime_task_switch(prev); | ||
483 | } | ||
484 | #endif | ||
485 | 507 | ||
486 | /* | 508 | /* |
487 | * Archs that account the whole time spent in the idle task | 509 | * Perform (stime * rtime) / total, but avoid multiplication overflow by |
488 | * (outside irq) as idle time can rely on this and just implement | 510 | * losing precision when the numbers are big. |
489 | * vtime_account_system() and vtime_account_idle(). Archs that | ||
490 | * have other meaning of the idle time (s390 only includes the | ||
491 | * time spent by the CPU when it's in low power mode) must override | ||
492 | * vtime_account(). | ||
493 | */ | 511 | */ |
494 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 512 | static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) |
495 | void vtime_account_irq_enter(struct task_struct *tsk) | ||
496 | { | 513 | { |
497 | if (!vtime_accounting_enabled()) | 514 | u64 scaled; |
498 | return; | ||
499 | 515 | ||
500 | if (!in_interrupt()) { | 516 | for (;;) { |
501 | /* | 517 | /* Make sure "rtime" is the bigger of stime/rtime */ |
502 | * If we interrupted user, context_tracking_in_user() | 518 | if (stime > rtime) { |
503 | * is 1 because the context tracking don't hook | 519 | u64 tmp = rtime; rtime = stime; stime = tmp; |
504 | * on irq entry/exit. This way we know if | ||
505 | * we need to flush user time on kernel entry. | ||
506 | */ | ||
507 | if (context_tracking_in_user()) { | ||
508 | vtime_account_user(tsk); | ||
509 | return; | ||
510 | } | 520 | } |
511 | 521 | ||
512 | if (is_idle_task(tsk)) { | 522 | /* Make sure 'total' fits in 32 bits */ |
513 | vtime_account_idle(tsk); | 523 | if (total >> 32) |
514 | return; | 524 | goto drop_precision; |
515 | } | ||
516 | } | ||
517 | vtime_account_system(tsk); | ||
518 | } | ||
519 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | ||
520 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | ||
521 | 525 | ||
522 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ | 526 | /* Does rtime (and thus stime) fit in 32 bits? */ |
527 | if (!(rtime >> 32)) | ||
528 | break; | ||
523 | 529 | ||
524 | static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) | 530 | /* Can we just balance rtime/stime rather than dropping bits? */ |
525 | { | 531 | if (stime >> 31) |
526 | u64 temp = (__force u64) rtime; | 532 | goto drop_precision; |
527 | 533 | ||
528 | temp *= (__force u64) stime; | 534 | /* We can grow stime and shrink rtime and try to make them both fit */ |
535 | stime <<= 1; | ||
536 | rtime >>= 1; | ||
537 | continue; | ||
529 | 538 | ||
530 | if (sizeof(cputime_t) == 4) | 539 | drop_precision: |
531 | temp = div_u64(temp, (__force u32) total); | 540 | /* We drop from rtime, it has more bits than stime */ |
532 | else | 541 | rtime >>= 1; |
533 | temp = div64_u64(temp, (__force u64) total); | 542 | total >>= 1; |
543 | } | ||
534 | 544 | ||
535 | return (__force cputime_t) temp; | 545 | /* |
546 | * Make sure gcc understands that this is a 32x32->64 multiply, | ||
547 | * followed by a 64/32->64 divide. | ||
548 | */ | ||
549 | scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); | ||
550 | return (__force cputime_t) scaled; | ||
536 | } | 551 | } |
537 | 552 | ||
538 | /* | 553 | /* |
@@ -543,7 +558,13 @@ static void cputime_adjust(struct task_cputime *curr, | |||
543 | struct cputime *prev, | 558 | struct cputime *prev, |
544 | cputime_t *ut, cputime_t *st) | 559 | cputime_t *ut, cputime_t *st) |
545 | { | 560 | { |
546 | cputime_t rtime, stime, total; | 561 | cputime_t rtime, stime, utime, total; |
562 | |||
563 | if (vtime_accounting_enabled()) { | ||
564 | *ut = curr->utime; | ||
565 | *st = curr->stime; | ||
566 | return; | ||
567 | } | ||
547 | 568 | ||
548 | stime = curr->stime; | 569 | stime = curr->stime; |
549 | total = stime + curr->utime; | 570 | total = stime + curr->utime; |
@@ -560,10 +581,22 @@ static void cputime_adjust(struct task_cputime *curr, | |||
560 | */ | 581 | */ |
561 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); | 582 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); |
562 | 583 | ||
563 | if (total) | 584 | /* |
564 | stime = scale_stime(stime, rtime, total); | 585 | * Update userspace visible utime/stime values only if actual execution |
565 | else | 586 | * time is bigger than already exported. Note that can happen, that we |
587 | * provided bigger values due to scaling inaccuracy on big numbers. | ||
588 | */ | ||
589 | if (prev->stime + prev->utime >= rtime) | ||
590 | goto out; | ||
591 | |||
592 | if (total) { | ||
593 | stime = scale_stime((__force u64)stime, | ||
594 | (__force u64)rtime, (__force u64)total); | ||
595 | utime = rtime - stime; | ||
596 | } else { | ||
566 | stime = rtime; | 597 | stime = rtime; |
598 | utime = 0; | ||
599 | } | ||
567 | 600 | ||
568 | /* | 601 | /* |
569 | * If the tick based count grows faster than the scheduler one, | 602 | * If the tick based count grows faster than the scheduler one, |
@@ -571,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr, | |||
571 | * Let's enforce monotonicity. | 604 | * Let's enforce monotonicity. |
572 | */ | 605 | */ |
573 | prev->stime = max(prev->stime, stime); | 606 | prev->stime = max(prev->stime, stime); |
574 | prev->utime = max(prev->utime, rtime - prev->stime); | 607 | prev->utime = max(prev->utime, utime); |
575 | 608 | ||
609 | out: | ||
576 | *ut = prev->utime; | 610 | *ut = prev->utime; |
577 | *st = prev->stime; | 611 | *st = prev->stime; |
578 | } | 612 | } |
@@ -597,7 +631,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
597 | thread_group_cputime(p, &cputime); | 631 | thread_group_cputime(p, &cputime); |
598 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); | 632 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); |
599 | } | 633 | } |
600 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ | 634 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
601 | 635 | ||
602 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 636 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
603 | static unsigned long long vtime_delta(struct task_struct *tsk) | 637 | static unsigned long long vtime_delta(struct task_struct *tsk) |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a33e5986fc5..c61a614465c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); | |||
431 | * Scheduling class tree data structure manipulation methods: | 431 | * Scheduling class tree data structure manipulation methods: |
432 | */ | 432 | */ |
433 | 433 | ||
434 | static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) | 434 | static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) |
435 | { | 435 | { |
436 | s64 delta = (s64)(vruntime - min_vruntime); | 436 | s64 delta = (s64)(vruntime - max_vruntime); |
437 | if (delta > 0) | 437 | if (delta > 0) |
438 | min_vruntime = vruntime; | 438 | max_vruntime = vruntime; |
439 | 439 | ||
440 | return min_vruntime; | 440 | return max_vruntime; |
441 | } | 441 | } |
442 | 442 | ||
443 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) | 443 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) |
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
473 | vruntime = min_vruntime(vruntime, se->vruntime); | 473 | vruntime = min_vruntime(vruntime, se->vruntime); |
474 | } | 474 | } |
475 | 475 | ||
476 | /* ensure we never gain time by being placed backwards. */ | ||
476 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 477 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
477 | #ifndef CONFIG_64BIT | 478 | #ifndef CONFIG_64BIT |
478 | smp_wmb(); | 479 | smp_wmb(); |
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
652 | } | 653 | } |
653 | 654 | ||
654 | /* | 655 | /* |
655 | * We calculate the vruntime slice of a to be inserted task | 656 | * We calculate the vruntime slice of a to-be-inserted task. |
656 | * | 657 | * |
657 | * vs = s/w | 658 | * vs = s/w |
658 | */ | 659 | */ |
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1562 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 1563 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
1563 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | 1564 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ |
1564 | } | 1565 | } |
1566 | |||
/*
 * Called before @this_rq enters idle: fold the elapsed running time into
 * the rq's load. If the last scheduled task was not a CFS task, this is
 * the only place where the runnable statistic gets updated.
 */
void idle_enter_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 1);
}
1576 | |||
/*
 * Called before a task is scheduled on an idle @this_rq: fold the elapsed
 * idle time into the rq's load. If the newly scheduled task is not a CFS
 * task, this is the only place where the runnable statistic gets updated.
 */
void idle_exit_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 0);
}
1586 | |||
1565 | #else | 1587 | #else |
1566 | static inline void update_entity_load_avg(struct sched_entity *se, | 1588 | static inline void update_entity_load_avg(struct sched_entity *se, |
1567 | int update_cfs_rq) {} | 1589 | int update_cfs_rq) {} |
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3874 | int tsk_cache_hot = 0; | 3896 | int tsk_cache_hot = 0; |
3875 | /* | 3897 | /* |
3876 | * We do not migrate tasks that are: | 3898 | * We do not migrate tasks that are: |
3877 | * 1) running (obviously), or | 3899 | * 1) throttled_lb_pair, or |
3878 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 3900 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
3879 | * 3) are cache-hot on their current CPU. | 3901 | * 3) running (obviously), or |
3902 | * 4) are cache-hot on their current CPU. | ||
3880 | */ | 3903 | */ |
3904 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | ||
3905 | return 0; | ||
3906 | |||
3881 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 3907 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
3882 | int new_dst_cpu; | 3908 | int cpu; |
3883 | 3909 | ||
3884 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3910 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3885 | 3911 | ||
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3894 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 3920 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) |
3895 | return 0; | 3921 | return 0; |
3896 | 3922 | ||
3897 | new_dst_cpu = cpumask_first_and(env->dst_grpmask, | 3923 | /* Prevent to re-select dst_cpu via env's cpus */ |
3898 | tsk_cpus_allowed(p)); | 3924 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
3899 | if (new_dst_cpu < nr_cpu_ids) { | 3925 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
3900 | env->flags |= LBF_SOME_PINNED; | 3926 | env->flags |= LBF_SOME_PINNED; |
3901 | env->new_dst_cpu = new_dst_cpu; | 3927 | env->new_dst_cpu = cpu; |
3928 | break; | ||
3929 | } | ||
3902 | } | 3930 | } |
3931 | |||
3903 | return 0; | 3932 | return 0; |
3904 | } | 3933 | } |
3905 | 3934 | ||
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3920 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); | 3949 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); |
3921 | if (!tsk_cache_hot || | 3950 | if (!tsk_cache_hot || |
3922 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 3951 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3923 | #ifdef CONFIG_SCHEDSTATS | 3952 | |
3924 | if (tsk_cache_hot) { | 3953 | if (tsk_cache_hot) { |
3925 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 3954 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
3926 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 3955 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
3927 | } | 3956 | } |
3928 | #endif | 3957 | |
3929 | return 1; | 3958 | return 1; |
3930 | } | 3959 | } |
3931 | 3960 | ||
3932 | if (tsk_cache_hot) { | 3961 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); |
3933 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 3962 | return 0; |
3934 | return 0; | ||
3935 | } | ||
3936 | return 1; | ||
3937 | } | 3963 | } |
3938 | 3964 | ||
3939 | /* | 3965 | /* |
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env) | |||
3948 | struct task_struct *p, *n; | 3974 | struct task_struct *p, *n; |
3949 | 3975 | ||
3950 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 3976 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
3951 | if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) | ||
3952 | continue; | ||
3953 | |||
3954 | if (!can_migrate_task(p, env)) | 3977 | if (!can_migrate_task(p, env)) |
3955 | continue; | 3978 | continue; |
3956 | 3979 | ||
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env) | |||
4002 | break; | 4025 | break; |
4003 | } | 4026 | } |
4004 | 4027 | ||
4005 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | 4028 | if (!can_migrate_task(p, env)) |
4006 | goto next; | 4029 | goto next; |
4007 | 4030 | ||
4008 | load = task_h_load(p); | 4031 | load = task_h_load(p); |
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env) | |||
4013 | if ((load / 2) > env->imbalance) | 4036 | if ((load / 2) > env->imbalance) |
4014 | goto next; | 4037 | goto next; |
4015 | 4038 | ||
4016 | if (!can_migrate_task(p, env)) | ||
4017 | goto next; | ||
4018 | |||
4019 | move_task(p, env); | 4039 | move_task(p, env); |
4020 | pulled++; | 4040 | pulled++; |
4021 | env->imbalance -= load; | 4041 | env->imbalance -= load; |
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
4245 | return load_idx; | 4265 | return load_idx; |
4246 | } | 4266 | } |
4247 | 4267 | ||
4248 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 4268 | static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
4249 | { | 4269 | { |
4250 | return SCHED_POWER_SCALE; | 4270 | return SCHED_POWER_SCALE; |
4251 | } | 4271 | } |
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | |||
4255 | return default_scale_freq_power(sd, cpu); | 4275 | return default_scale_freq_power(sd, cpu); |
4256 | } | 4276 | } |
4257 | 4277 | ||
4258 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | 4278 | static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) |
4259 | { | 4279 | { |
4260 | unsigned long weight = sd->span_weight; | 4280 | unsigned long weight = sd->span_weight; |
4261 | unsigned long smt_gain = sd->smt_gain; | 4281 | unsigned long smt_gain = sd->smt_gain; |
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
4270 | return default_scale_smt_power(sd, cpu); | 4290 | return default_scale_smt_power(sd, cpu); |
4271 | } | 4291 | } |
4272 | 4292 | ||
4273 | unsigned long scale_rt_power(int cpu) | 4293 | static unsigned long scale_rt_power(int cpu) |
4274 | { | 4294 | { |
4275 | struct rq *rq = cpu_rq(cpu); | 4295 | struct rq *rq = cpu_rq(cpu); |
4276 | u64 total, available, age_stamp, avg; | 4296 | u64 total, available, age_stamp, avg; |
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4960 | #define MAX_PINNED_INTERVAL 512 | 4980 | #define MAX_PINNED_INTERVAL 512 |
4961 | 4981 | ||
4962 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4982 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4963 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4983 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); |
4964 | 4984 | ||
4965 | static int need_active_balance(struct lb_env *env) | 4985 | static int need_active_balance(struct lb_env *env) |
4966 | { | 4986 | { |
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4991 | int *balance) | 5011 | int *balance) |
4992 | { | 5012 | { |
4993 | int ld_moved, cur_ld_moved, active_balance = 0; | 5013 | int ld_moved, cur_ld_moved, active_balance = 0; |
4994 | int lb_iterations, max_lb_iterations; | ||
4995 | struct sched_group *group; | 5014 | struct sched_group *group; |
4996 | struct rq *busiest; | 5015 | struct rq *busiest; |
4997 | unsigned long flags; | 5016 | unsigned long flags; |
4998 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 5017 | struct cpumask *cpus = __get_cpu_var(load_balance_mask); |
4999 | 5018 | ||
5000 | struct lb_env env = { | 5019 | struct lb_env env = { |
5001 | .sd = sd, | 5020 | .sd = sd, |
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5007 | .cpus = cpus, | 5026 | .cpus = cpus, |
5008 | }; | 5027 | }; |
5009 | 5028 | ||
5029 | /* | ||
5030 | * For NEWLY_IDLE load_balancing, we don't need to consider | ||
5031 | * other cpus in our group | ||
5032 | */ | ||
5033 | if (idle == CPU_NEWLY_IDLE) | ||
5034 | env.dst_grpmask = NULL; | ||
5035 | |||
5010 | cpumask_copy(cpus, cpu_active_mask); | 5036 | cpumask_copy(cpus, cpu_active_mask); |
5011 | max_lb_iterations = cpumask_weight(env.dst_grpmask); | ||
5012 | 5037 | ||
5013 | schedstat_inc(sd, lb_count[idle]); | 5038 | schedstat_inc(sd, lb_count[idle]); |
5014 | 5039 | ||
@@ -5034,7 +5059,6 @@ redo: | |||
5034 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 5059 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
5035 | 5060 | ||
5036 | ld_moved = 0; | 5061 | ld_moved = 0; |
5037 | lb_iterations = 1; | ||
5038 | if (busiest->nr_running > 1) { | 5062 | if (busiest->nr_running > 1) { |
5039 | /* | 5063 | /* |
5040 | * Attempt to move tasks. If find_busiest_group has found | 5064 | * Attempt to move tasks. If find_busiest_group has found |
@@ -5061,17 +5085,17 @@ more_balance: | |||
5061 | double_rq_unlock(env.dst_rq, busiest); | 5085 | double_rq_unlock(env.dst_rq, busiest); |
5062 | local_irq_restore(flags); | 5086 | local_irq_restore(flags); |
5063 | 5087 | ||
5064 | if (env.flags & LBF_NEED_BREAK) { | ||
5065 | env.flags &= ~LBF_NEED_BREAK; | ||
5066 | goto more_balance; | ||
5067 | } | ||
5068 | |||
5069 | /* | 5088 | /* |
5070 | * some other cpu did the load balance for us. | 5089 | * some other cpu did the load balance for us. |
5071 | */ | 5090 | */ |
5072 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 5091 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) |
5073 | resched_cpu(env.dst_cpu); | 5092 | resched_cpu(env.dst_cpu); |
5074 | 5093 | ||
5094 | if (env.flags & LBF_NEED_BREAK) { | ||
5095 | env.flags &= ~LBF_NEED_BREAK; | ||
5096 | goto more_balance; | ||
5097 | } | ||
5098 | |||
5075 | /* | 5099 | /* |
5076 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 5100 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
5077 | * us and move them to an alternate dst_cpu in our sched_group | 5101 | * us and move them to an alternate dst_cpu in our sched_group |
@@ -5091,14 +5115,17 @@ more_balance: | |||
5091 | * moreover subsequent load balance cycles should correct the | 5115 | * moreover subsequent load balance cycles should correct the |
5092 | * excess load moved. | 5116 | * excess load moved. |
5093 | */ | 5117 | */ |
5094 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 5118 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { |
5095 | lb_iterations++ < max_lb_iterations) { | ||
5096 | 5119 | ||
5097 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 5120 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
5098 | env.dst_cpu = env.new_dst_cpu; | 5121 | env.dst_cpu = env.new_dst_cpu; |
5099 | env.flags &= ~LBF_SOME_PINNED; | 5122 | env.flags &= ~LBF_SOME_PINNED; |
5100 | env.loop = 0; | 5123 | env.loop = 0; |
5101 | env.loop_break = sched_nr_migrate_break; | 5124 | env.loop_break = sched_nr_migrate_break; |
5125 | |||
5126 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
5127 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5128 | |||
5102 | /* | 5129 | /* |
5103 | * Go back to "more_balance" rather than "redo" since we | 5130 | * Go back to "more_balance" rather than "redo" since we |
5104 | * need to continue with same src_cpu. | 5131 | * need to continue with same src_cpu. |
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5219 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5246 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
5220 | return; | 5247 | return; |
5221 | 5248 | ||
5222 | update_rq_runnable_avg(this_rq, 1); | ||
5223 | |||
5224 | /* | 5249 | /* |
5225 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 5250 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
5226 | */ | 5251 | */ |
@@ -5330,7 +5355,7 @@ out_unlock: | |||
5330 | return 0; | 5355 | return 0; |
5331 | } | 5356 | } |
5332 | 5357 | ||
5333 | #ifdef CONFIG_NO_HZ | 5358 | #ifdef CONFIG_NO_HZ_COMMON |
5334 | /* | 5359 | /* |
5335 | * idle load balancing details | 5360 | * idle load balancing details |
5336 | * - When one of the busy CPUs notice that there may be an idle rebalancing | 5361 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) | |||
5395 | struct sched_domain *sd; | 5420 | struct sched_domain *sd; |
5396 | int cpu = smp_processor_id(); | 5421 | int cpu = smp_processor_id(); |
5397 | 5422 | ||
5398 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5399 | return; | ||
5400 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5401 | |||
5402 | rcu_read_lock(); | 5423 | rcu_read_lock(); |
5403 | for_each_domain(cpu, sd) | 5424 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5425 | |||
5426 | if (!sd || !sd->nohz_idle) | ||
5427 | goto unlock; | ||
5428 | sd->nohz_idle = 0; | ||
5429 | |||
5430 | for (; sd; sd = sd->parent) | ||
5404 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | 5431 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
5432 | unlock: | ||
5405 | rcu_read_unlock(); | 5433 | rcu_read_unlock(); |
5406 | } | 5434 | } |
5407 | 5435 | ||
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void) | |||
5410 | struct sched_domain *sd; | 5438 | struct sched_domain *sd; |
5411 | int cpu = smp_processor_id(); | 5439 | int cpu = smp_processor_id(); |
5412 | 5440 | ||
5413 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5414 | return; | ||
5415 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5416 | |||
5417 | rcu_read_lock(); | 5441 | rcu_read_lock(); |
5418 | for_each_domain(cpu, sd) | 5442 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5443 | |||
5444 | if (!sd || sd->nohz_idle) | ||
5445 | goto unlock; | ||
5446 | sd->nohz_idle = 1; | ||
5447 | |||
5448 | for (; sd; sd = sd->parent) | ||
5419 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | 5449 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
5450 | unlock: | ||
5420 | rcu_read_unlock(); | 5451 | rcu_read_unlock(); |
5421 | } | 5452 | } |
5422 | 5453 | ||
@@ -5468,7 +5499,7 @@ void update_max_interval(void) | |||
5468 | * It checks each scheduling domain to see if it is due to be balanced, | 5499 | * It checks each scheduling domain to see if it is due to be balanced, |
5469 | * and initiates a balancing operation if so. | 5500 | * and initiates a balancing operation if so. |
5470 | * | 5501 | * |
5471 | * Balancing parameters are set up in arch_init_sched_domains. | 5502 | * Balancing parameters are set up in init_sched_domains. |
5472 | */ | 5503 | */ |
5473 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5504 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5474 | { | 5505 | { |
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5506 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5537 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5507 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5538 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
5508 | /* | 5539 | /* |
5509 | * We've pulled tasks over so either we're no | 5540 | * The LBF_SOME_PINNED logic could have changed |
5510 | * longer idle. | 5541 | * env->dst_cpu, so we can't know our idle |
5542 | * state even if we migrated tasks. Update it. | ||
5511 | */ | 5543 | */ |
5512 | idle = CPU_NOT_IDLE; | 5544 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; |
5513 | } | 5545 | } |
5514 | sd->last_balance = jiffies; | 5546 | sd->last_balance = jiffies; |
5515 | } | 5547 | } |
@@ -5540,9 +5572,9 @@ out: | |||
5540 | rq->next_balance = next_balance; | 5572 | rq->next_balance = next_balance; |
5541 | } | 5573 | } |
5542 | 5574 | ||
5543 | #ifdef CONFIG_NO_HZ | 5575 | #ifdef CONFIG_NO_HZ_COMMON |
5544 | /* | 5576 | /* |
5545 | * In CONFIG_NO_HZ case, the idle balance kickee will do the | 5577 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the |
5546 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 5578 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
5547 | */ | 5579 | */ |
5548 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | 5580 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) |
@@ -5685,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu) | |||
5685 | if (time_after_eq(jiffies, rq->next_balance) && | 5717 | if (time_after_eq(jiffies, rq->next_balance) && |
5686 | likely(!on_null_domain(cpu))) | 5718 | likely(!on_null_domain(cpu))) |
5687 | raise_softirq(SCHED_SOFTIRQ); | 5719 | raise_softirq(SCHED_SOFTIRQ); |
5688 | #ifdef CONFIG_NO_HZ | 5720 | #ifdef CONFIG_NO_HZ_COMMON |
5689 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 5721 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) |
5690 | nohz_balancer_kick(cpu); | 5722 | nohz_balancer_kick(cpu); |
5691 | #endif | 5723 | #endif |
@@ -6155,7 +6187,7 @@ __init void init_sched_fair_class(void) | |||
6155 | #ifdef CONFIG_SMP | 6187 | #ifdef CONFIG_SMP |
6156 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | 6188 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |
6157 | 6189 | ||
6158 | #ifdef CONFIG_NO_HZ | 6190 | #ifdef CONFIG_NO_HZ_COMMON |
6159 | nohz.next_balance = jiffies; | 6191 | nohz.next_balance = jiffies; |
6160 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 6192 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
6161 | cpu_notifier(sched_ilb_notifier, 0); | 6193 | cpu_notifier(sched_ilb_notifier, 0); |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false) | |||
46 | SCHED_FEAT(LB_BIAS, true) | 46 | SCHED_FEAT(LB_BIAS, true) |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Spin-wait on mutex acquisition when the mutex owner is running on | ||
50 | * another cpu -- assumes that when the owner is running, it will soon | ||
51 | * release the lock. Decreases scheduling overhead. | ||
52 | */ | ||
53 | SCHED_FEAT(OWNER_SPIN, true) | ||
54 | |||
55 | /* | ||
56 | * Decrement CPU power based on time not spent running tasks | 49 | * Decrement CPU power based on time not spent running tasks |
57 | */ | 50 | */ |
58 | SCHED_FEAT(NONTASK_POWER, true) | 51 | SCHED_FEAT(NONTASK_POWER, true) |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae9..d8da01008d39 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | |||
13 | { | 13 | { |
14 | return task_cpu(p); /* IDLE tasks as never migrated */ | 14 | return task_cpu(p); /* IDLE tasks as never migrated */ |
15 | } | 15 | } |
16 | |||
17 | static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) | ||
18 | { | ||
19 | idle_exit_fair(rq); | ||
20 | rq_last_tick_reset(rq); | ||
21 | } | ||
22 | |||
23 | static void post_schedule_idle(struct rq *rq) | ||
24 | { | ||
25 | idle_enter_fair(rq); | ||
26 | } | ||
16 | #endif /* CONFIG_SMP */ | 27 | #endif /* CONFIG_SMP */ |
17 | /* | 28 | /* |
18 | * Idle tasks are unconditionally rescheduled: | 29 | * Idle tasks are unconditionally rescheduled: |
@@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 36 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 37 | { |
27 | schedstat_inc(rq, sched_goidle); | 38 | schedstat_inc(rq, sched_goidle); |
39 | #ifdef CONFIG_SMP | ||
40 | /* Trigger the post schedule to do an idle_enter for CFS */ | ||
41 | rq->post_schedule = 1; | ||
42 | #endif | ||
28 | return rq->idle; | 43 | return rq->idle; |
29 | } | 44 | } |
30 | 45 | ||
@@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = { | |||
86 | 101 | ||
87 | #ifdef CONFIG_SMP | 102 | #ifdef CONFIG_SMP |
88 | .select_task_rq = select_task_rq_idle, | 103 | .select_task_rq = select_task_rq_idle, |
104 | .pre_schedule = pre_schedule_idle, | ||
105 | .post_schedule = post_schedule_idle, | ||
89 | #endif | 106 | #endif |
90 | 107 | ||
91 | .set_curr_task = set_curr_task_idle, | 108 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cc03cfdf469f..ce39224d6155 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -5,8 +5,10 @@ | |||
5 | #include <linux/mutex.h> | 5 | #include <linux/mutex.h> |
6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
7 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
8 | #include <linux/tick.h> | ||
8 | 9 | ||
9 | #include "cpupri.h" | 10 | #include "cpupri.h" |
11 | #include "cpuacct.h" | ||
10 | 12 | ||
11 | extern __read_mostly int scheduler_running; | 13 | extern __read_mostly int scheduler_running; |
12 | 14 | ||
@@ -33,6 +35,31 @@ extern __read_mostly int scheduler_running; | |||
33 | */ | 35 | */ |
34 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 36 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
35 | 37 | ||
38 | /* | ||
39 | * Increase resolution of nice-level calculations for 64-bit architectures. | ||
40 | * The extra resolution improves shares distribution and load balancing of | ||
41 | * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup | ||
42 | * hierarchies, especially on larger systems. This is not a user-visible change | ||
43 | * and does not change the user-interface for setting shares/weights. | ||
44 | * | ||
45 | * We increase resolution only if we have enough bits to allow this increased | ||
46 | * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution | ||
47 | * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the | ||
48 | * increased costs. | ||
49 | */ | ||
50 | #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ | ||
51 | # define SCHED_LOAD_RESOLUTION 10 | ||
52 | # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) | ||
53 | # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) | ||
54 | #else | ||
55 | # define SCHED_LOAD_RESOLUTION 0 | ||
56 | # define scale_load(w) (w) | ||
57 | # define scale_load_down(w) (w) | ||
58 | #endif | ||
59 | |||
60 | #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) | ||
61 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) | ||
62 | |||
36 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 63 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
37 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 64 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
38 | 65 | ||
@@ -154,11 +181,6 @@ struct task_group { | |||
154 | #define MAX_SHARES (1UL << 18) | 181 | #define MAX_SHARES (1UL << 18) |
155 | #endif | 182 | #endif |
156 | 183 | ||
157 | /* Default task group. | ||
158 | * Every task in system belong to this group at bootup. | ||
159 | */ | ||
160 | extern struct task_group root_task_group; | ||
161 | |||
162 | typedef int (*tg_visitor)(struct task_group *, void *); | 184 | typedef int (*tg_visitor)(struct task_group *, void *); |
163 | 185 | ||
164 | extern int walk_tg_tree_from(struct task_group *from, | 186 | extern int walk_tg_tree_from(struct task_group *from, |
@@ -196,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
196 | struct sched_rt_entity *rt_se, int cpu, | 218 | struct sched_rt_entity *rt_se, int cpu, |
197 | struct sched_rt_entity *parent); | 219 | struct sched_rt_entity *parent); |
198 | 220 | ||
221 | extern struct task_group *sched_create_group(struct task_group *parent); | ||
222 | extern void sched_online_group(struct task_group *tg, | ||
223 | struct task_group *parent); | ||
224 | extern void sched_destroy_group(struct task_group *tg); | ||
225 | extern void sched_offline_group(struct task_group *tg); | ||
226 | |||
227 | extern void sched_move_task(struct task_struct *tsk); | ||
228 | |||
229 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
230 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
231 | #endif | ||
232 | |||
199 | #else /* CONFIG_CGROUP_SCHED */ | 233 | #else /* CONFIG_CGROUP_SCHED */ |
200 | 234 | ||
201 | struct cfs_bandwidth { }; | 235 | struct cfs_bandwidth { }; |
@@ -372,10 +406,13 @@ struct rq { | |||
372 | #define CPU_LOAD_IDX_MAX 5 | 406 | #define CPU_LOAD_IDX_MAX 5 |
373 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 407 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
374 | unsigned long last_load_update_tick; | 408 | unsigned long last_load_update_tick; |
375 | #ifdef CONFIG_NO_HZ | 409 | #ifdef CONFIG_NO_HZ_COMMON |
376 | u64 nohz_stamp; | 410 | u64 nohz_stamp; |
377 | unsigned long nohz_flags; | 411 | unsigned long nohz_flags; |
378 | #endif | 412 | #endif |
413 | #ifdef CONFIG_NO_HZ_FULL | ||
414 | unsigned long last_sched_tick; | ||
415 | #endif | ||
379 | int skip_clock_update; | 416 | int skip_clock_update; |
380 | 417 | ||
381 | /* capture load from *all* tasks on this cpu: */ | 418 | /* capture load from *all* tasks on this cpu: */ |
@@ -547,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
547 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 584 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
548 | DECLARE_PER_CPU(int, sd_llc_id); | 585 | DECLARE_PER_CPU(int, sd_llc_id); |
549 | 586 | ||
587 | struct sched_group_power { | ||
588 | atomic_t ref; | ||
589 | /* | ||
590 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | ||
591 | * single CPU. | ||
592 | */ | ||
593 | unsigned int power, power_orig; | ||
594 | unsigned long next_update; | ||
595 | /* | ||
596 | * Number of busy cpus in this group. | ||
597 | */ | ||
598 | atomic_t nr_busy_cpus; | ||
599 | |||
600 | unsigned long cpumask[0]; /* iteration mask */ | ||
601 | }; | ||
602 | |||
603 | struct sched_group { | ||
604 | struct sched_group *next; /* Must be a circular list */ | ||
605 | atomic_t ref; | ||
606 | |||
607 | unsigned int group_weight; | ||
608 | struct sched_group_power *sgp; | ||
609 | |||
610 | /* | ||
611 | * The CPUs this group covers. | ||
612 | * | ||
613 | * NOTE: this field is variable length. (Allocated dynamically | ||
614 | * by attaching extra space to the end of the structure, | ||
615 | * depending on how many CPUs the kernel has booted up with) | ||
616 | */ | ||
617 | unsigned long cpumask[0]; | ||
618 | }; | ||
619 | |||
620 | static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | ||
621 | { | ||
622 | return to_cpumask(sg->cpumask); | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * cpumask masking which cpus in the group are allowed to iterate up the domain | ||
627 | * tree. | ||
628 | */ | ||
629 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) | ||
630 | { | ||
631 | return to_cpumask(sg->sgp->cpumask); | ||
632 | } | ||
633 | |||
634 | /** | ||
635 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
636 | * @group: The group whose first cpu is to be returned. | ||
637 | */ | ||
638 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
639 | { | ||
640 | return cpumask_first(sched_group_cpus(group)); | ||
641 | } | ||
642 | |||
550 | extern int group_balance_cpu(struct sched_group *sg); | 643 | extern int group_balance_cpu(struct sched_group *sg); |
551 | 644 | ||
552 | #endif /* CONFIG_SMP */ | 645 | #endif /* CONFIG_SMP */ |
@@ -784,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
784 | } | 877 | } |
785 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 878 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
786 | 879 | ||
880 | /* | ||
881 | * wake flags | ||
882 | */ | ||
883 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ | ||
884 | #define WF_FORK 0x02 /* child wakeup after fork */ | ||
885 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | ||
787 | 886 | ||
788 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 887 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
789 | { | 888 | { |
@@ -856,14 +955,61 @@ static const u32 prio_to_wmult[40] = { | |||
856 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 955 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
857 | }; | 956 | }; |
858 | 957 | ||
859 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 958 | #define ENQUEUE_WAKEUP 1 |
860 | enum cpuacct_stat_index { | 959 | #define ENQUEUE_HEAD 2 |
861 | CPUACCT_STAT_USER, /* ... user mode */ | 960 | #ifdef CONFIG_SMP |
862 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | 961 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ |
962 | #else | ||
963 | #define ENQUEUE_WAKING 0 | ||
964 | #endif | ||
863 | 965 | ||
864 | CPUACCT_STAT_NSTATS, | 966 | #define DEQUEUE_SLEEP 1 |
865 | }; | ||
866 | 967 | ||
968 | struct sched_class { | ||
969 | const struct sched_class *next; | ||
970 | |||
971 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | ||
972 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | ||
973 | void (*yield_task) (struct rq *rq); | ||
974 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | ||
975 | |||
976 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | ||
977 | |||
978 | struct task_struct * (*pick_next_task) (struct rq *rq); | ||
979 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | ||
980 | |||
981 | #ifdef CONFIG_SMP | ||
982 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); | ||
983 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | ||
984 | |||
985 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | ||
986 | void (*post_schedule) (struct rq *this_rq); | ||
987 | void (*task_waking) (struct task_struct *task); | ||
988 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | ||
989 | |||
990 | void (*set_cpus_allowed)(struct task_struct *p, | ||
991 | const struct cpumask *newmask); | ||
992 | |||
993 | void (*rq_online)(struct rq *rq); | ||
994 | void (*rq_offline)(struct rq *rq); | ||
995 | #endif | ||
996 | |||
997 | void (*set_curr_task) (struct rq *rq); | ||
998 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | ||
999 | void (*task_fork) (struct task_struct *p); | ||
1000 | |||
1001 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | ||
1002 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | ||
1003 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | ||
1004 | int oldprio); | ||
1005 | |||
1006 | unsigned int (*get_rr_interval) (struct rq *rq, | ||
1007 | struct task_struct *task); | ||
1008 | |||
1009 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1010 | void (*task_move_group) (struct task_struct *p, int on_rq); | ||
1011 | #endif | ||
1012 | }; | ||
867 | 1013 | ||
868 | #define sched_class_highest (&stop_sched_class) | 1014 | #define sched_class_highest (&stop_sched_class) |
869 | #define for_each_class(class) \ | 1015 | #define for_each_class(class) \ |
@@ -877,9 +1023,23 @@ extern const struct sched_class idle_sched_class; | |||
877 | 1023 | ||
878 | #ifdef CONFIG_SMP | 1024 | #ifdef CONFIG_SMP |
879 | 1025 | ||
1026 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
1027 | |||
880 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1028 | extern void trigger_load_balance(struct rq *rq, int cpu); |
881 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1029 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
882 | 1030 | ||
1031 | /* | ||
1032 | * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg | ||
1033 | * becomes useful in lb | ||
1034 | */ | ||
1035 | #if defined(CONFIG_FAIR_GROUP_SCHED) | ||
1036 | extern void idle_enter_fair(struct rq *this_rq); | ||
1037 | extern void idle_exit_fair(struct rq *this_rq); | ||
1038 | #else | ||
1039 | static inline void idle_enter_fair(struct rq *this_rq) {} | ||
1040 | static inline void idle_exit_fair(struct rq *this_rq) {} | ||
1041 | #endif | ||
1042 | |||
883 | #else /* CONFIG_SMP */ | 1043 | #else /* CONFIG_SMP */ |
884 | 1044 | ||
885 | static inline void idle_balance(int cpu, struct rq *rq) | 1045 | static inline void idle_balance(int cpu, struct rq *rq) |
@@ -891,7 +1051,6 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
891 | extern void sysrq_sched_debug_show(void); | 1051 | extern void sysrq_sched_debug_show(void); |
892 | extern void sched_init_granularity(void); | 1052 | extern void sched_init_granularity(void); |
893 | extern void update_max_interval(void); | 1053 | extern void update_max_interval(void); |
894 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
895 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | 1054 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); |
896 | extern void init_sched_rt_class(void); | 1055 | extern void init_sched_rt_class(void); |
897 | extern void init_sched_fair_class(void); | 1056 | extern void init_sched_fair_class(void); |
@@ -904,45 +1063,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime | |||
904 | 1063 | ||
905 | extern void update_idle_cpu_load(struct rq *this_rq); | 1064 | extern void update_idle_cpu_load(struct rq *this_rq); |
906 | 1065 | ||
907 | #ifdef CONFIG_CGROUP_CPUACCT | ||
908 | #include <linux/cgroup.h> | ||
909 | /* track cpu usage of a group of tasks and its child groups */ | ||
910 | struct cpuacct { | ||
911 | struct cgroup_subsys_state css; | ||
912 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
913 | u64 __percpu *cpuusage; | ||
914 | struct kernel_cpustat __percpu *cpustat; | ||
915 | }; | ||
916 | |||
917 | extern struct cgroup_subsys cpuacct_subsys; | ||
918 | extern struct cpuacct root_cpuacct; | ||
919 | |||
920 | /* return cpu accounting group corresponding to this container */ | ||
921 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
922 | { | ||
923 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
924 | struct cpuacct, css); | ||
925 | } | ||
926 | |||
927 | /* return cpu accounting group to which this task belongs */ | ||
928 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
929 | { | ||
930 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
931 | struct cpuacct, css); | ||
932 | } | ||
933 | |||
934 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
935 | { | ||
936 | if (!ca || !ca->css.cgroup->parent) | ||
937 | return NULL; | ||
938 | return cgroup_ca(ca->css.cgroup->parent); | ||
939 | } | ||
940 | |||
941 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
942 | #else | ||
943 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
944 | #endif | ||
945 | |||
946 | #ifdef CONFIG_PARAVIRT | 1066 | #ifdef CONFIG_PARAVIRT |
947 | static inline u64 steal_ticks(u64 steal) | 1067 | static inline u64 steal_ticks(u64 steal) |
948 | { | 1068 | { |
@@ -956,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal) | |||
956 | static inline void inc_nr_running(struct rq *rq) | 1076 | static inline void inc_nr_running(struct rq *rq) |
957 | { | 1077 | { |
958 | rq->nr_running++; | 1078 | rq->nr_running++; |
1079 | |||
1080 | #ifdef CONFIG_NO_HZ_FULL | ||
1081 | if (rq->nr_running == 2) { | ||
1082 | if (tick_nohz_full_cpu(rq->cpu)) { | ||
1083 | /* Order rq->nr_running write against the IPI */ | ||
1084 | smp_wmb(); | ||
1085 | smp_send_reschedule(rq->cpu); | ||
1086 | } | ||
1087 | } | ||
1088 | #endif | ||
959 | } | 1089 | } |
960 | 1090 | ||
961 | static inline void dec_nr_running(struct rq *rq) | 1091 | static inline void dec_nr_running(struct rq *rq) |
@@ -963,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq) | |||
963 | rq->nr_running--; | 1093 | rq->nr_running--; |
964 | } | 1094 | } |
965 | 1095 | ||
1096 | static inline void rq_last_tick_reset(struct rq *rq) | ||
1097 | { | ||
1098 | #ifdef CONFIG_NO_HZ_FULL | ||
1099 | rq->last_sched_tick = jiffies; | ||
1100 | #endif | ||
1101 | } | ||
1102 | |||
966 | extern void update_rq_clock(struct rq *rq); | 1103 | extern void update_rq_clock(struct rq *rq); |
967 | 1104 | ||
968 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | 1105 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); |
@@ -1183,11 +1320,10 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | |||
1183 | 1320 | ||
1184 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | 1321 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); |
1185 | 1322 | ||
1186 | #ifdef CONFIG_NO_HZ | 1323 | #ifdef CONFIG_NO_HZ_COMMON |
1187 | enum rq_nohz_flag_bits { | 1324 | enum rq_nohz_flag_bits { |
1188 | NOHZ_TICK_STOPPED, | 1325 | NOHZ_TICK_STOPPED, |
1189 | NOHZ_BALANCE_KICK, | 1326 | NOHZ_BALANCE_KICK, |
1190 | NOHZ_IDLE, | ||
1191 | }; | 1327 | }; |
1192 | 1328 | ||
1193 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1329 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index e036eda1a9c9..da98af347e8b 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
@@ -130,16 +130,11 @@ static int schedstat_open(struct inode *inode, struct file *file) | |||
130 | return seq_open(file, &schedstat_sops); | 130 | return seq_open(file, &schedstat_sops); |
131 | } | 131 | } |
132 | 132 | ||
133 | static int schedstat_release(struct inode *inode, struct file *file) | ||
134 | { | ||
135 | return 0; | ||
136 | }; | ||
137 | |||
138 | static const struct file_operations proc_schedstat_operations = { | 133 | static const struct file_operations proc_schedstat_operations = { |
139 | .open = schedstat_open, | 134 | .open = schedstat_open, |
140 | .read = seq_read, | 135 | .read = seq_read, |
141 | .llseek = seq_lseek, | 136 | .llseek = seq_lseek, |
142 | .release = schedstat_release, | 137 | .release = seq_release, |
143 | }; | 138 | }; |
144 | 139 | ||
145 | static int __init proc_schedstat_init(void) | 140 | static int __init proc_schedstat_init(void) |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5af44b593770..b7a10048a32c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
160 | case BPF_S_ALU_AND_X: | 160 | case BPF_S_ALU_AND_X: |
161 | case BPF_S_ALU_OR_K: | 161 | case BPF_S_ALU_OR_K: |
162 | case BPF_S_ALU_OR_X: | 162 | case BPF_S_ALU_OR_X: |
163 | case BPF_S_ALU_XOR_K: | ||
164 | case BPF_S_ALU_XOR_X: | ||
163 | case BPF_S_ALU_LSH_K: | 165 | case BPF_S_ALU_LSH_K: |
164 | case BPF_S_ALU_LSH_X: | 166 | case BPF_S_ALU_LSH_X: |
165 | case BPF_S_ALU_RSH_K: | 167 | case BPF_S_ALU_RSH_K: |
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 4567fc020fe3..6815171a4fff 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(up); | |||
193 | struct semaphore_waiter { | 193 | struct semaphore_waiter { |
194 | struct list_head list; | 194 | struct list_head list; |
195 | struct task_struct *task; | 195 | struct task_struct *task; |
196 | int up; | 196 | bool up; |
197 | }; | 197 | }; |
198 | 198 | ||
199 | /* | 199 | /* |
@@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state, | |||
209 | 209 | ||
210 | list_add_tail(&waiter.list, &sem->wait_list); | 210 | list_add_tail(&waiter.list, &sem->wait_list); |
211 | waiter.task = task; | 211 | waiter.task = task; |
212 | waiter.up = 0; | 212 | waiter.up = false; |
213 | 213 | ||
214 | for (;;) { | 214 | for (;;) { |
215 | if (signal_pending_state(state, task)) | 215 | if (signal_pending_state(state, task)) |
216 | goto interrupted; | 216 | goto interrupted; |
217 | if (timeout <= 0) | 217 | if (unlikely(timeout <= 0)) |
218 | goto timed_out; | 218 | goto timed_out; |
219 | __set_task_state(task, state); | 219 | __set_task_state(task, state); |
220 | raw_spin_unlock_irq(&sem->lock); | 220 | raw_spin_unlock_irq(&sem->lock); |
@@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem) | |||
258 | struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, | 258 | struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, |
259 | struct semaphore_waiter, list); | 259 | struct semaphore_waiter, list); |
260 | list_del(&waiter->list); | 260 | list_del(&waiter->list); |
261 | waiter->up = 1; | 261 | waiter->up = true; |
262 | wake_up_process(waiter->task); | 262 | wake_up_process(waiter->task); |
263 | } | 263 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index dd72567767d9..113411bfe8b1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/user_namespace.h> | 32 | #include <linux/user_namespace.h> |
33 | #include <linux/uprobes.h> | 33 | #include <linux/uprobes.h> |
34 | #include <linux/compat.h> | 34 | #include <linux/compat.h> |
35 | #include <linux/cn_proc.h> | ||
35 | #define CREATE_TRACE_POINTS | 36 | #define CREATE_TRACE_POINTS |
36 | #include <trace/events/signal.h> | 37 | #include <trace/events/signal.h> |
37 | 38 | ||
@@ -854,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t) | |||
854 | * Returns true if the signal should be actually delivered, otherwise | 855 | * Returns true if the signal should be actually delivered, otherwise |
855 | * it should be dropped. | 856 | * it should be dropped. |
856 | */ | 857 | */ |
857 | static int prepare_signal(int sig, struct task_struct *p, bool force) | 858 | static bool prepare_signal(int sig, struct task_struct *p, bool force) |
858 | { | 859 | { |
859 | struct signal_struct *signal = p->signal; | 860 | struct signal_struct *signal = p->signal; |
860 | struct task_struct *t; | 861 | struct task_struct *t; |
861 | 862 | ||
862 | if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { | 863 | if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { |
864 | if (signal->flags & SIGNAL_GROUP_COREDUMP) | ||
865 | return sig == SIGKILL; | ||
863 | /* | 866 | /* |
864 | * The process is in the middle of dying, nothing to do. | 867 | * The process is in the middle of dying, nothing to do. |
865 | */ | 868 | */ |
@@ -1160,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1160 | static void print_fatal_signal(int signr) | 1163 | static void print_fatal_signal(int signr) |
1161 | { | 1164 | { |
1162 | struct pt_regs *regs = signal_pt_regs(); | 1165 | struct pt_regs *regs = signal_pt_regs(); |
1163 | printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", | 1166 | printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr); |
1164 | current->comm, task_pid_nr(current), signr); | ||
1165 | 1167 | ||
1166 | #if defined(__i386__) && !defined(__arch_um__) | 1168 | #if defined(__i386__) && !defined(__arch_um__) |
1167 | printk(KERN_INFO "code at %08lx: ", regs->ip); | 1169 | printk(KERN_INFO "code at %08lx: ", regs->ip); |
@@ -2350,6 +2352,7 @@ relock: | |||
2350 | if (sig_kernel_coredump(signr)) { | 2352 | if (sig_kernel_coredump(signr)) { |
2351 | if (print_fatal_signals) | 2353 | if (print_fatal_signals) |
2352 | print_fatal_signal(info->si_signo); | 2354 | print_fatal_signal(info->si_signo); |
2355 | proc_coredump_connector(current); | ||
2353 | /* | 2356 | /* |
2354 | * If it was able to dump core, this kills all | 2357 | * If it was able to dump core, this kills all |
2355 | * other threads in the group and synchronizes with | 2358 | * other threads in the group and synchronizes with |
@@ -2948,7 +2951,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) | |||
2948 | 2951 | ||
2949 | static int do_tkill(pid_t tgid, pid_t pid, int sig) | 2952 | static int do_tkill(pid_t tgid, pid_t pid, int sig) |
2950 | { | 2953 | { |
2951 | struct siginfo info; | 2954 | struct siginfo info = {}; |
2952 | 2955 | ||
2953 | info.si_signo = sig; | 2956 | info.si_signo = sig; |
2954 | info.si_errno = 0; | 2957 | info.si_errno = 0; |
diff --git a/kernel/smp.c b/kernel/smp.c index 8e451f3ff51b..4dba0f7b72ad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -100,16 +100,16 @@ void __init call_function_init(void) | |||
100 | * previous function call. For multi-cpu calls its even more interesting | 100 | * previous function call. For multi-cpu calls its even more interesting |
101 | * as we'll have to ensure no other cpu is observing our csd. | 101 | * as we'll have to ensure no other cpu is observing our csd. |
102 | */ | 102 | */ |
103 | static void csd_lock_wait(struct call_single_data *data) | 103 | static void csd_lock_wait(struct call_single_data *csd) |
104 | { | 104 | { |
105 | while (data->flags & CSD_FLAG_LOCK) | 105 | while (csd->flags & CSD_FLAG_LOCK) |
106 | cpu_relax(); | 106 | cpu_relax(); |
107 | } | 107 | } |
108 | 108 | ||
109 | static void csd_lock(struct call_single_data *data) | 109 | static void csd_lock(struct call_single_data *csd) |
110 | { | 110 | { |
111 | csd_lock_wait(data); | 111 | csd_lock_wait(csd); |
112 | data->flags = CSD_FLAG_LOCK; | 112 | csd->flags |= CSD_FLAG_LOCK; |
113 | 113 | ||
114 | /* | 114 | /* |
115 | * prevent CPU from reordering the above assignment | 115 | * prevent CPU from reordering the above assignment |
@@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data) | |||
119 | smp_mb(); | 119 | smp_mb(); |
120 | } | 120 | } |
121 | 121 | ||
122 | static void csd_unlock(struct call_single_data *data) | 122 | static void csd_unlock(struct call_single_data *csd) |
123 | { | 123 | { |
124 | WARN_ON(!(data->flags & CSD_FLAG_LOCK)); | 124 | WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * ensure we're all done before releasing data: | 127 | * ensure we're all done before releasing data: |
128 | */ | 128 | */ |
129 | smp_mb(); | 129 | smp_mb(); |
130 | 130 | ||
131 | data->flags &= ~CSD_FLAG_LOCK; | 131 | csd->flags &= ~CSD_FLAG_LOCK; |
132 | } | 132 | } |
133 | 133 | ||
134 | /* | 134 | /* |
@@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data) | |||
137 | * ->func, ->info, and ->flags set. | 137 | * ->func, ->info, and ->flags set. |
138 | */ | 138 | */ |
139 | static | 139 | static |
140 | void generic_exec_single(int cpu, struct call_single_data *data, int wait) | 140 | void generic_exec_single(int cpu, struct call_single_data *csd, int wait) |
141 | { | 141 | { |
142 | struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); | 142 | struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); |
143 | unsigned long flags; | 143 | unsigned long flags; |
@@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) | |||
145 | 145 | ||
146 | raw_spin_lock_irqsave(&dst->lock, flags); | 146 | raw_spin_lock_irqsave(&dst->lock, flags); |
147 | ipi = list_empty(&dst->list); | 147 | ipi = list_empty(&dst->list); |
148 | list_add_tail(&data->list, &dst->list); | 148 | list_add_tail(&csd->list, &dst->list); |
149 | raw_spin_unlock_irqrestore(&dst->lock, flags); | 149 | raw_spin_unlock_irqrestore(&dst->lock, flags); |
150 | 150 | ||
151 | /* | 151 | /* |
@@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) | |||
163 | arch_send_call_function_single_ipi(cpu); | 163 | arch_send_call_function_single_ipi(cpu); |
164 | 164 | ||
165 | if (wait) | 165 | if (wait) |
166 | csd_lock_wait(data); | 166 | csd_lock_wait(csd); |
167 | } | 167 | } |
168 | 168 | ||
169 | /* | 169 | /* |
@@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) | |||
173 | void generic_smp_call_function_single_interrupt(void) | 173 | void generic_smp_call_function_single_interrupt(void) |
174 | { | 174 | { |
175 | struct call_single_queue *q = &__get_cpu_var(call_single_queue); | 175 | struct call_single_queue *q = &__get_cpu_var(call_single_queue); |
176 | unsigned int data_flags; | ||
177 | LIST_HEAD(list); | 176 | LIST_HEAD(list); |
178 | 177 | ||
179 | /* | 178 | /* |
@@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void) | |||
186 | raw_spin_unlock(&q->lock); | 185 | raw_spin_unlock(&q->lock); |
187 | 186 | ||
188 | while (!list_empty(&list)) { | 187 | while (!list_empty(&list)) { |
189 | struct call_single_data *data; | 188 | struct call_single_data *csd; |
189 | unsigned int csd_flags; | ||
190 | 190 | ||
191 | data = list_entry(list.next, struct call_single_data, list); | 191 | csd = list_entry(list.next, struct call_single_data, list); |
192 | list_del(&data->list); | 192 | list_del(&csd->list); |
193 | 193 | ||
194 | /* | 194 | /* |
195 | * 'data' can be invalid after this call if flags == 0 | 195 | * 'csd' can be invalid after this call if flags == 0 |
196 | * (when called through generic_exec_single()), | 196 | * (when called through generic_exec_single()), |
197 | * so save them away before making the call: | 197 | * so save them away before making the call: |
198 | */ | 198 | */ |
199 | data_flags = data->flags; | 199 | csd_flags = csd->flags; |
200 | 200 | ||
201 | data->func(data->info); | 201 | csd->func(csd->info); |
202 | 202 | ||
203 | /* | 203 | /* |
204 | * Unlocked CSDs are valid through generic_exec_single(): | 204 | * Unlocked CSDs are valid through generic_exec_single(): |
205 | */ | 205 | */ |
206 | if (data_flags & CSD_FLAG_LOCK) | 206 | if (csd_flags & CSD_FLAG_LOCK) |
207 | csd_unlock(data); | 207 | csd_unlock(csd); |
208 | } | 208 | } |
209 | } | 209 | } |
210 | 210 | ||
@@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, | |||
249 | local_irq_restore(flags); | 249 | local_irq_restore(flags); |
250 | } else { | 250 | } else { |
251 | if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { | 251 | if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { |
252 | struct call_single_data *data = &d; | 252 | struct call_single_data *csd = &d; |
253 | 253 | ||
254 | if (!wait) | 254 | if (!wait) |
255 | data = &__get_cpu_var(csd_data); | 255 | csd = &__get_cpu_var(csd_data); |
256 | 256 | ||
257 | csd_lock(data); | 257 | csd_lock(csd); |
258 | 258 | ||
259 | data->func = func; | 259 | csd->func = func; |
260 | data->info = info; | 260 | csd->info = info; |
261 | generic_exec_single(cpu, data, wait); | 261 | generic_exec_single(cpu, csd, wait); |
262 | } else { | 262 | } else { |
263 | err = -ENXIO; /* CPU not online */ | 263 | err = -ENXIO; /* CPU not online */ |
264 | } | 264 | } |
@@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); | |||
325 | * pre-allocated data structure. Useful for embedding @data inside | 325 | * pre-allocated data structure. Useful for embedding @data inside |
326 | * other structures, for instance. | 326 | * other structures, for instance. |
327 | */ | 327 | */ |
328 | void __smp_call_function_single(int cpu, struct call_single_data *data, | 328 | void __smp_call_function_single(int cpu, struct call_single_data *csd, |
329 | int wait) | 329 | int wait) |
330 | { | 330 | { |
331 | unsigned int this_cpu; | 331 | unsigned int this_cpu; |
@@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
343 | 343 | ||
344 | if (cpu == this_cpu) { | 344 | if (cpu == this_cpu) { |
345 | local_irq_save(flags); | 345 | local_irq_save(flags); |
346 | data->func(data->info); | 346 | csd->func(csd->info); |
347 | local_irq_restore(flags); | 347 | local_irq_restore(flags); |
348 | } else { | 348 | } else { |
349 | csd_lock(data); | 349 | csd_lock(csd); |
350 | generic_exec_single(cpu, data, wait); | 350 | generic_exec_single(cpu, csd, wait); |
351 | } | 351 | } |
352 | put_cpu(); | 352 | put_cpu(); |
353 | } | 353 | } |
@@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
369 | void smp_call_function_many(const struct cpumask *mask, | 369 | void smp_call_function_many(const struct cpumask *mask, |
370 | smp_call_func_t func, void *info, bool wait) | 370 | smp_call_func_t func, void *info, bool wait) |
371 | { | 371 | { |
372 | struct call_function_data *data; | 372 | struct call_function_data *cfd; |
373 | int cpu, next_cpu, this_cpu = smp_processor_id(); | 373 | int cpu, next_cpu, this_cpu = smp_processor_id(); |
374 | 374 | ||
375 | /* | 375 | /* |
@@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask, | |||
401 | return; | 401 | return; |
402 | } | 402 | } |
403 | 403 | ||
404 | data = &__get_cpu_var(cfd_data); | 404 | cfd = &__get_cpu_var(cfd_data); |
405 | 405 | ||
406 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 406 | cpumask_and(cfd->cpumask, mask, cpu_online_mask); |
407 | cpumask_clear_cpu(this_cpu, data->cpumask); | 407 | cpumask_clear_cpu(this_cpu, cfd->cpumask); |
408 | 408 | ||
409 | /* Some callers race with other cpus changing the passed mask */ | 409 | /* Some callers race with other cpus changing the passed mask */ |
410 | if (unlikely(!cpumask_weight(data->cpumask))) | 410 | if (unlikely(!cpumask_weight(cfd->cpumask))) |
411 | return; | 411 | return; |
412 | 412 | ||
413 | /* | 413 | /* |
414 | * After we put an entry into the list, data->cpumask | 414 | * After we put an entry into the list, cfd->cpumask may be cleared |
415 | * may be cleared again when another CPU sends another IPI for | 415 | * again when another CPU sends another IPI for a SMP function call, so |
416 | * a SMP function call, so data->cpumask will be zero. | 416 | * cfd->cpumask will be zero. |
417 | */ | 417 | */ |
418 | cpumask_copy(data->cpumask_ipi, data->cpumask); | 418 | cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); |
419 | 419 | ||
420 | for_each_cpu(cpu, data->cpumask) { | 420 | for_each_cpu(cpu, cfd->cpumask) { |
421 | struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); | 421 | struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); |
422 | struct call_single_queue *dst = | 422 | struct call_single_queue *dst = |
423 | &per_cpu(call_single_queue, cpu); | 423 | &per_cpu(call_single_queue, cpu); |
424 | unsigned long flags; | 424 | unsigned long flags; |
@@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask, | |||
433 | } | 433 | } |
434 | 434 | ||
435 | /* Send a message to all CPUs in the map */ | 435 | /* Send a message to all CPUs in the map */ |
436 | arch_send_call_function_ipi_mask(data->cpumask_ipi); | 436 | arch_send_call_function_ipi_mask(cfd->cpumask_ipi); |
437 | 437 | ||
438 | if (wait) { | 438 | if (wait) { |
439 | for_each_cpu(cpu, data->cpumask) { | 439 | for_each_cpu(cpu, cfd->cpumask) { |
440 | struct call_single_data *csd = | 440 | struct call_single_data *csd; |
441 | per_cpu_ptr(data->csd, cpu); | 441 | |
442 | csd = per_cpu_ptr(cfd->csd, cpu); | ||
442 | csd_lock_wait(csd); | 443 | csd_lock_wait(csd); |
443 | } | 444 | } |
444 | } | 445 | } |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 8eaed9aa9cf0..02fc5c933673 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | |||
185 | } | 185 | } |
186 | get_task_struct(tsk); | 186 | get_task_struct(tsk); |
187 | *per_cpu_ptr(ht->store, cpu) = tsk; | 187 | *per_cpu_ptr(ht->store, cpu) = tsk; |
188 | if (ht->create) | 188 | if (ht->create) { |
189 | ht->create(cpu); | 189 | /* |
190 | * Make sure that the task has actually scheduled out | ||
191 | * into park position, before calling the create | ||
192 | * callback. At least the migration thread callback | ||
193 | * requires that the task is off the runqueue. | ||
194 | */ | ||
195 | if (!wait_task_inactive(tsk, TASK_PARKED)) | ||
196 | WARN_ON(1); | ||
197 | else | ||
198 | ht->create(cpu); | ||
199 | } | ||
190 | return 0; | 200 | return 0; |
191 | } | 201 | } |
192 | 202 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index 14d7758074aa..b5197dcb0dad 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void) | |||
329 | wakeup_softirqd(); | 329 | wakeup_softirqd(); |
330 | } | 330 | } |
331 | 331 | ||
332 | static inline void tick_irq_exit(void) | ||
333 | { | ||
334 | #ifdef CONFIG_NO_HZ_COMMON | ||
335 | int cpu = smp_processor_id(); | ||
336 | |||
337 | /* Make sure that timer wheel updates are propagated */ | ||
338 | if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { | ||
339 | if (!in_interrupt()) | ||
340 | tick_nohz_irq_exit(); | ||
341 | } | ||
342 | #endif | ||
343 | } | ||
344 | |||
332 | /* | 345 | /* |
333 | * Exit an interrupt context. Process softirqs if needed and possible: | 346 | * Exit an interrupt context. Process softirqs if needed and possible: |
334 | */ | 347 | */ |
@@ -346,11 +359,7 @@ void irq_exit(void) | |||
346 | if (!in_interrupt() && local_softirq_pending()) | 359 | if (!in_interrupt() && local_softirq_pending()) |
347 | invoke_softirq(); | 360 | invoke_softirq(); |
348 | 361 | ||
349 | #ifdef CONFIG_NO_HZ | 362 | tick_irq_exit(); |
350 | /* Make sure that timer wheel updates are propagated */ | ||
351 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) | ||
352 | tick_nohz_irq_exit(); | ||
353 | #endif | ||
354 | rcu_irq_exit(); | 363 | rcu_irq_exit(); |
355 | } | 364 | } |
356 | 365 | ||
@@ -620,8 +629,7 @@ static void remote_softirq_receive(void *data) | |||
620 | unsigned long flags; | 629 | unsigned long flags; |
621 | int softirq; | 630 | int softirq; |
622 | 631 | ||
623 | softirq = cp->priv; | 632 | softirq = *(int *)cp->info; |
624 | |||
625 | local_irq_save(flags); | 633 | local_irq_save(flags); |
626 | __local_trigger(cp, softirq); | 634 | __local_trigger(cp, softirq); |
627 | local_irq_restore(flags); | 635 | local_irq_restore(flags); |
@@ -631,9 +639,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir | |||
631 | { | 639 | { |
632 | if (cpu_online(cpu)) { | 640 | if (cpu_online(cpu)) { |
633 | cp->func = remote_softirq_receive; | 641 | cp->func = remote_softirq_receive; |
634 | cp->info = cp; | 642 | cp->info = &softirq; |
635 | cp->flags = 0; | 643 | cp->flags = 0; |
636 | cp->priv = softirq; | ||
637 | 644 | ||
638 | __smp_call_function_single(cpu, cp, 0); | 645 | __smp_call_function_single(cpu, cp, 0); |
639 | return 0; | 646 | return 0; |
diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba9..b95d3c72ba21 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -49,6 +49,11 @@ | |||
49 | #include <linux/user_namespace.h> | 49 | #include <linux/user_namespace.h> |
50 | #include <linux/binfmts.h> | 50 | #include <linux/binfmts.h> |
51 | 51 | ||
52 | #include <linux/sched.h> | ||
53 | #include <linux/rcupdate.h> | ||
54 | #include <linux/uidgid.h> | ||
55 | #include <linux/cred.h> | ||
56 | |||
52 | #include <linux/kmsg_dump.h> | 57 | #include <linux/kmsg_dump.h> |
53 | /* Move somewhere else to avoid recompiling? */ | 58 | /* Move somewhere else to avoid recompiling? */ |
54 | #include <generated/utsrelease.h> | 59 | #include <generated/utsrelease.h> |
@@ -324,7 +329,6 @@ void kernel_restart_prepare(char *cmd) | |||
324 | system_state = SYSTEM_RESTART; | 329 | system_state = SYSTEM_RESTART; |
325 | usermodehelper_disable(); | 330 | usermodehelper_disable(); |
326 | device_shutdown(); | 331 | device_shutdown(); |
327 | syscore_shutdown(); | ||
328 | } | 332 | } |
329 | 333 | ||
330 | /** | 334 | /** |
@@ -370,6 +374,7 @@ void kernel_restart(char *cmd) | |||
370 | { | 374 | { |
371 | kernel_restart_prepare(cmd); | 375 | kernel_restart_prepare(cmd); |
372 | disable_nonboot_cpus(); | 376 | disable_nonboot_cpus(); |
377 | syscore_shutdown(); | ||
373 | if (!cmd) | 378 | if (!cmd) |
374 | printk(KERN_EMERG "Restarting system.\n"); | 379 | printk(KERN_EMERG "Restarting system.\n"); |
375 | else | 380 | else |
@@ -395,6 +400,7 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
395 | void kernel_halt(void) | 400 | void kernel_halt(void) |
396 | { | 401 | { |
397 | kernel_shutdown_prepare(SYSTEM_HALT); | 402 | kernel_shutdown_prepare(SYSTEM_HALT); |
403 | disable_nonboot_cpus(); | ||
398 | syscore_shutdown(); | 404 | syscore_shutdown(); |
399 | printk(KERN_EMERG "System halted.\n"); | 405 | printk(KERN_EMERG "System halted.\n"); |
400 | kmsg_dump(KMSG_DUMP_HALT); | 406 | kmsg_dump(KMSG_DUMP_HALT); |
@@ -1043,6 +1049,67 @@ change_okay: | |||
1043 | return old_fsgid; | 1049 | return old_fsgid; |
1044 | } | 1050 | } |
1045 | 1051 | ||
1052 | /** | ||
1053 | * sys_getpid - return the thread group id of the current process | ||
1054 | * | ||
1055 | * Note, despite the name, this returns the tgid not the pid. The tgid and | ||
1056 | * the pid are identical unless CLONE_THREAD was specified on clone() in | ||
1057 | * which case the tgid is the same in all threads of the same group. | ||
1058 | * | ||
1059 | * This is SMP safe as current->tgid does not change. | ||
1060 | */ | ||
1061 | SYSCALL_DEFINE0(getpid) | ||
1062 | { | ||
1063 | return task_tgid_vnr(current); | ||
1064 | } | ||
1065 | |||
1066 | /* Thread ID - the internal kernel "pid" */ | ||
1067 | SYSCALL_DEFINE0(gettid) | ||
1068 | { | ||
1069 | return task_pid_vnr(current); | ||
1070 | } | ||
1071 | |||
1072 | /* | ||
1073 | * Accessing ->real_parent is not SMP-safe, it could | ||
1074 | * change from under us. However, we can use a stale | ||
1075 | * value of ->real_parent under rcu_read_lock(), see | ||
1076 | * release_task()->call_rcu(delayed_put_task_struct). | ||
1077 | */ | ||
1078 | SYSCALL_DEFINE0(getppid) | ||
1079 | { | ||
1080 | int pid; | ||
1081 | |||
1082 | rcu_read_lock(); | ||
1083 | pid = task_tgid_vnr(rcu_dereference(current->real_parent)); | ||
1084 | rcu_read_unlock(); | ||
1085 | |||
1086 | return pid; | ||
1087 | } | ||
1088 | |||
1089 | SYSCALL_DEFINE0(getuid) | ||
1090 | { | ||
1091 | /* Only we change this so SMP safe */ | ||
1092 | return from_kuid_munged(current_user_ns(), current_uid()); | ||
1093 | } | ||
1094 | |||
1095 | SYSCALL_DEFINE0(geteuid) | ||
1096 | { | ||
1097 | /* Only we change this so SMP safe */ | ||
1098 | return from_kuid_munged(current_user_ns(), current_euid()); | ||
1099 | } | ||
1100 | |||
1101 | SYSCALL_DEFINE0(getgid) | ||
1102 | { | ||
1103 | /* Only we change this so SMP safe */ | ||
1104 | return from_kgid_munged(current_user_ns(), current_gid()); | ||
1105 | } | ||
1106 | |||
1107 | SYSCALL_DEFINE0(getegid) | ||
1108 | { | ||
1109 | /* Only we change this so SMP safe */ | ||
1110 | return from_kgid_munged(current_user_ns(), current_egid()); | ||
1111 | } | ||
1112 | |||
1046 | void do_sys_times(struct tms *tms) | 1113 | void do_sys_times(struct tms *tms) |
1047 | { | 1114 | { |
1048 | cputime_t tgutime, tgstime, cutime, cstime; | 1115 | cputime_t tgutime, tgstime, cutime, cstime; |
@@ -1784,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) | |||
1784 | return getrusage(current, who, ru); | 1851 | return getrusage(current, who, ru); |
1785 | } | 1852 | } |
1786 | 1853 | ||
1854 | #ifdef CONFIG_COMPAT | ||
1855 | COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) | ||
1856 | { | ||
1857 | struct rusage r; | ||
1858 | |||
1859 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && | ||
1860 | who != RUSAGE_THREAD) | ||
1861 | return -EINVAL; | ||
1862 | |||
1863 | k_getrusage(current, who, &r); | ||
1864 | return put_compat_rusage(&r, ru); | ||
1865 | } | ||
1866 | #endif | ||
1867 | |||
1787 | SYSCALL_DEFINE1(umask, int, mask) | 1868 | SYSCALL_DEFINE1(umask, int, mask) |
1788 | { | 1869 | { |
1789 | mask = xchg(&current->fs->umask, mask & S_IRWXUGO); | 1870 | mask = xchg(&current->fs->umask, mask & S_IRWXUGO); |
1790 | return mask; | 1871 | return mask; |
1791 | } | 1872 | } |
1792 | 1873 | ||
1793 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1794 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1874 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
1795 | { | 1875 | { |
1796 | struct fd exe; | 1876 | struct fd exe; |
@@ -1984,17 +2064,12 @@ out: | |||
1984 | return error; | 2064 | return error; |
1985 | } | 2065 | } |
1986 | 2066 | ||
2067 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1987 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | 2068 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) |
1988 | { | 2069 | { |
1989 | return put_user(me->clear_child_tid, tid_addr); | 2070 | return put_user(me->clear_child_tid, tid_addr); |
1990 | } | 2071 | } |
1991 | 2072 | #else | |
1992 | #else /* CONFIG_CHECKPOINT_RESTORE */ | ||
1993 | static int prctl_set_mm(int opt, unsigned long addr, | ||
1994 | unsigned long arg4, unsigned long arg5) | ||
1995 | { | ||
1996 | return -EINVAL; | ||
1997 | } | ||
1998 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | 2073 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) |
1999 | { | 2074 | { |
2000 | return -EINVAL; | 2075 | return -EINVAL; |
@@ -2185,9 +2260,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, | |||
2185 | 2260 | ||
2186 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 2261 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; |
2187 | 2262 | ||
2188 | static int __orderly_poweroff(void) | 2263 | static int __orderly_poweroff(bool force) |
2189 | { | 2264 | { |
2190 | int argc; | ||
2191 | char **argv; | 2265 | char **argv; |
2192 | static char *envp[] = { | 2266 | static char *envp[] = { |
2193 | "HOME=/", | 2267 | "HOME=/", |
@@ -2196,20 +2270,40 @@ static int __orderly_poweroff(void) | |||
2196 | }; | 2270 | }; |
2197 | int ret; | 2271 | int ret; |
2198 | 2272 | ||
2199 | argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); | 2273 | argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); |
2200 | if (argv == NULL) { | 2274 | if (argv) { |
2275 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
2276 | argv_free(argv); | ||
2277 | } else { | ||
2201 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | 2278 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", |
2202 | __func__, poweroff_cmd); | 2279 | __func__, poweroff_cmd); |
2203 | return -ENOMEM; | 2280 | ret = -ENOMEM; |
2204 | } | 2281 | } |
2205 | 2282 | ||
2206 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, | 2283 | if (ret && force) { |
2207 | NULL, NULL, NULL); | 2284 | printk(KERN_WARNING "Failed to start orderly shutdown: " |
2208 | argv_free(argv); | 2285 | "forcing the issue\n"); |
2286 | /* | ||
2287 | * I guess this should try to kick off some daemon to sync and | ||
2288 | * poweroff asap. Or not even bother syncing if we're doing an | ||
2289 | * emergency shutdown? | ||
2290 | */ | ||
2291 | emergency_sync(); | ||
2292 | kernel_power_off(); | ||
2293 | } | ||
2209 | 2294 | ||
2210 | return ret; | 2295 | return ret; |
2211 | } | 2296 | } |
2212 | 2297 | ||
2298 | static bool poweroff_force; | ||
2299 | |||
2300 | static void poweroff_work_func(struct work_struct *work) | ||
2301 | { | ||
2302 | __orderly_poweroff(poweroff_force); | ||
2303 | } | ||
2304 | |||
2305 | static DECLARE_WORK(poweroff_work, poweroff_work_func); | ||
2306 | |||
2213 | /** | 2307 | /** |
2214 | * orderly_poweroff - Trigger an orderly system poweroff | 2308 | * orderly_poweroff - Trigger an orderly system poweroff |
2215 | * @force: force poweroff if command execution fails | 2309 | * @force: force poweroff if command execution fails |
@@ -2219,21 +2313,154 @@ static int __orderly_poweroff(void) | |||
2219 | */ | 2313 | */ |
2220 | int orderly_poweroff(bool force) | 2314 | int orderly_poweroff(bool force) |
2221 | { | 2315 | { |
2222 | int ret = __orderly_poweroff(); | 2316 | if (force) /* do not override the pending "true" */ |
2317 | poweroff_force = true; | ||
2318 | schedule_work(&poweroff_work); | ||
2319 | return 0; | ||
2320 | } | ||
2321 | EXPORT_SYMBOL_GPL(orderly_poweroff); | ||
2223 | 2322 | ||
2224 | if (ret && force) { | 2323 | /** |
2225 | printk(KERN_WARNING "Failed to start orderly shutdown: " | 2324 | * do_sysinfo - fill in sysinfo struct |
2226 | "forcing the issue\n"); | 2325 | * @info: pointer to buffer to fill |
2326 | */ | ||
2327 | static int do_sysinfo(struct sysinfo *info) | ||
2328 | { | ||
2329 | unsigned long mem_total, sav_total; | ||
2330 | unsigned int mem_unit, bitcount; | ||
2331 | struct timespec tp; | ||
2227 | 2332 | ||
2228 | /* | 2333 | memset(info, 0, sizeof(struct sysinfo)); |
2229 | * I guess this should try to kick off some daemon to sync and | 2334 | |
2230 | * poweroff asap. Or not even bother syncing if we're doing an | 2335 | ktime_get_ts(&tp); |
2231 | * emergency shutdown? | 2336 | monotonic_to_bootbased(&tp); |
2232 | */ | 2337 | info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); |
2233 | emergency_sync(); | 2338 | |
2234 | kernel_power_off(); | 2339 | get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); |
2340 | |||
2341 | info->procs = nr_threads; | ||
2342 | |||
2343 | si_meminfo(info); | ||
2344 | si_swapinfo(info); | ||
2345 | |||
2346 | /* | ||
2347 | * If the sum of all the available memory (i.e. ram + swap) | ||
2348 | * is less than can be stored in a 32 bit unsigned long then | ||
2349 | * we can be binary compatible with 2.2.x kernels. If not, | ||
2350 | * well, in that case 2.2.x was broken anyways... | ||
2351 | * | ||
2352 | * -Erik Andersen <andersee@debian.org> | ||
2353 | */ | ||
2354 | |||
2355 | mem_total = info->totalram + info->totalswap; | ||
2356 | if (mem_total < info->totalram || mem_total < info->totalswap) | ||
2357 | goto out; | ||
2358 | bitcount = 0; | ||
2359 | mem_unit = info->mem_unit; | ||
2360 | while (mem_unit > 1) { | ||
2361 | bitcount++; | ||
2362 | mem_unit >>= 1; | ||
2363 | sav_total = mem_total; | ||
2364 | mem_total <<= 1; | ||
2365 | if (mem_total < sav_total) | ||
2366 | goto out; | ||
2235 | } | 2367 | } |
2236 | 2368 | ||
2237 | return ret; | 2369 | /* |
2370 | * If mem_total did not overflow, multiply all memory values by | ||
2371 | * info->mem_unit and set it to 1. This leaves things compatible | ||
2372 | * with 2.2.x, and also retains compatibility with earlier 2.4.x | ||
2373 | * kernels... | ||
2374 | */ | ||
2375 | |||
2376 | info->mem_unit = 1; | ||
2377 | info->totalram <<= bitcount; | ||
2378 | info->freeram <<= bitcount; | ||
2379 | info->sharedram <<= bitcount; | ||
2380 | info->bufferram <<= bitcount; | ||
2381 | info->totalswap <<= bitcount; | ||
2382 | info->freeswap <<= bitcount; | ||
2383 | info->totalhigh <<= bitcount; | ||
2384 | info->freehigh <<= bitcount; | ||
2385 | |||
2386 | out: | ||
2387 | return 0; | ||
2238 | } | 2388 | } |
2239 | EXPORT_SYMBOL_GPL(orderly_poweroff); | 2389 | |
2390 | SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) | ||
2391 | { | ||
2392 | struct sysinfo val; | ||
2393 | |||
2394 | do_sysinfo(&val); | ||
2395 | |||
2396 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) | ||
2397 | return -EFAULT; | ||
2398 | |||
2399 | return 0; | ||
2400 | } | ||
2401 | |||
2402 | #ifdef CONFIG_COMPAT | ||
2403 | struct compat_sysinfo { | ||
2404 | s32 uptime; | ||
2405 | u32 loads[3]; | ||
2406 | u32 totalram; | ||
2407 | u32 freeram; | ||
2408 | u32 sharedram; | ||
2409 | u32 bufferram; | ||
2410 | u32 totalswap; | ||
2411 | u32 freeswap; | ||
2412 | u16 procs; | ||
2413 | u16 pad; | ||
2414 | u32 totalhigh; | ||
2415 | u32 freehigh; | ||
2416 | u32 mem_unit; | ||
2417 | char _f[20-2*sizeof(u32)-sizeof(int)]; | ||
2418 | }; | ||
2419 | |||
2420 | COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) | ||
2421 | { | ||
2422 | struct sysinfo s; | ||
2423 | |||
2424 | do_sysinfo(&s); | ||
2425 | |||
2426 | /* Check to see if any memory value is too large for 32-bit and scale | ||
2427 | * down if needed | ||
2428 | */ | ||
2429 | if ((s.totalram >> 32) || (s.totalswap >> 32)) { | ||
2430 | int bitcount = 0; | ||
2431 | |||
2432 | while (s.mem_unit < PAGE_SIZE) { | ||
2433 | s.mem_unit <<= 1; | ||
2434 | bitcount++; | ||
2435 | } | ||
2436 | |||
2437 | s.totalram >>= bitcount; | ||
2438 | s.freeram >>= bitcount; | ||
2439 | s.sharedram >>= bitcount; | ||
2440 | s.bufferram >>= bitcount; | ||
2441 | s.totalswap >>= bitcount; | ||
2442 | s.freeswap >>= bitcount; | ||
2443 | s.totalhigh >>= bitcount; | ||
2444 | s.freehigh >>= bitcount; | ||
2445 | } | ||
2446 | |||
2447 | if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || | ||
2448 | __put_user(s.uptime, &info->uptime) || | ||
2449 | __put_user(s.loads[0], &info->loads[0]) || | ||
2450 | __put_user(s.loads[1], &info->loads[1]) || | ||
2451 | __put_user(s.loads[2], &info->loads[2]) || | ||
2452 | __put_user(s.totalram, &info->totalram) || | ||
2453 | __put_user(s.freeram, &info->freeram) || | ||
2454 | __put_user(s.sharedram, &info->sharedram) || | ||
2455 | __put_user(s.bufferram, &info->bufferram) || | ||
2456 | __put_user(s.totalswap, &info->totalswap) || | ||
2457 | __put_user(s.freeswap, &info->freeswap) || | ||
2458 | __put_user(s.procs, &info->procs) || | ||
2459 | __put_user(s.totalhigh, &info->totalhigh) || | ||
2460 | __put_user(s.freehigh, &info->freehigh) || | ||
2461 | __put_user(s.mem_unit, &info->mem_unit)) | ||
2462 | return -EFAULT; | ||
2463 | |||
2464 | return 0; | ||
2465 | } | ||
2466 | #endif /* CONFIG_COMPAT */ | ||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 395084d4ce16..7078052284fd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -20,6 +20,7 @@ cond_syscall(sys_quotactl); | |||
20 | cond_syscall(sys32_quotactl); | 20 | cond_syscall(sys32_quotactl); |
21 | cond_syscall(sys_acct); | 21 | cond_syscall(sys_acct); |
22 | cond_syscall(sys_lookup_dcookie); | 22 | cond_syscall(sys_lookup_dcookie); |
23 | cond_syscall(compat_sys_lookup_dcookie); | ||
23 | cond_syscall(sys_swapon); | 24 | cond_syscall(sys_swapon); |
24 | cond_syscall(sys_swapoff); | 25 | cond_syscall(sys_swapoff); |
25 | cond_syscall(sys_kexec_load); | 26 | cond_syscall(sys_kexec_load); |
@@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev); | |||
155 | cond_syscall(sys_pciconfig_read); | 156 | cond_syscall(sys_pciconfig_read); |
156 | cond_syscall(sys_pciconfig_write); | 157 | cond_syscall(sys_pciconfig_write); |
157 | cond_syscall(sys_pciconfig_iobase); | 158 | cond_syscall(sys_pciconfig_iobase); |
158 | cond_syscall(sys32_ipc); | 159 | cond_syscall(compat_sys_s390_ipc); |
159 | cond_syscall(ppc_rtas); | 160 | cond_syscall(ppc_rtas); |
160 | cond_syscall(sys_spu_run); | 161 | cond_syscall(sys_spu_run); |
161 | cond_syscall(sys_spu_create); | 162 | cond_syscall(sys_spu_create); |
@@ -199,6 +200,7 @@ cond_syscall(sys_perf_event_open); | |||
199 | /* fanotify! */ | 200 | /* fanotify! */ |
200 | cond_syscall(sys_fanotify_init); | 201 | cond_syscall(sys_fanotify_init); |
201 | cond_syscall(sys_fanotify_mark); | 202 | cond_syscall(sys_fanotify_mark); |
203 | cond_syscall(compat_sys_fanotify_mark); | ||
202 | 204 | ||
203 | /* open by handle */ | 205 | /* open by handle */ |
204 | cond_syscall(sys_name_to_handle_at); | 206 | cond_syscall(sys_name_to_handle_at); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f8..9edcf456e0fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; | |||
106 | #endif | 106 | #endif |
107 | extern int pid_max; | 107 | extern int pid_max; |
108 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
109 | extern int sysctl_drop_caches; | ||
110 | extern int percpu_pagelist_fraction; | 109 | extern int percpu_pagelist_fraction; |
111 | extern int compat_log; | 110 | extern int compat_log; |
112 | extern int latencytop_enabled; | 111 | extern int latencytop_enabled; |
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = { | |||
1430 | .extra2 = &one, | 1429 | .extra2 = &one, |
1431 | }, | 1430 | }, |
1432 | #endif | 1431 | #endif |
1432 | { | ||
1433 | .procname = "user_reserve_kbytes", | ||
1434 | .data = &sysctl_user_reserve_kbytes, | ||
1435 | .maxlen = sizeof(sysctl_user_reserve_kbytes), | ||
1436 | .mode = 0644, | ||
1437 | .proc_handler = proc_doulongvec_minmax, | ||
1438 | }, | ||
1439 | { | ||
1440 | .procname = "admin_reserve_kbytes", | ||
1441 | .data = &sysctl_admin_reserve_kbytes, | ||
1442 | .maxlen = sizeof(sysctl_admin_reserve_kbytes), | ||
1443 | .mode = 0644, | ||
1444 | .proc_handler = proc_doulongvec_minmax, | ||
1445 | }, | ||
1433 | { } | 1446 | { } |
1434 | }; | 1447 | }; |
1435 | 1448 | ||
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ebf72358e86a..aea4a9ea6fc8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/netdevice.h> | 15 | #include <linux/netdevice.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/compat.h> | ||
18 | 19 | ||
19 | #ifdef CONFIG_SYSCTL_SYSCALL | 20 | #ifdef CONFIG_SYSCTL_SYSCALL |
20 | 21 | ||
@@ -1447,7 +1448,6 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) | |||
1447 | 1448 | ||
1448 | 1449 | ||
1449 | #ifdef CONFIG_COMPAT | 1450 | #ifdef CONFIG_COMPAT |
1450 | #include <asm/compat.h> | ||
1451 | 1451 | ||
1452 | struct compat_sysctl_args { | 1452 | struct compat_sysctl_args { |
1453 | compat_uptr_t name; | 1453 | compat_uptr_t name; |
@@ -1459,7 +1459,7 @@ struct compat_sysctl_args { | |||
1459 | compat_ulong_t __unused[4]; | 1459 | compat_ulong_t __unused[4]; |
1460 | }; | 1460 | }; |
1461 | 1461 | ||
1462 | asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) | 1462 | COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) |
1463 | { | 1463 | { |
1464 | struct compat_sysctl_args tmp; | 1464 | struct compat_sysctl_args tmp; |
1465 | compat_size_t __user *compat_oldlenp; | 1465 | compat_size_t __user *compat_oldlenp; |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index f8b11a283171..12d6ebbfdd83 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
@@ -365,7 +365,7 @@ int init_test_probes(void) | |||
365 | target2 = kprobe_target2; | 365 | target2 = kprobe_target2; |
366 | 366 | ||
367 | do { | 367 | do { |
368 | rand1 = random32(); | 368 | rand1 = prandom_u32(); |
369 | } while (rand1 <= div_factor); | 369 | } while (rand1 <= div_factor); |
370 | 370 | ||
371 | printk(KERN_INFO "Kprobe smoke test started\n"); | 371 | printk(KERN_INFO "Kprobe smoke test started\n"); |
diff --git a/kernel/time.c b/kernel/time.c index f8342a41efa6..d3617dbd3dca 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -138,13 +138,14 @@ int persistent_clock_is_local; | |||
138 | */ | 138 | */ |
139 | static inline void warp_clock(void) | 139 | static inline void warp_clock(void) |
140 | { | 140 | { |
141 | struct timespec adjust; | 141 | if (sys_tz.tz_minuteswest != 0) { |
142 | struct timespec adjust; | ||
142 | 143 | ||
143 | adjust = current_kernel_time(); | ||
144 | if (sys_tz.tz_minuteswest != 0) | ||
145 | persistent_clock_is_local = 1; | 144 | persistent_clock_is_local = 1; |
146 | adjust.tv_sec += sys_tz.tz_minuteswest * 60; | 145 | adjust.tv_sec = sys_tz.tz_minuteswest * 60; |
147 | do_settimeofday(&adjust); | 146 | adjust.tv_nsec = 0; |
147 | timekeeping_inject_offset(&adjust); | ||
148 | } | ||
148 | } | 149 | } |
149 | 150 | ||
150 | /* | 151 | /* |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..70f27e89012b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG | |||
12 | config ARCH_CLOCKSOURCE_DATA | 12 | config ARCH_CLOCKSOURCE_DATA |
13 | bool | 13 | bool |
14 | 14 | ||
15 | # Platforms has a persistent clock | ||
16 | config ALWAYS_USE_PERSISTENT_CLOCK | ||
17 | bool | ||
18 | default n | ||
19 | |||
20 | # Timekeeping vsyscall support | 15 | # Timekeeping vsyscall support |
21 | config GENERIC_TIME_VSYSCALL | 16 | config GENERIC_TIME_VSYSCALL |
22 | bool | 17 | bool |
@@ -64,20 +59,88 @@ config GENERIC_CMOS_UPDATE | |||
64 | if GENERIC_CLOCKEVENTS | 59 | if GENERIC_CLOCKEVENTS |
65 | menu "Timers subsystem" | 60 | menu "Timers subsystem" |
66 | 61 | ||
67 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is | 62 | # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is |
68 | # only related to the tick functionality. Oneshot clockevent devices | 63 | # only related to the tick functionality. Oneshot clockevent devices |
69 | # are supported independ of this. | 64 | # are supported independ of this. |
70 | config TICK_ONESHOT | 65 | config TICK_ONESHOT |
71 | bool | 66 | bool |
72 | 67 | ||
73 | config NO_HZ | 68 | config NO_HZ_COMMON |
74 | bool "Tickless System (Dynamic Ticks)" | 69 | bool |
75 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 70 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
76 | select TICK_ONESHOT | 71 | select TICK_ONESHOT |
72 | |||
73 | choice | ||
74 | prompt "Timer tick handling" | ||
75 | default NO_HZ_IDLE if NO_HZ | ||
76 | |||
77 | config HZ_PERIODIC | ||
78 | bool "Periodic timer ticks (constant rate, no dynticks)" | ||
79 | help | ||
80 | This option keeps the tick running periodically at a constant | ||
81 | rate, even when the CPU doesn't need it. | ||
82 | |||
83 | config NO_HZ_IDLE | ||
84 | bool "Idle dynticks system (tickless idle)" | ||
85 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | ||
86 | select NO_HZ_COMMON | ||
87 | help | ||
88 | This option enables a tickless idle system: timer interrupts | ||
89 | will only trigger on an as-needed basis when the system is idle. | ||
90 | This is usually interesting for energy saving. | ||
91 | |||
92 | Most of the time you want to say Y here. | ||
93 | |||
94 | config NO_HZ_FULL | ||
95 | bool "Full dynticks system (tickless)" | ||
96 | # NO_HZ_COMMON dependency | ||
97 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | ||
98 | # We need at least one periodic CPU for timekeeping | ||
99 | depends on SMP | ||
100 | # RCU_USER_QS dependency | ||
101 | depends on HAVE_CONTEXT_TRACKING | ||
102 | # VIRT_CPU_ACCOUNTING_GEN dependency | ||
103 | depends on 64BIT | ||
104 | select NO_HZ_COMMON | ||
105 | select RCU_USER_QS | ||
106 | select RCU_NOCB_CPU | ||
107 | select VIRT_CPU_ACCOUNTING_GEN | ||
108 | select CONTEXT_TRACKING_FORCE | ||
109 | select IRQ_WORK | ||
110 | help | ||
111 | Adaptively try to shutdown the tick whenever possible, even when | ||
112 | the CPU is running tasks. Typically this requires running a single | ||
113 | task on the CPU. Chances for running tickless are maximized when | ||
114 | the task mostly runs in userspace and has few kernel activity. | ||
115 | |||
116 | You need to fill up the nohz_full boot parameter with the | ||
117 | desired range of dynticks CPUs. | ||
118 | |||
119 | This is implemented at the expense of some overhead in user <-> kernel | ||
120 | transitions: syscalls, exceptions and interrupts. Even when it's | ||
121 | dynamically off. | ||
122 | |||
123 | Say N. | ||
124 | |||
125 | endchoice | ||
126 | |||
127 | config NO_HZ_FULL_ALL | ||
128 | bool "Full dynticks system on all CPUs by default" | ||
129 | depends on NO_HZ_FULL | ||
130 | help | ||
131 | If the user doesn't pass the nohz_full boot option to | ||
132 | define the range of full dynticks CPUs, consider that all | ||
133 | CPUs in the system are full dynticks by default. | ||
134 | Note the boot CPU will still be kept outside the range to | ||
135 | handle the timekeeping duty. | ||
136 | |||
137 | config NO_HZ | ||
138 | bool "Old Idle dynticks config" | ||
139 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | ||
77 | help | 140 | help |
78 | This option enables a tickless system: timer interrupts will | 141 | This is the old config entry that enables dynticks idle. |
79 | only trigger on an as-needed basis both when the system is | 142 | We keep it around for a little while to enforce backward |
80 | busy and when the system is idle. | 143 | compatibility with older config files. |
81 | 144 | ||
82 | config HIGH_RES_TIMERS | 145 | config HIGH_RES_TIMERS |
83 | bool "High Resolution Timer Support" | 146 | bool "High Resolution Timer Support" |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb066bb7d..8f5b3b98577b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -18,13 +18,14 @@ | |||
18 | #include <linux/rtc.h> | 18 | #include <linux/rtc.h> |
19 | 19 | ||
20 | #include "tick-internal.h" | 20 | #include "tick-internal.h" |
21 | #include "ntp_internal.h" | ||
21 | 22 | ||
22 | /* | 23 | /* |
23 | * NTP timekeeping variables: | 24 | * NTP timekeeping variables: |
25 | * | ||
26 | * Note: All of the NTP state is protected by the timekeeping locks. | ||
24 | */ | 27 | */ |
25 | 28 | ||
26 | DEFINE_RAW_SPINLOCK(ntp_lock); | ||
27 | |||
28 | 29 | ||
29 | /* USER_HZ period (usecs): */ | 30 | /* USER_HZ period (usecs): */ |
30 | unsigned long tick_usec = TICK_USEC; | 31 | unsigned long tick_usec = TICK_USEC; |
@@ -53,9 +54,6 @@ static int time_state = TIME_OK; | |||
53 | /* clock status bits: */ | 54 | /* clock status bits: */ |
54 | static int time_status = STA_UNSYNC; | 55 | static int time_status = STA_UNSYNC; |
55 | 56 | ||
56 | /* TAI offset (secs): */ | ||
57 | static long time_tai; | ||
58 | |||
59 | /* time adjustment (nsecs): */ | 57 | /* time adjustment (nsecs): */ |
60 | static s64 time_offset; | 58 | static s64 time_offset; |
61 | 59 | ||
@@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void) | |||
134 | 132 | ||
135 | /** | 133 | /** |
136 | * pps_clear - Clears the PPS state variables | 134 | * pps_clear - Clears the PPS state variables |
137 | * | ||
138 | * Must be called while holding a write on the ntp_lock | ||
139 | */ | 135 | */ |
140 | static inline void pps_clear(void) | 136 | static inline void pps_clear(void) |
141 | { | 137 | { |
@@ -150,8 +146,6 @@ static inline void pps_clear(void) | |||
150 | /* Decrease pps_valid to indicate that another second has passed since | 146 | /* Decrease pps_valid to indicate that another second has passed since |
151 | * the last PPS signal. When it reaches 0, indicate that PPS signal is | 147 | * the last PPS signal. When it reaches 0, indicate that PPS signal is |
152 | * missing. | 148 | * missing. |
153 | * | ||
154 | * Must be called while holding a write on the ntp_lock | ||
155 | */ | 149 | */ |
156 | static inline void pps_dec_valid(void) | 150 | static inline void pps_dec_valid(void) |
157 | { | 151 | { |
@@ -346,10 +340,6 @@ static void ntp_update_offset(long offset) | |||
346 | */ | 340 | */ |
347 | void ntp_clear(void) | 341 | void ntp_clear(void) |
348 | { | 342 | { |
349 | unsigned long flags; | ||
350 | |||
351 | raw_spin_lock_irqsave(&ntp_lock, flags); | ||
352 | |||
353 | time_adjust = 0; /* stop active adjtime() */ | 343 | time_adjust = 0; /* stop active adjtime() */ |
354 | time_status |= STA_UNSYNC; | 344 | time_status |= STA_UNSYNC; |
355 | time_maxerror = NTP_PHASE_LIMIT; | 345 | time_maxerror = NTP_PHASE_LIMIT; |
@@ -362,20 +352,12 @@ void ntp_clear(void) | |||
362 | 352 | ||
363 | /* Clear PPS state variables */ | 353 | /* Clear PPS state variables */ |
364 | pps_clear(); | 354 | pps_clear(); |
365 | raw_spin_unlock_irqrestore(&ntp_lock, flags); | ||
366 | |||
367 | } | 355 | } |
368 | 356 | ||
369 | 357 | ||
370 | u64 ntp_tick_length(void) | 358 | u64 ntp_tick_length(void) |
371 | { | 359 | { |
372 | unsigned long flags; | 360 | return tick_length; |
373 | s64 ret; | ||
374 | |||
375 | raw_spin_lock_irqsave(&ntp_lock, flags); | ||
376 | ret = tick_length; | ||
377 | raw_spin_unlock_irqrestore(&ntp_lock, flags); | ||
378 | return ret; | ||
379 | } | 361 | } |
380 | 362 | ||
381 | 363 | ||
@@ -393,9 +375,6 @@ int second_overflow(unsigned long secs) | |||
393 | { | 375 | { |
394 | s64 delta; | 376 | s64 delta; |
395 | int leap = 0; | 377 | int leap = 0; |
396 | unsigned long flags; | ||
397 | |||
398 | raw_spin_lock_irqsave(&ntp_lock, flags); | ||
399 | 378 | ||
400 | /* | 379 | /* |
401 | * Leap second processing. If in leap-insert state at the end of the | 380 | * Leap second processing. If in leap-insert state at the end of the |
@@ -415,7 +394,6 @@ int second_overflow(unsigned long secs) | |||
415 | else if (secs % 86400 == 0) { | 394 | else if (secs % 86400 == 0) { |
416 | leap = -1; | 395 | leap = -1; |
417 | time_state = TIME_OOP; | 396 | time_state = TIME_OOP; |
418 | time_tai++; | ||
419 | printk(KERN_NOTICE | 397 | printk(KERN_NOTICE |
420 | "Clock: inserting leap second 23:59:60 UTC\n"); | 398 | "Clock: inserting leap second 23:59:60 UTC\n"); |
421 | } | 399 | } |
@@ -425,7 +403,6 @@ int second_overflow(unsigned long secs) | |||
425 | time_state = TIME_OK; | 403 | time_state = TIME_OK; |
426 | else if ((secs + 1) % 86400 == 0) { | 404 | else if ((secs + 1) % 86400 == 0) { |
427 | leap = 1; | 405 | leap = 1; |
428 | time_tai--; | ||
429 | time_state = TIME_WAIT; | 406 | time_state = TIME_WAIT; |
430 | printk(KERN_NOTICE | 407 | printk(KERN_NOTICE |
431 | "Clock: deleting leap second 23:59:59 UTC\n"); | 408 | "Clock: deleting leap second 23:59:59 UTC\n"); |
@@ -479,8 +456,6 @@ int second_overflow(unsigned long secs) | |||
479 | time_adjust = 0; | 456 | time_adjust = 0; |
480 | 457 | ||
481 | out: | 458 | out: |
482 | raw_spin_unlock_irqrestore(&ntp_lock, flags); | ||
483 | |||
484 | return leap; | 459 | return leap; |
485 | } | 460 | } |
486 | 461 | ||
@@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
575 | time_status |= txc->status & ~STA_RONLY; | 550 | time_status |= txc->status & ~STA_RONLY; |
576 | } | 551 | } |
577 | 552 | ||
578 | /* | 553 | |
579 | * Called with ntp_lock held, so we can access and modify | 554 | static inline void process_adjtimex_modes(struct timex *txc, |
580 | * all the global NTP state: | 555 | struct timespec *ts, |
581 | */ | 556 | s32 *time_tai) |
582 | static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) | ||
583 | { | 557 | { |
584 | if (txc->modes & ADJ_STATUS) | 558 | if (txc->modes & ADJ_STATUS) |
585 | process_adj_status(txc, ts); | 559 | process_adj_status(txc, ts); |
@@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts | |||
613 | } | 587 | } |
614 | 588 | ||
615 | if (txc->modes & ADJ_TAI && txc->constant > 0) | 589 | if (txc->modes & ADJ_TAI && txc->constant > 0) |
616 | time_tai = txc->constant; | 590 | *time_tai = txc->constant; |
617 | 591 | ||
618 | if (txc->modes & ADJ_OFFSET) | 592 | if (txc->modes & ADJ_OFFSET) |
619 | ntp_update_offset(txc->offset); | 593 | ntp_update_offset(txc->offset); |
@@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts | |||
625 | ntp_update_frequency(); | 599 | ntp_update_frequency(); |
626 | } | 600 | } |
627 | 601 | ||
628 | /* | 602 | |
629 | * adjtimex mainly allows reading (and writing, if superuser) of | 603 | |
630 | * kernel time-keeping variables. used by xntpd. | 604 | /** |
605 | * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex | ||
631 | */ | 606 | */ |
632 | int do_adjtimex(struct timex *txc) | 607 | int ntp_validate_timex(struct timex *txc) |
633 | { | 608 | { |
634 | struct timespec ts; | ||
635 | int result; | ||
636 | |||
637 | /* Validate the data before disabling interrupts */ | ||
638 | if (txc->modes & ADJ_ADJTIME) { | 609 | if (txc->modes & ADJ_ADJTIME) { |
639 | /* singleshot must not be used with any other mode bits */ | 610 | /* singleshot must not be used with any other mode bits */ |
640 | if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) | 611 | if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) |
@@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc) | |||
646 | /* In order to modify anything, you gotta be super-user! */ | 617 | /* In order to modify anything, you gotta be super-user! */ |
647 | if (txc->modes && !capable(CAP_SYS_TIME)) | 618 | if (txc->modes && !capable(CAP_SYS_TIME)) |
648 | return -EPERM; | 619 | return -EPERM; |
649 | |||
650 | /* | 620 | /* |
651 | * if the quartz is off by more than 10% then | 621 | * if the quartz is off by more than 10% then |
652 | * something is VERY wrong! | 622 | * something is VERY wrong! |
@@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc) | |||
657 | return -EINVAL; | 627 | return -EINVAL; |
658 | } | 628 | } |
659 | 629 | ||
660 | if (txc->modes & ADJ_SETOFFSET) { | 630 | if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) |
661 | struct timespec delta; | 631 | return -EPERM; |
662 | delta.tv_sec = txc->time.tv_sec; | 632 | |
663 | delta.tv_nsec = txc->time.tv_usec; | 633 | return 0; |
664 | if (!capable(CAP_SYS_TIME)) | 634 | } |
665 | return -EPERM; | ||
666 | if (!(txc->modes & ADJ_NANO)) | ||
667 | delta.tv_nsec *= 1000; | ||
668 | result = timekeeping_inject_offset(&delta); | ||
669 | if (result) | ||
670 | return result; | ||
671 | } | ||
672 | 635 | ||
673 | getnstimeofday(&ts); | ||
674 | 636 | ||
675 | raw_spin_lock_irq(&ntp_lock); | 637 | /* |
638 | * adjtimex mainly allows reading (and writing, if superuser) of | ||
639 | * kernel time-keeping variables. used by xntpd. | ||
640 | */ | ||
641 | int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) | ||
642 | { | ||
643 | int result; | ||
676 | 644 | ||
677 | if (txc->modes & ADJ_ADJTIME) { | 645 | if (txc->modes & ADJ_ADJTIME) { |
678 | long save_adjust = time_adjust; | 646 | long save_adjust = time_adjust; |
@@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc) | |||
687 | 655 | ||
688 | /* If there are input parameters, then process them: */ | 656 | /* If there are input parameters, then process them: */ |
689 | if (txc->modes) | 657 | if (txc->modes) |
690 | process_adjtimex_modes(txc, &ts); | 658 | process_adjtimex_modes(txc, ts, time_tai); |
691 | 659 | ||
692 | txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, | 660 | txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, |
693 | NTP_SCALE_SHIFT); | 661 | NTP_SCALE_SHIFT); |
@@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc) | |||
709 | txc->precision = 1; | 677 | txc->precision = 1; |
710 | txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; | 678 | txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; |
711 | txc->tick = tick_usec; | 679 | txc->tick = tick_usec; |
712 | txc->tai = time_tai; | 680 | txc->tai = *time_tai; |
713 | 681 | ||
714 | /* fill PPS status fields */ | 682 | /* fill PPS status fields */ |
715 | pps_fill_timex(txc); | 683 | pps_fill_timex(txc); |
716 | 684 | ||
717 | raw_spin_unlock_irq(&ntp_lock); | 685 | txc->time.tv_sec = ts->tv_sec; |
718 | 686 | txc->time.tv_usec = ts->tv_nsec; | |
719 | txc->time.tv_sec = ts.tv_sec; | ||
720 | txc->time.tv_usec = ts.tv_nsec; | ||
721 | if (!(time_status & STA_NANO)) | 687 | if (!(time_status & STA_NANO)) |
722 | txc->time.tv_usec /= NSEC_PER_USEC; | 688 | txc->time.tv_usec /= NSEC_PER_USEC; |
723 | 689 | ||
@@ -894,7 +860,7 @@ static void hardpps_update_phase(long error) | |||
894 | } | 860 | } |
895 | 861 | ||
896 | /* | 862 | /* |
897 | * hardpps() - discipline CPU clock oscillator to external PPS signal | 863 | * __hardpps() - discipline CPU clock oscillator to external PPS signal |
898 | * | 864 | * |
899 | * This routine is called at each PPS signal arrival in order to | 865 | * This routine is called at each PPS signal arrival in order to |
900 | * discipline the CPU clock oscillator to the PPS signal. It takes two | 866 | * discipline the CPU clock oscillator to the PPS signal. It takes two |
@@ -905,15 +871,12 @@ static void hardpps_update_phase(long error) | |||
905 | * This code is based on David Mills's reference nanokernel | 871 | * This code is based on David Mills's reference nanokernel |
906 | * implementation. It was mostly rewritten but keeps the same idea. | 872 | * implementation. It was mostly rewritten but keeps the same idea. |
907 | */ | 873 | */ |
908 | void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | 874 | void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) |
909 | { | 875 | { |
910 | struct pps_normtime pts_norm, freq_norm; | 876 | struct pps_normtime pts_norm, freq_norm; |
911 | unsigned long flags; | ||
912 | 877 | ||
913 | pts_norm = pps_normalize_ts(*phase_ts); | 878 | pts_norm = pps_normalize_ts(*phase_ts); |
914 | 879 | ||
915 | raw_spin_lock_irqsave(&ntp_lock, flags); | ||
916 | |||
917 | /* clear the error bits, they will be set again if needed */ | 880 | /* clear the error bits, they will be set again if needed */ |
918 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | 881 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); |
919 | 882 | ||
@@ -925,7 +888,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
925 | * just start the frequency interval */ | 888 | * just start the frequency interval */ |
926 | if (unlikely(pps_fbase.tv_sec == 0)) { | 889 | if (unlikely(pps_fbase.tv_sec == 0)) { |
927 | pps_fbase = *raw_ts; | 890 | pps_fbase = *raw_ts; |
928 | raw_spin_unlock_irqrestore(&ntp_lock, flags); | ||
929 | return; | 891 | return; |
930 | } | 892 | } |
931 | 893 | ||
@@ -940,7 +902,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
940 | time_status |= STA_PPSJITTER; | 902 | time_status |= STA_PPSJITTER; |
941 | /* restart the frequency calibration interval */ | 903 | /* restart the frequency calibration interval */ |
942 | pps_fbase = *raw_ts; | 904 | pps_fbase = *raw_ts; |
943 | raw_spin_unlock_irqrestore(&ntp_lock, flags); | ||
944 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | 905 | pr_err("hardpps: PPSJITTER: bad pulse\n"); |
945 | return; | 906 | return; |
946 | } | 907 | } |
@@ -957,10 +918,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
957 | 918 | ||
958 | hardpps_update_phase(pts_norm.nsec); | 919 | hardpps_update_phase(pts_norm.nsec); |
959 | 920 | ||
960 | raw_spin_unlock_irqrestore(&ntp_lock, flags); | ||
961 | } | 921 | } |
962 | EXPORT_SYMBOL(hardpps); | ||
963 | |||
964 | #endif /* CONFIG_NTP_PPS */ | 922 | #endif /* CONFIG_NTP_PPS */ |
965 | 923 | ||
966 | static int __init ntp_tick_adj_setup(char *str) | 924 | static int __init ntp_tick_adj_setup(char *str) |
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 000000000000..1950cb4ca2a4 --- /dev/null +++ b/kernel/time/ntp_internal.h | |||
@@ -0,0 +1,12 @@ | |||
1 | #ifndef _LINUX_NTP_INTERNAL_H | ||
2 | #define _LINUX_NTP_INTERNAL_H | ||
3 | |||
4 | extern void ntp_init(void); | ||
5 | extern void ntp_clear(void); | ||
6 | /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ | ||
7 | extern u64 ntp_tick_length(void); | ||
8 | extern int second_overflow(unsigned long secs); | ||
9 | extern int ntp_validate_timex(struct timex *); | ||
10 | extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); | ||
11 | extern void __hardpps(const struct timespec *, const struct timespec *); | ||
12 | #endif /* _LINUX_NTP_INTERNAL_H */ | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8d..0c739423b0f9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -28,9 +28,8 @@ | |||
28 | */ | 28 | */ |
29 | 29 | ||
30 | static struct tick_device tick_broadcast_device; | 30 | static struct tick_device tick_broadcast_device; |
31 | /* FIXME: Use cpumask_var_t. */ | 31 | static cpumask_var_t tick_broadcast_mask; |
32 | static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); | 32 | static cpumask_var_t tmpmask; |
33 | static DECLARE_BITMAP(tmpmask, NR_CPUS); | ||
34 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 33 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); |
35 | static int tick_broadcast_force; | 34 | static int tick_broadcast_force; |
36 | 35 | ||
@@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void) | |||
50 | 49 | ||
51 | struct cpumask *tick_get_broadcast_mask(void) | 50 | struct cpumask *tick_get_broadcast_mask(void) |
52 | { | 51 | { |
53 | return to_cpumask(tick_broadcast_mask); | 52 | return tick_broadcast_mask; |
54 | } | 53 | } |
55 | 54 | ||
56 | /* | 55 | /* |
@@ -67,15 +66,30 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) | |||
67 | */ | 66 | */ |
68 | int tick_check_broadcast_device(struct clock_event_device *dev) | 67 | int tick_check_broadcast_device(struct clock_event_device *dev) |
69 | { | 68 | { |
70 | if ((tick_broadcast_device.evtdev && | 69 | struct clock_event_device *cur = tick_broadcast_device.evtdev; |
70 | |||
71 | if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || | ||
72 | (tick_broadcast_device.evtdev && | ||
71 | tick_broadcast_device.evtdev->rating >= dev->rating) || | 73 | tick_broadcast_device.evtdev->rating >= dev->rating) || |
72 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | 74 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) |
73 | return 0; | 75 | return 0; |
74 | 76 | ||
75 | clockevents_exchange_device(tick_broadcast_device.evtdev, dev); | 77 | clockevents_exchange_device(tick_broadcast_device.evtdev, dev); |
78 | if (cur) | ||
79 | cur->event_handler = clockevents_handle_noop; | ||
76 | tick_broadcast_device.evtdev = dev; | 80 | tick_broadcast_device.evtdev = dev; |
77 | if (!cpumask_empty(tick_get_broadcast_mask())) | 81 | if (!cpumask_empty(tick_broadcast_mask)) |
78 | tick_broadcast_start_periodic(dev); | 82 | tick_broadcast_start_periodic(dev); |
83 | /* | ||
84 | * Inform all cpus about this. We might be in a situation | ||
85 | * where we did not switch to oneshot mode because the per cpu | ||
86 | * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack | ||
87 | * of a oneshot capable broadcast device. Without that | ||
88 | * notification the system stays stuck in periodic mode | |
89 | * forever. | ||
90 | */ | ||
91 | if (dev->features & CLOCK_EVT_FEAT_ONESHOT) | ||
92 | tick_clock_notify(); | ||
79 | return 1; | 93 | return 1; |
80 | } | 94 | } |
81 | 95 | ||
@@ -123,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
123 | if (!tick_device_is_functional(dev)) { | 137 | if (!tick_device_is_functional(dev)) { |
124 | dev->event_handler = tick_handle_periodic; | 138 | dev->event_handler = tick_handle_periodic; |
125 | tick_device_setup_broadcast_func(dev); | 139 | tick_device_setup_broadcast_func(dev); |
126 | cpumask_set_cpu(cpu, tick_get_broadcast_mask()); | 140 | cpumask_set_cpu(cpu, tick_broadcast_mask); |
127 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); | 141 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); |
128 | ret = 1; | 142 | ret = 1; |
129 | } else { | 143 | } else { |
@@ -134,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
134 | */ | 148 | */ |
135 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { | 149 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { |
136 | int cpu = smp_processor_id(); | 150 | int cpu = smp_processor_id(); |
137 | cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); | 151 | cpumask_clear_cpu(cpu, tick_broadcast_mask); |
138 | tick_broadcast_clear_oneshot(cpu); | 152 | tick_broadcast_clear_oneshot(cpu); |
139 | } else { | 153 | } else { |
140 | tick_device_setup_broadcast_func(dev); | 154 | tick_device_setup_broadcast_func(dev); |
@@ -198,9 +212,8 @@ static void tick_do_periodic_broadcast(void) | |||
198 | { | 212 | { |
199 | raw_spin_lock(&tick_broadcast_lock); | 213 | raw_spin_lock(&tick_broadcast_lock); |
200 | 214 | ||
201 | cpumask_and(to_cpumask(tmpmask), | 215 | cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); |
202 | cpu_online_mask, tick_get_broadcast_mask()); | 216 | tick_do_broadcast(tmpmask); |
203 | tick_do_broadcast(to_cpumask(tmpmask)); | ||
204 | 217 | ||
205 | raw_spin_unlock(&tick_broadcast_lock); | 218 | raw_spin_unlock(&tick_broadcast_lock); |
206 | } | 219 | } |
@@ -263,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
263 | if (!tick_device_is_functional(dev)) | 276 | if (!tick_device_is_functional(dev)) |
264 | goto out; | 277 | goto out; |
265 | 278 | ||
266 | bc_stopped = cpumask_empty(tick_get_broadcast_mask()); | 279 | bc_stopped = cpumask_empty(tick_broadcast_mask); |
267 | 280 | ||
268 | switch (*reason) { | 281 | switch (*reason) { |
269 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 282 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: |
270 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 283 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: |
271 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { | 284 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { |
272 | cpumask_set_cpu(cpu, tick_get_broadcast_mask()); | ||
273 | if (tick_broadcast_device.mode == | 285 | if (tick_broadcast_device.mode == |
274 | TICKDEV_MODE_PERIODIC) | 286 | TICKDEV_MODE_PERIODIC) |
275 | clockevents_shutdown(dev); | 287 | clockevents_shutdown(dev); |
@@ -279,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
279 | break; | 291 | break; |
280 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 292 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: |
281 | if (!tick_broadcast_force && | 293 | if (!tick_broadcast_force && |
282 | cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { | 294 | cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { |
283 | cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); | ||
284 | if (tick_broadcast_device.mode == | 295 | if (tick_broadcast_device.mode == |
285 | TICKDEV_MODE_PERIODIC) | 296 | TICKDEV_MODE_PERIODIC) |
286 | tick_setup_periodic(dev, 0); | 297 | tick_setup_periodic(dev, 0); |
@@ -288,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
288 | break; | 299 | break; |
289 | } | 300 | } |
290 | 301 | ||
291 | if (cpumask_empty(tick_get_broadcast_mask())) { | 302 | if (cpumask_empty(tick_broadcast_mask)) { |
292 | if (!bc_stopped) | 303 | if (!bc_stopped) |
293 | clockevents_shutdown(bc); | 304 | clockevents_shutdown(bc); |
294 | } else if (bc_stopped) { | 305 | } else if (bc_stopped) { |
@@ -337,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) | |||
337 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 348 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
338 | 349 | ||
339 | bc = tick_broadcast_device.evtdev; | 350 | bc = tick_broadcast_device.evtdev; |
340 | cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); | 351 | cpumask_clear_cpu(cpu, tick_broadcast_mask); |
341 | 352 | ||
342 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { | 353 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { |
343 | if (bc && cpumask_empty(tick_get_broadcast_mask())) | 354 | if (bc && cpumask_empty(tick_broadcast_mask)) |
344 | clockevents_shutdown(bc); | 355 | clockevents_shutdown(bc); |
345 | } | 356 | } |
346 | 357 | ||
@@ -376,13 +387,13 @@ int tick_resume_broadcast(void) | |||
376 | 387 | ||
377 | switch (tick_broadcast_device.mode) { | 388 | switch (tick_broadcast_device.mode) { |
378 | case TICKDEV_MODE_PERIODIC: | 389 | case TICKDEV_MODE_PERIODIC: |
379 | if (!cpumask_empty(tick_get_broadcast_mask())) | 390 | if (!cpumask_empty(tick_broadcast_mask)) |
380 | tick_broadcast_start_periodic(bc); | 391 | tick_broadcast_start_periodic(bc); |
381 | broadcast = cpumask_test_cpu(smp_processor_id(), | 392 | broadcast = cpumask_test_cpu(smp_processor_id(), |
382 | tick_get_broadcast_mask()); | 393 | tick_broadcast_mask); |
383 | break; | 394 | break; |
384 | case TICKDEV_MODE_ONESHOT: | 395 | case TICKDEV_MODE_ONESHOT: |
385 | if (!cpumask_empty(tick_get_broadcast_mask())) | 396 | if (!cpumask_empty(tick_broadcast_mask)) |
386 | broadcast = tick_resume_broadcast_oneshot(bc); | 397 | broadcast = tick_resume_broadcast_oneshot(bc); |
387 | break; | 398 | break; |
388 | } | 399 | } |
@@ -395,25 +406,58 @@ int tick_resume_broadcast(void) | |||
395 | 406 | ||
396 | #ifdef CONFIG_TICK_ONESHOT | 407 | #ifdef CONFIG_TICK_ONESHOT |
397 | 408 | ||
398 | /* FIXME: use cpumask_var_t. */ | 409 | static cpumask_var_t tick_broadcast_oneshot_mask; |
399 | static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); | 410 | static cpumask_var_t tick_broadcast_pending_mask; |
411 | static cpumask_var_t tick_broadcast_force_mask; | ||
400 | 412 | ||
401 | /* | 413 | /* |
402 | * Exposed for debugging: see timer_list.c | 414 | * Exposed for debugging: see timer_list.c |
403 | */ | 415 | */ |
404 | struct cpumask *tick_get_broadcast_oneshot_mask(void) | 416 | struct cpumask *tick_get_broadcast_oneshot_mask(void) |
405 | { | 417 | { |
406 | return to_cpumask(tick_broadcast_oneshot_mask); | 418 | return tick_broadcast_oneshot_mask; |
407 | } | 419 | } |
408 | 420 | ||
409 | static int tick_broadcast_set_event(ktime_t expires, int force) | 421 | /* |
422 | * Called before going idle with interrupts disabled. Checks whether a | ||
423 | * broadcast event from the other core is about to happen. We detected | ||
424 | * that in tick_broadcast_oneshot_control(). The callsite can use this | ||
425 | * to avoid a deep idle transition as we are about to get the | ||
426 | * broadcast IPI right away. | ||
427 | */ | ||
428 | int tick_check_broadcast_expired(void) | ||
410 | { | 429 | { |
411 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 430 | return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); |
431 | } | ||
432 | |||
433 | /* | ||
434 | * Set broadcast interrupt affinity | ||
435 | */ | ||
436 | static void tick_broadcast_set_affinity(struct clock_event_device *bc, | ||
437 | const struct cpumask *cpumask) | ||
438 | { | ||
439 | if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) | ||
440 | return; | ||
441 | |||
442 | if (cpumask_equal(bc->cpumask, cpumask)) | ||
443 | return; | ||
444 | |||
445 | bc->cpumask = cpumask; | ||
446 | irq_set_affinity(bc->irq, bc->cpumask); | ||
447 | } | ||
448 | |||
449 | static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | ||
450 | ktime_t expires, int force) | ||
451 | { | ||
452 | int ret; | ||
412 | 453 | ||
413 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | 454 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) |
414 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 455 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
415 | 456 | ||
416 | return clockevents_program_event(bc, expires, force); | 457 | ret = clockevents_program_event(bc, expires, force); |
458 | if (!ret) | ||
459 | tick_broadcast_set_affinity(bc, cpumask_of(cpu)); | ||
460 | return ret; | ||
417 | } | 461 | } |
418 | 462 | ||
419 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 463 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
@@ -428,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
428 | */ | 472 | */ |
429 | void tick_check_oneshot_broadcast(int cpu) | 473 | void tick_check_oneshot_broadcast(int cpu) |
430 | { | 474 | { |
431 | if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { | 475 | if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { |
432 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); | 476 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); |
433 | 477 | ||
434 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); | 478 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); |
@@ -442,27 +486,45 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) | |||
442 | { | 486 | { |
443 | struct tick_device *td; | 487 | struct tick_device *td; |
444 | ktime_t now, next_event; | 488 | ktime_t now, next_event; |
445 | int cpu; | 489 | int cpu, next_cpu = 0; |
446 | 490 | ||
447 | raw_spin_lock(&tick_broadcast_lock); | 491 | raw_spin_lock(&tick_broadcast_lock); |
448 | again: | 492 | again: |
449 | dev->next_event.tv64 = KTIME_MAX; | 493 | dev->next_event.tv64 = KTIME_MAX; |
450 | next_event.tv64 = KTIME_MAX; | 494 | next_event.tv64 = KTIME_MAX; |
451 | cpumask_clear(to_cpumask(tmpmask)); | 495 | cpumask_clear(tmpmask); |
452 | now = ktime_get(); | 496 | now = ktime_get(); |
453 | /* Find all expired events */ | 497 | /* Find all expired events */ |
454 | for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { | 498 | for_each_cpu(cpu, tick_broadcast_oneshot_mask) { |
455 | td = &per_cpu(tick_cpu_device, cpu); | 499 | td = &per_cpu(tick_cpu_device, cpu); |
456 | if (td->evtdev->next_event.tv64 <= now.tv64) | 500 | if (td->evtdev->next_event.tv64 <= now.tv64) { |
457 | cpumask_set_cpu(cpu, to_cpumask(tmpmask)); | 501 | cpumask_set_cpu(cpu, tmpmask); |
458 | else if (td->evtdev->next_event.tv64 < next_event.tv64) | 502 | /* |
503 | * Mark the remote cpu in the pending mask, so | ||
504 | * it can avoid reprogramming the cpu local | ||
505 | * timer in tick_broadcast_oneshot_control(). | ||
506 | */ | ||
507 | cpumask_set_cpu(cpu, tick_broadcast_pending_mask); | ||
508 | } else if (td->evtdev->next_event.tv64 < next_event.tv64) { | ||
459 | next_event.tv64 = td->evtdev->next_event.tv64; | 509 | next_event.tv64 = td->evtdev->next_event.tv64; |
510 | next_cpu = cpu; | ||
511 | } | ||
460 | } | 512 | } |
461 | 513 | ||
462 | /* | 514 | /* |
515 | * Remove the current cpu from the pending mask. The event is | ||
516 | * delivered immediately in tick_do_broadcast() ! | ||
517 | */ | ||
518 | cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); | ||
519 | |||
520 | /* Take care of enforced broadcast requests */ | ||
521 | cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); | ||
522 | cpumask_clear(tick_broadcast_force_mask); | ||
523 | |||
524 | /* | ||
463 | * Wakeup the cpus which have an expired event. | 525 | * Wakeup the cpus which have an expired event. |
464 | */ | 526 | */ |
465 | tick_do_broadcast(to_cpumask(tmpmask)); | 527 | tick_do_broadcast(tmpmask); |
466 | 528 | ||
467 | /* | 529 | /* |
468 | * Two reasons for reprogram: | 530 | * Two reasons for reprogram: |
@@ -479,7 +541,7 @@ again: | |||
479 | * Rearm the broadcast device. If event expired, | 541 | * Rearm the broadcast device. If event expired, |
480 | * repeat the above | 542 | * repeat the above |
481 | */ | 543 | */ |
482 | if (tick_broadcast_set_event(next_event, 0)) | 544 | if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) |
483 | goto again; | 545 | goto again; |
484 | } | 546 | } |
485 | raw_spin_unlock(&tick_broadcast_lock); | 547 | raw_spin_unlock(&tick_broadcast_lock); |
@@ -494,6 +556,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
494 | struct clock_event_device *bc, *dev; | 556 | struct clock_event_device *bc, *dev; |
495 | struct tick_device *td; | 557 | struct tick_device *td; |
496 | unsigned long flags; | 558 | unsigned long flags; |
559 | ktime_t now; | ||
497 | int cpu; | 560 | int cpu; |
498 | 561 | ||
499 | /* | 562 | /* |
@@ -518,21 +581,84 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
518 | 581 | ||
519 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 582 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
520 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | 583 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { |
521 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { | 584 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { |
522 | cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); | 585 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); |
523 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 586 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); |
524 | if (dev->next_event.tv64 < bc->next_event.tv64) | 587 | /* |
525 | tick_broadcast_set_event(dev->next_event, 1); | 588 | * We only reprogram the broadcast timer if we |
589 | * did not mark ourselves in the force mask and | |
590 | * if the cpu local event is earlier than the | ||
591 | * broadcast event. If the current CPU is in | ||
592 | * the force mask, then we are going to be | ||
593 | * woken by the IPI right away. | ||
594 | */ | ||
595 | if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && | ||
596 | dev->next_event.tv64 < bc->next_event.tv64) | ||
597 | tick_broadcast_set_event(bc, cpu, dev->next_event, 1); | ||
526 | } | 598 | } |
527 | } else { | 599 | } else { |
528 | if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { | 600 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { |
529 | cpumask_clear_cpu(cpu, | ||
530 | tick_get_broadcast_oneshot_mask()); | ||
531 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 601 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
532 | if (dev->next_event.tv64 != KTIME_MAX) | 602 | if (dev->next_event.tv64 == KTIME_MAX) |
533 | tick_program_event(dev->next_event, 1); | 603 | goto out; |
604 | /* | ||
605 | * The cpu which was handling the broadcast | ||
606 | * timer marked this cpu in the broadcast | ||
607 | * pending mask and fired the broadcast | ||
608 | * IPI. So we are going to handle the expired | ||
609 | * event anyway via the broadcast IPI | ||
610 | * handler. No need to reprogram the timer | ||
611 | * with an already expired event. | ||
612 | */ | ||
613 | if (cpumask_test_and_clear_cpu(cpu, | ||
614 | tick_broadcast_pending_mask)) | ||
615 | goto out; | ||
616 | |||
617 | /* | ||
618 | * If the pending bit is not set, then we are | ||
619 | * either the CPU handling the broadcast | ||
620 | * interrupt or we got woken by something else. | ||
621 | * | ||
622 | * We are no longer in the broadcast mask, so | |
623 | * if the cpu local expiry time is already | ||
624 | * reached, we would reprogram the cpu local | ||
625 | * timer with an already expired event. | ||
626 | * | ||
627 | * This can lead to a ping-pong when we return | ||
628 | * to idle and therefore rearm the broadcast | |
629 | * timer before the cpu local timer was able | ||
630 | * to fire. This happens because the forced | ||
631 | * reprogramming makes sure that the event | ||
632 | * will happen in the future and depending on | ||
633 | * the min_delta setting this might be far | ||
634 | * enough out that the ping-pong starts. | ||
635 | * | ||
636 | * If the cpu local next_event has expired | ||
637 | * then we know that the broadcast timer | ||
638 | * next_event has expired as well and | ||
639 | * broadcast is about to be handled. So we | ||
640 | * avoid reprogramming and enforce that the | ||
641 | * broadcast handler, which did not run yet, | ||
642 | * will invoke the cpu local handler. | ||
643 | * | ||
644 | * We cannot call the handler directly from | ||
645 | * here, because we might be in a NOHZ phase | ||
646 | * and we did not go through the irq_enter() | ||
647 | * nohz fixups. | ||
648 | */ | ||
649 | now = ktime_get(); | ||
650 | if (dev->next_event.tv64 <= now.tv64) { | ||
651 | cpumask_set_cpu(cpu, tick_broadcast_force_mask); | ||
652 | goto out; | ||
653 | } | ||
654 | /* | ||
655 | * We got woken by something else. Reprogram | ||
656 | * the cpu local timer device. | ||
657 | */ | ||
658 | tick_program_event(dev->next_event, 1); | ||
534 | } | 659 | } |
535 | } | 660 | } |
661 | out: | ||
536 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 662 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
537 | } | 663 | } |
538 | 664 | ||
@@ -543,7 +669,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
543 | */ | 669 | */ |
544 | static void tick_broadcast_clear_oneshot(int cpu) | 670 | static void tick_broadcast_clear_oneshot(int cpu) |
545 | { | 671 | { |
546 | cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); | 672 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
547 | } | 673 | } |
548 | 674 | ||
549 | static void tick_broadcast_init_next_event(struct cpumask *mask, | 675 | static void tick_broadcast_init_next_event(struct cpumask *mask, |
@@ -573,7 +699,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
573 | bc->event_handler = tick_handle_oneshot_broadcast; | 699 | bc->event_handler = tick_handle_oneshot_broadcast; |
574 | 700 | ||
575 | /* Take the do_timer update */ | 701 | /* Take the do_timer update */ |
576 | tick_do_timer_cpu = cpu; | 702 | if (!tick_nohz_full_cpu(cpu)) |
703 | tick_do_timer_cpu = cpu; | ||
577 | 704 | ||
578 | /* | 705 | /* |
579 | * We must be careful here. There might be other CPUs | 706 | * We must be careful here. There might be other CPUs |
@@ -581,17 +708,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
581 | * oneshot_mask bits for those and program the | 708 | * oneshot_mask bits for those and program the |
582 | * broadcast device to fire. | 709 | * broadcast device to fire. |
583 | */ | 710 | */ |
584 | cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); | 711 | cpumask_copy(tmpmask, tick_broadcast_mask); |
585 | cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); | 712 | cpumask_clear_cpu(cpu, tmpmask); |
586 | cpumask_or(tick_get_broadcast_oneshot_mask(), | 713 | cpumask_or(tick_broadcast_oneshot_mask, |
587 | tick_get_broadcast_oneshot_mask(), | 714 | tick_broadcast_oneshot_mask, tmpmask); |
588 | to_cpumask(tmpmask)); | ||
589 | 715 | ||
590 | if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { | 716 | if (was_periodic && !cpumask_empty(tmpmask)) { |
591 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 717 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
592 | tick_broadcast_init_next_event(to_cpumask(tmpmask), | 718 | tick_broadcast_init_next_event(tmpmask, |
593 | tick_next_period); | 719 | tick_next_period); |
594 | tick_broadcast_set_event(tick_next_period, 1); | 720 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); |
595 | } else | 721 | } else |
596 | bc->next_event.tv64 = KTIME_MAX; | 722 | bc->next_event.tv64 = KTIME_MAX; |
597 | } else { | 723 | } else { |
@@ -639,7 +765,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | |||
639 | * Clear the broadcast mask flag for the dead cpu, but do not | 765 | * Clear the broadcast mask flag for the dead cpu, but do not |
640 | * stop the broadcast device! | 766 | * stop the broadcast device! |
641 | */ | 767 | */ |
642 | cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); | 768 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
643 | 769 | ||
644 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 770 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
645 | } | 771 | } |
@@ -663,3 +789,14 @@ bool tick_broadcast_oneshot_available(void) | |||
663 | } | 789 | } |
664 | 790 | ||
665 | #endif | 791 | #endif |
792 | |||
793 | void __init tick_broadcast_init(void) | ||
794 | { | ||
795 | zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); | ||
796 | zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); | ||
797 | #ifdef CONFIG_TICK_ONESHOT | ||
798 | zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); | ||
799 | zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); | ||
800 | zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); | ||
801 | #endif | ||
802 | } | ||
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6973f4..5d3fb100bc06 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td, | |||
163 | * this cpu: | 163 | * this cpu: |
164 | */ | 164 | */ |
165 | if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { | 165 | if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { |
166 | tick_do_timer_cpu = cpu; | 166 | if (!tick_nohz_full_cpu(cpu)) |
167 | tick_do_timer_cpu = cpu; | ||
168 | else | ||
169 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | ||
167 | tick_next_period = ktime_get(); | 170 | tick_next_period = ktime_get(); |
168 | tick_period = ktime_set(0, NSEC_PER_SEC / HZ); | 171 | tick_period = ktime_set(0, NSEC_PER_SEC / HZ); |
169 | } | 172 | } |
@@ -323,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup) | |||
323 | */ | 326 | */ |
324 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 327 | dev->mode = CLOCK_EVT_MODE_UNUSED; |
325 | clockevents_exchange_device(dev, NULL); | 328 | clockevents_exchange_device(dev, NULL); |
329 | dev->event_handler = clockevents_handle_noop; | ||
326 | td->evtdev = NULL; | 330 | td->evtdev = NULL; |
327 | } | 331 | } |
328 | raw_spin_unlock_irqrestore(&tick_device_lock, flags); | 332 | raw_spin_unlock_irqrestore(&tick_device_lock, flags); |
@@ -416,4 +420,5 @@ static struct notifier_block tick_notifier = { | |||
416 | void __init tick_init(void) | 420 | void __init tick_init(void) |
417 | { | 421 | { |
418 | clockevents_register_notifier(&tick_notifier); | 422 | clockevents_register_notifier(&tick_notifier); |
423 | tick_broadcast_init(); | ||
419 | } | 424 | } |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59ed6dc0..f0299eae4602 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -4,6 +4,8 @@ | |||
4 | #include <linux/hrtimer.h> | 4 | #include <linux/hrtimer.h> |
5 | #include <linux/tick.h> | 5 | #include <linux/tick.h> |
6 | 6 | ||
7 | extern seqlock_t jiffies_lock; | ||
8 | |||
7 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | 9 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD |
8 | 10 | ||
9 | #define TICK_DO_TIMER_NONE -1 | 11 | #define TICK_DO_TIMER_NONE -1 |
@@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | |||
94 | extern void tick_shutdown_broadcast(unsigned int *cpup); | 96 | extern void tick_shutdown_broadcast(unsigned int *cpup); |
95 | extern void tick_suspend_broadcast(void); | 97 | extern void tick_suspend_broadcast(void); |
96 | extern int tick_resume_broadcast(void); | 98 | extern int tick_resume_broadcast(void); |
97 | 99 | extern void tick_broadcast_init(void); | |
98 | extern void | 100 | extern void |
99 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | 101 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); |
100 | 102 | ||
@@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | |||
119 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | 121 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } |
120 | static inline void tick_suspend_broadcast(void) { } | 122 | static inline void tick_suspend_broadcast(void) { } |
121 | static inline int tick_resume_broadcast(void) { return 0; } | 123 | static inline int tick_resume_broadcast(void) { return 0; } |
124 | static inline void tick_broadcast_init(void) { } | ||
122 | 125 | ||
123 | /* | 126 | /* |
124 | * Set the periodic handler in non broadcast mode | 127 | * Set the periodic handler in non broadcast mode |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a39952c1b..f4208138fbf4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -21,11 +21,15 @@ | |||
21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/irq_work.h> | 23 | #include <linux/irq_work.h> |
24 | #include <linux/posix-timers.h> | ||
25 | #include <linux/perf_event.h> | ||
24 | 26 | ||
25 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
26 | 28 | ||
27 | #include "tick-internal.h" | 29 | #include "tick-internal.h" |
28 | 30 | ||
31 | #include <trace/events/timer.h> | ||
32 | |||
29 | /* | 33 | /* |
30 | * Per cpu nohz control structure | 34 | * Per cpu nohz control structure |
31 | */ | 35 | */ |
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now) | |||
104 | { | 108 | { |
105 | int cpu = smp_processor_id(); | 109 | int cpu = smp_processor_id(); |
106 | 110 | ||
107 | #ifdef CONFIG_NO_HZ | 111 | #ifdef CONFIG_NO_HZ_COMMON |
108 | /* | 112 | /* |
109 | * Check if the do_timer duty was dropped. We don't care about | 113 | * Check if the do_timer duty was dropped. We don't care about |
110 | * concurrency: This happens only when the cpu in charge went | 114 | * concurrency: This happens only when the cpu in charge went |
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now) | |||
112 | * this duty, then the jiffies update is still serialized by | 116 | * this duty, then the jiffies update is still serialized by |
113 | * jiffies_lock. | 117 | * jiffies_lock. |
114 | */ | 118 | */ |
115 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | 119 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) |
120 | && !tick_nohz_full_cpu(cpu)) | ||
116 | tick_do_timer_cpu = cpu; | 121 | tick_do_timer_cpu = cpu; |
117 | #endif | 122 | #endif |
118 | 123 | ||
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now) | |||
123 | 128 | ||
124 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | 129 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) |
125 | { | 130 | { |
126 | #ifdef CONFIG_NO_HZ | 131 | #ifdef CONFIG_NO_HZ_COMMON |
127 | /* | 132 | /* |
128 | * When we are idle and the tick is stopped, we have to touch | 133 | * When we are idle and the tick is stopped, we have to touch |
129 | * the watchdog as we might not schedule for a really long | 134 | * the watchdog as we might not schedule for a really long |
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
142 | profile_tick(CPU_PROFILING); | 147 | profile_tick(CPU_PROFILING); |
143 | } | 148 | } |
144 | 149 | ||
150 | #ifdef CONFIG_NO_HZ_FULL | ||
151 | static cpumask_var_t nohz_full_mask; | ||
152 | bool have_nohz_full_mask; | ||
153 | |||
154 | static bool can_stop_full_tick(void) | ||
155 | { | ||
156 | WARN_ON_ONCE(!irqs_disabled()); | ||
157 | |||
158 | if (!sched_can_stop_tick()) { | ||
159 | trace_tick_stop(0, "more than 1 task in runqueue\n"); | ||
160 | return false; | ||
161 | } | ||
162 | |||
163 | if (!posix_cpu_timers_can_stop_tick(current)) { | ||
164 | trace_tick_stop(0, "posix timers running\n"); | ||
165 | return false; | ||
166 | } | ||
167 | |||
168 | if (!perf_event_can_stop_tick()) { | ||
169 | trace_tick_stop(0, "perf events running\n"); | ||
170 | return false; | ||
171 | } | ||
172 | |||
173 | /* sched_clock_tick() needs us? */ | ||
174 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
175 | /* | ||
176 | * TODO: kick full dynticks CPUs when | ||
177 | * sched_clock_stable is set. | ||
178 | */ | ||
179 | if (!sched_clock_stable) { | ||
180 | trace_tick_stop(0, "unstable sched clock\n"); | ||
181 | return false; | ||
182 | } | ||
183 | #endif | ||
184 | |||
185 | return true; | ||
186 | } | ||
187 | |||
188 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | ||
189 | |||
190 | /* | ||
191 | * Re-evaluate the need for the tick on the current CPU | ||
192 | * and restart it if necessary. | ||
193 | */ | ||
194 | void tick_nohz_full_check(void) | ||
195 | { | ||
196 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
197 | |||
198 | if (tick_nohz_full_cpu(smp_processor_id())) { | ||
199 | if (ts->tick_stopped && !is_idle_task(current)) { | ||
200 | if (!can_stop_full_tick()) | ||
201 | tick_nohz_restart_sched_tick(ts, ktime_get()); | ||
202 | } | ||
203 | } | ||
204 | } | ||
205 | |||
206 | static void nohz_full_kick_work_func(struct irq_work *work) | ||
207 | { | ||
208 | tick_nohz_full_check(); | ||
209 | } | ||
210 | |||
211 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | ||
212 | .func = nohz_full_kick_work_func, | ||
213 | }; | ||
214 | |||
215 | /* | ||
216 | * Kick the current CPU if it's full dynticks in order to force it to | ||
217 | * re-evaluate its dependency on the tick and restart it if necessary. | ||
218 | */ | ||
219 | void tick_nohz_full_kick(void) | ||
220 | { | ||
221 | if (tick_nohz_full_cpu(smp_processor_id())) | ||
222 | irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); | ||
223 | } | ||
224 | |||
225 | static void nohz_full_kick_ipi(void *info) | ||
226 | { | ||
227 | tick_nohz_full_check(); | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * Kick all full dynticks CPUs in order to force these to re-evaluate | ||
232 | * their dependency on the tick and restart it if necessary. | ||
233 | */ | ||
234 | void tick_nohz_full_kick_all(void) | ||
235 | { | ||
236 | if (!have_nohz_full_mask) | ||
237 | return; | ||
238 | |||
239 | preempt_disable(); | ||
240 | smp_call_function_many(nohz_full_mask, | ||
241 | nohz_full_kick_ipi, NULL, false); | ||
242 | preempt_enable(); | ||
243 | } | ||
244 | |||
/*
 * Called on task switch: the task now running may need the tick because
 * of per task/process properties (perf events, posix cpu timers, ...).
 *
 * With interrupts disabled, kick the current CPU if it is a full dynticks
 * CPU whose tick is stopped although it is no longer allowed to be.
 * @tsk is not used by this implementation.
 */
void tick_nohz_task_switch(struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);

	if (tick_nohz_full_cpu(smp_processor_id()) &&
	    tick_nohz_tick_stopped() && !can_stop_full_tick())
		tick_nohz_full_kick();

	local_irq_restore(flags);
}
265 | |||
266 | int tick_nohz_full_cpu(int cpu) | ||
267 | { | ||
268 | if (!have_nohz_full_mask) | ||
269 | return 0; | ||
270 | |||
271 | return cpumask_test_cpu(cpu, nohz_full_mask); | ||
272 | } | ||
273 | |||
274 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | ||
275 | static int __init tick_nohz_full_setup(char *str) | ||
276 | { | ||
277 | int cpu; | ||
278 | |||
279 | alloc_bootmem_cpumask_var(&nohz_full_mask); | ||
280 | if (cpulist_parse(str, nohz_full_mask) < 0) { | ||
281 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | ||
282 | return 1; | ||
283 | } | ||
284 | |||
285 | cpu = smp_processor_id(); | ||
286 | if (cpumask_test_cpu(cpu, nohz_full_mask)) { | ||
287 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | ||
288 | cpumask_clear_cpu(cpu, nohz_full_mask); | ||
289 | } | ||
290 | have_nohz_full_mask = true; | ||
291 | |||
292 | return 1; | ||
293 | } | ||
294 | __setup("nohz_full=", tick_nohz_full_setup); | ||
295 | |||
296 | static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, | ||
297 | unsigned long action, | ||
298 | void *hcpu) | ||
299 | { | ||
300 | unsigned int cpu = (unsigned long)hcpu; | ||
301 | |||
302 | switch (action & ~CPU_TASKS_FROZEN) { | ||
303 | case CPU_DOWN_PREPARE: | ||
304 | /* | ||
305 | * If we handle the timekeeping duty for full dynticks CPUs, | ||
306 | * we can't safely shutdown that CPU. | ||
307 | */ | ||
308 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) | ||
309 | return -EINVAL; | ||
310 | break; | ||
311 | } | ||
312 | return NOTIFY_OK; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * Worst case string length in chunks of CPU range seems 2 steps | ||
317 | * separations: 0,2,4,6,... | ||
318 | * This is NR_CPUS + sizeof('\0') | ||
319 | */ | ||
320 | static char __initdata nohz_full_buf[NR_CPUS + 1]; | ||
321 | |||
/*
 * With CONFIG_NO_HZ_FULL_ALL, make every CPU except the current one a
 * full dynticks CPU.
 *
 * Returns 0 once the mask is set up, -1 if the mask could not be
 * allocated or if CONFIG_NO_HZ_FULL_ALL is not enabled.
 */
static int tick_nohz_init_all(void)
{
#ifdef CONFIG_NO_HZ_FULL_ALL
	if (alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
		cpumask_setall(nohz_full_mask);
		/* The current CPU stays out of the full dynticks mask */
		cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
		have_nohz_full_mask = true;
		return 0;
	}
	pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
#endif
	return -1;
}
338 | |||
339 | void __init tick_nohz_init(void) | ||
340 | { | ||
341 | int cpu; | ||
342 | |||
343 | if (!have_nohz_full_mask) { | ||
344 | if (tick_nohz_init_all() < 0) | ||
345 | return; | ||
346 | } | ||
347 | |||
348 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | ||
349 | |||
350 | /* Make sure full dynticks CPU are also RCU nocbs */ | ||
351 | for_each_cpu(cpu, nohz_full_mask) { | ||
352 | if (!rcu_is_nocb_cpu(cpu)) { | ||
353 | pr_warning("NO_HZ: CPU %d is not RCU nocb: " | ||
354 | "cleared from nohz_full range", cpu); | ||
355 | cpumask_clear_cpu(cpu, nohz_full_mask); | ||
356 | } | ||
357 | } | ||
358 | |||
359 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); | ||
360 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | ||
361 | } | ||
362 | #else | ||
363 | #define have_nohz_full_mask (0) | ||
364 | #endif | ||
365 | |||
145 | /* | 366 | /* |
146 | * NOHZ - aka dynamic tick functionality | 367 | * NOHZ - aka dynamic tick functionality |
147 | */ | 368 | */ |
148 | #ifdef CONFIG_NO_HZ | 369 | #ifdef CONFIG_NO_HZ_COMMON |
149 | /* | 370 | /* |
150 | * NO HZ enabled ? | 371 | * NO HZ enabled ? |
151 | */ | 372 | */ |
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
345 | delta_jiffies = rcu_delta_jiffies; | 566 | delta_jiffies = rcu_delta_jiffies; |
346 | } | 567 | } |
347 | } | 568 | } |
569 | |||
348 | /* | 570 | /* |
349 | * Do not stop the tick, if we are only one off | 571 | * Do not stop the tick, if we are only one off (or less) |
350 | * or if the cpu is required for rcu | 572 | * or if the cpu is required for RCU: |
351 | */ | 573 | */ |
352 | if (!ts->tick_stopped && delta_jiffies == 1) | 574 | if (!ts->tick_stopped && delta_jiffies <= 1) |
353 | goto out; | 575 | goto out; |
354 | 576 | ||
355 | /* Schedule the tick, if we are at least one jiffie off */ | 577 | /* Schedule the tick, if we are at least one jiffie off */ |
@@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
378 | time_delta = KTIME_MAX; | 600 | time_delta = KTIME_MAX; |
379 | } | 601 | } |
380 | 602 | ||
603 | #ifdef CONFIG_NO_HZ_FULL | ||
604 | if (!ts->inidle) { | ||
605 | time_delta = min(time_delta, | ||
606 | scheduler_tick_max_deferment()); | ||
607 | } | ||
608 | #endif | ||
609 | |||
381 | /* | 610 | /* |
382 | * calculate the expiry time for the next timer wheel | 611 | * calculate the expiry time for the next timer wheel |
383 | * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals | 612 | * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals |
@@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
421 | 650 | ||
422 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 651 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
423 | ts->tick_stopped = 1; | 652 | ts->tick_stopped = 1; |
653 | trace_tick_stop(1, " "); | ||
424 | } | 654 | } |
425 | 655 | ||
426 | /* | 656 | /* |
@@ -457,6 +687,24 @@ out: | |||
457 | return ret; | 687 | return ret; |
458 | } | 688 | } |
459 | 689 | ||
/*
 * Try to stop the tick on a full dynticks CPU that is not idle.
 *
 * @ts: tick_sched state passed on to tick_nohz_stop_sched_tick().
 *
 * The tick is only stopped when the current CPU is in the full dynticks
 * mask, the current task is not the idle task, the tick is either already
 * stopped or nohz mode is active, and can_stop_full_tick() agrees.
 * Compiles to an empty function without CONFIG_NO_HZ_FULL.
 */
static void tick_nohz_full_stop_tick(struct tick_sched *ts)
{
#ifdef CONFIG_NO_HZ_FULL
	int cpu = smp_processor_id();

	if (tick_nohz_full_cpu(cpu) && !is_idle_task(current) &&
	    (ts->tick_stopped || ts->nohz_mode != NOHZ_MODE_INACTIVE) &&
	    can_stop_full_tick())
		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
#endif
}
707 | |||
460 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | 708 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) |
461 | { | 709 | { |
462 | /* | 710 | /* |
@@ -469,6 +717,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
469 | if (unlikely(!cpu_online(cpu))) { | 717 | if (unlikely(!cpu_online(cpu))) { |
470 | if (cpu == tick_do_timer_cpu) | 718 | if (cpu == tick_do_timer_cpu) |
471 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | 719 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; |
720 | return false; | ||
472 | } | 721 | } |
473 | 722 | ||
474 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 723 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) |
@@ -482,13 +731,28 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
482 | 731 | ||
483 | if (ratelimit < 10 && | 732 | if (ratelimit < 10 && |
484 | (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { | 733 | (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { |
485 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | 734 | pr_warn("NOHZ: local_softirq_pending %02x\n", |
486 | (unsigned int) local_softirq_pending()); | 735 | (unsigned int) local_softirq_pending()); |
487 | ratelimit++; | 736 | ratelimit++; |
488 | } | 737 | } |
489 | return false; | 738 | return false; |
490 | } | 739 | } |
491 | 740 | ||
741 | if (have_nohz_full_mask) { | ||
742 | /* | ||
743 | * Keep the tick alive to guarantee timekeeping progression | ||
744 | * if there are full dynticks CPUs around | ||
745 | */ | ||
746 | if (tick_do_timer_cpu == cpu) | ||
747 | return false; | ||
748 | /* | ||
749 | * Boot safety: make sure the timekeeping duty has been | ||
750 | * assigned before entering dyntick-idle mode, | ||
751 | */ | ||
752 | if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) | ||
753 | return false; | ||
754 | } | ||
755 | |||
492 | return true; | 756 | return true; |
493 | } | 757 | } |
494 | 758 | ||
@@ -568,12 +832,13 @@ void tick_nohz_irq_exit(void) | |||
568 | { | 832 | { |
569 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 833 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
570 | 834 | ||
571 | if (!ts->inidle) | 835 | if (ts->inidle) { |
572 | return; | 836 | /* Cancel the timer because CPU already waken up from the C-states*/ |
573 | 837 | menu_hrtimer_cancel(); | |
574 | /* Cancel the timer because CPU already waken up from the C-states*/ | 838 | __tick_nohz_idle_enter(ts); |
575 | menu_hrtimer_cancel(); | 839 | } else { |
576 | __tick_nohz_idle_enter(ts); | 840 | tick_nohz_full_stop_tick(ts); |
841 | } | ||
577 | } | 842 | } |
578 | 843 | ||
579 | /** | 844 | /** |
@@ -802,7 +1067,7 @@ static inline void tick_check_nohz(int cpu) | |||
802 | static inline void tick_nohz_switch_to_nohz(void) { } | 1067 | static inline void tick_nohz_switch_to_nohz(void) { } |
803 | static inline void tick_check_nohz(int cpu) { } | 1068 | static inline void tick_check_nohz(int cpu) { } |
804 | 1069 | ||
805 | #endif /* NO_HZ */ | 1070 | #endif /* CONFIG_NO_HZ_COMMON */ |
806 | 1071 | ||
807 | /* | 1072 | /* |
808 | * Called from irq_enter to notify about the possible interruption of idle() | 1073 | * Called from irq_enter to notify about the possible interruption of idle() |
@@ -887,14 +1152,14 @@ void tick_setup_sched_timer(void) | |||
887 | now = ktime_get(); | 1152 | now = ktime_get(); |
888 | } | 1153 | } |
889 | 1154 | ||
890 | #ifdef CONFIG_NO_HZ | 1155 | #ifdef CONFIG_NO_HZ_COMMON |
891 | if (tick_nohz_enabled) | 1156 | if (tick_nohz_enabled) |
892 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 1157 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
893 | #endif | 1158 | #endif |
894 | } | 1159 | } |
895 | #endif /* HIGH_RES_TIMERS */ | 1160 | #endif /* HIGH_RES_TIMERS */ |
896 | 1161 | ||
897 | #if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS | 1162 | #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS |
898 | void tick_cancel_sched_timer(int cpu) | 1163 | void tick_cancel_sched_timer(int cpu) |
899 | { | 1164 | { |
900 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 1165 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
@@ -904,7 +1169,7 @@ void tick_cancel_sched_timer(int cpu) | |||
904 | hrtimer_cancel(&ts->sched_timer); | 1169 | hrtimer_cancel(&ts->sched_timer); |
905 | # endif | 1170 | # endif |
906 | 1171 | ||
907 | ts->nohz_mode = NOHZ_MODE_INACTIVE; | 1172 | memset(ts, 0, sizeof(*ts)); |
908 | } | 1173 | } |
909 | #endif | 1174 | #endif |
910 | 1175 | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0bc98fbe1d..baeeb5c87cf1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -23,8 +23,13 @@ | |||
23 | #include <linux/stop_machine.h> | 23 | #include <linux/stop_machine.h> |
24 | #include <linux/pvclock_gtod.h> | 24 | #include <linux/pvclock_gtod.h> |
25 | 25 | ||
26 | #include "tick-internal.h" | ||
27 | #include "ntp_internal.h" | ||
26 | 28 | ||
27 | static struct timekeeper timekeeper; | 29 | static struct timekeeper timekeeper; |
30 | static DEFINE_RAW_SPINLOCK(timekeeper_lock); | ||
31 | static seqcount_t timekeeper_seq; | ||
32 | static struct timekeeper shadow_timekeeper; | ||
28 | 33 | ||
29 | /* flag for if timekeeping is suspended */ | 34 | /* flag for if timekeeping is suspended */ |
30 | int __read_mostly timekeeping_suspended; | 35 | int __read_mostly timekeeping_suspended; |
@@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) | |||
67 | tk->wall_to_monotonic = wtm; | 72 | tk->wall_to_monotonic = wtm; |
68 | set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); | 73 | set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); |
69 | tk->offs_real = timespec_to_ktime(tmp); | 74 | tk->offs_real = timespec_to_ktime(tmp); |
75 | tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); | ||
70 | } | 76 | } |
71 | 77 | ||
72 | static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) | 78 | static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) |
@@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
96 | 102 | ||
97 | old_clock = tk->clock; | 103 | old_clock = tk->clock; |
98 | tk->clock = clock; | 104 | tk->clock = clock; |
99 | clock->cycle_last = clock->read(clock); | 105 | tk->cycle_last = clock->cycle_last = clock->read(clock); |
100 | 106 | ||
101 | /* Do the ns -> cycle conversion first, using original mult */ | 107 | /* Do the ns -> cycle conversion first, using original mult */ |
102 | tmp = NTP_INTERVAL_LENGTH; | 108 | tmp = NTP_INTERVAL_LENGTH; |
@@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) | |||
201 | 207 | ||
202 | /** | 208 | /** |
203 | * pvclock_gtod_register_notifier - register a pvclock timedata update listener | 209 | * pvclock_gtod_register_notifier - register a pvclock timedata update listener |
204 | * | ||
205 | * Must hold write on timekeeper.lock | ||
206 | */ | 210 | */ |
207 | int pvclock_gtod_register_notifier(struct notifier_block *nb) | 211 | int pvclock_gtod_register_notifier(struct notifier_block *nb) |
208 | { | 212 | { |
@@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) | |||
210 | unsigned long flags; | 214 | unsigned long flags; |
211 | int ret; | 215 | int ret; |
212 | 216 | ||
213 | write_seqlock_irqsave(&tk->lock, flags); | 217 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
214 | ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); | 218 | ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); |
215 | /* update timekeeping data */ | ||
216 | update_pvclock_gtod(tk); | 219 | update_pvclock_gtod(tk); |
217 | write_sequnlock_irqrestore(&tk->lock, flags); | 220 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
218 | 221 | ||
219 | return ret; | 222 | return ret; |
220 | } | 223 | } |
@@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); | |||
223 | /** | 226 | /** |
224 | * pvclock_gtod_unregister_notifier - unregister a pvclock | 227 | * pvclock_gtod_unregister_notifier - unregister a pvclock |
225 | * timedata update listener | 228 | * timedata update listener |
226 | * | ||
227 | * Must hold write on timekeeper.lock | ||
228 | */ | 229 | */ |
229 | int pvclock_gtod_unregister_notifier(struct notifier_block *nb) | 230 | int pvclock_gtod_unregister_notifier(struct notifier_block *nb) |
230 | { | 231 | { |
231 | struct timekeeper *tk = &timekeeper; | ||
232 | unsigned long flags; | 232 | unsigned long flags; |
233 | int ret; | 233 | int ret; |
234 | 234 | ||
235 | write_seqlock_irqsave(&tk->lock, flags); | 235 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
236 | ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); | 236 | ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); |
237 | write_sequnlock_irqrestore(&tk->lock, flags); | 237 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
238 | 238 | ||
239 | return ret; | 239 | return ret; |
240 | } | 240 | } |
241 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | 241 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); |
242 | 242 | ||
243 | /* must hold write on timekeeper.lock */ | 243 | /* must hold timekeeper_lock */ |
244 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) | 244 | static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) |
245 | { | 245 | { |
246 | if (clearntp) { | 246 | if (clearntp) { |
247 | tk->ntp_error = 0; | 247 | tk->ntp_error = 0; |
@@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) | |||
249 | } | 249 | } |
250 | update_vsyscall(tk); | 250 | update_vsyscall(tk); |
251 | update_pvclock_gtod(tk); | 251 | update_pvclock_gtod(tk); |
252 | |||
253 | if (mirror) | ||
254 | memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); | ||
252 | } | 255 | } |
253 | 256 | ||
254 | /** | 257 | /** |
@@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
267 | clock = tk->clock; | 270 | clock = tk->clock; |
268 | cycle_now = clock->read(clock); | 271 | cycle_now = clock->read(clock); |
269 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 272 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
270 | clock->cycle_last = cycle_now; | 273 | tk->cycle_last = clock->cycle_last = cycle_now; |
271 | 274 | ||
272 | tk->xtime_nsec += cycle_delta * tk->mult; | 275 | tk->xtime_nsec += cycle_delta * tk->mult; |
273 | 276 | ||
@@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts) | |||
294 | s64 nsecs = 0; | 297 | s64 nsecs = 0; |
295 | 298 | ||
296 | do { | 299 | do { |
297 | seq = read_seqbegin(&tk->lock); | 300 | seq = read_seqcount_begin(&timekeeper_seq); |
298 | 301 | ||
299 | ts->tv_sec = tk->xtime_sec; | 302 | ts->tv_sec = tk->xtime_sec; |
300 | nsecs = timekeeping_get_ns(tk); | 303 | nsecs = timekeeping_get_ns(tk); |
301 | 304 | ||
302 | } while (read_seqretry(&tk->lock, seq)); | 305 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
303 | 306 | ||
304 | ts->tv_nsec = 0; | 307 | ts->tv_nsec = 0; |
305 | timespec_add_ns(ts, nsecs); | 308 | timespec_add_ns(ts, nsecs); |
@@ -335,11 +338,11 @@ ktime_t ktime_get(void) | |||
335 | WARN_ON(timekeeping_suspended); | 338 | WARN_ON(timekeeping_suspended); |
336 | 339 | ||
337 | do { | 340 | do { |
338 | seq = read_seqbegin(&tk->lock); | 341 | seq = read_seqcount_begin(&timekeeper_seq); |
339 | secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; | 342 | secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; |
340 | nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; | 343 | nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; |
341 | 344 | ||
342 | } while (read_seqretry(&tk->lock, seq)); | 345 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
343 | /* | 346 | /* |
344 | * Use ktime_set/ktime_add_ns to create a proper ktime on | 347 | * Use ktime_set/ktime_add_ns to create a proper ktime on |
345 | * 32-bit architectures without CONFIG_KTIME_SCALAR. | 348 | * 32-bit architectures without CONFIG_KTIME_SCALAR. |
@@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts) | |||
366 | WARN_ON(timekeeping_suspended); | 369 | WARN_ON(timekeeping_suspended); |
367 | 370 | ||
368 | do { | 371 | do { |
369 | seq = read_seqbegin(&tk->lock); | 372 | seq = read_seqcount_begin(&timekeeper_seq); |
370 | ts->tv_sec = tk->xtime_sec; | 373 | ts->tv_sec = tk->xtime_sec; |
371 | nsec = timekeeping_get_ns(tk); | 374 | nsec = timekeeping_get_ns(tk); |
372 | tomono = tk->wall_to_monotonic; | 375 | tomono = tk->wall_to_monotonic; |
373 | 376 | ||
374 | } while (read_seqretry(&tk->lock, seq)); | 377 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
375 | 378 | ||
376 | ts->tv_sec += tomono.tv_sec; | 379 | ts->tv_sec += tomono.tv_sec; |
377 | ts->tv_nsec = 0; | 380 | ts->tv_nsec = 0; |
@@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts) | |||
379 | } | 382 | } |
380 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 383 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
381 | 384 | ||
385 | |||
386 | /** | ||
387 | * timekeeping_clocktai - Returns the TAI time of day in a timespec | ||
388 | * @ts: pointer to the timespec to be set | ||
389 | * | ||
390 | * Returns the time of day in a timespec. | ||
391 | */ | ||
392 | void timekeeping_clocktai(struct timespec *ts) | ||
393 | { | ||
394 | struct timekeeper *tk = &timekeeper; | ||
395 | unsigned long seq; | ||
396 | u64 nsecs; | ||
397 | |||
398 | WARN_ON(timekeeping_suspended); | ||
399 | |||
400 | do { | ||
401 | seq = read_seqcount_begin(&timekeeper_seq); | ||
402 | |||
403 | ts->tv_sec = tk->xtime_sec + tk->tai_offset; | ||
404 | nsecs = timekeeping_get_ns(tk); | ||
405 | |||
406 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | ||
407 | |||
408 | ts->tv_nsec = 0; | ||
409 | timespec_add_ns(ts, nsecs); | ||
410 | |||
411 | } | ||
412 | EXPORT_SYMBOL(timekeeping_clocktai); | ||
413 | |||
414 | |||
415 | /** | ||
416 | * ktime_get_clocktai - Returns the TAI time of day in a ktime | ||
417 | * | ||
418 | * Returns the time of day in a ktime. | ||
419 | */ | ||
420 | ktime_t ktime_get_clocktai(void) | ||
421 | { | ||
422 | struct timespec ts; | ||
423 | |||
424 | timekeeping_clocktai(&ts); | ||
425 | return timespec_to_ktime(ts); | ||
426 | } | ||
427 | EXPORT_SYMBOL(ktime_get_clocktai); | ||
428 | |||
382 | #ifdef CONFIG_NTP_PPS | 429 | #ifdef CONFIG_NTP_PPS |
383 | 430 | ||
384 | /** | 431 | /** |
@@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
399 | WARN_ON_ONCE(timekeeping_suspended); | 446 | WARN_ON_ONCE(timekeeping_suspended); |
400 | 447 | ||
401 | do { | 448 | do { |
402 | seq = read_seqbegin(&tk->lock); | 449 | seq = read_seqcount_begin(&timekeeper_seq); |
403 | 450 | ||
404 | *ts_raw = tk->raw_time; | 451 | *ts_raw = tk->raw_time; |
405 | ts_real->tv_sec = tk->xtime_sec; | 452 | ts_real->tv_sec = tk->xtime_sec; |
@@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
408 | nsecs_raw = timekeeping_get_ns_raw(tk); | 455 | nsecs_raw = timekeeping_get_ns_raw(tk); |
409 | nsecs_real = timekeeping_get_ns(tk); | 456 | nsecs_real = timekeeping_get_ns(tk); |
410 | 457 | ||
411 | } while (read_seqretry(&tk->lock, seq)); | 458 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
412 | 459 | ||
413 | timespec_add_ns(ts_raw, nsecs_raw); | 460 | timespec_add_ns(ts_raw, nsecs_raw); |
414 | timespec_add_ns(ts_real, nsecs_real); | 461 | timespec_add_ns(ts_real, nsecs_real); |
@@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv) | |||
448 | if (!timespec_valid_strict(tv)) | 495 | if (!timespec_valid_strict(tv)) |
449 | return -EINVAL; | 496 | return -EINVAL; |
450 | 497 | ||
451 | write_seqlock_irqsave(&tk->lock, flags); | 498 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
499 | write_seqcount_begin(&timekeeper_seq); | ||
452 | 500 | ||
453 | timekeeping_forward_now(tk); | 501 | timekeeping_forward_now(tk); |
454 | 502 | ||
@@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv) | |||
460 | 508 | ||
461 | tk_set_xtime(tk, tv); | 509 | tk_set_xtime(tk, tv); |
462 | 510 | ||
463 | timekeeping_update(tk, true); | 511 | timekeeping_update(tk, true, true); |
464 | 512 | ||
465 | write_sequnlock_irqrestore(&tk->lock, flags); | 513 | write_seqcount_end(&timekeeper_seq); |
514 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
466 | 515 | ||
467 | /* signal hrtimers about time change */ | 516 | /* signal hrtimers about time change */ |
468 | clock_was_set(); | 517 | clock_was_set(); |
@@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
487 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | 536 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) |
488 | return -EINVAL; | 537 | return -EINVAL; |
489 | 538 | ||
490 | write_seqlock_irqsave(&tk->lock, flags); | 539 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
540 | write_seqcount_begin(&timekeeper_seq); | ||
491 | 541 | ||
492 | timekeeping_forward_now(tk); | 542 | timekeeping_forward_now(tk); |
493 | 543 | ||
@@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
502 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); | 552 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); |
503 | 553 | ||
504 | error: /* even if we error out, we forwarded the time, so call update */ | 554 | error: /* even if we error out, we forwarded the time, so call update */ |
505 | timekeeping_update(tk, true); | 555 | timekeeping_update(tk, true, true); |
506 | 556 | ||
507 | write_sequnlock_irqrestore(&tk->lock, flags); | 557 | write_seqcount_end(&timekeeper_seq); |
558 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
508 | 559 | ||
509 | /* signal hrtimers about time change */ | 560 | /* signal hrtimers about time change */ |
510 | clock_was_set(); | 561 | clock_was_set(); |
@@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */ | |||
513 | } | 564 | } |
514 | EXPORT_SYMBOL(timekeeping_inject_offset); | 565 | EXPORT_SYMBOL(timekeeping_inject_offset); |
515 | 566 | ||
567 | |||
568 | /** | ||
569 | * timekeeping_get_tai_offset - Returns current TAI offset from UTC | ||
570 | * | ||
571 | */ | ||
572 | s32 timekeeping_get_tai_offset(void) | ||
573 | { | ||
574 | struct timekeeper *tk = &timekeeper; | ||
575 | unsigned int seq; | ||
576 | s32 ret; | ||
577 | |||
578 | do { | ||
579 | seq = read_seqcount_begin(&timekeeper_seq); | ||
580 | ret = tk->tai_offset; | ||
581 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | ||
582 | |||
583 | return ret; | ||
584 | } | ||
585 | |||
586 | /** | ||
587 | * __timekeeping_set_tai_offset - Lock free worker function | ||
588 | * | ||
589 | */ | ||
590 | static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) | ||
591 | { | ||
592 | tk->tai_offset = tai_offset; | ||
593 | tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); | ||
594 | } | ||
595 | |||
596 | /** | ||
597 | * timekeeping_set_tai_offset - Sets the current TAI offset from UTC | ||
598 | * | ||
599 | */ | ||
600 | void timekeeping_set_tai_offset(s32 tai_offset) | ||
601 | { | ||
602 | struct timekeeper *tk = &timekeeper; | ||
603 | unsigned long flags; | ||
604 | |||
605 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | ||
606 | write_seqcount_begin(&timekeeper_seq); | ||
607 | __timekeeping_set_tai_offset(tk, tai_offset); | ||
608 | write_seqcount_end(&timekeeper_seq); | ||
609 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
610 | clock_was_set(); | ||
611 | } | ||
612 | |||
516 | /** | 613 | /** |
517 | * change_clocksource - Swaps clocksources if a new one is available | 614 | * change_clocksource - Swaps clocksources if a new one is available |
518 | * | 615 | * |
@@ -526,7 +623,8 @@ static int change_clocksource(void *data) | |||
526 | 623 | ||
527 | new = (struct clocksource *) data; | 624 | new = (struct clocksource *) data; |
528 | 625 | ||
529 | write_seqlock_irqsave(&tk->lock, flags); | 626 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
627 | write_seqcount_begin(&timekeeper_seq); | ||
530 | 628 | ||
531 | timekeeping_forward_now(tk); | 629 | timekeeping_forward_now(tk); |
532 | if (!new->enable || new->enable(new) == 0) { | 630 | if (!new->enable || new->enable(new) == 0) { |
@@ -535,9 +633,10 @@ static int change_clocksource(void *data) | |||
535 | if (old->disable) | 633 | if (old->disable) |
536 | old->disable(old); | 634 | old->disable(old); |
537 | } | 635 | } |
538 | timekeeping_update(tk, true); | 636 | timekeeping_update(tk, true, true); |
539 | 637 | ||
540 | write_sequnlock_irqrestore(&tk->lock, flags); | 638 | write_seqcount_end(&timekeeper_seq); |
639 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
541 | 640 | ||
542 | return 0; | 641 | return 0; |
543 | } | 642 | } |
@@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts) | |||
587 | s64 nsecs; | 686 | s64 nsecs; |
588 | 687 | ||
589 | do { | 688 | do { |
590 | seq = read_seqbegin(&tk->lock); | 689 | seq = read_seqcount_begin(&timekeeper_seq); |
591 | nsecs = timekeeping_get_ns_raw(tk); | 690 | nsecs = timekeeping_get_ns_raw(tk); |
592 | *ts = tk->raw_time; | 691 | *ts = tk->raw_time; |
593 | 692 | ||
594 | } while (read_seqretry(&tk->lock, seq)); | 693 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
595 | 694 | ||
596 | timespec_add_ns(ts, nsecs); | 695 | timespec_add_ns(ts, nsecs); |
597 | } | 696 | } |
@@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void) | |||
607 | int ret; | 706 | int ret; |
608 | 707 | ||
609 | do { | 708 | do { |
610 | seq = read_seqbegin(&tk->lock); | 709 | seq = read_seqcount_begin(&timekeeper_seq); |
611 | 710 | ||
612 | ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 711 | ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
613 | 712 | ||
614 | } while (read_seqretry(&tk->lock, seq)); | 713 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
615 | 714 | ||
616 | return ret; | 715 | return ret; |
617 | } | 716 | } |
@@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void) | |||
626 | u64 ret; | 725 | u64 ret; |
627 | 726 | ||
628 | do { | 727 | do { |
629 | seq = read_seqbegin(&tk->lock); | 728 | seq = read_seqcount_begin(&timekeeper_seq); |
630 | 729 | ||
631 | ret = tk->clock->max_idle_ns; | 730 | ret = tk->clock->max_idle_ns; |
632 | 731 | ||
633 | } while (read_seqretry(&tk->lock, seq)); | 732 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
634 | 733 | ||
635 | return ret; | 734 | return ret; |
636 | } | 735 | } |
@@ -693,11 +792,10 @@ void __init timekeeping_init(void) | |||
693 | boot.tv_nsec = 0; | 792 | boot.tv_nsec = 0; |
694 | } | 793 | } |
695 | 794 | ||
696 | seqlock_init(&tk->lock); | 795 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
697 | 796 | write_seqcount_begin(&timekeeper_seq); | |
698 | ntp_init(); | 797 | ntp_init(); |
699 | 798 | ||
700 | write_seqlock_irqsave(&tk->lock, flags); | ||
701 | clock = clocksource_default_clock(); | 799 | clock = clocksource_default_clock(); |
702 | if (clock->enable) | 800 | if (clock->enable) |
703 | clock->enable(clock); | 801 | clock->enable(clock); |
@@ -716,7 +814,10 @@ void __init timekeeping_init(void) | |||
716 | tmp.tv_nsec = 0; | 814 | tmp.tv_nsec = 0; |
717 | tk_set_sleep_time(tk, tmp); | 815 | tk_set_sleep_time(tk, tmp); |
718 | 816 | ||
719 | write_sequnlock_irqrestore(&tk->lock, flags); | 817 | memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); |
818 | |||
819 | write_seqcount_end(&timekeeper_seq); | ||
820 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
720 | } | 821 | } |
721 | 822 | ||
722 | /* time in seconds when suspend began */ | 823 | /* time in seconds when suspend began */ |
@@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
764 | if (has_persistent_clock()) | 865 | if (has_persistent_clock()) |
765 | return; | 866 | return; |
766 | 867 | ||
767 | write_seqlock_irqsave(&tk->lock, flags); | 868 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
869 | write_seqcount_begin(&timekeeper_seq); | ||
768 | 870 | ||
769 | timekeeping_forward_now(tk); | 871 | timekeeping_forward_now(tk); |
770 | 872 | ||
771 | __timekeeping_inject_sleeptime(tk, delta); | 873 | __timekeeping_inject_sleeptime(tk, delta); |
772 | 874 | ||
773 | timekeeping_update(tk, true); | 875 | timekeeping_update(tk, true, true); |
774 | 876 | ||
775 | write_sequnlock_irqrestore(&tk->lock, flags); | 877 | write_seqcount_end(&timekeeper_seq); |
878 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
776 | 879 | ||
777 | /* signal hrtimers about time change */ | 880 | /* signal hrtimers about time change */ |
778 | clock_was_set(); | 881 | clock_was_set(); |
@@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
788 | static void timekeeping_resume(void) | 891 | static void timekeeping_resume(void) |
789 | { | 892 | { |
790 | struct timekeeper *tk = &timekeeper; | 893 | struct timekeeper *tk = &timekeeper; |
894 | struct clocksource *clock = tk->clock; | ||
791 | unsigned long flags; | 895 | unsigned long flags; |
792 | struct timespec ts; | 896 | struct timespec ts_new, ts_delta; |
897 | cycle_t cycle_now, cycle_delta; | ||
898 | bool suspendtime_found = false; | ||
793 | 899 | ||
794 | read_persistent_clock(&ts); | 900 | read_persistent_clock(&ts_new); |
795 | 901 | ||
796 | clockevents_resume(); | 902 | clockevents_resume(); |
797 | clocksource_resume(); | 903 | clocksource_resume(); |
798 | 904 | ||
799 | write_seqlock_irqsave(&tk->lock, flags); | 905 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
906 | write_seqcount_begin(&timekeeper_seq); | ||
907 | |||
908 | /* | ||
909 | * After system resumes, we need to calculate the suspended time and | ||
910 | * compensate it for the OS time. There are 3 sources that could be | ||
911 | * used: Nonstop clocksource during suspend, persistent clock and rtc | ||
912 | * device. | ||
913 | * | ||
914 | * One specific platform may have 1 or 2 or all of them, and the | ||
915 | * preference will be: | ||
916 | * suspend-nonstop clocksource -> persistent clock -> rtc | ||
917 | * The less preferred source will only be tried if there is no better | ||
918 | * usable source. The rtc part is handled separately in rtc core code. | ||
919 | */ | ||
920 | cycle_now = clock->read(clock); | ||
921 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | ||
922 | cycle_now > clock->cycle_last) { | ||
923 | u64 num, max = ULLONG_MAX; | ||
924 | u32 mult = clock->mult; | ||
925 | u32 shift = clock->shift; | ||
926 | s64 nsec = 0; | ||
927 | |||
928 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | ||
929 | |||
930 | /* | ||
931 | * "cycle_delta * mult" may cause 64 bits overflow, if the | ||
932 | * suspended time is too long. In that case we need to do the | ||
933 | * 64 bits math carefully | ||
934 | */ | ||
935 | do_div(max, mult); | ||
936 | if (cycle_delta > max) { | ||
937 | num = div64_u64(cycle_delta, max); | ||
938 | nsec = (((u64) max * mult) >> shift) * num; | ||
939 | cycle_delta -= num * max; | ||
940 | } | ||
941 | nsec += ((u64) cycle_delta * mult) >> shift; | ||
800 | 942 | ||
801 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 943 | ts_delta = ns_to_timespec(nsec); |
802 | ts = timespec_sub(ts, timekeeping_suspend_time); | 944 | suspendtime_found = true; |
803 | __timekeeping_inject_sleeptime(tk, &ts); | 945 | } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { |
946 | ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); | ||
947 | suspendtime_found = true; | ||
804 | } | 948 | } |
805 | /* re-base the last cycle value */ | 949 | |
806 | tk->clock->cycle_last = tk->clock->read(tk->clock); | 950 | if (suspendtime_found) |
951 | __timekeeping_inject_sleeptime(tk, &ts_delta); | ||
952 | |||
953 | /* Re-base the last cycle value */ | ||
954 | tk->cycle_last = clock->cycle_last = cycle_now; | ||
807 | tk->ntp_error = 0; | 955 | tk->ntp_error = 0; |
808 | timekeeping_suspended = 0; | 956 | timekeeping_suspended = 0; |
809 | timekeeping_update(tk, false); | 957 | timekeeping_update(tk, false, true); |
810 | write_sequnlock_irqrestore(&tk->lock, flags); | 958 | write_seqcount_end(&timekeeper_seq); |
959 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
811 | 960 | ||
812 | touch_softlockup_watchdog(); | 961 | touch_softlockup_watchdog(); |
813 | 962 | ||
@@ -826,7 +975,16 @@ static int timekeeping_suspend(void) | |||
826 | 975 | ||
827 | read_persistent_clock(&timekeeping_suspend_time); | 976 | read_persistent_clock(&timekeeping_suspend_time); |
828 | 977 | ||
829 | write_seqlock_irqsave(&tk->lock, flags); | 978 | /* |
979 | * On some systems the persistent_clock can not be detected at | ||
980 | * timekeeping_init by its return value, so if we see a valid | ||
981 | * value returned, update the persistent_clock_exist flag. | ||
982 | */ | ||
983 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | ||
984 | persistent_clock_exist = true; | ||
985 | |||
986 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | ||
987 | write_seqcount_begin(&timekeeper_seq); | ||
830 | timekeeping_forward_now(tk); | 988 | timekeeping_forward_now(tk); |
831 | timekeeping_suspended = 1; | 989 | timekeeping_suspended = 1; |
832 | 990 | ||
@@ -849,7 +1007,8 @@ static int timekeeping_suspend(void) | |||
849 | timekeeping_suspend_time = | 1007 | timekeeping_suspend_time = |
850 | timespec_add(timekeeping_suspend_time, delta_delta); | 1008 | timespec_add(timekeeping_suspend_time, delta_delta); |
851 | } | 1009 | } |
852 | write_sequnlock_irqrestore(&tk->lock, flags); | 1010 | write_seqcount_end(&timekeeper_seq); |
1011 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
853 | 1012 | ||
854 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 1013 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
855 | clocksource_suspend(); | 1014 | clocksource_suspend(); |
@@ -1099,6 +1258,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) | |||
1099 | tk_set_wall_to_mono(tk, | 1258 | tk_set_wall_to_mono(tk, |
1100 | timespec_sub(tk->wall_to_monotonic, ts)); | 1259 | timespec_sub(tk->wall_to_monotonic, ts)); |
1101 | 1260 | ||
1261 | __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); | ||
1262 | |||
1102 | clock_was_set_delayed(); | 1263 | clock_was_set_delayed(); |
1103 | } | 1264 | } |
1104 | } | 1265 | } |
@@ -1116,15 +1277,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) | |||
1116 | static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | 1277 | static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, |
1117 | u32 shift) | 1278 | u32 shift) |
1118 | { | 1279 | { |
1280 | cycle_t interval = tk->cycle_interval << shift; | ||
1119 | u64 raw_nsecs; | 1281 | u64 raw_nsecs; |
1120 | 1282 | ||
1121 | /* If the offset is smaller than a shifted interval, do nothing */ | 1283 | /* If the offset is smaller than a shifted interval, do nothing */ |
1122 | if (offset < tk->cycle_interval<<shift) | 1284 | if (offset < interval) |
1123 | return offset; | 1285 | return offset; |
1124 | 1286 | ||
1125 | /* Accumulate one shifted interval */ | 1287 | /* Accumulate one shifted interval */ |
1126 | offset -= tk->cycle_interval << shift; | 1288 | offset -= interval; |
1127 | tk->clock->cycle_last += tk->cycle_interval << shift; | 1289 | tk->cycle_last += interval; |
1128 | 1290 | ||
1129 | tk->xtime_nsec += tk->xtime_interval << shift; | 1291 | tk->xtime_nsec += tk->xtime_interval << shift; |
1130 | accumulate_nsecs_to_secs(tk); | 1292 | accumulate_nsecs_to_secs(tk); |
@@ -1181,27 +1343,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
1181 | static void update_wall_time(void) | 1343 | static void update_wall_time(void) |
1182 | { | 1344 | { |
1183 | struct clocksource *clock; | 1345 | struct clocksource *clock; |
1184 | struct timekeeper *tk = &timekeeper; | 1346 | struct timekeeper *real_tk = &timekeeper; |
1347 | struct timekeeper *tk = &shadow_timekeeper; | ||
1185 | cycle_t offset; | 1348 | cycle_t offset; |
1186 | int shift = 0, maxshift; | 1349 | int shift = 0, maxshift; |
1187 | unsigned long flags; | 1350 | unsigned long flags; |
1188 | 1351 | ||
1189 | write_seqlock_irqsave(&tk->lock, flags); | 1352 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1190 | 1353 | ||
1191 | /* Make sure we're fully resumed: */ | 1354 | /* Make sure we're fully resumed: */ |
1192 | if (unlikely(timekeeping_suspended)) | 1355 | if (unlikely(timekeeping_suspended)) |
1193 | goto out; | 1356 | goto out; |
1194 | 1357 | ||
1195 | clock = tk->clock; | 1358 | clock = real_tk->clock; |
1196 | 1359 | ||
1197 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1360 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
1198 | offset = tk->cycle_interval; | 1361 | offset = real_tk->cycle_interval; |
1199 | #else | 1362 | #else |
1200 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 1363 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; |
1201 | #endif | 1364 | #endif |
1202 | 1365 | ||
1203 | /* Check if there's really nothing to do */ | 1366 | /* Check if there's really nothing to do */ |
1204 | if (offset < tk->cycle_interval) | 1367 | if (offset < real_tk->cycle_interval) |
1205 | goto out; | 1368 | goto out; |
1206 | 1369 | ||
1207 | /* | 1370 | /* |
@@ -1238,11 +1401,24 @@ static void update_wall_time(void) | |||
1238 | */ | 1401 | */ |
1239 | accumulate_nsecs_to_secs(tk); | 1402 | accumulate_nsecs_to_secs(tk); |
1240 | 1403 | ||
1241 | timekeeping_update(tk, false); | 1404 | write_seqcount_begin(&timekeeper_seq); |
1242 | 1405 | /* Update clock->cycle_last with the new value */ | |
1406 | clock->cycle_last = tk->cycle_last; | ||
1407 | /* | ||
1408 | * Update the real timekeeper. | ||
1409 | * | ||
1410 | * We could avoid this memcpy by switching pointers, but that | ||
1411 | * requires changes to all other timekeeper usage sites as | ||
1412 | * well, i.e. move the timekeeper pointer getter into the | ||
1413 | * spinlocked/seqcount protected sections. And we trade this | ||
1414 | * memcpy under the timekeeper_seq against one before we start | ||
1415 | * updating. | ||
1416 | */ | ||
1417 | memcpy(real_tk, tk, sizeof(*tk)); | ||
1418 | timekeeping_update(real_tk, false, false); | ||
1419 | write_seqcount_end(&timekeeper_seq); | ||
1243 | out: | 1420 | out: |
1244 | write_sequnlock_irqrestore(&tk->lock, flags); | 1421 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1245 | |||
1246 | } | 1422 | } |
1247 | 1423 | ||
1248 | /** | 1424 | /** |
@@ -1289,13 +1465,13 @@ void get_monotonic_boottime(struct timespec *ts) | |||
1289 | WARN_ON(timekeeping_suspended); | 1465 | WARN_ON(timekeeping_suspended); |
1290 | 1466 | ||
1291 | do { | 1467 | do { |
1292 | seq = read_seqbegin(&tk->lock); | 1468 | seq = read_seqcount_begin(&timekeeper_seq); |
1293 | ts->tv_sec = tk->xtime_sec; | 1469 | ts->tv_sec = tk->xtime_sec; |
1294 | nsec = timekeeping_get_ns(tk); | 1470 | nsec = timekeeping_get_ns(tk); |
1295 | tomono = tk->wall_to_monotonic; | 1471 | tomono = tk->wall_to_monotonic; |
1296 | sleep = tk->total_sleep_time; | 1472 | sleep = tk->total_sleep_time; |
1297 | 1473 | ||
1298 | } while (read_seqretry(&tk->lock, seq)); | 1474 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
1299 | 1475 | ||
1300 | ts->tv_sec += tomono.tv_sec + sleep.tv_sec; | 1476 | ts->tv_sec += tomono.tv_sec + sleep.tv_sec; |
1301 | ts->tv_nsec = 0; | 1477 | ts->tv_nsec = 0; |
@@ -1354,10 +1530,10 @@ struct timespec current_kernel_time(void) | |||
1354 | unsigned long seq; | 1530 | unsigned long seq; |
1355 | 1531 | ||
1356 | do { | 1532 | do { |
1357 | seq = read_seqbegin(&tk->lock); | 1533 | seq = read_seqcount_begin(&timekeeper_seq); |
1358 | 1534 | ||
1359 | now = tk_xtime(tk); | 1535 | now = tk_xtime(tk); |
1360 | } while (read_seqretry(&tk->lock, seq)); | 1536 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
1361 | 1537 | ||
1362 | return now; | 1538 | return now; |
1363 | } | 1539 | } |
@@ -1370,11 +1546,11 @@ struct timespec get_monotonic_coarse(void) | |||
1370 | unsigned long seq; | 1546 | unsigned long seq; |
1371 | 1547 | ||
1372 | do { | 1548 | do { |
1373 | seq = read_seqbegin(&tk->lock); | 1549 | seq = read_seqcount_begin(&timekeeper_seq); |
1374 | 1550 | ||
1375 | now = tk_xtime(tk); | 1551 | now = tk_xtime(tk); |
1376 | mono = tk->wall_to_monotonic; | 1552 | mono = tk->wall_to_monotonic; |
1377 | } while (read_seqretry(&tk->lock, seq)); | 1553 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
1378 | 1554 | ||
1379 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, | 1555 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, |
1380 | now.tv_nsec + mono.tv_nsec); | 1556 | now.tv_nsec + mono.tv_nsec); |
@@ -1405,11 +1581,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1405 | unsigned long seq; | 1581 | unsigned long seq; |
1406 | 1582 | ||
1407 | do { | 1583 | do { |
1408 | seq = read_seqbegin(&tk->lock); | 1584 | seq = read_seqcount_begin(&timekeeper_seq); |
1409 | *xtim = tk_xtime(tk); | 1585 | *xtim = tk_xtime(tk); |
1410 | *wtom = tk->wall_to_monotonic; | 1586 | *wtom = tk->wall_to_monotonic; |
1411 | *sleep = tk->total_sleep_time; | 1587 | *sleep = tk->total_sleep_time; |
1412 | } while (read_seqretry(&tk->lock, seq)); | 1588 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
1413 | } | 1589 | } |
1414 | 1590 | ||
1415 | #ifdef CONFIG_HIGH_RES_TIMERS | 1591 | #ifdef CONFIG_HIGH_RES_TIMERS |
@@ -1421,7 +1597,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1421 | * Returns current monotonic time and updates the offsets | 1597 | * Returns current monotonic time and updates the offsets |
1422 | * Called from hrtimer_interrupt() or retrigger_next_event() | 1598 | * Called from hrtimer_interrupt() or retrigger_next_event() |
1423 | */ | 1599 | */ |
1424 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | 1600 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, |
1601 | ktime_t *offs_tai) | ||
1425 | { | 1602 | { |
1426 | struct timekeeper *tk = &timekeeper; | 1603 | struct timekeeper *tk = &timekeeper; |
1427 | ktime_t now; | 1604 | ktime_t now; |
@@ -1429,14 +1606,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | |||
1429 | u64 secs, nsecs; | 1606 | u64 secs, nsecs; |
1430 | 1607 | ||
1431 | do { | 1608 | do { |
1432 | seq = read_seqbegin(&tk->lock); | 1609 | seq = read_seqcount_begin(&timekeeper_seq); |
1433 | 1610 | ||
1434 | secs = tk->xtime_sec; | 1611 | secs = tk->xtime_sec; |
1435 | nsecs = timekeeping_get_ns(tk); | 1612 | nsecs = timekeeping_get_ns(tk); |
1436 | 1613 | ||
1437 | *offs_real = tk->offs_real; | 1614 | *offs_real = tk->offs_real; |
1438 | *offs_boot = tk->offs_boot; | 1615 | *offs_boot = tk->offs_boot; |
1439 | } while (read_seqretry(&tk->lock, seq)); | 1616 | *offs_tai = tk->offs_tai; |
1617 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | ||
1440 | 1618 | ||
1441 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | 1619 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); |
1442 | now = ktime_sub(now, *offs_real); | 1620 | now = ktime_sub(now, *offs_real); |
@@ -1454,15 +1632,79 @@ ktime_t ktime_get_monotonic_offset(void) | |||
1454 | struct timespec wtom; | 1632 | struct timespec wtom; |
1455 | 1633 | ||
1456 | do { | 1634 | do { |
1457 | seq = read_seqbegin(&tk->lock); | 1635 | seq = read_seqcount_begin(&timekeeper_seq); |
1458 | wtom = tk->wall_to_monotonic; | 1636 | wtom = tk->wall_to_monotonic; |
1459 | } while (read_seqretry(&tk->lock, seq)); | 1637 | } while (read_seqcount_retry(&timekeeper_seq, seq)); |
1460 | 1638 | ||
1461 | return timespec_to_ktime(wtom); | 1639 | return timespec_to_ktime(wtom); |
1462 | } | 1640 | } |
1463 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | 1641 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); |
1464 | 1642 | ||
1465 | /** | 1643 | /** |
1644 | * do_adjtimex() - Accessor function to NTP __do_adjtimex function | ||
1645 | */ | ||
1646 | int do_adjtimex(struct timex *txc) | ||
1647 | { | ||
1648 | struct timekeeper *tk = &timekeeper; | ||
1649 | unsigned long flags; | ||
1650 | struct timespec ts; | ||
1651 | s32 orig_tai, tai; | ||
1652 | int ret; | ||
1653 | |||
1654 | /* Validate the data before disabling interrupts */ | ||
1655 | ret = ntp_validate_timex(txc); | ||
1656 | if (ret) | ||
1657 | return ret; | ||
1658 | |||
1659 | if (txc->modes & ADJ_SETOFFSET) { | ||
1660 | struct timespec delta; | ||
1661 | delta.tv_sec = txc->time.tv_sec; | ||
1662 | delta.tv_nsec = txc->time.tv_usec; | ||
1663 | if (!(txc->modes & ADJ_NANO)) | ||
1664 | delta.tv_nsec *= 1000; | ||
1665 | ret = timekeeping_inject_offset(&delta); | ||
1666 | if (ret) | ||
1667 | return ret; | ||
1668 | } | ||
1669 | |||
1670 | getnstimeofday(&ts); | ||
1671 | |||
1672 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | ||
1673 | write_seqcount_begin(&timekeeper_seq); | ||
1674 | |||
1675 | orig_tai = tai = tk->tai_offset; | ||
1676 | ret = __do_adjtimex(txc, &ts, &tai); | ||
1677 | |||
1678 | if (tai != orig_tai) { | ||
1679 | __timekeeping_set_tai_offset(tk, tai); | ||
1680 | clock_was_set_delayed(); | ||
1681 | } | ||
1682 | write_seqcount_end(&timekeeper_seq); | ||
1683 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
1684 | |||
1685 | return ret; | ||
1686 | } | ||
1687 | |||
1688 | #ifdef CONFIG_NTP_PPS | ||
1689 | /** | ||
1690 | * hardpps() - Accessor function to NTP __hardpps function | ||
1691 | */ | ||
1692 | void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | ||
1693 | { | ||
1694 | unsigned long flags; | ||
1695 | |||
1696 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | ||
1697 | write_seqcount_begin(&timekeeper_seq); | ||
1698 | |||
1699 | __hardpps(phase_ts, raw_ts); | ||
1700 | |||
1701 | write_seqcount_end(&timekeeper_seq); | ||
1702 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | ||
1703 | } | ||
1704 | EXPORT_SYMBOL(hardpps); | ||
1705 | #endif | ||
1706 | |||
1707 | /** | ||
1466 | * xtime_update() - advances the timekeeping infrastructure | 1708 | * xtime_update() - advances the timekeeping infrastructure |
1467 | * @ticks: number of ticks, that have elapsed since the last call. | 1709 | * @ticks: number of ticks, that have elapsed since the last call. |
1468 | * | 1710 | * |
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9f164b..3bdf28323012 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -20,6 +20,13 @@ | |||
20 | 20 | ||
21 | #include <asm/uaccess.h> | 21 | #include <asm/uaccess.h> |
22 | 22 | ||
23 | |||
24 | struct timer_list_iter { | ||
25 | int cpu; | ||
26 | bool second_pass; | ||
27 | u64 now; | ||
28 | }; | ||
29 | |||
23 | typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); | 30 | typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); |
24 | 31 | ||
25 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); | 32 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); |
@@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
133 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 140 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
134 | int i; | 141 | int i; |
135 | 142 | ||
136 | SEQ_printf(m, "\n"); | ||
137 | SEQ_printf(m, "cpu: %d\n", cpu); | 143 | SEQ_printf(m, "cpu: %d\n", cpu); |
138 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 144 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
139 | SEQ_printf(m, " clock %d:\n", i); | 145 | SEQ_printf(m, " clock %d:\n", i); |
@@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
187 | 193 | ||
188 | #undef P | 194 | #undef P |
189 | #undef P_ns | 195 | #undef P_ns |
196 | SEQ_printf(m, "\n"); | ||
190 | } | 197 | } |
191 | 198 | ||
192 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 199 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
@@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
195 | { | 202 | { |
196 | struct clock_event_device *dev = td->evtdev; | 203 | struct clock_event_device *dev = td->evtdev; |
197 | 204 | ||
198 | SEQ_printf(m, "\n"); | ||
199 | SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); | 205 | SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); |
200 | if (cpu < 0) | 206 | if (cpu < 0) |
201 | SEQ_printf(m, "Broadcast device\n"); | 207 | SEQ_printf(m, "Broadcast device\n"); |
@@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
230 | print_name_offset(m, dev->event_handler); | 236 | print_name_offset(m, dev->event_handler); |
231 | SEQ_printf(m, "\n"); | 237 | SEQ_printf(m, "\n"); |
232 | SEQ_printf(m, " retries: %lu\n", dev->retries); | 238 | SEQ_printf(m, " retries: %lu\n", dev->retries); |
239 | SEQ_printf(m, "\n"); | ||
233 | } | 240 | } |
234 | 241 | ||
235 | static void timer_list_show_tickdevices(struct seq_file *m) | 242 | static void timer_list_show_tickdevices_header(struct seq_file *m) |
236 | { | 243 | { |
237 | int cpu; | ||
238 | |||
239 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 244 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
240 | print_tickdevice(m, tick_get_broadcast_device(), -1); | 245 | print_tickdevice(m, tick_get_broadcast_device(), -1); |
241 | SEQ_printf(m, "tick_broadcast_mask: %08lx\n", | 246 | SEQ_printf(m, "tick_broadcast_mask: %08lx\n", |
@@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m) | |||
246 | #endif | 251 | #endif |
247 | SEQ_printf(m, "\n"); | 252 | SEQ_printf(m, "\n"); |
248 | #endif | 253 | #endif |
249 | for_each_online_cpu(cpu) | ||
250 | print_tickdevice(m, tick_get_device(cpu), cpu); | ||
251 | SEQ_printf(m, "\n"); | ||
252 | } | 254 | } |
253 | #else | ||
254 | static void timer_list_show_tickdevices(struct seq_file *m) { } | ||
255 | #endif | 255 | #endif |
256 | 256 | ||
257 | static inline void timer_list_header(struct seq_file *m, u64 now) | ||
258 | { | ||
259 | SEQ_printf(m, "Timer List Version: v0.7\n"); | ||
260 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | ||
261 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | ||
262 | SEQ_printf(m, "\n"); | ||
263 | } | ||
264 | |||
257 | static int timer_list_show(struct seq_file *m, void *v) | 265 | static int timer_list_show(struct seq_file *m, void *v) |
258 | { | 266 | { |
267 | struct timer_list_iter *iter = v; | ||
268 | u64 now = ktime_to_ns(ktime_get()); | ||
269 | |||
270 | if (iter->cpu == -1 && !iter->second_pass) | ||
271 | timer_list_header(m, now); | ||
272 | else if (!iter->second_pass) | ||
273 | print_cpu(m, iter->cpu, iter->now); | ||
274 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
275 | else if (iter->cpu == -1 && iter->second_pass) | ||
276 | timer_list_show_tickdevices_header(m); | ||
277 | else | ||
278 | print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); | ||
279 | #endif | ||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | void sysrq_timer_list_show(void) | ||
284 | { | ||
259 | u64 now = ktime_to_ns(ktime_get()); | 285 | u64 now = ktime_to_ns(ktime_get()); |
260 | int cpu; | 286 | int cpu; |
261 | 287 | ||
262 | SEQ_printf(m, "Timer List Version: v0.7\n"); | 288 | timer_list_header(NULL, now); |
263 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | ||
264 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | ||
265 | 289 | ||
266 | for_each_online_cpu(cpu) | 290 | for_each_online_cpu(cpu) |
267 | print_cpu(m, cpu, now); | 291 | print_cpu(NULL, cpu, now); |
268 | 292 | ||
269 | SEQ_printf(m, "\n"); | 293 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
270 | timer_list_show_tickdevices(m); | 294 | timer_list_show_tickdevices_header(NULL); |
295 | for_each_online_cpu(cpu) | ||
296 | print_tickdevice(NULL, tick_get_device(cpu), cpu); | ||
297 | #endif | ||
298 | return; | ||
299 | } | ||
271 | 300 | ||
272 | return 0; | 301 | static void *timer_list_start(struct seq_file *file, loff_t *offset) |
302 | { | ||
303 | struct timer_list_iter *iter = file->private; | ||
304 | |||
305 | if (!*offset) { | ||
306 | iter->cpu = -1; | ||
307 | iter->now = ktime_to_ns(ktime_get()); | ||
308 | } else if (iter->cpu >= nr_cpu_ids) { | ||
309 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
310 | if (!iter->second_pass) { | ||
311 | iter->cpu = -1; | ||
312 | iter->second_pass = true; | ||
313 | } else | ||
314 | return NULL; | ||
315 | #else | ||
316 | return NULL; | ||
317 | #endif | ||
318 | } | ||
319 | return iter; | ||
273 | } | 320 | } |
274 | 321 | ||
275 | void sysrq_timer_list_show(void) | 322 | static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) |
323 | { | ||
324 | struct timer_list_iter *iter = file->private; | ||
325 | iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); | ||
326 | ++*offset; | ||
327 | return timer_list_start(file, offset); | ||
328 | } | ||
329 | |||
330 | static void timer_list_stop(struct seq_file *seq, void *v) | ||
276 | { | 331 | { |
277 | timer_list_show(NULL, NULL); | ||
278 | } | 332 | } |
279 | 333 | ||
334 | static const struct seq_operations timer_list_sops = { | ||
335 | .start = timer_list_start, | ||
336 | .next = timer_list_next, | ||
337 | .stop = timer_list_stop, | ||
338 | .show = timer_list_show, | ||
339 | }; | ||
340 | |||
280 | static int timer_list_open(struct inode *inode, struct file *filp) | 341 | static int timer_list_open(struct inode *inode, struct file *filp) |
281 | { | 342 | { |
282 | return single_open(filp, timer_list_show, NULL); | 343 | return seq_open_private(filp, &timer_list_sops, |
344 | sizeof(struct timer_list_iter)); | ||
283 | } | 345 | } |
284 | 346 | ||
285 | static const struct file_operations timer_list_fops = { | 347 | static const struct file_operations timer_list_fops = { |
286 | .open = timer_list_open, | 348 | .open = timer_list_open, |
287 | .read = seq_read, | 349 | .read = seq_read, |
288 | .llseek = seq_lseek, | 350 | .llseek = seq_lseek, |
289 | .release = single_release, | 351 | .release = seq_release_private, |
290 | }; | 352 | }; |
291 | 353 | ||
292 | static int __init init_timer_list_procfs(void) | 354 | static int __init init_timer_list_procfs(void) |
diff --git a/kernel/timer.c b/kernel/timer.c index dbf7a78a1ef1..15ffdb3f1948 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/timer.c | 2 | * linux/kernel/timer.c |
3 | * | 3 | * |
4 | * Kernel internal timers, basic process system calls | 4 | * Kernel internal timers |
5 | * | 5 | * |
6 | * Copyright (C) 1991, 1992 Linus Torvalds | 6 | * Copyright (C) 1991, 1992 Linus Torvalds |
7 | * | 7 | * |
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/sched/sysctl.h> | 42 | #include <linux/sched/sysctl.h> |
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/compat.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | #include <asm/unistd.h> | 47 | #include <asm/unistd.h> |
@@ -738,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
738 | 739 | ||
739 | cpu = smp_processor_id(); | 740 | cpu = smp_processor_id(); |
740 | 741 | ||
741 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 742 | #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) |
742 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) | 743 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) |
743 | cpu = get_nohz_timer_target(); | 744 | cpu = get_nohz_timer_target(); |
744 | #endif | 745 | #endif |
@@ -930,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
930 | debug_activate(timer, timer->expires); | 931 | debug_activate(timer, timer->expires); |
931 | internal_add_timer(base, timer); | 932 | internal_add_timer(base, timer); |
932 | /* | 933 | /* |
933 | * Check whether the other CPU is idle and needs to be | 934 | * Check whether the other CPU is in dynticks mode and needs |
934 | * triggered to reevaluate the timer wheel when nohz is | 935 | * to be triggered to reevaluate the timer wheel. |
935 | * active. We are protected against the other CPU fiddling | 936 | * We are protected against the other CPU fiddling |
936 | * with the timer by holding the timer base lock. This also | 937 | * with the timer by holding the timer base lock. This also |
937 | * makes sure that a CPU on the way to idle can not evaluate | 938 | * makes sure that a CPU on the way to stop its tick can not |
938 | * the timer wheel. | 939 | * evaluate the timer wheel. |
939 | */ | 940 | */ |
940 | wake_up_idle_cpu(cpu); | 941 | wake_up_nohz_cpu(cpu); |
941 | spin_unlock_irqrestore(&base->lock, flags); | 942 | spin_unlock_irqrestore(&base->lock, flags); |
942 | } | 943 | } |
943 | EXPORT_SYMBOL_GPL(add_timer_on); | 944 | EXPORT_SYMBOL_GPL(add_timer_on); |
@@ -1188,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1188 | spin_unlock_irq(&base->lock); | 1189 | spin_unlock_irq(&base->lock); |
1189 | } | 1190 | } |
1190 | 1191 | ||
1191 | #ifdef CONFIG_NO_HZ | 1192 | #ifdef CONFIG_NO_HZ_COMMON |
1192 | /* | 1193 | /* |
1193 | * Find out when the next timer event is due to happen. This | 1194 | * Find out when the next timer event is due to happen. This |
1194 | * is used on S/390 to stop all activity when a CPU is idle. | 1195 | * is used on S/390 to stop all activity when a CPU is idle. |
@@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) | |||
1395 | 1396 | ||
1396 | #endif | 1397 | #endif |
1397 | 1398 | ||
1398 | /** | ||
1399 | * sys_getpid - return the thread group id of the current process | ||
1400 | * | ||
1401 | * Note, despite the name, this returns the tgid not the pid. The tgid and | ||
1402 | * the pid are identical unless CLONE_THREAD was specified on clone() in | ||
1403 | * which case the tgid is the same in all threads of the same group. | ||
1404 | * | ||
1405 | * This is SMP safe as current->tgid does not change. | ||
1406 | */ | ||
1407 | SYSCALL_DEFINE0(getpid) | ||
1408 | { | ||
1409 | return task_tgid_vnr(current); | ||
1410 | } | ||
1411 | |||
1412 | /* | ||
1413 | * Accessing ->real_parent is not SMP-safe, it could | ||
1414 | * change from under us. However, we can use a stale | ||
1415 | * value of ->real_parent under rcu_read_lock(), see | ||
1416 | * release_task()->call_rcu(delayed_put_task_struct). | ||
1417 | */ | ||
1418 | SYSCALL_DEFINE0(getppid) | ||
1419 | { | ||
1420 | int pid; | ||
1421 | |||
1422 | rcu_read_lock(); | ||
1423 | pid = task_tgid_vnr(rcu_dereference(current->real_parent)); | ||
1424 | rcu_read_unlock(); | ||
1425 | |||
1426 | return pid; | ||
1427 | } | ||
1428 | |||
1429 | SYSCALL_DEFINE0(getuid) | ||
1430 | { | ||
1431 | /* Only we change this so SMP safe */ | ||
1432 | return from_kuid_munged(current_user_ns(), current_uid()); | ||
1433 | } | ||
1434 | |||
1435 | SYSCALL_DEFINE0(geteuid) | ||
1436 | { | ||
1437 | /* Only we change this so SMP safe */ | ||
1438 | return from_kuid_munged(current_user_ns(), current_euid()); | ||
1439 | } | ||
1440 | |||
1441 | SYSCALL_DEFINE0(getgid) | ||
1442 | { | ||
1443 | /* Only we change this so SMP safe */ | ||
1444 | return from_kgid_munged(current_user_ns(), current_gid()); | ||
1445 | } | ||
1446 | |||
1447 | SYSCALL_DEFINE0(getegid) | ||
1448 | { | ||
1449 | /* Only we change this so SMP safe */ | ||
1450 | return from_kgid_munged(current_user_ns(), current_egid()); | ||
1451 | } | ||
1452 | |||
1453 | static void process_timeout(unsigned long __data) | 1399 | static void process_timeout(unsigned long __data) |
1454 | { | 1400 | { |
1455 | wake_up_process((struct task_struct *)__data); | 1401 | wake_up_process((struct task_struct *)__data); |
@@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
1557 | } | 1503 | } |
1558 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1504 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1559 | 1505 | ||
1560 | /* Thread ID - the internal kernel "pid" */ | ||
1561 | SYSCALL_DEFINE0(gettid) | ||
1562 | { | ||
1563 | return task_pid_vnr(current); | ||
1564 | } | ||
1565 | |||
1566 | /** | ||
1567 | * do_sysinfo - fill in sysinfo struct | ||
1568 | * @info: pointer to buffer to fill | ||
1569 | */ | ||
1570 | int do_sysinfo(struct sysinfo *info) | ||
1571 | { | ||
1572 | unsigned long mem_total, sav_total; | ||
1573 | unsigned int mem_unit, bitcount; | ||
1574 | struct timespec tp; | ||
1575 | |||
1576 | memset(info, 0, sizeof(struct sysinfo)); | ||
1577 | |||
1578 | ktime_get_ts(&tp); | ||
1579 | monotonic_to_bootbased(&tp); | ||
1580 | info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); | ||
1581 | |||
1582 | get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); | ||
1583 | |||
1584 | info->procs = nr_threads; | ||
1585 | |||
1586 | si_meminfo(info); | ||
1587 | si_swapinfo(info); | ||
1588 | |||
1589 | /* | ||
1590 | * If the sum of all the available memory (i.e. ram + swap) | ||
1591 | * is less than can be stored in a 32 bit unsigned long then | ||
1592 | * we can be binary compatible with 2.2.x kernels. If not, | ||
1593 | * well, in that case 2.2.x was broken anyways... | ||
1594 | * | ||
1595 | * -Erik Andersen <andersee@debian.org> | ||
1596 | */ | ||
1597 | |||
1598 | mem_total = info->totalram + info->totalswap; | ||
1599 | if (mem_total < info->totalram || mem_total < info->totalswap) | ||
1600 | goto out; | ||
1601 | bitcount = 0; | ||
1602 | mem_unit = info->mem_unit; | ||
1603 | while (mem_unit > 1) { | ||
1604 | bitcount++; | ||
1605 | mem_unit >>= 1; | ||
1606 | sav_total = mem_total; | ||
1607 | mem_total <<= 1; | ||
1608 | if (mem_total < sav_total) | ||
1609 | goto out; | ||
1610 | } | ||
1611 | |||
1612 | /* | ||
1613 | * If mem_total did not overflow, multiply all memory values by | ||
1614 | * info->mem_unit and set it to 1. This leaves things compatible | ||
1615 | * with 2.2.x, and also retains compatibility with earlier 2.4.x | ||
1616 | * kernels... | ||
1617 | */ | ||
1618 | |||
1619 | info->mem_unit = 1; | ||
1620 | info->totalram <<= bitcount; | ||
1621 | info->freeram <<= bitcount; | ||
1622 | info->sharedram <<= bitcount; | ||
1623 | info->bufferram <<= bitcount; | ||
1624 | info->totalswap <<= bitcount; | ||
1625 | info->freeswap <<= bitcount; | ||
1626 | info->totalhigh <<= bitcount; | ||
1627 | info->freehigh <<= bitcount; | ||
1628 | |||
1629 | out: | ||
1630 | return 0; | ||
1631 | } | ||
1632 | |||
1633 | SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) | ||
1634 | { | ||
1635 | struct sysinfo val; | ||
1636 | |||
1637 | do_sysinfo(&val); | ||
1638 | |||
1639 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) | ||
1640 | return -EFAULT; | ||
1641 | |||
1642 | return 0; | ||
1643 | } | ||
1644 | |||
1645 | static int __cpuinit init_timers_cpu(int cpu) | 1506 | static int __cpuinit init_timers_cpu(int cpu) |
1646 | { | 1507 | { |
1647 | int j; | 1508 | int j; |
@@ -1678,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu) | |||
1678 | boot_done = 1; | 1539 | boot_done = 1; |
1679 | base = &boot_tvec_bases; | 1540 | base = &boot_tvec_bases; |
1680 | } | 1541 | } |
1542 | spin_lock_init(&base->lock); | ||
1681 | tvec_base_done[cpu] = 1; | 1543 | tvec_base_done[cpu] = 1; |
1682 | } else { | 1544 | } else { |
1683 | base = per_cpu(tvec_bases, cpu); | 1545 | base = per_cpu(tvec_bases, cpu); |
1684 | } | 1546 | } |
1685 | 1547 | ||
1686 | spin_lock_init(&base->lock); | ||
1687 | 1548 | ||
1688 | for (j = 0; j < TVN_SIZE; j++) { | 1549 | for (j = 0; j < TVN_SIZE; j++) { |
1689 | INIT_LIST_HEAD(base->tv5.vec + j); | 1550 | INIT_LIST_HEAD(base->tv5.vec + j); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fc382d6e2765..015f85aaca08 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -71,6 +71,7 @@ config TRACE_CLOCK | |||
71 | config RING_BUFFER | 71 | config RING_BUFFER |
72 | bool | 72 | bool |
73 | select TRACE_CLOCK | 73 | select TRACE_CLOCK |
74 | select IRQ_WORK | ||
74 | 75 | ||
75 | config FTRACE_NMI_ENTER | 76 | config FTRACE_NMI_ENTER |
76 | bool | 77 | bool |
@@ -107,7 +108,6 @@ config TRACING | |||
107 | select BINARY_PRINTF | 108 | select BINARY_PRINTF |
108 | select EVENT_TRACING | 109 | select EVENT_TRACING |
109 | select TRACE_CLOCK | 110 | select TRACE_CLOCK |
110 | select IRQ_WORK | ||
111 | 111 | ||
112 | config GENERIC_TRACER | 112 | config GENERIC_TRACER |
113 | bool | 113 | bool |
@@ -176,6 +176,8 @@ config IRQSOFF_TRACER | |||
176 | select GENERIC_TRACER | 176 | select GENERIC_TRACER |
177 | select TRACER_MAX_TRACE | 177 | select TRACER_MAX_TRACE |
178 | select RING_BUFFER_ALLOW_SWAP | 178 | select RING_BUFFER_ALLOW_SWAP |
179 | select TRACER_SNAPSHOT | ||
180 | select TRACER_SNAPSHOT_PER_CPU_SWAP | ||
179 | help | 181 | help |
180 | This option measures the time spent in irqs-off critical | 182 | This option measures the time spent in irqs-off critical |
181 | sections, with microsecond accuracy. | 183 | sections, with microsecond accuracy. |
@@ -198,6 +200,8 @@ config PREEMPT_TRACER | |||
198 | select GENERIC_TRACER | 200 | select GENERIC_TRACER |
199 | select TRACER_MAX_TRACE | 201 | select TRACER_MAX_TRACE |
200 | select RING_BUFFER_ALLOW_SWAP | 202 | select RING_BUFFER_ALLOW_SWAP |
203 | select TRACER_SNAPSHOT | ||
204 | select TRACER_SNAPSHOT_PER_CPU_SWAP | ||
201 | help | 205 | help |
202 | This option measures the time spent in preemption-off critical | 206 | This option measures the time spent in preemption-off critical |
203 | sections, with microsecond accuracy. | 207 | sections, with microsecond accuracy. |
@@ -217,6 +221,7 @@ config SCHED_TRACER | |||
217 | select GENERIC_TRACER | 221 | select GENERIC_TRACER |
218 | select CONTEXT_SWITCH_TRACER | 222 | select CONTEXT_SWITCH_TRACER |
219 | select TRACER_MAX_TRACE | 223 | select TRACER_MAX_TRACE |
224 | select TRACER_SNAPSHOT | ||
220 | help | 225 | help |
221 | This tracer tracks the latency of the highest priority task | 226 | This tracer tracks the latency of the highest priority task |
222 | to be scheduled in, starting from the point it has woken up. | 227 | to be scheduled in, starting from the point it has woken up. |
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT | |||
248 | echo 1 > /sys/kernel/debug/tracing/snapshot | 253 | echo 1 > /sys/kernel/debug/tracing/snapshot |
249 | cat snapshot | 254 | cat snapshot |
250 | 255 | ||
256 | config TRACER_SNAPSHOT_PER_CPU_SWAP | ||
257 | bool "Allow snapshot to swap per CPU" | ||
258 | depends on TRACER_SNAPSHOT | ||
259 | select RING_BUFFER_ALLOW_SWAP | ||
260 | help | ||
261 | Allow doing a snapshot of a single CPU buffer instead of a | ||
262 | full swap (all buffers). If this is set, then the following is | ||
263 | allowed: | ||
264 | |||
265 | echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot | ||
266 | |||
267 | After which, only the tracing buffer for CPU 2 was swapped with | ||
268 | the main tracing buffer, and the other CPU buffers remain the same. | ||
269 | |||
270 | When this is enabled, this adds a little more overhead to the | ||
271 | trace recording, as it needs to add some checks to synchronize | ||
272 | recording with swaps. But this does not affect the performance | ||
273 | of the overall system. This is enabled by default when the preempt | ||
274 | or irq latency tracers are enabled, as those need to swap as well | ||
275 | and already adds the overhead (plus a lot more). | ||
276 | |||
251 | config TRACE_BRANCH_PROFILING | 277 | config TRACE_BRANCH_PROFILING |
252 | bool | 278 | bool |
253 | select GENERIC_TRACER | 279 | select GENERIC_TRACER |
@@ -524,6 +550,29 @@ config RING_BUFFER_BENCHMARK | |||
524 | 550 | ||
525 | If unsure, say N. | 551 | If unsure, say N. |
526 | 552 | ||
553 | config RING_BUFFER_STARTUP_TEST | ||
554 | bool "Ring buffer startup self test" | ||
555 | depends on RING_BUFFER | ||
556 | help | ||
557 | Run a simple self test on the ring buffer on boot up. Late in the | ||
558 | kernel boot sequence, the test will start that kicks off | ||
559 | a thread per cpu. Each thread will write various size events | ||
560 | into the ring buffer. Another thread is created to send IPIs | ||
561 | to each of the threads, where the IPI handler will also write | ||
562 | to the ring buffer, to test/stress the nesting ability. | ||
563 | If any anomalies are discovered, a warning will be displayed | ||
564 | and all ring buffers will be disabled. | ||
565 | |||
566 | The test runs for 10 seconds. This will slow your boot time | ||
567 | by at least 10 more seconds. | ||
568 | |||
569 | At the end of the test, statics and more checks are done. | ||
570 | It will output the stats of each per cpu buffer. What | ||
571 | was written, the sizes, what was read, what was lost, and | ||
572 | other similar details. | ||
573 | |||
574 | If unsure, say N | ||
575 | |||
527 | endif # FTRACE | 576 | endif # FTRACE |
528 | 577 | ||
529 | endif # TRACING_SUPPORT | 578 | endif # TRACING_SUPPORT |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272eec..b8b8560bfb95 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, | |||
72 | bool blk_tracer = blk_tracer_enabled; | 72 | bool blk_tracer = blk_tracer_enabled; |
73 | 73 | ||
74 | if (blk_tracer) { | 74 | if (blk_tracer) { |
75 | buffer = blk_tr->buffer; | 75 | buffer = blk_tr->trace_buffer.buffer; |
76 | pc = preempt_count(); | 76 | pc = preempt_count(); |
77 | event = trace_buffer_lock_reserve(buffer, TRACE_BLK, | 77 | event = trace_buffer_lock_reserve(buffer, TRACE_BLK, |
78 | sizeof(*t) + len, | 78 | sizeof(*t) + len, |
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
218 | if (blk_tracer) { | 218 | if (blk_tracer) { |
219 | tracing_record_cmdline(current); | 219 | tracing_record_cmdline(current); |
220 | 220 | ||
221 | buffer = blk_tr->buffer; | 221 | buffer = blk_tr->trace_buffer.buffer; |
222 | pc = preempt_count(); | 222 | pc = preempt_count(); |
223 | event = trace_buffer_lock_reserve(buffer, TRACE_BLK, | 223 | event = trace_buffer_lock_reserve(buffer, TRACE_BLK, |
224 | sizeof(*t) + pdu_len, | 224 | sizeof(*t) + pdu_len, |
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore, | |||
739 | struct request_queue *q, | 739 | struct request_queue *q, |
740 | struct request *rq) | 740 | struct request *rq) |
741 | { | 741 | { |
742 | struct blk_trace *bt = q->blk_trace; | ||
743 | |||
744 | /* if control ever passes through here, it's a request based driver */ | ||
745 | if (unlikely(bt && !bt->rq_based)) | ||
746 | bt->rq_based = true; | ||
747 | |||
748 | blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); | 742 | blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); |
749 | } | 743 | } |
750 | 744 | ||
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore, | |||
780 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); | 774 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); |
781 | } | 775 | } |
782 | 776 | ||
783 | static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) | 777 | static void blk_add_trace_bio_complete(void *ignore, |
778 | struct request_queue *q, struct bio *bio, | ||
779 | int error) | ||
784 | { | 780 | { |
785 | struct request_queue *q; | ||
786 | struct blk_trace *bt; | ||
787 | |||
788 | if (!bio->bi_bdev) | ||
789 | return; | ||
790 | |||
791 | q = bdev_get_queue(bio->bi_bdev); | ||
792 | bt = q->blk_trace; | ||
793 | |||
794 | /* | ||
795 | * Request based drivers will generate both rq and bio completions. | ||
796 | * Ignore bio ones. | ||
797 | */ | ||
798 | if (likely(!bt) || bt->rq_based) | ||
799 | return; | ||
800 | |||
801 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); | 781 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); |
802 | } | 782 | } |
803 | 783 | ||
@@ -1828,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1828 | 1808 | ||
1829 | rwbs[i] = '\0'; | 1809 | rwbs[i] = '\0'; |
1830 | } | 1810 | } |
1811 | EXPORT_SYMBOL_GPL(blk_fill_rwbs); | ||
1831 | 1812 | ||
1832 | #endif /* CONFIG_EVENT_TRACING */ | 1813 | #endif /* CONFIG_EVENT_TRACING */ |
1833 | 1814 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ab25b88aae56..6c508ff33c62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -64,9 +64,16 @@ | |||
64 | 64 | ||
65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) | 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) |
66 | 66 | ||
67 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
68 | #define INIT_REGEX_LOCK(opsname) \ | ||
69 | .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), | ||
70 | #else | ||
71 | #define INIT_REGEX_LOCK(opsname) | ||
72 | #endif | ||
73 | |||
67 | static struct ftrace_ops ftrace_list_end __read_mostly = { | 74 | static struct ftrace_ops ftrace_list_end __read_mostly = { |
68 | .func = ftrace_stub, | 75 | .func = ftrace_stub, |
69 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | 76 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, |
70 | }; | 77 | }; |
71 | 78 | ||
72 | /* ftrace_enabled is a method to turn ftrace on or off */ | 79 | /* ftrace_enabled is a method to turn ftrace on or off */ |
@@ -113,24 +120,34 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); | |||
113 | 120 | ||
114 | /* | 121 | /* |
115 | * Traverse the ftrace_global_list, invoking all entries. The reason that we | 122 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
116 | * can use rcu_dereference_raw() is that elements removed from this list | 123 | * can use rcu_dereference_raw_notrace() is that elements removed from this list |
117 | * are simply leaked, so there is no need to interact with a grace-period | 124 | * are simply leaked, so there is no need to interact with a grace-period |
118 | * mechanism. The rcu_dereference_raw() calls are needed to handle | 125 | * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle |
119 | * concurrent insertions into the ftrace_global_list. | 126 | * concurrent insertions into the ftrace_global_list. |
120 | * | 127 | * |
121 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 128 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
122 | */ | 129 | */ |
123 | #define do_for_each_ftrace_op(op, list) \ | 130 | #define do_for_each_ftrace_op(op, list) \ |
124 | op = rcu_dereference_raw(list); \ | 131 | op = rcu_dereference_raw_notrace(list); \ |
125 | do | 132 | do |
126 | 133 | ||
127 | /* | 134 | /* |
128 | * Optimized for just a single item in the list (as that is the normal case). | 135 | * Optimized for just a single item in the list (as that is the normal case). |
129 | */ | 136 | */ |
130 | #define while_for_each_ftrace_op(op) \ | 137 | #define while_for_each_ftrace_op(op) \ |
131 | while (likely(op = rcu_dereference_raw((op)->next)) && \ | 138 | while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ |
132 | unlikely((op) != &ftrace_list_end)) | 139 | unlikely((op) != &ftrace_list_end)) |
133 | 140 | ||
141 | static inline void ftrace_ops_init(struct ftrace_ops *ops) | ||
142 | { | ||
143 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
144 | if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { | ||
145 | mutex_init(&ops->regex_lock); | ||
146 | ops->flags |= FTRACE_OPS_FL_INITIALIZED; | ||
147 | } | ||
148 | #endif | ||
149 | } | ||
150 | |||
134 | /** | 151 | /** |
135 | * ftrace_nr_registered_ops - return number of ops registered | 152 | * ftrace_nr_registered_ops - return number of ops registered |
136 | * | 153 | * |
@@ -486,7 +503,6 @@ struct ftrace_profile_stat { | |||
486 | #define PROFILES_PER_PAGE \ | 503 | #define PROFILES_PER_PAGE \ |
487 | (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) | 504 | (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) |
488 | 505 | ||
489 | static int ftrace_profile_bits __read_mostly; | ||
490 | static int ftrace_profile_enabled __read_mostly; | 506 | static int ftrace_profile_enabled __read_mostly; |
491 | 507 | ||
492 | /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ | 508 | /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ |
@@ -494,7 +510,8 @@ static DEFINE_MUTEX(ftrace_profile_lock); | |||
494 | 510 | ||
495 | static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); | 511 | static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); |
496 | 512 | ||
497 | #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ | 513 | #define FTRACE_PROFILE_HASH_BITS 10 |
514 | #define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS) | ||
498 | 515 | ||
499 | static void * | 516 | static void * |
500 | function_stat_next(void *v, int idx) | 517 | function_stat_next(void *v, int idx) |
@@ -676,7 +693,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) | |||
676 | 693 | ||
677 | pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); | 694 | pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); |
678 | 695 | ||
679 | for (i = 0; i < pages; i++) { | 696 | for (i = 1; i < pages; i++) { |
680 | pg->next = (void *)get_zeroed_page(GFP_KERNEL); | 697 | pg->next = (void *)get_zeroed_page(GFP_KERNEL); |
681 | if (!pg->next) | 698 | if (!pg->next) |
682 | goto out_free; | 699 | goto out_free; |
@@ -694,7 +711,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) | |||
694 | free_page(tmp); | 711 | free_page(tmp); |
695 | } | 712 | } |
696 | 713 | ||
697 | free_page((unsigned long)stat->pages); | ||
698 | stat->pages = NULL; | 714 | stat->pages = NULL; |
699 | stat->start = NULL; | 715 | stat->start = NULL; |
700 | 716 | ||
@@ -725,13 +741,6 @@ static int ftrace_profile_init_cpu(int cpu) | |||
725 | if (!stat->hash) | 741 | if (!stat->hash) |
726 | return -ENOMEM; | 742 | return -ENOMEM; |
727 | 743 | ||
728 | if (!ftrace_profile_bits) { | ||
729 | size--; | ||
730 | |||
731 | for (; size; size >>= 1) | ||
732 | ftrace_profile_bits++; | ||
733 | } | ||
734 | |||
735 | /* Preallocate the function profiling pages */ | 744 | /* Preallocate the function profiling pages */ |
736 | if (ftrace_profile_pages_init(stat) < 0) { | 745 | if (ftrace_profile_pages_init(stat) < 0) { |
737 | kfree(stat->hash); | 746 | kfree(stat->hash); |
@@ -764,13 +773,13 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) | |||
764 | struct hlist_head *hhd; | 773 | struct hlist_head *hhd; |
765 | unsigned long key; | 774 | unsigned long key; |
766 | 775 | ||
767 | key = hash_long(ip, ftrace_profile_bits); | 776 | key = hash_long(ip, FTRACE_PROFILE_HASH_BITS); |
768 | hhd = &stat->hash[key]; | 777 | hhd = &stat->hash[key]; |
769 | 778 | ||
770 | if (hlist_empty(hhd)) | 779 | if (hlist_empty(hhd)) |
771 | return NULL; | 780 | return NULL; |
772 | 781 | ||
773 | hlist_for_each_entry_rcu(rec, hhd, node) { | 782 | hlist_for_each_entry_rcu_notrace(rec, hhd, node) { |
774 | if (rec->ip == ip) | 783 | if (rec->ip == ip) |
775 | return rec; | 784 | return rec; |
776 | } | 785 | } |
@@ -783,7 +792,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, | |||
783 | { | 792 | { |
784 | unsigned long key; | 793 | unsigned long key; |
785 | 794 | ||
786 | key = hash_long(rec->ip, ftrace_profile_bits); | 795 | key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS); |
787 | hlist_add_head_rcu(&rec->node, &stat->hash[key]); | 796 | hlist_add_head_rcu(&rec->node, &stat->hash[key]); |
788 | } | 797 | } |
789 | 798 | ||
@@ -915,7 +924,8 @@ static void unregister_ftrace_profiler(void) | |||
915 | #else | 924 | #else |
916 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { | 925 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { |
917 | .func = function_profile_call, | 926 | .func = function_profile_call, |
918 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | 927 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, |
928 | INIT_REGEX_LOCK(ftrace_profile_ops) | ||
919 | }; | 929 | }; |
920 | 930 | ||
921 | static int register_ftrace_profiler(void) | 931 | static int register_ftrace_profiler(void) |
@@ -1053,6 +1063,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | |||
1053 | 1063 | ||
1054 | static struct pid * const ftrace_swapper_pid = &init_struct_pid; | 1064 | static struct pid * const ftrace_swapper_pid = &init_struct_pid; |
1055 | 1065 | ||
1066 | loff_t | ||
1067 | ftrace_filter_lseek(struct file *file, loff_t offset, int whence) | ||
1068 | { | ||
1069 | loff_t ret; | ||
1070 | |||
1071 | if (file->f_mode & FMODE_READ) | ||
1072 | ret = seq_lseek(file, offset, whence); | ||
1073 | else | ||
1074 | file->f_pos = ret = 1; | ||
1075 | |||
1076 | return ret; | ||
1077 | } | ||
1078 | |||
1056 | #ifdef CONFIG_DYNAMIC_FTRACE | 1079 | #ifdef CONFIG_DYNAMIC_FTRACE |
1057 | 1080 | ||
1058 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD | 1081 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD |
@@ -1067,7 +1090,7 @@ struct ftrace_func_probe { | |||
1067 | unsigned long flags; | 1090 | unsigned long flags; |
1068 | unsigned long ip; | 1091 | unsigned long ip; |
1069 | void *data; | 1092 | void *data; |
1070 | struct rcu_head rcu; | 1093 | struct list_head free_list; |
1071 | }; | 1094 | }; |
1072 | 1095 | ||
1073 | struct ftrace_func_entry { | 1096 | struct ftrace_func_entry { |
@@ -1098,11 +1121,10 @@ static struct ftrace_ops global_ops = { | |||
1098 | .func = ftrace_stub, | 1121 | .func = ftrace_stub, |
1099 | .notrace_hash = EMPTY_HASH, | 1122 | .notrace_hash = EMPTY_HASH, |
1100 | .filter_hash = EMPTY_HASH, | 1123 | .filter_hash = EMPTY_HASH, |
1101 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | 1124 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, |
1125 | INIT_REGEX_LOCK(global_ops) | ||
1102 | }; | 1126 | }; |
1103 | 1127 | ||
1104 | static DEFINE_MUTEX(ftrace_regex_lock); | ||
1105 | |||
1106 | struct ftrace_page { | 1128 | struct ftrace_page { |
1107 | struct ftrace_page *next; | 1129 | struct ftrace_page *next; |
1108 | struct dyn_ftrace *records; | 1130 | struct dyn_ftrace *records; |
@@ -1143,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | |||
1143 | 1165 | ||
1144 | hhd = &hash->buckets[key]; | 1166 | hhd = &hash->buckets[key]; |
1145 | 1167 | ||
1146 | hlist_for_each_entry_rcu(entry, hhd, hlist) { | 1168 | hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { |
1147 | if (entry->ip == ip) | 1169 | if (entry->ip == ip) |
1148 | return entry; | 1170 | return entry; |
1149 | } | 1171 | } |
@@ -1242,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) | |||
1242 | 1264 | ||
1243 | void ftrace_free_filter(struct ftrace_ops *ops) | 1265 | void ftrace_free_filter(struct ftrace_ops *ops) |
1244 | { | 1266 | { |
1267 | ftrace_ops_init(ops); | ||
1245 | free_ftrace_hash(ops->filter_hash); | 1268 | free_ftrace_hash(ops->filter_hash); |
1246 | free_ftrace_hash(ops->notrace_hash); | 1269 | free_ftrace_hash(ops->notrace_hash); |
1247 | } | 1270 | } |
@@ -1317,7 +1340,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1317 | struct hlist_head *hhd; | 1340 | struct hlist_head *hhd; |
1318 | struct ftrace_hash *old_hash; | 1341 | struct ftrace_hash *old_hash; |
1319 | struct ftrace_hash *new_hash; | 1342 | struct ftrace_hash *new_hash; |
1320 | unsigned long key; | ||
1321 | int size = src->count; | 1343 | int size = src->count; |
1322 | int bits = 0; | 1344 | int bits = 0; |
1323 | int ret; | 1345 | int ret; |
@@ -1360,10 +1382,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1360 | for (i = 0; i < size; i++) { | 1382 | for (i = 0; i < size; i++) { |
1361 | hhd = &src->buckets[i]; | 1383 | hhd = &src->buckets[i]; |
1362 | hlist_for_each_entry_safe(entry, tn, hhd, hlist) { | 1384 | hlist_for_each_entry_safe(entry, tn, hhd, hlist) { |
1363 | if (bits > 0) | ||
1364 | key = hash_long(entry->ip, bits); | ||
1365 | else | ||
1366 | key = 0; | ||
1367 | remove_hash_entry(src, entry); | 1385 | remove_hash_entry(src, entry); |
1368 | __add_hash_entry(new_hash, entry); | 1386 | __add_hash_entry(new_hash, entry); |
1369 | } | 1387 | } |
@@ -1404,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1404 | struct ftrace_hash *notrace_hash; | 1422 | struct ftrace_hash *notrace_hash; |
1405 | int ret; | 1423 | int ret; |
1406 | 1424 | ||
1407 | filter_hash = rcu_dereference_raw(ops->filter_hash); | 1425 | filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); |
1408 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); | 1426 | notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); |
1409 | 1427 | ||
1410 | if ((ftrace_hash_empty(filter_hash) || | 1428 | if ((ftrace_hash_empty(filter_hash) || |
1411 | ftrace_lookup_ip(filter_hash, ip)) && | 1429 | ftrace_lookup_ip(filter_hash, ip)) && |
@@ -2441,7 +2459,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
2441 | !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || | 2459 | !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || |
2442 | 2460 | ||
2443 | ((iter->flags & FTRACE_ITER_ENABLED) && | 2461 | ((iter->flags & FTRACE_ITER_ENABLED) && |
2444 | !(rec->flags & ~FTRACE_FL_MASK))) { | 2462 | !(rec->flags & FTRACE_FL_ENABLED))) { |
2445 | 2463 | ||
2446 | rec = NULL; | 2464 | rec = NULL; |
2447 | goto retry; | 2465 | goto retry; |
@@ -2613,7 +2631,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) | |||
2613 | * routine, you can use ftrace_filter_write() for the write | 2631 | * routine, you can use ftrace_filter_write() for the write |
2614 | * routine if @flag has FTRACE_ITER_FILTER set, or | 2632 | * routine if @flag has FTRACE_ITER_FILTER set, or |
2615 | * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. | 2633 | * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. |
2616 | * ftrace_regex_lseek() should be used as the lseek routine, and | 2634 | * ftrace_filter_lseek() should be used as the lseek routine, and |
2617 | * release must call ftrace_regex_release(). | 2635 | * release must call ftrace_regex_release(). |
2618 | */ | 2636 | */ |
2619 | int | 2637 | int |
@@ -2624,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
2624 | struct ftrace_hash *hash; | 2642 | struct ftrace_hash *hash; |
2625 | int ret = 0; | 2643 | int ret = 0; |
2626 | 2644 | ||
2645 | ftrace_ops_init(ops); | ||
2646 | |||
2627 | if (unlikely(ftrace_disabled)) | 2647 | if (unlikely(ftrace_disabled)) |
2628 | return -ENODEV; | 2648 | return -ENODEV; |
2629 | 2649 | ||
@@ -2636,28 +2656,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
2636 | return -ENOMEM; | 2656 | return -ENOMEM; |
2637 | } | 2657 | } |
2638 | 2658 | ||
2659 | iter->ops = ops; | ||
2660 | iter->flags = flag; | ||
2661 | |||
2662 | mutex_lock(&ops->regex_lock); | ||
2663 | |||
2639 | if (flag & FTRACE_ITER_NOTRACE) | 2664 | if (flag & FTRACE_ITER_NOTRACE) |
2640 | hash = ops->notrace_hash; | 2665 | hash = ops->notrace_hash; |
2641 | else | 2666 | else |
2642 | hash = ops->filter_hash; | 2667 | hash = ops->filter_hash; |
2643 | 2668 | ||
2644 | iter->ops = ops; | ||
2645 | iter->flags = flag; | ||
2646 | |||
2647 | if (file->f_mode & FMODE_WRITE) { | 2669 | if (file->f_mode & FMODE_WRITE) { |
2648 | mutex_lock(&ftrace_lock); | ||
2649 | iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); | 2670 | iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); |
2650 | mutex_unlock(&ftrace_lock); | ||
2651 | |||
2652 | if (!iter->hash) { | 2671 | if (!iter->hash) { |
2653 | trace_parser_put(&iter->parser); | 2672 | trace_parser_put(&iter->parser); |
2654 | kfree(iter); | 2673 | kfree(iter); |
2655 | return -ENOMEM; | 2674 | ret = -ENOMEM; |
2675 | goto out_unlock; | ||
2656 | } | 2676 | } |
2657 | } | 2677 | } |
2658 | 2678 | ||
2659 | mutex_lock(&ftrace_regex_lock); | ||
2660 | |||
2661 | if ((file->f_mode & FMODE_WRITE) && | 2679 | if ((file->f_mode & FMODE_WRITE) && |
2662 | (file->f_flags & O_TRUNC)) | 2680 | (file->f_flags & O_TRUNC)) |
2663 | ftrace_filter_reset(iter->hash); | 2681 | ftrace_filter_reset(iter->hash); |
@@ -2677,7 +2695,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
2677 | } | 2695 | } |
2678 | } else | 2696 | } else |
2679 | file->private_data = iter; | 2697 | file->private_data = iter; |
2680 | mutex_unlock(&ftrace_regex_lock); | 2698 | |
2699 | out_unlock: | ||
2700 | mutex_unlock(&ops->regex_lock); | ||
2681 | 2701 | ||
2682 | return ret; | 2702 | return ret; |
2683 | } | 2703 | } |
@@ -2697,19 +2717,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) | |||
2697 | inode, file); | 2717 | inode, file); |
2698 | } | 2718 | } |
2699 | 2719 | ||
2700 | loff_t | ||
2701 | ftrace_regex_lseek(struct file *file, loff_t offset, int whence) | ||
2702 | { | ||
2703 | loff_t ret; | ||
2704 | |||
2705 | if (file->f_mode & FMODE_READ) | ||
2706 | ret = seq_lseek(file, offset, whence); | ||
2707 | else | ||
2708 | file->f_pos = ret = 1; | ||
2709 | |||
2710 | return ret; | ||
2711 | } | ||
2712 | |||
2713 | static int ftrace_match(char *str, char *regex, int len, int type) | 2720 | static int ftrace_match(char *str, char *regex, int len, int type) |
2714 | { | 2721 | { |
2715 | int matched = 0; | 2722 | int matched = 0; |
@@ -2913,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
2913 | * on the hash. rcu_read_lock is too dangerous here. | 2920 | * on the hash. rcu_read_lock is too dangerous here. |
2914 | */ | 2921 | */ |
2915 | preempt_disable_notrace(); | 2922 | preempt_disable_notrace(); |
2916 | hlist_for_each_entry_rcu(entry, hhd, node) { | 2923 | hlist_for_each_entry_rcu_notrace(entry, hhd, node) { |
2917 | if (entry->ip == ip) | 2924 | if (entry->ip == ip) |
2918 | entry->ops->func(ip, parent_ip, &entry->data); | 2925 | entry->ops->func(ip, parent_ip, &entry->data); |
2919 | } | 2926 | } |
@@ -2923,6 +2930,8 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
2923 | static struct ftrace_ops trace_probe_ops __read_mostly = | 2930 | static struct ftrace_ops trace_probe_ops __read_mostly = |
2924 | { | 2931 | { |
2925 | .func = function_trace_probe_call, | 2932 | .func = function_trace_probe_call, |
2933 | .flags = FTRACE_OPS_FL_INITIALIZED, | ||
2934 | INIT_REGEX_LOCK(trace_probe_ops) | ||
2926 | }; | 2935 | }; |
2927 | 2936 | ||
2928 | static int ftrace_probe_registered; | 2937 | static int ftrace_probe_registered; |
@@ -2932,8 +2941,12 @@ static void __enable_ftrace_function_probe(void) | |||
2932 | int ret; | 2941 | int ret; |
2933 | int i; | 2942 | int i; |
2934 | 2943 | ||
2935 | if (ftrace_probe_registered) | 2944 | if (ftrace_probe_registered) { |
2945 | /* still need to update the function call sites */ | ||
2946 | if (ftrace_enabled) | ||
2947 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | ||
2936 | return; | 2948 | return; |
2949 | } | ||
2937 | 2950 | ||
2938 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { | 2951 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { |
2939 | struct hlist_head *hhd = &ftrace_func_hash[i]; | 2952 | struct hlist_head *hhd = &ftrace_func_hash[i]; |
@@ -2974,28 +2987,27 @@ static void __disable_ftrace_function_probe(void) | |||
2974 | } | 2987 | } |
2975 | 2988 | ||
2976 | 2989 | ||
2977 | static void ftrace_free_entry_rcu(struct rcu_head *rhp) | 2990 | static void ftrace_free_entry(struct ftrace_func_probe *entry) |
2978 | { | 2991 | { |
2979 | struct ftrace_func_probe *entry = | ||
2980 | container_of(rhp, struct ftrace_func_probe, rcu); | ||
2981 | |||
2982 | if (entry->ops->free) | 2992 | if (entry->ops->free) |
2983 | entry->ops->free(&entry->data); | 2993 | entry->ops->free(entry->ops, entry->ip, &entry->data); |
2984 | kfree(entry); | 2994 | kfree(entry); |
2985 | } | 2995 | } |
2986 | 2996 | ||
2987 | |||
2988 | int | 2997 | int |
2989 | register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | 2998 | register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, |
2990 | void *data) | 2999 | void *data) |
2991 | { | 3000 | { |
2992 | struct ftrace_func_probe *entry; | 3001 | struct ftrace_func_probe *entry; |
3002 | struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; | ||
3003 | struct ftrace_hash *hash; | ||
2993 | struct ftrace_page *pg; | 3004 | struct ftrace_page *pg; |
2994 | struct dyn_ftrace *rec; | 3005 | struct dyn_ftrace *rec; |
2995 | int type, len, not; | 3006 | int type, len, not; |
2996 | unsigned long key; | 3007 | unsigned long key; |
2997 | int count = 0; | 3008 | int count = 0; |
2998 | char *search; | 3009 | char *search; |
3010 | int ret; | ||
2999 | 3011 | ||
3000 | type = filter_parse_regex(glob, strlen(glob), &search, ¬); | 3012 | type = filter_parse_regex(glob, strlen(glob), &search, ¬); |
3001 | len = strlen(search); | 3013 | len = strlen(search); |
@@ -3004,10 +3016,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3004 | if (WARN_ON(not)) | 3016 | if (WARN_ON(not)) |
3005 | return -EINVAL; | 3017 | return -EINVAL; |
3006 | 3018 | ||
3007 | mutex_lock(&ftrace_lock); | 3019 | mutex_lock(&trace_probe_ops.regex_lock); |
3008 | 3020 | ||
3009 | if (unlikely(ftrace_disabled)) | 3021 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); |
3010 | goto out_unlock; | 3022 | if (!hash) { |
3023 | count = -ENOMEM; | ||
3024 | goto out; | ||
3025 | } | ||
3026 | |||
3027 | if (unlikely(ftrace_disabled)) { | ||
3028 | count = -ENODEV; | ||
3029 | goto out; | ||
3030 | } | ||
3031 | |||
3032 | mutex_lock(&ftrace_lock); | ||
3011 | 3033 | ||
3012 | do_for_each_ftrace_rec(pg, rec) { | 3034 | do_for_each_ftrace_rec(pg, rec) { |
3013 | 3035 | ||
@@ -3031,14 +3053,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3031 | * for each function we find. We call the callback | 3053 | * for each function we find. We call the callback |
3032 | * to give the caller an opportunity to do so. | 3054 | * to give the caller an opportunity to do so. |
3033 | */ | 3055 | */ |
3034 | if (ops->callback) { | 3056 | if (ops->init) { |
3035 | if (ops->callback(rec->ip, &entry->data) < 0) { | 3057 | if (ops->init(ops, rec->ip, &entry->data) < 0) { |
3036 | /* caller does not like this func */ | 3058 | /* caller does not like this func */ |
3037 | kfree(entry); | 3059 | kfree(entry); |
3038 | continue; | 3060 | continue; |
3039 | } | 3061 | } |
3040 | } | 3062 | } |
3041 | 3063 | ||
3064 | ret = enter_record(hash, rec, 0); | ||
3065 | if (ret < 0) { | ||
3066 | kfree(entry); | ||
3067 | count = ret; | ||
3068 | goto out_unlock; | ||
3069 | } | ||
3070 | |||
3042 | entry->ops = ops; | 3071 | entry->ops = ops; |
3043 | entry->ip = rec->ip; | 3072 | entry->ip = rec->ip; |
3044 | 3073 | ||
@@ -3046,10 +3075,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3046 | hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); | 3075 | hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); |
3047 | 3076 | ||
3048 | } while_for_each_ftrace_rec(); | 3077 | } while_for_each_ftrace_rec(); |
3078 | |||
3079 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | ||
3080 | if (ret < 0) | ||
3081 | count = ret; | ||
3082 | |||
3049 | __enable_ftrace_function_probe(); | 3083 | __enable_ftrace_function_probe(); |
3050 | 3084 | ||
3051 | out_unlock: | 3085 | out_unlock: |
3052 | mutex_unlock(&ftrace_lock); | 3086 | mutex_unlock(&ftrace_lock); |
3087 | out: | ||
3088 | mutex_unlock(&trace_probe_ops.regex_lock); | ||
3089 | free_ftrace_hash(hash); | ||
3053 | 3090 | ||
3054 | return count; | 3091 | return count; |
3055 | } | 3092 | } |
@@ -3063,7 +3100,12 @@ static void | |||
3063 | __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | 3100 | __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, |
3064 | void *data, int flags) | 3101 | void *data, int flags) |
3065 | { | 3102 | { |
3103 | struct ftrace_func_entry *rec_entry; | ||
3066 | struct ftrace_func_probe *entry; | 3104 | struct ftrace_func_probe *entry; |
3105 | struct ftrace_func_probe *p; | ||
3106 | struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; | ||
3107 | struct list_head free_list; | ||
3108 | struct ftrace_hash *hash; | ||
3067 | struct hlist_node *tmp; | 3109 | struct hlist_node *tmp; |
3068 | char str[KSYM_SYMBOL_LEN]; | 3110 | char str[KSYM_SYMBOL_LEN]; |
3069 | int type = MATCH_FULL; | 3111 | int type = MATCH_FULL; |
@@ -3083,7 +3125,15 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3083 | return; | 3125 | return; |
3084 | } | 3126 | } |
3085 | 3127 | ||
3086 | mutex_lock(&ftrace_lock); | 3128 | mutex_lock(&trace_probe_ops.regex_lock); |
3129 | |||
3130 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | ||
3131 | if (!hash) | ||
3132 | /* Hmm, should report this somehow */ | ||
3133 | goto out_unlock; | ||
3134 | |||
3135 | INIT_LIST_HEAD(&free_list); | ||
3136 | |||
3087 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { | 3137 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { |
3088 | struct hlist_head *hhd = &ftrace_func_hash[i]; | 3138 | struct hlist_head *hhd = &ftrace_func_hash[i]; |
3089 | 3139 | ||
@@ -3104,12 +3154,32 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3104 | continue; | 3154 | continue; |
3105 | } | 3155 | } |
3106 | 3156 | ||
3107 | hlist_del(&entry->node); | 3157 | rec_entry = ftrace_lookup_ip(hash, entry->ip); |
3108 | call_rcu(&entry->rcu, ftrace_free_entry_rcu); | 3158 | /* It is possible more than one entry had this ip */ |
3159 | if (rec_entry) | ||
3160 | free_hash_entry(hash, rec_entry); | ||
3161 | |||
3162 | hlist_del_rcu(&entry->node); | ||
3163 | list_add(&entry->free_list, &free_list); | ||
3109 | } | 3164 | } |
3110 | } | 3165 | } |
3166 | mutex_lock(&ftrace_lock); | ||
3111 | __disable_ftrace_function_probe(); | 3167 | __disable_ftrace_function_probe(); |
3168 | /* | ||
3169 | * Remove after the disable is called. Otherwise, if the last | ||
3170 | * probe is removed, a null hash means *all enabled*. | ||
3171 | */ | ||
3172 | ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | ||
3173 | synchronize_sched(); | ||
3174 | list_for_each_entry_safe(entry, p, &free_list, free_list) { | ||
3175 | list_del(&entry->free_list); | ||
3176 | ftrace_free_entry(entry); | ||
3177 | } | ||
3112 | mutex_unlock(&ftrace_lock); | 3178 | mutex_unlock(&ftrace_lock); |
3179 | |||
3180 | out_unlock: | ||
3181 | mutex_unlock(&trace_probe_ops.regex_lock); | ||
3182 | free_ftrace_hash(hash); | ||
3113 | } | 3183 | } |
3114 | 3184 | ||
3115 | void | 3185 | void |
@@ -3218,18 +3288,17 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
3218 | if (!cnt) | 3288 | if (!cnt) |
3219 | return 0; | 3289 | return 0; |
3220 | 3290 | ||
3221 | mutex_lock(&ftrace_regex_lock); | ||
3222 | |||
3223 | ret = -ENODEV; | ||
3224 | if (unlikely(ftrace_disabled)) | ||
3225 | goto out_unlock; | ||
3226 | |||
3227 | if (file->f_mode & FMODE_READ) { | 3291 | if (file->f_mode & FMODE_READ) { |
3228 | struct seq_file *m = file->private_data; | 3292 | struct seq_file *m = file->private_data; |
3229 | iter = m->private; | 3293 | iter = m->private; |
3230 | } else | 3294 | } else |
3231 | iter = file->private_data; | 3295 | iter = file->private_data; |
3232 | 3296 | ||
3297 | if (unlikely(ftrace_disabled)) | ||
3298 | return -ENODEV; | ||
3299 | |||
3300 | /* iter->hash is a local copy, so we don't need regex_lock */ | ||
3301 | |||
3233 | parser = &iter->parser; | 3302 | parser = &iter->parser; |
3234 | read = trace_get_user(parser, ubuf, cnt, ppos); | 3303 | read = trace_get_user(parser, ubuf, cnt, ppos); |
3235 | 3304 | ||
@@ -3238,14 +3307,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
3238 | ret = ftrace_process_regex(iter->hash, parser->buffer, | 3307 | ret = ftrace_process_regex(iter->hash, parser->buffer, |
3239 | parser->idx, enable); | 3308 | parser->idx, enable); |
3240 | trace_parser_clear(parser); | 3309 | trace_parser_clear(parser); |
3241 | if (ret) | 3310 | if (ret < 0) |
3242 | goto out_unlock; | 3311 | goto out; |
3243 | } | 3312 | } |
3244 | 3313 | ||
3245 | ret = read; | 3314 | ret = read; |
3246 | out_unlock: | 3315 | out: |
3247 | mutex_unlock(&ftrace_regex_lock); | ||
3248 | |||
3249 | return ret; | 3316 | return ret; |
3250 | } | 3317 | } |
3251 | 3318 | ||
@@ -3297,16 +3364,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3297 | if (unlikely(ftrace_disabled)) | 3364 | if (unlikely(ftrace_disabled)) |
3298 | return -ENODEV; | 3365 | return -ENODEV; |
3299 | 3366 | ||
3367 | mutex_lock(&ops->regex_lock); | ||
3368 | |||
3300 | if (enable) | 3369 | if (enable) |
3301 | orig_hash = &ops->filter_hash; | 3370 | orig_hash = &ops->filter_hash; |
3302 | else | 3371 | else |
3303 | orig_hash = &ops->notrace_hash; | 3372 | orig_hash = &ops->notrace_hash; |
3304 | 3373 | ||
3305 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | 3374 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); |
3306 | if (!hash) | 3375 | if (!hash) { |
3307 | return -ENOMEM; | 3376 | ret = -ENOMEM; |
3377 | goto out_regex_unlock; | ||
3378 | } | ||
3308 | 3379 | ||
3309 | mutex_lock(&ftrace_regex_lock); | ||
3310 | if (reset) | 3380 | if (reset) |
3311 | ftrace_filter_reset(hash); | 3381 | ftrace_filter_reset(hash); |
3312 | if (buf && !ftrace_match_records(hash, buf, len)) { | 3382 | if (buf && !ftrace_match_records(hash, buf, len)) { |
@@ -3328,7 +3398,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3328 | mutex_unlock(&ftrace_lock); | 3398 | mutex_unlock(&ftrace_lock); |
3329 | 3399 | ||
3330 | out_regex_unlock: | 3400 | out_regex_unlock: |
3331 | mutex_unlock(&ftrace_regex_lock); | 3401 | mutex_unlock(&ops->regex_lock); |
3332 | 3402 | ||
3333 | free_ftrace_hash(hash); | 3403 | free_ftrace_hash(hash); |
3334 | return ret; | 3404 | return ret; |
@@ -3354,6 +3424,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, | |||
3354 | int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, | 3424 | int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, |
3355 | int remove, int reset) | 3425 | int remove, int reset) |
3356 | { | 3426 | { |
3427 | ftrace_ops_init(ops); | ||
3357 | return ftrace_set_addr(ops, ip, remove, reset, 1); | 3428 | return ftrace_set_addr(ops, ip, remove, reset, 1); |
3358 | } | 3429 | } |
3359 | EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); | 3430 | EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); |
@@ -3378,6 +3449,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3378 | int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, | 3449 | int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, |
3379 | int len, int reset) | 3450 | int len, int reset) |
3380 | { | 3451 | { |
3452 | ftrace_ops_init(ops); | ||
3381 | return ftrace_set_regex(ops, buf, len, reset, 1); | 3453 | return ftrace_set_regex(ops, buf, len, reset, 1); |
3382 | } | 3454 | } |
3383 | EXPORT_SYMBOL_GPL(ftrace_set_filter); | 3455 | EXPORT_SYMBOL_GPL(ftrace_set_filter); |
@@ -3396,6 +3468,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); | |||
3396 | int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, | 3468 | int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, |
3397 | int len, int reset) | 3469 | int len, int reset) |
3398 | { | 3470 | { |
3471 | ftrace_ops_init(ops); | ||
3399 | return ftrace_set_regex(ops, buf, len, reset, 0); | 3472 | return ftrace_set_regex(ops, buf, len, reset, 0); |
3400 | } | 3473 | } |
3401 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | 3474 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); |
@@ -3441,14 +3514,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; | |||
3441 | 3514 | ||
3442 | static int __init set_ftrace_notrace(char *str) | 3515 | static int __init set_ftrace_notrace(char *str) |
3443 | { | 3516 | { |
3444 | strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); | 3517 | strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); |
3445 | return 1; | 3518 | return 1; |
3446 | } | 3519 | } |
3447 | __setup("ftrace_notrace=", set_ftrace_notrace); | 3520 | __setup("ftrace_notrace=", set_ftrace_notrace); |
3448 | 3521 | ||
3449 | static int __init set_ftrace_filter(char *str) | 3522 | static int __init set_ftrace_filter(char *str) |
3450 | { | 3523 | { |
3451 | strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); | 3524 | strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); |
3452 | return 1; | 3525 | return 1; |
3453 | } | 3526 | } |
3454 | __setup("ftrace_filter=", set_ftrace_filter); | 3527 | __setup("ftrace_filter=", set_ftrace_filter); |
@@ -3486,6 +3559,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) | |||
3486 | { | 3559 | { |
3487 | char *func; | 3560 | char *func; |
3488 | 3561 | ||
3562 | ftrace_ops_init(ops); | ||
3563 | |||
3489 | while (buf) { | 3564 | while (buf) { |
3490 | func = strsep(&buf, ","); | 3565 | func = strsep(&buf, ","); |
3491 | ftrace_set_regex(ops, func, strlen(func), 0, enable); | 3566 | ftrace_set_regex(ops, func, strlen(func), 0, enable); |
@@ -3513,10 +3588,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
3513 | int filter_hash; | 3588 | int filter_hash; |
3514 | int ret; | 3589 | int ret; |
3515 | 3590 | ||
3516 | mutex_lock(&ftrace_regex_lock); | ||
3517 | if (file->f_mode & FMODE_READ) { | 3591 | if (file->f_mode & FMODE_READ) { |
3518 | iter = m->private; | 3592 | iter = m->private; |
3519 | |||
3520 | seq_release(inode, file); | 3593 | seq_release(inode, file); |
3521 | } else | 3594 | } else |
3522 | iter = file->private_data; | 3595 | iter = file->private_data; |
@@ -3529,6 +3602,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
3529 | 3602 | ||
3530 | trace_parser_put(parser); | 3603 | trace_parser_put(parser); |
3531 | 3604 | ||
3605 | mutex_lock(&iter->ops->regex_lock); | ||
3606 | |||
3532 | if (file->f_mode & FMODE_WRITE) { | 3607 | if (file->f_mode & FMODE_WRITE) { |
3533 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | 3608 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); |
3534 | 3609 | ||
@@ -3546,10 +3621,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
3546 | 3621 | ||
3547 | mutex_unlock(&ftrace_lock); | 3622 | mutex_unlock(&ftrace_lock); |
3548 | } | 3623 | } |
3624 | |||
3625 | mutex_unlock(&iter->ops->regex_lock); | ||
3549 | free_ftrace_hash(iter->hash); | 3626 | free_ftrace_hash(iter->hash); |
3550 | kfree(iter); | 3627 | kfree(iter); |
3551 | 3628 | ||
3552 | mutex_unlock(&ftrace_regex_lock); | ||
3553 | return 0; | 3629 | return 0; |
3554 | } | 3630 | } |
3555 | 3631 | ||
@@ -3571,7 +3647,7 @@ static const struct file_operations ftrace_filter_fops = { | |||
3571 | .open = ftrace_filter_open, | 3647 | .open = ftrace_filter_open, |
3572 | .read = seq_read, | 3648 | .read = seq_read, |
3573 | .write = ftrace_filter_write, | 3649 | .write = ftrace_filter_write, |
3574 | .llseek = ftrace_regex_lseek, | 3650 | .llseek = ftrace_filter_lseek, |
3575 | .release = ftrace_regex_release, | 3651 | .release = ftrace_regex_release, |
3576 | }; | 3652 | }; |
3577 | 3653 | ||
@@ -3579,7 +3655,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
3579 | .open = ftrace_notrace_open, | 3655 | .open = ftrace_notrace_open, |
3580 | .read = seq_read, | 3656 | .read = seq_read, |
3581 | .write = ftrace_notrace_write, | 3657 | .write = ftrace_notrace_write, |
3582 | .llseek = ftrace_regex_lseek, | 3658 | .llseek = ftrace_filter_lseek, |
3583 | .release = ftrace_regex_release, | 3659 | .release = ftrace_regex_release, |
3584 | }; | 3660 | }; |
3585 | 3661 | ||
@@ -3737,7 +3813,8 @@ out: | |||
3737 | if (fail) | 3813 | if (fail) |
3738 | return -EINVAL; | 3814 | return -EINVAL; |
3739 | 3815 | ||
3740 | ftrace_graph_filter_enabled = 1; | 3816 | ftrace_graph_filter_enabled = !!(*idx); |
3817 | |||
3741 | return 0; | 3818 | return 0; |
3742 | } | 3819 | } |
3743 | 3820 | ||
@@ -3784,8 +3861,8 @@ static const struct file_operations ftrace_graph_fops = { | |||
3784 | .open = ftrace_graph_open, | 3861 | .open = ftrace_graph_open, |
3785 | .read = seq_read, | 3862 | .read = seq_read, |
3786 | .write = ftrace_graph_write, | 3863 | .write = ftrace_graph_write, |
3864 | .llseek = ftrace_filter_lseek, | ||
3787 | .release = ftrace_graph_release, | 3865 | .release = ftrace_graph_release, |
3788 | .llseek = seq_lseek, | ||
3789 | }; | 3866 | }; |
3790 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 3867 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
3791 | 3868 | ||
@@ -4087,7 +4164,8 @@ void __init ftrace_init(void) | |||
4087 | 4164 | ||
4088 | static struct ftrace_ops global_ops = { | 4165 | static struct ftrace_ops global_ops = { |
4089 | .func = ftrace_stub, | 4166 | .func = ftrace_stub, |
4090 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | 4167 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, |
4168 | INIT_REGEX_LOCK(global_ops) | ||
4091 | }; | 4169 | }; |
4092 | 4170 | ||
4093 | static int __init ftrace_nodyn_init(void) | 4171 | static int __init ftrace_nodyn_init(void) |
@@ -4131,7 +4209,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | |||
4131 | preempt_disable_notrace(); | 4209 | preempt_disable_notrace(); |
4132 | trace_recursion_set(TRACE_CONTROL_BIT); | 4210 | trace_recursion_set(TRACE_CONTROL_BIT); |
4133 | do_for_each_ftrace_op(op, ftrace_control_list) { | 4211 | do_for_each_ftrace_op(op, ftrace_control_list) { |
4134 | if (!ftrace_function_local_disabled(op) && | 4212 | if (!(op->flags & FTRACE_OPS_FL_STUB) && |
4213 | !ftrace_function_local_disabled(op) && | ||
4135 | ftrace_ops_test(op, ip)) | 4214 | ftrace_ops_test(op, ip)) |
4136 | op->func(ip, parent_ip, op, regs); | 4215 | op->func(ip, parent_ip, op, regs); |
4137 | } while_for_each_ftrace_op(op); | 4216 | } while_for_each_ftrace_op(op); |
@@ -4140,8 +4219,9 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | |||
4140 | } | 4219 | } |
4141 | 4220 | ||
4142 | static struct ftrace_ops control_ops = { | 4221 | static struct ftrace_ops control_ops = { |
4143 | .func = ftrace_ops_control_func, | 4222 | .func = ftrace_ops_control_func, |
4144 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | 4223 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, |
4224 | INIT_REGEX_LOCK(control_ops) | ||
4145 | }; | 4225 | }; |
4146 | 4226 | ||
4147 | static inline void | 4227 | static inline void |
@@ -4439,7 +4519,7 @@ static const struct file_operations ftrace_pid_fops = { | |||
4439 | .open = ftrace_pid_open, | 4519 | .open = ftrace_pid_open, |
4440 | .write = ftrace_pid_write, | 4520 | .write = ftrace_pid_write, |
4441 | .read = seq_read, | 4521 | .read = seq_read, |
4442 | .llseek = seq_lseek, | 4522 | .llseek = ftrace_filter_lseek, |
4443 | .release = ftrace_pid_release, | 4523 | .release = ftrace_pid_release, |
4444 | }; | 4524 | }; |
4445 | 4525 | ||
@@ -4499,6 +4579,8 @@ int register_ftrace_function(struct ftrace_ops *ops) | |||
4499 | { | 4579 | { |
4500 | int ret = -1; | 4580 | int ret = -1; |
4501 | 4581 | ||
4582 | ftrace_ops_init(ops); | ||
4583 | |||
4502 | mutex_lock(&ftrace_lock); | 4584 | mutex_lock(&ftrace_lock); |
4503 | 4585 | ||
4504 | ret = __register_ftrace_function(ops); | 4586 | ret = __register_ftrace_function(ops); |
@@ -4555,12 +4637,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
4555 | ftrace_startup_sysctl(); | 4637 | ftrace_startup_sysctl(); |
4556 | 4638 | ||
4557 | /* we are starting ftrace again */ | 4639 | /* we are starting ftrace again */ |
4558 | if (ftrace_ops_list != &ftrace_list_end) { | 4640 | if (ftrace_ops_list != &ftrace_list_end) |
4559 | if (ftrace_ops_list->next == &ftrace_list_end) | 4641 | update_ftrace_function(); |
4560 | ftrace_trace_function = ftrace_ops_list->func; | ||
4561 | else | ||
4562 | ftrace_trace_function = ftrace_ops_list_func; | ||
4563 | } | ||
4564 | 4642 | ||
4565 | } else { | 4643 | } else { |
4566 | /* stopping ftrace calls (just send to ftrace_stub) */ | 4644 | /* stopping ftrace calls (just send to ftrace_stub) */ |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6989df2ba194..e444ff88f0a4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -8,13 +8,16 @@ | |||
8 | #include <linux/trace_clock.h> | 8 | #include <linux/trace_clock.h> |
9 | #include <linux/trace_seq.h> | 9 | #include <linux/trace_seq.h> |
10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <linux/irq_work.h> | ||
11 | #include <linux/debugfs.h> | 12 | #include <linux/debugfs.h> |
12 | #include <linux/uaccess.h> | 13 | #include <linux/uaccess.h> |
13 | #include <linux/hardirq.h> | 14 | #include <linux/hardirq.h> |
15 | #include <linux/kthread.h> /* for self test */ | ||
14 | #include <linux/kmemcheck.h> | 16 | #include <linux/kmemcheck.h> |
15 | #include <linux/module.h> | 17 | #include <linux/module.h> |
16 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
17 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
20 | #include <linux/delay.h> | ||
18 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
19 | #include <linux/init.h> | 22 | #include <linux/init.h> |
20 | #include <linux/hash.h> | 23 | #include <linux/hash.h> |
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) | |||
444 | return ret; | 447 | return ret; |
445 | } | 448 | } |
446 | 449 | ||
450 | struct rb_irq_work { | ||
451 | struct irq_work work; | ||
452 | wait_queue_head_t waiters; | ||
453 | bool waiters_pending; | ||
454 | }; | ||
455 | |||
447 | /* | 456 | /* |
448 | * head_page == tail_page && head == tail then buffer is empty. | 457 | * head_page == tail_page && head == tail then buffer is empty. |
449 | */ | 458 | */ |
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu { | |||
478 | struct list_head new_pages; /* new pages to add */ | 487 | struct list_head new_pages; /* new pages to add */ |
479 | struct work_struct update_pages_work; | 488 | struct work_struct update_pages_work; |
480 | struct completion update_done; | 489 | struct completion update_done; |
490 | |||
491 | struct rb_irq_work irq_work; | ||
481 | }; | 492 | }; |
482 | 493 | ||
483 | struct ring_buffer { | 494 | struct ring_buffer { |
@@ -497,6 +508,8 @@ struct ring_buffer { | |||
497 | struct notifier_block cpu_notify; | 508 | struct notifier_block cpu_notify; |
498 | #endif | 509 | #endif |
499 | u64 (*clock)(void); | 510 | u64 (*clock)(void); |
511 | |||
512 | struct rb_irq_work irq_work; | ||
500 | }; | 513 | }; |
501 | 514 | ||
502 | struct ring_buffer_iter { | 515 | struct ring_buffer_iter { |
@@ -508,6 +521,121 @@ struct ring_buffer_iter { | |||
508 | u64 read_stamp; | 521 | u64 read_stamp; |
509 | }; | 522 | }; |
510 | 523 | ||
524 | /* | ||
525 | * rb_wake_up_waiters - wake up tasks waiting for ring buffer input | ||
526 | * | ||
527 | * Schedules a delayed work to wake up any task that is blocked on the | ||
528 | * ring buffer waiters queue. | ||
529 | */ | ||
530 | static void rb_wake_up_waiters(struct irq_work *work) | ||
531 | { | ||
532 | struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); | ||
533 | |||
534 | wake_up_all(&rbwork->waiters); | ||
535 | } | ||
536 | |||
537 | /** | ||
538 | * ring_buffer_wait - wait for input to the ring buffer | ||
539 | * @buffer: buffer to wait on | ||
540 | * @cpu: the cpu buffer to wait on | ||
541 | * | ||
542 | * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon | ||
543 | * as data is added to any of the @buffer's cpu buffers. Otherwise | ||
544 | * it will wait for data to be added to a specific cpu buffer. | ||
545 | */ | ||
546 | void ring_buffer_wait(struct ring_buffer *buffer, int cpu) | ||
547 | { | ||
548 | struct ring_buffer_per_cpu *cpu_buffer; | ||
549 | DEFINE_WAIT(wait); | ||
550 | struct rb_irq_work *work; | ||
551 | |||
552 | /* | ||
553 | * Depending on what the caller is waiting for, either any | ||
554 | * data in any cpu buffer, or a specific buffer, put the | ||
555 | * caller on the appropriate wait queue. | ||
556 | */ | ||
557 | if (cpu == RING_BUFFER_ALL_CPUS) | ||
558 | work = &buffer->irq_work; | ||
559 | else { | ||
560 | cpu_buffer = buffer->buffers[cpu]; | ||
561 | work = &cpu_buffer->irq_work; | ||
562 | } | ||
563 | |||
564 | |||
565 | prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); | ||
566 | |||
567 | /* | ||
568 | * The events can happen in critical sections where | ||
569 | * checking a work queue can cause deadlocks. | ||
570 | * After adding a task to the queue, this flag is set | ||
571 | * only to notify events to try to wake up the queue | ||
572 | * using irq_work. | ||
573 | * | ||
574 | * We don't clear it even if the buffer is no longer | ||
575 | * empty. The flag only causes the next event to run | ||
576 | * irq_work to do the work queue wake up. The worse | ||
577 | * that can happen if we race with !trace_empty() is that | ||
578 | * an event will cause an irq_work to try to wake up | ||
579 | * an empty queue. | ||
580 | * | ||
581 | * There's no reason to protect this flag either, as | ||
582 | * the work queue and irq_work logic will do the necessary | ||
583 | * synchronization for the wake ups. The only thing | ||
584 | * that is necessary is that the wake up happens after | ||
585 | * a task has been queued. It's OK for spurious wake ups. | ||
586 | */ | ||
587 | work->waiters_pending = true; | ||
588 | |||
589 | if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || | ||
590 | (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) | ||
591 | schedule(); | ||
592 | |||
593 | finish_wait(&work->waiters, &wait); | ||
594 | } | ||
595 | |||
596 | /** | ||
597 | * ring_buffer_poll_wait - poll on buffer input | ||
598 | * @buffer: buffer to wait on | ||
599 | * @cpu: the cpu buffer to wait on | ||
600 | * @filp: the file descriptor | ||
601 | * @poll_table: The poll descriptor | ||
602 | * | ||
603 | * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon | ||
604 | * as data is added to any of the @buffer's cpu buffers. Otherwise | ||
605 | * it will wait for data to be added to a specific cpu buffer. | ||
606 | * | ||
607 | * Returns POLLIN | POLLRDNORM if data exists in the buffers, | ||
608 | * zero otherwise. | ||
609 | */ | ||
610 | int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, | ||
611 | struct file *filp, poll_table *poll_table) | ||
612 | { | ||
613 | struct ring_buffer_per_cpu *cpu_buffer; | ||
614 | struct rb_irq_work *work; | ||
615 | |||
616 | if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || | ||
617 | (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) | ||
618 | return POLLIN | POLLRDNORM; | ||
619 | |||
620 | if (cpu == RING_BUFFER_ALL_CPUS) | ||
621 | work = &buffer->irq_work; | ||
622 | else { | ||
623 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
624 | return -EINVAL; | ||
625 | |||
626 | cpu_buffer = buffer->buffers[cpu]; | ||
627 | work = &cpu_buffer->irq_work; | ||
628 | } | ||
629 | |||
630 | work->waiters_pending = true; | ||
631 | poll_wait(filp, &work->waiters, poll_table); | ||
632 | |||
633 | if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || | ||
634 | (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) | ||
635 | return POLLIN | POLLRDNORM; | ||
636 | return 0; | ||
637 | } | ||
638 | |||
511 | /* buffer may be either ring_buffer or ring_buffer_per_cpu */ | 639 | /* buffer may be either ring_buffer or ring_buffer_per_cpu */ |
512 | #define RB_WARN_ON(b, cond) \ | 640 | #define RB_WARN_ON(b, cond) \ |
513 | ({ \ | 641 | ({ \ |
@@ -1063,6 +1191,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) | |||
1063 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1191 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1064 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); | 1192 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); |
1065 | init_completion(&cpu_buffer->update_done); | 1193 | init_completion(&cpu_buffer->update_done); |
1194 | init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); | ||
1195 | init_waitqueue_head(&cpu_buffer->irq_work.waiters); | ||
1066 | 1196 | ||
1067 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1197 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1068 | GFP_KERNEL, cpu_to_node(cpu)); | 1198 | GFP_KERNEL, cpu_to_node(cpu)); |
@@ -1158,6 +1288,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1158 | buffer->clock = trace_clock_local; | 1288 | buffer->clock = trace_clock_local; |
1159 | buffer->reader_lock_key = key; | 1289 | buffer->reader_lock_key = key; |
1160 | 1290 | ||
1291 | init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); | ||
1292 | init_waitqueue_head(&buffer->irq_work.waiters); | ||
1293 | |||
1161 | /* need at least two pages */ | 1294 | /* need at least two pages */ |
1162 | if (nr_pages < 2) | 1295 | if (nr_pages < 2) |
1163 | nr_pages = 2; | 1296 | nr_pages = 2; |
@@ -1553,11 +1686,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1553 | if (!cpu_buffer->nr_pages_to_update) | 1686 | if (!cpu_buffer->nr_pages_to_update) |
1554 | continue; | 1687 | continue; |
1555 | 1688 | ||
1556 | if (cpu_online(cpu)) | 1689 | /* The update must run on the CPU that is being updated. */ |
1690 | preempt_disable(); | ||
1691 | if (cpu == smp_processor_id() || !cpu_online(cpu)) { | ||
1692 | rb_update_pages(cpu_buffer); | ||
1693 | cpu_buffer->nr_pages_to_update = 0; | ||
1694 | } else { | ||
1695 | /* | ||
1696 | * Can not disable preemption for schedule_work_on() | ||
1697 | * on PREEMPT_RT. | ||
1698 | */ | ||
1699 | preempt_enable(); | ||
1557 | schedule_work_on(cpu, | 1700 | schedule_work_on(cpu, |
1558 | &cpu_buffer->update_pages_work); | 1701 | &cpu_buffer->update_pages_work); |
1559 | else | 1702 | preempt_disable(); |
1560 | rb_update_pages(cpu_buffer); | 1703 | } |
1704 | preempt_enable(); | ||
1561 | } | 1705 | } |
1562 | 1706 | ||
1563 | /* wait for all the updates to complete */ | 1707 | /* wait for all the updates to complete */ |
@@ -1595,12 +1739,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1595 | 1739 | ||
1596 | get_online_cpus(); | 1740 | get_online_cpus(); |
1597 | 1741 | ||
1598 | if (cpu_online(cpu_id)) { | 1742 | preempt_disable(); |
1743 | /* The update must run on the CPU that is being updated. */ | ||
1744 | if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) | ||
1745 | rb_update_pages(cpu_buffer); | ||
1746 | else { | ||
1747 | /* | ||
1748 | * Can not disable preemption for schedule_work_on() | ||
1749 | * on PREEMPT_RT. | ||
1750 | */ | ||
1751 | preempt_enable(); | ||
1599 | schedule_work_on(cpu_id, | 1752 | schedule_work_on(cpu_id, |
1600 | &cpu_buffer->update_pages_work); | 1753 | &cpu_buffer->update_pages_work); |
1601 | wait_for_completion(&cpu_buffer->update_done); | 1754 | wait_for_completion(&cpu_buffer->update_done); |
1602 | } else | 1755 | preempt_disable(); |
1603 | rb_update_pages(cpu_buffer); | 1756 | } |
1757 | preempt_enable(); | ||
1604 | 1758 | ||
1605 | cpu_buffer->nr_pages_to_update = 0; | 1759 | cpu_buffer->nr_pages_to_update = 0; |
1606 | put_online_cpus(); | 1760 | put_online_cpus(); |
@@ -2612,6 +2766,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, | |||
2612 | rb_end_commit(cpu_buffer); | 2766 | rb_end_commit(cpu_buffer); |
2613 | } | 2767 | } |
2614 | 2768 | ||
2769 | static __always_inline void | ||
2770 | rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) | ||
2771 | { | ||
2772 | if (buffer->irq_work.waiters_pending) { | ||
2773 | buffer->irq_work.waiters_pending = false; | ||
2774 | /* irq_work_queue() supplies it's own memory barriers */ | ||
2775 | irq_work_queue(&buffer->irq_work.work); | ||
2776 | } | ||
2777 | |||
2778 | if (cpu_buffer->irq_work.waiters_pending) { | ||
2779 | cpu_buffer->irq_work.waiters_pending = false; | ||
2780 | /* irq_work_queue() supplies it's own memory barriers */ | ||
2781 | irq_work_queue(&cpu_buffer->irq_work.work); | ||
2782 | } | ||
2783 | } | ||
2784 | |||
2615 | /** | 2785 | /** |
2616 | * ring_buffer_unlock_commit - commit a reserved | 2786 | * ring_buffer_unlock_commit - commit a reserved |
2617 | * @buffer: The buffer to commit to | 2787 | * @buffer: The buffer to commit to |
@@ -2631,6 +2801,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, | |||
2631 | 2801 | ||
2632 | rb_commit(cpu_buffer, event); | 2802 | rb_commit(cpu_buffer, event); |
2633 | 2803 | ||
2804 | rb_wakeups(buffer, cpu_buffer); | ||
2805 | |||
2634 | trace_recursive_unlock(); | 2806 | trace_recursive_unlock(); |
2635 | 2807 | ||
2636 | preempt_enable_notrace(); | 2808 | preempt_enable_notrace(); |
@@ -2803,6 +2975,8 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2803 | 2975 | ||
2804 | rb_commit(cpu_buffer, event); | 2976 | rb_commit(cpu_buffer, event); |
2805 | 2977 | ||
2978 | rb_wakeups(buffer, cpu_buffer); | ||
2979 | |||
2806 | ret = 0; | 2980 | ret = 0; |
2807 | out: | 2981 | out: |
2808 | preempt_enable_notrace(); | 2982 | preempt_enable_notrace(); |
@@ -4467,3 +4641,320 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4467 | return NOTIFY_OK; | 4641 | return NOTIFY_OK; |
4468 | } | 4642 | } |
4469 | #endif | 4643 | #endif |
4644 | |||
4645 | #ifdef CONFIG_RING_BUFFER_STARTUP_TEST | ||
4646 | /* | ||
4647 | * This is a basic integrity check of the ring buffer. | ||
4648 | * Late in the boot cycle this test will run when configured in. | ||
4649 | * It will kick off a thread per CPU that will go into a loop | ||
4650 | * writing to the per cpu ring buffer various sizes of data. | ||
4651 | * Some of the data will be large items, some small. | ||
4652 | * | ||
4653 | * Another thread is created that goes into a spin, sending out | ||
4654 | * IPIs to the other CPUs to also write into the ring buffer. | ||
4655 | * This is to test the nesting ability of the buffer. | ||
4656 | * | ||
4657 | * Basic stats are recorded and reported. If something in the | ||
4658 | * ring buffer should happen that's not expected, a big warning | ||
4659 | * is displayed and all ring buffers are disabled. | ||
4660 | */ | ||
4661 | static struct task_struct *rb_threads[NR_CPUS] __initdata; | ||
4662 | |||
/*
 * Per-CPU bookkeeping for the ring buffer startup test.  The plain
 * counters are updated by non-nested writes, the *_nested counters by
 * nested writes (see rb_write_something()).
 */
struct rb_test_data {
	struct ring_buffer	*buffer;		/* buffer being tested */
	unsigned long		events;			/* events committed */
	unsigned long		bytes_written;		/* requested lengths, summed */
	unsigned long		bytes_alloc;		/* ring_buffer_event_length(), summed */
	unsigned long		bytes_dropped;		/* lengths that failed to reserve */
	unsigned long		events_nested;
	unsigned long		bytes_written_nested;
	unsigned long		bytes_alloc_nested;
	unsigned long		bytes_dropped_nested;
	int			min_size_nested;	/* smallest nested length seen */
	int			max_size_nested;	/* largest nested length seen */
	int			max_size;		/* largest length seen */
	int			min_size;		/* smallest length seen */
	int			cpu;			/* CPU this entry belongs to */
	int			cnt;			/* seeds the record size; bumped by rb_test() */
};
4680 | |||
4681 | static struct rb_test_data rb_data[NR_CPUS] __initdata; | ||
4682 | |||
4683 | /* 1 meg per cpu */ | ||
4684 | #define RB_TEST_BUFFER_SIZE 1048576 | ||
4685 | |||
4686 | static char rb_string[] __initdata = | ||
4687 | "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" | ||
4688 | "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" | ||
4689 | "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; | ||
4690 | |||
4691 | static bool rb_test_started __initdata; | ||
4692 | |||
/* Payload of one test event: a length followed by that many bytes. */
struct rb_item {
	int	size;	/* number of bytes stored in str[] */
	char	str[];	/* copied from the start of rb_string */
};
4697 | |||
4698 | static __init int rb_write_something(struct rb_test_data *data, bool nested) | ||
4699 | { | ||
4700 | struct ring_buffer_event *event; | ||
4701 | struct rb_item *item; | ||
4702 | bool started; | ||
4703 | int event_len; | ||
4704 | int size; | ||
4705 | int len; | ||
4706 | int cnt; | ||
4707 | |||
4708 | /* Have nested writes different that what is written */ | ||
4709 | cnt = data->cnt + (nested ? 27 : 0); | ||
4710 | |||
4711 | /* Multiply cnt by ~e, to make some unique increment */ | ||
4712 | size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); | ||
4713 | |||
4714 | len = size + sizeof(struct rb_item); | ||
4715 | |||
4716 | started = rb_test_started; | ||
4717 | /* read rb_test_started before checking buffer enabled */ | ||
4718 | smp_rmb(); | ||
4719 | |||
4720 | event = ring_buffer_lock_reserve(data->buffer, len); | ||
4721 | if (!event) { | ||
4722 | /* Ignore dropped events before test starts. */ | ||
4723 | if (started) { | ||
4724 | if (nested) | ||
4725 | data->bytes_dropped += len; | ||
4726 | else | ||
4727 | data->bytes_dropped_nested += len; | ||
4728 | } | ||
4729 | return len; | ||
4730 | } | ||
4731 | |||
4732 | event_len = ring_buffer_event_length(event); | ||
4733 | |||
4734 | if (RB_WARN_ON(data->buffer, event_len < len)) | ||
4735 | goto out; | ||
4736 | |||
4737 | item = ring_buffer_event_data(event); | ||
4738 | item->size = size; | ||
4739 | memcpy(item->str, rb_string, size); | ||
4740 | |||
4741 | if (nested) { | ||
4742 | data->bytes_alloc_nested += event_len; | ||
4743 | data->bytes_written_nested += len; | ||
4744 | data->events_nested++; | ||
4745 | if (!data->min_size_nested || len < data->min_size_nested) | ||
4746 | data->min_size_nested = len; | ||
4747 | if (len > data->max_size_nested) | ||
4748 | data->max_size_nested = len; | ||
4749 | } else { | ||
4750 | data->bytes_alloc += event_len; | ||
4751 | data->bytes_written += len; | ||
4752 | data->events++; | ||
4753 | if (!data->min_size || len < data->min_size) | ||
4754 | data->max_size = len; | ||
4755 | if (len > data->max_size) | ||
4756 | data->max_size = len; | ||
4757 | } | ||
4758 | |||
4759 | out: | ||
4760 | ring_buffer_unlock_commit(data->buffer, event); | ||
4761 | |||
4762 | return 0; | ||
4763 | } | ||
4764 | |||
4765 | static __init int rb_test(void *arg) | ||
4766 | { | ||
4767 | struct rb_test_data *data = arg; | ||
4768 | |||
4769 | while (!kthread_should_stop()) { | ||
4770 | rb_write_something(data, false); | ||
4771 | data->cnt++; | ||
4772 | |||
4773 | set_current_state(TASK_INTERRUPTIBLE); | ||
4774 | /* Now sleep between a min of 100-300us and a max of 1ms */ | ||
4775 | usleep_range(((data->cnt % 3) + 1) * 100, 1000); | ||
4776 | } | ||
4777 | |||
4778 | return 0; | ||
4779 | } | ||
4780 | |||
4781 | static __init void rb_ipi(void *ignore) | ||
4782 | { | ||
4783 | struct rb_test_data *data; | ||
4784 | int cpu = smp_processor_id(); | ||
4785 | |||
4786 | data = &rb_data[cpu]; | ||
4787 | rb_write_something(data, true); | ||
4788 | } | ||
4789 | |||
4790 | static __init int rb_hammer_test(void *arg) | ||
4791 | { | ||
4792 | while (!kthread_should_stop()) { | ||
4793 | |||
4794 | /* Send an IPI to all cpus to write data! */ | ||
4795 | smp_call_function(rb_ipi, NULL, 1); | ||
4796 | /* No sleep, but for non preempt, let others run */ | ||
4797 | schedule(); | ||
4798 | } | ||
4799 | |||
4800 | return 0; | ||
4801 | } | ||
4802 | |||
4803 | static __init int test_ringbuffer(void) | ||
4804 | { | ||
4805 | struct task_struct *rb_hammer; | ||
4806 | struct ring_buffer *buffer; | ||
4807 | int cpu; | ||
4808 | int ret = 0; | ||
4809 | |||
4810 | pr_info("Running ring buffer tests...\n"); | ||
4811 | |||
4812 | buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); | ||
4813 | if (WARN_ON(!buffer)) | ||
4814 | return 0; | ||
4815 | |||
4816 | /* Disable buffer so that threads can't write to it yet */ | ||
4817 | ring_buffer_record_off(buffer); | ||
4818 | |||
4819 | for_each_online_cpu(cpu) { | ||
4820 | rb_data[cpu].buffer = buffer; | ||
4821 | rb_data[cpu].cpu = cpu; | ||
4822 | rb_data[cpu].cnt = cpu; | ||
4823 | rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], | ||
4824 | "rbtester/%d", cpu); | ||
4825 | if (WARN_ON(!rb_threads[cpu])) { | ||
4826 | pr_cont("FAILED\n"); | ||
4827 | ret = -1; | ||
4828 | goto out_free; | ||
4829 | } | ||
4830 | |||
4831 | kthread_bind(rb_threads[cpu], cpu); | ||
4832 | wake_up_process(rb_threads[cpu]); | ||
4833 | } | ||
4834 | |||
4835 | /* Now create the rb hammer! */ | ||
4836 | rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); | ||
4837 | if (WARN_ON(!rb_hammer)) { | ||
4838 | pr_cont("FAILED\n"); | ||
4839 | ret = -1; | ||
4840 | goto out_free; | ||
4841 | } | ||
4842 | |||
4843 | ring_buffer_record_on(buffer); | ||
4844 | /* | ||
4845 | * Show buffer is enabled before setting rb_test_started. | ||
4846 | * Yes there's a small race window where events could be | ||
4847 | * dropped and the thread wont catch it. But when a ring | ||
4848 | * buffer gets enabled, there will always be some kind of | ||
4849 | * delay before other CPUs see it. Thus, we don't care about | ||
4850 | * those dropped events. We care about events dropped after | ||
4851 | * the threads see that the buffer is active. | ||
4852 | */ | ||
4853 | smp_wmb(); | ||
4854 | rb_test_started = true; | ||
4855 | |||
4856 | set_current_state(TASK_INTERRUPTIBLE); | ||
4857 | /* Just run for 10 seconds */; | ||
4858 | schedule_timeout(10 * HZ); | ||
4859 | |||
4860 | kthread_stop(rb_hammer); | ||
4861 | |||
4862 | out_free: | ||
4863 | for_each_online_cpu(cpu) { | ||
4864 | if (!rb_threads[cpu]) | ||
4865 | break; | ||
4866 | kthread_stop(rb_threads[cpu]); | ||
4867 | } | ||
4868 | if (ret) { | ||
4869 | ring_buffer_free(buffer); | ||
4870 | return ret; | ||
4871 | } | ||
4872 | |||
4873 | /* Report! */ | ||
4874 | pr_info("finished\n"); | ||
4875 | for_each_online_cpu(cpu) { | ||
4876 | struct ring_buffer_event *event; | ||
4877 | struct rb_test_data *data = &rb_data[cpu]; | ||
4878 | struct rb_item *item; | ||
4879 | unsigned long total_events; | ||
4880 | unsigned long total_dropped; | ||
4881 | unsigned long total_written; | ||
4882 | unsigned long total_alloc; | ||
4883 | unsigned long total_read = 0; | ||
4884 | unsigned long total_size = 0; | ||
4885 | unsigned long total_len = 0; | ||
4886 | unsigned long total_lost = 0; | ||
4887 | unsigned long lost; | ||
4888 | int big_event_size; | ||
4889 | int small_event_size; | ||
4890 | |||
4891 | ret = -1; | ||
4892 | |||
4893 | total_events = data->events + data->events_nested; | ||
4894 | total_written = data->bytes_written + data->bytes_written_nested; | ||
4895 | total_alloc = data->bytes_alloc + data->bytes_alloc_nested; | ||
4896 | total_dropped = data->bytes_dropped + data->bytes_dropped_nested; | ||
4897 | |||
4898 | big_event_size = data->max_size + data->max_size_nested; | ||
4899 | small_event_size = data->min_size + data->min_size_nested; | ||
4900 | |||
4901 | pr_info("CPU %d:\n", cpu); | ||
4902 | pr_info(" events: %ld\n", total_events); | ||
4903 | pr_info(" dropped bytes: %ld\n", total_dropped); | ||
4904 | pr_info(" alloced bytes: %ld\n", total_alloc); | ||
4905 | pr_info(" written bytes: %ld\n", total_written); | ||
4906 | pr_info(" biggest event: %d\n", big_event_size); | ||
4907 | pr_info(" smallest event: %d\n", small_event_size); | ||
4908 | |||
4909 | if (RB_WARN_ON(buffer, total_dropped)) | ||
4910 | break; | ||
4911 | |||
4912 | ret = 0; | ||
4913 | |||
4914 | while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { | ||
4915 | total_lost += lost; | ||
4916 | item = ring_buffer_event_data(event); | ||
4917 | total_len += ring_buffer_event_length(event); | ||
4918 | total_size += item->size + sizeof(struct rb_item); | ||
4919 | if (memcmp(&item->str[0], rb_string, item->size) != 0) { | ||
4920 | pr_info("FAILED!\n"); | ||
4921 | pr_info("buffer had: %.*s\n", item->size, item->str); | ||
4922 | pr_info("expected: %.*s\n", item->size, rb_string); | ||
4923 | RB_WARN_ON(buffer, 1); | ||
4924 | ret = -1; | ||
4925 | break; | ||
4926 | } | ||
4927 | total_read++; | ||
4928 | } | ||
4929 | if (ret) | ||
4930 | break; | ||
4931 | |||
4932 | ret = -1; | ||
4933 | |||
4934 | pr_info(" read events: %ld\n", total_read); | ||
4935 | pr_info(" lost events: %ld\n", total_lost); | ||
4936 | pr_info(" total events: %ld\n", total_lost + total_read); | ||
4937 | pr_info(" recorded len bytes: %ld\n", total_len); | ||
4938 | pr_info(" recorded size bytes: %ld\n", total_size); | ||
4939 | if (total_lost) | ||
4940 | pr_info(" With dropped events, record len and size may not match\n" | ||
4941 | " alloced and written from above\n"); | ||
4942 | if (!total_lost) { | ||
4943 | if (RB_WARN_ON(buffer, total_len != total_alloc || | ||
4944 | total_size != total_written)) | ||
4945 | break; | ||
4946 | } | ||
4947 | if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) | ||
4948 | break; | ||
4949 | |||
4950 | ret = 0; | ||
4951 | } | ||
4952 | if (!ret) | ||
4953 | pr_info("Ring buffer PASSED!\n"); | ||
4954 | |||
4955 | ring_buffer_free(buffer); | ||
4956 | return 0; | ||
4957 | } | ||
4958 | |||
4959 | late_initcall(test_ringbuffer); | ||
4960 | #endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1f835a83cb2c..1a41023a1f88 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * ring buffer based function tracer | 2 | * ring buffer based function tracer |
3 | * | 3 | * |
4 | * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> | 4 | * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> |
5 | * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> | 5 | * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> |
6 | * | 6 | * |
7 | * Originally taken from the RT patch by: | 7 | * Originally taken from the RT patch by: |
@@ -19,7 +19,6 @@ | |||
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
22 | #include <linux/irq_work.h> | ||
23 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> |
24 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
25 | #include <linux/hardirq.h> | 24 | #include <linux/hardirq.h> |
@@ -48,7 +47,7 @@ | |||
48 | * On boot up, the ring buffer is set to the minimum size, so that | 47 | * On boot up, the ring buffer is set to the minimum size, so that |
49 | * we do not waste memory on systems that are not using tracing. | 48 | * we do not waste memory on systems that are not using tracing. |
50 | */ | 49 | */ |
51 | int ring_buffer_expanded; | 50 | bool ring_buffer_expanded; |
52 | 51 | ||
53 | /* | 52 | /* |
54 | * We need to change this state when a selftest is running. | 53 | * We need to change this state when a selftest is running. |
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) | |||
87 | static DEFINE_PER_CPU(bool, trace_cmdline_save); | 86 | static DEFINE_PER_CPU(bool, trace_cmdline_save); |
88 | 87 | ||
89 | /* | 88 | /* |
90 | * When a reader is waiting for data, then this variable is | ||
91 | * set to true. | ||
92 | */ | ||
93 | static bool trace_wakeup_needed; | ||
94 | |||
95 | static struct irq_work trace_work_wakeup; | ||
96 | |||
97 | /* | ||
98 | * Kill all tracing for good (never come back). | 89 | * Kill all tracing for good (never come back). |
99 | * It is initialized to 1 but will turn to zero if the initialization | 90 | * It is initialized to 1 but will turn to zero if the initialization |
100 | * of the tracer is successful. But that is the only place that sets | 91 | * of the tracer is successful. But that is the only place that sets |
@@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf); | |||
130 | static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; | 121 | static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; |
131 | static char *default_bootup_tracer; | 122 | static char *default_bootup_tracer; |
132 | 123 | ||
124 | static bool allocate_snapshot; | ||
125 | |||
133 | static int __init set_cmdline_ftrace(char *str) | 126 | static int __init set_cmdline_ftrace(char *str) |
134 | { | 127 | { |
135 | strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); | 128 | strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); |
136 | default_bootup_tracer = bootup_tracer_buf; | 129 | default_bootup_tracer = bootup_tracer_buf; |
137 | /* We are using ftrace early, expand it */ | 130 | /* We are using ftrace early, expand it */ |
138 | ring_buffer_expanded = 1; | 131 | ring_buffer_expanded = true; |
139 | return 1; | 132 | return 1; |
140 | } | 133 | } |
141 | __setup("ftrace=", set_cmdline_ftrace); | 134 | __setup("ftrace=", set_cmdline_ftrace); |
@@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str) | |||
156 | } | 149 | } |
157 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); | 150 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); |
158 | 151 | ||
152 | static int __init boot_alloc_snapshot(char *str) | ||
153 | { | ||
154 | allocate_snapshot = true; | ||
155 | /* We also need the main ring buffer expanded */ | ||
156 | ring_buffer_expanded = true; | ||
157 | return 1; | ||
158 | } | ||
159 | __setup("alloc_snapshot", boot_alloc_snapshot); | ||
160 | |||
159 | 161 | ||
160 | static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; | 162 | static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; |
161 | static char *trace_boot_options __initdata; | 163 | static char *trace_boot_options __initdata; |
162 | 164 | ||
163 | static int __init set_trace_boot_options(char *str) | 165 | static int __init set_trace_boot_options(char *str) |
164 | { | 166 | { |
165 | strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); | 167 | strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); |
166 | trace_boot_options = trace_boot_options_buf; | 168 | trace_boot_options = trace_boot_options_buf; |
167 | return 0; | 169 | return 0; |
168 | } | 170 | } |
@@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec) | |||
189 | */ | 191 | */ |
190 | static struct trace_array global_trace; | 192 | static struct trace_array global_trace; |
191 | 193 | ||
192 | static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); | 194 | LIST_HEAD(ftrace_trace_arrays); |
193 | 195 | ||
194 | int filter_current_check_discard(struct ring_buffer *buffer, | 196 | int filter_current_check_discard(struct ring_buffer *buffer, |
195 | struct ftrace_event_call *call, void *rec, | 197 | struct ftrace_event_call *call, void *rec, |
@@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu) | |||
204 | u64 ts; | 206 | u64 ts; |
205 | 207 | ||
206 | /* Early boot up does not have a buffer yet */ | 208 | /* Early boot up does not have a buffer yet */ |
207 | if (!global_trace.buffer) | 209 | if (!global_trace.trace_buffer.buffer) |
208 | return trace_clock_local(); | 210 | return trace_clock_local(); |
209 | 211 | ||
210 | ts = ring_buffer_time_stamp(global_trace.buffer, cpu); | 212 | ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); |
211 | ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); | 213 | ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); |
212 | 214 | ||
213 | return ts; | 215 | return ts; |
214 | } | 216 | } |
215 | 217 | ||
216 | /* | ||
217 | * The max_tr is used to snapshot the global_trace when a maximum | ||
218 | * latency is reached. Some tracers will use this to store a maximum | ||
219 | * trace while it continues examining live traces. | ||
220 | * | ||
221 | * The buffers for the max_tr are set up the same as the global_trace. | ||
222 | * When a snapshot is taken, the link list of the max_tr is swapped | ||
223 | * with the link list of the global_trace and the buffers are reset for | ||
224 | * the global_trace so the tracing can continue. | ||
225 | */ | ||
226 | static struct trace_array max_tr; | ||
227 | |||
228 | static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); | ||
229 | |||
230 | int tracing_is_enabled(void) | 218 | int tracing_is_enabled(void) |
231 | { | 219 | { |
232 | return tracing_is_on(); | 220 | return tracing_is_on(); |
@@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; | |||
249 | /* trace_types holds a link list of available tracers. */ | 237 | /* trace_types holds a link list of available tracers. */ |
250 | static struct tracer *trace_types __read_mostly; | 238 | static struct tracer *trace_types __read_mostly; |
251 | 239 | ||
252 | /* current_trace points to the tracer that is currently active */ | ||
253 | static struct tracer *current_trace __read_mostly = &nop_trace; | ||
254 | |||
255 | /* | 240 | /* |
256 | * trace_types_lock is used to protect the trace_types list. | 241 | * trace_types_lock is used to protect the trace_types list. |
257 | */ | 242 | */ |
@@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock); | |||
285 | 270 | ||
286 | static inline void trace_access_lock(int cpu) | 271 | static inline void trace_access_lock(int cpu) |
287 | { | 272 | { |
288 | if (cpu == TRACE_PIPE_ALL_CPU) { | 273 | if (cpu == RING_BUFFER_ALL_CPUS) { |
289 | /* gain it for accessing the whole ring buffer. */ | 274 | /* gain it for accessing the whole ring buffer. */ |
290 | down_write(&all_cpu_access_lock); | 275 | down_write(&all_cpu_access_lock); |
291 | } else { | 276 | } else { |
292 | /* gain it for accessing a cpu ring buffer. */ | 277 | /* gain it for accessing a cpu ring buffer. */ |
293 | 278 | ||
294 | /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ | 279 | /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ |
295 | down_read(&all_cpu_access_lock); | 280 | down_read(&all_cpu_access_lock); |
296 | 281 | ||
297 | /* Secondly block other access to this @cpu ring buffer. */ | 282 | /* Secondly block other access to this @cpu ring buffer. */ |
@@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu) | |||
301 | 286 | ||
302 | static inline void trace_access_unlock(int cpu) | 287 | static inline void trace_access_unlock(int cpu) |
303 | { | 288 | { |
304 | if (cpu == TRACE_PIPE_ALL_CPU) { | 289 | if (cpu == RING_BUFFER_ALL_CPUS) { |
305 | up_write(&all_cpu_access_lock); | 290 | up_write(&all_cpu_access_lock); |
306 | } else { | 291 | } else { |
307 | mutex_unlock(&per_cpu(cpu_access_lock, cpu)); | 292 | mutex_unlock(&per_cpu(cpu_access_lock, cpu)); |
@@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void) | |||
339 | 324 | ||
340 | #endif | 325 | #endif |
341 | 326 | ||
342 | /* trace_wait is a waitqueue for tasks blocked on trace_poll */ | ||
343 | static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | ||
344 | |||
345 | /* trace_flags holds trace_options default values */ | 327 | /* trace_flags holds trace_options default values */ |
346 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 328 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
347 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 329 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
348 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | | 330 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | |
349 | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; | 331 | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; |
350 | |||
351 | static int trace_stop_count; | ||
352 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); | ||
353 | |||
354 | /** | ||
355 | * trace_wake_up - wake up tasks waiting for trace input | ||
356 | * | ||
357 | * Schedules a delayed work to wake up any task that is blocked on the | ||
358 | * trace_wait queue. These is used with trace_poll for tasks polling the | ||
359 | * trace. | ||
360 | */ | ||
361 | static void trace_wake_up(struct irq_work *work) | ||
362 | { | ||
363 | wake_up_all(&trace_wait); | ||
364 | |||
365 | } | ||
366 | 332 | ||
367 | /** | 333 | /** |
368 | * tracing_on - enable tracing buffers | 334 | * tracing_on - enable tracing buffers |
@@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work) | |||
372 | */ | 338 | */ |
373 | void tracing_on(void) | 339 | void tracing_on(void) |
374 | { | 340 | { |
375 | if (global_trace.buffer) | 341 | if (global_trace.trace_buffer.buffer) |
376 | ring_buffer_record_on(global_trace.buffer); | 342 | ring_buffer_record_on(global_trace.trace_buffer.buffer); |
377 | /* | 343 | /* |
378 | * This flag is only looked at when buffers haven't been | 344 | * This flag is only looked at when buffers haven't been |
379 | * allocated yet. We don't really care about the race | 345 | * allocated yet. We don't really care about the race |
@@ -385,6 +351,196 @@ void tracing_on(void) | |||
385 | EXPORT_SYMBOL_GPL(tracing_on); | 351 | EXPORT_SYMBOL_GPL(tracing_on); |
386 | 352 | ||
387 | /** | 353 | /** |
354 | * __trace_puts - write a constant string into the trace buffer. | ||
355 | * @ip: The address of the caller | ||
356 | * @str: The constant string to write | ||
357 | * @size: The size of the string. | ||
358 | */ | ||
359 | int __trace_puts(unsigned long ip, const char *str, int size) | ||
360 | { | ||
361 | struct ring_buffer_event *event; | ||
362 | struct ring_buffer *buffer; | ||
363 | struct print_entry *entry; | ||
364 | unsigned long irq_flags; | ||
365 | int alloc; | ||
366 | |||
367 | alloc = sizeof(*entry) + size + 2; /* possible \n added */ | ||
368 | |||
369 | local_save_flags(irq_flags); | ||
370 | buffer = global_trace.trace_buffer.buffer; | ||
371 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, | ||
372 | irq_flags, preempt_count()); | ||
373 | if (!event) | ||
374 | return 0; | ||
375 | |||
376 | entry = ring_buffer_event_data(event); | ||
377 | entry->ip = ip; | ||
378 | |||
379 | memcpy(&entry->buf, str, size); | ||
380 | |||
381 | /* Add a newline if necessary */ | ||
382 | if (entry->buf[size - 1] != '\n') { | ||
383 | entry->buf[size] = '\n'; | ||
384 | entry->buf[size + 1] = '\0'; | ||
385 | } else | ||
386 | entry->buf[size] = '\0'; | ||
387 | |||
388 | __buffer_unlock_commit(buffer, event); | ||
389 | |||
390 | return size; | ||
391 | } | ||
392 | EXPORT_SYMBOL_GPL(__trace_puts); | ||
393 | |||
394 | /** | ||
395 | * __trace_bputs - write the pointer to a constant string into trace buffer | ||
396 | * @ip: The address of the caller | ||
397 | * @str: The constant string to write to the buffer to | ||
398 | */ | ||
399 | int __trace_bputs(unsigned long ip, const char *str) | ||
400 | { | ||
401 | struct ring_buffer_event *event; | ||
402 | struct ring_buffer *buffer; | ||
403 | struct bputs_entry *entry; | ||
404 | unsigned long irq_flags; | ||
405 | int size = sizeof(struct bputs_entry); | ||
406 | |||
407 | local_save_flags(irq_flags); | ||
408 | buffer = global_trace.trace_buffer.buffer; | ||
409 | event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, | ||
410 | irq_flags, preempt_count()); | ||
411 | if (!event) | ||
412 | return 0; | ||
413 | |||
414 | entry = ring_buffer_event_data(event); | ||
415 | entry->ip = ip; | ||
416 | entry->str = str; | ||
417 | |||
418 | __buffer_unlock_commit(buffer, event); | ||
419 | |||
420 | return 1; | ||
421 | } | ||
422 | EXPORT_SYMBOL_GPL(__trace_bputs); | ||
423 | |||
424 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
425 | /** | ||
426 | * trace_snapshot - take a snapshot of the current buffer. | ||
427 | * | ||
428 | * This causes a swap between the snapshot buffer and the current live | ||
429 | * tracing buffer. You can use this to take snapshots of the live | ||
430 | * trace when some condition is triggered, but continue to trace. | ||
431 | * | ||
432 | * Note, make sure to allocate the snapshot with either | ||
433 | * a tracing_snapshot_alloc(), or by doing it manually | ||
434 | * with: echo 1 > /sys/kernel/debug/tracing/snapshot | ||
435 | * | ||
436 | * If the snapshot buffer is not allocated, it will stop tracing. | ||
437 | * Basically making a permanent snapshot. | ||
438 | */ | ||
439 | void tracing_snapshot(void) | ||
440 | { | ||
441 | struct trace_array *tr = &global_trace; | ||
442 | struct tracer *tracer = tr->current_trace; | ||
443 | unsigned long flags; | ||
444 | |||
445 | if (in_nmi()) { | ||
446 | internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); | ||
447 | internal_trace_puts("*** snapshot is being ignored ***\n"); | ||
448 | return; | ||
449 | } | ||
450 | |||
451 | if (!tr->allocated_snapshot) { | ||
452 | internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); | ||
453 | internal_trace_puts("*** stopping trace here! ***\n"); | ||
454 | tracing_off(); | ||
455 | return; | ||
456 | } | ||
457 | |||
458 | /* Note, snapshot can not be used when the tracer uses it */ | ||
459 | if (tracer->use_max_tr) { | ||
460 | internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); | ||
461 | internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); | ||
462 | return; | ||
463 | } | ||
464 | |||
465 | local_irq_save(flags); | ||
466 | update_max_tr(tr, current, smp_processor_id()); | ||
467 | local_irq_restore(flags); | ||
468 | } | ||
469 | EXPORT_SYMBOL_GPL(tracing_snapshot); | ||
470 | |||
471 | static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, | ||
472 | struct trace_buffer *size_buf, int cpu_id); | ||
473 | static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); | ||
474 | |||
475 | static int alloc_snapshot(struct trace_array *tr) | ||
476 | { | ||
477 | int ret; | ||
478 | |||
479 | if (!tr->allocated_snapshot) { | ||
480 | |||
481 | /* allocate spare buffer */ | ||
482 | ret = resize_buffer_duplicate_size(&tr->max_buffer, | ||
483 | &tr->trace_buffer, RING_BUFFER_ALL_CPUS); | ||
484 | if (ret < 0) | ||
485 | return ret; | ||
486 | |||
487 | tr->allocated_snapshot = true; | ||
488 | } | ||
489 | |||
490 | return 0; | ||
491 | } | ||
492 | |||
493 | void free_snapshot(struct trace_array *tr) | ||
494 | { | ||
495 | /* | ||
496 | * We don't free the ring buffer. instead, resize it because | ||
497 | * The max_tr ring buffer has some state (e.g. ring->clock) and | ||
498 | * we want preserve it. | ||
499 | */ | ||
500 | ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); | ||
501 | set_buffer_entries(&tr->max_buffer, 1); | ||
502 | tracing_reset_online_cpus(&tr->max_buffer); | ||
503 | tr->allocated_snapshot = false; | ||
504 | } | ||
505 | |||
506 | /** | ||
507 | * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. | ||
508 | * | ||
509 | * This is similar to trace_snapshot(), but it will allocate the | ||
510 | * snapshot buffer if it isn't already allocated. Use this only | ||
511 | * where it is safe to sleep, as the allocation may sleep. | ||
512 | * | ||
513 | * This causes a swap between the snapshot buffer and the current live | ||
514 | * tracing buffer. You can use this to take snapshots of the live | ||
515 | * trace when some condition is triggered, but continue to trace. | ||
516 | */ | ||
517 | void tracing_snapshot_alloc(void) | ||
518 | { | ||
519 | struct trace_array *tr = &global_trace; | ||
520 | int ret; | ||
521 | |||
522 | ret = alloc_snapshot(tr); | ||
523 | if (WARN_ON(ret < 0)) | ||
524 | return; | ||
525 | |||
526 | tracing_snapshot(); | ||
527 | } | ||
528 | EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); | ||
529 | #else | ||
/* Stub used when CONFIG_TRACER_SNAPSHOT is not set: warn once. */
void tracing_snapshot(void)
{
	WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
/*
 * Stub used when CONFIG_TRACER_SNAPSHOT is not set: nothing can be
 * allocated, so just forward to the tracing_snapshot() stub.
 */
void tracing_snapshot_alloc(void)
{
	/* Give warning */
	tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541 | #endif /* CONFIG_TRACER_SNAPSHOT */ | ||
542 | |||
543 | /** | ||
388 | * tracing_off - turn off tracing buffers | 544 | * tracing_off - turn off tracing buffers |
389 | * | 545 | * |
390 | * This function stops the tracing buffers from recording data. | 546 | * This function stops the tracing buffers from recording data. |
@@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
394 | */ | 550 | */ |
395 | void tracing_off(void) | 551 | void tracing_off(void) |
396 | { | 552 | { |
397 | if (global_trace.buffer) | 553 | if (global_trace.trace_buffer.buffer) |
398 | ring_buffer_record_off(global_trace.buffer); | 554 | ring_buffer_record_off(global_trace.trace_buffer.buffer); |
399 | /* | 555 | /* |
400 | * This flag is only looked at when buffers haven't been | 556 | * This flag is only looked at when buffers haven't been |
401 | * allocated yet. We don't really care about the race | 557 | * allocated yet. We don't really care about the race |
@@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off); | |||
411 | */ | 567 | */ |
412 | int tracing_is_on(void) | 568 | int tracing_is_on(void) |
413 | { | 569 | { |
414 | if (global_trace.buffer) | 570 | if (global_trace.trace_buffer.buffer) |
415 | return ring_buffer_record_is_on(global_trace.buffer); | 571 | return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); |
416 | return !global_trace.buffer_disabled; | 572 | return !global_trace.buffer_disabled; |
417 | } | 573 | } |
418 | EXPORT_SYMBOL_GPL(tracing_is_on); | 574 | EXPORT_SYMBOL_GPL(tracing_is_on); |
@@ -479,6 +635,7 @@ static const char *trace_options[] = { | |||
479 | "disable_on_free", | 635 | "disable_on_free", |
480 | "irq-info", | 636 | "irq-info", |
481 | "markers", | 637 | "markers", |
638 | "function-trace", | ||
482 | NULL | 639 | NULL |
483 | }; | 640 | }; |
484 | 641 | ||
@@ -490,6 +647,8 @@ static struct { | |||
490 | { trace_clock_local, "local", 1 }, | 647 | { trace_clock_local, "local", 1 }, |
491 | { trace_clock_global, "global", 1 }, | 648 | { trace_clock_global, "global", 1 }, |
492 | { trace_clock_counter, "counter", 0 }, | 649 | { trace_clock_counter, "counter", 0 }, |
650 | { trace_clock_jiffies, "uptime", 1 }, | ||
651 | { trace_clock, "perf", 1 }, | ||
493 | ARCH_TRACE_CLOCKS | 652 | ARCH_TRACE_CLOCKS |
494 | }; | 653 | }; |
495 | 654 | ||
@@ -670,20 +829,29 @@ unsigned long __read_mostly tracing_max_latency; | |||
670 | static void | 829 | static void |
671 | __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | 830 | __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) |
672 | { | 831 | { |
673 | struct trace_array_cpu *data = tr->data[cpu]; | 832 | struct trace_buffer *trace_buf = &tr->trace_buffer; |
674 | struct trace_array_cpu *max_data; | 833 | struct trace_buffer *max_buf = &tr->max_buffer; |
834 | struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); | ||
835 | struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); | ||
675 | 836 | ||
676 | max_tr.cpu = cpu; | 837 | max_buf->cpu = cpu; |
677 | max_tr.time_start = data->preempt_timestamp; | 838 | max_buf->time_start = data->preempt_timestamp; |
678 | 839 | ||
679 | max_data = max_tr.data[cpu]; | ||
680 | max_data->saved_latency = tracing_max_latency; | 840 | max_data->saved_latency = tracing_max_latency; |
681 | max_data->critical_start = data->critical_start; | 841 | max_data->critical_start = data->critical_start; |
682 | max_data->critical_end = data->critical_end; | 842 | max_data->critical_end = data->critical_end; |
683 | 843 | ||
684 | memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); | 844 | memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); |
685 | max_data->pid = tsk->pid; | 845 | max_data->pid = tsk->pid; |
686 | max_data->uid = task_uid(tsk); | 846 | /* |
847 | * If tsk == current, then use current_uid(), as that does not use | ||
848 | * RCU. The irq tracer can be called out of RCU scope. | ||
849 | */ | ||
850 | if (tsk == current) | ||
851 | max_data->uid = current_uid(); | ||
852 | else | ||
853 | max_data->uid = task_uid(tsk); | ||
854 | |||
687 | max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; | 855 | max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; |
688 | max_data->policy = tsk->policy; | 856 | max_data->policy = tsk->policy; |
689 | max_data->rt_priority = tsk->rt_priority; | 857 | max_data->rt_priority = tsk->rt_priority; |
@@ -704,23 +872,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
704 | void | 872 | void |
705 | update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | 873 | update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) |
706 | { | 874 | { |
707 | struct ring_buffer *buf = tr->buffer; | 875 | struct ring_buffer *buf; |
708 | 876 | ||
709 | if (trace_stop_count) | 877 | if (tr->stop_count) |
710 | return; | 878 | return; |
711 | 879 | ||
712 | WARN_ON_ONCE(!irqs_disabled()); | 880 | WARN_ON_ONCE(!irqs_disabled()); |
713 | 881 | ||
714 | if (!current_trace->allocated_snapshot) { | 882 | if (!tr->allocated_snapshot) { |
715 | /* Only the nop tracer should hit this when disabling */ | 883 | /* Only the nop tracer should hit this when disabling */ |
716 | WARN_ON_ONCE(current_trace != &nop_trace); | 884 | WARN_ON_ONCE(tr->current_trace != &nop_trace); |
717 | return; | 885 | return; |
718 | } | 886 | } |
719 | 887 | ||
720 | arch_spin_lock(&ftrace_max_lock); | 888 | arch_spin_lock(&ftrace_max_lock); |
721 | 889 | ||
722 | tr->buffer = max_tr.buffer; | 890 | buf = tr->trace_buffer.buffer; |
723 | max_tr.buffer = buf; | 891 | tr->trace_buffer.buffer = tr->max_buffer.buffer; |
892 | tr->max_buffer.buffer = buf; | ||
724 | 893 | ||
725 | __update_max_tr(tr, tsk, cpu); | 894 | __update_max_tr(tr, tsk, cpu); |
726 | arch_spin_unlock(&ftrace_max_lock); | 895 | arch_spin_unlock(&ftrace_max_lock); |
@@ -739,16 +908,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
739 | { | 908 | { |
740 | int ret; | 909 | int ret; |
741 | 910 | ||
742 | if (trace_stop_count) | 911 | if (tr->stop_count) |
743 | return; | 912 | return; |
744 | 913 | ||
745 | WARN_ON_ONCE(!irqs_disabled()); | 914 | WARN_ON_ONCE(!irqs_disabled()); |
746 | if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) | 915 | if (!tr->allocated_snapshot) { |
916 | /* Only the nop tracer should hit this when disabling */ | ||
917 | WARN_ON_ONCE(tr->current_trace != &nop_trace); | ||
747 | return; | 918 | return; |
919 | } | ||
748 | 920 | ||
749 | arch_spin_lock(&ftrace_max_lock); | 921 | arch_spin_lock(&ftrace_max_lock); |
750 | 922 | ||
751 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); | 923 | ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); |
752 | 924 | ||
753 | if (ret == -EBUSY) { | 925 | if (ret == -EBUSY) { |
754 | /* | 926 | /* |
@@ -757,7 +929,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
757 | * the max trace buffer (no one writes directly to it) | 929 | * the max trace buffer (no one writes directly to it) |
758 | * and flag that it failed. | 930 | * and flag that it failed. |
759 | */ | 931 | */ |
760 | trace_array_printk(&max_tr, _THIS_IP_, | 932 | trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, |
761 | "Failed to swap buffers due to commit in progress\n"); | 933 | "Failed to swap buffers due to commit in progress\n"); |
762 | } | 934 | } |
763 | 935 | ||
@@ -770,37 +942,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
770 | 942 | ||
771 | static void default_wait_pipe(struct trace_iterator *iter) | 943 | static void default_wait_pipe(struct trace_iterator *iter) |
772 | { | 944 | { |
773 | DEFINE_WAIT(wait); | 945 | /* Iterators are static, they should be filled or empty */ |
946 | if (trace_buffer_iter(iter, iter->cpu_file)) | ||
947 | return; | ||
948 | |||
949 | ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); | ||
950 | } | ||
951 | |||
952 | #ifdef CONFIG_FTRACE_STARTUP_TEST | ||
953 | static int run_tracer_selftest(struct tracer *type) | ||
954 | { | ||
955 | struct trace_array *tr = &global_trace; | ||
956 | struct tracer *saved_tracer = tr->current_trace; | ||
957 | int ret; | ||
774 | 958 | ||
775 | prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); | 959 | if (!type->selftest || tracing_selftest_disabled) |
960 | return 0; | ||
776 | 961 | ||
777 | /* | 962 | /* |
778 | * The events can happen in critical sections where | 963 | * Run a selftest on this tracer. |
779 | * checking a work queue can cause deadlocks. | 964 | * Here we reset the trace buffer, and set the current |
780 | * After adding a task to the queue, this flag is set | 965 | * tracer to be this tracer. The tracer can then run some |
781 | * only to notify events to try to wake up the queue | 966 | * internal tracing to verify that everything is in order. |
782 | * using irq_work. | 967 | * If we fail, we do not register this tracer. |
783 | * | ||
784 | * We don't clear it even if the buffer is no longer | ||
785 | * empty. The flag only causes the next event to run | ||
786 | * irq_work to do the work queue wake up. The worse | ||
787 | * that can happen if we race with !trace_empty() is that | ||
788 | * an event will cause an irq_work to try to wake up | ||
789 | * an empty queue. | ||
790 | * | ||
791 | * There's no reason to protect this flag either, as | ||
792 | * the work queue and irq_work logic will do the necessary | ||
793 | * synchronization for the wake ups. The only thing | ||
794 | * that is necessary is that the wake up happens after | ||
795 | * a task has been queued. It's OK for spurious wake ups. | ||
796 | */ | 968 | */ |
797 | trace_wakeup_needed = true; | 969 | tracing_reset_online_cpus(&tr->trace_buffer); |
798 | 970 | ||
799 | if (trace_empty(iter)) | 971 | tr->current_trace = type; |
800 | schedule(); | 972 | |
973 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
974 | if (type->use_max_tr) { | ||
975 | /* If we expanded the buffers, make sure the max is expanded too */ | ||
976 | if (ring_buffer_expanded) | ||
977 | ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, | ||
978 | RING_BUFFER_ALL_CPUS); | ||
979 | tr->allocated_snapshot = true; | ||
980 | } | ||
981 | #endif | ||
982 | |||
983 | /* the test is responsible for initializing and enabling */ | ||
984 | pr_info("Testing tracer %s: ", type->name); | ||
985 | ret = type->selftest(type, tr); | ||
986 | /* the test is responsible for resetting too */ | ||
987 | tr->current_trace = saved_tracer; | ||
988 | if (ret) { | ||
989 | printk(KERN_CONT "FAILED!\n"); | ||
990 | /* Add the warning after printing 'FAILED' */ | ||
991 | WARN_ON(1); | ||
992 | return -1; | ||
993 | } | ||
994 | /* Only reset on passing, to avoid touching corrupted buffers */ | ||
995 | tracing_reset_online_cpus(&tr->trace_buffer); | ||
996 | |||
997 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
998 | if (type->use_max_tr) { | ||
999 | tr->allocated_snapshot = false; | ||
1000 | |||
1001 | /* Shrink the max buffer again */ | ||
1002 | if (ring_buffer_expanded) | ||
1003 | ring_buffer_resize(tr->max_buffer.buffer, 1, | ||
1004 | RING_BUFFER_ALL_CPUS); | ||
1005 | } | ||
1006 | #endif | ||
801 | 1007 | ||
802 | finish_wait(&trace_wait, &wait); | 1008 | printk(KERN_CONT "PASSED\n"); |
1009 | return 0; | ||
803 | } | 1010 | } |
1011 | #else | ||
1012 | static inline int run_tracer_selftest(struct tracer *type) | ||
1013 | { | ||
1014 | return 0; | ||
1015 | } | ||
1016 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ | ||
804 | 1017 | ||
805 | /** | 1018 | /** |
806 | * register_tracer - register a tracer with the ftrace system. | 1019 | * register_tracer - register a tracer with the ftrace system. |
@@ -847,57 +1060,9 @@ int register_tracer(struct tracer *type) | |||
847 | if (!type->wait_pipe) | 1060 | if (!type->wait_pipe) |
848 | type->wait_pipe = default_wait_pipe; | 1061 | type->wait_pipe = default_wait_pipe; |
849 | 1062 | ||
850 | 1063 | ret = run_tracer_selftest(type); | |
851 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1064 | if (ret < 0) |
852 | if (type->selftest && !tracing_selftest_disabled) { | 1065 | goto out; |
853 | struct tracer *saved_tracer = current_trace; | ||
854 | struct trace_array *tr = &global_trace; | ||
855 | |||
856 | /* | ||
857 | * Run a selftest on this tracer. | ||
858 | * Here we reset the trace buffer, and set the current | ||
859 | * tracer to be this tracer. The tracer can then run some | ||
860 | * internal tracing to verify that everything is in order. | ||
861 | * If we fail, we do not register this tracer. | ||
862 | */ | ||
863 | tracing_reset_online_cpus(tr); | ||
864 | |||
865 | current_trace = type; | ||
866 | |||
867 | if (type->use_max_tr) { | ||
868 | /* If we expanded the buffers, make sure the max is expanded too */ | ||
869 | if (ring_buffer_expanded) | ||
870 | ring_buffer_resize(max_tr.buffer, trace_buf_size, | ||
871 | RING_BUFFER_ALL_CPUS); | ||
872 | type->allocated_snapshot = true; | ||
873 | } | ||
874 | |||
875 | /* the test is responsible for initializing and enabling */ | ||
876 | pr_info("Testing tracer %s: ", type->name); | ||
877 | ret = type->selftest(type, tr); | ||
878 | /* the test is responsible for resetting too */ | ||
879 | current_trace = saved_tracer; | ||
880 | if (ret) { | ||
881 | printk(KERN_CONT "FAILED!\n"); | ||
882 | /* Add the warning after printing 'FAILED' */ | ||
883 | WARN_ON(1); | ||
884 | goto out; | ||
885 | } | ||
886 | /* Only reset on passing, to avoid touching corrupted buffers */ | ||
887 | tracing_reset_online_cpus(tr); | ||
888 | |||
889 | if (type->use_max_tr) { | ||
890 | type->allocated_snapshot = false; | ||
891 | |||
892 | /* Shrink the max buffer again */ | ||
893 | if (ring_buffer_expanded) | ||
894 | ring_buffer_resize(max_tr.buffer, 1, | ||
895 | RING_BUFFER_ALL_CPUS); | ||
896 | } | ||
897 | |||
898 | printk(KERN_CONT "PASSED\n"); | ||
899 | } | ||
900 | #endif | ||
901 | 1066 | ||
902 | type->next = trace_types; | 1067 | type->next = trace_types; |
903 | trace_types = type; | 1068 | trace_types = type; |
@@ -917,7 +1082,7 @@ int register_tracer(struct tracer *type) | |||
917 | tracing_set_tracer(type->name); | 1082 | tracing_set_tracer(type->name); |
918 | default_bootup_tracer = NULL; | 1083 | default_bootup_tracer = NULL; |
919 | /* disable other selftests, since this will break it. */ | 1084 | /* disable other selftests, since this will break it. */ |
920 | tracing_selftest_disabled = 1; | 1085 | tracing_selftest_disabled = true; |
921 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1086 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
922 | printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", | 1087 | printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", |
923 | type->name); | 1088 | type->name); |
@@ -927,9 +1092,9 @@ int register_tracer(struct tracer *type) | |||
927 | return ret; | 1092 | return ret; |
928 | } | 1093 | } |
929 | 1094 | ||
930 | void tracing_reset(struct trace_array *tr, int cpu) | 1095 | void tracing_reset(struct trace_buffer *buf, int cpu) |
931 | { | 1096 | { |
932 | struct ring_buffer *buffer = tr->buffer; | 1097 | struct ring_buffer *buffer = buf->buffer; |
933 | 1098 | ||
934 | if (!buffer) | 1099 | if (!buffer) |
935 | return; | 1100 | return; |
@@ -943,9 +1108,9 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
943 | ring_buffer_record_enable(buffer); | 1108 | ring_buffer_record_enable(buffer); |
944 | } | 1109 | } |
945 | 1110 | ||
946 | void tracing_reset_online_cpus(struct trace_array *tr) | 1111 | void tracing_reset_online_cpus(struct trace_buffer *buf) |
947 | { | 1112 | { |
948 | struct ring_buffer *buffer = tr->buffer; | 1113 | struct ring_buffer *buffer = buf->buffer; |
949 | int cpu; | 1114 | int cpu; |
950 | 1115 | ||
951 | if (!buffer) | 1116 | if (!buffer) |
@@ -956,7 +1121,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
956 | /* Make sure all commits have finished */ | 1121 | /* Make sure all commits have finished */ |
957 | synchronize_sched(); | 1122 | synchronize_sched(); |
958 | 1123 | ||
959 | tr->time_start = ftrace_now(tr->cpu); | 1124 | buf->time_start = ftrace_now(buf->cpu); |
960 | 1125 | ||
961 | for_each_online_cpu(cpu) | 1126 | for_each_online_cpu(cpu) |
962 | ring_buffer_reset_cpu(buffer, cpu); | 1127 | ring_buffer_reset_cpu(buffer, cpu); |
@@ -966,12 +1131,21 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
966 | 1131 | ||
967 | void tracing_reset_current(int cpu) | 1132 | void tracing_reset_current(int cpu) |
968 | { | 1133 | { |
969 | tracing_reset(&global_trace, cpu); | 1134 | tracing_reset(&global_trace.trace_buffer, cpu); |
970 | } | 1135 | } |
971 | 1136 | ||
972 | void tracing_reset_current_online_cpus(void) | 1137 | void tracing_reset_all_online_cpus(void) |
973 | { | 1138 | { |
974 | tracing_reset_online_cpus(&global_trace); | 1139 | struct trace_array *tr; |
1140 | |||
1141 | mutex_lock(&trace_types_lock); | ||
1142 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
1143 | tracing_reset_online_cpus(&tr->trace_buffer); | ||
1144 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
1145 | tracing_reset_online_cpus(&tr->max_buffer); | ||
1146 | #endif | ||
1147 | } | ||
1148 | mutex_unlock(&trace_types_lock); | ||
975 | } | 1149 | } |
976 | 1150 | ||
977 | #define SAVED_CMDLINES 128 | 1151 | #define SAVED_CMDLINES 128 |
@@ -994,7 +1168,7 @@ static void trace_init_cmdlines(void) | |||
994 | 1168 | ||
995 | int is_tracing_stopped(void) | 1169 | int is_tracing_stopped(void) |
996 | { | 1170 | { |
997 | return trace_stop_count; | 1171 | return global_trace.stop_count; |
998 | } | 1172 | } |
999 | 1173 | ||
1000 | /** | 1174 | /** |
@@ -1026,12 +1200,12 @@ void tracing_start(void) | |||
1026 | if (tracing_disabled) | 1200 | if (tracing_disabled) |
1027 | return; | 1201 | return; |
1028 | 1202 | ||
1029 | raw_spin_lock_irqsave(&tracing_start_lock, flags); | 1203 | raw_spin_lock_irqsave(&global_trace.start_lock, flags); |
1030 | if (--trace_stop_count) { | 1204 | if (--global_trace.stop_count) { |
1031 | if (trace_stop_count < 0) { | 1205 | if (global_trace.stop_count < 0) { |
1032 | /* Someone screwed up their debugging */ | 1206 | /* Someone screwed up their debugging */ |
1033 | WARN_ON_ONCE(1); | 1207 | WARN_ON_ONCE(1); |
1034 | trace_stop_count = 0; | 1208 | global_trace.stop_count = 0; |
1035 | } | 1209 | } |
1036 | goto out; | 1210 | goto out; |
1037 | } | 1211 | } |
@@ -1039,19 +1213,52 @@ void tracing_start(void) | |||
1039 | /* Prevent the buffers from switching */ | 1213 | /* Prevent the buffers from switching */ |
1040 | arch_spin_lock(&ftrace_max_lock); | 1214 | arch_spin_lock(&ftrace_max_lock); |
1041 | 1215 | ||
1042 | buffer = global_trace.buffer; | 1216 | buffer = global_trace.trace_buffer.buffer; |
1043 | if (buffer) | 1217 | if (buffer) |
1044 | ring_buffer_record_enable(buffer); | 1218 | ring_buffer_record_enable(buffer); |
1045 | 1219 | ||
1046 | buffer = max_tr.buffer; | 1220 | #ifdef CONFIG_TRACER_MAX_TRACE |
1221 | buffer = global_trace.max_buffer.buffer; | ||
1047 | if (buffer) | 1222 | if (buffer) |
1048 | ring_buffer_record_enable(buffer); | 1223 | ring_buffer_record_enable(buffer); |
1224 | #endif | ||
1049 | 1225 | ||
1050 | arch_spin_unlock(&ftrace_max_lock); | 1226 | arch_spin_unlock(&ftrace_max_lock); |
1051 | 1227 | ||
1052 | ftrace_start(); | 1228 | ftrace_start(); |
1053 | out: | 1229 | out: |
1054 | raw_spin_unlock_irqrestore(&tracing_start_lock, flags); | 1230 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); |
1231 | } | ||
1232 | |||
1233 | static void tracing_start_tr(struct trace_array *tr) | ||
1234 | { | ||
1235 | struct ring_buffer *buffer; | ||
1236 | unsigned long flags; | ||
1237 | |||
1238 | if (tracing_disabled) | ||
1239 | return; | ||
1240 | |||
1241 | /* If global, we need to also start the max tracer */ | ||
1242 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
1243 | return tracing_start(); | ||
1244 | |||
1245 | raw_spin_lock_irqsave(&tr->start_lock, flags); | ||
1246 | |||
1247 | if (--tr->stop_count) { | ||
1248 | if (tr->stop_count < 0) { | ||
1249 | /* Someone screwed up their debugging */ | ||
1250 | WARN_ON_ONCE(1); | ||
1251 | tr->stop_count = 0; | ||
1252 | } | ||
1253 | goto out; | ||
1254 | } | ||
1255 | |||
1256 | buffer = tr->trace_buffer.buffer; | ||
1257 | if (buffer) | ||
1258 | ring_buffer_record_enable(buffer); | ||
1259 | |||
1260 | out: | ||
1261 | raw_spin_unlock_irqrestore(&tr->start_lock, flags); | ||
1055 | } | 1262 | } |
1056 | 1263 | ||
1057 | /** | 1264 | /** |
@@ -1066,25 +1273,48 @@ void tracing_stop(void) | |||
1066 | unsigned long flags; | 1273 | unsigned long flags; |
1067 | 1274 | ||
1068 | ftrace_stop(); | 1275 | ftrace_stop(); |
1069 | raw_spin_lock_irqsave(&tracing_start_lock, flags); | 1276 | raw_spin_lock_irqsave(&global_trace.start_lock, flags); |
1070 | if (trace_stop_count++) | 1277 | if (global_trace.stop_count++) |
1071 | goto out; | 1278 | goto out; |
1072 | 1279 | ||
1073 | /* Prevent the buffers from switching */ | 1280 | /* Prevent the buffers from switching */ |
1074 | arch_spin_lock(&ftrace_max_lock); | 1281 | arch_spin_lock(&ftrace_max_lock); |
1075 | 1282 | ||
1076 | buffer = global_trace.buffer; | 1283 | buffer = global_trace.trace_buffer.buffer; |
1077 | if (buffer) | 1284 | if (buffer) |
1078 | ring_buffer_record_disable(buffer); | 1285 | ring_buffer_record_disable(buffer); |
1079 | 1286 | ||
1080 | buffer = max_tr.buffer; | 1287 | #ifdef CONFIG_TRACER_MAX_TRACE |
1288 | buffer = global_trace.max_buffer.buffer; | ||
1081 | if (buffer) | 1289 | if (buffer) |
1082 | ring_buffer_record_disable(buffer); | 1290 | ring_buffer_record_disable(buffer); |
1291 | #endif | ||
1083 | 1292 | ||
1084 | arch_spin_unlock(&ftrace_max_lock); | 1293 | arch_spin_unlock(&ftrace_max_lock); |
1085 | 1294 | ||
1086 | out: | 1295 | out: |
1087 | raw_spin_unlock_irqrestore(&tracing_start_lock, flags); | 1296 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); |
1297 | } | ||
1298 | |||
1299 | static void tracing_stop_tr(struct trace_array *tr) | ||
1300 | { | ||
1301 | struct ring_buffer *buffer; | ||
1302 | unsigned long flags; | ||
1303 | |||
1304 | /* If global, we need to also stop the max tracer */ | ||
1305 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
1306 | return tracing_stop(); | ||
1307 | |||
1308 | raw_spin_lock_irqsave(&tr->start_lock, flags); | ||
1309 | if (tr->stop_count++) | ||
1310 | goto out; | ||
1311 | |||
1312 | buffer = tr->trace_buffer.buffer; | ||
1313 | if (buffer) | ||
1314 | ring_buffer_record_disable(buffer); | ||
1315 | |||
1316 | out: | ||
1317 | raw_spin_unlock_irqrestore(&tr->start_lock, flags); | ||
1088 | } | 1318 | } |
1089 | 1319 | ||
1090 | void trace_stop_cmdline_recording(void); | 1320 | void trace_stop_cmdline_recording(void); |
@@ -1217,11 +1447,6 @@ void | |||
1217 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) | 1447 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) |
1218 | { | 1448 | { |
1219 | __this_cpu_write(trace_cmdline_save, true); | 1449 | __this_cpu_write(trace_cmdline_save, true); |
1220 | if (trace_wakeup_needed) { | ||
1221 | trace_wakeup_needed = false; | ||
1222 | /* irq_work_queue() supplies it's own memory barriers */ | ||
1223 | irq_work_queue(&trace_work_wakeup); | ||
1224 | } | ||
1225 | ring_buffer_unlock_commit(buffer, event); | 1450 | ring_buffer_unlock_commit(buffer, event); |
1226 | } | 1451 | } |
1227 | 1452 | ||
@@ -1245,11 +1470,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, | |||
1245 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); | 1470 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); |
1246 | 1471 | ||
1247 | struct ring_buffer_event * | 1472 | struct ring_buffer_event * |
1473 | trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, | ||
1474 | struct ftrace_event_file *ftrace_file, | ||
1475 | int type, unsigned long len, | ||
1476 | unsigned long flags, int pc) | ||
1477 | { | ||
1478 | *current_rb = ftrace_file->tr->trace_buffer.buffer; | ||
1479 | return trace_buffer_lock_reserve(*current_rb, | ||
1480 | type, len, flags, pc); | ||
1481 | } | ||
1482 | EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); | ||
1483 | |||
1484 | struct ring_buffer_event * | ||
1248 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, | 1485 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, |
1249 | int type, unsigned long len, | 1486 | int type, unsigned long len, |
1250 | unsigned long flags, int pc) | 1487 | unsigned long flags, int pc) |
1251 | { | 1488 | { |
1252 | *current_rb = global_trace.buffer; | 1489 | *current_rb = global_trace.trace_buffer.buffer; |
1253 | return trace_buffer_lock_reserve(*current_rb, | 1490 | return trace_buffer_lock_reserve(*current_rb, |
1254 | type, len, flags, pc); | 1491 | type, len, flags, pc); |
1255 | } | 1492 | } |
@@ -1288,7 +1525,7 @@ trace_function(struct trace_array *tr, | |||
1288 | int pc) | 1525 | int pc) |
1289 | { | 1526 | { |
1290 | struct ftrace_event_call *call = &event_function; | 1527 | struct ftrace_event_call *call = &event_function; |
1291 | struct ring_buffer *buffer = tr->buffer; | 1528 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
1292 | struct ring_buffer_event *event; | 1529 | struct ring_buffer_event *event; |
1293 | struct ftrace_entry *entry; | 1530 | struct ftrace_entry *entry; |
1294 | 1531 | ||
@@ -1429,13 +1666,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, | |||
1429 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, | 1666 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, |
1430 | int pc) | 1667 | int pc) |
1431 | { | 1668 | { |
1432 | __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); | 1669 | __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); |
1433 | } | 1670 | } |
1434 | 1671 | ||
1435 | /** | 1672 | /** |
1436 | * trace_dump_stack - record a stack back trace in the trace buffer | 1673 | * trace_dump_stack - record a stack back trace in the trace buffer |
1674 | * @skip: Number of functions to skip (helper handlers) | ||
1437 | */ | 1675 | */ |
1438 | void trace_dump_stack(void) | 1676 | void trace_dump_stack(int skip) |
1439 | { | 1677 | { |
1440 | unsigned long flags; | 1678 | unsigned long flags; |
1441 | 1679 | ||
@@ -1444,8 +1682,13 @@ void trace_dump_stack(void) | |||
1444 | 1682 | ||
1445 | local_save_flags(flags); | 1683 | local_save_flags(flags); |
1446 | 1684 | ||
1447 | /* skipping 3 traces, seems to get us at the caller of this function */ | 1685 | /* |
1448 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); | 1686 | * Skip 3 more, seems to get us at the caller of |
1687 | * this function. | ||
1688 | */ | ||
1689 | skip += 3; | ||
1690 | __ftrace_trace_stack(global_trace.trace_buffer.buffer, | ||
1691 | flags, skip, preempt_count(), NULL); | ||
1449 | } | 1692 | } |
1450 | 1693 | ||
1451 | static DEFINE_PER_CPU(int, user_stack_count); | 1694 | static DEFINE_PER_CPU(int, user_stack_count); |
@@ -1615,7 +1858,7 @@ void trace_printk_init_buffers(void) | |||
1615 | * directly here. If the global_trace.buffer is already | 1858 | * directly here. If the global_trace.buffer is already |
1616 | * allocated here, then this was called by module code. | 1859 | * allocated here, then this was called by module code. |
1617 | */ | 1860 | */ |
1618 | if (global_trace.buffer) | 1861 | if (global_trace.trace_buffer.buffer) |
1619 | tracing_start_cmdline_record(); | 1862 | tracing_start_cmdline_record(); |
1620 | } | 1863 | } |
1621 | 1864 | ||
@@ -1675,7 +1918,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1675 | 1918 | ||
1676 | local_save_flags(flags); | 1919 | local_save_flags(flags); |
1677 | size = sizeof(*entry) + sizeof(u32) * len; | 1920 | size = sizeof(*entry) + sizeof(u32) * len; |
1678 | buffer = tr->buffer; | 1921 | buffer = tr->trace_buffer.buffer; |
1679 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, | 1922 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, |
1680 | flags, pc); | 1923 | flags, pc); |
1681 | if (!event) | 1924 | if (!event) |
@@ -1698,27 +1941,12 @@ out: | |||
1698 | } | 1941 | } |
1699 | EXPORT_SYMBOL_GPL(trace_vbprintk); | 1942 | EXPORT_SYMBOL_GPL(trace_vbprintk); |
1700 | 1943 | ||
1701 | int trace_array_printk(struct trace_array *tr, | 1944 | static int |
1702 | unsigned long ip, const char *fmt, ...) | 1945 | __trace_array_vprintk(struct ring_buffer *buffer, |
1703 | { | 1946 | unsigned long ip, const char *fmt, va_list args) |
1704 | int ret; | ||
1705 | va_list ap; | ||
1706 | |||
1707 | if (!(trace_flags & TRACE_ITER_PRINTK)) | ||
1708 | return 0; | ||
1709 | |||
1710 | va_start(ap, fmt); | ||
1711 | ret = trace_array_vprintk(tr, ip, fmt, ap); | ||
1712 | va_end(ap); | ||
1713 | return ret; | ||
1714 | } | ||
1715 | |||
1716 | int trace_array_vprintk(struct trace_array *tr, | ||
1717 | unsigned long ip, const char *fmt, va_list args) | ||
1718 | { | 1947 | { |
1719 | struct ftrace_event_call *call = &event_print; | 1948 | struct ftrace_event_call *call = &event_print; |
1720 | struct ring_buffer_event *event; | 1949 | struct ring_buffer_event *event; |
1721 | struct ring_buffer *buffer; | ||
1722 | int len = 0, size, pc; | 1950 | int len = 0, size, pc; |
1723 | struct print_entry *entry; | 1951 | struct print_entry *entry; |
1724 | unsigned long flags; | 1952 | unsigned long flags; |
@@ -1746,7 +1974,6 @@ int trace_array_vprintk(struct trace_array *tr, | |||
1746 | 1974 | ||
1747 | local_save_flags(flags); | 1975 | local_save_flags(flags); |
1748 | size = sizeof(*entry) + len + 1; | 1976 | size = sizeof(*entry) + len + 1; |
1749 | buffer = tr->buffer; | ||
1750 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 1977 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
1751 | flags, pc); | 1978 | flags, pc); |
1752 | if (!event) | 1979 | if (!event) |
@@ -1767,6 +1994,42 @@ int trace_array_vprintk(struct trace_array *tr, | |||
1767 | return len; | 1994 | return len; |
1768 | } | 1995 | } |
1769 | 1996 | ||
1997 | int trace_array_vprintk(struct trace_array *tr, | ||
1998 | unsigned long ip, const char *fmt, va_list args) | ||
1999 | { | ||
2000 | return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); | ||
2001 | } | ||
2002 | |||
2003 | int trace_array_printk(struct trace_array *tr, | ||
2004 | unsigned long ip, const char *fmt, ...) | ||
2005 | { | ||
2006 | int ret; | ||
2007 | va_list ap; | ||
2008 | |||
2009 | if (!(trace_flags & TRACE_ITER_PRINTK)) | ||
2010 | return 0; | ||
2011 | |||
2012 | va_start(ap, fmt); | ||
2013 | ret = trace_array_vprintk(tr, ip, fmt, ap); | ||
2014 | va_end(ap); | ||
2015 | return ret; | ||
2016 | } | ||
2017 | |||
2018 | int trace_array_printk_buf(struct ring_buffer *buffer, | ||
2019 | unsigned long ip, const char *fmt, ...) | ||
2020 | { | ||
2021 | int ret; | ||
2022 | va_list ap; | ||
2023 | |||
2024 | if (!(trace_flags & TRACE_ITER_PRINTK)) | ||
2025 | return 0; | ||
2026 | |||
2027 | va_start(ap, fmt); | ||
2028 | ret = __trace_array_vprintk(buffer, ip, fmt, ap); | ||
2029 | va_end(ap); | ||
2030 | return ret; | ||
2031 | } | ||
2032 | |||
1770 | int trace_vprintk(unsigned long ip, const char *fmt, va_list args) | 2033 | int trace_vprintk(unsigned long ip, const char *fmt, va_list args) |
1771 | { | 2034 | { |
1772 | return trace_array_vprintk(&global_trace, ip, fmt, args); | 2035 | return trace_array_vprintk(&global_trace, ip, fmt, args); |
@@ -1792,7 +2055,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1792 | if (buf_iter) | 2055 | if (buf_iter) |
1793 | event = ring_buffer_iter_peek(buf_iter, ts); | 2056 | event = ring_buffer_iter_peek(buf_iter, ts); |
1794 | else | 2057 | else |
1795 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, | 2058 | event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, |
1796 | lost_events); | 2059 | lost_events); |
1797 | 2060 | ||
1798 | if (event) { | 2061 | if (event) { |
@@ -1807,7 +2070,7 @@ static struct trace_entry * | |||
1807 | __find_next_entry(struct trace_iterator *iter, int *ent_cpu, | 2070 | __find_next_entry(struct trace_iterator *iter, int *ent_cpu, |
1808 | unsigned long *missing_events, u64 *ent_ts) | 2071 | unsigned long *missing_events, u64 *ent_ts) |
1809 | { | 2072 | { |
1810 | struct ring_buffer *buffer = iter->tr->buffer; | 2073 | struct ring_buffer *buffer = iter->trace_buffer->buffer; |
1811 | struct trace_entry *ent, *next = NULL; | 2074 | struct trace_entry *ent, *next = NULL; |
1812 | unsigned long lost_events = 0, next_lost = 0; | 2075 | unsigned long lost_events = 0, next_lost = 0; |
1813 | int cpu_file = iter->cpu_file; | 2076 | int cpu_file = iter->cpu_file; |
@@ -1820,7 +2083,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, | |||
1820 | * If we are in a per_cpu trace file, don't bother by iterating over | 2083 | * If we are in a per_cpu trace file, don't bother by iterating over |
1821 | * all cpu and peek directly. | 2084 | * all cpu and peek directly. |
1822 | */ | 2085 | */ |
1823 | if (cpu_file > TRACE_PIPE_ALL_CPU) { | 2086 | if (cpu_file > RING_BUFFER_ALL_CPUS) { |
1824 | if (ring_buffer_empty_cpu(buffer, cpu_file)) | 2087 | if (ring_buffer_empty_cpu(buffer, cpu_file)) |
1825 | return NULL; | 2088 | return NULL; |
1826 | ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); | 2089 | ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); |
@@ -1884,7 +2147,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) | |||
1884 | 2147 | ||
1885 | static void trace_consume(struct trace_iterator *iter) | 2148 | static void trace_consume(struct trace_iterator *iter) |
1886 | { | 2149 | { |
1887 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, | 2150 | ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, |
1888 | &iter->lost_events); | 2151 | &iter->lost_events); |
1889 | } | 2152 | } |
1890 | 2153 | ||
@@ -1917,13 +2180,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) | |||
1917 | 2180 | ||
1918 | void tracing_iter_reset(struct trace_iterator *iter, int cpu) | 2181 | void tracing_iter_reset(struct trace_iterator *iter, int cpu) |
1919 | { | 2182 | { |
1920 | struct trace_array *tr = iter->tr; | ||
1921 | struct ring_buffer_event *event; | 2183 | struct ring_buffer_event *event; |
1922 | struct ring_buffer_iter *buf_iter; | 2184 | struct ring_buffer_iter *buf_iter; |
1923 | unsigned long entries = 0; | 2185 | unsigned long entries = 0; |
1924 | u64 ts; | 2186 | u64 ts; |
1925 | 2187 | ||
1926 | tr->data[cpu]->skipped_entries = 0; | 2188 | per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; |
1927 | 2189 | ||
1928 | buf_iter = trace_buffer_iter(iter, cpu); | 2190 | buf_iter = trace_buffer_iter(iter, cpu); |
1929 | if (!buf_iter) | 2191 | if (!buf_iter) |
@@ -1937,13 +2199,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
1937 | * by the timestamp being before the start of the buffer. | 2199 | * by the timestamp being before the start of the buffer. |
1938 | */ | 2200 | */ |
1939 | while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { | 2201 | while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { |
1940 | if (ts >= iter->tr->time_start) | 2202 | if (ts >= iter->trace_buffer->time_start) |
1941 | break; | 2203 | break; |
1942 | entries++; | 2204 | entries++; |
1943 | ring_buffer_read(buf_iter, NULL); | 2205 | ring_buffer_read(buf_iter, NULL); |
1944 | } | 2206 | } |
1945 | 2207 | ||
1946 | tr->data[cpu]->skipped_entries = entries; | 2208 | per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; |
1947 | } | 2209 | } |
1948 | 2210 | ||
1949 | /* | 2211 | /* |
@@ -1953,6 +2215,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
1953 | static void *s_start(struct seq_file *m, loff_t *pos) | 2215 | static void *s_start(struct seq_file *m, loff_t *pos) |
1954 | { | 2216 | { |
1955 | struct trace_iterator *iter = m->private; | 2217 | struct trace_iterator *iter = m->private; |
2218 | struct trace_array *tr = iter->tr; | ||
1956 | int cpu_file = iter->cpu_file; | 2219 | int cpu_file = iter->cpu_file; |
1957 | void *p = NULL; | 2220 | void *p = NULL; |
1958 | loff_t l = 0; | 2221 | loff_t l = 0; |
@@ -1965,12 +2228,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1965 | * will point to the same string as current_trace->name. | 2228 | * will point to the same string as current_trace->name. |
1966 | */ | 2229 | */ |
1967 | mutex_lock(&trace_types_lock); | 2230 | mutex_lock(&trace_types_lock); |
1968 | if (unlikely(current_trace && iter->trace->name != current_trace->name)) | 2231 | if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) |
1969 | *iter->trace = *current_trace; | 2232 | *iter->trace = *tr->current_trace; |
1970 | mutex_unlock(&trace_types_lock); | 2233 | mutex_unlock(&trace_types_lock); |
1971 | 2234 | ||
2235 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
1972 | if (iter->snapshot && iter->trace->use_max_tr) | 2236 | if (iter->snapshot && iter->trace->use_max_tr) |
1973 | return ERR_PTR(-EBUSY); | 2237 | return ERR_PTR(-EBUSY); |
2238 | #endif | ||
1974 | 2239 | ||
1975 | if (!iter->snapshot) | 2240 | if (!iter->snapshot) |
1976 | atomic_inc(&trace_record_cmdline_disabled); | 2241 | atomic_inc(&trace_record_cmdline_disabled); |
@@ -1980,7 +2245,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1980 | iter->cpu = 0; | 2245 | iter->cpu = 0; |
1981 | iter->idx = -1; | 2246 | iter->idx = -1; |
1982 | 2247 | ||
1983 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | 2248 | if (cpu_file == RING_BUFFER_ALL_CPUS) { |
1984 | for_each_tracing_cpu(cpu) | 2249 | for_each_tracing_cpu(cpu) |
1985 | tracing_iter_reset(iter, cpu); | 2250 | tracing_iter_reset(iter, cpu); |
1986 | } else | 2251 | } else |
@@ -2012,17 +2277,21 @@ static void s_stop(struct seq_file *m, void *p) | |||
2012 | { | 2277 | { |
2013 | struct trace_iterator *iter = m->private; | 2278 | struct trace_iterator *iter = m->private; |
2014 | 2279 | ||
2280 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
2015 | if (iter->snapshot && iter->trace->use_max_tr) | 2281 | if (iter->snapshot && iter->trace->use_max_tr) |
2016 | return; | 2282 | return; |
2283 | #endif | ||
2017 | 2284 | ||
2018 | if (!iter->snapshot) | 2285 | if (!iter->snapshot) |
2019 | atomic_dec(&trace_record_cmdline_disabled); | 2286 | atomic_dec(&trace_record_cmdline_disabled); |
2287 | |||
2020 | trace_access_unlock(iter->cpu_file); | 2288 | trace_access_unlock(iter->cpu_file); |
2021 | trace_event_read_unlock(); | 2289 | trace_event_read_unlock(); |
2022 | } | 2290 | } |
2023 | 2291 | ||
2024 | static void | 2292 | static void |
2025 | get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) | 2293 | get_total_entries(struct trace_buffer *buf, |
2294 | unsigned long *total, unsigned long *entries) | ||
2026 | { | 2295 | { |
2027 | unsigned long count; | 2296 | unsigned long count; |
2028 | int cpu; | 2297 | int cpu; |
@@ -2031,19 +2300,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e | |||
2031 | *entries = 0; | 2300 | *entries = 0; |
2032 | 2301 | ||
2033 | for_each_tracing_cpu(cpu) { | 2302 | for_each_tracing_cpu(cpu) { |
2034 | count = ring_buffer_entries_cpu(tr->buffer, cpu); | 2303 | count = ring_buffer_entries_cpu(buf->buffer, cpu); |
2035 | /* | 2304 | /* |
2036 | * If this buffer has skipped entries, then we hold all | 2305 | * If this buffer has skipped entries, then we hold all |
2037 | * entries for the trace and we need to ignore the | 2306 | * entries for the trace and we need to ignore the |
2038 | * ones before the time stamp. | 2307 | * ones before the time stamp. |
2039 | */ | 2308 | */ |
2040 | if (tr->data[cpu]->skipped_entries) { | 2309 | if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { |
2041 | count -= tr->data[cpu]->skipped_entries; | 2310 | count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; |
2042 | /* total is the same as the entries */ | 2311 | /* total is the same as the entries */ |
2043 | *total += count; | 2312 | *total += count; |
2044 | } else | 2313 | } else |
2045 | *total += count + | 2314 | *total += count + |
2046 | ring_buffer_overrun_cpu(tr->buffer, cpu); | 2315 | ring_buffer_overrun_cpu(buf->buffer, cpu); |
2047 | *entries += count; | 2316 | *entries += count; |
2048 | } | 2317 | } |
2049 | } | 2318 | } |
@@ -2060,27 +2329,27 @@ static void print_lat_help_header(struct seq_file *m) | |||
2060 | seq_puts(m, "# \\ / ||||| \\ | / \n"); | 2329 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
2061 | } | 2330 | } |
2062 | 2331 | ||
2063 | static void print_event_info(struct trace_array *tr, struct seq_file *m) | 2332 | static void print_event_info(struct trace_buffer *buf, struct seq_file *m) |
2064 | { | 2333 | { |
2065 | unsigned long total; | 2334 | unsigned long total; |
2066 | unsigned long entries; | 2335 | unsigned long entries; |
2067 | 2336 | ||
2068 | get_total_entries(tr, &total, &entries); | 2337 | get_total_entries(buf, &total, &entries); |
2069 | seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", | 2338 | seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", |
2070 | entries, total, num_online_cpus()); | 2339 | entries, total, num_online_cpus()); |
2071 | seq_puts(m, "#\n"); | 2340 | seq_puts(m, "#\n"); |
2072 | } | 2341 | } |
2073 | 2342 | ||
2074 | static void print_func_help_header(struct trace_array *tr, struct seq_file *m) | 2343 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) |
2075 | { | 2344 | { |
2076 | print_event_info(tr, m); | 2345 | print_event_info(buf, m); |
2077 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | 2346 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); |
2078 | seq_puts(m, "# | | | | |\n"); | 2347 | seq_puts(m, "# | | | | |\n"); |
2079 | } | 2348 | } |
2080 | 2349 | ||
2081 | static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) | 2350 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) |
2082 | { | 2351 | { |
2083 | print_event_info(tr, m); | 2352 | print_event_info(buf, m); |
2084 | seq_puts(m, "# _-----=> irqs-off\n"); | 2353 | seq_puts(m, "# _-----=> irqs-off\n"); |
2085 | seq_puts(m, "# / _----=> need-resched\n"); | 2354 | seq_puts(m, "# / _----=> need-resched\n"); |
2086 | seq_puts(m, "# | / _---=> hardirq/softirq\n"); | 2355 | seq_puts(m, "# | / _---=> hardirq/softirq\n"); |
@@ -2094,16 +2363,16 @@ void | |||
2094 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) | 2363 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) |
2095 | { | 2364 | { |
2096 | unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); | 2365 | unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); |
2097 | struct trace_array *tr = iter->tr; | 2366 | struct trace_buffer *buf = iter->trace_buffer; |
2098 | struct trace_array_cpu *data = tr->data[tr->cpu]; | 2367 | struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); |
2099 | struct tracer *type = current_trace; | 2368 | struct tracer *type = iter->trace; |
2100 | unsigned long entries; | 2369 | unsigned long entries; |
2101 | unsigned long total; | 2370 | unsigned long total; |
2102 | const char *name = "preemption"; | 2371 | const char *name = "preemption"; |
2103 | 2372 | ||
2104 | name = type->name; | 2373 | name = type->name; |
2105 | 2374 | ||
2106 | get_total_entries(tr, &total, &entries); | 2375 | get_total_entries(buf, &total, &entries); |
2107 | 2376 | ||
2108 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", | 2377 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", |
2109 | name, UTS_RELEASE); | 2378 | name, UTS_RELEASE); |
@@ -2114,7 +2383,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) | |||
2114 | nsecs_to_usecs(data->saved_latency), | 2383 | nsecs_to_usecs(data->saved_latency), |
2115 | entries, | 2384 | entries, |
2116 | total, | 2385 | total, |
2117 | tr->cpu, | 2386 | buf->cpu, |
2118 | #if defined(CONFIG_PREEMPT_NONE) | 2387 | #if defined(CONFIG_PREEMPT_NONE) |
2119 | "server", | 2388 | "server", |
2120 | #elif defined(CONFIG_PREEMPT_VOLUNTARY) | 2389 | #elif defined(CONFIG_PREEMPT_VOLUNTARY) |
@@ -2165,7 +2434,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) | |||
2165 | if (cpumask_test_cpu(iter->cpu, iter->started)) | 2434 | if (cpumask_test_cpu(iter->cpu, iter->started)) |
2166 | return; | 2435 | return; |
2167 | 2436 | ||
2168 | if (iter->tr->data[iter->cpu]->skipped_entries) | 2437 | if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) |
2169 | return; | 2438 | return; |
2170 | 2439 | ||
2171 | cpumask_set_cpu(iter->cpu, iter->started); | 2440 | cpumask_set_cpu(iter->cpu, iter->started); |
@@ -2288,14 +2557,14 @@ int trace_empty(struct trace_iterator *iter) | |||
2288 | int cpu; | 2557 | int cpu; |
2289 | 2558 | ||
2290 | /* If we are looking at one CPU buffer, only check that one */ | 2559 | /* If we are looking at one CPU buffer, only check that one */ |
2291 | if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { | 2560 | if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { |
2292 | cpu = iter->cpu_file; | 2561 | cpu = iter->cpu_file; |
2293 | buf_iter = trace_buffer_iter(iter, cpu); | 2562 | buf_iter = trace_buffer_iter(iter, cpu); |
2294 | if (buf_iter) { | 2563 | if (buf_iter) { |
2295 | if (!ring_buffer_iter_empty(buf_iter)) | 2564 | if (!ring_buffer_iter_empty(buf_iter)) |
2296 | return 0; | 2565 | return 0; |
2297 | } else { | 2566 | } else { |
2298 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) | 2567 | if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) |
2299 | return 0; | 2568 | return 0; |
2300 | } | 2569 | } |
2301 | return 1; | 2570 | return 1; |
@@ -2307,7 +2576,7 @@ int trace_empty(struct trace_iterator *iter) | |||
2307 | if (!ring_buffer_iter_empty(buf_iter)) | 2576 | if (!ring_buffer_iter_empty(buf_iter)) |
2308 | return 0; | 2577 | return 0; |
2309 | } else { | 2578 | } else { |
2310 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) | 2579 | if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) |
2311 | return 0; | 2580 | return 0; |
2312 | } | 2581 | } |
2313 | } | 2582 | } |
@@ -2331,6 +2600,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
2331 | return ret; | 2600 | return ret; |
2332 | } | 2601 | } |
2333 | 2602 | ||
2603 | if (iter->ent->type == TRACE_BPUTS && | ||
2604 | trace_flags & TRACE_ITER_PRINTK && | ||
2605 | trace_flags & TRACE_ITER_PRINTK_MSGONLY) | ||
2606 | return trace_print_bputs_msg_only(iter); | ||
2607 | |||
2334 | if (iter->ent->type == TRACE_BPRINT && | 2608 | if (iter->ent->type == TRACE_BPRINT && |
2335 | trace_flags & TRACE_ITER_PRINTK && | 2609 | trace_flags & TRACE_ITER_PRINTK && |
2336 | trace_flags & TRACE_ITER_PRINTK_MSGONLY) | 2610 | trace_flags & TRACE_ITER_PRINTK_MSGONLY) |
@@ -2385,9 +2659,9 @@ void trace_default_header(struct seq_file *m) | |||
2385 | } else { | 2659 | } else { |
2386 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { | 2660 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { |
2387 | if (trace_flags & TRACE_ITER_IRQ_INFO) | 2661 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
2388 | print_func_help_header_irq(iter->tr, m); | 2662 | print_func_help_header_irq(iter->trace_buffer, m); |
2389 | else | 2663 | else |
2390 | print_func_help_header(iter->tr, m); | 2664 | print_func_help_header(iter->trace_buffer, m); |
2391 | } | 2665 | } |
2392 | } | 2666 | } |
2393 | } | 2667 | } |
@@ -2401,14 +2675,8 @@ static void test_ftrace_alive(struct seq_file *m) | |||
2401 | } | 2675 | } |
2402 | 2676 | ||
2403 | #ifdef CONFIG_TRACER_MAX_TRACE | 2677 | #ifdef CONFIG_TRACER_MAX_TRACE |
2404 | static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) | 2678 | static void show_snapshot_main_help(struct seq_file *m) |
2405 | { | 2679 | { |
2406 | if (iter->trace->allocated_snapshot) | ||
2407 | seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); | ||
2408 | else | ||
2409 | seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); | ||
2410 | |||
2411 | seq_printf(m, "# Snapshot commands:\n"); | ||
2412 | seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); | 2680 | seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); |
2413 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); | 2681 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); |
2414 | seq_printf(m, "# Takes a snapshot of the main buffer.\n"); | 2682 | seq_printf(m, "# Takes a snapshot of the main buffer.\n"); |
@@ -2416,6 +2684,35 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) | |||
2416 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); | 2684 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); |
2417 | seq_printf(m, "# is not a '0' or '1')\n"); | 2685 | seq_printf(m, "# is not a '0' or '1')\n"); |
2418 | } | 2686 | } |
2687 | |||
2688 | static void show_snapshot_percpu_help(struct seq_file *m) | ||
2689 | { | ||
2690 | seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); | ||
2691 | #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP | ||
2692 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); | ||
2693 | seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); | ||
2694 | #else | ||
2695 | seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); | ||
2696 | seq_printf(m, "# Must use main snapshot file to allocate.\n"); | ||
2697 | #endif | ||
2698 | seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); | ||
2699 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); | ||
2700 | seq_printf(m, "# is not a '0' or '1')\n"); | ||
2701 | } | ||
2702 | |||
2703 | static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) | ||
2704 | { | ||
2705 | if (iter->tr->allocated_snapshot) | ||
2706 | seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); | ||
2707 | else | ||
2708 | seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); | ||
2709 | |||
2710 | seq_printf(m, "# Snapshot commands:\n"); | ||
2711 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) | ||
2712 | show_snapshot_main_help(m); | ||
2713 | else | ||
2714 | show_snapshot_percpu_help(m); | ||
2715 | } | ||
2419 | #else | 2716 | #else |
2420 | /* Should never be called */ | 2717 | /* Should never be called */ |
2421 | static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } | 2718 | static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } |
@@ -2475,7 +2772,8 @@ static const struct seq_operations tracer_seq_ops = { | |||
2475 | static struct trace_iterator * | 2772 | static struct trace_iterator * |
2476 | __tracing_open(struct inode *inode, struct file *file, bool snapshot) | 2773 | __tracing_open(struct inode *inode, struct file *file, bool snapshot) |
2477 | { | 2774 | { |
2478 | long cpu_file = (long) inode->i_private; | 2775 | struct trace_cpu *tc = inode->i_private; |
2776 | struct trace_array *tr = tc->tr; | ||
2479 | struct trace_iterator *iter; | 2777 | struct trace_iterator *iter; |
2480 | int cpu; | 2778 | int cpu; |
2481 | 2779 | ||
@@ -2500,26 +2798,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) | |||
2500 | if (!iter->trace) | 2798 | if (!iter->trace) |
2501 | goto fail; | 2799 | goto fail; |
2502 | 2800 | ||
2503 | *iter->trace = *current_trace; | 2801 | *iter->trace = *tr->current_trace; |
2504 | 2802 | ||
2505 | if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) | 2803 | if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) |
2506 | goto fail; | 2804 | goto fail; |
2507 | 2805 | ||
2508 | if (current_trace->print_max || snapshot) | 2806 | iter->tr = tr; |
2509 | iter->tr = &max_tr; | 2807 | |
2808 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
2809 | /* Currently only the top directory has a snapshot */ | ||
2810 | if (tr->current_trace->print_max || snapshot) | ||
2811 | iter->trace_buffer = &tr->max_buffer; | ||
2510 | else | 2812 | else |
2511 | iter->tr = &global_trace; | 2813 | #endif |
2814 | iter->trace_buffer = &tr->trace_buffer; | ||
2512 | iter->snapshot = snapshot; | 2815 | iter->snapshot = snapshot; |
2513 | iter->pos = -1; | 2816 | iter->pos = -1; |
2514 | mutex_init(&iter->mutex); | 2817 | mutex_init(&iter->mutex); |
2515 | iter->cpu_file = cpu_file; | 2818 | iter->cpu_file = tc->cpu; |
2516 | 2819 | ||
2517 | /* Notify the tracer early; before we stop tracing. */ | 2820 | /* Notify the tracer early; before we stop tracing. */ |
2518 | if (iter->trace && iter->trace->open) | 2821 | if (iter->trace && iter->trace->open) |
2519 | iter->trace->open(iter); | 2822 | iter->trace->open(iter); |
2520 | 2823 | ||
2521 | /* Annotate start of buffers if we had overruns */ | 2824 | /* Annotate start of buffers if we had overruns */ |
2522 | if (ring_buffer_overruns(iter->tr->buffer)) | 2825 | if (ring_buffer_overruns(iter->trace_buffer->buffer)) |
2523 | iter->iter_flags |= TRACE_FILE_ANNOTATE; | 2826 | iter->iter_flags |= TRACE_FILE_ANNOTATE; |
2524 | 2827 | ||
2525 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | 2828 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ |
@@ -2528,12 +2831,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) | |||
2528 | 2831 | ||
2529 | /* stop the trace while dumping if we are not opening "snapshot" */ | 2832 | /* stop the trace while dumping if we are not opening "snapshot" */ |
2530 | if (!iter->snapshot) | 2833 | if (!iter->snapshot) |
2531 | tracing_stop(); | 2834 | tracing_stop_tr(tr); |
2532 | 2835 | ||
2533 | if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { | 2836 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { |
2534 | for_each_tracing_cpu(cpu) { | 2837 | for_each_tracing_cpu(cpu) { |
2535 | iter->buffer_iter[cpu] = | 2838 | iter->buffer_iter[cpu] = |
2536 | ring_buffer_read_prepare(iter->tr->buffer, cpu); | 2839 | ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); |
2537 | } | 2840 | } |
2538 | ring_buffer_read_prepare_sync(); | 2841 | ring_buffer_read_prepare_sync(); |
2539 | for_each_tracing_cpu(cpu) { | 2842 | for_each_tracing_cpu(cpu) { |
@@ -2543,12 +2846,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) | |||
2543 | } else { | 2846 | } else { |
2544 | cpu = iter->cpu_file; | 2847 | cpu = iter->cpu_file; |
2545 | iter->buffer_iter[cpu] = | 2848 | iter->buffer_iter[cpu] = |
2546 | ring_buffer_read_prepare(iter->tr->buffer, cpu); | 2849 | ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); |
2547 | ring_buffer_read_prepare_sync(); | 2850 | ring_buffer_read_prepare_sync(); |
2548 | ring_buffer_read_start(iter->buffer_iter[cpu]); | 2851 | ring_buffer_read_start(iter->buffer_iter[cpu]); |
2549 | tracing_iter_reset(iter, cpu); | 2852 | tracing_iter_reset(iter, cpu); |
2550 | } | 2853 | } |
2551 | 2854 | ||
2855 | tr->ref++; | ||
2856 | |||
2552 | mutex_unlock(&trace_types_lock); | 2857 | mutex_unlock(&trace_types_lock); |
2553 | 2858 | ||
2554 | return iter; | 2859 | return iter; |
@@ -2575,14 +2880,20 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2575 | { | 2880 | { |
2576 | struct seq_file *m = file->private_data; | 2881 | struct seq_file *m = file->private_data; |
2577 | struct trace_iterator *iter; | 2882 | struct trace_iterator *iter; |
2883 | struct trace_array *tr; | ||
2578 | int cpu; | 2884 | int cpu; |
2579 | 2885 | ||
2580 | if (!(file->f_mode & FMODE_READ)) | 2886 | if (!(file->f_mode & FMODE_READ)) |
2581 | return 0; | 2887 | return 0; |
2582 | 2888 | ||
2583 | iter = m->private; | 2889 | iter = m->private; |
2890 | tr = iter->tr; | ||
2584 | 2891 | ||
2585 | mutex_lock(&trace_types_lock); | 2892 | mutex_lock(&trace_types_lock); |
2893 | |||
2894 | WARN_ON(!tr->ref); | ||
2895 | tr->ref--; | ||
2896 | |||
2586 | for_each_tracing_cpu(cpu) { | 2897 | for_each_tracing_cpu(cpu) { |
2587 | if (iter->buffer_iter[cpu]) | 2898 | if (iter->buffer_iter[cpu]) |
2588 | ring_buffer_read_finish(iter->buffer_iter[cpu]); | 2899 | ring_buffer_read_finish(iter->buffer_iter[cpu]); |
@@ -2593,7 +2904,7 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2593 | 2904 | ||
2594 | if (!iter->snapshot) | 2905 | if (!iter->snapshot) |
2595 | /* reenable tracing if it was previously enabled */ | 2906 | /* reenable tracing if it was previously enabled */ |
2596 | tracing_start(); | 2907 | tracing_start_tr(tr); |
2597 | mutex_unlock(&trace_types_lock); | 2908 | mutex_unlock(&trace_types_lock); |
2598 | 2909 | ||
2599 | mutex_destroy(&iter->mutex); | 2910 | mutex_destroy(&iter->mutex); |
@@ -2612,12 +2923,13 @@ static int tracing_open(struct inode *inode, struct file *file) | |||
2612 | /* If this file was open for write, then erase contents */ | 2923 | /* If this file was open for write, then erase contents */ |
2613 | if ((file->f_mode & FMODE_WRITE) && | 2924 | if ((file->f_mode & FMODE_WRITE) && |
2614 | (file->f_flags & O_TRUNC)) { | 2925 | (file->f_flags & O_TRUNC)) { |
2615 | long cpu = (long) inode->i_private; | 2926 | struct trace_cpu *tc = inode->i_private; |
2927 | struct trace_array *tr = tc->tr; | ||
2616 | 2928 | ||
2617 | if (cpu == TRACE_PIPE_ALL_CPU) | 2929 | if (tc->cpu == RING_BUFFER_ALL_CPUS) |
2618 | tracing_reset_online_cpus(&global_trace); | 2930 | tracing_reset_online_cpus(&tr->trace_buffer); |
2619 | else | 2931 | else |
2620 | tracing_reset(&global_trace, cpu); | 2932 | tracing_reset(&tr->trace_buffer, tc->cpu); |
2621 | } | 2933 | } |
2622 | 2934 | ||
2623 | if (file->f_mode & FMODE_READ) { | 2935 | if (file->f_mode & FMODE_READ) { |
@@ -2764,8 +3076,9 @@ static ssize_t | |||
2764 | tracing_cpumask_write(struct file *filp, const char __user *ubuf, | 3076 | tracing_cpumask_write(struct file *filp, const char __user *ubuf, |
2765 | size_t count, loff_t *ppos) | 3077 | size_t count, loff_t *ppos) |
2766 | { | 3078 | { |
2767 | int err, cpu; | 3079 | struct trace_array *tr = filp->private_data; |
2768 | cpumask_var_t tracing_cpumask_new; | 3080 | cpumask_var_t tracing_cpumask_new; |
3081 | int err, cpu; | ||
2769 | 3082 | ||
2770 | if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) | 3083 | if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) |
2771 | return -ENOMEM; | 3084 | return -ENOMEM; |
@@ -2785,13 +3098,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
2785 | */ | 3098 | */ |
2786 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 3099 | if (cpumask_test_cpu(cpu, tracing_cpumask) && |
2787 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 3100 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2788 | atomic_inc(&global_trace.data[cpu]->disabled); | 3101 | atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); |
2789 | ring_buffer_record_disable_cpu(global_trace.buffer, cpu); | 3102 | ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); |
2790 | } | 3103 | } |
2791 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 3104 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && |
2792 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 3105 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2793 | atomic_dec(&global_trace.data[cpu]->disabled); | 3106 | atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); |
2794 | ring_buffer_record_enable_cpu(global_trace.buffer, cpu); | 3107 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); |
2795 | } | 3108 | } |
2796 | } | 3109 | } |
2797 | arch_spin_unlock(&ftrace_max_lock); | 3110 | arch_spin_unlock(&ftrace_max_lock); |
@@ -2820,12 +3133,13 @@ static const struct file_operations tracing_cpumask_fops = { | |||
2820 | static int tracing_trace_options_show(struct seq_file *m, void *v) | 3133 | static int tracing_trace_options_show(struct seq_file *m, void *v) |
2821 | { | 3134 | { |
2822 | struct tracer_opt *trace_opts; | 3135 | struct tracer_opt *trace_opts; |
3136 | struct trace_array *tr = m->private; | ||
2823 | u32 tracer_flags; | 3137 | u32 tracer_flags; |
2824 | int i; | 3138 | int i; |
2825 | 3139 | ||
2826 | mutex_lock(&trace_types_lock); | 3140 | mutex_lock(&trace_types_lock); |
2827 | tracer_flags = current_trace->flags->val; | 3141 | tracer_flags = tr->current_trace->flags->val; |
2828 | trace_opts = current_trace->flags->opts; | 3142 | trace_opts = tr->current_trace->flags->opts; |
2829 | 3143 | ||
2830 | for (i = 0; trace_options[i]; i++) { | 3144 | for (i = 0; trace_options[i]; i++) { |
2831 | if (trace_flags & (1 << i)) | 3145 | if (trace_flags & (1 << i)) |
@@ -2880,11 +3194,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) | |||
2880 | return -EINVAL; | 3194 | return -EINVAL; |
2881 | } | 3195 | } |
2882 | 3196 | ||
2883 | static void set_tracer_flags(unsigned int mask, int enabled) | 3197 | /* Some tracers require overwrite to stay enabled */ |
3198 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) | ||
3199 | { | ||
3200 | if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) | ||
3201 | return -1; | ||
3202 | |||
3203 | return 0; | ||
3204 | } | ||
3205 | |||
3206 | int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) | ||
2884 | { | 3207 | { |
2885 | /* do nothing if flag is already set */ | 3208 | /* do nothing if flag is already set */ |
2886 | if (!!(trace_flags & mask) == !!enabled) | 3209 | if (!!(trace_flags & mask) == !!enabled) |
2887 | return; | 3210 | return 0; |
3211 | |||
3212 | /* Give the tracer a chance to approve the change */ | ||
3213 | if (tr->current_trace->flag_changed) | ||
3214 | if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) | ||
3215 | return -EINVAL; | ||
2888 | 3216 | ||
2889 | if (enabled) | 3217 | if (enabled) |
2890 | trace_flags |= mask; | 3218 | trace_flags |= mask; |
@@ -2894,18 +3222,24 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2894 | if (mask == TRACE_ITER_RECORD_CMD) | 3222 | if (mask == TRACE_ITER_RECORD_CMD) |
2895 | trace_event_enable_cmd_record(enabled); | 3223 | trace_event_enable_cmd_record(enabled); |
2896 | 3224 | ||
2897 | if (mask == TRACE_ITER_OVERWRITE) | 3225 | if (mask == TRACE_ITER_OVERWRITE) { |
2898 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | 3226 | ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); |
3227 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
3228 | ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); | ||
3229 | #endif | ||
3230 | } | ||
2899 | 3231 | ||
2900 | if (mask == TRACE_ITER_PRINTK) | 3232 | if (mask == TRACE_ITER_PRINTK) |
2901 | trace_printk_start_stop_comm(enabled); | 3233 | trace_printk_start_stop_comm(enabled); |
3234 | |||
3235 | return 0; | ||
2902 | } | 3236 | } |
2903 | 3237 | ||
2904 | static int trace_set_options(char *option) | 3238 | static int trace_set_options(struct trace_array *tr, char *option) |
2905 | { | 3239 | { |
2906 | char *cmp; | 3240 | char *cmp; |
2907 | int neg = 0; | 3241 | int neg = 0; |
2908 | int ret = 0; | 3242 | int ret = -ENODEV; |
2909 | int i; | 3243 | int i; |
2910 | 3244 | ||
2911 | cmp = strstrip(option); | 3245 | cmp = strstrip(option); |
@@ -2915,19 +3249,20 @@ static int trace_set_options(char *option) | |||
2915 | cmp += 2; | 3249 | cmp += 2; |
2916 | } | 3250 | } |
2917 | 3251 | ||
3252 | mutex_lock(&trace_types_lock); | ||
3253 | |||
2918 | for (i = 0; trace_options[i]; i++) { | 3254 | for (i = 0; trace_options[i]; i++) { |
2919 | if (strcmp(cmp, trace_options[i]) == 0) { | 3255 | if (strcmp(cmp, trace_options[i]) == 0) { |
2920 | set_tracer_flags(1 << i, !neg); | 3256 | ret = set_tracer_flag(tr, 1 << i, !neg); |
2921 | break; | 3257 | break; |
2922 | } | 3258 | } |
2923 | } | 3259 | } |
2924 | 3260 | ||
2925 | /* If no option could be set, test the specific tracer options */ | 3261 | /* If no option could be set, test the specific tracer options */ |
2926 | if (!trace_options[i]) { | 3262 | if (!trace_options[i]) |
2927 | mutex_lock(&trace_types_lock); | 3263 | ret = set_tracer_option(tr->current_trace, cmp, neg); |
2928 | ret = set_tracer_option(current_trace, cmp, neg); | 3264 | |
2929 | mutex_unlock(&trace_types_lock); | 3265 | mutex_unlock(&trace_types_lock); |
2930 | } | ||
2931 | 3266 | ||
2932 | return ret; | 3267 | return ret; |
2933 | } | 3268 | } |
@@ -2936,7 +3271,10 @@ static ssize_t | |||
2936 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, | 3271 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, |
2937 | size_t cnt, loff_t *ppos) | 3272 | size_t cnt, loff_t *ppos) |
2938 | { | 3273 | { |
3274 | struct seq_file *m = filp->private_data; | ||
3275 | struct trace_array *tr = m->private; | ||
2939 | char buf[64]; | 3276 | char buf[64]; |
3277 | int ret; | ||
2940 | 3278 | ||
2941 | if (cnt >= sizeof(buf)) | 3279 | if (cnt >= sizeof(buf)) |
2942 | return -EINVAL; | 3280 | return -EINVAL; |
@@ -2946,7 +3284,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
2946 | 3284 | ||
2947 | buf[cnt] = 0; | 3285 | buf[cnt] = 0; |
2948 | 3286 | ||
2949 | trace_set_options(buf); | 3287 | ret = trace_set_options(tr, buf); |
3288 | if (ret < 0) | ||
3289 | return ret; | ||
2950 | 3290 | ||
2951 | *ppos += cnt; | 3291 | *ppos += cnt; |
2952 | 3292 | ||
@@ -2957,7 +3297,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) | |||
2957 | { | 3297 | { |
2958 | if (tracing_disabled) | 3298 | if (tracing_disabled) |
2959 | return -ENODEV; | 3299 | return -ENODEV; |
2960 | return single_open(file, tracing_trace_options_show, NULL); | 3300 | |
3301 | return single_open(file, tracing_trace_options_show, inode->i_private); | ||
2961 | } | 3302 | } |
2962 | 3303 | ||
2963 | static const struct file_operations tracing_iter_fops = { | 3304 | static const struct file_operations tracing_iter_fops = { |
@@ -2970,20 +3311,84 @@ static const struct file_operations tracing_iter_fops = { | |||
2970 | 3311 | ||
/*
 * Text of the tracing mini-HOWTO.
 *
 * The message is assembled from adjacent string literals; the sections
 * guarded by CONFIG_* symbols are only present when the corresponding
 * option is defined, so the help never advertises files or triggers
 * that were not built in.
 */
static const char readme_msg[] =
	"tracing mini-HOWTO:\n\n"
	"# echo 0 > tracing_on : quick way to disable tracing\n"
	"# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
	" Important files:\n"
	" trace\t\t\t- The static contents of the buffer\n"
	"\t\t\t To clear the buffer write into this file: echo > trace\n"
	" trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
	" current_tracer\t- function and latency tracers\n"
	" available_tracers\t- list of configured tracers for current_tracer\n"
	" buffer_size_kb\t- view and modify size of per cpu buffer\n"
	" buffer_total_size_kb - view total size of all cpu buffers\n\n"
	" trace_clock\t\t- change the clock used to order events\n"
	" local: Per cpu clock but may not be synced across CPUs\n"
	" global: Synced across CPUs but slows tracing down.\n"
	" counter: Not a clock, but just an increment\n"
	" uptime: Jiffy counter from time of boot\n"
	" perf: Same clock that perf events use\n"
#ifdef CONFIG_X86_64
	" x86-tsc: TSC cycle counter\n"
#endif
	"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
	" tracing_cpumask\t- Limit which CPUs to trace\n"
	" instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
	"\t\t\t Remove sub-buffer with rmdir\n"
	" trace_options\t\t- Set format or modify how tracing happens\n"
	/* options are negated with a leading "no", e.g. noprint-parent */
	"\t\t\t Disable an option by adding a prefix 'no' to the option name\n"
#ifdef CONFIG_DYNAMIC_FTRACE
	"\n available_filter_functions - list of functions that can be filtered on\n"
	" set_ftrace_filter\t- echo function name in here to only trace these functions\n"
	" accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
	" modules: Can select a group via module\n"
	" Format: :mod:<module-name>\n"
	" example: echo :mod:ext3 > set_ftrace_filter\n"
	" triggers: a command to perform when function is hit\n"
	" Format: <function>:<trigger>[:count]\n"
	" trigger: traceon, traceoff\n"
	" enable_event:<system>:<event>\n"
	" disable_event:<system>:<event>\n"
#ifdef CONFIG_STACKTRACE
	" stacktrace\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
	" snapshot\n"
#endif
	" example: echo do_fault:traceoff > set_ftrace_filter\n"
	" echo do_trap:traceoff:3 > set_ftrace_filter\n"
	" The first one will disable tracing every time do_fault is hit\n"
	" The second will disable tracing at most 3 times when do_trap is hit\n"
	" The first time do_trap is hit and it disables tracing, the counter\n"
	" will decrement to 2. If tracing is already disabled, the counter\n"
	" will not decrement. It only decrements when the trigger did work\n"
	" To remove trigger without count:\n"
	" echo '!<function>:<trigger>' > set_ftrace_filter\n"
	" To remove trigger with a count:\n"
	" echo '!<function>:<trigger>:0' > set_ftrace_filter\n"
	" set_ftrace_notrace\t- echo function name in here to never trace.\n"
	" accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
	" modules: Can select a group via module command :mod:\n"
	" Does not accept triggers\n"
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_TRACER
	" set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	" set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
	" max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
	"\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
	"\t\t\t Read the contents for more information\n"
#endif
#ifdef CONFIG_STACKTRACE
	" stack_trace\t\t- Shows the max stack trace when active\n"
	" stack_max_size\t- Shows current max stack size that was traced\n"
	"\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
#ifdef CONFIG_DYNAMIC_FTRACE
	" stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
#endif
#endif /* CONFIG_STACKTRACE */
;
2988 | 3393 | ||
2989 | static ssize_t | 3394 | static ssize_t |
@@ -3055,11 +3460,12 @@ static ssize_t | |||
3055 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 3460 | tracing_set_trace_read(struct file *filp, char __user *ubuf, |
3056 | size_t cnt, loff_t *ppos) | 3461 | size_t cnt, loff_t *ppos) |
3057 | { | 3462 | { |
3463 | struct trace_array *tr = filp->private_data; | ||
3058 | char buf[MAX_TRACER_SIZE+2]; | 3464 | char buf[MAX_TRACER_SIZE+2]; |
3059 | int r; | 3465 | int r; |
3060 | 3466 | ||
3061 | mutex_lock(&trace_types_lock); | 3467 | mutex_lock(&trace_types_lock); |
3062 | r = sprintf(buf, "%s\n", current_trace->name); | 3468 | r = sprintf(buf, "%s\n", tr->current_trace->name); |
3063 | mutex_unlock(&trace_types_lock); | 3469 | mutex_unlock(&trace_types_lock); |
3064 | 3470 | ||
3065 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 3471 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
@@ -3067,43 +3473,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, | |||
3067 | 3473 | ||
3068 | int tracer_init(struct tracer *t, struct trace_array *tr) | 3474 | int tracer_init(struct tracer *t, struct trace_array *tr) |
3069 | { | 3475 | { |
3070 | tracing_reset_online_cpus(tr); | 3476 | tracing_reset_online_cpus(&tr->trace_buffer); |
3071 | return t->init(tr); | 3477 | return t->init(tr); |
3072 | } | 3478 | } |
3073 | 3479 | ||
3074 | static void set_buffer_entries(struct trace_array *tr, unsigned long val) | 3480 | static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) |
3075 | { | 3481 | { |
3076 | int cpu; | 3482 | int cpu; |
3483 | |||
3077 | for_each_tracing_cpu(cpu) | 3484 | for_each_tracing_cpu(cpu) |
3078 | tr->data[cpu]->entries = val; | 3485 | per_cpu_ptr(buf->data, cpu)->entries = val; |
3079 | } | 3486 | } |
3080 | 3487 | ||
#ifdef CONFIG_TRACER_MAX_TRACE
/*
 * Resize @trace_buf's ring buffer to the per-cpu sizes recorded in
 * @size_buf and copy the matching ->entries values across.
 *
 * @cpu_id selects a single cpu, or RING_BUFFER_ALL_CPUS to walk every
 * tracing cpu. In the all-cpus case the walk stops at the first cpu
 * whose ring_buffer_resize() call returns a negative value; cpus
 * handled before that keep their new size.
 *
 * Returns the result of the last ring_buffer_resize() call (0 if no
 * cpu was visited).
 */
static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
					struct trace_buffer *size_buf, int cpu_id)
{
	int cpu;
	int ret = 0;

	/* Single cpu: record the new size only when the resize reported 0 */
	if (cpu_id != RING_BUFFER_ALL_CPUS) {
		ret = ring_buffer_resize(trace_buf->buffer,
					 per_cpu_ptr(size_buf->data, cpu_id)->entries,
					 cpu_id);
		if (ret == 0) {
			per_cpu_ptr(trace_buf->data, cpu_id)->entries =
				per_cpu_ptr(size_buf->data, cpu_id)->entries;
		}
		return ret;
	}

	for_each_tracing_cpu(cpu) {
		ret = ring_buffer_resize(trace_buf->buffer,
					 per_cpu_ptr(size_buf->data, cpu)->entries,
					 cpu);
		if (ret < 0)
			break;

		per_cpu_ptr(trace_buf->data, cpu)->entries =
			per_cpu_ptr(size_buf->data, cpu)->entries;
	}

	return ret;
}
#endif /* CONFIG_TRACER_MAX_TRACE */
3105 | 3515 | ||
3106 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | 3516 | static int __tracing_resize_ring_buffer(struct trace_array *tr, |
3517 | unsigned long size, int cpu) | ||
3107 | { | 3518 | { |
3108 | int ret; | 3519 | int ret; |
3109 | 3520 | ||
@@ -3112,23 +3523,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3112 | * we use the size that was given, and we can forget about | 3523 | * we use the size that was given, and we can forget about |
3113 | * expanding it later. | 3524 | * expanding it later. |
3114 | */ | 3525 | */ |
3115 | ring_buffer_expanded = 1; | 3526 | ring_buffer_expanded = true; |
3116 | 3527 | ||
3117 | /* May be called before buffers are initialized */ | 3528 | /* May be called before buffers are initialized */ |
3118 | if (!global_trace.buffer) | 3529 | if (!tr->trace_buffer.buffer) |
3119 | return 0; | 3530 | return 0; |
3120 | 3531 | ||
3121 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); | 3532 | ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); |
3122 | if (ret < 0) | 3533 | if (ret < 0) |
3123 | return ret; | 3534 | return ret; |
3124 | 3535 | ||
3125 | if (!current_trace->use_max_tr) | 3536 | #ifdef CONFIG_TRACER_MAX_TRACE |
3537 | if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || | ||
3538 | !tr->current_trace->use_max_tr) | ||
3126 | goto out; | 3539 | goto out; |
3127 | 3540 | ||
3128 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); | 3541 | ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); |
3129 | if (ret < 0) { | 3542 | if (ret < 0) { |
3130 | int r = resize_buffer_duplicate_size(&global_trace, | 3543 | int r = resize_buffer_duplicate_size(&tr->trace_buffer, |
3131 | &global_trace, cpu); | 3544 | &tr->trace_buffer, cpu); |
3132 | if (r < 0) { | 3545 | if (r < 0) { |
3133 | /* | 3546 | /* |
3134 | * AARGH! We are left with different | 3547 | * AARGH! We are left with different |
@@ -3151,20 +3564,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3151 | } | 3564 | } |
3152 | 3565 | ||
3153 | if (cpu == RING_BUFFER_ALL_CPUS) | 3566 | if (cpu == RING_BUFFER_ALL_CPUS) |
3154 | set_buffer_entries(&max_tr, size); | 3567 | set_buffer_entries(&tr->max_buffer, size); |
3155 | else | 3568 | else |
3156 | max_tr.data[cpu]->entries = size; | 3569 | per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; |
3157 | 3570 | ||
3158 | out: | 3571 | out: |
3572 | #endif /* CONFIG_TRACER_MAX_TRACE */ | ||
3573 | |||
3159 | if (cpu == RING_BUFFER_ALL_CPUS) | 3574 | if (cpu == RING_BUFFER_ALL_CPUS) |
3160 | set_buffer_entries(&global_trace, size); | 3575 | set_buffer_entries(&tr->trace_buffer, size); |
3161 | else | 3576 | else |
3162 | global_trace.data[cpu]->entries = size; | 3577 | per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; |
3163 | 3578 | ||
3164 | return ret; | 3579 | return ret; |
3165 | } | 3580 | } |
3166 | 3581 | ||
3167 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) | 3582 | static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, |
3583 | unsigned long size, int cpu_id) | ||
3168 | { | 3584 | { |
3169 | int ret = size; | 3585 | int ret = size; |
3170 | 3586 | ||
@@ -3178,7 +3594,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) | |||
3178 | } | 3594 | } |
3179 | } | 3595 | } |
3180 | 3596 | ||
3181 | ret = __tracing_resize_ring_buffer(size, cpu_id); | 3597 | ret = __tracing_resize_ring_buffer(tr, size, cpu_id); |
3182 | if (ret < 0) | 3598 | if (ret < 0) |
3183 | ret = -ENOMEM; | 3599 | ret = -ENOMEM; |
3184 | 3600 | ||
@@ -3205,7 +3621,7 @@ int tracing_update_buffers(void) | |||
3205 | 3621 | ||
3206 | mutex_lock(&trace_types_lock); | 3622 | mutex_lock(&trace_types_lock); |
3207 | if (!ring_buffer_expanded) | 3623 | if (!ring_buffer_expanded) |
3208 | ret = __tracing_resize_ring_buffer(trace_buf_size, | 3624 | ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, |
3209 | RING_BUFFER_ALL_CPUS); | 3625 | RING_BUFFER_ALL_CPUS); |
3210 | mutex_unlock(&trace_types_lock); | 3626 | mutex_unlock(&trace_types_lock); |
3211 | 3627 | ||
@@ -3215,7 +3631,7 @@ int tracing_update_buffers(void) | |||
3215 | struct trace_option_dentry; | 3631 | struct trace_option_dentry; |
3216 | 3632 | ||
3217 | static struct trace_option_dentry * | 3633 | static struct trace_option_dentry * |
3218 | create_trace_option_files(struct tracer *tracer); | 3634 | create_trace_option_files(struct trace_array *tr, struct tracer *tracer); |
3219 | 3635 | ||
3220 | static void | 3636 | static void |
3221 | destroy_trace_option_files(struct trace_option_dentry *topts); | 3637 | destroy_trace_option_files(struct trace_option_dentry *topts); |
@@ -3225,13 +3641,15 @@ static int tracing_set_tracer(const char *buf) | |||
3225 | static struct trace_option_dentry *topts; | 3641 | static struct trace_option_dentry *topts; |
3226 | struct trace_array *tr = &global_trace; | 3642 | struct trace_array *tr = &global_trace; |
3227 | struct tracer *t; | 3643 | struct tracer *t; |
3644 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
3228 | bool had_max_tr; | 3645 | bool had_max_tr; |
3646 | #endif | ||
3229 | int ret = 0; | 3647 | int ret = 0; |
3230 | 3648 | ||
3231 | mutex_lock(&trace_types_lock); | 3649 | mutex_lock(&trace_types_lock); |
3232 | 3650 | ||
3233 | if (!ring_buffer_expanded) { | 3651 | if (!ring_buffer_expanded) { |
3234 | ret = __tracing_resize_ring_buffer(trace_buf_size, | 3652 | ret = __tracing_resize_ring_buffer(tr, trace_buf_size, |
3235 | RING_BUFFER_ALL_CPUS); | 3653 | RING_BUFFER_ALL_CPUS); |
3236 | if (ret < 0) | 3654 | if (ret < 0) |
3237 | goto out; | 3655 | goto out; |
@@ -3246,15 +3664,21 @@ static int tracing_set_tracer(const char *buf) | |||
3246 | ret = -EINVAL; | 3664 | ret = -EINVAL; |
3247 | goto out; | 3665 | goto out; |
3248 | } | 3666 | } |
3249 | if (t == current_trace) | 3667 | if (t == tr->current_trace) |
3250 | goto out; | 3668 | goto out; |
3251 | 3669 | ||
3252 | trace_branch_disable(); | 3670 | trace_branch_disable(); |
3253 | if (current_trace->reset) | ||
3254 | current_trace->reset(tr); | ||
3255 | 3671 | ||
3256 | had_max_tr = current_trace->allocated_snapshot; | 3672 | tr->current_trace->enabled = false; |
3257 | current_trace = &nop_trace; | 3673 | |
3674 | if (tr->current_trace->reset) | ||
3675 | tr->current_trace->reset(tr); | ||
3676 | |||
3677 | /* Current trace needs to be nop_trace before synchronize_sched */ | ||
3678 | tr->current_trace = &nop_trace; | ||
3679 | |||
3680 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
3681 | had_max_tr = tr->allocated_snapshot; | ||
3258 | 3682 | ||
3259 | if (had_max_tr && !t->use_max_tr) { | 3683 | if (had_max_tr && !t->use_max_tr) { |
3260 | /* | 3684 | /* |
@@ -3265,27 +3689,20 @@ static int tracing_set_tracer(const char *buf) | |||
3265 | * so a synchronized_sched() is sufficient. | 3689 | * so a synchronized_sched() is sufficient. |
3266 | */ | 3690 | */ |
3267 | synchronize_sched(); | 3691 | synchronize_sched(); |
3268 | /* | 3692 | free_snapshot(tr); |
3269 | * We don't free the ring buffer. instead, resize it because | ||
3270 | * The max_tr ring buffer has some state (e.g. ring->clock) and | ||
3271 | * we want preserve it. | ||
3272 | */ | ||
3273 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); | ||
3274 | set_buffer_entries(&max_tr, 1); | ||
3275 | tracing_reset_online_cpus(&max_tr); | ||
3276 | current_trace->allocated_snapshot = false; | ||
3277 | } | 3693 | } |
3694 | #endif | ||
3278 | destroy_trace_option_files(topts); | 3695 | destroy_trace_option_files(topts); |
3279 | 3696 | ||
3280 | topts = create_trace_option_files(t); | 3697 | topts = create_trace_option_files(tr, t); |
3698 | |||
3699 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
3281 | if (t->use_max_tr && !had_max_tr) { | 3700 | if (t->use_max_tr && !had_max_tr) { |
3282 | /* we need to make per cpu buffer sizes equivalent */ | 3701 | ret = alloc_snapshot(tr); |
3283 | ret = resize_buffer_duplicate_size(&max_tr, &global_trace, | ||
3284 | RING_BUFFER_ALL_CPUS); | ||
3285 | if (ret < 0) | 3702 | if (ret < 0) |
3286 | goto out; | 3703 | goto out; |
3287 | t->allocated_snapshot = true; | ||
3288 | } | 3704 | } |
3705 | #endif | ||
3289 | 3706 | ||
3290 | if (t->init) { | 3707 | if (t->init) { |
3291 | ret = tracer_init(t, tr); | 3708 | ret = tracer_init(t, tr); |
@@ -3293,7 +3710,8 @@ static int tracing_set_tracer(const char *buf) | |||
3293 | goto out; | 3710 | goto out; |
3294 | } | 3711 | } |
3295 | 3712 | ||
3296 | current_trace = t; | 3713 | tr->current_trace = t; |
3714 | tr->current_trace->enabled = true; | ||
3297 | trace_branch_enable(tr); | 3715 | trace_branch_enable(tr); |
3298 | out: | 3716 | out: |
3299 | mutex_unlock(&trace_types_lock); | 3717 | mutex_unlock(&trace_types_lock); |
@@ -3367,7 +3785,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, | |||
3367 | 3785 | ||
3368 | static int tracing_open_pipe(struct inode *inode, struct file *filp) | 3786 | static int tracing_open_pipe(struct inode *inode, struct file *filp) |
3369 | { | 3787 | { |
3370 | long cpu_file = (long) inode->i_private; | 3788 | struct trace_cpu *tc = inode->i_private; |
3789 | struct trace_array *tr = tc->tr; | ||
3371 | struct trace_iterator *iter; | 3790 | struct trace_iterator *iter; |
3372 | int ret = 0; | 3791 | int ret = 0; |
3373 | 3792 | ||
@@ -3392,7 +3811,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3392 | ret = -ENOMEM; | 3811 | ret = -ENOMEM; |
3393 | goto fail; | 3812 | goto fail; |
3394 | } | 3813 | } |
3395 | *iter->trace = *current_trace; | 3814 | *iter->trace = *tr->current_trace; |
3396 | 3815 | ||
3397 | if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { | 3816 | if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { |
3398 | ret = -ENOMEM; | 3817 | ret = -ENOMEM; |
@@ -3409,8 +3828,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3409 | if (trace_clocks[trace_clock_id].in_ns) | 3828 | if (trace_clocks[trace_clock_id].in_ns) |
3410 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | 3829 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; |
3411 | 3830 | ||
3412 | iter->cpu_file = cpu_file; | 3831 | iter->cpu_file = tc->cpu; |
3413 | iter->tr = &global_trace; | 3832 | iter->tr = tc->tr; |
3833 | iter->trace_buffer = &tc->tr->trace_buffer; | ||
3414 | mutex_init(&iter->mutex); | 3834 | mutex_init(&iter->mutex); |
3415 | filp->private_data = iter; | 3835 | filp->private_data = iter; |
3416 | 3836 | ||
@@ -3449,24 +3869,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
3449 | } | 3869 | } |
3450 | 3870 | ||
3451 | static unsigned int | 3871 | static unsigned int |
3452 | tracing_poll_pipe(struct file *filp, poll_table *poll_table) | 3872 | trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) |
3453 | { | 3873 | { |
3454 | struct trace_iterator *iter = filp->private_data; | 3874 | /* Iterators are static, they should be filled or empty */ |
3875 | if (trace_buffer_iter(iter, iter->cpu_file)) | ||
3876 | return POLLIN | POLLRDNORM; | ||
3455 | 3877 | ||
3456 | if (trace_flags & TRACE_ITER_BLOCK) { | 3878 | if (trace_flags & TRACE_ITER_BLOCK) |
3457 | /* | 3879 | /* |
3458 | * Always select as readable when in blocking mode | 3880 | * Always select as readable when in blocking mode |
3459 | */ | 3881 | */ |
3460 | return POLLIN | POLLRDNORM; | 3882 | return POLLIN | POLLRDNORM; |
3461 | } else { | 3883 | else |
3462 | if (!trace_empty(iter)) | 3884 | return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, |
3463 | return POLLIN | POLLRDNORM; | 3885 | filp, poll_table); |
3464 | poll_wait(filp, &trace_wait, poll_table); | 3886 | } |
3465 | if (!trace_empty(iter)) | ||
3466 | return POLLIN | POLLRDNORM; | ||
3467 | 3887 | ||
3468 | return 0; | 3888 | static unsigned int |
3469 | } | 3889 | tracing_poll_pipe(struct file *filp, poll_table *poll_table) |
3890 | { | ||
3891 | struct trace_iterator *iter = filp->private_data; | ||
3892 | |||
3893 | return trace_poll(iter, filp, poll_table); | ||
3470 | } | 3894 | } |
3471 | 3895 | ||
3472 | /* | 3896 | /* |
@@ -3532,6 +3956,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
3532 | size_t cnt, loff_t *ppos) | 3956 | size_t cnt, loff_t *ppos) |
3533 | { | 3957 | { |
3534 | struct trace_iterator *iter = filp->private_data; | 3958 | struct trace_iterator *iter = filp->private_data; |
3959 | struct trace_array *tr = iter->tr; | ||
3535 | ssize_t sret; | 3960 | ssize_t sret; |
3536 | 3961 | ||
3537 | /* return any leftover data */ | 3962 | /* return any leftover data */ |
@@ -3543,8 +3968,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
3543 | 3968 | ||
3544 | /* copy the tracer to avoid using a global lock all around */ | 3969 | /* copy the tracer to avoid using a global lock all around */ |
3545 | mutex_lock(&trace_types_lock); | 3970 | mutex_lock(&trace_types_lock); |
3546 | if (unlikely(iter->trace->name != current_trace->name)) | 3971 | if (unlikely(iter->trace->name != tr->current_trace->name)) |
3547 | *iter->trace = *current_trace; | 3972 | *iter->trace = *tr->current_trace; |
3548 | mutex_unlock(&trace_types_lock); | 3973 | mutex_unlock(&trace_types_lock); |
3549 | 3974 | ||
3550 | /* | 3975 | /* |
@@ -3700,6 +4125,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3700 | .ops = &tracing_pipe_buf_ops, | 4125 | .ops = &tracing_pipe_buf_ops, |
3701 | .spd_release = tracing_spd_release_pipe, | 4126 | .spd_release = tracing_spd_release_pipe, |
3702 | }; | 4127 | }; |
4128 | struct trace_array *tr = iter->tr; | ||
3703 | ssize_t ret; | 4129 | ssize_t ret; |
3704 | size_t rem; | 4130 | size_t rem; |
3705 | unsigned int i; | 4131 | unsigned int i; |
@@ -3709,8 +4135,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3709 | 4135 | ||
3710 | /* copy the tracer to avoid using a global lock all around */ | 4136 | /* copy the tracer to avoid using a global lock all around */ |
3711 | mutex_lock(&trace_types_lock); | 4137 | mutex_lock(&trace_types_lock); |
3712 | if (unlikely(iter->trace->name != current_trace->name)) | 4138 | if (unlikely(iter->trace->name != tr->current_trace->name)) |
3713 | *iter->trace = *current_trace; | 4139 | *iter->trace = *tr->current_trace; |
3714 | mutex_unlock(&trace_types_lock); | 4140 | mutex_unlock(&trace_types_lock); |
3715 | 4141 | ||
3716 | mutex_lock(&iter->mutex); | 4142 | mutex_lock(&iter->mutex); |
@@ -3772,43 +4198,19 @@ out_err: | |||
3772 | goto out; | 4198 | goto out; |
3773 | } | 4199 | } |
3774 | 4200 | ||
3775 | struct ftrace_entries_info { | ||
3776 | struct trace_array *tr; | ||
3777 | int cpu; | ||
3778 | }; | ||
3779 | |||
3780 | static int tracing_entries_open(struct inode *inode, struct file *filp) | ||
3781 | { | ||
3782 | struct ftrace_entries_info *info; | ||
3783 | |||
3784 | if (tracing_disabled) | ||
3785 | return -ENODEV; | ||
3786 | |||
3787 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
3788 | if (!info) | ||
3789 | return -ENOMEM; | ||
3790 | |||
3791 | info->tr = &global_trace; | ||
3792 | info->cpu = (unsigned long)inode->i_private; | ||
3793 | |||
3794 | filp->private_data = info; | ||
3795 | |||
3796 | return 0; | ||
3797 | } | ||
3798 | |||
3799 | static ssize_t | 4201 | static ssize_t |
3800 | tracing_entries_read(struct file *filp, char __user *ubuf, | 4202 | tracing_entries_read(struct file *filp, char __user *ubuf, |
3801 | size_t cnt, loff_t *ppos) | 4203 | size_t cnt, loff_t *ppos) |
3802 | { | 4204 | { |
3803 | struct ftrace_entries_info *info = filp->private_data; | 4205 | struct trace_cpu *tc = filp->private_data; |
3804 | struct trace_array *tr = info->tr; | 4206 | struct trace_array *tr = tc->tr; |
3805 | char buf[64]; | 4207 | char buf[64]; |
3806 | int r = 0; | 4208 | int r = 0; |
3807 | ssize_t ret; | 4209 | ssize_t ret; |
3808 | 4210 | ||
3809 | mutex_lock(&trace_types_lock); | 4211 | mutex_lock(&trace_types_lock); |
3810 | 4212 | ||
3811 | if (info->cpu == RING_BUFFER_ALL_CPUS) { | 4213 | if (tc->cpu == RING_BUFFER_ALL_CPUS) { |
3812 | int cpu, buf_size_same; | 4214 | int cpu, buf_size_same; |
3813 | unsigned long size; | 4215 | unsigned long size; |
3814 | 4216 | ||
@@ -3818,8 +4220,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf, | |||
3818 | for_each_tracing_cpu(cpu) { | 4220 | for_each_tracing_cpu(cpu) { |
3819 | /* fill in the size from first enabled cpu */ | 4221 | /* fill in the size from first enabled cpu */ |
3820 | if (size == 0) | 4222 | if (size == 0) |
3821 | size = tr->data[cpu]->entries; | 4223 | size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; |
3822 | if (size != tr->data[cpu]->entries) { | 4224 | if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { |
3823 | buf_size_same = 0; | 4225 | buf_size_same = 0; |
3824 | break; | 4226 | break; |
3825 | } | 4227 | } |
@@ -3835,7 +4237,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, | |||
3835 | } else | 4237 | } else |
3836 | r = sprintf(buf, "X\n"); | 4238 | r = sprintf(buf, "X\n"); |
3837 | } else | 4239 | } else |
3838 | r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); | 4240 | r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); |
3839 | 4241 | ||
3840 | mutex_unlock(&trace_types_lock); | 4242 | mutex_unlock(&trace_types_lock); |
3841 | 4243 | ||
@@ -3847,7 +4249,7 @@ static ssize_t | |||
3847 | tracing_entries_write(struct file *filp, const char __user *ubuf, | 4249 | tracing_entries_write(struct file *filp, const char __user *ubuf, |
3848 | size_t cnt, loff_t *ppos) | 4250 | size_t cnt, loff_t *ppos) |
3849 | { | 4251 | { |
3850 | struct ftrace_entries_info *info = filp->private_data; | 4252 | struct trace_cpu *tc = filp->private_data; |
3851 | unsigned long val; | 4253 | unsigned long val; |
3852 | int ret; | 4254 | int ret; |
3853 | 4255 | ||
@@ -3862,7 +4264,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3862 | /* value is in KB */ | 4264 | /* value is in KB */ |
3863 | val <<= 10; | 4265 | val <<= 10; |
3864 | 4266 | ||
3865 | ret = tracing_resize_ring_buffer(val, info->cpu); | 4267 | ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); |
3866 | if (ret < 0) | 4268 | if (ret < 0) |
3867 | return ret; | 4269 | return ret; |
3868 | 4270 | ||
@@ -3871,16 +4273,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3871 | return cnt; | 4273 | return cnt; |
3872 | } | 4274 | } |
3873 | 4275 | ||
3874 | static int | ||
3875 | tracing_entries_release(struct inode *inode, struct file *filp) | ||
3876 | { | ||
3877 | struct ftrace_entries_info *info = filp->private_data; | ||
3878 | |||
3879 | kfree(info); | ||
3880 | |||
3881 | return 0; | ||
3882 | } | ||
3883 | |||
3884 | static ssize_t | 4276 | static ssize_t |
3885 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | 4277 | tracing_total_entries_read(struct file *filp, char __user *ubuf, |
3886 | size_t cnt, loff_t *ppos) | 4278 | size_t cnt, loff_t *ppos) |
@@ -3892,7 +4284,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, | |||
3892 | 4284 | ||
3893 | mutex_lock(&trace_types_lock); | 4285 | mutex_lock(&trace_types_lock); |
3894 | for_each_tracing_cpu(cpu) { | 4286 | for_each_tracing_cpu(cpu) { |
3895 | size += tr->data[cpu]->entries >> 10; | 4287 | size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; |
3896 | if (!ring_buffer_expanded) | 4288 | if (!ring_buffer_expanded) |
3897 | expanded_size += trace_buf_size >> 10; | 4289 | expanded_size += trace_buf_size >> 10; |
3898 | } | 4290 | } |
@@ -3922,11 +4314,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, | |||
3922 | static int | 4314 | static int |
3923 | tracing_free_buffer_release(struct inode *inode, struct file *filp) | 4315 | tracing_free_buffer_release(struct inode *inode, struct file *filp) |
3924 | { | 4316 | { |
4317 | struct trace_array *tr = inode->i_private; | ||
4318 | |||
3925 | /* disable tracing ? */ | 4319 | /* disable tracing ? */ |
3926 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | 4320 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) |
3927 | tracing_off(); | 4321 | tracing_off(); |
3928 | /* resize the ring buffer to 0 */ | 4322 | /* resize the ring buffer to 0 */ |
3929 | tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); | 4323 | tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); |
3930 | 4324 | ||
3931 | return 0; | 4325 | return 0; |
3932 | } | 4326 | } |
@@ -3995,7 +4389,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3995 | 4389 | ||
3996 | local_save_flags(irq_flags); | 4390 | local_save_flags(irq_flags); |
3997 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 4391 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
3998 | buffer = global_trace.buffer; | 4392 | buffer = global_trace.trace_buffer.buffer; |
3999 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 4393 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
4000 | irq_flags, preempt_count()); | 4394 | irq_flags, preempt_count()); |
4001 | if (!event) { | 4395 | if (!event) { |
@@ -4037,13 +4431,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
4037 | 4431 | ||
4038 | static int tracing_clock_show(struct seq_file *m, void *v) | 4432 | static int tracing_clock_show(struct seq_file *m, void *v) |
4039 | { | 4433 | { |
4434 | struct trace_array *tr = m->private; | ||
4040 | int i; | 4435 | int i; |
4041 | 4436 | ||
4042 | for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) | 4437 | for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) |
4043 | seq_printf(m, | 4438 | seq_printf(m, |
4044 | "%s%s%s%s", i ? " " : "", | 4439 | "%s%s%s%s", i ? " " : "", |
4045 | i == trace_clock_id ? "[" : "", trace_clocks[i].name, | 4440 | i == tr->clock_id ? "[" : "", trace_clocks[i].name, |
4046 | i == trace_clock_id ? "]" : ""); | 4441 | i == tr->clock_id ? "]" : ""); |
4047 | seq_putc(m, '\n'); | 4442 | seq_putc(m, '\n'); |
4048 | 4443 | ||
4049 | return 0; | 4444 | return 0; |
@@ -4052,6 +4447,8 @@ static int tracing_clock_show(struct seq_file *m, void *v) | |||
4052 | static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | 4447 | static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, |
4053 | size_t cnt, loff_t *fpos) | 4448 | size_t cnt, loff_t *fpos) |
4054 | { | 4449 | { |
4450 | struct seq_file *m = filp->private_data; | ||
4451 | struct trace_array *tr = m->private; | ||
4055 | char buf[64]; | 4452 | char buf[64]; |
4056 | const char *clockstr; | 4453 | const char *clockstr; |
4057 | int i; | 4454 | int i; |
@@ -4073,20 +4470,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | |||
4073 | if (i == ARRAY_SIZE(trace_clocks)) | 4470 | if (i == ARRAY_SIZE(trace_clocks)) |
4074 | return -EINVAL; | 4471 | return -EINVAL; |
4075 | 4472 | ||
4076 | trace_clock_id = i; | ||
4077 | |||
4078 | mutex_lock(&trace_types_lock); | 4473 | mutex_lock(&trace_types_lock); |
4079 | 4474 | ||
4080 | ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); | 4475 | tr->clock_id = i; |
4081 | if (max_tr.buffer) | 4476 | |
4082 | ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); | 4477 | ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); |
4083 | 4478 | ||
4084 | /* | 4479 | /* |
4085 | * New clock may not be consistent with the previous clock. | 4480 | * New clock may not be consistent with the previous clock. |
4086 | * Reset the buffer so that it doesn't have incomparable timestamps. | 4481 | * Reset the buffer so that it doesn't have incomparable timestamps. |
4087 | */ | 4482 | */ |
4088 | tracing_reset_online_cpus(&global_trace); | 4483 | tracing_reset_online_cpus(&global_trace.trace_buffer); |
4089 | tracing_reset_online_cpus(&max_tr); | 4484 | |
4485 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
4486 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) | ||
4487 | ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); | ||
4488 | tracing_reset_online_cpus(&global_trace.max_buffer); | ||
4489 | #endif | ||
4090 | 4490 | ||
4091 | mutex_unlock(&trace_types_lock); | 4491 | mutex_unlock(&trace_types_lock); |
4092 | 4492 | ||
@@ -4099,20 +4499,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file) | |||
4099 | { | 4499 | { |
4100 | if (tracing_disabled) | 4500 | if (tracing_disabled) |
4101 | return -ENODEV; | 4501 | return -ENODEV; |
4102 | return single_open(file, tracing_clock_show, NULL); | 4502 | |
4503 | return single_open(file, tracing_clock_show, inode->i_private); | ||
4103 | } | 4504 | } |
4104 | 4505 | ||
4506 | struct ftrace_buffer_info { | ||
4507 | struct trace_iterator iter; | ||
4508 | void *spare; | ||
4509 | unsigned int read; | ||
4510 | }; | ||
4511 | |||
4105 | #ifdef CONFIG_TRACER_SNAPSHOT | 4512 | #ifdef CONFIG_TRACER_SNAPSHOT |
4106 | static int tracing_snapshot_open(struct inode *inode, struct file *file) | 4513 | static int tracing_snapshot_open(struct inode *inode, struct file *file) |
4107 | { | 4514 | { |
4515 | struct trace_cpu *tc = inode->i_private; | ||
4108 | struct trace_iterator *iter; | 4516 | struct trace_iterator *iter; |
4517 | struct seq_file *m; | ||
4109 | int ret = 0; | 4518 | int ret = 0; |
4110 | 4519 | ||
4111 | if (file->f_mode & FMODE_READ) { | 4520 | if (file->f_mode & FMODE_READ) { |
4112 | iter = __tracing_open(inode, file, true); | 4521 | iter = __tracing_open(inode, file, true); |
4113 | if (IS_ERR(iter)) | 4522 | if (IS_ERR(iter)) |
4114 | ret = PTR_ERR(iter); | 4523 | ret = PTR_ERR(iter); |
4524 | } else { | ||
4525 | /* Writes still need the seq_file to hold the private data */ | ||
4526 | m = kzalloc(sizeof(*m), GFP_KERNEL); | ||
4527 | if (!m) | ||
4528 | return -ENOMEM; | ||
4529 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | ||
4530 | if (!iter) { | ||
4531 | kfree(m); | ||
4532 | return -ENOMEM; | ||
4533 | } | ||
4534 | iter->tr = tc->tr; | ||
4535 | iter->trace_buffer = &tc->tr->max_buffer; | ||
4536 | iter->cpu_file = tc->cpu; | ||
4537 | m->private = iter; | ||
4538 | file->private_data = m; | ||
4115 | } | 4539 | } |
4540 | |||
4116 | return ret; | 4541 | return ret; |
4117 | } | 4542 | } |
4118 | 4543 | ||
@@ -4120,6 +4545,9 @@ static ssize_t | |||
4120 | tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, | 4545 | tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, |
4121 | loff_t *ppos) | 4546 | loff_t *ppos) |
4122 | { | 4547 | { |
4548 | struct seq_file *m = filp->private_data; | ||
4549 | struct trace_iterator *iter = m->private; | ||
4550 | struct trace_array *tr = iter->tr; | ||
4123 | unsigned long val; | 4551 | unsigned long val; |
4124 | int ret; | 4552 | int ret; |
4125 | 4553 | ||
@@ -4133,40 +4561,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
4133 | 4561 | ||
4134 | mutex_lock(&trace_types_lock); | 4562 | mutex_lock(&trace_types_lock); |
4135 | 4563 | ||
4136 | if (current_trace->use_max_tr) { | 4564 | if (tr->current_trace->use_max_tr) { |
4137 | ret = -EBUSY; | 4565 | ret = -EBUSY; |
4138 | goto out; | 4566 | goto out; |
4139 | } | 4567 | } |
4140 | 4568 | ||
4141 | switch (val) { | 4569 | switch (val) { |
4142 | case 0: | 4570 | case 0: |
4143 | if (current_trace->allocated_snapshot) { | 4571 | if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { |
4144 | /* free spare buffer */ | 4572 | ret = -EINVAL; |
4145 | ring_buffer_resize(max_tr.buffer, 1, | 4573 | break; |
4146 | RING_BUFFER_ALL_CPUS); | ||
4147 | set_buffer_entries(&max_tr, 1); | ||
4148 | tracing_reset_online_cpus(&max_tr); | ||
4149 | current_trace->allocated_snapshot = false; | ||
4150 | } | 4574 | } |
4575 | if (tr->allocated_snapshot) | ||
4576 | free_snapshot(tr); | ||
4151 | break; | 4577 | break; |
4152 | case 1: | 4578 | case 1: |
4153 | if (!current_trace->allocated_snapshot) { | 4579 | /* Only allow per-cpu swap if the ring buffer supports it */ |
4154 | /* allocate spare buffer */ | 4580 | #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP |
4155 | ret = resize_buffer_duplicate_size(&max_tr, | 4581 | if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { |
4156 | &global_trace, RING_BUFFER_ALL_CPUS); | 4582 | ret = -EINVAL; |
4583 | break; | ||
4584 | } | ||
4585 | #endif | ||
4586 | if (!tr->allocated_snapshot) { | ||
4587 | ret = alloc_snapshot(tr); | ||
4157 | if (ret < 0) | 4588 | if (ret < 0) |
4158 | break; | 4589 | break; |
4159 | current_trace->allocated_snapshot = true; | ||
4160 | } | 4590 | } |
4161 | |||
4162 | local_irq_disable(); | 4591 | local_irq_disable(); |
4163 | /* Now, we're going to swap */ | 4592 | /* Now, we're going to swap */ |
4164 | update_max_tr(&global_trace, current, smp_processor_id()); | 4593 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) |
4594 | update_max_tr(tr, current, smp_processor_id()); | ||
4595 | else | ||
4596 | update_max_tr_single(tr, current, iter->cpu_file); | ||
4165 | local_irq_enable(); | 4597 | local_irq_enable(); |
4166 | break; | 4598 | break; |
4167 | default: | 4599 | default: |
4168 | if (current_trace->allocated_snapshot) | 4600 | if (tr->allocated_snapshot) { |
4169 | tracing_reset_online_cpus(&max_tr); | 4601 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) |
4602 | tracing_reset_online_cpus(&tr->max_buffer); | ||
4603 | else | ||
4604 | tracing_reset(&tr->max_buffer, iter->cpu_file); | ||
4605 | } | ||
4170 | break; | 4606 | break; |
4171 | } | 4607 | } |
4172 | 4608 | ||
@@ -4178,6 +4614,51 @@ out: | |||
4178 | mutex_unlock(&trace_types_lock); | 4614 | mutex_unlock(&trace_types_lock); |
4179 | return ret; | 4615 | return ret; |
4180 | } | 4616 | } |
4617 | |||
4618 | static int tracing_snapshot_release(struct inode *inode, struct file *file) | ||
4619 | { | ||
4620 | struct seq_file *m = file->private_data; | ||
4621 | |||
4622 | if (file->f_mode & FMODE_READ) | ||
4623 | return tracing_release(inode, file); | ||
4624 | |||
4625 | /* If write only, the seq_file is just a stub */ | ||
4626 | if (m) | ||
4627 | kfree(m->private); | ||
4628 | kfree(m); | ||
4629 | |||
4630 | return 0; | ||
4631 | } | ||
4632 | |||
4633 | static int tracing_buffers_open(struct inode *inode, struct file *filp); | ||
4634 | static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, | ||
4635 | size_t count, loff_t *ppos); | ||
4636 | static int tracing_buffers_release(struct inode *inode, struct file *file); | ||
4637 | static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, | ||
4638 | struct pipe_inode_info *pipe, size_t len, unsigned int flags); | ||
4639 | |||
4640 | static int snapshot_raw_open(struct inode *inode, struct file *filp) | ||
4641 | { | ||
4642 | struct ftrace_buffer_info *info; | ||
4643 | int ret; | ||
4644 | |||
4645 | ret = tracing_buffers_open(inode, filp); | ||
4646 | if (ret < 0) | ||
4647 | return ret; | ||
4648 | |||
4649 | info = filp->private_data; | ||
4650 | |||
4651 | if (info->iter.trace->use_max_tr) { | ||
4652 | tracing_buffers_release(inode, filp); | ||
4653 | return -EBUSY; | ||
4654 | } | ||
4655 | |||
4656 | info->iter.snapshot = true; | ||
4657 | info->iter.trace_buffer = &info->iter.tr->max_buffer; | ||
4658 | |||
4659 | return ret; | ||
4660 | } | ||
4661 | |||
4181 | #endif /* CONFIG_TRACER_SNAPSHOT */ | 4662 | #endif /* CONFIG_TRACER_SNAPSHOT */ |
4182 | 4663 | ||
4183 | 4664 | ||
@@ -4205,10 +4686,9 @@ static const struct file_operations tracing_pipe_fops = { | |||
4205 | }; | 4686 | }; |
4206 | 4687 | ||
4207 | static const struct file_operations tracing_entries_fops = { | 4688 | static const struct file_operations tracing_entries_fops = { |
4208 | .open = tracing_entries_open, | 4689 | .open = tracing_open_generic, |
4209 | .read = tracing_entries_read, | 4690 | .read = tracing_entries_read, |
4210 | .write = tracing_entries_write, | 4691 | .write = tracing_entries_write, |
4211 | .release = tracing_entries_release, | ||
4212 | .llseek = generic_file_llseek, | 4692 | .llseek = generic_file_llseek, |
4213 | }; | 4693 | }; |
4214 | 4694 | ||
@@ -4243,20 +4723,23 @@ static const struct file_operations snapshot_fops = { | |||
4243 | .read = seq_read, | 4723 | .read = seq_read, |
4244 | .write = tracing_snapshot_write, | 4724 | .write = tracing_snapshot_write, |
4245 | .llseek = tracing_seek, | 4725 | .llseek = tracing_seek, |
4246 | .release = tracing_release, | 4726 | .release = tracing_snapshot_release, |
4247 | }; | 4727 | }; |
4248 | #endif /* CONFIG_TRACER_SNAPSHOT */ | ||
4249 | 4728 | ||
4250 | struct ftrace_buffer_info { | 4729 | static const struct file_operations snapshot_raw_fops = { |
4251 | struct trace_array *tr; | 4730 | .open = snapshot_raw_open, |
4252 | void *spare; | 4731 | .read = tracing_buffers_read, |
4253 | int cpu; | 4732 | .release = tracing_buffers_release, |
4254 | unsigned int read; | 4733 | .splice_read = tracing_buffers_splice_read, |
4734 | .llseek = no_llseek, | ||
4255 | }; | 4735 | }; |
4256 | 4736 | ||
4737 | #endif /* CONFIG_TRACER_SNAPSHOT */ | ||
4738 | |||
4257 | static int tracing_buffers_open(struct inode *inode, struct file *filp) | 4739 | static int tracing_buffers_open(struct inode *inode, struct file *filp) |
4258 | { | 4740 | { |
4259 | int cpu = (int)(long)inode->i_private; | 4741 | struct trace_cpu *tc = inode->i_private; |
4742 | struct trace_array *tr = tc->tr; | ||
4260 | struct ftrace_buffer_info *info; | 4743 | struct ftrace_buffer_info *info; |
4261 | 4744 | ||
4262 | if (tracing_disabled) | 4745 | if (tracing_disabled) |
@@ -4266,72 +4749,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) | |||
4266 | if (!info) | 4749 | if (!info) |
4267 | return -ENOMEM; | 4750 | return -ENOMEM; |
4268 | 4751 | ||
4269 | info->tr = &global_trace; | 4752 | mutex_lock(&trace_types_lock); |
4270 | info->cpu = cpu; | 4753 | |
4271 | info->spare = NULL; | 4754 | tr->ref++; |
4755 | |||
4756 | info->iter.tr = tr; | ||
4757 | info->iter.cpu_file = tc->cpu; | ||
4758 | info->iter.trace = tr->current_trace; | ||
4759 | info->iter.trace_buffer = &tr->trace_buffer; | ||
4760 | info->spare = NULL; | ||
4272 | /* Force reading ring buffer for first read */ | 4761 | /* Force reading ring buffer for first read */ |
4273 | info->read = (unsigned int)-1; | 4762 | info->read = (unsigned int)-1; |
4274 | 4763 | ||
4275 | filp->private_data = info; | 4764 | filp->private_data = info; |
4276 | 4765 | ||
4766 | mutex_unlock(&trace_types_lock); | ||
4767 | |||
4277 | return nonseekable_open(inode, filp); | 4768 | return nonseekable_open(inode, filp); |
4278 | } | 4769 | } |
4279 | 4770 | ||
4771 | static unsigned int | ||
4772 | tracing_buffers_poll(struct file *filp, poll_table *poll_table) | ||
4773 | { | ||
4774 | struct ftrace_buffer_info *info = filp->private_data; | ||
4775 | struct trace_iterator *iter = &info->iter; | ||
4776 | |||
4777 | return trace_poll(iter, filp, poll_table); | ||
4778 | } | ||
4779 | |||
4280 | static ssize_t | 4780 | static ssize_t |
4281 | tracing_buffers_read(struct file *filp, char __user *ubuf, | 4781 | tracing_buffers_read(struct file *filp, char __user *ubuf, |
4282 | size_t count, loff_t *ppos) | 4782 | size_t count, loff_t *ppos) |
4283 | { | 4783 | { |
4284 | struct ftrace_buffer_info *info = filp->private_data; | 4784 | struct ftrace_buffer_info *info = filp->private_data; |
4785 | struct trace_iterator *iter = &info->iter; | ||
4285 | ssize_t ret; | 4786 | ssize_t ret; |
4286 | size_t size; | 4787 | ssize_t size; |
4287 | 4788 | ||
4288 | if (!count) | 4789 | if (!count) |
4289 | return 0; | 4790 | return 0; |
4290 | 4791 | ||
4792 | mutex_lock(&trace_types_lock); | ||
4793 | |||
4794 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
4795 | if (iter->snapshot && iter->tr->current_trace->use_max_tr) { | ||
4796 | size = -EBUSY; | ||
4797 | goto out_unlock; | ||
4798 | } | ||
4799 | #endif | ||
4800 | |||
4291 | if (!info->spare) | 4801 | if (!info->spare) |
4292 | info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); | 4802 | info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, |
4803 | iter->cpu_file); | ||
4804 | size = -ENOMEM; | ||
4293 | if (!info->spare) | 4805 | if (!info->spare) |
4294 | return -ENOMEM; | 4806 | goto out_unlock; |
4295 | 4807 | ||
4296 | /* Do we have previous read data to read? */ | 4808 | /* Do we have previous read data to read? */ |
4297 | if (info->read < PAGE_SIZE) | 4809 | if (info->read < PAGE_SIZE) |
4298 | goto read; | 4810 | goto read; |
4299 | 4811 | ||
4300 | trace_access_lock(info->cpu); | 4812 | again: |
4301 | ret = ring_buffer_read_page(info->tr->buffer, | 4813 | trace_access_lock(iter->cpu_file); |
4814 | ret = ring_buffer_read_page(iter->trace_buffer->buffer, | ||
4302 | &info->spare, | 4815 | &info->spare, |
4303 | count, | 4816 | count, |
4304 | info->cpu, 0); | 4817 | iter->cpu_file, 0); |
4305 | trace_access_unlock(info->cpu); | 4818 | trace_access_unlock(iter->cpu_file); |
4306 | if (ret < 0) | ||
4307 | return 0; | ||
4308 | 4819 | ||
4309 | info->read = 0; | 4820 | if (ret < 0) { |
4821 | if (trace_empty(iter)) { | ||
4822 | if ((filp->f_flags & O_NONBLOCK)) { | ||
4823 | size = -EAGAIN; | ||
4824 | goto out_unlock; | ||
4825 | } | ||
4826 | mutex_unlock(&trace_types_lock); | ||
4827 | iter->trace->wait_pipe(iter); | ||
4828 | mutex_lock(&trace_types_lock); | ||
4829 | if (signal_pending(current)) { | ||
4830 | size = -EINTR; | ||
4831 | goto out_unlock; | ||
4832 | } | ||
4833 | goto again; | ||
4834 | } | ||
4835 | size = 0; | ||
4836 | goto out_unlock; | ||
4837 | } | ||
4310 | 4838 | ||
4311 | read: | 4839 | info->read = 0; |
4840 | read: | ||
4312 | size = PAGE_SIZE - info->read; | 4841 | size = PAGE_SIZE - info->read; |
4313 | if (size > count) | 4842 | if (size > count) |
4314 | size = count; | 4843 | size = count; |
4315 | 4844 | ||
4316 | ret = copy_to_user(ubuf, info->spare + info->read, size); | 4845 | ret = copy_to_user(ubuf, info->spare + info->read, size); |
4317 | if (ret == size) | 4846 | if (ret == size) { |
4318 | return -EFAULT; | 4847 | size = -EFAULT; |
4848 | goto out_unlock; | ||
4849 | } | ||
4319 | size -= ret; | 4850 | size -= ret; |
4320 | 4851 | ||
4321 | *ppos += size; | 4852 | *ppos += size; |
4322 | info->read += size; | 4853 | info->read += size; |
4323 | 4854 | ||
4855 | out_unlock: | ||
4856 | mutex_unlock(&trace_types_lock); | ||
4857 | |||
4324 | return size; | 4858 | return size; |
4325 | } | 4859 | } |
4326 | 4860 | ||
4327 | static int tracing_buffers_release(struct inode *inode, struct file *file) | 4861 | static int tracing_buffers_release(struct inode *inode, struct file *file) |
4328 | { | 4862 | { |
4329 | struct ftrace_buffer_info *info = file->private_data; | 4863 | struct ftrace_buffer_info *info = file->private_data; |
4864 | struct trace_iterator *iter = &info->iter; | ||
4865 | |||
4866 | mutex_lock(&trace_types_lock); | ||
4867 | |||
4868 | WARN_ON(!iter->tr->ref); | ||
4869 | iter->tr->ref--; | ||
4330 | 4870 | ||
4331 | if (info->spare) | 4871 | if (info->spare) |
4332 | ring_buffer_free_read_page(info->tr->buffer, info->spare); | 4872 | ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); |
4333 | kfree(info); | 4873 | kfree(info); |
4334 | 4874 | ||
4875 | mutex_unlock(&trace_types_lock); | ||
4876 | |||
4335 | return 0; | 4877 | return 0; |
4336 | } | 4878 | } |
4337 | 4879 | ||
@@ -4396,6 +4938,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4396 | unsigned int flags) | 4938 | unsigned int flags) |
4397 | { | 4939 | { |
4398 | struct ftrace_buffer_info *info = file->private_data; | 4940 | struct ftrace_buffer_info *info = file->private_data; |
4941 | struct trace_iterator *iter = &info->iter; | ||
4399 | struct partial_page partial_def[PIPE_DEF_BUFFERS]; | 4942 | struct partial_page partial_def[PIPE_DEF_BUFFERS]; |
4400 | struct page *pages_def[PIPE_DEF_BUFFERS]; | 4943 | struct page *pages_def[PIPE_DEF_BUFFERS]; |
4401 | struct splice_pipe_desc spd = { | 4944 | struct splice_pipe_desc spd = { |
@@ -4408,10 +4951,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4408 | }; | 4951 | }; |
4409 | struct buffer_ref *ref; | 4952 | struct buffer_ref *ref; |
4410 | int entries, size, i; | 4953 | int entries, size, i; |
4411 | size_t ret; | 4954 | ssize_t ret; |
4412 | 4955 | ||
4413 | if (splice_grow_spd(pipe, &spd)) | 4956 | mutex_lock(&trace_types_lock); |
4414 | return -ENOMEM; | 4957 | |
4958 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
4959 | if (iter->snapshot && iter->tr->current_trace->use_max_tr) { | ||
4960 | ret = -EBUSY; | ||
4961 | goto out; | ||
4962 | } | ||
4963 | #endif | ||
4964 | |||
4965 | if (splice_grow_spd(pipe, &spd)) { | ||
4966 | ret = -ENOMEM; | ||
4967 | goto out; | ||
4968 | } | ||
4415 | 4969 | ||
4416 | if (*ppos & (PAGE_SIZE - 1)) { | 4970 | if (*ppos & (PAGE_SIZE - 1)) { |
4417 | ret = -EINVAL; | 4971 | ret = -EINVAL; |
@@ -4426,8 +4980,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4426 | len &= PAGE_MASK; | 4980 | len &= PAGE_MASK; |
4427 | } | 4981 | } |
4428 | 4982 | ||
4429 | trace_access_lock(info->cpu); | 4983 | again: |
4430 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 4984 | trace_access_lock(iter->cpu_file); |
4985 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); | ||
4431 | 4986 | ||
4432 | for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { | 4987 | for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { |
4433 | struct page *page; | 4988 | struct page *page; |
@@ -4438,15 +4993,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4438 | break; | 4993 | break; |
4439 | 4994 | ||
4440 | ref->ref = 1; | 4995 | ref->ref = 1; |
4441 | ref->buffer = info->tr->buffer; | 4996 | ref->buffer = iter->trace_buffer->buffer; |
4442 | ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); | 4997 | ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); |
4443 | if (!ref->page) { | 4998 | if (!ref->page) { |
4444 | kfree(ref); | 4999 | kfree(ref); |
4445 | break; | 5000 | break; |
4446 | } | 5001 | } |
4447 | 5002 | ||
4448 | r = ring_buffer_read_page(ref->buffer, &ref->page, | 5003 | r = ring_buffer_read_page(ref->buffer, &ref->page, |
4449 | len, info->cpu, 1); | 5004 | len, iter->cpu_file, 1); |
4450 | if (r < 0) { | 5005 | if (r < 0) { |
4451 | ring_buffer_free_read_page(ref->buffer, ref->page); | 5006 | ring_buffer_free_read_page(ref->buffer, ref->page); |
4452 | kfree(ref); | 5007 | kfree(ref); |
@@ -4470,31 +5025,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4470 | spd.nr_pages++; | 5025 | spd.nr_pages++; |
4471 | *ppos += PAGE_SIZE; | 5026 | *ppos += PAGE_SIZE; |
4472 | 5027 | ||
4473 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 5028 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); |
4474 | } | 5029 | } |
4475 | 5030 | ||
4476 | trace_access_unlock(info->cpu); | 5031 | trace_access_unlock(iter->cpu_file); |
4477 | spd.nr_pages = i; | 5032 | spd.nr_pages = i; |
4478 | 5033 | ||
4479 | /* did we read anything? */ | 5034 | /* did we read anything? */ |
4480 | if (!spd.nr_pages) { | 5035 | if (!spd.nr_pages) { |
4481 | if (flags & SPLICE_F_NONBLOCK) | 5036 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { |
4482 | ret = -EAGAIN; | 5037 | ret = -EAGAIN; |
4483 | else | 5038 | goto out; |
4484 | ret = 0; | 5039 | } |
4485 | /* TODO: block */ | 5040 | mutex_unlock(&trace_types_lock); |
4486 | goto out; | 5041 | iter->trace->wait_pipe(iter); |
5042 | mutex_lock(&trace_types_lock); | ||
5043 | if (signal_pending(current)) { | ||
5044 | ret = -EINTR; | ||
5045 | goto out; | ||
5046 | } | ||
5047 | goto again; | ||
4487 | } | 5048 | } |
4488 | 5049 | ||
4489 | ret = splice_to_pipe(pipe, &spd); | 5050 | ret = splice_to_pipe(pipe, &spd); |
4490 | splice_shrink_spd(&spd); | 5051 | splice_shrink_spd(&spd); |
4491 | out: | 5052 | out: |
5053 | mutex_unlock(&trace_types_lock); | ||
5054 | |||
4492 | return ret; | 5055 | return ret; |
4493 | } | 5056 | } |
4494 | 5057 | ||
4495 | static const struct file_operations tracing_buffers_fops = { | 5058 | static const struct file_operations tracing_buffers_fops = { |
4496 | .open = tracing_buffers_open, | 5059 | .open = tracing_buffers_open, |
4497 | .read = tracing_buffers_read, | 5060 | .read = tracing_buffers_read, |
5061 | .poll = tracing_buffers_poll, | ||
4498 | .release = tracing_buffers_release, | 5062 | .release = tracing_buffers_release, |
4499 | .splice_read = tracing_buffers_splice_read, | 5063 | .splice_read = tracing_buffers_splice_read, |
4500 | .llseek = no_llseek, | 5064 | .llseek = no_llseek, |
@@ -4504,12 +5068,14 @@ static ssize_t | |||
4504 | tracing_stats_read(struct file *filp, char __user *ubuf, | 5068 | tracing_stats_read(struct file *filp, char __user *ubuf, |
4505 | size_t count, loff_t *ppos) | 5069 | size_t count, loff_t *ppos) |
4506 | { | 5070 | { |
4507 | unsigned long cpu = (unsigned long)filp->private_data; | 5071 | struct trace_cpu *tc = filp->private_data; |
4508 | struct trace_array *tr = &global_trace; | 5072 | struct trace_array *tr = tc->tr; |
5073 | struct trace_buffer *trace_buf = &tr->trace_buffer; | ||
4509 | struct trace_seq *s; | 5074 | struct trace_seq *s; |
4510 | unsigned long cnt; | 5075 | unsigned long cnt; |
4511 | unsigned long long t; | 5076 | unsigned long long t; |
4512 | unsigned long usec_rem; | 5077 | unsigned long usec_rem; |
5078 | int cpu = tc->cpu; | ||
4513 | 5079 | ||
4514 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 5080 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
4515 | if (!s) | 5081 | if (!s) |
@@ -4517,41 +5083,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4517 | 5083 | ||
4518 | trace_seq_init(s); | 5084 | trace_seq_init(s); |
4519 | 5085 | ||
4520 | cnt = ring_buffer_entries_cpu(tr->buffer, cpu); | 5086 | cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); |
4521 | trace_seq_printf(s, "entries: %ld\n", cnt); | 5087 | trace_seq_printf(s, "entries: %ld\n", cnt); |
4522 | 5088 | ||
4523 | cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); | 5089 | cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); |
4524 | trace_seq_printf(s, "overrun: %ld\n", cnt); | 5090 | trace_seq_printf(s, "overrun: %ld\n", cnt); |
4525 | 5091 | ||
4526 | cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); | 5092 | cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); |
4527 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); | 5093 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); |
4528 | 5094 | ||
4529 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); | 5095 | cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); |
4530 | trace_seq_printf(s, "bytes: %ld\n", cnt); | 5096 | trace_seq_printf(s, "bytes: %ld\n", cnt); |
4531 | 5097 | ||
4532 | if (trace_clocks[trace_clock_id].in_ns) { | 5098 | if (trace_clocks[trace_clock_id].in_ns) { |
4533 | /* local or global for trace_clock */ | 5099 | /* local or global for trace_clock */ |
4534 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); | 5100 | t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); |
4535 | usec_rem = do_div(t, USEC_PER_SEC); | 5101 | usec_rem = do_div(t, USEC_PER_SEC); |
4536 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", | 5102 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", |
4537 | t, usec_rem); | 5103 | t, usec_rem); |
4538 | 5104 | ||
4539 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | 5105 | t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); |
4540 | usec_rem = do_div(t, USEC_PER_SEC); | 5106 | usec_rem = do_div(t, USEC_PER_SEC); |
4541 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | 5107 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); |
4542 | } else { | 5108 | } else { |
4543 | /* counter or tsc mode for trace_clock */ | 5109 | /* counter or tsc mode for trace_clock */ |
4544 | trace_seq_printf(s, "oldest event ts: %llu\n", | 5110 | trace_seq_printf(s, "oldest event ts: %llu\n", |
4545 | ring_buffer_oldest_event_ts(tr->buffer, cpu)); | 5111 | ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); |
4546 | 5112 | ||
4547 | trace_seq_printf(s, "now ts: %llu\n", | 5113 | trace_seq_printf(s, "now ts: %llu\n", |
4548 | ring_buffer_time_stamp(tr->buffer, cpu)); | 5114 | ring_buffer_time_stamp(trace_buf->buffer, cpu)); |
4549 | } | 5115 | } |
4550 | 5116 | ||
4551 | cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); | 5117 | cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); |
4552 | trace_seq_printf(s, "dropped events: %ld\n", cnt); | 5118 | trace_seq_printf(s, "dropped events: %ld\n", cnt); |
4553 | 5119 | ||
4554 | cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); | 5120 | cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); |
4555 | trace_seq_printf(s, "read events: %ld\n", cnt); | 5121 | trace_seq_printf(s, "read events: %ld\n", cnt); |
4556 | 5122 | ||
4557 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 5123 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
@@ -4603,60 +5169,161 @@ static const struct file_operations tracing_dyn_info_fops = { | |||
4603 | .read = tracing_read_dyn_info, | 5169 | .read = tracing_read_dyn_info, |
4604 | .llseek = generic_file_llseek, | 5170 | .llseek = generic_file_llseek, |
4605 | }; | 5171 | }; |
4606 | #endif | 5172 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
4607 | 5173 | ||
4608 | static struct dentry *d_tracer; | 5174 | #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) |
5175 | static void | ||
5176 | ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) | ||
5177 | { | ||
5178 | tracing_snapshot(); | ||
5179 | } | ||
4609 | 5180 | ||
4610 | struct dentry *tracing_init_dentry(void) | 5181 | static void |
5182 | ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) | ||
5183 | { | ||
5184 | unsigned long *count = (long *)data; | ||
5185 | |||
5186 | if (!*count) | ||
5187 | return; | ||
5188 | |||
5189 | if (*count != -1) | ||
5190 | (*count)--; | ||
5191 | |||
5192 | tracing_snapshot(); | ||
5193 | } | ||
5194 | |||
5195 | static int | ||
5196 | ftrace_snapshot_print(struct seq_file *m, unsigned long ip, | ||
5197 | struct ftrace_probe_ops *ops, void *data) | ||
5198 | { | ||
5199 | long count = (long)data; | ||
5200 | |||
5201 | seq_printf(m, "%ps:", (void *)ip); | ||
5202 | |||
5203 | seq_printf(m, "snapshot"); | ||
5204 | |||
5205 | if (count == -1) | ||
5206 | seq_printf(m, ":unlimited\n"); | ||
5207 | else | ||
5208 | seq_printf(m, ":count=%ld\n", count); | ||
5209 | |||
5210 | return 0; | ||
5211 | } | ||
5212 | |||
5213 | static struct ftrace_probe_ops snapshot_probe_ops = { | ||
5214 | .func = ftrace_snapshot, | ||
5215 | .print = ftrace_snapshot_print, | ||
5216 | }; | ||
5217 | |||
5218 | static struct ftrace_probe_ops snapshot_count_probe_ops = { | ||
5219 | .func = ftrace_count_snapshot, | ||
5220 | .print = ftrace_snapshot_print, | ||
5221 | }; | ||
5222 | |||
5223 | static int | ||
5224 | ftrace_trace_snapshot_callback(struct ftrace_hash *hash, | ||
5225 | char *glob, char *cmd, char *param, int enable) | ||
5226 | { | ||
5227 | struct ftrace_probe_ops *ops; | ||
5228 | void *count = (void *)-1; | ||
5229 | char *number; | ||
5230 | int ret; | ||
5231 | |||
5232 | /* hash funcs only work with set_ftrace_filter */ | ||
5233 | if (!enable) | ||
5234 | return -EINVAL; | ||
5235 | |||
5236 | ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; | ||
5237 | |||
5238 | if (glob[0] == '!') { | ||
5239 | unregister_ftrace_function_probe_func(glob+1, ops); | ||
5240 | return 0; | ||
5241 | } | ||
5242 | |||
5243 | if (!param) | ||
5244 | goto out_reg; | ||
5245 | |||
5246 | number = strsep(&param, ":"); | ||
5247 | |||
5248 | if (!strlen(number)) | ||
5249 | goto out_reg; | ||
5250 | |||
5251 | /* | ||
5252 | * We use the callback data field (which is a pointer) | ||
5253 | * as our counter. | ||
5254 | */ | ||
5255 | ret = kstrtoul(number, 0, (unsigned long *)&count); | ||
5256 | if (ret) | ||
5257 | return ret; | ||
5258 | |||
5259 | out_reg: | ||
5260 | ret = register_ftrace_function_probe(glob, ops, count); | ||
5261 | |||
5262 | if (ret >= 0) | ||
5263 | alloc_snapshot(&global_trace); | ||
5264 | |||
5265 | return ret < 0 ? ret : 0; | ||
5266 | } | ||
5267 | |||
5268 | static struct ftrace_func_command ftrace_snapshot_cmd = { | ||
5269 | .name = "snapshot", | ||
5270 | .func = ftrace_trace_snapshot_callback, | ||
5271 | }; | ||
5272 | |||
5273 | static int register_snapshot_cmd(void) | ||
4611 | { | 5274 | { |
4612 | static int once; | 5275 | return register_ftrace_command(&ftrace_snapshot_cmd); |
5276 | } | ||
5277 | #else | ||
5278 | static inline int register_snapshot_cmd(void) { return 0; } | ||
5279 | #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ | ||
4613 | 5280 | ||
4614 | if (d_tracer) | 5281 | struct dentry *tracing_init_dentry_tr(struct trace_array *tr) |
4615 | return d_tracer; | 5282 | { |
5283 | if (tr->dir) | ||
5284 | return tr->dir; | ||
4616 | 5285 | ||
4617 | if (!debugfs_initialized()) | 5286 | if (!debugfs_initialized()) |
4618 | return NULL; | 5287 | return NULL; |
4619 | 5288 | ||
4620 | d_tracer = debugfs_create_dir("tracing", NULL); | 5289 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) |
5290 | tr->dir = debugfs_create_dir("tracing", NULL); | ||
4621 | 5291 | ||
4622 | if (!d_tracer && !once) { | 5292 | if (!tr->dir) |
4623 | once = 1; | 5293 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); |
4624 | pr_warning("Could not create debugfs directory 'tracing'\n"); | ||
4625 | return NULL; | ||
4626 | } | ||
4627 | 5294 | ||
4628 | return d_tracer; | 5295 | return tr->dir; |
4629 | } | 5296 | } |
4630 | 5297 | ||
4631 | static struct dentry *d_percpu; | 5298 | struct dentry *tracing_init_dentry(void) |
5299 | { | ||
5300 | return tracing_init_dentry_tr(&global_trace); | ||
5301 | } | ||
4632 | 5302 | ||
4633 | static struct dentry *tracing_dentry_percpu(void) | 5303 | static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) |
4634 | { | 5304 | { |
4635 | static int once; | ||
4636 | struct dentry *d_tracer; | 5305 | struct dentry *d_tracer; |
4637 | 5306 | ||
4638 | if (d_percpu) | 5307 | if (tr->percpu_dir) |
4639 | return d_percpu; | 5308 | return tr->percpu_dir; |
4640 | |||
4641 | d_tracer = tracing_init_dentry(); | ||
4642 | 5309 | ||
5310 | d_tracer = tracing_init_dentry_tr(tr); | ||
4643 | if (!d_tracer) | 5311 | if (!d_tracer) |
4644 | return NULL; | 5312 | return NULL; |
4645 | 5313 | ||
4646 | d_percpu = debugfs_create_dir("per_cpu", d_tracer); | 5314 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); |
4647 | 5315 | ||
4648 | if (!d_percpu && !once) { | 5316 | WARN_ONCE(!tr->percpu_dir, |
4649 | once = 1; | 5317 | "Could not create debugfs directory 'per_cpu/%d'\n", cpu); |
4650 | pr_warning("Could not create debugfs directory 'per_cpu'\n"); | ||
4651 | return NULL; | ||
4652 | } | ||
4653 | 5318 | ||
4654 | return d_percpu; | 5319 | return tr->percpu_dir; |
4655 | } | 5320 | } |
4656 | 5321 | ||
4657 | static void tracing_init_debugfs_percpu(long cpu) | 5322 | static void |
5323 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | ||
4658 | { | 5324 | { |
4659 | struct dentry *d_percpu = tracing_dentry_percpu(); | 5325 | struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
5326 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | ||
4660 | struct dentry *d_cpu; | 5327 | struct dentry *d_cpu; |
4661 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 5328 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4662 | 5329 | ||
@@ -4672,20 +5339,28 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4672 | 5339 | ||
4673 | /* per cpu trace_pipe */ | 5340 | /* per cpu trace_pipe */ |
4674 | trace_create_file("trace_pipe", 0444, d_cpu, | 5341 | trace_create_file("trace_pipe", 0444, d_cpu, |
4675 | (void *) cpu, &tracing_pipe_fops); | 5342 | (void *)&data->trace_cpu, &tracing_pipe_fops); |
4676 | 5343 | ||
4677 | /* per cpu trace */ | 5344 | /* per cpu trace */ |
4678 | trace_create_file("trace", 0644, d_cpu, | 5345 | trace_create_file("trace", 0644, d_cpu, |
4679 | (void *) cpu, &tracing_fops); | 5346 | (void *)&data->trace_cpu, &tracing_fops); |
4680 | 5347 | ||
4681 | trace_create_file("trace_pipe_raw", 0444, d_cpu, | 5348 | trace_create_file("trace_pipe_raw", 0444, d_cpu, |
4682 | (void *) cpu, &tracing_buffers_fops); | 5349 | (void *)&data->trace_cpu, &tracing_buffers_fops); |
4683 | 5350 | ||
4684 | trace_create_file("stats", 0444, d_cpu, | 5351 | trace_create_file("stats", 0444, d_cpu, |
4685 | (void *) cpu, &tracing_stats_fops); | 5352 | (void *)&data->trace_cpu, &tracing_stats_fops); |
4686 | 5353 | ||
4687 | trace_create_file("buffer_size_kb", 0444, d_cpu, | 5354 | trace_create_file("buffer_size_kb", 0444, d_cpu, |
4688 | (void *) cpu, &tracing_entries_fops); | 5355 | (void *)&data->trace_cpu, &tracing_entries_fops); |
5356 | |||
5357 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
5358 | trace_create_file("snapshot", 0644, d_cpu, | ||
5359 | (void *)&data->trace_cpu, &snapshot_fops); | ||
5360 | |||
5361 | trace_create_file("snapshot_raw", 0444, d_cpu, | ||
5362 | (void *)&data->trace_cpu, &snapshot_raw_fops); | ||
5363 | #endif | ||
4689 | } | 5364 | } |
4690 | 5365 | ||
4691 | #ifdef CONFIG_FTRACE_SELFTEST | 5366 | #ifdef CONFIG_FTRACE_SELFTEST |
@@ -4696,6 +5371,7 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4696 | struct trace_option_dentry { | 5371 | struct trace_option_dentry { |
4697 | struct tracer_opt *opt; | 5372 | struct tracer_opt *opt; |
4698 | struct tracer_flags *flags; | 5373 | struct tracer_flags *flags; |
5374 | struct trace_array *tr; | ||
4699 | struct dentry *entry; | 5375 | struct dentry *entry; |
4700 | }; | 5376 | }; |
4701 | 5377 | ||
@@ -4731,7 +5407,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
4731 | 5407 | ||
4732 | if (!!(topt->flags->val & topt->opt->bit) != val) { | 5408 | if (!!(topt->flags->val & topt->opt->bit) != val) { |
4733 | mutex_lock(&trace_types_lock); | 5409 | mutex_lock(&trace_types_lock); |
4734 | ret = __set_tracer_option(current_trace, topt->flags, | 5410 | ret = __set_tracer_option(topt->tr->current_trace, topt->flags, |
4735 | topt->opt, !val); | 5411 | topt->opt, !val); |
4736 | mutex_unlock(&trace_types_lock); | 5412 | mutex_unlock(&trace_types_lock); |
4737 | if (ret) | 5413 | if (ret) |
@@ -4770,6 +5446,7 @@ static ssize_t | |||
4770 | trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, | 5446 | trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, |
4771 | loff_t *ppos) | 5447 | loff_t *ppos) |
4772 | { | 5448 | { |
5449 | struct trace_array *tr = &global_trace; | ||
4773 | long index = (long)filp->private_data; | 5450 | long index = (long)filp->private_data; |
4774 | unsigned long val; | 5451 | unsigned long val; |
4775 | int ret; | 5452 | int ret; |
@@ -4780,7 +5457,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
4780 | 5457 | ||
4781 | if (val != 0 && val != 1) | 5458 | if (val != 0 && val != 1) |
4782 | return -EINVAL; | 5459 | return -EINVAL; |
4783 | set_tracer_flags(1 << index, val); | 5460 | |
5461 | mutex_lock(&trace_types_lock); | ||
5462 | ret = set_tracer_flag(tr, 1 << index, val); | ||
5463 | mutex_unlock(&trace_types_lock); | ||
5464 | |||
5465 | if (ret < 0) | ||
5466 | return ret; | ||
4784 | 5467 | ||
4785 | *ppos += cnt; | 5468 | *ppos += cnt; |
4786 | 5469 | ||
@@ -4810,40 +5493,41 @@ struct dentry *trace_create_file(const char *name, | |||
4810 | } | 5493 | } |
4811 | 5494 | ||
4812 | 5495 | ||
4813 | static struct dentry *trace_options_init_dentry(void) | 5496 | static struct dentry *trace_options_init_dentry(struct trace_array *tr) |
4814 | { | 5497 | { |
4815 | struct dentry *d_tracer; | 5498 | struct dentry *d_tracer; |
4816 | static struct dentry *t_options; | ||
4817 | 5499 | ||
4818 | if (t_options) | 5500 | if (tr->options) |
4819 | return t_options; | 5501 | return tr->options; |
4820 | 5502 | ||
4821 | d_tracer = tracing_init_dentry(); | 5503 | d_tracer = tracing_init_dentry_tr(tr); |
4822 | if (!d_tracer) | 5504 | if (!d_tracer) |
4823 | return NULL; | 5505 | return NULL; |
4824 | 5506 | ||
4825 | t_options = debugfs_create_dir("options", d_tracer); | 5507 | tr->options = debugfs_create_dir("options", d_tracer); |
4826 | if (!t_options) { | 5508 | if (!tr->options) { |
4827 | pr_warning("Could not create debugfs directory 'options'\n"); | 5509 | pr_warning("Could not create debugfs directory 'options'\n"); |
4828 | return NULL; | 5510 | return NULL; |
4829 | } | 5511 | } |
4830 | 5512 | ||
4831 | return t_options; | 5513 | return tr->options; |
4832 | } | 5514 | } |
4833 | 5515 | ||
4834 | static void | 5516 | static void |
4835 | create_trace_option_file(struct trace_option_dentry *topt, | 5517 | create_trace_option_file(struct trace_array *tr, |
5518 | struct trace_option_dentry *topt, | ||
4836 | struct tracer_flags *flags, | 5519 | struct tracer_flags *flags, |
4837 | struct tracer_opt *opt) | 5520 | struct tracer_opt *opt) |
4838 | { | 5521 | { |
4839 | struct dentry *t_options; | 5522 | struct dentry *t_options; |
4840 | 5523 | ||
4841 | t_options = trace_options_init_dentry(); | 5524 | t_options = trace_options_init_dentry(tr); |
4842 | if (!t_options) | 5525 | if (!t_options) |
4843 | return; | 5526 | return; |
4844 | 5527 | ||
4845 | topt->flags = flags; | 5528 | topt->flags = flags; |
4846 | topt->opt = opt; | 5529 | topt->opt = opt; |
5530 | topt->tr = tr; | ||
4847 | 5531 | ||
4848 | topt->entry = trace_create_file(opt->name, 0644, t_options, topt, | 5532 | topt->entry = trace_create_file(opt->name, 0644, t_options, topt, |
4849 | &trace_options_fops); | 5533 | &trace_options_fops); |
@@ -4851,7 +5535,7 @@ create_trace_option_file(struct trace_option_dentry *topt, | |||
4851 | } | 5535 | } |
4852 | 5536 | ||
4853 | static struct trace_option_dentry * | 5537 | static struct trace_option_dentry * |
4854 | create_trace_option_files(struct tracer *tracer) | 5538 | create_trace_option_files(struct trace_array *tr, struct tracer *tracer) |
4855 | { | 5539 | { |
4856 | struct trace_option_dentry *topts; | 5540 | struct trace_option_dentry *topts; |
4857 | struct tracer_flags *flags; | 5541 | struct tracer_flags *flags; |
@@ -4876,7 +5560,7 @@ create_trace_option_files(struct tracer *tracer) | |||
4876 | return NULL; | 5560 | return NULL; |
4877 | 5561 | ||
4878 | for (cnt = 0; opts[cnt].name; cnt++) | 5562 | for (cnt = 0; opts[cnt].name; cnt++) |
4879 | create_trace_option_file(&topts[cnt], flags, | 5563 | create_trace_option_file(tr, &topts[cnt], flags, |
4880 | &opts[cnt]); | 5564 | &opts[cnt]); |
4881 | 5565 | ||
4882 | return topts; | 5566 | return topts; |
@@ -4899,11 +5583,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts) | |||
4899 | } | 5583 | } |
4900 | 5584 | ||
4901 | static struct dentry * | 5585 | static struct dentry * |
4902 | create_trace_option_core_file(const char *option, long index) | 5586 | create_trace_option_core_file(struct trace_array *tr, |
5587 | const char *option, long index) | ||
4903 | { | 5588 | { |
4904 | struct dentry *t_options; | 5589 | struct dentry *t_options; |
4905 | 5590 | ||
4906 | t_options = trace_options_init_dentry(); | 5591 | t_options = trace_options_init_dentry(tr); |
4907 | if (!t_options) | 5592 | if (!t_options) |
4908 | return NULL; | 5593 | return NULL; |
4909 | 5594 | ||
@@ -4911,17 +5596,17 @@ create_trace_option_core_file(const char *option, long index) | |||
4911 | &trace_options_core_fops); | 5596 | &trace_options_core_fops); |
4912 | } | 5597 | } |
4913 | 5598 | ||
4914 | static __init void create_trace_options_dir(void) | 5599 | static __init void create_trace_options_dir(struct trace_array *tr) |
4915 | { | 5600 | { |
4916 | struct dentry *t_options; | 5601 | struct dentry *t_options; |
4917 | int i; | 5602 | int i; |
4918 | 5603 | ||
4919 | t_options = trace_options_init_dentry(); | 5604 | t_options = trace_options_init_dentry(tr); |
4920 | if (!t_options) | 5605 | if (!t_options) |
4921 | return; | 5606 | return; |
4922 | 5607 | ||
4923 | for (i = 0; trace_options[i]; i++) | 5608 | for (i = 0; trace_options[i]; i++) |
4924 | create_trace_option_core_file(trace_options[i], i); | 5609 | create_trace_option_core_file(tr, trace_options[i], i); |
4925 | } | 5610 | } |
4926 | 5611 | ||
4927 | static ssize_t | 5612 | static ssize_t |
@@ -4929,7 +5614,7 @@ rb_simple_read(struct file *filp, char __user *ubuf, | |||
4929 | size_t cnt, loff_t *ppos) | 5614 | size_t cnt, loff_t *ppos) |
4930 | { | 5615 | { |
4931 | struct trace_array *tr = filp->private_data; | 5616 | struct trace_array *tr = filp->private_data; |
4932 | struct ring_buffer *buffer = tr->buffer; | 5617 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
4933 | char buf[64]; | 5618 | char buf[64]; |
4934 | int r; | 5619 | int r; |
4935 | 5620 | ||
@@ -4948,7 +5633,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, | |||
4948 | size_t cnt, loff_t *ppos) | 5633 | size_t cnt, loff_t *ppos) |
4949 | { | 5634 | { |
4950 | struct trace_array *tr = filp->private_data; | 5635 | struct trace_array *tr = filp->private_data; |
4951 | struct ring_buffer *buffer = tr->buffer; | 5636 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
4952 | unsigned long val; | 5637 | unsigned long val; |
4953 | int ret; | 5638 | int ret; |
4954 | 5639 | ||
@@ -4960,12 +5645,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf, | |||
4960 | mutex_lock(&trace_types_lock); | 5645 | mutex_lock(&trace_types_lock); |
4961 | if (val) { | 5646 | if (val) { |
4962 | ring_buffer_record_on(buffer); | 5647 | ring_buffer_record_on(buffer); |
4963 | if (current_trace->start) | 5648 | if (tr->current_trace->start) |
4964 | current_trace->start(tr); | 5649 | tr->current_trace->start(tr); |
4965 | } else { | 5650 | } else { |
4966 | ring_buffer_record_off(buffer); | 5651 | ring_buffer_record_off(buffer); |
4967 | if (current_trace->stop) | 5652 | if (tr->current_trace->stop) |
4968 | current_trace->stop(tr); | 5653 | tr->current_trace->stop(tr); |
4969 | } | 5654 | } |
4970 | mutex_unlock(&trace_types_lock); | 5655 | mutex_unlock(&trace_types_lock); |
4971 | } | 5656 | } |
@@ -4982,23 +5667,310 @@ static const struct file_operations rb_simple_fops = { | |||
4982 | .llseek = default_llseek, | 5667 | .llseek = default_llseek, |
4983 | }; | 5668 | }; |
4984 | 5669 | ||
5670 | struct dentry *trace_instance_dir; | ||
5671 | |||
5672 | static void | ||
5673 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); | ||
5674 | |||
5675 | static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) | ||
5676 | { | ||
5677 | int cpu; | ||
5678 | |||
5679 | for_each_tracing_cpu(cpu) { | ||
5680 | memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); | ||
5681 | per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; | ||
5682 | per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; | ||
5683 | } | ||
5684 | } | ||
5685 | |||
5686 | static int | ||
5687 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | ||
5688 | { | ||
5689 | enum ring_buffer_flags rb_flags; | ||
5690 | |||
5691 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; | ||
5692 | |||
5693 | buf->buffer = ring_buffer_alloc(size, rb_flags); | ||
5694 | if (!buf->buffer) | ||
5695 | return -ENOMEM; | ||
5696 | |||
5697 | buf->data = alloc_percpu(struct trace_array_cpu); | ||
5698 | if (!buf->data) { | ||
5699 | ring_buffer_free(buf->buffer); | ||
5700 | return -ENOMEM; | ||
5701 | } | ||
5702 | |||
5703 | init_trace_buffers(tr, buf); | ||
5704 | |||
5705 | /* Allocate the first page for all buffers */ | ||
5706 | set_buffer_entries(&tr->trace_buffer, | ||
5707 | ring_buffer_size(tr->trace_buffer.buffer, 0)); | ||
5708 | |||
5709 | return 0; | ||
5710 | } | ||
5711 | |||
5712 | static int allocate_trace_buffers(struct trace_array *tr, int size) | ||
5713 | { | ||
5714 | int ret; | ||
5715 | |||
5716 | ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); | ||
5717 | if (ret) | ||
5718 | return ret; | ||
5719 | |||
5720 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5721 | ret = allocate_trace_buffer(tr, &tr->max_buffer, | ||
5722 | allocate_snapshot ? size : 1); | ||
5723 | if (WARN_ON(ret)) { | ||
5724 | ring_buffer_free(tr->trace_buffer.buffer); | ||
5725 | free_percpu(tr->trace_buffer.data); | ||
5726 | return -ENOMEM; | ||
5727 | } | ||
5728 | tr->allocated_snapshot = allocate_snapshot; | ||
5729 | |||
5730 | /* | ||
5731 | * Only the top level trace array gets its snapshot allocated | ||
5732 | * from the kernel command line. | ||
5733 | */ | ||
5734 | allocate_snapshot = false; | ||
5735 | #endif | ||
5736 | return 0; | ||
5737 | } | ||
5738 | |||
5739 | static int new_instance_create(const char *name) | ||
5740 | { | ||
5741 | struct trace_array *tr; | ||
5742 | int ret; | ||
5743 | |||
5744 | mutex_lock(&trace_types_lock); | ||
5745 | |||
5746 | ret = -EEXIST; | ||
5747 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
5748 | if (tr->name && strcmp(tr->name, name) == 0) | ||
5749 | goto out_unlock; | ||
5750 | } | ||
5751 | |||
5752 | ret = -ENOMEM; | ||
5753 | tr = kzalloc(sizeof(*tr), GFP_KERNEL); | ||
5754 | if (!tr) | ||
5755 | goto out_unlock; | ||
5756 | |||
5757 | tr->name = kstrdup(name, GFP_KERNEL); | ||
5758 | if (!tr->name) | ||
5759 | goto out_free_tr; | ||
5760 | |||
5761 | raw_spin_lock_init(&tr->start_lock); | ||
5762 | |||
5763 | tr->current_trace = &nop_trace; | ||
5764 | |||
5765 | INIT_LIST_HEAD(&tr->systems); | ||
5766 | INIT_LIST_HEAD(&tr->events); | ||
5767 | |||
5768 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | ||
5769 | goto out_free_tr; | ||
5770 | |||
5771 | /* Holder for file callbacks */ | ||
5772 | tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; | ||
5773 | tr->trace_cpu.tr = tr; | ||
5774 | |||
5775 | tr->dir = debugfs_create_dir(name, trace_instance_dir); | ||
5776 | if (!tr->dir) | ||
5777 | goto out_free_tr; | ||
5778 | |||
5779 | ret = event_trace_add_tracer(tr->dir, tr); | ||
5780 | if (ret) | ||
5781 | goto out_free_tr; | ||
5782 | |||
5783 | init_tracer_debugfs(tr, tr->dir); | ||
5784 | |||
5785 | list_add(&tr->list, &ftrace_trace_arrays); | ||
5786 | |||
5787 | mutex_unlock(&trace_types_lock); | ||
5788 | |||
5789 | return 0; | ||
5790 | |||
5791 | out_free_tr: | ||
5792 | if (tr->trace_buffer.buffer) | ||
5793 | ring_buffer_free(tr->trace_buffer.buffer); | ||
5794 | kfree(tr->name); | ||
5795 | kfree(tr); | ||
5796 | |||
5797 | out_unlock: | ||
5798 | mutex_unlock(&trace_types_lock); | ||
5799 | |||
5800 | return ret; | ||
5801 | |||
5802 | } | ||
5803 | |||
5804 | static int instance_delete(const char *name) | ||
5805 | { | ||
5806 | struct trace_array *tr; | ||
5807 | int found = 0; | ||
5808 | int ret; | ||
5809 | |||
5810 | mutex_lock(&trace_types_lock); | ||
5811 | |||
5812 | ret = -ENODEV; | ||
5813 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
5814 | if (tr->name && strcmp(tr->name, name) == 0) { | ||
5815 | found = 1; | ||
5816 | break; | ||
5817 | } | ||
5818 | } | ||
5819 | if (!found) | ||
5820 | goto out_unlock; | ||
5821 | |||
5822 | ret = -EBUSY; | ||
5823 | if (tr->ref) | ||
5824 | goto out_unlock; | ||
5825 | |||
5826 | list_del(&tr->list); | ||
5827 | |||
5828 | event_trace_del_tracer(tr); | ||
5829 | debugfs_remove_recursive(tr->dir); | ||
5830 | free_percpu(tr->trace_buffer.data); | ||
5831 | ring_buffer_free(tr->trace_buffer.buffer); | ||
5832 | |||
5833 | kfree(tr->name); | ||
5834 | kfree(tr); | ||
5835 | |||
5836 | ret = 0; | ||
5837 | |||
5838 | out_unlock: | ||
5839 | mutex_unlock(&trace_types_lock); | ||
5840 | |||
5841 | return ret; | ||
5842 | } | ||
5843 | |||
5844 | static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) | ||
5845 | { | ||
5846 | struct dentry *parent; | ||
5847 | int ret; | ||
5848 | |||
5849 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
5850 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); | ||
5851 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
5852 | return -ENOENT; | ||
5853 | |||
5854 | /* | ||
5855 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
5856 | * take the mutex. As the instances directory can not be destroyed | ||
5857 | * or changed in any other way, it is safe to unlock it, and | ||
5858 | * let the dentry try. If two users try to make the same dir at | ||
5859 | * the same time, then the new_instance_create() will determine the | ||
5860 | * winner. | ||
5861 | */ | ||
5862 | mutex_unlock(&inode->i_mutex); | ||
5863 | |||
5864 | ret = new_instance_create(dentry->d_iname); | ||
5865 | |||
5866 | mutex_lock(&inode->i_mutex); | ||
5867 | |||
5868 | return ret; | ||
5869 | } | ||
5870 | |||
5871 | static int instance_rmdir(struct inode *inode, struct dentry *dentry) | ||
5872 | { | ||
5873 | struct dentry *parent; | ||
5874 | int ret; | ||
5875 | |||
5876 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
5877 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); | ||
5878 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
5879 | return -ENOENT; | ||
5880 | |||
5881 | /* The caller did a dget() on dentry */ | ||
5882 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
5883 | |||
5884 | /* | ||
5885 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
5886 | * take the mutex. As the instances directory can not be destroyed | ||
5887 | * or changed in any other way, it is safe to unlock it, and | ||
5888 | * let the dentry try. If two users try to make the same dir at | ||
5889 | * the same time, then the instance_delete() will determine the | ||
5890 | * winner. | ||
5891 | */ | ||
5892 | mutex_unlock(&inode->i_mutex); | ||
5893 | |||
5894 | ret = instance_delete(dentry->d_iname); | ||
5895 | |||
5896 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); | ||
5897 | mutex_lock(&dentry->d_inode->i_mutex); | ||
5898 | |||
5899 | return ret; | ||
5900 | } | ||
5901 | |||
5902 | static const struct inode_operations instance_dir_inode_operations = { | ||
5903 | .lookup = simple_lookup, | ||
5904 | .mkdir = instance_mkdir, | ||
5905 | .rmdir = instance_rmdir, | ||
5906 | }; | ||
5907 | |||
5908 | static __init void create_trace_instances(struct dentry *d_tracer) | ||
5909 | { | ||
5910 | trace_instance_dir = debugfs_create_dir("instances", d_tracer); | ||
5911 | if (WARN_ON(!trace_instance_dir)) | ||
5912 | return; | ||
5913 | |||
5914 | /* Hijack the dir inode operations, to allow mkdir */ | ||
5915 | trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; | ||
5916 | } | ||
5917 | |||
5918 | static void | ||
5919 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | ||
5920 | { | ||
5921 | int cpu; | ||
5922 | |||
5923 | trace_create_file("trace_options", 0644, d_tracer, | ||
5924 | tr, &tracing_iter_fops); | ||
5925 | |||
5926 | trace_create_file("trace", 0644, d_tracer, | ||
5927 | (void *)&tr->trace_cpu, &tracing_fops); | ||
5928 | |||
5929 | trace_create_file("trace_pipe", 0444, d_tracer, | ||
5930 | (void *)&tr->trace_cpu, &tracing_pipe_fops); | ||
5931 | |||
5932 | trace_create_file("buffer_size_kb", 0644, d_tracer, | ||
5933 | (void *)&tr->trace_cpu, &tracing_entries_fops); | ||
5934 | |||
5935 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | ||
5936 | tr, &tracing_total_entries_fops); | ||
5937 | |||
5938 | trace_create_file("free_buffer", 0644, d_tracer, | ||
5939 | tr, &tracing_free_buffer_fops); | ||
5940 | |||
5941 | trace_create_file("trace_marker", 0220, d_tracer, | ||
5942 | tr, &tracing_mark_fops); | ||
5943 | |||
5944 | trace_create_file("trace_clock", 0644, d_tracer, tr, | ||
5945 | &trace_clock_fops); | ||
5946 | |||
5947 | trace_create_file("tracing_on", 0644, d_tracer, | ||
5948 | tr, &rb_simple_fops); | ||
5949 | |||
5950 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
5951 | trace_create_file("snapshot", 0644, d_tracer, | ||
5952 | (void *)&tr->trace_cpu, &snapshot_fops); | ||
5953 | #endif | ||
5954 | |||
5955 | for_each_tracing_cpu(cpu) | ||
5956 | tracing_init_debugfs_percpu(tr, cpu); | ||
5957 | |||
5958 | } | ||
5959 | |||
4985 | static __init int tracer_init_debugfs(void) | 5960 | static __init int tracer_init_debugfs(void) |
4986 | { | 5961 | { |
4987 | struct dentry *d_tracer; | 5962 | struct dentry *d_tracer; |
4988 | int cpu; | ||
4989 | 5963 | ||
4990 | trace_access_lock_init(); | 5964 | trace_access_lock_init(); |
4991 | 5965 | ||
4992 | d_tracer = tracing_init_dentry(); | 5966 | d_tracer = tracing_init_dentry(); |
5967 | if (!d_tracer) | ||
5968 | return 0; | ||
4993 | 5969 | ||
4994 | trace_create_file("trace_options", 0644, d_tracer, | 5970 | init_tracer_debugfs(&global_trace, d_tracer); |
4995 | NULL, &tracing_iter_fops); | ||
4996 | 5971 | ||
4997 | trace_create_file("tracing_cpumask", 0644, d_tracer, | 5972 | trace_create_file("tracing_cpumask", 0644, d_tracer, |
4998 | NULL, &tracing_cpumask_fops); | 5973 | &global_trace, &tracing_cpumask_fops); |
4999 | |||
5000 | trace_create_file("trace", 0644, d_tracer, | ||
5001 | (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); | ||
5002 | 5974 | ||
5003 | trace_create_file("available_tracers", 0444, d_tracer, | 5975 | trace_create_file("available_tracers", 0444, d_tracer, |
5004 | &global_trace, &show_traces_fops); | 5976 | &global_trace, &show_traces_fops); |
@@ -5017,44 +5989,17 @@ static __init int tracer_init_debugfs(void) | |||
5017 | trace_create_file("README", 0444, d_tracer, | 5989 | trace_create_file("README", 0444, d_tracer, |
5018 | NULL, &tracing_readme_fops); | 5990 | NULL, &tracing_readme_fops); |
5019 | 5991 | ||
5020 | trace_create_file("trace_pipe", 0444, d_tracer, | ||
5021 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); | ||
5022 | |||
5023 | trace_create_file("buffer_size_kb", 0644, d_tracer, | ||
5024 | (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); | ||
5025 | |||
5026 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | ||
5027 | &global_trace, &tracing_total_entries_fops); | ||
5028 | |||
5029 | trace_create_file("free_buffer", 0644, d_tracer, | ||
5030 | &global_trace, &tracing_free_buffer_fops); | ||
5031 | |||
5032 | trace_create_file("trace_marker", 0220, d_tracer, | ||
5033 | NULL, &tracing_mark_fops); | ||
5034 | |||
5035 | trace_create_file("saved_cmdlines", 0444, d_tracer, | 5992 | trace_create_file("saved_cmdlines", 0444, d_tracer, |
5036 | NULL, &tracing_saved_cmdlines_fops); | 5993 | NULL, &tracing_saved_cmdlines_fops); |
5037 | 5994 | ||
5038 | trace_create_file("trace_clock", 0644, d_tracer, NULL, | ||
5039 | &trace_clock_fops); | ||
5040 | |||
5041 | trace_create_file("tracing_on", 0644, d_tracer, | ||
5042 | &global_trace, &rb_simple_fops); | ||
5043 | |||
5044 | #ifdef CONFIG_DYNAMIC_FTRACE | 5995 | #ifdef CONFIG_DYNAMIC_FTRACE |
5045 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 5996 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
5046 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 5997 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
5047 | #endif | 5998 | #endif |
5048 | 5999 | ||
5049 | #ifdef CONFIG_TRACER_SNAPSHOT | 6000 | create_trace_instances(d_tracer); |
5050 | trace_create_file("snapshot", 0644, d_tracer, | ||
5051 | (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); | ||
5052 | #endif | ||
5053 | |||
5054 | create_trace_options_dir(); | ||
5055 | 6001 | ||
5056 | for_each_tracing_cpu(cpu) | 6002 | create_trace_options_dir(&global_trace); |
5057 | tracing_init_debugfs_percpu(cpu); | ||
5058 | 6003 | ||
5059 | return 0; | 6004 | return 0; |
5060 | } | 6005 | } |
@@ -5110,8 +6055,8 @@ void | |||
5110 | trace_printk_seq(struct trace_seq *s) | 6055 | trace_printk_seq(struct trace_seq *s) |
5111 | { | 6056 | { |
5112 | /* Probably should print a warning here. */ | 6057 | /* Probably should print a warning here. */ |
5113 | if (s->len >= 1000) | 6058 | if (s->len >= TRACE_MAX_PRINT) |
5114 | s->len = 1000; | 6059 | s->len = TRACE_MAX_PRINT; |
5115 | 6060 | ||
5116 | /* should be zero ended, but we are paranoid. */ | 6061 | /* should be zero ended, but we are paranoid. */ |
5117 | s->buffer[s->len] = 0; | 6062 | s->buffer[s->len] = 0; |
@@ -5124,46 +6069,43 @@ trace_printk_seq(struct trace_seq *s) | |||
5124 | void trace_init_global_iter(struct trace_iterator *iter) | 6069 | void trace_init_global_iter(struct trace_iterator *iter) |
5125 | { | 6070 | { |
5126 | iter->tr = &global_trace; | 6071 | iter->tr = &global_trace; |
5127 | iter->trace = current_trace; | 6072 | iter->trace = iter->tr->current_trace; |
5128 | iter->cpu_file = TRACE_PIPE_ALL_CPU; | 6073 | iter->cpu_file = RING_BUFFER_ALL_CPUS; |
6074 | iter->trace_buffer = &global_trace.trace_buffer; | ||
5129 | } | 6075 | } |
5130 | 6076 | ||
5131 | static void | 6077 | void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) |
5132 | __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | ||
5133 | { | 6078 | { |
5134 | static arch_spinlock_t ftrace_dump_lock = | ||
5135 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
5136 | /* use static because iter can be a bit big for the stack */ | 6079 | /* use static because iter can be a bit big for the stack */ |
5137 | static struct trace_iterator iter; | 6080 | static struct trace_iterator iter; |
6081 | static atomic_t dump_running; | ||
5138 | unsigned int old_userobj; | 6082 | unsigned int old_userobj; |
5139 | static int dump_ran; | ||
5140 | unsigned long flags; | 6083 | unsigned long flags; |
5141 | int cnt = 0, cpu; | 6084 | int cnt = 0, cpu; |
5142 | 6085 | ||
5143 | /* only one dump */ | 6086 | /* Only allow one dump user at a time. */ |
5144 | local_irq_save(flags); | 6087 | if (atomic_inc_return(&dump_running) != 1) { |
5145 | arch_spin_lock(&ftrace_dump_lock); | 6088 | atomic_dec(&dump_running); |
5146 | if (dump_ran) | 6089 | return; |
5147 | goto out; | 6090 | } |
5148 | |||
5149 | dump_ran = 1; | ||
5150 | 6091 | ||
6092 | /* | ||
6093 | * Always turn off tracing when we dump. | ||
6094 | * We don't need to show trace output of what happens | ||
6095 | * between multiple crashes. | ||
6096 | * | ||
6097 | * If the user does a sysrq-z, then they can re-enable | ||
6098 | * tracing with echo 1 > tracing_on. | ||
6099 | */ | ||
5151 | tracing_off(); | 6100 | tracing_off(); |
5152 | 6101 | ||
5153 | /* Did function tracer already get disabled? */ | 6102 | local_irq_save(flags); |
5154 | if (ftrace_is_dead()) { | ||
5155 | printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | ||
5156 | printk("# MAY BE MISSING FUNCTION EVENTS\n"); | ||
5157 | } | ||
5158 | |||
5159 | if (disable_tracing) | ||
5160 | ftrace_kill(); | ||
5161 | 6103 | ||
5162 | /* Simulate the iterator */ | 6104 | /* Simulate the iterator */ |
5163 | trace_init_global_iter(&iter); | 6105 | trace_init_global_iter(&iter); |
5164 | 6106 | ||
5165 | for_each_tracing_cpu(cpu) { | 6107 | for_each_tracing_cpu(cpu) { |
5166 | atomic_inc(&iter.tr->data[cpu]->disabled); | 6108 | atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); |
5167 | } | 6109 | } |
5168 | 6110 | ||
5169 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; | 6111 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; |
@@ -5173,7 +6115,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
5173 | 6115 | ||
5174 | switch (oops_dump_mode) { | 6116 | switch (oops_dump_mode) { |
5175 | case DUMP_ALL: | 6117 | case DUMP_ALL: |
5176 | iter.cpu_file = TRACE_PIPE_ALL_CPU; | 6118 | iter.cpu_file = RING_BUFFER_ALL_CPUS; |
5177 | break; | 6119 | break; |
5178 | case DUMP_ORIG: | 6120 | case DUMP_ORIG: |
5179 | iter.cpu_file = raw_smp_processor_id(); | 6121 | iter.cpu_file = raw_smp_processor_id(); |
@@ -5182,11 +6124,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
5182 | goto out_enable; | 6124 | goto out_enable; |
5183 | default: | 6125 | default: |
5184 | printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); | 6126 | printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); |
5185 | iter.cpu_file = TRACE_PIPE_ALL_CPU; | 6127 | iter.cpu_file = RING_BUFFER_ALL_CPUS; |
5186 | } | 6128 | } |
5187 | 6129 | ||
5188 | printk(KERN_TRACE "Dumping ftrace buffer:\n"); | 6130 | printk(KERN_TRACE "Dumping ftrace buffer:\n"); |
5189 | 6131 | ||
6132 | /* Did function tracer already get disabled? */ | ||
6133 | if (ftrace_is_dead()) { | ||
6134 | printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | ||
6135 | printk("# MAY BE MISSING FUNCTION EVENTS\n"); | ||
6136 | } | ||
6137 | |||
5190 | /* | 6138 | /* |
5191 | * We need to stop all tracing on all CPUS to read the | 6139 | * We need to stop all tracing on all CPUS to read the |
5192 | * next buffer. This is a bit expensive, but is | 6140 | * next buffer. This is a bit expensive, but is |
@@ -5226,33 +6174,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
5226 | printk(KERN_TRACE "---------------------------------\n"); | 6174 | printk(KERN_TRACE "---------------------------------\n"); |
5227 | 6175 | ||
5228 | out_enable: | 6176 | out_enable: |
5229 | /* Re-enable tracing if requested */ | 6177 | trace_flags |= old_userobj; |
5230 | if (!disable_tracing) { | ||
5231 | trace_flags |= old_userobj; | ||
5232 | 6178 | ||
5233 | for_each_tracing_cpu(cpu) { | 6179 | for_each_tracing_cpu(cpu) { |
5234 | atomic_dec(&iter.tr->data[cpu]->disabled); | 6180 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
5235 | } | ||
5236 | tracing_on(); | ||
5237 | } | 6181 | } |
5238 | 6182 | atomic_dec(&dump_running); | |
5239 | out: | ||
5240 | arch_spin_unlock(&ftrace_dump_lock); | ||
5241 | local_irq_restore(flags); | 6183 | local_irq_restore(flags); |
5242 | } | 6184 | } |
5243 | |||
5244 | /* By default: disable tracing after the dump */ | ||
5245 | void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | ||
5246 | { | ||
5247 | __ftrace_dump(true, oops_dump_mode); | ||
5248 | } | ||
5249 | EXPORT_SYMBOL_GPL(ftrace_dump); | 6185 | EXPORT_SYMBOL_GPL(ftrace_dump); |
5250 | 6186 | ||
5251 | __init static int tracer_alloc_buffers(void) | 6187 | __init static int tracer_alloc_buffers(void) |
5252 | { | 6188 | { |
5253 | int ring_buf_size; | 6189 | int ring_buf_size; |
5254 | enum ring_buffer_flags rb_flags; | ||
5255 | int i; | ||
5256 | int ret = -ENOMEM; | 6190 | int ret = -ENOMEM; |
5257 | 6191 | ||
5258 | 6192 | ||
@@ -5273,46 +6207,29 @@ __init static int tracer_alloc_buffers(void) | |||
5273 | else | 6207 | else |
5274 | ring_buf_size = 1; | 6208 | ring_buf_size = 1; |
5275 | 6209 | ||
5276 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; | ||
5277 | |||
5278 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); | 6210 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); |
5279 | cpumask_copy(tracing_cpumask, cpu_all_mask); | 6211 | cpumask_copy(tracing_cpumask, cpu_all_mask); |
5280 | 6212 | ||
6213 | raw_spin_lock_init(&global_trace.start_lock); | ||
6214 | |||
5281 | /* TODO: make the number of buffers hot pluggable with CPUS */ | 6215 | /* TODO: make the number of buffers hot pluggable with CPUS */ |
5282 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); | 6216 | if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { |
5283 | if (!global_trace.buffer) { | ||
5284 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); | 6217 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); |
5285 | WARN_ON(1); | 6218 | WARN_ON(1); |
5286 | goto out_free_cpumask; | 6219 | goto out_free_cpumask; |
5287 | } | 6220 | } |
6221 | |||
5288 | if (global_trace.buffer_disabled) | 6222 | if (global_trace.buffer_disabled) |
5289 | tracing_off(); | 6223 | tracing_off(); |
5290 | 6224 | ||
5291 | |||
5292 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5293 | max_tr.buffer = ring_buffer_alloc(1, rb_flags); | ||
5294 | if (!max_tr.buffer) { | ||
5295 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | ||
5296 | WARN_ON(1); | ||
5297 | ring_buffer_free(global_trace.buffer); | ||
5298 | goto out_free_cpumask; | ||
5299 | } | ||
5300 | #endif | ||
5301 | |||
5302 | /* Allocate the first page for all buffers */ | ||
5303 | for_each_tracing_cpu(i) { | ||
5304 | global_trace.data[i] = &per_cpu(global_trace_cpu, i); | ||
5305 | max_tr.data[i] = &per_cpu(max_tr_data, i); | ||
5306 | } | ||
5307 | |||
5308 | set_buffer_entries(&global_trace, | ||
5309 | ring_buffer_size(global_trace.buffer, 0)); | ||
5310 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5311 | set_buffer_entries(&max_tr, 1); | ||
5312 | #endif | ||
5313 | |||
5314 | trace_init_cmdlines(); | 6225 | trace_init_cmdlines(); |
5315 | init_irq_work(&trace_work_wakeup, trace_wake_up); | 6226 | |
6227 | /* | ||
6228 | * register_tracer() might reference current_trace, so it | ||
6229 | * needs to be set before we register anything. This is | ||
6230 | * just a bootstrap of current_trace anyway. | ||
6231 | */ | ||
6232 | global_trace.current_trace = &nop_trace; | ||
5316 | 6233 | ||
5317 | register_tracer(&nop_trace); | 6234 | register_tracer(&nop_trace); |
5318 | 6235 | ||
@@ -5324,16 +6241,32 @@ __init static int tracer_alloc_buffers(void) | |||
5324 | 6241 | ||
5325 | register_die_notifier(&trace_die_notifier); | 6242 | register_die_notifier(&trace_die_notifier); |
5326 | 6243 | ||
6244 | global_trace.flags = TRACE_ARRAY_FL_GLOBAL; | ||
6245 | |||
6246 | /* Holder for file callbacks */ | ||
6247 | global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; | ||
6248 | global_trace.trace_cpu.tr = &global_trace; | ||
6249 | |||
6250 | INIT_LIST_HEAD(&global_trace.systems); | ||
6251 | INIT_LIST_HEAD(&global_trace.events); | ||
6252 | list_add(&global_trace.list, &ftrace_trace_arrays); | ||
6253 | |||
5327 | while (trace_boot_options) { | 6254 | while (trace_boot_options) { |
5328 | char *option; | 6255 | char *option; |
5329 | 6256 | ||
5330 | option = strsep(&trace_boot_options, ","); | 6257 | option = strsep(&trace_boot_options, ","); |
5331 | trace_set_options(option); | 6258 | trace_set_options(&global_trace, option); |
5332 | } | 6259 | } |
5333 | 6260 | ||
6261 | register_snapshot_cmd(); | ||
6262 | |||
5334 | return 0; | 6263 | return 0; |
5335 | 6264 | ||
5336 | out_free_cpumask: | 6265 | out_free_cpumask: |
6266 | free_percpu(global_trace.trace_buffer.data); | ||
6267 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
6268 | free_percpu(global_trace.max_buffer.data); | ||
6269 | #endif | ||
5337 | free_cpumask_var(tracing_cpumask); | 6270 | free_cpumask_var(tracing_cpumask); |
5338 | out_free_buffer_mask: | 6271 | out_free_buffer_mask: |
5339 | free_cpumask_var(tracing_buffer_mask); | 6272 | free_cpumask_var(tracing_buffer_mask); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d56..711ca7d3e7f1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -13,6 +13,11 @@ | |||
13 | #include <linux/trace_seq.h> | 13 | #include <linux/trace_seq.h> |
14 | #include <linux/ftrace_event.h> | 14 | #include <linux/ftrace_event.h> |
15 | 15 | ||
16 | #ifdef CONFIG_FTRACE_SYSCALLS | ||
17 | #include <asm/unistd.h> /* For NR_SYSCALLS */ | ||
18 | #include <asm/syscall.h> /* some archs define it here */ | ||
19 | #endif | ||
20 | |||
16 | enum trace_type { | 21 | enum trace_type { |
17 | __TRACE_FIRST_TYPE = 0, | 22 | __TRACE_FIRST_TYPE = 0, |
18 | 23 | ||
@@ -29,6 +34,7 @@ enum trace_type { | |||
29 | TRACE_GRAPH_ENT, | 34 | TRACE_GRAPH_ENT, |
30 | TRACE_USER_STACK, | 35 | TRACE_USER_STACK, |
31 | TRACE_BLK, | 36 | TRACE_BLK, |
37 | TRACE_BPUTS, | ||
32 | 38 | ||
33 | __TRACE_LAST_TYPE, | 39 | __TRACE_LAST_TYPE, |
34 | }; | 40 | }; |
@@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head { | |||
103 | unsigned long ret_ip; | 109 | unsigned long ret_ip; |
104 | }; | 110 | }; |
105 | 111 | ||
106 | struct uprobe_trace_entry_head { | ||
107 | struct trace_entry ent; | ||
108 | unsigned long ip; | ||
109 | }; | ||
110 | |||
111 | /* | 112 | /* |
112 | * trace_flag_type is an enumeration that holds different | 113 | * trace_flag_type is an enumeration that holds different |
113 | * states when a trace occurs. These are: | 114 | * states when a trace occurs. These are: |
@@ -127,12 +128,21 @@ enum trace_flag_type { | |||
127 | 128 | ||
128 | #define TRACE_BUF_SIZE 1024 | 129 | #define TRACE_BUF_SIZE 1024 |
129 | 130 | ||
131 | struct trace_array; | ||
132 | |||
133 | struct trace_cpu { | ||
134 | struct trace_array *tr; | ||
135 | struct dentry *dir; | ||
136 | int cpu; | ||
137 | }; | ||
138 | |||
130 | /* | 139 | /* |
131 | * The CPU trace array - it consists of thousands of trace entries | 140 | * The CPU trace array - it consists of thousands of trace entries |
132 | * plus some other descriptor data: (for example which task started | 141 | * plus some other descriptor data: (for example which task started |
133 | * the trace, etc.) | 142 | * the trace, etc.) |
134 | */ | 143 | */ |
135 | struct trace_array_cpu { | 144 | struct trace_array_cpu { |
145 | struct trace_cpu trace_cpu; | ||
136 | atomic_t disabled; | 146 | atomic_t disabled; |
137 | void *buffer_page; /* ring buffer spare */ | 147 | void *buffer_page; /* ring buffer spare */ |
138 | 148 | ||
@@ -151,20 +161,83 @@ struct trace_array_cpu { | |||
151 | char comm[TASK_COMM_LEN]; | 161 | char comm[TASK_COMM_LEN]; |
152 | }; | 162 | }; |
153 | 163 | ||
164 | struct tracer; | ||
165 | |||
166 | struct trace_buffer { | ||
167 | struct trace_array *tr; | ||
168 | struct ring_buffer *buffer; | ||
169 | struct trace_array_cpu __percpu *data; | ||
170 | cycle_t time_start; | ||
171 | int cpu; | ||
172 | }; | ||
173 | |||
154 | /* | 174 | /* |
155 | * The trace array - an array of per-CPU trace arrays. This is the | 175 | * The trace array - an array of per-CPU trace arrays. This is the |
156 | * highest level data structure that individual tracers deal with. | 176 | * highest level data structure that individual tracers deal with. |
157 | * They have on/off state as well: | 177 | * They have on/off state as well: |
158 | */ | 178 | */ |
159 | struct trace_array { | 179 | struct trace_array { |
160 | struct ring_buffer *buffer; | 180 | struct list_head list; |
161 | int cpu; | 181 | char *name; |
182 | struct trace_buffer trace_buffer; | ||
183 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
184 | /* | ||
185 | * The max_buffer is used to snapshot the trace when a maximum | ||
186 | * latency is reached, or when the user initiates a snapshot. | ||
187 | * Some tracers will use this to store a maximum trace while | ||
188 | * it continues examining live traces. | ||
189 | * | ||
190 | * The buffers for the max_buffer are set up the same as the trace_buffer | ||
191 | * When a snapshot is taken, the buffer of the max_buffer is swapped | ||
192 | * with the buffer of the trace_buffer and the buffers are reset for | ||
193 | * the trace_buffer so the tracing can continue. | ||
194 | */ | ||
195 | struct trace_buffer max_buffer; | ||
196 | bool allocated_snapshot; | ||
197 | #endif | ||
162 | int buffer_disabled; | 198 | int buffer_disabled; |
163 | cycle_t time_start; | 199 | struct trace_cpu trace_cpu; /* place holder */ |
200 | #ifdef CONFIG_FTRACE_SYSCALLS | ||
201 | int sys_refcount_enter; | ||
202 | int sys_refcount_exit; | ||
203 | DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); | ||
204 | DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); | ||
205 | #endif | ||
206 | int stop_count; | ||
207 | int clock_id; | ||
208 | struct tracer *current_trace; | ||
209 | unsigned int flags; | ||
210 | raw_spinlock_t start_lock; | ||
211 | struct dentry *dir; | ||
212 | struct dentry *options; | ||
213 | struct dentry *percpu_dir; | ||
214 | struct dentry *event_dir; | ||
215 | struct list_head systems; | ||
216 | struct list_head events; | ||
164 | struct task_struct *waiter; | 217 | struct task_struct *waiter; |
165 | struct trace_array_cpu *data[NR_CPUS]; | 218 | int ref; |
219 | }; | ||
220 | |||
221 | enum { | ||
222 | TRACE_ARRAY_FL_GLOBAL = (1 << 0) | ||
166 | }; | 223 | }; |
167 | 224 | ||
225 | extern struct list_head ftrace_trace_arrays; | ||
226 | |||
227 | /* | ||
228 | * The global tracer (top) should be the first trace array added, | ||
229 | * but we check the flag anyway. | ||
230 | */ | ||
231 | static inline struct trace_array *top_trace_array(void) | ||
232 | { | ||
233 | struct trace_array *tr; | ||
234 | |||
235 | tr = list_entry(ftrace_trace_arrays.prev, | ||
236 | typeof(*tr), list); | ||
237 | WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); | ||
238 | return tr; | ||
239 | } | ||
240 | |||
168 | #define FTRACE_CMP_TYPE(var, type) \ | 241 | #define FTRACE_CMP_TYPE(var, type) \ |
169 | __builtin_types_compatible_p(typeof(var), type *) | 242 | __builtin_types_compatible_p(typeof(var), type *) |
170 | 243 | ||
@@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void); | |||
200 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ | 273 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ |
201 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 274 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
202 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 275 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
276 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ | ||
203 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 277 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
204 | TRACE_MMIO_RW); \ | 278 | TRACE_MMIO_RW); \ |
205 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 279 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
@@ -283,11 +357,16 @@ struct tracer { | |||
283 | enum print_line_t (*print_line)(struct trace_iterator *iter); | 357 | enum print_line_t (*print_line)(struct trace_iterator *iter); |
284 | /* If you handled the flag setting, return 0 */ | 358 | /* If you handled the flag setting, return 0 */ |
285 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 359 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
360 | /* Return 0 if OK with change, else return non-zero */ | ||
361 | int (*flag_changed)(struct tracer *tracer, | ||
362 | u32 mask, int set); | ||
286 | struct tracer *next; | 363 | struct tracer *next; |
287 | struct tracer_flags *flags; | 364 | struct tracer_flags *flags; |
288 | bool print_max; | 365 | bool print_max; |
366 | bool enabled; | ||
367 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
289 | bool use_max_tr; | 368 | bool use_max_tr; |
290 | bool allocated_snapshot; | 369 | #endif |
291 | }; | 370 | }; |
292 | 371 | ||
293 | 372 | ||
@@ -423,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit) | |||
423 | current->trace_recursion = val; | 502 | current->trace_recursion = val; |
424 | } | 503 | } |
425 | 504 | ||
426 | #define TRACE_PIPE_ALL_CPU -1 | ||
427 | |||
428 | static inline struct ring_buffer_iter * | 505 | static inline struct ring_buffer_iter * |
429 | trace_buffer_iter(struct trace_iterator *iter, int cpu) | 506 | trace_buffer_iter(struct trace_iterator *iter, int cpu) |
430 | { | 507 | { |
@@ -435,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) | |||
435 | 512 | ||
436 | int tracer_init(struct tracer *t, struct trace_array *tr); | 513 | int tracer_init(struct tracer *t, struct trace_array *tr); |
437 | int tracing_is_enabled(void); | 514 | int tracing_is_enabled(void); |
438 | void tracing_reset(struct trace_array *tr, int cpu); | 515 | void tracing_reset(struct trace_buffer *buf, int cpu); |
439 | void tracing_reset_online_cpus(struct trace_array *tr); | 516 | void tracing_reset_online_cpus(struct trace_buffer *buf); |
440 | void tracing_reset_current(int cpu); | 517 | void tracing_reset_current(int cpu); |
441 | void tracing_reset_current_online_cpus(void); | 518 | void tracing_reset_all_online_cpus(void); |
442 | int tracing_open_generic(struct inode *inode, struct file *filp); | 519 | int tracing_open_generic(struct inode *inode, struct file *filp); |
443 | struct dentry *trace_create_file(const char *name, | 520 | struct dentry *trace_create_file(const char *name, |
444 | umode_t mode, | 521 | umode_t mode, |
@@ -446,6 +523,7 @@ struct dentry *trace_create_file(const char *name, | |||
446 | void *data, | 523 | void *data, |
447 | const struct file_operations *fops); | 524 | const struct file_operations *fops); |
448 | 525 | ||
526 | struct dentry *tracing_init_dentry_tr(struct trace_array *tr); | ||
449 | struct dentry *tracing_init_dentry(void); | 527 | struct dentry *tracing_init_dentry(void); |
450 | 528 | ||
451 | struct ring_buffer_event; | 529 | struct ring_buffer_event; |
@@ -579,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void); | |||
579 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 | 657 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 |
580 | extern int DYN_FTRACE_TEST_NAME2(void); | 658 | extern int DYN_FTRACE_TEST_NAME2(void); |
581 | 659 | ||
582 | extern int ring_buffer_expanded; | 660 | extern bool ring_buffer_expanded; |
583 | extern bool tracing_selftest_disabled; | 661 | extern bool tracing_selftest_disabled; |
584 | DECLARE_PER_CPU(int, ftrace_cpu_disabled); | 662 | DECLARE_PER_CPU(int, ftrace_cpu_disabled); |
585 | 663 | ||
@@ -615,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr, | |||
615 | unsigned long ip, const char *fmt, va_list args); | 693 | unsigned long ip, const char *fmt, va_list args); |
616 | int trace_array_printk(struct trace_array *tr, | 694 | int trace_array_printk(struct trace_array *tr, |
617 | unsigned long ip, const char *fmt, ...); | 695 | unsigned long ip, const char *fmt, ...); |
696 | int trace_array_printk_buf(struct ring_buffer *buffer, | ||
697 | unsigned long ip, const char *fmt, ...); | ||
618 | void trace_printk_seq(struct trace_seq *s); | 698 | void trace_printk_seq(struct trace_seq *s); |
619 | enum print_line_t print_trace_line(struct trace_iterator *iter); | 699 | enum print_line_t print_trace_line(struct trace_iterator *iter); |
620 | 700 | ||
@@ -782,6 +862,7 @@ enum trace_iterator_flags { | |||
782 | TRACE_ITER_STOP_ON_FREE = 0x400000, | 862 | TRACE_ITER_STOP_ON_FREE = 0x400000, |
783 | TRACE_ITER_IRQ_INFO = 0x800000, | 863 | TRACE_ITER_IRQ_INFO = 0x800000, |
784 | TRACE_ITER_MARKERS = 0x1000000, | 864 | TRACE_ITER_MARKERS = 0x1000000, |
865 | TRACE_ITER_FUNCTION = 0x2000000, | ||
785 | }; | 866 | }; |
786 | 867 | ||
787 | /* | 868 | /* |
@@ -828,8 +909,8 @@ enum { | |||
828 | 909 | ||
829 | struct ftrace_event_field { | 910 | struct ftrace_event_field { |
830 | struct list_head link; | 911 | struct list_head link; |
831 | char *name; | 912 | const char *name; |
832 | char *type; | 913 | const char *type; |
833 | int filter_type; | 914 | int filter_type; |
834 | int offset; | 915 | int offset; |
835 | int size; | 916 | int size; |
@@ -847,12 +928,19 @@ struct event_filter { | |||
847 | struct event_subsystem { | 928 | struct event_subsystem { |
848 | struct list_head list; | 929 | struct list_head list; |
849 | const char *name; | 930 | const char *name; |
850 | struct dentry *entry; | ||
851 | struct event_filter *filter; | 931 | struct event_filter *filter; |
852 | int nr_events; | ||
853 | int ref_count; | 932 | int ref_count; |
854 | }; | 933 | }; |
855 | 934 | ||
935 | struct ftrace_subsystem_dir { | ||
936 | struct list_head list; | ||
937 | struct event_subsystem *subsystem; | ||
938 | struct trace_array *tr; | ||
939 | struct dentry *entry; | ||
940 | int ref_count; | ||
941 | int nr_events; | ||
942 | }; | ||
943 | |||
856 | #define FILTER_PRED_INVALID ((unsigned short)-1) | 944 | #define FILTER_PRED_INVALID ((unsigned short)-1) |
857 | #define FILTER_PRED_IS_RIGHT (1 << 15) | 945 | #define FILTER_PRED_IS_RIGHT (1 << 15) |
858 | #define FILTER_PRED_FOLD (1 << 15) | 946 | #define FILTER_PRED_FOLD (1 << 15) |
@@ -902,22 +990,20 @@ struct filter_pred { | |||
902 | unsigned short right; | 990 | unsigned short right; |
903 | }; | 991 | }; |
904 | 992 | ||
905 | extern struct list_head ftrace_common_fields; | ||
906 | |||
907 | extern enum regex_type | 993 | extern enum regex_type |
908 | filter_parse_regex(char *buff, int len, char **search, int *not); | 994 | filter_parse_regex(char *buff, int len, char **search, int *not); |
909 | extern void print_event_filter(struct ftrace_event_call *call, | 995 | extern void print_event_filter(struct ftrace_event_call *call, |
910 | struct trace_seq *s); | 996 | struct trace_seq *s); |
911 | extern int apply_event_filter(struct ftrace_event_call *call, | 997 | extern int apply_event_filter(struct ftrace_event_call *call, |
912 | char *filter_string); | 998 | char *filter_string); |
913 | extern int apply_subsystem_event_filter(struct event_subsystem *system, | 999 | extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, |
914 | char *filter_string); | 1000 | char *filter_string); |
915 | extern void print_subsystem_event_filter(struct event_subsystem *system, | 1001 | extern void print_subsystem_event_filter(struct event_subsystem *system, |
916 | struct trace_seq *s); | 1002 | struct trace_seq *s); |
917 | extern int filter_assign_type(const char *type); | 1003 | extern int filter_assign_type(const char *type); |
918 | 1004 | ||
919 | struct list_head * | 1005 | struct ftrace_event_field * |
920 | trace_get_fields(struct ftrace_event_call *event_call); | 1006 | trace_find_event_field(struct ftrace_event_call *call, char *name); |
921 | 1007 | ||
922 | static inline int | 1008 | static inline int |
923 | filter_check_discard(struct ftrace_event_call *call, void *rec, | 1009 | filter_check_discard(struct ftrace_event_call *call, void *rec, |
@@ -934,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, | |||
934 | } | 1020 | } |
935 | 1021 | ||
936 | extern void trace_event_enable_cmd_record(bool enable); | 1022 | extern void trace_event_enable_cmd_record(bool enable); |
1023 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); | ||
1024 | extern int event_trace_del_tracer(struct trace_array *tr); | ||
937 | 1025 | ||
938 | extern struct mutex event_mutex; | 1026 | extern struct mutex event_mutex; |
939 | extern struct list_head ftrace_events; | 1027 | extern struct list_head ftrace_events; |
@@ -943,6 +1031,19 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
943 | 1031 | ||
944 | void trace_printk_init_buffers(void); | 1032 | void trace_printk_init_buffers(void); |
945 | void trace_printk_start_comm(void); | 1033 | void trace_printk_start_comm(void); |
1034 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); | ||
1035 | int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); | ||
1036 | |||
1037 | /* | ||
1038 | * Normal trace_printk() and friends allocate special buffers | ||
1039 | * to do the manipulation, as well as save the print formats | ||
1040 | * into sections to display. But the trace infrastructure wants | ||
1041 | * to use these without the added overhead at the price of being | ||
1042 | * a bit slower (used mainly for warnings, where we don't care | ||
1043 | * about performance). The internal_trace_puts() is for such | ||
1044 | * a purpose. | ||
1045 | */ | ||
1046 | #define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) | ||
946 | 1047 | ||
947 | #undef FTRACE_ENTRY | 1048 | #undef FTRACE_ENTRY |
948 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 1049 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 95e96842ed29..d594da0dc03c 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
32 | { | 32 | { |
33 | struct ftrace_event_call *call = &event_branch; | 33 | struct ftrace_event_call *call = &event_branch; |
34 | struct trace_array *tr = branch_tracer; | 34 | struct trace_array *tr = branch_tracer; |
35 | struct trace_array_cpu *data; | ||
35 | struct ring_buffer_event *event; | 36 | struct ring_buffer_event *event; |
36 | struct trace_branch *entry; | 37 | struct trace_branch *entry; |
37 | struct ring_buffer *buffer; | 38 | struct ring_buffer *buffer; |
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
51 | 52 | ||
52 | local_irq_save(flags); | 53 | local_irq_save(flags); |
53 | cpu = raw_smp_processor_id(); | 54 | cpu = raw_smp_processor_id(); |
54 | if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) | 55 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
56 | if (atomic_inc_return(&data->disabled) != 1) | ||
55 | goto out; | 57 | goto out; |
56 | 58 | ||
57 | pc = preempt_count(); | 59 | pc = preempt_count(); |
58 | buffer = tr->buffer; | 60 | buffer = tr->trace_buffer.buffer; |
59 | event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, | 61 | event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, |
60 | sizeof(*entry), flags, pc); | 62 | sizeof(*entry), flags, pc); |
61 | if (!event) | 63 | if (!event) |
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
80 | __buffer_unlock_commit(buffer, event); | 82 | __buffer_unlock_commit(buffer, event); |
81 | 83 | ||
82 | out: | 84 | out: |
83 | atomic_dec(&tr->data[cpu]->disabled); | 85 | atomic_dec(&data->disabled); |
84 | local_irq_restore(flags); | 86 | local_irq_restore(flags); |
85 | } | 87 | } |
86 | 88 | ||
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae6..26dc348332b7 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void) | |||
57 | return local_clock(); | 57 | return local_clock(); |
58 | } | 58 | } |
59 | 59 | ||
60 | /* | ||
61 | * trace_clock_jiffies(): Simply use jiffies as a clock counter. | ||
62 | */ | ||
63 | u64 notrace trace_clock_jiffies(void) | ||
64 | { | ||
65 | u64 jiffy = jiffies - INITIAL_JIFFIES; | ||
66 | |||
67 | /* Return nsecs */ | ||
68 | return (u64)jiffies_to_usecs(jiffy) * 1000ULL; | ||
69 | } | ||
60 | 70 | ||
61 | /* | 71 | /* |
62 | * trace_clock_global(): special globally coherent trace clock | 72 | * trace_clock_global(): special globally coherent trace clock |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca2..e2d027ac66a2 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry, | |||
223 | __dynamic_array( u32, buf ) | 223 | __dynamic_array( u32, buf ) |
224 | ), | 224 | ), |
225 | 225 | ||
226 | F_printk("%08lx fmt:%p", | 226 | F_printk("%pf: %s", |
227 | __entry->ip, __entry->fmt), | 227 | (void *)__entry->ip, __entry->fmt), |
228 | 228 | ||
229 | FILTER_OTHER | 229 | FILTER_OTHER |
230 | ); | 230 | ); |
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry, | |||
238 | __dynamic_array( char, buf ) | 238 | __dynamic_array( char, buf ) |
239 | ), | 239 | ), |
240 | 240 | ||
241 | F_printk("%08lx %s", | 241 | F_printk("%pf: %s", |
242 | __entry->ip, __entry->buf), | 242 | (void *)__entry->ip, __entry->buf), |
243 | |||
244 | FILTER_OTHER | ||
245 | ); | ||
246 | |||
247 | FTRACE_ENTRY(bputs, bputs_entry, | ||
248 | |||
249 | TRACE_BPUTS, | ||
250 | |||
251 | F_STRUCT( | ||
252 | __field( unsigned long, ip ) | ||
253 | __field( const char *, str ) | ||
254 | ), | ||
255 | |||
256 | F_printk("%pf: %s", | ||
257 | (void *)__entry->ip, __entry->str), | ||
243 | 258 | ||
244 | FILTER_OTHER | 259 | FILTER_OTHER |
245 | ); | 260 | ); |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250c..27963e2bf4bf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE]; | |||
34 | EXPORT_SYMBOL_GPL(event_storage); | 34 | EXPORT_SYMBOL_GPL(event_storage); |
35 | 35 | ||
36 | LIST_HEAD(ftrace_events); | 36 | LIST_HEAD(ftrace_events); |
37 | LIST_HEAD(ftrace_common_fields); | 37 | static LIST_HEAD(ftrace_common_fields); |
38 | 38 | ||
39 | struct list_head * | 39 | #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) |
40 | |||
41 | static struct kmem_cache *field_cachep; | ||
42 | static struct kmem_cache *file_cachep; | ||
43 | |||
44 | /* Double loops, do not use break, only goto's work */ | ||
45 | #define do_for_each_event_file(tr, file) \ | ||
46 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ | ||
47 | list_for_each_entry(file, &tr->events, list) | ||
48 | |||
49 | #define do_for_each_event_file_safe(tr, file) \ | ||
50 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ | ||
51 | struct ftrace_event_file *___n; \ | ||
52 | list_for_each_entry_safe(file, ___n, &tr->events, list) | ||
53 | |||
54 | #define while_for_each_event_file() \ | ||
55 | } | ||
56 | |||
57 | static struct list_head * | ||
40 | trace_get_fields(struct ftrace_event_call *event_call) | 58 | trace_get_fields(struct ftrace_event_call *event_call) |
41 | { | 59 | { |
42 | if (!event_call->class->get_fields) | 60 | if (!event_call->class->get_fields) |
@@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call) | |||
44 | return event_call->class->get_fields(event_call); | 62 | return event_call->class->get_fields(event_call); |
45 | } | 63 | } |
46 | 64 | ||
65 | static struct ftrace_event_field * | ||
66 | __find_event_field(struct list_head *head, char *name) | ||
67 | { | ||
68 | struct ftrace_event_field *field; | ||
69 | |||
70 | list_for_each_entry(field, head, link) { | ||
71 | if (!strcmp(field->name, name)) | ||
72 | return field; | ||
73 | } | ||
74 | |||
75 | return NULL; | ||
76 | } | ||
77 | |||
78 | struct ftrace_event_field * | ||
79 | trace_find_event_field(struct ftrace_event_call *call, char *name) | ||
80 | { | ||
81 | struct ftrace_event_field *field; | ||
82 | struct list_head *head; | ||
83 | |||
84 | field = __find_event_field(&ftrace_common_fields, name); | ||
85 | if (field) | ||
86 | return field; | ||
87 | |||
88 | head = trace_get_fields(call); | ||
89 | return __find_event_field(head, name); | ||
90 | } | ||
91 | |||
47 | static int __trace_define_field(struct list_head *head, const char *type, | 92 | static int __trace_define_field(struct list_head *head, const char *type, |
48 | const char *name, int offset, int size, | 93 | const char *name, int offset, int size, |
49 | int is_signed, int filter_type) | 94 | int is_signed, int filter_type) |
50 | { | 95 | { |
51 | struct ftrace_event_field *field; | 96 | struct ftrace_event_field *field; |
52 | 97 | ||
53 | field = kzalloc(sizeof(*field), GFP_KERNEL); | 98 | field = kmem_cache_alloc(field_cachep, GFP_TRACE); |
54 | if (!field) | 99 | if (!field) |
55 | goto err; | 100 | goto err; |
56 | 101 | ||
57 | field->name = kstrdup(name, GFP_KERNEL); | 102 | field->name = name; |
58 | if (!field->name) | 103 | field->type = type; |
59 | goto err; | ||
60 | |||
61 | field->type = kstrdup(type, GFP_KERNEL); | ||
62 | if (!field->type) | ||
63 | goto err; | ||
64 | 104 | ||
65 | if (filter_type == FILTER_OTHER) | 105 | if (filter_type == FILTER_OTHER) |
66 | field->filter_type = filter_assign_type(type); | 106 | field->filter_type = filter_assign_type(type); |
@@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type, | |||
76 | return 0; | 116 | return 0; |
77 | 117 | ||
78 | err: | 118 | err: |
79 | if (field) | 119 | kmem_cache_free(field_cachep, field); |
80 | kfree(field->name); | ||
81 | kfree(field); | ||
82 | 120 | ||
83 | return -ENOMEM; | 121 | return -ENOMEM; |
84 | } | 122 | } |
@@ -120,7 +158,7 @@ static int trace_define_common_fields(void) | |||
120 | return ret; | 158 | return ret; |
121 | } | 159 | } |
122 | 160 | ||
123 | void trace_destroy_fields(struct ftrace_event_call *call) | 161 | static void trace_destroy_fields(struct ftrace_event_call *call) |
124 | { | 162 | { |
125 | struct ftrace_event_field *field, *next; | 163 | struct ftrace_event_field *field, *next; |
126 | struct list_head *head; | 164 | struct list_head *head; |
@@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call) | |||
128 | head = trace_get_fields(call); | 166 | head = trace_get_fields(call); |
129 | list_for_each_entry_safe(field, next, head, link) { | 167 | list_for_each_entry_safe(field, next, head, link) { |
130 | list_del(&field->link); | 168 | list_del(&field->link); |
131 | kfree(field->type); | 169 | kmem_cache_free(field_cachep, field); |
132 | kfree(field->name); | ||
133 | kfree(field); | ||
134 | } | 170 | } |
135 | } | 171 | } |
136 | 172 | ||
@@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); | |||
149 | int ftrace_event_reg(struct ftrace_event_call *call, | 185 | int ftrace_event_reg(struct ftrace_event_call *call, |
150 | enum trace_reg type, void *data) | 186 | enum trace_reg type, void *data) |
151 | { | 187 | { |
188 | struct ftrace_event_file *file = data; | ||
189 | |||
152 | switch (type) { | 190 | switch (type) { |
153 | case TRACE_REG_REGISTER: | 191 | case TRACE_REG_REGISTER: |
154 | return tracepoint_probe_register(call->name, | 192 | return tracepoint_probe_register(call->name, |
155 | call->class->probe, | 193 | call->class->probe, |
156 | call); | 194 | file); |
157 | case TRACE_REG_UNREGISTER: | 195 | case TRACE_REG_UNREGISTER: |
158 | tracepoint_probe_unregister(call->name, | 196 | tracepoint_probe_unregister(call->name, |
159 | call->class->probe, | 197 | call->class->probe, |
160 | call); | 198 | file); |
161 | return 0; | 199 | return 0; |
162 | 200 | ||
163 | #ifdef CONFIG_PERF_EVENTS | 201 | #ifdef CONFIG_PERF_EVENTS |
@@ -183,54 +221,106 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg); | |||
183 | 221 | ||
184 | void trace_event_enable_cmd_record(bool enable) | 222 | void trace_event_enable_cmd_record(bool enable) |
185 | { | 223 | { |
186 | struct ftrace_event_call *call; | 224 | struct ftrace_event_file *file; |
225 | struct trace_array *tr; | ||
187 | 226 | ||
188 | mutex_lock(&event_mutex); | 227 | mutex_lock(&event_mutex); |
189 | list_for_each_entry(call, &ftrace_events, list) { | 228 | do_for_each_event_file(tr, file) { |
190 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) | 229 | |
230 | if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) | ||
191 | continue; | 231 | continue; |
192 | 232 | ||
193 | if (enable) { | 233 | if (enable) { |
194 | tracing_start_cmdline_record(); | 234 | tracing_start_cmdline_record(); |
195 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; | 235 | set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); |
196 | } else { | 236 | } else { |
197 | tracing_stop_cmdline_record(); | 237 | tracing_stop_cmdline_record(); |
198 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; | 238 | clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); |
199 | } | 239 | } |
200 | } | 240 | } while_for_each_event_file(); |
201 | mutex_unlock(&event_mutex); | 241 | mutex_unlock(&event_mutex); |
202 | } | 242 | } |
203 | 243 | ||
204 | static int ftrace_event_enable_disable(struct ftrace_event_call *call, | 244 | static int __ftrace_event_enable_disable(struct ftrace_event_file *file, |
205 | int enable) | 245 | int enable, int soft_disable) |
206 | { | 246 | { |
247 | struct ftrace_event_call *call = file->event_call; | ||
207 | int ret = 0; | 248 | int ret = 0; |
249 | int disable; | ||
208 | 250 | ||
209 | switch (enable) { | 251 | switch (enable) { |
210 | case 0: | 252 | case 0: |
211 | if (call->flags & TRACE_EVENT_FL_ENABLED) { | 253 | /* |
212 | call->flags &= ~TRACE_EVENT_FL_ENABLED; | 254 | * When soft_disable is set and enable is cleared, the sm_ref |
213 | if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { | 255 | * reference counter is decremented. If it reaches 0, we want |
256 | * to clear the SOFT_DISABLED flag but leave the event in the | ||
257 | * state that it was. That is, if the event was enabled and | ||
258 | * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED | ||
259 | * is set we do not want the event to be enabled before we | ||
260 | * clear the bit. | ||
261 | * | ||
262 | * When soft_disable is not set but the SOFT_MODE flag is, | ||
263 | * we do nothing. Do not disable the tracepoint, otherwise | ||
264 | * "soft enable"s (clearing the SOFT_DISABLED bit) won't work. | ||
265 | */ | ||
266 | if (soft_disable) { | ||
267 | if (atomic_dec_return(&file->sm_ref) > 0) | ||
268 | break; | ||
269 | disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; | ||
270 | clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); | ||
271 | } else | ||
272 | disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); | ||
273 | |||
274 | if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { | ||
275 | clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); | ||
276 | if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { | ||
214 | tracing_stop_cmdline_record(); | 277 | tracing_stop_cmdline_record(); |
215 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; | 278 | clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); |
216 | } | 279 | } |
217 | call->class->reg(call, TRACE_REG_UNREGISTER, NULL); | 280 | call->class->reg(call, TRACE_REG_UNREGISTER, file); |
218 | } | 281 | } |
282 | /* If in SOFT_MODE, just set the SOFT_DISABLED_BIT */ | ||
283 | if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) | ||
284 | set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); | ||
219 | break; | 285 | break; |
220 | case 1: | 286 | case 1: |
221 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { | 287 | /* |
288 | * When soft_disable is set and enable is set, we want to | ||
289 | * register the tracepoint for the event, but leave the event | ||
290 | * as is. That means, if the event was already enabled, we do | ||
291 | * nothing (but set SOFT_MODE). If the event is disabled, we | ||
292 | * set SOFT_DISABLED before enabling the event tracepoint, so | ||
293 | * it still seems to be disabled. | ||
294 | */ | ||
295 | if (!soft_disable) | ||
296 | clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); | ||
297 | else { | ||
298 | if (atomic_inc_return(&file->sm_ref) > 1) | ||
299 | break; | ||
300 | set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); | ||
301 | } | ||
302 | |||
303 | if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { | ||
304 | |||
305 | /* Keep the event disabled, when going to SOFT_MODE. */ | ||
306 | if (soft_disable) | ||
307 | set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); | ||
308 | |||
222 | if (trace_flags & TRACE_ITER_RECORD_CMD) { | 309 | if (trace_flags & TRACE_ITER_RECORD_CMD) { |
223 | tracing_start_cmdline_record(); | 310 | tracing_start_cmdline_record(); |
224 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; | 311 | set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); |
225 | } | 312 | } |
226 | ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); | 313 | ret = call->class->reg(call, TRACE_REG_REGISTER, file); |
227 | if (ret) { | 314 | if (ret) { |
228 | tracing_stop_cmdline_record(); | 315 | tracing_stop_cmdline_record(); |
229 | pr_info("event trace: Could not enable event " | 316 | pr_info("event trace: Could not enable event " |
230 | "%s\n", call->name); | 317 | "%s\n", call->name); |
231 | break; | 318 | break; |
232 | } | 319 | } |
233 | call->flags |= TRACE_EVENT_FL_ENABLED; | 320 | set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); |
321 | |||
322 | /* WAS_ENABLED gets set but never cleared. */ | ||
323 | call->flags |= TRACE_EVENT_FL_WAS_ENABLED; | ||
234 | } | 324 | } |
235 | break; | 325 | break; |
236 | } | 326 | } |
@@ -238,13 +328,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, | |||
238 | return ret; | 328 | return ret; |
239 | } | 329 | } |
240 | 330 | ||
241 | static void ftrace_clear_events(void) | 331 | static int ftrace_event_enable_disable(struct ftrace_event_file *file, |
332 | int enable) | ||
242 | { | 333 | { |
243 | struct ftrace_event_call *call; | 334 | return __ftrace_event_enable_disable(file, enable, 0); |
335 | } | ||
336 | |||
337 | static void ftrace_clear_events(struct trace_array *tr) | ||
338 | { | ||
339 | struct ftrace_event_file *file; | ||
244 | 340 | ||
245 | mutex_lock(&event_mutex); | 341 | mutex_lock(&event_mutex); |
246 | list_for_each_entry(call, &ftrace_events, list) { | 342 | list_for_each_entry(file, &tr->events, list) { |
247 | ftrace_event_enable_disable(call, 0); | 343 | ftrace_event_enable_disable(file, 0); |
248 | } | 344 | } |
249 | mutex_unlock(&event_mutex); | 345 | mutex_unlock(&event_mutex); |
250 | } | 346 | } |
@@ -257,11 +353,12 @@ static void __put_system(struct event_subsystem *system) | |||
257 | if (--system->ref_count) | 353 | if (--system->ref_count) |
258 | return; | 354 | return; |
259 | 355 | ||
356 | list_del(&system->list); | ||
357 | |||
260 | if (filter) { | 358 | if (filter) { |
261 | kfree(filter->filter_string); | 359 | kfree(filter->filter_string); |
262 | kfree(filter); | 360 | kfree(filter); |
263 | } | 361 | } |
264 | kfree(system->name); | ||
265 | kfree(system); | 362 | kfree(system); |
266 | } | 363 | } |
267 | 364 | ||
@@ -271,24 +368,45 @@ static void __get_system(struct event_subsystem *system) | |||
271 | system->ref_count++; | 368 | system->ref_count++; |
272 | } | 369 | } |
273 | 370 | ||
274 | static void put_system(struct event_subsystem *system) | 371 | static void __get_system_dir(struct ftrace_subsystem_dir *dir) |
372 | { | ||
373 | WARN_ON_ONCE(dir->ref_count == 0); | ||
374 | dir->ref_count++; | ||
375 | __get_system(dir->subsystem); | ||
376 | } | ||
377 | |||
378 | static void __put_system_dir(struct ftrace_subsystem_dir *dir) | ||
379 | { | ||
380 | WARN_ON_ONCE(dir->ref_count == 0); | ||
381 | /* If the subsystem is about to be freed, the dir must be too */ | ||
382 | WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); | ||
383 | |||
384 | __put_system(dir->subsystem); | ||
385 | if (!--dir->ref_count) | ||
386 | kfree(dir); | ||
387 | } | ||
388 | |||
389 | static void put_system(struct ftrace_subsystem_dir *dir) | ||
275 | { | 390 | { |
276 | mutex_lock(&event_mutex); | 391 | mutex_lock(&event_mutex); |
277 | __put_system(system); | 392 | __put_system_dir(dir); |
278 | mutex_unlock(&event_mutex); | 393 | mutex_unlock(&event_mutex); |
279 | } | 394 | } |
280 | 395 | ||
281 | /* | 396 | /* |
282 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. | 397 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. |
283 | */ | 398 | */ |
284 | static int __ftrace_set_clr_event(const char *match, const char *sub, | 399 | static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, |
285 | const char *event, int set) | 400 | const char *sub, const char *event, int set) |
286 | { | 401 | { |
402 | struct ftrace_event_file *file; | ||
287 | struct ftrace_event_call *call; | 403 | struct ftrace_event_call *call; |
288 | int ret = -EINVAL; | 404 | int ret = -EINVAL; |
289 | 405 | ||
290 | mutex_lock(&event_mutex); | 406 | mutex_lock(&event_mutex); |
291 | list_for_each_entry(call, &ftrace_events, list) { | 407 | list_for_each_entry(file, &tr->events, list) { |
408 | |||
409 | call = file->event_call; | ||
292 | 410 | ||
293 | if (!call->name || !call->class || !call->class->reg) | 411 | if (!call->name || !call->class || !call->class->reg) |
294 | continue; | 412 | continue; |
@@ -307,7 +425,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
307 | if (event && strcmp(event, call->name) != 0) | 425 | if (event && strcmp(event, call->name) != 0) |
308 | continue; | 426 | continue; |
309 | 427 | ||
310 | ftrace_event_enable_disable(call, set); | 428 | ftrace_event_enable_disable(file, set); |
311 | 429 | ||
312 | ret = 0; | 430 | ret = 0; |
313 | } | 431 | } |
@@ -316,7 +434,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
316 | return ret; | 434 | return ret; |
317 | } | 435 | } |
318 | 436 | ||
319 | static int ftrace_set_clr_event(char *buf, int set) | 437 | static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) |
320 | { | 438 | { |
321 | char *event = NULL, *sub = NULL, *match; | 439 | char *event = NULL, *sub = NULL, *match; |
322 | 440 | ||
@@ -344,7 +462,7 @@ static int ftrace_set_clr_event(char *buf, int set) | |||
344 | event = NULL; | 462 | event = NULL; |
345 | } | 463 | } |
346 | 464 | ||
347 | return __ftrace_set_clr_event(match, sub, event, set); | 465 | return __ftrace_set_clr_event(tr, match, sub, event, set); |
348 | } | 466 | } |
349 | 467 | ||
350 | /** | 468 | /** |
@@ -361,7 +479,9 @@ static int ftrace_set_clr_event(char *buf, int set) | |||
361 | */ | 479 | */ |
362 | int trace_set_clr_event(const char *system, const char *event, int set) | 480 | int trace_set_clr_event(const char *system, const char *event, int set) |
363 | { | 481 | { |
364 | return __ftrace_set_clr_event(NULL, system, event, set); | 482 | struct trace_array *tr = top_trace_array(); |
483 | |||
484 | return __ftrace_set_clr_event(tr, NULL, system, event, set); | ||
365 | } | 485 | } |
366 | EXPORT_SYMBOL_GPL(trace_set_clr_event); | 486 | EXPORT_SYMBOL_GPL(trace_set_clr_event); |
367 | 487 | ||
@@ -373,6 +493,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, | |||
373 | size_t cnt, loff_t *ppos) | 493 | size_t cnt, loff_t *ppos) |
374 | { | 494 | { |
375 | struct trace_parser parser; | 495 | struct trace_parser parser; |
496 | struct seq_file *m = file->private_data; | ||
497 | struct trace_array *tr = m->private; | ||
376 | ssize_t read, ret; | 498 | ssize_t read, ret; |
377 | 499 | ||
378 | if (!cnt) | 500 | if (!cnt) |
@@ -395,7 +517,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, | |||
395 | 517 | ||
396 | parser.buffer[parser.idx] = 0; | 518 | parser.buffer[parser.idx] = 0; |
397 | 519 | ||
398 | ret = ftrace_set_clr_event(parser.buffer + !set, set); | 520 | ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); |
399 | if (ret) | 521 | if (ret) |
400 | goto out_put; | 522 | goto out_put; |
401 | } | 523 | } |
@@ -411,17 +533,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf, | |||
411 | static void * | 533 | static void * |
412 | t_next(struct seq_file *m, void *v, loff_t *pos) | 534 | t_next(struct seq_file *m, void *v, loff_t *pos) |
413 | { | 535 | { |
414 | struct ftrace_event_call *call = v; | 536 | struct ftrace_event_file *file = v; |
537 | struct ftrace_event_call *call; | ||
538 | struct trace_array *tr = m->private; | ||
415 | 539 | ||
416 | (*pos)++; | 540 | (*pos)++; |
417 | 541 | ||
418 | list_for_each_entry_continue(call, &ftrace_events, list) { | 542 | list_for_each_entry_continue(file, &tr->events, list) { |
543 | call = file->event_call; | ||
419 | /* | 544 | /* |
420 | * The ftrace subsystem is for showing formats only. | 545 | * The ftrace subsystem is for showing formats only. |
421 | * They can not be enabled or disabled via the event files. | 546 | * They can not be enabled or disabled via the event files. |
422 | */ | 547 | */ |
423 | if (call->class && call->class->reg) | 548 | if (call->class && call->class->reg) |
424 | return call; | 549 | return file; |
425 | } | 550 | } |
426 | 551 | ||
427 | return NULL; | 552 | return NULL; |
@@ -429,30 +554,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
429 | 554 | ||
430 | static void *t_start(struct seq_file *m, loff_t *pos) | 555 | static void *t_start(struct seq_file *m, loff_t *pos) |
431 | { | 556 | { |
432 | struct ftrace_event_call *call; | 557 | struct ftrace_event_file *file; |
558 | struct trace_array *tr = m->private; | ||
433 | loff_t l; | 559 | loff_t l; |
434 | 560 | ||
435 | mutex_lock(&event_mutex); | 561 | mutex_lock(&event_mutex); |
436 | 562 | ||
437 | call = list_entry(&ftrace_events, struct ftrace_event_call, list); | 563 | file = list_entry(&tr->events, struct ftrace_event_file, list); |
438 | for (l = 0; l <= *pos; ) { | 564 | for (l = 0; l <= *pos; ) { |
439 | call = t_next(m, call, &l); | 565 | file = t_next(m, file, &l); |
440 | if (!call) | 566 | if (!file) |
441 | break; | 567 | break; |
442 | } | 568 | } |
443 | return call; | 569 | return file; |
444 | } | 570 | } |
445 | 571 | ||
446 | static void * | 572 | static void * |
447 | s_next(struct seq_file *m, void *v, loff_t *pos) | 573 | s_next(struct seq_file *m, void *v, loff_t *pos) |
448 | { | 574 | { |
449 | struct ftrace_event_call *call = v; | 575 | struct ftrace_event_file *file = v; |
576 | struct trace_array *tr = m->private; | ||
450 | 577 | ||
451 | (*pos)++; | 578 | (*pos)++; |
452 | 579 | ||
453 | list_for_each_entry_continue(call, &ftrace_events, list) { | 580 | list_for_each_entry_continue(file, &tr->events, list) { |
454 | if (call->flags & TRACE_EVENT_FL_ENABLED) | 581 | if (file->flags & FTRACE_EVENT_FL_ENABLED) |
455 | return call; | 582 | return file; |
456 | } | 583 | } |
457 | 584 | ||
458 | return NULL; | 585 | return NULL; |
@@ -460,23 +587,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos) | |||
460 | 587 | ||
461 | static void *s_start(struct seq_file *m, loff_t *pos) | 588 | static void *s_start(struct seq_file *m, loff_t *pos) |
462 | { | 589 | { |
463 | struct ftrace_event_call *call; | 590 | struct ftrace_event_file *file; |
591 | struct trace_array *tr = m->private; | ||
464 | loff_t l; | 592 | loff_t l; |
465 | 593 | ||
466 | mutex_lock(&event_mutex); | 594 | mutex_lock(&event_mutex); |
467 | 595 | ||
468 | call = list_entry(&ftrace_events, struct ftrace_event_call, list); | 596 | file = list_entry(&tr->events, struct ftrace_event_file, list); |
469 | for (l = 0; l <= *pos; ) { | 597 | for (l = 0; l <= *pos; ) { |
470 | call = s_next(m, call, &l); | 598 | file = s_next(m, file, &l); |
471 | if (!call) | 599 | if (!file) |
472 | break; | 600 | break; |
473 | } | 601 | } |
474 | return call; | 602 | return file; |
475 | } | 603 | } |
476 | 604 | ||
477 | static int t_show(struct seq_file *m, void *v) | 605 | static int t_show(struct seq_file *m, void *v) |
478 | { | 606 | { |
479 | struct ftrace_event_call *call = v; | 607 | struct ftrace_event_file *file = v; |
608 | struct ftrace_event_call *call = file->event_call; | ||
480 | 609 | ||
481 | if (strcmp(call->class->system, TRACE_SYSTEM) != 0) | 610 | if (strcmp(call->class->system, TRACE_SYSTEM) != 0) |
482 | seq_printf(m, "%s:", call->class->system); | 611 | seq_printf(m, "%s:", call->class->system); |
@@ -494,25 +623,33 @@ static ssize_t | |||
494 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | 623 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, |
495 | loff_t *ppos) | 624 | loff_t *ppos) |
496 | { | 625 | { |
497 | struct ftrace_event_call *call = filp->private_data; | 626 | struct ftrace_event_file *file = filp->private_data; |
498 | char *buf; | 627 | char *buf; |
499 | 628 | ||
500 | if (call->flags & TRACE_EVENT_FL_ENABLED) | 629 | if (file->flags & FTRACE_EVENT_FL_ENABLED) { |
501 | buf = "1\n"; | 630 | if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) |
502 | else | 631 | buf = "0*\n"; |
632 | else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) | ||
633 | buf = "1*\n"; | ||
634 | else | ||
635 | buf = "1\n"; | ||
636 | } else | ||
503 | buf = "0\n"; | 637 | buf = "0\n"; |
504 | 638 | ||
505 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); | 639 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); |
506 | } | 640 | } |
507 | 641 | ||
508 | static ssize_t | 642 | static ssize_t |
509 | event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | 643 | event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, |
510 | loff_t *ppos) | 644 | loff_t *ppos) |
511 | { | 645 | { |
512 | struct ftrace_event_call *call = filp->private_data; | 646 | struct ftrace_event_file *file = filp->private_data; |
513 | unsigned long val; | 647 | unsigned long val; |
514 | int ret; | 648 | int ret; |
515 | 649 | ||
650 | if (!file) | ||
651 | return -EINVAL; | ||
652 | |||
516 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | 653 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
517 | if (ret) | 654 | if (ret) |
518 | return ret; | 655 | return ret; |
@@ -525,7 +662,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
525 | case 0: | 662 | case 0: |
526 | case 1: | 663 | case 1: |
527 | mutex_lock(&event_mutex); | 664 | mutex_lock(&event_mutex); |
528 | ret = ftrace_event_enable_disable(call, val); | 665 | ret = ftrace_event_enable_disable(file, val); |
529 | mutex_unlock(&event_mutex); | 666 | mutex_unlock(&event_mutex); |
530 | break; | 667 | break; |
531 | 668 | ||
@@ -543,14 +680,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
543 | loff_t *ppos) | 680 | loff_t *ppos) |
544 | { | 681 | { |
545 | const char set_to_char[4] = { '?', '0', '1', 'X' }; | 682 | const char set_to_char[4] = { '?', '0', '1', 'X' }; |
546 | struct event_subsystem *system = filp->private_data; | 683 | struct ftrace_subsystem_dir *dir = filp->private_data; |
684 | struct event_subsystem *system = dir->subsystem; | ||
547 | struct ftrace_event_call *call; | 685 | struct ftrace_event_call *call; |
686 | struct ftrace_event_file *file; | ||
687 | struct trace_array *tr = dir->tr; | ||
548 | char buf[2]; | 688 | char buf[2]; |
549 | int set = 0; | 689 | int set = 0; |
550 | int ret; | 690 | int ret; |
551 | 691 | ||
552 | mutex_lock(&event_mutex); | 692 | mutex_lock(&event_mutex); |
553 | list_for_each_entry(call, &ftrace_events, list) { | 693 | list_for_each_entry(file, &tr->events, list) { |
694 | call = file->event_call; | ||
554 | if (!call->name || !call->class || !call->class->reg) | 695 | if (!call->name || !call->class || !call->class->reg) |
555 | continue; | 696 | continue; |
556 | 697 | ||
@@ -562,7 +703,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
562 | * or if all events are cleared, or if we have | 703 | * or if all events are cleared, or if we have |
563 | * a mixture. | 704 | * a mixture. |
564 | */ | 705 | */ |
565 | set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); | 706 | set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); |
566 | 707 | ||
567 | /* | 708 | /* |
568 | * If we have a mixture, no need to look further. | 709 | * If we have a mixture, no need to look further. |
@@ -584,7 +725,8 @@ static ssize_t | |||
584 | system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | 725 | system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, |
585 | loff_t *ppos) | 726 | loff_t *ppos) |
586 | { | 727 | { |
587 | struct event_subsystem *system = filp->private_data; | 728 | struct ftrace_subsystem_dir *dir = filp->private_data; |
729 | struct event_subsystem *system = dir->subsystem; | ||
588 | const char *name = NULL; | 730 | const char *name = NULL; |
589 | unsigned long val; | 731 | unsigned long val; |
590 | ssize_t ret; | 732 | ssize_t ret; |
@@ -607,7 +749,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
607 | if (system) | 749 | if (system) |
608 | name = system->name; | 750 | name = system->name; |
609 | 751 | ||
610 | ret = __ftrace_set_clr_event(NULL, name, NULL, val); | 752 | ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); |
611 | if (ret) | 753 | if (ret) |
612 | goto out; | 754 | goto out; |
613 | 755 | ||
@@ -845,43 +987,75 @@ static LIST_HEAD(event_subsystems); | |||
845 | static int subsystem_open(struct inode *inode, struct file *filp) | 987 | static int subsystem_open(struct inode *inode, struct file *filp) |
846 | { | 988 | { |
847 | struct event_subsystem *system = NULL; | 989 | struct event_subsystem *system = NULL; |
990 | struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ | ||
991 | struct trace_array *tr; | ||
848 | int ret; | 992 | int ret; |
849 | 993 | ||
850 | if (!inode->i_private) | ||
851 | goto skip_search; | ||
852 | |||
853 | /* Make sure the system still exists */ | 994 | /* Make sure the system still exists */ |
854 | mutex_lock(&event_mutex); | 995 | mutex_lock(&event_mutex); |
855 | list_for_each_entry(system, &event_subsystems, list) { | 996 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { |
856 | if (system == inode->i_private) { | 997 | list_for_each_entry(dir, &tr->systems, list) { |
857 | /* Don't open systems with no events */ | 998 | if (dir == inode->i_private) { |
858 | if (!system->nr_events) { | 999 | /* Don't open systems with no events */ |
859 | system = NULL; | 1000 | if (dir->nr_events) { |
860 | break; | 1001 | __get_system_dir(dir); |
1002 | system = dir->subsystem; | ||
1003 | } | ||
1004 | goto exit_loop; | ||
861 | } | 1005 | } |
862 | __get_system(system); | ||
863 | break; | ||
864 | } | 1006 | } |
865 | } | 1007 | } |
1008 | exit_loop: | ||
866 | mutex_unlock(&event_mutex); | 1009 | mutex_unlock(&event_mutex); |
867 | 1010 | ||
868 | if (system != inode->i_private) | 1011 | if (!system) |
869 | return -ENODEV; | 1012 | return -ENODEV; |
870 | 1013 | ||
871 | skip_search: | 1014 | /* Some versions of gcc think dir can be uninitialized here */ |
1015 | WARN_ON(!dir); | ||
1016 | |||
1017 | ret = tracing_open_generic(inode, filp); | ||
1018 | if (ret < 0) | ||
1019 | put_system(dir); | ||
1020 | |||
1021 | return ret; | ||
1022 | } | ||
1023 | |||
1024 | static int system_tr_open(struct inode *inode, struct file *filp) | ||
1025 | { | ||
1026 | struct ftrace_subsystem_dir *dir; | ||
1027 | struct trace_array *tr = inode->i_private; | ||
1028 | int ret; | ||
1029 | |||
1030 | /* Make a temporary dir that has no system but points to tr */ | ||
1031 | dir = kzalloc(sizeof(*dir), GFP_KERNEL); | ||
1032 | if (!dir) | ||
1033 | return -ENOMEM; | ||
1034 | |||
1035 | dir->tr = tr; | ||
1036 | |||
872 | ret = tracing_open_generic(inode, filp); | 1037 | ret = tracing_open_generic(inode, filp); |
873 | if (ret < 0 && system) | 1038 | if (ret < 0) |
874 | put_system(system); | 1039 | kfree(dir); |
1040 | |||
1041 | filp->private_data = dir; | ||
875 | 1042 | ||
876 | return ret; | 1043 | return ret; |
877 | } | 1044 | } |
878 | 1045 | ||
879 | static int subsystem_release(struct inode *inode, struct file *file) | 1046 | static int subsystem_release(struct inode *inode, struct file *file) |
880 | { | 1047 | { |
881 | struct event_subsystem *system = inode->i_private; | 1048 | struct ftrace_subsystem_dir *dir = file->private_data; |
882 | 1049 | ||
883 | if (system) | 1050 | /* |
884 | put_system(system); | 1051 | * If dir->subsystem is NULL, then this is a temporary |
1052 | * descriptor that was made for a trace_array to enable | ||
1053 | * all subsystems. | ||
1054 | */ | ||
1055 | if (dir->subsystem) | ||
1056 | put_system(dir); | ||
1057 | else | ||
1058 | kfree(dir); | ||
885 | 1059 | ||
886 | return 0; | 1060 | return 0; |
887 | } | 1061 | } |
@@ -890,7 +1064,8 @@ static ssize_t | |||
890 | subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | 1064 | subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, |
891 | loff_t *ppos) | 1065 | loff_t *ppos) |
892 | { | 1066 | { |
893 | struct event_subsystem *system = filp->private_data; | 1067 | struct ftrace_subsystem_dir *dir = filp->private_data; |
1068 | struct event_subsystem *system = dir->subsystem; | ||
894 | struct trace_seq *s; | 1069 | struct trace_seq *s; |
895 | int r; | 1070 | int r; |
896 | 1071 | ||
@@ -915,7 +1090,7 @@ static ssize_t | |||
915 | subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | 1090 | subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, |
916 | loff_t *ppos) | 1091 | loff_t *ppos) |
917 | { | 1092 | { |
918 | struct event_subsystem *system = filp->private_data; | 1093 | struct ftrace_subsystem_dir *dir = filp->private_data; |
919 | char *buf; | 1094 | char *buf; |
920 | int err; | 1095 | int err; |
921 | 1096 | ||
@@ -932,7 +1107,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
932 | } | 1107 | } |
933 | buf[cnt] = '\0'; | 1108 | buf[cnt] = '\0'; |
934 | 1109 | ||
935 | err = apply_subsystem_event_filter(system, buf); | 1110 | err = apply_subsystem_event_filter(dir, buf); |
936 | free_page((unsigned long) buf); | 1111 | free_page((unsigned long) buf); |
937 | if (err < 0) | 1112 | if (err < 0) |
938 | return err; | 1113 | return err; |
@@ -1041,30 +1216,35 @@ static const struct file_operations ftrace_system_enable_fops = { | |||
1041 | .release = subsystem_release, | 1216 | .release = subsystem_release, |
1042 | }; | 1217 | }; |
1043 | 1218 | ||
1219 | static const struct file_operations ftrace_tr_enable_fops = { | ||
1220 | .open = system_tr_open, | ||
1221 | .read = system_enable_read, | ||
1222 | .write = system_enable_write, | ||
1223 | .llseek = default_llseek, | ||
1224 | .release = subsystem_release, | ||
1225 | }; | ||
1226 | |||
1044 | static const struct file_operations ftrace_show_header_fops = { | 1227 | static const struct file_operations ftrace_show_header_fops = { |
1045 | .open = tracing_open_generic, | 1228 | .open = tracing_open_generic, |
1046 | .read = show_header, | 1229 | .read = show_header, |
1047 | .llseek = default_llseek, | 1230 | .llseek = default_llseek, |
1048 | }; | 1231 | }; |
1049 | 1232 | ||
1050 | static struct dentry *event_trace_events_dir(void) | 1233 | static int |
1234 | ftrace_event_open(struct inode *inode, struct file *file, | ||
1235 | const struct seq_operations *seq_ops) | ||
1051 | { | 1236 | { |
1052 | static struct dentry *d_tracer; | 1237 | struct seq_file *m; |
1053 | static struct dentry *d_events; | 1238 | int ret; |
1054 | |||
1055 | if (d_events) | ||
1056 | return d_events; | ||
1057 | |||
1058 | d_tracer = tracing_init_dentry(); | ||
1059 | if (!d_tracer) | ||
1060 | return NULL; | ||
1061 | 1239 | ||
1062 | d_events = debugfs_create_dir("events", d_tracer); | 1240 | ret = seq_open(file, seq_ops); |
1063 | if (!d_events) | 1241 | if (ret < 0) |
1064 | pr_warning("Could not create debugfs " | 1242 | return ret; |
1065 | "'events' directory\n"); | 1243 | m = file->private_data; |
1244 | /* copy tr over to seq ops */ | ||
1245 | m->private = inode->i_private; | ||
1066 | 1246 | ||
1067 | return d_events; | 1247 | return ret; |
1068 | } | 1248 | } |
1069 | 1249 | ||
1070 | static int | 1250 | static int |
@@ -1072,117 +1252,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) | |||
1072 | { | 1252 | { |
1073 | const struct seq_operations *seq_ops = &show_event_seq_ops; | 1253 | const struct seq_operations *seq_ops = &show_event_seq_ops; |
1074 | 1254 | ||
1075 | return seq_open(file, seq_ops); | 1255 | return ftrace_event_open(inode, file, seq_ops); |
1076 | } | 1256 | } |
1077 | 1257 | ||
1078 | static int | 1258 | static int |
1079 | ftrace_event_set_open(struct inode *inode, struct file *file) | 1259 | ftrace_event_set_open(struct inode *inode, struct file *file) |
1080 | { | 1260 | { |
1081 | const struct seq_operations *seq_ops = &show_set_event_seq_ops; | 1261 | const struct seq_operations *seq_ops = &show_set_event_seq_ops; |
1262 | struct trace_array *tr = inode->i_private; | ||
1082 | 1263 | ||
1083 | if ((file->f_mode & FMODE_WRITE) && | 1264 | if ((file->f_mode & FMODE_WRITE) && |
1084 | (file->f_flags & O_TRUNC)) | 1265 | (file->f_flags & O_TRUNC)) |
1085 | ftrace_clear_events(); | 1266 | ftrace_clear_events(tr); |
1086 | 1267 | ||
1087 | return seq_open(file, seq_ops); | 1268 | return ftrace_event_open(inode, file, seq_ops); |
1269 | } | ||
1270 | |||
1271 | static struct event_subsystem * | ||
1272 | create_new_subsystem(const char *name) | ||
1273 | { | ||
1274 | struct event_subsystem *system; | ||
1275 | |||
1276 | /* need to create new entry */ | ||
1277 | system = kmalloc(sizeof(*system), GFP_KERNEL); | ||
1278 | if (!system) | ||
1279 | return NULL; | ||
1280 | |||
1281 | system->ref_count = 1; | ||
1282 | system->name = name; | ||
1283 | |||
1284 | system->filter = NULL; | ||
1285 | |||
1286 | system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); | ||
1287 | if (!system->filter) | ||
1288 | goto out_free; | ||
1289 | |||
1290 | list_add(&system->list, &event_subsystems); | ||
1291 | |||
1292 | return system; | ||
1293 | |||
1294 | out_free: | ||
1295 | kfree(system); | ||
1296 | return NULL; | ||
1088 | } | 1297 | } |
1089 | 1298 | ||
1090 | static struct dentry * | 1299 | static struct dentry * |
1091 | event_subsystem_dir(const char *name, struct dentry *d_events) | 1300 | event_subsystem_dir(struct trace_array *tr, const char *name, |
1301 | struct ftrace_event_file *file, struct dentry *parent) | ||
1092 | { | 1302 | { |
1303 | struct ftrace_subsystem_dir *dir; | ||
1093 | struct event_subsystem *system; | 1304 | struct event_subsystem *system; |
1094 | struct dentry *entry; | 1305 | struct dentry *entry; |
1095 | 1306 | ||
1096 | /* First see if we did not already create this dir */ | 1307 | /* First see if we did not already create this dir */ |
1097 | list_for_each_entry(system, &event_subsystems, list) { | 1308 | list_for_each_entry(dir, &tr->systems, list) { |
1309 | system = dir->subsystem; | ||
1098 | if (strcmp(system->name, name) == 0) { | 1310 | if (strcmp(system->name, name) == 0) { |
1099 | system->nr_events++; | 1311 | dir->nr_events++; |
1100 | return system->entry; | 1312 | file->system = dir; |
1313 | return dir->entry; | ||
1101 | } | 1314 | } |
1102 | } | 1315 | } |
1103 | 1316 | ||
1104 | /* need to create new entry */ | 1317 | /* Now see if the system itself exists. */ |
1105 | system = kmalloc(sizeof(*system), GFP_KERNEL); | 1318 | list_for_each_entry(system, &event_subsystems, list) { |
1106 | if (!system) { | 1319 | if (strcmp(system->name, name) == 0) |
1107 | pr_warning("No memory to create event subsystem %s\n", | 1320 | break; |
1108 | name); | ||
1109 | return d_events; | ||
1110 | } | 1321 | } |
1322 | /* Reset system variable when not found */ | ||
1323 | if (&system->list == &event_subsystems) | ||
1324 | system = NULL; | ||
1111 | 1325 | ||
1112 | system->entry = debugfs_create_dir(name, d_events); | 1326 | dir = kmalloc(sizeof(*dir), GFP_KERNEL); |
1113 | if (!system->entry) { | 1327 | if (!dir) |
1114 | pr_warning("Could not create event subsystem %s\n", | 1328 | goto out_fail; |
1115 | name); | ||
1116 | kfree(system); | ||
1117 | return d_events; | ||
1118 | } | ||
1119 | 1329 | ||
1120 | system->nr_events = 1; | 1330 | if (!system) { |
1121 | system->ref_count = 1; | 1331 | system = create_new_subsystem(name); |
1122 | system->name = kstrdup(name, GFP_KERNEL); | 1332 | if (!system) |
1123 | if (!system->name) { | 1333 | goto out_free; |
1124 | debugfs_remove(system->entry); | 1334 | } else |
1125 | kfree(system); | 1335 | __get_system(system); |
1126 | return d_events; | 1336 | |
1337 | dir->entry = debugfs_create_dir(name, parent); | ||
1338 | if (!dir->entry) { | ||
1339 | pr_warning("Failed to create system directory %s\n", name); | ||
1340 | __put_system(system); | ||
1341 | goto out_free; | ||
1127 | } | 1342 | } |
1128 | 1343 | ||
1129 | list_add(&system->list, &event_subsystems); | 1344 | dir->tr = tr; |
1130 | 1345 | dir->ref_count = 1; | |
1131 | system->filter = NULL; | 1346 | dir->nr_events = 1; |
1132 | 1347 | dir->subsystem = system; | |
1133 | system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); | 1348 | file->system = dir; |
1134 | if (!system->filter) { | ||
1135 | pr_warning("Could not allocate filter for subsystem " | ||
1136 | "'%s'\n", name); | ||
1137 | return system->entry; | ||
1138 | } | ||
1139 | 1349 | ||
1140 | entry = debugfs_create_file("filter", 0644, system->entry, system, | 1350 | entry = debugfs_create_file("filter", 0644, dir->entry, dir, |
1141 | &ftrace_subsystem_filter_fops); | 1351 | &ftrace_subsystem_filter_fops); |
1142 | if (!entry) { | 1352 | if (!entry) { |
1143 | kfree(system->filter); | 1353 | kfree(system->filter); |
1144 | system->filter = NULL; | 1354 | system->filter = NULL; |
1145 | pr_warning("Could not create debugfs " | 1355 | pr_warning("Could not create debugfs '%s/filter' entry\n", name); |
1146 | "'%s/filter' entry\n", name); | ||
1147 | } | 1356 | } |
1148 | 1357 | ||
1149 | trace_create_file("enable", 0644, system->entry, system, | 1358 | trace_create_file("enable", 0644, dir->entry, dir, |
1150 | &ftrace_system_enable_fops); | 1359 | &ftrace_system_enable_fops); |
1151 | 1360 | ||
1152 | return system->entry; | 1361 | list_add(&dir->list, &tr->systems); |
1362 | |||
1363 | return dir->entry; | ||
1364 | |||
1365 | out_free: | ||
1366 | kfree(dir); | ||
1367 | out_fail: | ||
1368 | /* Only print this message if failed on memory allocation */ | ||
1369 | if (!dir || !system) | ||
1370 | pr_warning("No memory to create event subsystem %s\n", | ||
1371 | name); | ||
1372 | return NULL; | ||
1153 | } | 1373 | } |
1154 | 1374 | ||
1155 | static int | 1375 | static int |
1156 | event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | 1376 | event_create_dir(struct dentry *parent, |
1377 | struct ftrace_event_file *file, | ||
1157 | const struct file_operations *id, | 1378 | const struct file_operations *id, |
1158 | const struct file_operations *enable, | 1379 | const struct file_operations *enable, |
1159 | const struct file_operations *filter, | 1380 | const struct file_operations *filter, |
1160 | const struct file_operations *format) | 1381 | const struct file_operations *format) |
1161 | { | 1382 | { |
1383 | struct ftrace_event_call *call = file->event_call; | ||
1384 | struct trace_array *tr = file->tr; | ||
1162 | struct list_head *head; | 1385 | struct list_head *head; |
1386 | struct dentry *d_events; | ||
1163 | int ret; | 1387 | int ret; |
1164 | 1388 | ||
1165 | /* | 1389 | /* |
1166 | * If the trace point header did not define TRACE_SYSTEM | 1390 | * If the trace point header did not define TRACE_SYSTEM |
1167 | * then the system would be called "TRACE_SYSTEM". | 1391 | * then the system would be called "TRACE_SYSTEM". |
1168 | */ | 1392 | */ |
1169 | if (strcmp(call->class->system, TRACE_SYSTEM) != 0) | 1393 | if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { |
1170 | d_events = event_subsystem_dir(call->class->system, d_events); | 1394 | d_events = event_subsystem_dir(tr, call->class->system, file, parent); |
1171 | 1395 | if (!d_events) | |
1172 | call->dir = debugfs_create_dir(call->name, d_events); | 1396 | return -ENOMEM; |
1173 | if (!call->dir) { | 1397 | } else |
1174 | pr_warning("Could not create debugfs " | 1398 | d_events = parent; |
1175 | "'%s' directory\n", call->name); | 1399 | |
1400 | file->dir = debugfs_create_dir(call->name, d_events); | ||
1401 | if (!file->dir) { | ||
1402 | pr_warning("Could not create debugfs '%s' directory\n", | ||
1403 | call->name); | ||
1176 | return -1; | 1404 | return -1; |
1177 | } | 1405 | } |
1178 | 1406 | ||
1179 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) | 1407 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1180 | trace_create_file("enable", 0644, call->dir, call, | 1408 | trace_create_file("enable", 0644, file->dir, file, |
1181 | enable); | 1409 | enable); |
1182 | 1410 | ||
1183 | #ifdef CONFIG_PERF_EVENTS | 1411 | #ifdef CONFIG_PERF_EVENTS |
1184 | if (call->event.type && call->class->reg) | 1412 | if (call->event.type && call->class->reg) |
1185 | trace_create_file("id", 0444, call->dir, call, | 1413 | trace_create_file("id", 0444, file->dir, call, |
1186 | id); | 1414 | id); |
1187 | #endif | 1415 | #endif |
1188 | 1416 | ||
@@ -1196,23 +1424,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1196 | if (ret < 0) { | 1424 | if (ret < 0) { |
1197 | pr_warning("Could not initialize trace point" | 1425 | pr_warning("Could not initialize trace point" |
1198 | " events/%s\n", call->name); | 1426 | " events/%s\n", call->name); |
1199 | return ret; | 1427 | return -1; |
1200 | } | 1428 | } |
1201 | } | 1429 | } |
1202 | trace_create_file("filter", 0644, call->dir, call, | 1430 | trace_create_file("filter", 0644, file->dir, call, |
1203 | filter); | 1431 | filter); |
1204 | 1432 | ||
1205 | trace_create_file("format", 0444, call->dir, call, | 1433 | trace_create_file("format", 0444, file->dir, call, |
1206 | format); | 1434 | format); |
1207 | 1435 | ||
1208 | return 0; | 1436 | return 0; |
1209 | } | 1437 | } |
1210 | 1438 | ||
1439 | static void remove_subsystem(struct ftrace_subsystem_dir *dir) | ||
1440 | { | ||
1441 | if (!dir) | ||
1442 | return; | ||
1443 | |||
1444 | if (!--dir->nr_events) { | ||
1445 | debugfs_remove_recursive(dir->entry); | ||
1446 | list_del(&dir->list); | ||
1447 | __put_system_dir(dir); | ||
1448 | } | ||
1449 | } | ||
1450 | |||
1451 | static void remove_event_from_tracers(struct ftrace_event_call *call) | ||
1452 | { | ||
1453 | struct ftrace_event_file *file; | ||
1454 | struct trace_array *tr; | ||
1455 | |||
1456 | do_for_each_event_file_safe(tr, file) { | ||
1457 | |||
1458 | if (file->event_call != call) | ||
1459 | continue; | ||
1460 | |||
1461 | list_del(&file->list); | ||
1462 | debugfs_remove_recursive(file->dir); | ||
1463 | remove_subsystem(file->system); | ||
1464 | kmem_cache_free(file_cachep, file); | ||
1465 | |||
1466 | /* | ||
1467 | * The do_for_each_event_file_safe() is | ||
1468 | * a double loop. After finding the call for this | ||
1469 | * trace_array, we use break to jump to the next | ||
1470 | * trace_array. | ||
1471 | */ | ||
1472 | break; | ||
1473 | } while_for_each_event_file(); | ||
1474 | } | ||
1475 | |||
1211 | static void event_remove(struct ftrace_event_call *call) | 1476 | static void event_remove(struct ftrace_event_call *call) |
1212 | { | 1477 | { |
1213 | ftrace_event_enable_disable(call, 0); | 1478 | struct trace_array *tr; |
1479 | struct ftrace_event_file *file; | ||
1480 | |||
1481 | do_for_each_event_file(tr, file) { | ||
1482 | if (file->event_call != call) | ||
1483 | continue; | ||
1484 | ftrace_event_enable_disable(file, 0); | ||
1485 | /* | ||
1486 | * The do_for_each_event_file() is | ||
1487 | * a double loop. After finding the call for this | ||
1488 | * trace_array, we use break to jump to the next | ||
1489 | * trace_array. | ||
1490 | */ | ||
1491 | break; | ||
1492 | } while_for_each_event_file(); | ||
1493 | |||
1214 | if (call->event.funcs) | 1494 | if (call->event.funcs) |
1215 | __unregister_ftrace_event(&call->event); | 1495 | __unregister_ftrace_event(&call->event); |
1496 | remove_event_from_tracers(call); | ||
1216 | list_del(&call->list); | 1497 | list_del(&call->list); |
1217 | } | 1498 | } |
1218 | 1499 | ||
@@ -1234,82 +1515,109 @@ static int event_init(struct ftrace_event_call *call) | |||
1234 | } | 1515 | } |
1235 | 1516 | ||
1236 | static int | 1517 | static int |
1237 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | 1518 | __register_event(struct ftrace_event_call *call, struct module *mod) |
1238 | const struct file_operations *id, | ||
1239 | const struct file_operations *enable, | ||
1240 | const struct file_operations *filter, | ||
1241 | const struct file_operations *format) | ||
1242 | { | 1519 | { |
1243 | struct dentry *d_events; | ||
1244 | int ret; | 1520 | int ret; |
1245 | 1521 | ||
1246 | ret = event_init(call); | 1522 | ret = event_init(call); |
1247 | if (ret < 0) | 1523 | if (ret < 0) |
1248 | return ret; | 1524 | return ret; |
1249 | 1525 | ||
1250 | d_events = event_trace_events_dir(); | 1526 | list_add(&call->list, &ftrace_events); |
1251 | if (!d_events) | ||
1252 | return -ENOENT; | ||
1253 | |||
1254 | ret = event_create_dir(call, d_events, id, enable, filter, format); | ||
1255 | if (!ret) | ||
1256 | list_add(&call->list, &ftrace_events); | ||
1257 | call->mod = mod; | 1527 | call->mod = mod; |
1258 | 1528 | ||
1259 | return ret; | 1529 | return 0; |
1530 | } | ||
1531 | |||
1532 | static struct ftrace_event_file * | ||
1533 | trace_create_new_event(struct ftrace_event_call *call, | ||
1534 | struct trace_array *tr) | ||
1535 | { | ||
1536 | struct ftrace_event_file *file; | ||
1537 | |||
1538 | file = kmem_cache_alloc(file_cachep, GFP_TRACE); | ||
1539 | if (!file) | ||
1540 | return NULL; | ||
1541 | |||
1542 | file->event_call = call; | ||
1543 | file->tr = tr; | ||
1544 | atomic_set(&file->sm_ref, 0); | ||
1545 | list_add(&file->list, &tr->events); | ||
1546 | |||
1547 | return file; | ||
1260 | } | 1548 | } |
1261 | 1549 | ||
1550 | /* Add an event to a trace directory */ | ||
1551 | static int | ||
1552 | __trace_add_new_event(struct ftrace_event_call *call, | ||
1553 | struct trace_array *tr, | ||
1554 | const struct file_operations *id, | ||
1555 | const struct file_operations *enable, | ||
1556 | const struct file_operations *filter, | ||
1557 | const struct file_operations *format) | ||
1558 | { | ||
1559 | struct ftrace_event_file *file; | ||
1560 | |||
1561 | file = trace_create_new_event(call, tr); | ||
1562 | if (!file) | ||
1563 | return -ENOMEM; | ||
1564 | |||
1565 | return event_create_dir(tr->event_dir, file, id, enable, filter, format); | ||
1566 | } | ||
1567 | |||
1568 | /* | ||
1569 | * Just create a descriptor for early init. A descriptor is required | ||
1570 | * for enabling events at boot. We want to enable events before | ||
1571 | * the filesystem is initialized. | ||
1572 | */ | ||
1573 | static __init int | ||
1574 | __trace_early_add_new_event(struct ftrace_event_call *call, | ||
1575 | struct trace_array *tr) | ||
1576 | { | ||
1577 | struct ftrace_event_file *file; | ||
1578 | |||
1579 | file = trace_create_new_event(call, tr); | ||
1580 | if (!file) | ||
1581 | return -ENOMEM; | ||
1582 | |||
1583 | return 0; | ||
1584 | } | ||
1585 | |||
1586 | struct ftrace_module_file_ops; | ||
1587 | static void __add_event_to_tracers(struct ftrace_event_call *call, | ||
1588 | struct ftrace_module_file_ops *file_ops); | ||
1589 | |||
1262 | /* Add an additional event_call dynamically */ | 1590 | /* Add an additional event_call dynamically */ |
1263 | int trace_add_event_call(struct ftrace_event_call *call) | 1591 | int trace_add_event_call(struct ftrace_event_call *call) |
1264 | { | 1592 | { |
1265 | int ret; | 1593 | int ret; |
1266 | mutex_lock(&event_mutex); | 1594 | mutex_lock(&event_mutex); |
1267 | ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, | ||
1268 | &ftrace_enable_fops, | ||
1269 | &ftrace_event_filter_fops, | ||
1270 | &ftrace_event_format_fops); | ||
1271 | mutex_unlock(&event_mutex); | ||
1272 | return ret; | ||
1273 | } | ||
1274 | |||
1275 | static void remove_subsystem_dir(const char *name) | ||
1276 | { | ||
1277 | struct event_subsystem *system; | ||
1278 | 1595 | ||
1279 | if (strcmp(name, TRACE_SYSTEM) == 0) | 1596 | ret = __register_event(call, NULL); |
1280 | return; | 1597 | if (ret >= 0) |
1598 | __add_event_to_tracers(call, NULL); | ||
1281 | 1599 | ||
1282 | list_for_each_entry(system, &event_subsystems, list) { | 1600 | mutex_unlock(&event_mutex); |
1283 | if (strcmp(system->name, name) == 0) { | 1601 | return ret; |
1284 | if (!--system->nr_events) { | ||
1285 | debugfs_remove_recursive(system->entry); | ||
1286 | list_del(&system->list); | ||
1287 | __put_system(system); | ||
1288 | } | ||
1289 | break; | ||
1290 | } | ||
1291 | } | ||
1292 | } | 1602 | } |
1293 | 1603 | ||
1294 | /* | 1604 | /* |
1295 | * Must be called under locking both of event_mutex and trace_event_mutex. | 1605 | * Must be called under locking both of event_mutex and trace_event_sem. |
1296 | */ | 1606 | */ |
1297 | static void __trace_remove_event_call(struct ftrace_event_call *call) | 1607 | static void __trace_remove_event_call(struct ftrace_event_call *call) |
1298 | { | 1608 | { |
1299 | event_remove(call); | 1609 | event_remove(call); |
1300 | trace_destroy_fields(call); | 1610 | trace_destroy_fields(call); |
1301 | destroy_preds(call); | 1611 | destroy_preds(call); |
1302 | debugfs_remove_recursive(call->dir); | ||
1303 | remove_subsystem_dir(call->class->system); | ||
1304 | } | 1612 | } |
1305 | 1613 | ||
1306 | /* Remove an event_call */ | 1614 | /* Remove an event_call */ |
1307 | void trace_remove_event_call(struct ftrace_event_call *call) | 1615 | void trace_remove_event_call(struct ftrace_event_call *call) |
1308 | { | 1616 | { |
1309 | mutex_lock(&event_mutex); | 1617 | mutex_lock(&event_mutex); |
1310 | down_write(&trace_event_mutex); | 1618 | down_write(&trace_event_sem); |
1311 | __trace_remove_event_call(call); | 1619 | __trace_remove_event_call(call); |
1312 | up_write(&trace_event_mutex); | 1620 | up_write(&trace_event_sem); |
1313 | mutex_unlock(&event_mutex); | 1621 | mutex_unlock(&event_mutex); |
1314 | } | 1622 | } |
1315 | 1623 | ||
@@ -1336,6 +1644,26 @@ struct ftrace_module_file_ops { | |||
1336 | }; | 1644 | }; |
1337 | 1645 | ||
1338 | static struct ftrace_module_file_ops * | 1646 | static struct ftrace_module_file_ops * |
1647 | find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) | ||
1648 | { | ||
1649 | /* | ||
1650 | * As event_calls are added in groups by module, | ||
1651 | * when we find one file_ops, we don't need to search for | ||
1652 | * each call in that module, as the rest should be the | ||
1653 | * same. Only search for a new one if the last one did | ||
1654 | * not match. | ||
1655 | */ | ||
1656 | if (file_ops && mod == file_ops->mod) | ||
1657 | return file_ops; | ||
1658 | |||
1659 | list_for_each_entry(file_ops, &ftrace_module_file_list, list) { | ||
1660 | if (file_ops->mod == mod) | ||
1661 | return file_ops; | ||
1662 | } | ||
1663 | return NULL; | ||
1664 | } | ||
1665 | |||
1666 | static struct ftrace_module_file_ops * | ||
1339 | trace_create_file_ops(struct module *mod) | 1667 | trace_create_file_ops(struct module *mod) |
1340 | { | 1668 | { |
1341 | struct ftrace_module_file_ops *file_ops; | 1669 | struct ftrace_module_file_ops *file_ops; |
@@ -1386,9 +1714,8 @@ static void trace_module_add_events(struct module *mod) | |||
1386 | return; | 1714 | return; |
1387 | 1715 | ||
1388 | for_each_event(call, start, end) { | 1716 | for_each_event(call, start, end) { |
1389 | __trace_add_event_call(*call, mod, | 1717 | __register_event(*call, mod); |
1390 | &file_ops->id, &file_ops->enable, | 1718 | __add_event_to_tracers(*call, file_ops); |
1391 | &file_ops->filter, &file_ops->format); | ||
1392 | } | 1719 | } |
1393 | } | 1720 | } |
1394 | 1721 | ||
@@ -1396,12 +1723,13 @@ static void trace_module_remove_events(struct module *mod) | |||
1396 | { | 1723 | { |
1397 | struct ftrace_module_file_ops *file_ops; | 1724 | struct ftrace_module_file_ops *file_ops; |
1398 | struct ftrace_event_call *call, *p; | 1725 | struct ftrace_event_call *call, *p; |
1399 | bool found = false; | 1726 | bool clear_trace = false; |
1400 | 1727 | ||
1401 | down_write(&trace_event_mutex); | 1728 | down_write(&trace_event_sem); |
1402 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | 1729 | list_for_each_entry_safe(call, p, &ftrace_events, list) { |
1403 | if (call->mod == mod) { | 1730 | if (call->mod == mod) { |
1404 | found = true; | 1731 | if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) |
1732 | clear_trace = true; | ||
1405 | __trace_remove_event_call(call); | 1733 | __trace_remove_event_call(call); |
1406 | } | 1734 | } |
1407 | } | 1735 | } |
@@ -1415,14 +1743,18 @@ static void trace_module_remove_events(struct module *mod) | |||
1415 | list_del(&file_ops->list); | 1743 | list_del(&file_ops->list); |
1416 | kfree(file_ops); | 1744 | kfree(file_ops); |
1417 | } | 1745 | } |
1746 | up_write(&trace_event_sem); | ||
1418 | 1747 | ||
1419 | /* | 1748 | /* |
1420 | * It is safest to reset the ring buffer if the module being unloaded | 1749 | * It is safest to reset the ring buffer if the module being unloaded |
1421 | * registered any events. | 1750 | * registered any events that were used. The only worry is if |
1751 | * a new module gets loaded, and takes on the same id as the events | ||
1752 | * of this module. When printing out the buffer, traced events left | ||
1753 | * over from this module may be passed to the new module events and | ||
1754 | * unexpected results may occur. | ||
1422 | */ | 1755 | */ |
1423 | if (found) | 1756 | if (clear_trace) |
1424 | tracing_reset_current_online_cpus(); | 1757 | tracing_reset_all_online_cpus(); |
1425 | up_write(&trace_event_mutex); | ||
1426 | } | 1758 | } |
1427 | 1759 | ||
1428 | static int trace_module_notify(struct notifier_block *self, | 1760 | static int trace_module_notify(struct notifier_block *self, |
@@ -1443,14 +1775,445 @@ static int trace_module_notify(struct notifier_block *self, | |||
1443 | 1775 | ||
1444 | return 0; | 1776 | return 0; |
1445 | } | 1777 | } |
1778 | |||
1779 | static int | ||
1780 | __trace_add_new_mod_event(struct ftrace_event_call *call, | ||
1781 | struct trace_array *tr, | ||
1782 | struct ftrace_module_file_ops *file_ops) | ||
1783 | { | ||
1784 | return __trace_add_new_event(call, tr, | ||
1785 | &file_ops->id, &file_ops->enable, | ||
1786 | &file_ops->filter, &file_ops->format); | ||
1787 | } | ||
1788 | |||
1446 | #else | 1789 | #else |
1447 | static int trace_module_notify(struct notifier_block *self, | 1790 | static inline struct ftrace_module_file_ops * |
1448 | unsigned long val, void *data) | 1791 | find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) |
1792 | { | ||
1793 | return NULL; | ||
1794 | } | ||
1795 | static inline int trace_module_notify(struct notifier_block *self, | ||
1796 | unsigned long val, void *data) | ||
1449 | { | 1797 | { |
1450 | return 0; | 1798 | return 0; |
1451 | } | 1799 | } |
1800 | static inline int | ||
1801 | __trace_add_new_mod_event(struct ftrace_event_call *call, | ||
1802 | struct trace_array *tr, | ||
1803 | struct ftrace_module_file_ops *file_ops) | ||
1804 | { | ||
1805 | return -ENODEV; | ||
1806 | } | ||
1452 | #endif /* CONFIG_MODULES */ | 1807 | #endif /* CONFIG_MODULES */ |
1453 | 1808 | ||
1809 | /* Create a new event directory structure for a trace directory. */ | ||
1810 | static void | ||
1811 | __trace_add_event_dirs(struct trace_array *tr) | ||
1812 | { | ||
1813 | struct ftrace_module_file_ops *file_ops = NULL; | ||
1814 | struct ftrace_event_call *call; | ||
1815 | int ret; | ||
1816 | |||
1817 | list_for_each_entry(call, &ftrace_events, list) { | ||
1818 | if (call->mod) { | ||
1819 | /* | ||
1820 | * Directories for events by modules need to | ||
1821 | * keep module ref counts when opened (as we don't | ||
1822 | * want the module to disappear when reading one | ||
1823 | * of these files). The file_ops keep account of | ||
1824 | * the module ref count. | ||
1825 | */ | ||
1826 | file_ops = find_ftrace_file_ops(file_ops, call->mod); | ||
1827 | if (!file_ops) | ||
1828 | continue; /* Warn? */ | ||
1829 | ret = __trace_add_new_mod_event(call, tr, file_ops); | ||
1830 | if (ret < 0) | ||
1831 | pr_warning("Could not create directory for event %s\n", | ||
1832 | call->name); | ||
1833 | continue; | ||
1834 | } | ||
1835 | ret = __trace_add_new_event(call, tr, | ||
1836 | &ftrace_event_id_fops, | ||
1837 | &ftrace_enable_fops, | ||
1838 | &ftrace_event_filter_fops, | ||
1839 | &ftrace_event_format_fops); | ||
1840 | if (ret < 0) | ||
1841 | pr_warning("Could not create directory for event %s\n", | ||
1842 | call->name); | ||
1843 | } | ||
1844 | } | ||
1845 | |||
1846 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
1847 | |||
1848 | /* Avoid typos */ | ||
1849 | #define ENABLE_EVENT_STR "enable_event" | ||
1850 | #define DISABLE_EVENT_STR "disable_event" | ||
1851 | |||
1852 | struct event_probe_data { | ||
1853 | struct ftrace_event_file *file; | ||
1854 | unsigned long count; | ||
1855 | int ref; | ||
1856 | bool enable; | ||
1857 | }; | ||
1858 | |||
1859 | static struct ftrace_event_file * | ||
1860 | find_event_file(struct trace_array *tr, const char *system, const char *event) | ||
1861 | { | ||
1862 | struct ftrace_event_file *file; | ||
1863 | struct ftrace_event_call *call; | ||
1864 | |||
1865 | list_for_each_entry(file, &tr->events, list) { | ||
1866 | |||
1867 | call = file->event_call; | ||
1868 | |||
1869 | if (!call->name || !call->class || !call->class->reg) | ||
1870 | continue; | ||
1871 | |||
1872 | if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) | ||
1873 | continue; | ||
1874 | |||
1875 | if (strcmp(event, call->name) == 0 && | ||
1876 | strcmp(system, call->class->system) == 0) | ||
1877 | return file; | ||
1878 | } | ||
1879 | return NULL; | ||
1880 | } | ||
1881 | |||
1882 | static void | ||
1883 | event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) | ||
1884 | { | ||
1885 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | ||
1886 | struct event_probe_data *data = *pdata; | ||
1887 | |||
1888 | if (!data) | ||
1889 | return; | ||
1890 | |||
1891 | if (data->enable) | ||
1892 | clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); | ||
1893 | else | ||
1894 | set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); | ||
1895 | } | ||
1896 | |||
1897 | static void | ||
1898 | event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) | ||
1899 | { | ||
1900 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | ||
1901 | struct event_probe_data *data = *pdata; | ||
1902 | |||
1903 | if (!data) | ||
1904 | return; | ||
1905 | |||
1906 | if (!data->count) | ||
1907 | return; | ||
1908 | |||
1909 | /* Skip if the event is in a state we want to switch to */ | ||
1910 | if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) | ||
1911 | return; | ||
1912 | |||
1913 | if (data->count != -1) | ||
1914 | (data->count)--; | ||
1915 | |||
1916 | event_enable_probe(ip, parent_ip, _data); | ||
1917 | } | ||
1918 | |||
1919 | static int | ||
1920 | event_enable_print(struct seq_file *m, unsigned long ip, | ||
1921 | struct ftrace_probe_ops *ops, void *_data) | ||
1922 | { | ||
1923 | struct event_probe_data *data = _data; | ||
1924 | |||
1925 | seq_printf(m, "%ps:", (void *)ip); | ||
1926 | |||
1927 | seq_printf(m, "%s:%s:%s", | ||
1928 | data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, | ||
1929 | data->file->event_call->class->system, | ||
1930 | data->file->event_call->name); | ||
1931 | |||
1932 | if (data->count == -1) | ||
1933 | seq_printf(m, ":unlimited\n"); | ||
1934 | else | ||
1935 | seq_printf(m, ":count=%ld\n", data->count); | ||
1936 | |||
1937 | return 0; | ||
1938 | } | ||
1939 | |||
1940 | static int | ||
1941 | event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, | ||
1942 | void **_data) | ||
1943 | { | ||
1944 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | ||
1945 | struct event_probe_data *data = *pdata; | ||
1946 | |||
1947 | data->ref++; | ||
1948 | return 0; | ||
1949 | } | ||
1950 | |||
1951 | static void | ||
1952 | event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, | ||
1953 | void **_data) | ||
1954 | { | ||
1955 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | ||
1956 | struct event_probe_data *data = *pdata; | ||
1957 | |||
1958 | if (WARN_ON_ONCE(data->ref <= 0)) | ||
1959 | return; | ||
1960 | |||
1961 | data->ref--; | ||
1962 | if (!data->ref) { | ||
1963 | /* Remove the SOFT_MODE flag */ | ||
1964 | __ftrace_event_enable_disable(data->file, 0, 1); | ||
1965 | module_put(data->file->event_call->mod); | ||
1966 | kfree(data); | ||
1967 | } | ||
1968 | *pdata = NULL; | ||
1969 | } | ||
1970 | |||
1971 | static struct ftrace_probe_ops event_enable_probe_ops = { | ||
1972 | .func = event_enable_probe, | ||
1973 | .print = event_enable_print, | ||
1974 | .init = event_enable_init, | ||
1975 | .free = event_enable_free, | ||
1976 | }; | ||
1977 | |||
1978 | static struct ftrace_probe_ops event_enable_count_probe_ops = { | ||
1979 | .func = event_enable_count_probe, | ||
1980 | .print = event_enable_print, | ||
1981 | .init = event_enable_init, | ||
1982 | .free = event_enable_free, | ||
1983 | }; | ||
1984 | |||
1985 | static struct ftrace_probe_ops event_disable_probe_ops = { | ||
1986 | .func = event_enable_probe, | ||
1987 | .print = event_enable_print, | ||
1988 | .init = event_enable_init, | ||
1989 | .free = event_enable_free, | ||
1990 | }; | ||
1991 | |||
1992 | static struct ftrace_probe_ops event_disable_count_probe_ops = { | ||
1993 | .func = event_enable_count_probe, | ||
1994 | .print = event_enable_print, | ||
1995 | .init = event_enable_init, | ||
1996 | .free = event_enable_free, | ||
1997 | }; | ||
1998 | |||
1999 | static int | ||
2000 | event_enable_func(struct ftrace_hash *hash, | ||
2001 | char *glob, char *cmd, char *param, int enabled) | ||
2002 | { | ||
2003 | struct trace_array *tr = top_trace_array(); | ||
2004 | struct ftrace_event_file *file; | ||
2005 | struct ftrace_probe_ops *ops; | ||
2006 | struct event_probe_data *data; | ||
2007 | const char *system; | ||
2008 | const char *event; | ||
2009 | char *number; | ||
2010 | bool enable; | ||
2011 | int ret; | ||
2012 | |||
2013 | /* hash funcs only work with set_ftrace_filter */ | ||
2014 | if (!enabled) | ||
2015 | return -EINVAL; | ||
2016 | |||
2017 | if (!param) | ||
2018 | return -EINVAL; | ||
2019 | |||
2020 | system = strsep(¶m, ":"); | ||
2021 | if (!param) | ||
2022 | return -EINVAL; | ||
2023 | |||
2024 | event = strsep(¶m, ":"); | ||
2025 | |||
2026 | mutex_lock(&event_mutex); | ||
2027 | |||
2028 | ret = -EINVAL; | ||
2029 | file = find_event_file(tr, system, event); | ||
2030 | if (!file) | ||
2031 | goto out; | ||
2032 | |||
2033 | enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; | ||
2034 | |||
2035 | if (enable) | ||
2036 | ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; | ||
2037 | else | ||
2038 | ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; | ||
2039 | |||
2040 | if (glob[0] == '!') { | ||
2041 | unregister_ftrace_function_probe_func(glob+1, ops); | ||
2042 | ret = 0; | ||
2043 | goto out; | ||
2044 | } | ||
2045 | |||
2046 | ret = -ENOMEM; | ||
2047 | data = kzalloc(sizeof(*data), GFP_KERNEL); | ||
2048 | if (!data) | ||
2049 | goto out; | ||
2050 | |||
2051 | data->enable = enable; | ||
2052 | data->count = -1; | ||
2053 | data->file = file; | ||
2054 | |||
2055 | if (!param) | ||
2056 | goto out_reg; | ||
2057 | |||
2058 | number = strsep(¶m, ":"); | ||
2059 | |||
2060 | ret = -EINVAL; | ||
2061 | if (!strlen(number)) | ||
2062 | goto out_free; | ||
2063 | |||
2064 | /* | ||
2065 | * We use the callback data field (which is a pointer) | ||
2066 | * as our counter. | ||
2067 | */ | ||
2068 | ret = kstrtoul(number, 0, &data->count); | ||
2069 | if (ret) | ||
2070 | goto out_free; | ||
2071 | |||
2072 | out_reg: | ||
2073 | /* Don't let event modules unload while probe registered */ | ||
2074 | ret = try_module_get(file->event_call->mod); | ||
2075 | if (!ret) { | ||
2076 | ret = -EBUSY; | ||
2077 | goto out_free; | ||
2078 | } | ||
2079 | |||
2080 | ret = __ftrace_event_enable_disable(file, 1, 1); | ||
2081 | if (ret < 0) | ||
2082 | goto out_put; | ||
2083 | ret = register_ftrace_function_probe(glob, ops, data); | ||
2084 | /* | ||
2085 | * The above returns on success the # of functions enabled, | ||
2086 | * but if it didn't find any functions it returns zero. | ||
2087 | * Consider no functions a failure too. | ||
2088 | */ | ||
2089 | if (!ret) { | ||
2090 | ret = -ENOENT; | ||
2091 | goto out_disable; | ||
2092 | } else if (ret < 0) | ||
2093 | goto out_disable; | ||
2094 | /* Just return zero, not the number of enabled functions */ | ||
2095 | ret = 0; | ||
2096 | out: | ||
2097 | mutex_unlock(&event_mutex); | ||
2098 | return ret; | ||
2099 | |||
2100 | out_disable: | ||
2101 | __ftrace_event_enable_disable(file, 0, 1); | ||
2102 | out_put: | ||
2103 | module_put(file->event_call->mod); | ||
2104 | out_free: | ||
2105 | kfree(data); | ||
2106 | goto out; | ||
2107 | } | ||
2108 | |||
2109 | static struct ftrace_func_command event_enable_cmd = { | ||
2110 | .name = ENABLE_EVENT_STR, | ||
2111 | .func = event_enable_func, | ||
2112 | }; | ||
2113 | |||
2114 | static struct ftrace_func_command event_disable_cmd = { | ||
2115 | .name = DISABLE_EVENT_STR, | ||
2116 | .func = event_enable_func, | ||
2117 | }; | ||
2118 | |||
2119 | static __init int register_event_cmds(void) | ||
2120 | { | ||
2121 | int ret; | ||
2122 | |||
2123 | ret = register_ftrace_command(&event_enable_cmd); | ||
2124 | if (WARN_ON(ret < 0)) | ||
2125 | return ret; | ||
2126 | ret = register_ftrace_command(&event_disable_cmd); | ||
2127 | if (WARN_ON(ret < 0)) | ||
2128 | unregister_ftrace_command(&event_enable_cmd); | ||
2129 | return ret; | ||
2130 | } | ||
2131 | #else | ||
2132 | static inline int register_event_cmds(void) { return 0; } | ||
2133 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
2134 | |||
2135 | /* | ||
2136 | * The top level array has already had its ftrace_event_file | ||
2137 | * descriptors created in order to allow for early events to | ||
2138 | * be recorded. This function is called after the debugfs has been | ||
2139 | * initialized, and we now have to create the files associated | ||
2140 | * to the events. | ||
2141 | */ | ||
2142 | static __init void | ||
2143 | __trace_early_add_event_dirs(struct trace_array *tr) | ||
2144 | { | ||
2145 | struct ftrace_event_file *file; | ||
2146 | int ret; | ||
2147 | |||
2148 | |||
2149 | list_for_each_entry(file, &tr->events, list) { | ||
2150 | ret = event_create_dir(tr->event_dir, file, | ||
2151 | &ftrace_event_id_fops, | ||
2152 | &ftrace_enable_fops, | ||
2153 | &ftrace_event_filter_fops, | ||
2154 | &ftrace_event_format_fops); | ||
2155 | if (ret < 0) | ||
2156 | pr_warning("Could not create directory for event %s\n", | ||
2157 | file->event_call->name); | ||
2158 | } | ||
2159 | } | ||
2160 | |||
2161 | /* | ||
2162 | * For early boot up, the top trace array requires to have | ||
2163 | * a list of events that can be enabled. This must be done before | ||
2164 | * the filesystem is set up in order to allow events to be traced | ||
2165 | * early. | ||
2166 | */ | ||
2167 | static __init void | ||
2168 | __trace_early_add_events(struct trace_array *tr) | ||
2169 | { | ||
2170 | struct ftrace_event_call *call; | ||
2171 | int ret; | ||
2172 | |||
2173 | list_for_each_entry(call, &ftrace_events, list) { | ||
2174 | /* Early boot up should not have any modules loaded */ | ||
2175 | if (WARN_ON_ONCE(call->mod)) | ||
2176 | continue; | ||
2177 | |||
2178 | ret = __trace_early_add_new_event(call, tr); | ||
2179 | if (ret < 0) | ||
2180 | pr_warning("Could not create early event %s\n", | ||
2181 | call->name); | ||
2182 | } | ||
2183 | } | ||
2184 | |||
2185 | /* Remove the event directory structure for a trace directory. */ | ||
2186 | static void | ||
2187 | __trace_remove_event_dirs(struct trace_array *tr) | ||
2188 | { | ||
2189 | struct ftrace_event_file *file, *next; | ||
2190 | |||
2191 | list_for_each_entry_safe(file, next, &tr->events, list) { | ||
2192 | list_del(&file->list); | ||
2193 | debugfs_remove_recursive(file->dir); | ||
2194 | remove_subsystem(file->system); | ||
2195 | kmem_cache_free(file_cachep, file); | ||
2196 | } | ||
2197 | } | ||
2198 | |||
2199 | static void | ||
2200 | __add_event_to_tracers(struct ftrace_event_call *call, | ||
2201 | struct ftrace_module_file_ops *file_ops) | ||
2202 | { | ||
2203 | struct trace_array *tr; | ||
2204 | |||
2205 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
2206 | if (file_ops) | ||
2207 | __trace_add_new_mod_event(call, tr, file_ops); | ||
2208 | else | ||
2209 | __trace_add_new_event(call, tr, | ||
2210 | &ftrace_event_id_fops, | ||
2211 | &ftrace_enable_fops, | ||
2212 | &ftrace_event_filter_fops, | ||
2213 | &ftrace_event_format_fops); | ||
2214 | } | ||
2215 | } | ||
2216 | |||
1454 | static struct notifier_block trace_module_nb = { | 2217 | static struct notifier_block trace_module_nb = { |
1455 | .notifier_call = trace_module_notify, | 2218 | .notifier_call = trace_module_notify, |
1456 | .priority = 0, | 2219 | .priority = 0, |
@@ -1464,15 +2227,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; | |||
1464 | static __init int setup_trace_event(char *str) | 2227 | static __init int setup_trace_event(char *str) |
1465 | { | 2228 | { |
1466 | strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); | 2229 | strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); |
1467 | ring_buffer_expanded = 1; | 2230 | ring_buffer_expanded = true; |
1468 | tracing_selftest_disabled = 1; | 2231 | tracing_selftest_disabled = true; |
1469 | 2232 | ||
1470 | return 1; | 2233 | return 1; |
1471 | } | 2234 | } |
1472 | __setup("trace_event=", setup_trace_event); | 2235 | __setup("trace_event=", setup_trace_event); |
1473 | 2236 | ||
2237 | /* Expects to have event_mutex held when called */ | ||
2238 | static int | ||
2239 | create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) | ||
2240 | { | ||
2241 | struct dentry *d_events; | ||
2242 | struct dentry *entry; | ||
2243 | |||
2244 | entry = debugfs_create_file("set_event", 0644, parent, | ||
2245 | tr, &ftrace_set_event_fops); | ||
2246 | if (!entry) { | ||
2247 | pr_warning("Could not create debugfs 'set_event' entry\n"); | ||
2248 | return -ENOMEM; | ||
2249 | } | ||
2250 | |||
2251 | d_events = debugfs_create_dir("events", parent); | ||
2252 | if (!d_events) { | ||
2253 | pr_warning("Could not create debugfs 'events' directory\n"); | ||
2254 | return -ENOMEM; | ||
2255 | } | ||
2256 | |||
2257 | /* ring buffer internal formats */ | ||
2258 | trace_create_file("header_page", 0444, d_events, | ||
2259 | ring_buffer_print_page_header, | ||
2260 | &ftrace_show_header_fops); | ||
2261 | |||
2262 | trace_create_file("header_event", 0444, d_events, | ||
2263 | ring_buffer_print_entry_header, | ||
2264 | &ftrace_show_header_fops); | ||
2265 | |||
2266 | trace_create_file("enable", 0644, d_events, | ||
2267 | tr, &ftrace_tr_enable_fops); | ||
2268 | |||
2269 | tr->event_dir = d_events; | ||
2270 | |||
2271 | return 0; | ||
2272 | } | ||
2273 | |||
2274 | /** | ||
2275 | * event_trace_add_tracer - add a instance of a trace_array to events | ||
2276 | * @parent: The parent dentry to place the files/directories for events in | ||
2277 | * @tr: The trace array associated with these events | ||
2278 | * | ||
2279 | * When a new instance is created, it needs to set up its events | ||
2280 | * directory, as well as other files associated with events. It also | ||
2281 | * creates the event hierachry in the @parent/events directory. | ||
2282 | * | ||
2283 | * Returns 0 on success. | ||
2284 | */ | ||
2285 | int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) | ||
2286 | { | ||
2287 | int ret; | ||
2288 | |||
2289 | mutex_lock(&event_mutex); | ||
2290 | |||
2291 | ret = create_event_toplevel_files(parent, tr); | ||
2292 | if (ret) | ||
2293 | goto out_unlock; | ||
2294 | |||
2295 | down_write(&trace_event_sem); | ||
2296 | __trace_add_event_dirs(tr); | ||
2297 | up_write(&trace_event_sem); | ||
2298 | |||
2299 | out_unlock: | ||
2300 | mutex_unlock(&event_mutex); | ||
2301 | |||
2302 | return ret; | ||
2303 | } | ||
2304 | |||
2305 | /* | ||
2306 | * The top trace array already had its file descriptors created. | ||
2307 | * Now the files themselves need to be created. | ||
2308 | */ | ||
2309 | static __init int | ||
2310 | early_event_add_tracer(struct dentry *parent, struct trace_array *tr) | ||
2311 | { | ||
2312 | int ret; | ||
2313 | |||
2314 | mutex_lock(&event_mutex); | ||
2315 | |||
2316 | ret = create_event_toplevel_files(parent, tr); | ||
2317 | if (ret) | ||
2318 | goto out_unlock; | ||
2319 | |||
2320 | down_write(&trace_event_sem); | ||
2321 | __trace_early_add_event_dirs(tr); | ||
2322 | up_write(&trace_event_sem); | ||
2323 | |||
2324 | out_unlock: | ||
2325 | mutex_unlock(&event_mutex); | ||
2326 | |||
2327 | return ret; | ||
2328 | } | ||
2329 | |||
2330 | int event_trace_del_tracer(struct trace_array *tr) | ||
2331 | { | ||
2332 | /* Disable any running events */ | ||
2333 | __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); | ||
2334 | |||
2335 | mutex_lock(&event_mutex); | ||
2336 | |||
2337 | down_write(&trace_event_sem); | ||
2338 | __trace_remove_event_dirs(tr); | ||
2339 | debugfs_remove_recursive(tr->event_dir); | ||
2340 | up_write(&trace_event_sem); | ||
2341 | |||
2342 | tr->event_dir = NULL; | ||
2343 | |||
2344 | mutex_unlock(&event_mutex); | ||
2345 | |||
2346 | return 0; | ||
2347 | } | ||
2348 | |||
2349 | static __init int event_trace_memsetup(void) | ||
2350 | { | ||
2351 | field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); | ||
2352 | file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); | ||
2353 | return 0; | ||
2354 | } | ||
2355 | |||
1474 | static __init int event_trace_enable(void) | 2356 | static __init int event_trace_enable(void) |
1475 | { | 2357 | { |
2358 | struct trace_array *tr = top_trace_array(); | ||
1476 | struct ftrace_event_call **iter, *call; | 2359 | struct ftrace_event_call **iter, *call; |
1477 | char *buf = bootup_event_buf; | 2360 | char *buf = bootup_event_buf; |
1478 | char *token; | 2361 | char *token; |
@@ -1486,6 +2369,14 @@ static __init int event_trace_enable(void) | |||
1486 | list_add(&call->list, &ftrace_events); | 2369 | list_add(&call->list, &ftrace_events); |
1487 | } | 2370 | } |
1488 | 2371 | ||
2372 | /* | ||
2373 | * We need the top trace array to have a working set of trace | ||
2374 | * points at early init, before the debug files and directories | ||
2375 | * are created. Create the file entries now, and attach them | ||
2376 | * to the actual file dentries later. | ||
2377 | */ | ||
2378 | __trace_early_add_events(tr); | ||
2379 | |||
1489 | while (true) { | 2380 | while (true) { |
1490 | token = strsep(&buf, ","); | 2381 | token = strsep(&buf, ","); |
1491 | 2382 | ||
@@ -1494,73 +2385,43 @@ static __init int event_trace_enable(void) | |||
1494 | if (!*token) | 2385 | if (!*token) |
1495 | continue; | 2386 | continue; |
1496 | 2387 | ||
1497 | ret = ftrace_set_clr_event(token, 1); | 2388 | ret = ftrace_set_clr_event(tr, token, 1); |
1498 | if (ret) | 2389 | if (ret) |
1499 | pr_warn("Failed to enable trace event: %s\n", token); | 2390 | pr_warn("Failed to enable trace event: %s\n", token); |
1500 | } | 2391 | } |
1501 | 2392 | ||
1502 | trace_printk_start_comm(); | 2393 | trace_printk_start_comm(); |
1503 | 2394 | ||
2395 | register_event_cmds(); | ||
2396 | |||
1504 | return 0; | 2397 | return 0; |
1505 | } | 2398 | } |
1506 | 2399 | ||
1507 | static __init int event_trace_init(void) | 2400 | static __init int event_trace_init(void) |
1508 | { | 2401 | { |
1509 | struct ftrace_event_call *call; | 2402 | struct trace_array *tr; |
1510 | struct dentry *d_tracer; | 2403 | struct dentry *d_tracer; |
1511 | struct dentry *entry; | 2404 | struct dentry *entry; |
1512 | struct dentry *d_events; | ||
1513 | int ret; | 2405 | int ret; |
1514 | 2406 | ||
2407 | tr = top_trace_array(); | ||
2408 | |||
1515 | d_tracer = tracing_init_dentry(); | 2409 | d_tracer = tracing_init_dentry(); |
1516 | if (!d_tracer) | 2410 | if (!d_tracer) |
1517 | return 0; | 2411 | return 0; |
1518 | 2412 | ||
1519 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2413 | entry = debugfs_create_file("available_events", 0444, d_tracer, |
1520 | NULL, &ftrace_avail_fops); | 2414 | tr, &ftrace_avail_fops); |
1521 | if (!entry) | 2415 | if (!entry) |
1522 | pr_warning("Could not create debugfs " | 2416 | pr_warning("Could not create debugfs " |
1523 | "'available_events' entry\n"); | 2417 | "'available_events' entry\n"); |
1524 | 2418 | ||
1525 | entry = debugfs_create_file("set_event", 0644, d_tracer, | ||
1526 | NULL, &ftrace_set_event_fops); | ||
1527 | if (!entry) | ||
1528 | pr_warning("Could not create debugfs " | ||
1529 | "'set_event' entry\n"); | ||
1530 | |||
1531 | d_events = event_trace_events_dir(); | ||
1532 | if (!d_events) | ||
1533 | return 0; | ||
1534 | |||
1535 | /* ring buffer internal formats */ | ||
1536 | trace_create_file("header_page", 0444, d_events, | ||
1537 | ring_buffer_print_page_header, | ||
1538 | &ftrace_show_header_fops); | ||
1539 | |||
1540 | trace_create_file("header_event", 0444, d_events, | ||
1541 | ring_buffer_print_entry_header, | ||
1542 | &ftrace_show_header_fops); | ||
1543 | |||
1544 | trace_create_file("enable", 0644, d_events, | ||
1545 | NULL, &ftrace_system_enable_fops); | ||
1546 | |||
1547 | if (trace_define_common_fields()) | 2419 | if (trace_define_common_fields()) |
1548 | pr_warning("tracing: Failed to allocate common fields"); | 2420 | pr_warning("tracing: Failed to allocate common fields"); |
1549 | 2421 | ||
1550 | /* | 2422 | ret = early_event_add_tracer(d_tracer, tr); |
1551 | * Early initialization already enabled ftrace event. | 2423 | if (ret) |
1552 | * Now it's only necessary to create the event directory. | 2424 | return ret; |
1553 | */ | ||
1554 | list_for_each_entry(call, &ftrace_events, list) { | ||
1555 | |||
1556 | ret = event_create_dir(call, d_events, | ||
1557 | &ftrace_event_id_fops, | ||
1558 | &ftrace_enable_fops, | ||
1559 | &ftrace_event_filter_fops, | ||
1560 | &ftrace_event_format_fops); | ||
1561 | if (ret < 0) | ||
1562 | event_remove(call); | ||
1563 | } | ||
1564 | 2425 | ||
1565 | ret = register_module_notifier(&trace_module_nb); | 2426 | ret = register_module_notifier(&trace_module_nb); |
1566 | if (ret) | 2427 | if (ret) |
@@ -1568,6 +2429,7 @@ static __init int event_trace_init(void) | |||
1568 | 2429 | ||
1569 | return 0; | 2430 | return 0; |
1570 | } | 2431 | } |
2432 | early_initcall(event_trace_memsetup); | ||
1571 | core_initcall(event_trace_enable); | 2433 | core_initcall(event_trace_enable); |
1572 | fs_initcall(event_trace_init); | 2434 | fs_initcall(event_trace_init); |
1573 | 2435 | ||
@@ -1627,13 +2489,20 @@ static __init void event_test_stuff(void) | |||
1627 | */ | 2489 | */ |
1628 | static __init void event_trace_self_tests(void) | 2490 | static __init void event_trace_self_tests(void) |
1629 | { | 2491 | { |
2492 | struct ftrace_subsystem_dir *dir; | ||
2493 | struct ftrace_event_file *file; | ||
1630 | struct ftrace_event_call *call; | 2494 | struct ftrace_event_call *call; |
1631 | struct event_subsystem *system; | 2495 | struct event_subsystem *system; |
2496 | struct trace_array *tr; | ||
1632 | int ret; | 2497 | int ret; |
1633 | 2498 | ||
2499 | tr = top_trace_array(); | ||
2500 | |||
1634 | pr_info("Running tests on trace events:\n"); | 2501 | pr_info("Running tests on trace events:\n"); |
1635 | 2502 | ||
1636 | list_for_each_entry(call, &ftrace_events, list) { | 2503 | list_for_each_entry(file, &tr->events, list) { |
2504 | |||
2505 | call = file->event_call; | ||
1637 | 2506 | ||
1638 | /* Only test those that have a probe */ | 2507 | /* Only test those that have a probe */ |
1639 | if (!call->class || !call->class->probe) | 2508 | if (!call->class || !call->class->probe) |
@@ -1657,15 +2526,15 @@ static __init void event_trace_self_tests(void) | |||
1657 | * If an event is already enabled, someone is using | 2526 | * If an event is already enabled, someone is using |
1658 | * it and the self test should not be on. | 2527 | * it and the self test should not be on. |
1659 | */ | 2528 | */ |
1660 | if (call->flags & TRACE_EVENT_FL_ENABLED) { | 2529 | if (file->flags & FTRACE_EVENT_FL_ENABLED) { |
1661 | pr_warning("Enabled event during self test!\n"); | 2530 | pr_warning("Enabled event during self test!\n"); |
1662 | WARN_ON_ONCE(1); | 2531 | WARN_ON_ONCE(1); |
1663 | continue; | 2532 | continue; |
1664 | } | 2533 | } |
1665 | 2534 | ||
1666 | ftrace_event_enable_disable(call, 1); | 2535 | ftrace_event_enable_disable(file, 1); |
1667 | event_test_stuff(); | 2536 | event_test_stuff(); |
1668 | ftrace_event_enable_disable(call, 0); | 2537 | ftrace_event_enable_disable(file, 0); |
1669 | 2538 | ||
1670 | pr_cont("OK\n"); | 2539 | pr_cont("OK\n"); |
1671 | } | 2540 | } |
@@ -1674,7 +2543,9 @@ static __init void event_trace_self_tests(void) | |||
1674 | 2543 | ||
1675 | pr_info("Running tests on trace event systems:\n"); | 2544 | pr_info("Running tests on trace event systems:\n"); |
1676 | 2545 | ||
1677 | list_for_each_entry(system, &event_subsystems, list) { | 2546 | list_for_each_entry(dir, &tr->systems, list) { |
2547 | |||
2548 | system = dir->subsystem; | ||
1678 | 2549 | ||
1679 | /* the ftrace system is special, skip it */ | 2550 | /* the ftrace system is special, skip it */ |
1680 | if (strcmp(system->name, "ftrace") == 0) | 2551 | if (strcmp(system->name, "ftrace") == 0) |
@@ -1682,7 +2553,7 @@ static __init void event_trace_self_tests(void) | |||
1682 | 2553 | ||
1683 | pr_info("Testing event system %s: ", system->name); | 2554 | pr_info("Testing event system %s: ", system->name); |
1684 | 2555 | ||
1685 | ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); | 2556 | ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); |
1686 | if (WARN_ON_ONCE(ret)) { | 2557 | if (WARN_ON_ONCE(ret)) { |
1687 | pr_warning("error enabling system %s\n", | 2558 | pr_warning("error enabling system %s\n", |
1688 | system->name); | 2559 | system->name); |
@@ -1691,7 +2562,7 @@ static __init void event_trace_self_tests(void) | |||
1691 | 2562 | ||
1692 | event_test_stuff(); | 2563 | event_test_stuff(); |
1693 | 2564 | ||
1694 | ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); | 2565 | ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); |
1695 | if (WARN_ON_ONCE(ret)) { | 2566 | if (WARN_ON_ONCE(ret)) { |
1696 | pr_warning("error disabling system %s\n", | 2567 | pr_warning("error disabling system %s\n", |
1697 | system->name); | 2568 | system->name); |
@@ -1706,7 +2577,7 @@ static __init void event_trace_self_tests(void) | |||
1706 | pr_info("Running tests on all trace events:\n"); | 2577 | pr_info("Running tests on all trace events:\n"); |
1707 | pr_info("Testing all events: "); | 2578 | pr_info("Testing all events: "); |
1708 | 2579 | ||
1709 | ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); | 2580 | ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); |
1710 | if (WARN_ON_ONCE(ret)) { | 2581 | if (WARN_ON_ONCE(ret)) { |
1711 | pr_warning("error enabling all events\n"); | 2582 | pr_warning("error enabling all events\n"); |
1712 | return; | 2583 | return; |
@@ -1715,7 +2586,7 @@ static __init void event_trace_self_tests(void) | |||
1715 | event_test_stuff(); | 2586 | event_test_stuff(); |
1716 | 2587 | ||
1717 | /* reset sysname */ | 2588 | /* reset sysname */ |
1718 | ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); | 2589 | ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); |
1719 | if (WARN_ON_ONCE(ret)) { | 2590 | if (WARN_ON_ONCE(ret)) { |
1720 | pr_warning("error disabling all events\n"); | 2591 | pr_warning("error disabling all events\n"); |
1721 | return; | 2592 | return; |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e5b0ca8b8d4d..e1b653f7e1ca 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
658 | mutex_unlock(&event_mutex); | 658 | mutex_unlock(&event_mutex); |
659 | } | 659 | } |
660 | 660 | ||
661 | static struct ftrace_event_field * | ||
662 | __find_event_field(struct list_head *head, char *name) | ||
663 | { | ||
664 | struct ftrace_event_field *field; | ||
665 | |||
666 | list_for_each_entry(field, head, link) { | ||
667 | if (!strcmp(field->name, name)) | ||
668 | return field; | ||
669 | } | ||
670 | |||
671 | return NULL; | ||
672 | } | ||
673 | |||
674 | static struct ftrace_event_field * | ||
675 | find_event_field(struct ftrace_event_call *call, char *name) | ||
676 | { | ||
677 | struct ftrace_event_field *field; | ||
678 | struct list_head *head; | ||
679 | |||
680 | field = __find_event_field(&ftrace_common_fields, name); | ||
681 | if (field) | ||
682 | return field; | ||
683 | |||
684 | head = trace_get_fields(call); | ||
685 | return __find_event_field(head, name); | ||
686 | } | ||
687 | |||
688 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) | 661 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
689 | { | 662 | { |
690 | stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); | 663 | stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); |
@@ -777,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter, | |||
777 | 750 | ||
778 | static void __free_preds(struct event_filter *filter) | 751 | static void __free_preds(struct event_filter *filter) |
779 | { | 752 | { |
753 | int i; | ||
754 | |||
780 | if (filter->preds) { | 755 | if (filter->preds) { |
756 | for (i = 0; i < filter->n_preds; i++) | ||
757 | kfree(filter->preds[i].ops); | ||
781 | kfree(filter->preds); | 758 | kfree(filter->preds); |
782 | filter->preds = NULL; | 759 | filter->preds = NULL; |
783 | } | 760 | } |
@@ -1337,7 +1314,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, | |||
1337 | return NULL; | 1314 | return NULL; |
1338 | } | 1315 | } |
1339 | 1316 | ||
1340 | field = find_event_field(call, operand1); | 1317 | field = trace_find_event_field(call, operand1); |
1341 | if (!field) { | 1318 | if (!field) { |
1342 | parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); | 1319 | parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); |
1343 | return NULL; | 1320 | return NULL; |
@@ -1907,16 +1884,17 @@ out_unlock: | |||
1907 | return err; | 1884 | return err; |
1908 | } | 1885 | } |
1909 | 1886 | ||
1910 | int apply_subsystem_event_filter(struct event_subsystem *system, | 1887 | int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, |
1911 | char *filter_string) | 1888 | char *filter_string) |
1912 | { | 1889 | { |
1890 | struct event_subsystem *system = dir->subsystem; | ||
1913 | struct event_filter *filter; | 1891 | struct event_filter *filter; |
1914 | int err = 0; | 1892 | int err = 0; |
1915 | 1893 | ||
1916 | mutex_lock(&event_mutex); | 1894 | mutex_lock(&event_mutex); |
1917 | 1895 | ||
1918 | /* Make sure the system still has events */ | 1896 | /* Make sure the system still has events */ |
1919 | if (!system->nr_events) { | 1897 | if (!dir->nr_events) { |
1920 | err = -ENODEV; | 1898 | err = -ENODEV; |
1921 | goto out_unlock; | 1899 | goto out_unlock; |
1922 | } | 1900 | } |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e039906b037d..d21a74670088 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
129 | 129 | ||
130 | #undef FTRACE_ENTRY | 130 | #undef FTRACE_ENTRY |
131 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ | 131 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ |
132 | int \ | 132 | static int __init \ |
133 | ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | 133 | ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ |
134 | { \ | 134 | { \ |
135 | struct struct_name field; \ | 135 | struct struct_name field; \ |
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | |||
168 | #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ | 168 | #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ |
169 | regfn) \ | 169 | regfn) \ |
170 | \ | 170 | \ |
171 | struct ftrace_event_class event_class_ftrace_##call = { \ | 171 | struct ftrace_event_class __refdata event_class_ftrace_##call = { \ |
172 | .system = __stringify(TRACE_SYSTEM), \ | 172 | .system = __stringify(TRACE_SYSTEM), \ |
173 | .define_fields = ftrace_define_fields_##call, \ | 173 | .define_fields = ftrace_define_fields_##call, \ |
174 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ | 174 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 601152523326..c4d6d7191988 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void); | |||
28 | static int function_trace_init(struct trace_array *tr) | 28 | static int function_trace_init(struct trace_array *tr) |
29 | { | 29 | { |
30 | func_trace = tr; | 30 | func_trace = tr; |
31 | tr->cpu = get_cpu(); | 31 | tr->trace_buffer.cpu = get_cpu(); |
32 | put_cpu(); | 32 | put_cpu(); |
33 | 33 | ||
34 | tracing_start_cmdline_record(); | 34 | tracing_start_cmdline_record(); |
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr) | |||
44 | 44 | ||
45 | static void function_trace_start(struct trace_array *tr) | 45 | static void function_trace_start(struct trace_array *tr) |
46 | { | 46 | { |
47 | tracing_reset_online_cpus(tr); | 47 | tracing_reset_online_cpus(&tr->trace_buffer); |
48 | } | 48 | } |
49 | 49 | ||
50 | /* Our option */ | 50 | /* Our option */ |
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, | |||
76 | goto out; | 76 | goto out; |
77 | 77 | ||
78 | cpu = smp_processor_id(); | 78 | cpu = smp_processor_id(); |
79 | data = tr->data[cpu]; | 79 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
80 | if (!atomic_read(&data->disabled)) { | 80 | if (!atomic_read(&data->disabled)) { |
81 | local_save_flags(flags); | 81 | local_save_flags(flags); |
82 | trace_function(tr, ip, parent_ip, flags, pc); | 82 | trace_function(tr, ip, parent_ip, flags, pc); |
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
107 | */ | 107 | */ |
108 | local_irq_save(flags); | 108 | local_irq_save(flags); |
109 | cpu = raw_smp_processor_id(); | 109 | cpu = raw_smp_processor_id(); |
110 | data = tr->data[cpu]; | 110 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
111 | disabled = atomic_inc_return(&data->disabled); | 111 | disabled = atomic_inc_return(&data->disabled); |
112 | 112 | ||
113 | if (likely(disabled == 1)) { | 113 | if (likely(disabled == 1)) { |
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly = | |||
214 | }; | 214 | }; |
215 | 215 | ||
216 | #ifdef CONFIG_DYNAMIC_FTRACE | 216 | #ifdef CONFIG_DYNAMIC_FTRACE |
217 | static void | 217 | static int update_count(void **data) |
218 | ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) | ||
219 | { | 218 | { |
220 | long *count = (long *)data; | 219 | unsigned long *count = (long *)data; |
221 | |||
222 | if (tracing_is_on()) | ||
223 | return; | ||
224 | 220 | ||
225 | if (!*count) | 221 | if (!*count) |
226 | return; | 222 | return 0; |
227 | 223 | ||
228 | if (*count != -1) | 224 | if (*count != -1) |
229 | (*count)--; | 225 | (*count)--; |
230 | 226 | ||
231 | tracing_on(); | 227 | return 1; |
232 | } | 228 | } |
233 | 229 | ||
234 | static void | 230 | static void |
235 | ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) | 231 | ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) |
236 | { | 232 | { |
237 | long *count = (long *)data; | 233 | if (tracing_is_on()) |
234 | return; | ||
235 | |||
236 | if (update_count(data)) | ||
237 | tracing_on(); | ||
238 | } | ||
238 | 239 | ||
240 | static void | ||
241 | ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) | ||
242 | { | ||
239 | if (!tracing_is_on()) | 243 | if (!tracing_is_on()) |
240 | return; | 244 | return; |
241 | 245 | ||
242 | if (!*count) | 246 | if (update_count(data)) |
247 | tracing_off(); | ||
248 | } | ||
249 | |||
250 | static void | ||
251 | ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) | ||
252 | { | ||
253 | if (tracing_is_on()) | ||
243 | return; | 254 | return; |
244 | 255 | ||
245 | if (*count != -1) | 256 | tracing_on(); |
246 | (*count)--; | 257 | } |
258 | |||
259 | static void | ||
260 | ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) | ||
261 | { | ||
262 | if (!tracing_is_on()) | ||
263 | return; | ||
247 | 264 | ||
248 | tracing_off(); | 265 | tracing_off(); |
249 | } | 266 | } |
250 | 267 | ||
251 | static int | 268 | /* |
252 | ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, | 269 | * Skip 4: |
253 | struct ftrace_probe_ops *ops, void *data); | 270 | * ftrace_stacktrace() |
271 | * function_trace_probe_call() | ||
272 | * ftrace_ops_list_func() | ||
273 | * ftrace_call() | ||
274 | */ | ||
275 | #define STACK_SKIP 4 | ||
254 | 276 | ||
255 | static struct ftrace_probe_ops traceon_probe_ops = { | 277 | static void |
256 | .func = ftrace_traceon, | 278 | ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) |
257 | .print = ftrace_trace_onoff_print, | 279 | { |
258 | }; | 280 | trace_dump_stack(STACK_SKIP); |
281 | } | ||
259 | 282 | ||
260 | static struct ftrace_probe_ops traceoff_probe_ops = { | 283 | static void |
261 | .func = ftrace_traceoff, | 284 | ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) |
262 | .print = ftrace_trace_onoff_print, | 285 | { |
263 | }; | 286 | if (!tracing_is_on()) |
287 | return; | ||
288 | |||
289 | if (update_count(data)) | ||
290 | trace_dump_stack(STACK_SKIP); | ||
291 | } | ||
264 | 292 | ||
265 | static int | 293 | static int |
266 | ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, | 294 | ftrace_probe_print(const char *name, struct seq_file *m, |
267 | struct ftrace_probe_ops *ops, void *data) | 295 | unsigned long ip, void *data) |
268 | { | 296 | { |
269 | long count = (long)data; | 297 | long count = (long)data; |
270 | 298 | ||
271 | seq_printf(m, "%ps:", (void *)ip); | 299 | seq_printf(m, "%ps:%s", (void *)ip, name); |
272 | |||
273 | if (ops == &traceon_probe_ops) | ||
274 | seq_printf(m, "traceon"); | ||
275 | else | ||
276 | seq_printf(m, "traceoff"); | ||
277 | 300 | ||
278 | if (count == -1) | 301 | if (count == -1) |
279 | seq_printf(m, ":unlimited\n"); | 302 | seq_printf(m, ":unlimited\n"); |
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, | |||
284 | } | 307 | } |
285 | 308 | ||
286 | static int | 309 | static int |
287 | ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) | 310 | ftrace_traceon_print(struct seq_file *m, unsigned long ip, |
311 | struct ftrace_probe_ops *ops, void *data) | ||
288 | { | 312 | { |
289 | struct ftrace_probe_ops *ops; | 313 | return ftrace_probe_print("traceon", m, ip, data); |
290 | 314 | } | |
291 | /* we register both traceon and traceoff to this callback */ | ||
292 | if (strcmp(cmd, "traceon") == 0) | ||
293 | ops = &traceon_probe_ops; | ||
294 | else | ||
295 | ops = &traceoff_probe_ops; | ||
296 | 315 | ||
297 | unregister_ftrace_function_probe_func(glob, ops); | 316 | static int |
317 | ftrace_traceoff_print(struct seq_file *m, unsigned long ip, | ||
318 | struct ftrace_probe_ops *ops, void *data) | ||
319 | { | ||
320 | return ftrace_probe_print("traceoff", m, ip, data); | ||
321 | } | ||
298 | 322 | ||
299 | return 0; | 323 | static int |
324 | ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, | ||
325 | struct ftrace_probe_ops *ops, void *data) | ||
326 | { | ||
327 | return ftrace_probe_print("stacktrace", m, ip, data); | ||
300 | } | 328 | } |
301 | 329 | ||
330 | static struct ftrace_probe_ops traceon_count_probe_ops = { | ||
331 | .func = ftrace_traceon_count, | ||
332 | .print = ftrace_traceon_print, | ||
333 | }; | ||
334 | |||
335 | static struct ftrace_probe_ops traceoff_count_probe_ops = { | ||
336 | .func = ftrace_traceoff_count, | ||
337 | .print = ftrace_traceoff_print, | ||
338 | }; | ||
339 | |||
340 | static struct ftrace_probe_ops stacktrace_count_probe_ops = { | ||
341 | .func = ftrace_stacktrace_count, | ||
342 | .print = ftrace_stacktrace_print, | ||
343 | }; | ||
344 | |||
345 | static struct ftrace_probe_ops traceon_probe_ops = { | ||
346 | .func = ftrace_traceon, | ||
347 | .print = ftrace_traceon_print, | ||
348 | }; | ||
349 | |||
350 | static struct ftrace_probe_ops traceoff_probe_ops = { | ||
351 | .func = ftrace_traceoff, | ||
352 | .print = ftrace_traceoff_print, | ||
353 | }; | ||
354 | |||
355 | static struct ftrace_probe_ops stacktrace_probe_ops = { | ||
356 | .func = ftrace_stacktrace, | ||
357 | .print = ftrace_stacktrace_print, | ||
358 | }; | ||
359 | |||
302 | static int | 360 | static int |
303 | ftrace_trace_onoff_callback(struct ftrace_hash *hash, | 361 | ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, |
304 | char *glob, char *cmd, char *param, int enable) | 362 | struct ftrace_hash *hash, char *glob, |
363 | char *cmd, char *param, int enable) | ||
305 | { | 364 | { |
306 | struct ftrace_probe_ops *ops; | ||
307 | void *count = (void *)-1; | 365 | void *count = (void *)-1; |
308 | char *number; | 366 | char *number; |
309 | int ret; | 367 | int ret; |
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, | |||
312 | if (!enable) | 370 | if (!enable) |
313 | return -EINVAL; | 371 | return -EINVAL; |
314 | 372 | ||
315 | if (glob[0] == '!') | 373 | if (glob[0] == '!') { |
316 | return ftrace_trace_onoff_unreg(glob+1, cmd, param); | 374 | unregister_ftrace_function_probe_func(glob+1, ops); |
317 | 375 | return 0; | |
318 | /* we register both traceon and traceoff to this callback */ | 376 | } |
319 | if (strcmp(cmd, "traceon") == 0) | ||
320 | ops = &traceon_probe_ops; | ||
321 | else | ||
322 | ops = &traceoff_probe_ops; | ||
323 | 377 | ||
324 | if (!param) | 378 | if (!param) |
325 | goto out_reg; | 379 | goto out_reg; |
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, | |||
343 | return ret < 0 ? ret : 0; | 397 | return ret < 0 ? ret : 0; |
344 | } | 398 | } |
345 | 399 | ||
400 | static int | ||
401 | ftrace_trace_onoff_callback(struct ftrace_hash *hash, | ||
402 | char *glob, char *cmd, char *param, int enable) | ||
403 | { | ||
404 | struct ftrace_probe_ops *ops; | ||
405 | |||
406 | /* we register both traceon and traceoff to this callback */ | ||
407 | if (strcmp(cmd, "traceon") == 0) | ||
408 | ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; | ||
409 | else | ||
410 | ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; | ||
411 | |||
412 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | ||
413 | param, enable); | ||
414 | } | ||
415 | |||
416 | static int | ||
417 | ftrace_stacktrace_callback(struct ftrace_hash *hash, | ||
418 | char *glob, char *cmd, char *param, int enable) | ||
419 | { | ||
420 | struct ftrace_probe_ops *ops; | ||
421 | |||
422 | ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; | ||
423 | |||
424 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | ||
425 | param, enable); | ||
426 | } | ||
427 | |||
346 | static struct ftrace_func_command ftrace_traceon_cmd = { | 428 | static struct ftrace_func_command ftrace_traceon_cmd = { |
347 | .name = "traceon", | 429 | .name = "traceon", |
348 | .func = ftrace_trace_onoff_callback, | 430 | .func = ftrace_trace_onoff_callback, |
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = { | |||
353 | .func = ftrace_trace_onoff_callback, | 435 | .func = ftrace_trace_onoff_callback, |
354 | }; | 436 | }; |
355 | 437 | ||
438 | static struct ftrace_func_command ftrace_stacktrace_cmd = { | ||
439 | .name = "stacktrace", | ||
440 | .func = ftrace_stacktrace_callback, | ||
441 | }; | ||
442 | |||
356 | static int __init init_func_cmd_traceon(void) | 443 | static int __init init_func_cmd_traceon(void) |
357 | { | 444 | { |
358 | int ret; | 445 | int ret; |
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void) | |||
364 | ret = register_ftrace_command(&ftrace_traceon_cmd); | 451 | ret = register_ftrace_command(&ftrace_traceon_cmd); |
365 | if (ret) | 452 | if (ret) |
366 | unregister_ftrace_command(&ftrace_traceoff_cmd); | 453 | unregister_ftrace_command(&ftrace_traceoff_cmd); |
454 | |||
455 | ret = register_ftrace_command(&ftrace_stacktrace_cmd); | ||
456 | if (ret) { | ||
457 | unregister_ftrace_command(&ftrace_traceoff_cmd); | ||
458 | unregister_ftrace_command(&ftrace_traceon_cmd); | ||
459 | } | ||
367 | return ret; | 460 | return ret; |
368 | } | 461 | } |
369 | #else | 462 | #else |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 39ada66389cc..8388bc99f2ee 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr, | |||
218 | { | 218 | { |
219 | struct ftrace_event_call *call = &event_funcgraph_entry; | 219 | struct ftrace_event_call *call = &event_funcgraph_entry; |
220 | struct ring_buffer_event *event; | 220 | struct ring_buffer_event *event; |
221 | struct ring_buffer *buffer = tr->buffer; | 221 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
222 | struct ftrace_graph_ent_entry *entry; | 222 | struct ftrace_graph_ent_entry *entry; |
223 | 223 | ||
224 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) | 224 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
265 | 265 | ||
266 | local_irq_save(flags); | 266 | local_irq_save(flags); |
267 | cpu = raw_smp_processor_id(); | 267 | cpu = raw_smp_processor_id(); |
268 | data = tr->data[cpu]; | 268 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
269 | disabled = atomic_inc_return(&data->disabled); | 269 | disabled = atomic_inc_return(&data->disabled); |
270 | if (likely(disabled == 1)) { | 270 | if (likely(disabled == 1)) { |
271 | pc = preempt_count(); | 271 | pc = preempt_count(); |
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr, | |||
323 | { | 323 | { |
324 | struct ftrace_event_call *call = &event_funcgraph_exit; | 324 | struct ftrace_event_call *call = &event_funcgraph_exit; |
325 | struct ring_buffer_event *event; | 325 | struct ring_buffer_event *event; |
326 | struct ring_buffer *buffer = tr->buffer; | 326 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
327 | struct ftrace_graph_ret_entry *entry; | 327 | struct ftrace_graph_ret_entry *entry; |
328 | 328 | ||
329 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) | 329 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) | |||
350 | 350 | ||
351 | local_irq_save(flags); | 351 | local_irq_save(flags); |
352 | cpu = raw_smp_processor_id(); | 352 | cpu = raw_smp_processor_id(); |
353 | data = tr->data[cpu]; | 353 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
354 | disabled = atomic_inc_return(&data->disabled); | 354 | disabled = atomic_inc_return(&data->disabled); |
355 | if (likely(disabled == 1)) { | 355 | if (likely(disabled == 1)) { |
356 | pc = preempt_count(); | 356 | pc = preempt_count(); |
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
560 | * We need to consume the current entry to see | 560 | * We need to consume the current entry to see |
561 | * the next one. | 561 | * the next one. |
562 | */ | 562 | */ |
563 | ring_buffer_consume(iter->tr->buffer, iter->cpu, | 563 | ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, |
564 | NULL, NULL); | 564 | NULL, NULL); |
565 | event = ring_buffer_peek(iter->tr->buffer, iter->cpu, | 565 | event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, |
566 | NULL, NULL); | 566 | NULL, NULL); |
567 | } | 567 | } |
568 | 568 | ||
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac4881..b19d065a28cb 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -32,7 +32,8 @@ enum { | |||
32 | 32 | ||
33 | static int trace_type __read_mostly; | 33 | static int trace_type __read_mostly; |
34 | 34 | ||
35 | static int save_lat_flag; | 35 | static int save_flags; |
36 | static bool function_enabled; | ||
36 | 37 | ||
37 | static void stop_irqsoff_tracer(struct trace_array *tr, int graph); | 38 | static void stop_irqsoff_tracer(struct trace_array *tr, int graph); |
38 | static int start_irqsoff_tracer(struct trace_array *tr, int graph); | 39 | static int start_irqsoff_tracer(struct trace_array *tr, int graph); |
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr, | |||
121 | if (!irqs_disabled_flags(*flags)) | 122 | if (!irqs_disabled_flags(*flags)) |
122 | return 0; | 123 | return 0; |
123 | 124 | ||
124 | *data = tr->data[cpu]; | 125 | *data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
125 | disabled = atomic_inc_return(&(*data)->disabled); | 126 | disabled = atomic_inc_return(&(*data)->disabled); |
126 | 127 | ||
127 | if (likely(disabled == 1)) | 128 | if (likely(disabled == 1)) |
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) | |||
175 | per_cpu(tracing_cpu, cpu) = 0; | 176 | per_cpu(tracing_cpu, cpu) = 0; |
176 | 177 | ||
177 | tracing_max_latency = 0; | 178 | tracing_max_latency = 0; |
178 | tracing_reset_online_cpus(irqsoff_trace); | 179 | tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); |
179 | 180 | ||
180 | return start_irqsoff_tracer(irqsoff_trace, set); | 181 | return start_irqsoff_tracer(irqsoff_trace, set); |
181 | } | 182 | } |
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) | |||
380 | if (per_cpu(tracing_cpu, cpu)) | 381 | if (per_cpu(tracing_cpu, cpu)) |
381 | return; | 382 | return; |
382 | 383 | ||
383 | data = tr->data[cpu]; | 384 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
384 | 385 | ||
385 | if (unlikely(!data) || atomic_read(&data->disabled)) | 386 | if (unlikely(!data) || atomic_read(&data->disabled)) |
386 | return; | 387 | return; |
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) | |||
418 | if (!tracer_enabled) | 419 | if (!tracer_enabled) |
419 | return; | 420 | return; |
420 | 421 | ||
421 | data = tr->data[cpu]; | 422 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
422 | 423 | ||
423 | if (unlikely(!data) || | 424 | if (unlikely(!data) || |
424 | !data->critical_start || atomic_read(&data->disabled)) | 425 | !data->critical_start || atomic_read(&data->disabled)) |
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) | |||
528 | } | 529 | } |
529 | #endif /* CONFIG_PREEMPT_TRACER */ | 530 | #endif /* CONFIG_PREEMPT_TRACER */ |
530 | 531 | ||
531 | static int start_irqsoff_tracer(struct trace_array *tr, int graph) | 532 | static int register_irqsoff_function(int graph, int set) |
532 | { | 533 | { |
533 | int ret = 0; | 534 | int ret; |
534 | 535 | ||
535 | if (!graph) | 536 | /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ |
536 | ret = register_ftrace_function(&trace_ops); | 537 | if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) |
537 | else | 538 | return 0; |
539 | |||
540 | if (graph) | ||
538 | ret = register_ftrace_graph(&irqsoff_graph_return, | 541 | ret = register_ftrace_graph(&irqsoff_graph_return, |
539 | &irqsoff_graph_entry); | 542 | &irqsoff_graph_entry); |
543 | else | ||
544 | ret = register_ftrace_function(&trace_ops); | ||
545 | |||
546 | if (!ret) | ||
547 | function_enabled = true; | ||
548 | |||
549 | return ret; | ||
550 | } | ||
551 | |||
552 | static void unregister_irqsoff_function(int graph) | ||
553 | { | ||
554 | if (!function_enabled) | ||
555 | return; | ||
556 | |||
557 | if (graph) | ||
558 | unregister_ftrace_graph(); | ||
559 | else | ||
560 | unregister_ftrace_function(&trace_ops); | ||
561 | |||
562 | function_enabled = false; | ||
563 | } | ||
564 | |||
565 | static void irqsoff_function_set(int set) | ||
566 | { | ||
567 | if (set) | ||
568 | register_irqsoff_function(is_graph(), 1); | ||
569 | else | ||
570 | unregister_irqsoff_function(is_graph()); | ||
571 | } | ||
572 | |||
573 | static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) | ||
574 | { | ||
575 | if (mask & TRACE_ITER_FUNCTION) | ||
576 | irqsoff_function_set(set); | ||
577 | |||
578 | return trace_keep_overwrite(tracer, mask, set); | ||
579 | } | ||
580 | |||
581 | static int start_irqsoff_tracer(struct trace_array *tr, int graph) | ||
582 | { | ||
583 | int ret; | ||
584 | |||
585 | ret = register_irqsoff_function(graph, 0); | ||
540 | 586 | ||
541 | if (!ret && tracing_is_enabled()) | 587 | if (!ret && tracing_is_enabled()) |
542 | tracer_enabled = 1; | 588 | tracer_enabled = 1; |
@@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) | |||
550 | { | 596 | { |
551 | tracer_enabled = 0; | 597 | tracer_enabled = 0; |
552 | 598 | ||
553 | if (!graph) | 599 | unregister_irqsoff_function(graph); |
554 | unregister_ftrace_function(&trace_ops); | ||
555 | else | ||
556 | unregister_ftrace_graph(); | ||
557 | } | 600 | } |
558 | 601 | ||
559 | static void __irqsoff_tracer_init(struct trace_array *tr) | 602 | static void __irqsoff_tracer_init(struct trace_array *tr) |
560 | { | 603 | { |
561 | save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; | 604 | save_flags = trace_flags; |
562 | trace_flags |= TRACE_ITER_LATENCY_FMT; | 605 | |
606 | /* non overwrite screws up the latency tracers */ | ||
607 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); | ||
608 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); | ||
563 | 609 | ||
564 | tracing_max_latency = 0; | 610 | tracing_max_latency = 0; |
565 | irqsoff_trace = tr; | 611 | irqsoff_trace = tr; |
566 | /* make sure that the tracer is visible */ | 612 | /* make sure that the tracer is visible */ |
567 | smp_wmb(); | 613 | smp_wmb(); |
568 | tracing_reset_online_cpus(tr); | 614 | tracing_reset_online_cpus(&tr->trace_buffer); |
569 | 615 | ||
570 | if (start_irqsoff_tracer(tr, is_graph())) | 616 | if (start_irqsoff_tracer(tr, is_graph())) |
571 | printk(KERN_ERR "failed to start irqsoff tracer\n"); | 617 | printk(KERN_ERR "failed to start irqsoff tracer\n"); |
@@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr) | |||
573 | 619 | ||
574 | static void irqsoff_tracer_reset(struct trace_array *tr) | 620 | static void irqsoff_tracer_reset(struct trace_array *tr) |
575 | { | 621 | { |
622 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; | ||
623 | int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; | ||
624 | |||
576 | stop_irqsoff_tracer(tr, is_graph()); | 625 | stop_irqsoff_tracer(tr, is_graph()); |
577 | 626 | ||
578 | if (!save_lat_flag) | 627 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); |
579 | trace_flags &= ~TRACE_ITER_LATENCY_FMT; | 628 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); |
580 | } | 629 | } |
581 | 630 | ||
582 | static void irqsoff_tracer_start(struct trace_array *tr) | 631 | static void irqsoff_tracer_start(struct trace_array *tr) |
@@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
609 | .print_line = irqsoff_print_line, | 658 | .print_line = irqsoff_print_line, |
610 | .flags = &tracer_flags, | 659 | .flags = &tracer_flags, |
611 | .set_flag = irqsoff_set_flag, | 660 | .set_flag = irqsoff_set_flag, |
661 | .flag_changed = irqsoff_flag_changed, | ||
612 | #ifdef CONFIG_FTRACE_SELFTEST | 662 | #ifdef CONFIG_FTRACE_SELFTEST |
613 | .selftest = trace_selftest_startup_irqsoff, | 663 | .selftest = trace_selftest_startup_irqsoff, |
614 | #endif | 664 | #endif |
@@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
642 | .print_line = irqsoff_print_line, | 692 | .print_line = irqsoff_print_line, |
643 | .flags = &tracer_flags, | 693 | .flags = &tracer_flags, |
644 | .set_flag = irqsoff_set_flag, | 694 | .set_flag = irqsoff_set_flag, |
695 | .flag_changed = irqsoff_flag_changed, | ||
645 | #ifdef CONFIG_FTRACE_SELFTEST | 696 | #ifdef CONFIG_FTRACE_SELFTEST |
646 | .selftest = trace_selftest_startup_preemptoff, | 697 | .selftest = trace_selftest_startup_preemptoff, |
647 | #endif | 698 | #endif |
@@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
677 | .print_line = irqsoff_print_line, | 728 | .print_line = irqsoff_print_line, |
678 | .flags = &tracer_flags, | 729 | .flags = &tracer_flags, |
679 | .set_flag = irqsoff_set_flag, | 730 | .set_flag = irqsoff_set_flag, |
731 | .flag_changed = irqsoff_flag_changed, | ||
680 | #ifdef CONFIG_FTRACE_SELFTEST | 732 | #ifdef CONFIG_FTRACE_SELFTEST |
681 | .selftest = trace_selftest_startup_preemptirqsoff, | 733 | .selftest = trace_selftest_startup_preemptirqsoff, |
682 | #endif | 734 | #endif |
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b3..bd90e1b06088 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c | |||
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) | |||
26 | trace_init_global_iter(&iter); | 26 | trace_init_global_iter(&iter); |
27 | 27 | ||
28 | for_each_tracing_cpu(cpu) { | 28 | for_each_tracing_cpu(cpu) { |
29 | atomic_inc(&iter.tr->data[cpu]->disabled); | 29 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
30 | } | 30 | } |
31 | 31 | ||
32 | old_userobj = trace_flags; | 32 | old_userobj = trace_flags; |
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) | |||
43 | iter.iter_flags |= TRACE_FILE_LAT_FMT; | 43 | iter.iter_flags |= TRACE_FILE_LAT_FMT; |
44 | iter.pos = -1; | 44 | iter.pos = -1; |
45 | 45 | ||
46 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | 46 | if (cpu_file == RING_BUFFER_ALL_CPUS) { |
47 | for_each_tracing_cpu(cpu) { | 47 | for_each_tracing_cpu(cpu) { |
48 | iter.buffer_iter[cpu] = | 48 | iter.buffer_iter[cpu] = |
49 | ring_buffer_read_prepare(iter.tr->buffer, cpu); | 49 | ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); |
50 | ring_buffer_read_start(iter.buffer_iter[cpu]); | 50 | ring_buffer_read_start(iter.buffer_iter[cpu]); |
51 | tracing_iter_reset(&iter, cpu); | 51 | tracing_iter_reset(&iter, cpu); |
52 | } | 52 | } |
53 | } else { | 53 | } else { |
54 | iter.cpu_file = cpu_file; | 54 | iter.cpu_file = cpu_file; |
55 | iter.buffer_iter[cpu_file] = | 55 | iter.buffer_iter[cpu_file] = |
56 | ring_buffer_read_prepare(iter.tr->buffer, cpu_file); | 56 | ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); |
57 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); | 57 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); |
58 | tracing_iter_reset(&iter, cpu_file); | 58 | tracing_iter_reset(&iter, cpu_file); |
59 | } | 59 | } |
@@ -83,7 +83,7 @@ out: | |||
83 | trace_flags = old_userobj; | 83 | trace_flags = old_userobj; |
84 | 84 | ||
85 | for_each_tracing_cpu(cpu) { | 85 | for_each_tracing_cpu(cpu) { |
86 | atomic_dec(&iter.tr->data[cpu]->disabled); | 86 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
87 | } | 87 | } |
88 | 88 | ||
89 | for_each_tracing_cpu(cpu) | 89 | for_each_tracing_cpu(cpu) |
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv) | |||
115 | !cpu_online(cpu_file)) | 115 | !cpu_online(cpu_file)) |
116 | return KDB_BADINT; | 116 | return KDB_BADINT; |
117 | } else { | 117 | } else { |
118 | cpu_file = TRACE_PIPE_ALL_CPU; | 118 | cpu_file = RING_BUFFER_ALL_CPUS; |
119 | } | 119 | } |
120 | 120 | ||
121 | kdb_trap_printk++; | 121 | kdb_trap_printk++; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1865d5f76538..9f46e98ba8f2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -27,7 +27,6 @@ | |||
27 | /** | 27 | /** |
28 | * Kprobe event core functions | 28 | * Kprobe event core functions |
29 | */ | 29 | */ |
30 | |||
31 | struct trace_probe { | 30 | struct trace_probe { |
32 | struct list_head list; | 31 | struct list_head list; |
33 | struct kretprobe rp; /* Use rp.kp for kprobe use */ | 32 | struct kretprobe rp; /* Use rp.kp for kprobe use */ |
@@ -36,6 +35,7 @@ struct trace_probe { | |||
36 | const char *symbol; /* symbol name */ | 35 | const char *symbol; /* symbol name */ |
37 | struct ftrace_event_class class; | 36 | struct ftrace_event_class class; |
38 | struct ftrace_event_call call; | 37 | struct ftrace_event_call call; |
38 | struct ftrace_event_file * __rcu *files; | ||
39 | ssize_t size; /* trace entry size */ | 39 | ssize_t size; /* trace entry size */ |
40 | unsigned int nr_args; | 40 | unsigned int nr_args; |
41 | struct probe_arg args[]; | 41 | struct probe_arg args[]; |
@@ -46,7 +46,7 @@ struct trace_probe { | |||
46 | (sizeof(struct probe_arg) * (n))) | 46 | (sizeof(struct probe_arg) * (n))) |
47 | 47 | ||
48 | 48 | ||
49 | static __kprobes int trace_probe_is_return(struct trace_probe *tp) | 49 | static __kprobes bool trace_probe_is_return(struct trace_probe *tp) |
50 | { | 50 | { |
51 | return tp->rp.handler != NULL; | 51 | return tp->rp.handler != NULL; |
52 | } | 52 | } |
@@ -183,12 +183,63 @@ static struct trace_probe *find_trace_probe(const char *event, | |||
183 | return NULL; | 183 | return NULL; |
184 | } | 184 | } |
185 | 185 | ||
186 | /* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ | 186 | static int trace_probe_nr_files(struct trace_probe *tp) |
187 | static int enable_trace_probe(struct trace_probe *tp, int flag) | 187 | { |
188 | struct ftrace_event_file **file; | ||
189 | int ret = 0; | ||
190 | |||
191 | /* | ||
192 | * Since all tp->files updater is protected by probe_enable_lock, | ||
193 | * we don't need to lock an rcu_read_lock. | ||
194 | */ | ||
195 | file = rcu_dereference_raw(tp->files); | ||
196 | if (file) | ||
197 | while (*(file++)) | ||
198 | ret++; | ||
199 | |||
200 | return ret; | ||
201 | } | ||
202 | |||
203 | static DEFINE_MUTEX(probe_enable_lock); | ||
204 | |||
205 | /* | ||
206 | * Enable trace_probe | ||
207 | * if the file is NULL, enable "perf" handler, or enable "trace" handler. | ||
208 | */ | ||
209 | static int | ||
210 | enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | ||
188 | { | 211 | { |
189 | int ret = 0; | 212 | int ret = 0; |
190 | 213 | ||
191 | tp->flags |= flag; | 214 | mutex_lock(&probe_enable_lock); |
215 | |||
216 | if (file) { | ||
217 | struct ftrace_event_file **new, **old; | ||
218 | int n = trace_probe_nr_files(tp); | ||
219 | |||
220 | old = rcu_dereference_raw(tp->files); | ||
221 | /* 1 is for new one and 1 is for stopper */ | ||
222 | new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), | ||
223 | GFP_KERNEL); | ||
224 | if (!new) { | ||
225 | ret = -ENOMEM; | ||
226 | goto out_unlock; | ||
227 | } | ||
228 | memcpy(new, old, n * sizeof(struct ftrace_event_file *)); | ||
229 | new[n] = file; | ||
230 | /* The last one keeps a NULL */ | ||
231 | |||
232 | rcu_assign_pointer(tp->files, new); | ||
233 | tp->flags |= TP_FLAG_TRACE; | ||
234 | |||
235 | if (old) { | ||
236 | /* Make sure the probe is done with old files */ | ||
237 | synchronize_sched(); | ||
238 | kfree(old); | ||
239 | } | ||
240 | } else | ||
241 | tp->flags |= TP_FLAG_PROFILE; | ||
242 | |||
192 | if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && | 243 | if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && |
193 | !trace_probe_has_gone(tp)) { | 244 | !trace_probe_has_gone(tp)) { |
194 | if (trace_probe_is_return(tp)) | 245 | if (trace_probe_is_return(tp)) |
@@ -197,19 +248,90 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) | |||
197 | ret = enable_kprobe(&tp->rp.kp); | 248 | ret = enable_kprobe(&tp->rp.kp); |
198 | } | 249 | } |
199 | 250 | ||
251 | out_unlock: | ||
252 | mutex_unlock(&probe_enable_lock); | ||
253 | |||
200 | return ret; | 254 | return ret; |
201 | } | 255 | } |
202 | 256 | ||
203 | /* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ | 257 | static int |
204 | static void disable_trace_probe(struct trace_probe *tp, int flag) | 258 | trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) |
259 | { | ||
260 | struct ftrace_event_file **files; | ||
261 | int i; | ||
262 | |||
263 | /* | ||
264 | * Since all tp->files updater is protected by probe_enable_lock, | ||
265 | * we don't need to lock an rcu_read_lock. | ||
266 | */ | ||
267 | files = rcu_dereference_raw(tp->files); | ||
268 | if (files) { | ||
269 | for (i = 0; files[i]; i++) | ||
270 | if (files[i] == file) | ||
271 | return i; | ||
272 | } | ||
273 | |||
274 | return -1; | ||
275 | } | ||
276 | |||
277 | /* | ||
278 | * Disable trace_probe | ||
279 | * if the file is NULL, disable "perf" handler, or disable "trace" handler. | ||
280 | */ | ||
281 | static int | ||
282 | disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | ||
205 | { | 283 | { |
206 | tp->flags &= ~flag; | 284 | int ret = 0; |
285 | |||
286 | mutex_lock(&probe_enable_lock); | ||
287 | |||
288 | if (file) { | ||
289 | struct ftrace_event_file **new, **old; | ||
290 | int n = trace_probe_nr_files(tp); | ||
291 | int i, j; | ||
292 | |||
293 | old = rcu_dereference_raw(tp->files); | ||
294 | if (n == 0 || trace_probe_file_index(tp, file) < 0) { | ||
295 | ret = -EINVAL; | ||
296 | goto out_unlock; | ||
297 | } | ||
298 | |||
299 | if (n == 1) { /* Remove the last file */ | ||
300 | tp->flags &= ~TP_FLAG_TRACE; | ||
301 | new = NULL; | ||
302 | } else { | ||
303 | new = kzalloc(n * sizeof(struct ftrace_event_file *), | ||
304 | GFP_KERNEL); | ||
305 | if (!new) { | ||
306 | ret = -ENOMEM; | ||
307 | goto out_unlock; | ||
308 | } | ||
309 | |||
310 | /* This copy & check loop copies the NULL stopper too */ | ||
311 | for (i = 0, j = 0; j < n && i < n + 1; i++) | ||
312 | if (old[i] != file) | ||
313 | new[j++] = old[i]; | ||
314 | } | ||
315 | |||
316 | rcu_assign_pointer(tp->files, new); | ||
317 | |||
318 | /* Make sure the probe is done with old files */ | ||
319 | synchronize_sched(); | ||
320 | kfree(old); | ||
321 | } else | ||
322 | tp->flags &= ~TP_FLAG_PROFILE; | ||
323 | |||
207 | if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { | 324 | if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { |
208 | if (trace_probe_is_return(tp)) | 325 | if (trace_probe_is_return(tp)) |
209 | disable_kretprobe(&tp->rp); | 326 | disable_kretprobe(&tp->rp); |
210 | else | 327 | else |
211 | disable_kprobe(&tp->rp.kp); | 328 | disable_kprobe(&tp->rp.kp); |
212 | } | 329 | } |
330 | |||
331 | out_unlock: | ||
332 | mutex_unlock(&probe_enable_lock); | ||
333 | |||
334 | return ret; | ||
213 | } | 335 | } |
214 | 336 | ||
215 | /* Internal register function - just handle k*probes and flags */ | 337 | /* Internal register function - just handle k*probes and flags */ |
@@ -723,9 +845,10 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, | |||
723 | } | 845 | } |
724 | 846 | ||
725 | /* Kprobe handler */ | 847 | /* Kprobe handler */ |
726 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | 848 | static __kprobes void |
849 | __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, | ||
850 | struct ftrace_event_file *ftrace_file) | ||
727 | { | 851 | { |
728 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | ||
729 | struct kprobe_trace_entry_head *entry; | 852 | struct kprobe_trace_entry_head *entry; |
730 | struct ring_buffer_event *event; | 853 | struct ring_buffer_event *event; |
731 | struct ring_buffer *buffer; | 854 | struct ring_buffer *buffer; |
@@ -733,7 +856,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
733 | unsigned long irq_flags; | 856 | unsigned long irq_flags; |
734 | struct ftrace_event_call *call = &tp->call; | 857 | struct ftrace_event_call *call = &tp->call; |
735 | 858 | ||
736 | tp->nhit++; | 859 | WARN_ON(call != ftrace_file->event_call); |
860 | |||
861 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | ||
862 | return; | ||
737 | 863 | ||
738 | local_save_flags(irq_flags); | 864 | local_save_flags(irq_flags); |
739 | pc = preempt_count(); | 865 | pc = preempt_count(); |
@@ -741,13 +867,14 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
741 | dsize = __get_data_size(tp, regs); | 867 | dsize = __get_data_size(tp, regs); |
742 | size = sizeof(*entry) + tp->size + dsize; | 868 | size = sizeof(*entry) + tp->size + dsize; |
743 | 869 | ||
744 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 870 | event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, |
745 | size, irq_flags, pc); | 871 | call->event.type, |
872 | size, irq_flags, pc); | ||
746 | if (!event) | 873 | if (!event) |
747 | return; | 874 | return; |
748 | 875 | ||
749 | entry = ring_buffer_event_data(event); | 876 | entry = ring_buffer_event_data(event); |
750 | entry->ip = (unsigned long)kp->addr; | 877 | entry->ip = (unsigned long)tp->rp.kp.addr; |
751 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 878 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
752 | 879 | ||
753 | if (!filter_current_check_discard(buffer, call, entry, event)) | 880 | if (!filter_current_check_discard(buffer, call, entry, event)) |
@@ -755,11 +882,31 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
755 | irq_flags, pc, regs); | 882 | irq_flags, pc, regs); |
756 | } | 883 | } |
757 | 884 | ||
885 | static __kprobes void | ||
886 | kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) | ||
887 | { | ||
888 | /* | ||
889 | * Note: preempt is already disabled around the kprobe handler. | ||
890 | * However, we still need an smp_read_barrier_depends() corresponding | ||
891 | * to smp_wmb() in rcu_assign_pointer() to access the pointer. | ||
892 | */ | ||
893 | struct ftrace_event_file **file = rcu_dereference_raw(tp->files); | ||
894 | |||
895 | if (unlikely(!file)) | ||
896 | return; | ||
897 | |||
898 | while (*file) { | ||
899 | __kprobe_trace_func(tp, regs, *file); | ||
900 | file++; | ||
901 | } | ||
902 | } | ||
903 | |||
758 | /* Kretprobe handler */ | 904 | /* Kretprobe handler */ |
759 | static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | 905 | static __kprobes void |
760 | struct pt_regs *regs) | 906 | __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, |
907 | struct pt_regs *regs, | ||
908 | struct ftrace_event_file *ftrace_file) | ||
761 | { | 909 | { |
762 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | ||
763 | struct kretprobe_trace_entry_head *entry; | 910 | struct kretprobe_trace_entry_head *entry; |
764 | struct ring_buffer_event *event; | 911 | struct ring_buffer_event *event; |
765 | struct ring_buffer *buffer; | 912 | struct ring_buffer *buffer; |
@@ -767,14 +914,20 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
767 | unsigned long irq_flags; | 914 | unsigned long irq_flags; |
768 | struct ftrace_event_call *call = &tp->call; | 915 | struct ftrace_event_call *call = &tp->call; |
769 | 916 | ||
917 | WARN_ON(call != ftrace_file->event_call); | ||
918 | |||
919 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | ||
920 | return; | ||
921 | |||
770 | local_save_flags(irq_flags); | 922 | local_save_flags(irq_flags); |
771 | pc = preempt_count(); | 923 | pc = preempt_count(); |
772 | 924 | ||
773 | dsize = __get_data_size(tp, regs); | 925 | dsize = __get_data_size(tp, regs); |
774 | size = sizeof(*entry) + tp->size + dsize; | 926 | size = sizeof(*entry) + tp->size + dsize; |
775 | 927 | ||
776 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 928 | event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, |
777 | size, irq_flags, pc); | 929 | call->event.type, |
930 | size, irq_flags, pc); | ||
778 | if (!event) | 931 | if (!event) |
779 | return; | 932 | return; |
780 | 933 | ||
@@ -788,8 +941,28 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
788 | irq_flags, pc, regs); | 941 | irq_flags, pc, regs); |
789 | } | 942 | } |
790 | 943 | ||
944 | static __kprobes void | ||
945 | kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, | ||
946 | struct pt_regs *regs) | ||
947 | { | ||
948 | /* | ||
949 | * Note: preempt is already disabled around the kprobe handler. | ||
950 | * However, we still need an smp_read_barrier_depends() corresponding | ||
951 | * to smp_wmb() in rcu_assign_pointer() to access the pointer. | ||
952 | */ | ||
953 | struct ftrace_event_file **file = rcu_dereference_raw(tp->files); | ||
954 | |||
955 | if (unlikely(!file)) | ||
956 | return; | ||
957 | |||
958 | while (*file) { | ||
959 | __kretprobe_trace_func(tp, ri, regs, *file); | ||
960 | file++; | ||
961 | } | ||
962 | } | ||
963 | |||
791 | /* Event entry printers */ | 964 | /* Event entry printers */ |
792 | enum print_line_t | 965 | static enum print_line_t |
793 | print_kprobe_event(struct trace_iterator *iter, int flags, | 966 | print_kprobe_event(struct trace_iterator *iter, int flags, |
794 | struct trace_event *event) | 967 | struct trace_event *event) |
795 | { | 968 | { |
@@ -825,7 +998,7 @@ partial: | |||
825 | return TRACE_TYPE_PARTIAL_LINE; | 998 | return TRACE_TYPE_PARTIAL_LINE; |
826 | } | 999 | } |
827 | 1000 | ||
828 | enum print_line_t | 1001 | static enum print_line_t |
829 | print_kretprobe_event(struct trace_iterator *iter, int flags, | 1002 | print_kretprobe_event(struct trace_iterator *iter, int flags, |
830 | struct trace_event *event) | 1003 | struct trace_event *event) |
831 | { | 1004 | { |
@@ -975,10 +1148,9 @@ static int set_print_fmt(struct trace_probe *tp) | |||
975 | #ifdef CONFIG_PERF_EVENTS | 1148 | #ifdef CONFIG_PERF_EVENTS |
976 | 1149 | ||
977 | /* Kprobe profile handler */ | 1150 | /* Kprobe profile handler */ |
978 | static __kprobes void kprobe_perf_func(struct kprobe *kp, | 1151 | static __kprobes void |
979 | struct pt_regs *regs) | 1152 | kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) |
980 | { | 1153 | { |
981 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | ||
982 | struct ftrace_event_call *call = &tp->call; | 1154 | struct ftrace_event_call *call = &tp->call; |
983 | struct kprobe_trace_entry_head *entry; | 1155 | struct kprobe_trace_entry_head *entry; |
984 | struct hlist_head *head; | 1156 | struct hlist_head *head; |
@@ -997,7 +1169,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
997 | if (!entry) | 1169 | if (!entry) |
998 | return; | 1170 | return; |
999 | 1171 | ||
1000 | entry->ip = (unsigned long)kp->addr; | 1172 | entry->ip = (unsigned long)tp->rp.kp.addr; |
1001 | memset(&entry[1], 0, dsize); | 1173 | memset(&entry[1], 0, dsize); |
1002 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1174 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1003 | 1175 | ||
@@ -1007,10 +1179,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
1007 | } | 1179 | } |
1008 | 1180 | ||
1009 | /* Kretprobe profile handler */ | 1181 | /* Kretprobe profile handler */ |
1010 | static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | 1182 | static __kprobes void |
1011 | struct pt_regs *regs) | 1183 | kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, |
1184 | struct pt_regs *regs) | ||
1012 | { | 1185 | { |
1013 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | ||
1014 | struct ftrace_event_call *call = &tp->call; | 1186 | struct ftrace_event_call *call = &tp->call; |
1015 | struct kretprobe_trace_entry_head *entry; | 1187 | struct kretprobe_trace_entry_head *entry; |
1016 | struct hlist_head *head; | 1188 | struct hlist_head *head; |
@@ -1044,20 +1216,19 @@ int kprobe_register(struct ftrace_event_call *event, | |||
1044 | enum trace_reg type, void *data) | 1216 | enum trace_reg type, void *data) |
1045 | { | 1217 | { |
1046 | struct trace_probe *tp = (struct trace_probe *)event->data; | 1218 | struct trace_probe *tp = (struct trace_probe *)event->data; |
1219 | struct ftrace_event_file *file = data; | ||
1047 | 1220 | ||
1048 | switch (type) { | 1221 | switch (type) { |
1049 | case TRACE_REG_REGISTER: | 1222 | case TRACE_REG_REGISTER: |
1050 | return enable_trace_probe(tp, TP_FLAG_TRACE); | 1223 | return enable_trace_probe(tp, file); |
1051 | case TRACE_REG_UNREGISTER: | 1224 | case TRACE_REG_UNREGISTER: |
1052 | disable_trace_probe(tp, TP_FLAG_TRACE); | 1225 | return disable_trace_probe(tp, file); |
1053 | return 0; | ||
1054 | 1226 | ||
1055 | #ifdef CONFIG_PERF_EVENTS | 1227 | #ifdef CONFIG_PERF_EVENTS |
1056 | case TRACE_REG_PERF_REGISTER: | 1228 | case TRACE_REG_PERF_REGISTER: |
1057 | return enable_trace_probe(tp, TP_FLAG_PROFILE); | 1229 | return enable_trace_probe(tp, NULL); |
1058 | case TRACE_REG_PERF_UNREGISTER: | 1230 | case TRACE_REG_PERF_UNREGISTER: |
1059 | disable_trace_probe(tp, TP_FLAG_PROFILE); | 1231 | return disable_trace_probe(tp, NULL); |
1060 | return 0; | ||
1061 | case TRACE_REG_PERF_OPEN: | 1232 | case TRACE_REG_PERF_OPEN: |
1062 | case TRACE_REG_PERF_CLOSE: | 1233 | case TRACE_REG_PERF_CLOSE: |
1063 | case TRACE_REG_PERF_ADD: | 1234 | case TRACE_REG_PERF_ADD: |
@@ -1073,11 +1244,13 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | |||
1073 | { | 1244 | { |
1074 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | 1245 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); |
1075 | 1246 | ||
1247 | tp->nhit++; | ||
1248 | |||
1076 | if (tp->flags & TP_FLAG_TRACE) | 1249 | if (tp->flags & TP_FLAG_TRACE) |
1077 | kprobe_trace_func(kp, regs); | 1250 | kprobe_trace_func(tp, regs); |
1078 | #ifdef CONFIG_PERF_EVENTS | 1251 | #ifdef CONFIG_PERF_EVENTS |
1079 | if (tp->flags & TP_FLAG_PROFILE) | 1252 | if (tp->flags & TP_FLAG_PROFILE) |
1080 | kprobe_perf_func(kp, regs); | 1253 | kprobe_perf_func(tp, regs); |
1081 | #endif | 1254 | #endif |
1082 | return 0; /* We don't tweak kernel, so just return 0 */ | 1255 | return 0; /* We don't tweak kernel, so just return 0 */ |
1083 | } | 1256 | } |
@@ -1087,11 +1260,13 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
1087 | { | 1260 | { |
1088 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | 1261 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); |
1089 | 1262 | ||
1263 | tp->nhit++; | ||
1264 | |||
1090 | if (tp->flags & TP_FLAG_TRACE) | 1265 | if (tp->flags & TP_FLAG_TRACE) |
1091 | kretprobe_trace_func(ri, regs); | 1266 | kretprobe_trace_func(tp, ri, regs); |
1092 | #ifdef CONFIG_PERF_EVENTS | 1267 | #ifdef CONFIG_PERF_EVENTS |
1093 | if (tp->flags & TP_FLAG_PROFILE) | 1268 | if (tp->flags & TP_FLAG_PROFILE) |
1094 | kretprobe_perf_func(ri, regs); | 1269 | kretprobe_perf_func(tp, ri, regs); |
1095 | #endif | 1270 | #endif |
1096 | return 0; /* We don't tweak kernel, so just return 0 */ | 1271 | return 0; /* We don't tweak kernel, so just return 0 */ |
1097 | } | 1272 | } |
@@ -1189,11 +1364,24 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, | |||
1189 | return a1 + a2 + a3 + a4 + a5 + a6; | 1364 | return a1 + a2 + a3 + a4 + a5 + a6; |
1190 | } | 1365 | } |
1191 | 1366 | ||
1367 | static struct ftrace_event_file * | ||
1368 | find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) | ||
1369 | { | ||
1370 | struct ftrace_event_file *file; | ||
1371 | |||
1372 | list_for_each_entry(file, &tr->events, list) | ||
1373 | if (file->event_call == &tp->call) | ||
1374 | return file; | ||
1375 | |||
1376 | return NULL; | ||
1377 | } | ||
1378 | |||
1192 | static __init int kprobe_trace_self_tests_init(void) | 1379 | static __init int kprobe_trace_self_tests_init(void) |
1193 | { | 1380 | { |
1194 | int ret, warn = 0; | 1381 | int ret, warn = 0; |
1195 | int (*target)(int, int, int, int, int, int); | 1382 | int (*target)(int, int, int, int, int, int); |
1196 | struct trace_probe *tp; | 1383 | struct trace_probe *tp; |
1384 | struct ftrace_event_file *file; | ||
1197 | 1385 | ||
1198 | target = kprobe_trace_selftest_target; | 1386 | target = kprobe_trace_selftest_target; |
1199 | 1387 | ||
@@ -1203,31 +1391,43 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1203 | "$stack $stack0 +0($stack)", | 1391 | "$stack $stack0 +0($stack)", |
1204 | create_trace_probe); | 1392 | create_trace_probe); |
1205 | if (WARN_ON_ONCE(ret)) { | 1393 | if (WARN_ON_ONCE(ret)) { |
1206 | pr_warning("error on probing function entry.\n"); | 1394 | pr_warn("error on probing function entry.\n"); |
1207 | warn++; | 1395 | warn++; |
1208 | } else { | 1396 | } else { |
1209 | /* Enable trace point */ | 1397 | /* Enable trace point */ |
1210 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); | 1398 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); |
1211 | if (WARN_ON_ONCE(tp == NULL)) { | 1399 | if (WARN_ON_ONCE(tp == NULL)) { |
1212 | pr_warning("error on getting new probe.\n"); | 1400 | pr_warn("error on getting new probe.\n"); |
1213 | warn++; | 1401 | warn++; |
1214 | } else | 1402 | } else { |
1215 | enable_trace_probe(tp, TP_FLAG_TRACE); | 1403 | file = find_trace_probe_file(tp, top_trace_array()); |
1404 | if (WARN_ON_ONCE(file == NULL)) { | ||
1405 | pr_warn("error on getting probe file.\n"); | ||
1406 | warn++; | ||
1407 | } else | ||
1408 | enable_trace_probe(tp, file); | ||
1409 | } | ||
1216 | } | 1410 | } |
1217 | 1411 | ||
1218 | ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " | 1412 | ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " |
1219 | "$retval", create_trace_probe); | 1413 | "$retval", create_trace_probe); |
1220 | if (WARN_ON_ONCE(ret)) { | 1414 | if (WARN_ON_ONCE(ret)) { |
1221 | pr_warning("error on probing function return.\n"); | 1415 | pr_warn("error on probing function return.\n"); |
1222 | warn++; | 1416 | warn++; |
1223 | } else { | 1417 | } else { |
1224 | /* Enable trace point */ | 1418 | /* Enable trace point */ |
1225 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); | 1419 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); |
1226 | if (WARN_ON_ONCE(tp == NULL)) { | 1420 | if (WARN_ON_ONCE(tp == NULL)) { |
1227 | pr_warning("error on getting new probe.\n"); | 1421 | pr_warn("error on getting 2nd new probe.\n"); |
1228 | warn++; | 1422 | warn++; |
1229 | } else | 1423 | } else { |
1230 | enable_trace_probe(tp, TP_FLAG_TRACE); | 1424 | file = find_trace_probe_file(tp, top_trace_array()); |
1425 | if (WARN_ON_ONCE(file == NULL)) { | ||
1426 | pr_warn("error on getting probe file.\n"); | ||
1427 | warn++; | ||
1428 | } else | ||
1429 | enable_trace_probe(tp, file); | ||
1430 | } | ||
1231 | } | 1431 | } |
1232 | 1432 | ||
1233 | if (warn) | 1433 | if (warn) |
@@ -1238,27 +1438,39 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1238 | /* Disable trace points before removing it */ | 1438 | /* Disable trace points before removing it */ |
1239 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); | 1439 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); |
1240 | if (WARN_ON_ONCE(tp == NULL)) { | 1440 | if (WARN_ON_ONCE(tp == NULL)) { |
1241 | pr_warning("error on getting test probe.\n"); | 1441 | pr_warn("error on getting test probe.\n"); |
1242 | warn++; | 1442 | warn++; |
1243 | } else | 1443 | } else { |
1244 | disable_trace_probe(tp, TP_FLAG_TRACE); | 1444 | file = find_trace_probe_file(tp, top_trace_array()); |
1445 | if (WARN_ON_ONCE(file == NULL)) { | ||
1446 | pr_warn("error on getting probe file.\n"); | ||
1447 | warn++; | ||
1448 | } else | ||
1449 | disable_trace_probe(tp, file); | ||
1450 | } | ||
1245 | 1451 | ||
1246 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); | 1452 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); |
1247 | if (WARN_ON_ONCE(tp == NULL)) { | 1453 | if (WARN_ON_ONCE(tp == NULL)) { |
1248 | pr_warning("error on getting 2nd test probe.\n"); | 1454 | pr_warn("error on getting 2nd test probe.\n"); |
1249 | warn++; | 1455 | warn++; |
1250 | } else | 1456 | } else { |
1251 | disable_trace_probe(tp, TP_FLAG_TRACE); | 1457 | file = find_trace_probe_file(tp, top_trace_array()); |
1458 | if (WARN_ON_ONCE(file == NULL)) { | ||
1459 | pr_warn("error on getting probe file.\n"); | ||
1460 | warn++; | ||
1461 | } else | ||
1462 | disable_trace_probe(tp, file); | ||
1463 | } | ||
1252 | 1464 | ||
1253 | ret = traceprobe_command("-:testprobe", create_trace_probe); | 1465 | ret = traceprobe_command("-:testprobe", create_trace_probe); |
1254 | if (WARN_ON_ONCE(ret)) { | 1466 | if (WARN_ON_ONCE(ret)) { |
1255 | pr_warning("error on deleting a probe.\n"); | 1467 | pr_warn("error on deleting a probe.\n"); |
1256 | warn++; | 1468 | warn++; |
1257 | } | 1469 | } |
1258 | 1470 | ||
1259 | ret = traceprobe_command("-:testprobe2", create_trace_probe); | 1471 | ret = traceprobe_command("-:testprobe2", create_trace_probe); |
1260 | if (WARN_ON_ONCE(ret)) { | 1472 | if (WARN_ON_ONCE(ret)) { |
1261 | pr_warning("error on deleting a probe.\n"); | 1473 | pr_warn("error on deleting a probe.\n"); |
1262 | warn++; | 1474 | warn++; |
1263 | } | 1475 | } |
1264 | 1476 | ||
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e5..a5e8f4878bfa 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr) | |||
31 | overrun_detected = false; | 31 | overrun_detected = false; |
32 | prev_overruns = 0; | 32 | prev_overruns = 0; |
33 | 33 | ||
34 | tracing_reset_online_cpus(tr); | 34 | tracing_reset_online_cpus(&tr->trace_buffer); |
35 | } | 35 | } |
36 | 36 | ||
37 | static int mmio_trace_init(struct trace_array *tr) | 37 | static int mmio_trace_init(struct trace_array *tr) |
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter) | |||
128 | static unsigned long count_overruns(struct trace_iterator *iter) | 128 | static unsigned long count_overruns(struct trace_iterator *iter) |
129 | { | 129 | { |
130 | unsigned long cnt = atomic_xchg(&dropped_count, 0); | 130 | unsigned long cnt = atomic_xchg(&dropped_count, 0); |
131 | unsigned long over = ring_buffer_overruns(iter->tr->buffer); | 131 | unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); |
132 | 132 | ||
133 | if (over > prev_overruns) | 133 | if (over > prev_overruns) |
134 | cnt += over - prev_overruns; | 134 | cnt += over - prev_overruns; |
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, | |||
309 | struct mmiotrace_rw *rw) | 309 | struct mmiotrace_rw *rw) |
310 | { | 310 | { |
311 | struct ftrace_event_call *call = &event_mmiotrace_rw; | 311 | struct ftrace_event_call *call = &event_mmiotrace_rw; |
312 | struct ring_buffer *buffer = tr->buffer; | 312 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
313 | struct ring_buffer_event *event; | 313 | struct ring_buffer_event *event; |
314 | struct trace_mmiotrace_rw *entry; | 314 | struct trace_mmiotrace_rw *entry; |
315 | int pc = preempt_count(); | 315 | int pc = preempt_count(); |
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, | |||
330 | void mmio_trace_rw(struct mmiotrace_rw *rw) | 330 | void mmio_trace_rw(struct mmiotrace_rw *rw) |
331 | { | 331 | { |
332 | struct trace_array *tr = mmio_trace_array; | 332 | struct trace_array *tr = mmio_trace_array; |
333 | struct trace_array_cpu *data = tr->data[smp_processor_id()]; | 333 | struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); |
334 | __trace_mmiotrace_rw(tr, data, rw); | 334 | __trace_mmiotrace_rw(tr, data, rw); |
335 | } | 335 | } |
336 | 336 | ||
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, | |||
339 | struct mmiotrace_map *map) | 339 | struct mmiotrace_map *map) |
340 | { | 340 | { |
341 | struct ftrace_event_call *call = &event_mmiotrace_map; | 341 | struct ftrace_event_call *call = &event_mmiotrace_map; |
342 | struct ring_buffer *buffer = tr->buffer; | 342 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
343 | struct ring_buffer_event *event; | 343 | struct ring_buffer_event *event; |
344 | struct trace_mmiotrace_map *entry; | 344 | struct trace_mmiotrace_map *entry; |
345 | int pc = preempt_count(); | 345 | int pc = preempt_count(); |
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) | |||
363 | struct trace_array_cpu *data; | 363 | struct trace_array_cpu *data; |
364 | 364 | ||
365 | preempt_disable(); | 365 | preempt_disable(); |
366 | data = tr->data[smp_processor_id()]; | 366 | data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); |
367 | __trace_mmiotrace_map(tr, data, map); | 367 | __trace_mmiotrace_map(tr, data, map); |
368 | preempt_enable(); | 368 | preempt_enable(); |
369 | } | 369 | } |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 697e88d13907..bb922d9ee51b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -14,7 +14,7 @@ | |||
14 | /* must be a power of 2 */ | 14 | /* must be a power of 2 */ |
15 | #define EVENT_HASHSIZE 128 | 15 | #define EVENT_HASHSIZE 128 |
16 | 16 | ||
17 | DECLARE_RWSEM(trace_event_mutex); | 17 | DECLARE_RWSEM(trace_event_sem); |
18 | 18 | ||
19 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; | 19 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; |
20 | 20 | ||
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) | |||
37 | return ret; | 37 | return ret; |
38 | } | 38 | } |
39 | 39 | ||
40 | enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) | ||
41 | { | ||
42 | struct trace_seq *s = &iter->seq; | ||
43 | struct trace_entry *entry = iter->ent; | ||
44 | struct bputs_entry *field; | ||
45 | int ret; | ||
46 | |||
47 | trace_assign_type(field, entry); | ||
48 | |||
49 | ret = trace_seq_puts(s, field->str); | ||
50 | if (!ret) | ||
51 | return TRACE_TYPE_PARTIAL_LINE; | ||
52 | |||
53 | return TRACE_TYPE_HANDLED; | ||
54 | } | ||
55 | |||
40 | enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) | 56 | enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) |
41 | { | 57 | { |
42 | struct trace_seq *s = &iter->seq; | 58 | struct trace_seq *s = &iter->seq; |
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | |||
397 | } | 413 | } |
398 | EXPORT_SYMBOL(ftrace_print_hex_seq); | 414 | EXPORT_SYMBOL(ftrace_print_hex_seq); |
399 | 415 | ||
416 | int ftrace_raw_output_prep(struct trace_iterator *iter, | ||
417 | struct trace_event *trace_event) | ||
418 | { | ||
419 | struct ftrace_event_call *event; | ||
420 | struct trace_seq *s = &iter->seq; | ||
421 | struct trace_seq *p = &iter->tmp_seq; | ||
422 | struct trace_entry *entry; | ||
423 | int ret; | ||
424 | |||
425 | event = container_of(trace_event, struct ftrace_event_call, event); | ||
426 | entry = iter->ent; | ||
427 | |||
428 | if (entry->type != event->event.type) { | ||
429 | WARN_ON_ONCE(1); | ||
430 | return TRACE_TYPE_UNHANDLED; | ||
431 | } | ||
432 | |||
433 | trace_seq_init(p); | ||
434 | ret = trace_seq_printf(s, "%s: ", event->name); | ||
435 | if (!ret) | ||
436 | return TRACE_TYPE_PARTIAL_LINE; | ||
437 | |||
438 | return 0; | ||
439 | } | ||
440 | EXPORT_SYMBOL(ftrace_raw_output_prep); | ||
441 | |||
400 | #ifdef CONFIG_KRETPROBES | 442 | #ifdef CONFIG_KRETPROBES |
401 | static inline const char *kretprobed(const char *name) | 443 | static inline const char *kretprobed(const char *name) |
402 | { | 444 | { |
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) | |||
617 | { | 659 | { |
618 | unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; | 660 | unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; |
619 | unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; | 661 | unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; |
620 | unsigned long long abs_ts = iter->ts - iter->tr->time_start; | 662 | unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; |
621 | unsigned long long rel_ts = next_ts - iter->ts; | 663 | unsigned long long rel_ts = next_ts - iter->ts; |
622 | struct trace_seq *s = &iter->seq; | 664 | struct trace_seq *s = &iter->seq; |
623 | 665 | ||
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list) | |||
783 | 825 | ||
784 | void trace_event_read_lock(void) | 826 | void trace_event_read_lock(void) |
785 | { | 827 | { |
786 | down_read(&trace_event_mutex); | 828 | down_read(&trace_event_sem); |
787 | } | 829 | } |
788 | 830 | ||
789 | void trace_event_read_unlock(void) | 831 | void trace_event_read_unlock(void) |
790 | { | 832 | { |
791 | up_read(&trace_event_mutex); | 833 | up_read(&trace_event_sem); |
792 | } | 834 | } |
793 | 835 | ||
794 | /** | 836 | /** |
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event) | |||
811 | unsigned key; | 853 | unsigned key; |
812 | int ret = 0; | 854 | int ret = 0; |
813 | 855 | ||
814 | down_write(&trace_event_mutex); | 856 | down_write(&trace_event_sem); |
815 | 857 | ||
816 | if (WARN_ON(!event)) | 858 | if (WARN_ON(!event)) |
817 | goto out; | 859 | goto out; |
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event) | |||
866 | 908 | ||
867 | ret = event->type; | 909 | ret = event->type; |
868 | out: | 910 | out: |
869 | up_write(&trace_event_mutex); | 911 | up_write(&trace_event_sem); |
870 | 912 | ||
871 | return ret; | 913 | return ret; |
872 | } | 914 | } |
873 | EXPORT_SYMBOL_GPL(register_ftrace_event); | 915 | EXPORT_SYMBOL_GPL(register_ftrace_event); |
874 | 916 | ||
875 | /* | 917 | /* |
876 | * Used by module code with the trace_event_mutex held for write. | 918 | * Used by module code with the trace_event_sem held for write. |
877 | */ | 919 | */ |
878 | int __unregister_ftrace_event(struct trace_event *event) | 920 | int __unregister_ftrace_event(struct trace_event *event) |
879 | { | 921 | { |
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event) | |||
888 | */ | 930 | */ |
889 | int unregister_ftrace_event(struct trace_event *event) | 931 | int unregister_ftrace_event(struct trace_event *event) |
890 | { | 932 | { |
891 | down_write(&trace_event_mutex); | 933 | down_write(&trace_event_sem); |
892 | __unregister_ftrace_event(event); | 934 | __unregister_ftrace_event(event); |
893 | up_write(&trace_event_mutex); | 935 | up_write(&trace_event_sem); |
894 | 936 | ||
895 | return 0; | 937 | return 0; |
896 | } | 938 | } |
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = { | |||
1217 | .funcs = &trace_user_stack_funcs, | 1259 | .funcs = &trace_user_stack_funcs, |
1218 | }; | 1260 | }; |
1219 | 1261 | ||
1262 | /* TRACE_BPUTS */ | ||
1263 | static enum print_line_t | ||
1264 | trace_bputs_print(struct trace_iterator *iter, int flags, | ||
1265 | struct trace_event *event) | ||
1266 | { | ||
1267 | struct trace_entry *entry = iter->ent; | ||
1268 | struct trace_seq *s = &iter->seq; | ||
1269 | struct bputs_entry *field; | ||
1270 | |||
1271 | trace_assign_type(field, entry); | ||
1272 | |||
1273 | if (!seq_print_ip_sym(s, field->ip, flags)) | ||
1274 | goto partial; | ||
1275 | |||
1276 | if (!trace_seq_puts(s, ": ")) | ||
1277 | goto partial; | ||
1278 | |||
1279 | if (!trace_seq_puts(s, field->str)) | ||
1280 | goto partial; | ||
1281 | |||
1282 | return TRACE_TYPE_HANDLED; | ||
1283 | |||
1284 | partial: | ||
1285 | return TRACE_TYPE_PARTIAL_LINE; | ||
1286 | } | ||
1287 | |||
1288 | |||
1289 | static enum print_line_t | ||
1290 | trace_bputs_raw(struct trace_iterator *iter, int flags, | ||
1291 | struct trace_event *event) | ||
1292 | { | ||
1293 | struct bputs_entry *field; | ||
1294 | struct trace_seq *s = &iter->seq; | ||
1295 | |||
1296 | trace_assign_type(field, iter->ent); | ||
1297 | |||
1298 | if (!trace_seq_printf(s, ": %lx : ", field->ip)) | ||
1299 | goto partial; | ||
1300 | |||
1301 | if (!trace_seq_puts(s, field->str)) | ||
1302 | goto partial; | ||
1303 | |||
1304 | return TRACE_TYPE_HANDLED; | ||
1305 | |||
1306 | partial: | ||
1307 | return TRACE_TYPE_PARTIAL_LINE; | ||
1308 | } | ||
1309 | |||
1310 | static struct trace_event_functions trace_bputs_funcs = { | ||
1311 | .trace = trace_bputs_print, | ||
1312 | .raw = trace_bputs_raw, | ||
1313 | }; | ||
1314 | |||
1315 | static struct trace_event trace_bputs_event = { | ||
1316 | .type = TRACE_BPUTS, | ||
1317 | .funcs = &trace_bputs_funcs, | ||
1318 | }; | ||
1319 | |||
1220 | /* TRACE_BPRINT */ | 1320 | /* TRACE_BPRINT */ |
1221 | static enum print_line_t | 1321 | static enum print_line_t |
1222 | trace_bprint_print(struct trace_iterator *iter, int flags, | 1322 | trace_bprint_print(struct trace_iterator *iter, int flags, |
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = { | |||
1329 | &trace_wake_event, | 1429 | &trace_wake_event, |
1330 | &trace_stack_event, | 1430 | &trace_stack_event, |
1331 | &trace_user_stack_event, | 1431 | &trace_user_stack_event, |
1432 | &trace_bputs_event, | ||
1332 | &trace_bprint_event, | 1433 | &trace_bprint_event, |
1333 | &trace_print_event, | 1434 | &trace_print_event, |
1334 | NULL | 1435 | NULL |
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492b..127a9d8c8357 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h | |||
@@ -5,6 +5,8 @@ | |||
5 | #include "trace.h" | 5 | #include "trace.h" |
6 | 6 | ||
7 | extern enum print_line_t | 7 | extern enum print_line_t |
8 | trace_print_bputs_msg_only(struct trace_iterator *iter); | ||
9 | extern enum print_line_t | ||
8 | trace_print_bprintk_msg_only(struct trace_iterator *iter); | 10 | trace_print_bprintk_msg_only(struct trace_iterator *iter); |
9 | extern enum print_line_t | 11 | extern enum print_line_t |
10 | trace_print_printk_msg_only(struct trace_iterator *iter); | 12 | trace_print_printk_msg_only(struct trace_iterator *iter); |
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); | |||
31 | 33 | ||
32 | /* used by module unregistering */ | 34 | /* used by module unregistering */ |
33 | extern int __unregister_ftrace_event(struct trace_event *event); | 35 | extern int __unregister_ftrace_event(struct trace_event *event); |
34 | extern struct rw_semaphore trace_event_mutex; | 36 | extern struct rw_semaphore trace_event_sem; |
35 | 37 | ||
36 | #define MAX_MEMHEX_BYTES 8 | 38 | #define MAX_MEMHEX_BYTES 8 |
37 | #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) | 39 | #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) |
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3374c792ccd8..4e98e3b257a3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr, | |||
28 | unsigned long flags, int pc) | 28 | unsigned long flags, int pc) |
29 | { | 29 | { |
30 | struct ftrace_event_call *call = &event_context_switch; | 30 | struct ftrace_event_call *call = &event_context_switch; |
31 | struct ring_buffer *buffer = tr->buffer; | 31 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
32 | struct ring_buffer_event *event; | 32 | struct ring_buffer_event *event; |
33 | struct ctx_switch_entry *entry; | 33 | struct ctx_switch_entry *entry; |
34 | 34 | ||
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n | |||
69 | pc = preempt_count(); | 69 | pc = preempt_count(); |
70 | local_irq_save(flags); | 70 | local_irq_save(flags); |
71 | cpu = raw_smp_processor_id(); | 71 | cpu = raw_smp_processor_id(); |
72 | data = ctx_trace->data[cpu]; | 72 | data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); |
73 | 73 | ||
74 | if (likely(!atomic_read(&data->disabled))) | 74 | if (likely(!atomic_read(&data->disabled))) |
75 | tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); | 75 | tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); |
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
86 | struct ftrace_event_call *call = &event_wakeup; | 86 | struct ftrace_event_call *call = &event_wakeup; |
87 | struct ring_buffer_event *event; | 87 | struct ring_buffer_event *event; |
88 | struct ctx_switch_entry *entry; | 88 | struct ctx_switch_entry *entry; |
89 | struct ring_buffer *buffer = tr->buffer; | 89 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
90 | 90 | ||
91 | event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, | 91 | event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, |
92 | sizeof(*entry), flags, pc); | 92 | sizeof(*entry), flags, pc); |
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) | |||
123 | pc = preempt_count(); | 123 | pc = preempt_count(); |
124 | local_irq_save(flags); | 124 | local_irq_save(flags); |
125 | cpu = raw_smp_processor_id(); | 125 | cpu = raw_smp_processor_id(); |
126 | data = ctx_trace->data[cpu]; | 126 | data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); |
127 | 127 | ||
128 | if (likely(!atomic_read(&data->disabled))) | 128 | if (likely(!atomic_read(&data->disabled))) |
129 | tracing_sched_wakeup_trace(ctx_trace, wakee, current, | 129 | tracing_sched_wakeup_trace(ctx_trace, wakee, current, |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a1..fee77e15d815 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr); | |||
36 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace); | 36 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace); |
37 | static void wakeup_graph_return(struct ftrace_graph_ret *trace); | 37 | static void wakeup_graph_return(struct ftrace_graph_ret *trace); |
38 | 38 | ||
39 | static int save_lat_flag; | 39 | static int save_flags; |
40 | static bool function_enabled; | ||
40 | 41 | ||
41 | #define TRACE_DISPLAY_GRAPH 1 | 42 | #define TRACE_DISPLAY_GRAPH 1 |
42 | 43 | ||
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr, | |||
89 | if (cpu != wakeup_current_cpu) | 90 | if (cpu != wakeup_current_cpu) |
90 | goto out_enable; | 91 | goto out_enable; |
91 | 92 | ||
92 | *data = tr->data[cpu]; | 93 | *data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
93 | disabled = atomic_inc_return(&(*data)->disabled); | 94 | disabled = atomic_inc_return(&(*data)->disabled); |
94 | if (unlikely(disabled != 1)) | 95 | if (unlikely(disabled != 1)) |
95 | goto out; | 96 | goto out; |
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly = | |||
134 | }; | 135 | }; |
135 | #endif /* CONFIG_FUNCTION_TRACER */ | 136 | #endif /* CONFIG_FUNCTION_TRACER */ |
136 | 137 | ||
137 | static int start_func_tracer(int graph) | 138 | static int register_wakeup_function(int graph, int set) |
138 | { | 139 | { |
139 | int ret; | 140 | int ret; |
140 | 141 | ||
141 | if (!graph) | 142 | /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ |
142 | ret = register_ftrace_function(&trace_ops); | 143 | if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) |
143 | else | 144 | return 0; |
145 | |||
146 | if (graph) | ||
144 | ret = register_ftrace_graph(&wakeup_graph_return, | 147 | ret = register_ftrace_graph(&wakeup_graph_return, |
145 | &wakeup_graph_entry); | 148 | &wakeup_graph_entry); |
149 | else | ||
150 | ret = register_ftrace_function(&trace_ops); | ||
151 | |||
152 | if (!ret) | ||
153 | function_enabled = true; | ||
154 | |||
155 | return ret; | ||
156 | } | ||
157 | |||
158 | static void unregister_wakeup_function(int graph) | ||
159 | { | ||
160 | if (!function_enabled) | ||
161 | return; | ||
162 | |||
163 | if (graph) | ||
164 | unregister_ftrace_graph(); | ||
165 | else | ||
166 | unregister_ftrace_function(&trace_ops); | ||
167 | |||
168 | function_enabled = false; | ||
169 | } | ||
170 | |||
171 | static void wakeup_function_set(int set) | ||
172 | { | ||
173 | if (set) | ||
174 | register_wakeup_function(is_graph(), 1); | ||
175 | else | ||
176 | unregister_wakeup_function(is_graph()); | ||
177 | } | ||
178 | |||
179 | static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) | ||
180 | { | ||
181 | if (mask & TRACE_ITER_FUNCTION) | ||
182 | wakeup_function_set(set); | ||
183 | |||
184 | return trace_keep_overwrite(tracer, mask, set); | ||
185 | } | ||
186 | |||
187 | static int start_func_tracer(int graph) | ||
188 | { | ||
189 | int ret; | ||
190 | |||
191 | ret = register_wakeup_function(graph, 0); | ||
146 | 192 | ||
147 | if (!ret && tracing_is_enabled()) | 193 | if (!ret && tracing_is_enabled()) |
148 | tracer_enabled = 1; | 194 | tracer_enabled = 1; |
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph) | |||
156 | { | 202 | { |
157 | tracer_enabled = 0; | 203 | tracer_enabled = 0; |
158 | 204 | ||
159 | if (!graph) | 205 | unregister_wakeup_function(graph); |
160 | unregister_ftrace_function(&trace_ops); | ||
161 | else | ||
162 | unregister_ftrace_graph(); | ||
163 | } | 206 | } |
164 | 207 | ||
165 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 208 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore, | |||
353 | 396 | ||
354 | /* disable local data, not wakeup_cpu data */ | 397 | /* disable local data, not wakeup_cpu data */ |
355 | cpu = raw_smp_processor_id(); | 398 | cpu = raw_smp_processor_id(); |
356 | disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); | 399 | disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); |
357 | if (likely(disabled != 1)) | 400 | if (likely(disabled != 1)) |
358 | goto out; | 401 | goto out; |
359 | 402 | ||
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore, | |||
365 | goto out_unlock; | 408 | goto out_unlock; |
366 | 409 | ||
367 | /* The task we are waiting for is waking up */ | 410 | /* The task we are waiting for is waking up */ |
368 | data = wakeup_trace->data[wakeup_cpu]; | 411 | data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); |
369 | 412 | ||
370 | __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); | 413 | __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); |
371 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); | 414 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); |
@@ -387,7 +430,7 @@ out_unlock: | |||
387 | arch_spin_unlock(&wakeup_lock); | 430 | arch_spin_unlock(&wakeup_lock); |
388 | local_irq_restore(flags); | 431 | local_irq_restore(flags); |
389 | out: | 432 | out: |
390 | atomic_dec(&wakeup_trace->data[cpu]->disabled); | 433 | atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); |
391 | } | 434 | } |
392 | 435 | ||
393 | static void __wakeup_reset(struct trace_array *tr) | 436 | static void __wakeup_reset(struct trace_array *tr) |
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr) | |||
405 | { | 448 | { |
406 | unsigned long flags; | 449 | unsigned long flags; |
407 | 450 | ||
408 | tracing_reset_online_cpus(tr); | 451 | tracing_reset_online_cpus(&tr->trace_buffer); |
409 | 452 | ||
410 | local_irq_save(flags); | 453 | local_irq_save(flags); |
411 | arch_spin_lock(&wakeup_lock); | 454 | arch_spin_lock(&wakeup_lock); |
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
435 | return; | 478 | return; |
436 | 479 | ||
437 | pc = preempt_count(); | 480 | pc = preempt_count(); |
438 | disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); | 481 | disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); |
439 | if (unlikely(disabled != 1)) | 482 | if (unlikely(disabled != 1)) |
440 | goto out; | 483 | goto out; |
441 | 484 | ||
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
458 | 501 | ||
459 | local_save_flags(flags); | 502 | local_save_flags(flags); |
460 | 503 | ||
461 | data = wakeup_trace->data[wakeup_cpu]; | 504 | data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); |
462 | data->preempt_timestamp = ftrace_now(cpu); | 505 | data->preempt_timestamp = ftrace_now(cpu); |
463 | tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); | 506 | tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); |
464 | 507 | ||
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
472 | out_locked: | 515 | out_locked: |
473 | arch_spin_unlock(&wakeup_lock); | 516 | arch_spin_unlock(&wakeup_lock); |
474 | out: | 517 | out: |
475 | atomic_dec(&wakeup_trace->data[cpu]->disabled); | 518 | atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); |
476 | } | 519 | } |
477 | 520 | ||
478 | static void start_wakeup_tracer(struct trace_array *tr) | 521 | static void start_wakeup_tracer(struct trace_array *tr) |
@@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) | |||
540 | 583 | ||
541 | static int __wakeup_tracer_init(struct trace_array *tr) | 584 | static int __wakeup_tracer_init(struct trace_array *tr) |
542 | { | 585 | { |
543 | save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; | 586 | save_flags = trace_flags; |
544 | trace_flags |= TRACE_ITER_LATENCY_FMT; | 587 | |
588 | /* non overwrite screws up the latency tracers */ | ||
589 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); | ||
590 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); | ||
545 | 591 | ||
546 | tracing_max_latency = 0; | 592 | tracing_max_latency = 0; |
547 | wakeup_trace = tr; | 593 | wakeup_trace = tr; |
@@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) | |||
563 | 609 | ||
564 | static void wakeup_tracer_reset(struct trace_array *tr) | 610 | static void wakeup_tracer_reset(struct trace_array *tr) |
565 | { | 611 | { |
612 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; | ||
613 | int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; | ||
614 | |||
566 | stop_wakeup_tracer(tr); | 615 | stop_wakeup_tracer(tr); |
567 | /* make sure we put back any tasks we are tracing */ | 616 | /* make sure we put back any tasks we are tracing */ |
568 | wakeup_reset(tr); | 617 | wakeup_reset(tr); |
569 | 618 | ||
570 | if (!save_lat_flag) | 619 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); |
571 | trace_flags &= ~TRACE_ITER_LATENCY_FMT; | 620 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); |
572 | } | 621 | } |
573 | 622 | ||
574 | static void wakeup_tracer_start(struct trace_array *tr) | 623 | static void wakeup_tracer_start(struct trace_array *tr) |
@@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
594 | .print_line = wakeup_print_line, | 643 | .print_line = wakeup_print_line, |
595 | .flags = &tracer_flags, | 644 | .flags = &tracer_flags, |
596 | .set_flag = wakeup_set_flag, | 645 | .set_flag = wakeup_set_flag, |
646 | .flag_changed = wakeup_flag_changed, | ||
597 | #ifdef CONFIG_FTRACE_SELFTEST | 647 | #ifdef CONFIG_FTRACE_SELFTEST |
598 | .selftest = trace_selftest_startup_wakeup, | 648 | .selftest = trace_selftest_startup_wakeup, |
599 | #endif | 649 | #endif |
@@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
615 | .print_line = wakeup_print_line, | 665 | .print_line = wakeup_print_line, |
616 | .flags = &tracer_flags, | 666 | .flags = &tracer_flags, |
617 | .set_flag = wakeup_set_flag, | 667 | .set_flag = wakeup_set_flag, |
668 | .flag_changed = wakeup_flag_changed, | ||
618 | #ifdef CONFIG_FTRACE_SELFTEST | 669 | #ifdef CONFIG_FTRACE_SELFTEST |
619 | .selftest = trace_selftest_startup_wakeup, | 670 | .selftest = trace_selftest_startup_wakeup, |
620 | #endif | 671 | #endif |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c29..2901e3b88590 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry) | |||
21 | return 0; | 21 | return 0; |
22 | } | 22 | } |
23 | 23 | ||
24 | static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) | 24 | static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) |
25 | { | 25 | { |
26 | struct ring_buffer_event *event; | 26 | struct ring_buffer_event *event; |
27 | struct trace_entry *entry; | 27 | struct trace_entry *entry; |
28 | unsigned int loops = 0; | 28 | unsigned int loops = 0; |
29 | 29 | ||
30 | while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { | 30 | while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { |
31 | entry = ring_buffer_event_data(event); | 31 | entry = ring_buffer_event_data(event); |
32 | 32 | ||
33 | /* | 33 | /* |
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) | |||
58 | * Test the trace buffer to see if all the elements | 58 | * Test the trace buffer to see if all the elements |
59 | * are still sane. | 59 | * are still sane. |
60 | */ | 60 | */ |
61 | static int trace_test_buffer(struct trace_array *tr, unsigned long *count) | 61 | static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) |
62 | { | 62 | { |
63 | unsigned long flags, cnt = 0; | 63 | unsigned long flags, cnt = 0; |
64 | int cpu, ret = 0; | 64 | int cpu, ret = 0; |
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) | |||
67 | local_irq_save(flags); | 67 | local_irq_save(flags); |
68 | arch_spin_lock(&ftrace_max_lock); | 68 | arch_spin_lock(&ftrace_max_lock); |
69 | 69 | ||
70 | cnt = ring_buffer_entries(tr->buffer); | 70 | cnt = ring_buffer_entries(buf->buffer); |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * The trace_test_buffer_cpu runs a while loop to consume all data. | 73 | * The trace_test_buffer_cpu runs a while loop to consume all data. |
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) | |||
78 | */ | 78 | */ |
79 | tracing_off(); | 79 | tracing_off(); |
80 | for_each_possible_cpu(cpu) { | 80 | for_each_possible_cpu(cpu) { |
81 | ret = trace_test_buffer_cpu(tr, cpu); | 81 | ret = trace_test_buffer_cpu(buf, cpu); |
82 | if (ret) | 82 | if (ret) |
83 | break; | 83 | break; |
84 | } | 84 | } |
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
355 | msleep(100); | 355 | msleep(100); |
356 | 356 | ||
357 | /* we should have nothing in the buffer */ | 357 | /* we should have nothing in the buffer */ |
358 | ret = trace_test_buffer(tr, &count); | 358 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
359 | if (ret) | 359 | if (ret) |
360 | goto out; | 360 | goto out; |
361 | 361 | ||
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
376 | ftrace_enabled = 0; | 376 | ftrace_enabled = 0; |
377 | 377 | ||
378 | /* check the trace buffer */ | 378 | /* check the trace buffer */ |
379 | ret = trace_test_buffer(tr, &count); | 379 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
380 | tracing_start(); | 380 | tracing_start(); |
381 | 381 | ||
382 | /* we should only have one item */ | 382 | /* we should only have one item */ |
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
666 | ftrace_enabled = 0; | 666 | ftrace_enabled = 0; |
667 | 667 | ||
668 | /* check the trace buffer */ | 668 | /* check the trace buffer */ |
669 | ret = trace_test_buffer(tr, &count); | 669 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
670 | trace->reset(tr); | 670 | trace->reset(tr); |
671 | tracing_start(); | 671 | tracing_start(); |
672 | 672 | ||
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
703 | /* Maximum number of functions to trace before diagnosing a hang */ | 703 | /* Maximum number of functions to trace before diagnosing a hang */ |
704 | #define GRAPH_MAX_FUNC_TEST 100000000 | 704 | #define GRAPH_MAX_FUNC_TEST 100000000 |
705 | 705 | ||
706 | static void | ||
707 | __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); | ||
708 | static unsigned int graph_hang_thresh; | 706 | static unsigned int graph_hang_thresh; |
709 | 707 | ||
710 | /* Wrap the real function entry probe to avoid possible hanging */ | 708 | /* Wrap the real function entry probe to avoid possible hanging */ |
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) | |||
714 | if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { | 712 | if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { |
715 | ftrace_graph_stop(); | 713 | ftrace_graph_stop(); |
716 | printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); | 714 | printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); |
717 | if (ftrace_dump_on_oops) | 715 | if (ftrace_dump_on_oops) { |
718 | __ftrace_dump(false, DUMP_ALL); | 716 | ftrace_dump(DUMP_ALL); |
717 | /* ftrace_dump() disables tracing */ | ||
718 | tracing_on(); | ||
719 | } | ||
719 | return 0; | 720 | return 0; |
720 | } | 721 | } |
721 | 722 | ||
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, | |||
737 | * Simulate the init() callback but we attach a watchdog callback | 738 | * Simulate the init() callback but we attach a watchdog callback |
738 | * to detect and recover from possible hangs | 739 | * to detect and recover from possible hangs |
739 | */ | 740 | */ |
740 | tracing_reset_online_cpus(tr); | 741 | tracing_reset_online_cpus(&tr->trace_buffer); |
741 | set_graph_array(tr); | 742 | set_graph_array(tr); |
742 | ret = register_ftrace_graph(&trace_graph_return, | 743 | ret = register_ftrace_graph(&trace_graph_return, |
743 | &trace_graph_entry_watchdog); | 744 | &trace_graph_entry_watchdog); |
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, | |||
760 | tracing_stop(); | 761 | tracing_stop(); |
761 | 762 | ||
762 | /* check the trace buffer */ | 763 | /* check the trace buffer */ |
763 | ret = trace_test_buffer(tr, &count); | 764 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
764 | 765 | ||
765 | trace->reset(tr); | 766 | trace->reset(tr); |
766 | tracing_start(); | 767 | tracing_start(); |
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | |||
815 | /* stop the tracing. */ | 816 | /* stop the tracing. */ |
816 | tracing_stop(); | 817 | tracing_stop(); |
817 | /* check both trace buffers */ | 818 | /* check both trace buffers */ |
818 | ret = trace_test_buffer(tr, NULL); | 819 | ret = trace_test_buffer(&tr->trace_buffer, NULL); |
819 | if (!ret) | 820 | if (!ret) |
820 | ret = trace_test_buffer(&max_tr, &count); | 821 | ret = trace_test_buffer(&tr->max_buffer, &count); |
821 | trace->reset(tr); | 822 | trace->reset(tr); |
822 | tracing_start(); | 823 | tracing_start(); |
823 | 824 | ||
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | |||
877 | /* stop the tracing. */ | 878 | /* stop the tracing. */ |
878 | tracing_stop(); | 879 | tracing_stop(); |
879 | /* check both trace buffers */ | 880 | /* check both trace buffers */ |
880 | ret = trace_test_buffer(tr, NULL); | 881 | ret = trace_test_buffer(&tr->trace_buffer, NULL); |
881 | if (!ret) | 882 | if (!ret) |
882 | ret = trace_test_buffer(&max_tr, &count); | 883 | ret = trace_test_buffer(&tr->max_buffer, &count); |
883 | trace->reset(tr); | 884 | trace->reset(tr); |
884 | tracing_start(); | 885 | tracing_start(); |
885 | 886 | ||
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * | |||
943 | /* stop the tracing. */ | 944 | /* stop the tracing. */ |
944 | tracing_stop(); | 945 | tracing_stop(); |
945 | /* check both trace buffers */ | 946 | /* check both trace buffers */ |
946 | ret = trace_test_buffer(tr, NULL); | 947 | ret = trace_test_buffer(&tr->trace_buffer, NULL); |
947 | if (ret) | 948 | if (ret) |
948 | goto out; | 949 | goto out; |
949 | 950 | ||
950 | ret = trace_test_buffer(&max_tr, &count); | 951 | ret = trace_test_buffer(&tr->max_buffer, &count); |
951 | if (ret) | 952 | if (ret) |
952 | goto out; | 953 | goto out; |
953 | 954 | ||
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * | |||
973 | /* stop the tracing. */ | 974 | /* stop the tracing. */ |
974 | tracing_stop(); | 975 | tracing_stop(); |
975 | /* check both trace buffers */ | 976 | /* check both trace buffers */ |
976 | ret = trace_test_buffer(tr, NULL); | 977 | ret = trace_test_buffer(&tr->trace_buffer, NULL); |
977 | if (ret) | 978 | if (ret) |
978 | goto out; | 979 | goto out; |
979 | 980 | ||
980 | ret = trace_test_buffer(&max_tr, &count); | 981 | ret = trace_test_buffer(&tr->max_buffer, &count); |
981 | 982 | ||
982 | if (!ret && !count) { | 983 | if (!ret && !count) { |
983 | printk(KERN_CONT ".. no entries found .."); | 984 | printk(KERN_CONT ".. no entries found .."); |
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1084 | /* stop the tracing. */ | 1085 | /* stop the tracing. */ |
1085 | tracing_stop(); | 1086 | tracing_stop(); |
1086 | /* check both trace buffers */ | 1087 | /* check both trace buffers */ |
1087 | ret = trace_test_buffer(tr, NULL); | 1088 | ret = trace_test_buffer(&tr->trace_buffer, NULL); |
1088 | printk("ret = %d\n", ret); | 1089 | printk("ret = %d\n", ret); |
1089 | if (!ret) | 1090 | if (!ret) |
1090 | ret = trace_test_buffer(&max_tr, &count); | 1091 | ret = trace_test_buffer(&tr->max_buffer, &count); |
1091 | 1092 | ||
1092 | 1093 | ||
1093 | trace->reset(tr); | 1094 | trace->reset(tr); |
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr | |||
1126 | /* stop the tracing. */ | 1127 | /* stop the tracing. */ |
1127 | tracing_stop(); | 1128 | tracing_stop(); |
1128 | /* check the trace buffer */ | 1129 | /* check the trace buffer */ |
1129 | ret = trace_test_buffer(tr, &count); | 1130 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
1130 | trace->reset(tr); | 1131 | trace->reset(tr); |
1131 | tracing_start(); | 1132 | tracing_start(); |
1132 | 1133 | ||
@@ -1158,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | |||
1158 | /* stop the tracing. */ | 1159 | /* stop the tracing. */ |
1159 | tracing_stop(); | 1160 | tracing_stop(); |
1160 | /* check the trace buffer */ | 1161 | /* check the trace buffer */ |
1161 | ret = trace_test_buffer(tr, &count); | 1162 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
1162 | trace->reset(tr); | 1163 | trace->reset(tr); |
1163 | tracing_start(); | 1164 | tracing_start(); |
1164 | 1165 | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc701..b20428c5efe2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -20,13 +20,24 @@ | |||
20 | 20 | ||
21 | #define STACK_TRACE_ENTRIES 500 | 21 | #define STACK_TRACE_ENTRIES 500 |
22 | 22 | ||
23 | #ifdef CC_USING_FENTRY | ||
24 | # define fentry 1 | ||
25 | #else | ||
26 | # define fentry 0 | ||
27 | #endif | ||
28 | |||
23 | static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = | 29 | static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = |
24 | { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; | 30 | { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; |
25 | static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; | 31 | static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; |
26 | 32 | ||
33 | /* | ||
34 | * Reserve one entry for the passed in ip. This will allow | ||
35 | * us to remove most or all of the stack size overhead | ||
36 | * added by the stack tracer itself. | ||
37 | */ | ||
27 | static struct stack_trace max_stack_trace = { | 38 | static struct stack_trace max_stack_trace = { |
28 | .max_entries = STACK_TRACE_ENTRIES, | 39 | .max_entries = STACK_TRACE_ENTRIES - 1, |
29 | .entries = stack_dump_trace, | 40 | .entries = &stack_dump_trace[1], |
30 | }; | 41 | }; |
31 | 42 | ||
32 | static unsigned long max_stack_size; | 43 | static unsigned long max_stack_size; |
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex); | |||
39 | int stack_tracer_enabled; | 50 | int stack_tracer_enabled; |
40 | static int last_stack_tracer_enabled; | 51 | static int last_stack_tracer_enabled; |
41 | 52 | ||
42 | static inline void check_stack(void) | 53 | static inline void |
54 | check_stack(unsigned long ip, unsigned long *stack) | ||
43 | { | 55 | { |
44 | unsigned long this_size, flags; | 56 | unsigned long this_size, flags; |
45 | unsigned long *p, *top, *start; | 57 | unsigned long *p, *top, *start; |
58 | static int tracer_frame; | ||
59 | int frame_size = ACCESS_ONCE(tracer_frame); | ||
46 | int i; | 60 | int i; |
47 | 61 | ||
48 | this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); | 62 | this_size = ((unsigned long)stack) & (THREAD_SIZE-1); |
49 | this_size = THREAD_SIZE - this_size; | 63 | this_size = THREAD_SIZE - this_size; |
64 | /* Remove the frame of the tracer */ | ||
65 | this_size -= frame_size; | ||
50 | 66 | ||
51 | if (this_size <= max_stack_size) | 67 | if (this_size <= max_stack_size) |
52 | return; | 68 | return; |
53 | 69 | ||
54 | /* we do not handle interrupt stacks yet */ | 70 | /* we do not handle interrupt stacks yet */ |
55 | if (!object_is_on_stack(&this_size)) | 71 | if (!object_is_on_stack(stack)) |
56 | return; | 72 | return; |
57 | 73 | ||
58 | local_irq_save(flags); | 74 | local_irq_save(flags); |
59 | arch_spin_lock(&max_stack_lock); | 75 | arch_spin_lock(&max_stack_lock); |
60 | 76 | ||
77 | /* In case another CPU set the tracer_frame on us */ | ||
78 | if (unlikely(!frame_size)) | ||
79 | this_size -= tracer_frame; | ||
80 | |||
61 | /* a race could have already updated it */ | 81 | /* a race could have already updated it */ |
62 | if (this_size <= max_stack_size) | 82 | if (this_size <= max_stack_size) |
63 | goto out; | 83 | goto out; |
@@ -70,10 +90,18 @@ static inline void check_stack(void) | |||
70 | save_stack_trace(&max_stack_trace); | 90 | save_stack_trace(&max_stack_trace); |
71 | 91 | ||
72 | /* | 92 | /* |
93 | * Add the passed in ip from the function tracer. | ||
94 | * Searching for this on the stack will skip over | ||
95 | * most of the overhead from the stack tracer itself. | ||
96 | */ | ||
97 | stack_dump_trace[0] = ip; | ||
98 | max_stack_trace.nr_entries++; | ||
99 | |||
100 | /* | ||
73 | * Now find where in the stack these are. | 101 | * Now find where in the stack these are. |
74 | */ | 102 | */ |
75 | i = 0; | 103 | i = 0; |
76 | start = &this_size; | 104 | start = stack; |
77 | top = (unsigned long *) | 105 | top = (unsigned long *) |
78 | (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); | 106 | (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); |
79 | 107 | ||
@@ -97,6 +125,18 @@ static inline void check_stack(void) | |||
97 | found = 1; | 125 | found = 1; |
98 | /* Start the search from here */ | 126 | /* Start the search from here */ |
99 | start = p + 1; | 127 | start = p + 1; |
128 | /* | ||
129 | * We do not want to show the overhead | ||
130 | * of the stack tracer stack in the | ||
131 | * max stack. If we haven't figured | ||
132 | * out what that is, then figure it out | ||
133 | * now. | ||
134 | */ | ||
135 | if (unlikely(!tracer_frame) && i == 1) { | ||
136 | tracer_frame = (p - stack) * | ||
137 | sizeof(unsigned long); | ||
138 | max_stack_size -= tracer_frame; | ||
139 | } | ||
100 | } | 140 | } |
101 | } | 141 | } |
102 | 142 | ||
@@ -113,6 +153,7 @@ static void | |||
113 | stack_trace_call(unsigned long ip, unsigned long parent_ip, | 153 | stack_trace_call(unsigned long ip, unsigned long parent_ip, |
114 | struct ftrace_ops *op, struct pt_regs *pt_regs) | 154 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
115 | { | 155 | { |
156 | unsigned long stack; | ||
116 | int cpu; | 157 | int cpu; |
117 | 158 | ||
118 | preempt_disable_notrace(); | 159 | preempt_disable_notrace(); |
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
122 | if (per_cpu(trace_active, cpu)++ != 0) | 163 | if (per_cpu(trace_active, cpu)++ != 0) |
123 | goto out; | 164 | goto out; |
124 | 165 | ||
125 | check_stack(); | 166 | /* |
167 | * When fentry is used, the traced function does not get | ||
168 | * its stack frame set up, and we lose the parent. | ||
169 | * The ip is pretty useless because the function tracer | ||
170 | * was called before that function set up its stack frame. | ||
171 | * In this case, we use the parent ip. | ||
172 | * | ||
173 | * By adding the return address of either the parent ip | ||
174 | * or the current ip we can disregard most of the stack usage | ||
175 | * caused by the stack tracer itself. | ||
176 | * | ||
177 | * The function tracer always reports the address of where the | ||
178 | * mcount call was, but the stack will hold the return address. | ||
179 | */ | ||
180 | if (fentry) | ||
181 | ip = parent_ip; | ||
182 | else | ||
183 | ip += MCOUNT_INSN_SIZE; | ||
184 | |||
185 | check_stack(ip, &stack); | ||
126 | 186 | ||
127 | out: | 187 | out: |
128 | per_cpu(trace_active, cpu)--; | 188 | per_cpu(trace_active, cpu)--; |
@@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = { | |||
322 | .open = stack_trace_filter_open, | 382 | .open = stack_trace_filter_open, |
323 | .read = seq_read, | 383 | .read = seq_read, |
324 | .write = ftrace_filter_write, | 384 | .write = ftrace_filter_write, |
325 | .llseek = ftrace_regex_lseek, | 385 | .llseek = ftrace_filter_lseek, |
326 | .release = ftrace_regex_release, | 386 | .release = ftrace_regex_release, |
327 | }; | 387 | }; |
328 | 388 | ||
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void) | |||
371 | struct dentry *d_tracer; | 431 | struct dentry *d_tracer; |
372 | 432 | ||
373 | d_tracer = tracing_init_dentry(); | 433 | d_tracer = tracing_init_dentry(); |
434 | if (!d_tracer) | ||
435 | return 0; | ||
374 | 436 | ||
375 | trace_create_file("stack_max_size", 0644, d_tracer, | 437 | trace_create_file("stack_max_size", 0644, d_tracer, |
376 | &max_stack_size, &stack_max_size_fops); | 438 | &max_stack_size, &stack_max_size_fops); |
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e73..847f88a6194b 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
@@ -307,6 +307,8 @@ static int tracing_stat_init(void) | |||
307 | struct dentry *d_tracing; | 307 | struct dentry *d_tracing; |
308 | 308 | ||
309 | d_tracing = tracing_init_dentry(); | 309 | d_tracing = tracing_init_dentry(); |
310 | if (!d_tracing) | ||
311 | return 0; | ||
310 | 312 | ||
311 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); | 313 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); |
312 | if (!stat_dir) | 314 | if (!stat_dir) |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a809e321058..8f2ac73c7a5f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -12,10 +12,6 @@ | |||
12 | #include "trace.h" | 12 | #include "trace.h" |
13 | 13 | ||
14 | static DEFINE_MUTEX(syscall_trace_lock); | 14 | static DEFINE_MUTEX(syscall_trace_lock); |
15 | static int sys_refcount_enter; | ||
16 | static int sys_refcount_exit; | ||
17 | static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); | ||
18 | static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); | ||
19 | 15 | ||
20 | static int syscall_enter_register(struct ftrace_event_call *event, | 16 | static int syscall_enter_register(struct ftrace_event_call *event, |
21 | enum trace_reg type, void *data); | 17 | enum trace_reg type, void *data); |
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name | |||
41 | /* | 37 | /* |
42 | * Only compare after the "sys" prefix. Archs that use | 38 | * Only compare after the "sys" prefix. Archs that use |
43 | * syscall wrappers may have syscalls symbols aliases prefixed | 39 | * syscall wrappers may have syscalls symbols aliases prefixed |
44 | * with "SyS" instead of "sys", leading to an unwanted | 40 | * with ".SyS" or ".sys" instead of "sys", leading to an unwanted |
45 | * mismatch. | 41 | * mismatch. |
46 | */ | 42 | */ |
47 | return !strcmp(sym + 3, name + 3); | 43 | return !strcmp(sym + 3, name + 3); |
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) | |||
265 | kfree(call->print_fmt); | 261 | kfree(call->print_fmt); |
266 | } | 262 | } |
267 | 263 | ||
268 | static int syscall_enter_define_fields(struct ftrace_event_call *call) | 264 | static int __init syscall_enter_define_fields(struct ftrace_event_call *call) |
269 | { | 265 | { |
270 | struct syscall_trace_enter trace; | 266 | struct syscall_trace_enter trace; |
271 | struct syscall_metadata *meta = call->data; | 267 | struct syscall_metadata *meta = call->data; |
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call) | |||
288 | return ret; | 284 | return ret; |
289 | } | 285 | } |
290 | 286 | ||
291 | static int syscall_exit_define_fields(struct ftrace_event_call *call) | 287 | static int __init syscall_exit_define_fields(struct ftrace_event_call *call) |
292 | { | 288 | { |
293 | struct syscall_trace_exit trace; | 289 | struct syscall_trace_exit trace; |
294 | int ret; | 290 | int ret; |
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) | |||
303 | return ret; | 299 | return ret; |
304 | } | 300 | } |
305 | 301 | ||
306 | static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) | 302 | static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) |
307 | { | 303 | { |
304 | struct trace_array *tr = data; | ||
308 | struct syscall_trace_enter *entry; | 305 | struct syscall_trace_enter *entry; |
309 | struct syscall_metadata *sys_data; | 306 | struct syscall_metadata *sys_data; |
310 | struct ring_buffer_event *event; | 307 | struct ring_buffer_event *event; |
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
315 | syscall_nr = trace_get_syscall_nr(current, regs); | 312 | syscall_nr = trace_get_syscall_nr(current, regs); |
316 | if (syscall_nr < 0) | 313 | if (syscall_nr < 0) |
317 | return; | 314 | return; |
318 | if (!test_bit(syscall_nr, enabled_enter_syscalls)) | 315 | if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) |
319 | return; | 316 | return; |
320 | 317 | ||
321 | sys_data = syscall_nr_to_meta(syscall_nr); | 318 | sys_data = syscall_nr_to_meta(syscall_nr); |
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
324 | 321 | ||
325 | size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; | 322 | size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; |
326 | 323 | ||
327 | event = trace_current_buffer_lock_reserve(&buffer, | 324 | buffer = tr->trace_buffer.buffer; |
325 | event = trace_buffer_lock_reserve(buffer, | ||
328 | sys_data->enter_event->event.type, size, 0, 0); | 326 | sys_data->enter_event->event.type, size, 0, 0); |
329 | if (!event) | 327 | if (!event) |
330 | return; | 328 | return; |
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
338 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); | 336 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); |
339 | } | 337 | } |
340 | 338 | ||
341 | static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | 339 | static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) |
342 | { | 340 | { |
341 | struct trace_array *tr = data; | ||
343 | struct syscall_trace_exit *entry; | 342 | struct syscall_trace_exit *entry; |
344 | struct syscall_metadata *sys_data; | 343 | struct syscall_metadata *sys_data; |
345 | struct ring_buffer_event *event; | 344 | struct ring_buffer_event *event; |
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
349 | syscall_nr = trace_get_syscall_nr(current, regs); | 348 | syscall_nr = trace_get_syscall_nr(current, regs); |
350 | if (syscall_nr < 0) | 349 | if (syscall_nr < 0) |
351 | return; | 350 | return; |
352 | if (!test_bit(syscall_nr, enabled_exit_syscalls)) | 351 | if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) |
353 | return; | 352 | return; |
354 | 353 | ||
355 | sys_data = syscall_nr_to_meta(syscall_nr); | 354 | sys_data = syscall_nr_to_meta(syscall_nr); |
356 | if (!sys_data) | 355 | if (!sys_data) |
357 | return; | 356 | return; |
358 | 357 | ||
359 | event = trace_current_buffer_lock_reserve(&buffer, | 358 | buffer = tr->trace_buffer.buffer; |
359 | event = trace_buffer_lock_reserve(buffer, | ||
360 | sys_data->exit_event->event.type, sizeof(*entry), 0, 0); | 360 | sys_data->exit_event->event.type, sizeof(*entry), 0, 0); |
361 | if (!event) | 361 | if (!event) |
362 | return; | 362 | return; |
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
370 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); | 370 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); |
371 | } | 371 | } |
372 | 372 | ||
373 | static int reg_event_syscall_enter(struct ftrace_event_call *call) | 373 | static int reg_event_syscall_enter(struct ftrace_event_file *file, |
374 | struct ftrace_event_call *call) | ||
374 | { | 375 | { |
376 | struct trace_array *tr = file->tr; | ||
375 | int ret = 0; | 377 | int ret = 0; |
376 | int num; | 378 | int num; |
377 | 379 | ||
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call) | |||
379 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) | 381 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
380 | return -ENOSYS; | 382 | return -ENOSYS; |
381 | mutex_lock(&syscall_trace_lock); | 383 | mutex_lock(&syscall_trace_lock); |
382 | if (!sys_refcount_enter) | 384 | if (!tr->sys_refcount_enter) |
383 | ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); | 385 | ret = register_trace_sys_enter(ftrace_syscall_enter, tr); |
384 | if (!ret) { | 386 | if (!ret) { |
385 | set_bit(num, enabled_enter_syscalls); | 387 | set_bit(num, tr->enabled_enter_syscalls); |
386 | sys_refcount_enter++; | 388 | tr->sys_refcount_enter++; |
387 | } | 389 | } |
388 | mutex_unlock(&syscall_trace_lock); | 390 | mutex_unlock(&syscall_trace_lock); |
389 | return ret; | 391 | return ret; |
390 | } | 392 | } |
391 | 393 | ||
392 | static void unreg_event_syscall_enter(struct ftrace_event_call *call) | 394 | static void unreg_event_syscall_enter(struct ftrace_event_file *file, |
395 | struct ftrace_event_call *call) | ||
393 | { | 396 | { |
397 | struct trace_array *tr = file->tr; | ||
394 | int num; | 398 | int num; |
395 | 399 | ||
396 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 400 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
397 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) | 401 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
398 | return; | 402 | return; |
399 | mutex_lock(&syscall_trace_lock); | 403 | mutex_lock(&syscall_trace_lock); |
400 | sys_refcount_enter--; | 404 | tr->sys_refcount_enter--; |
401 | clear_bit(num, enabled_enter_syscalls); | 405 | clear_bit(num, tr->enabled_enter_syscalls); |
402 | if (!sys_refcount_enter) | 406 | if (!tr->sys_refcount_enter) |
403 | unregister_trace_sys_enter(ftrace_syscall_enter, NULL); | 407 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); |
404 | mutex_unlock(&syscall_trace_lock); | 408 | mutex_unlock(&syscall_trace_lock); |
405 | } | 409 | } |
406 | 410 | ||
407 | static int reg_event_syscall_exit(struct ftrace_event_call *call) | 411 | static int reg_event_syscall_exit(struct ftrace_event_file *file, |
412 | struct ftrace_event_call *call) | ||
408 | { | 413 | { |
414 | struct trace_array *tr = file->tr; | ||
409 | int ret = 0; | 415 | int ret = 0; |
410 | int num; | 416 | int num; |
411 | 417 | ||
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call) | |||
413 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) | 419 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
414 | return -ENOSYS; | 420 | return -ENOSYS; |
415 | mutex_lock(&syscall_trace_lock); | 421 | mutex_lock(&syscall_trace_lock); |
416 | if (!sys_refcount_exit) | 422 | if (!tr->sys_refcount_exit) |
417 | ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); | 423 | ret = register_trace_sys_exit(ftrace_syscall_exit, tr); |
418 | if (!ret) { | 424 | if (!ret) { |
419 | set_bit(num, enabled_exit_syscalls); | 425 | set_bit(num, tr->enabled_exit_syscalls); |
420 | sys_refcount_exit++; | 426 | tr->sys_refcount_exit++; |
421 | } | 427 | } |
422 | mutex_unlock(&syscall_trace_lock); | 428 | mutex_unlock(&syscall_trace_lock); |
423 | return ret; | 429 | return ret; |
424 | } | 430 | } |
425 | 431 | ||
426 | static void unreg_event_syscall_exit(struct ftrace_event_call *call) | 432 | static void unreg_event_syscall_exit(struct ftrace_event_file *file, |
433 | struct ftrace_event_call *call) | ||
427 | { | 434 | { |
435 | struct trace_array *tr = file->tr; | ||
428 | int num; | 436 | int num; |
429 | 437 | ||
430 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 438 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
431 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) | 439 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
432 | return; | 440 | return; |
433 | mutex_lock(&syscall_trace_lock); | 441 | mutex_lock(&syscall_trace_lock); |
434 | sys_refcount_exit--; | 442 | tr->sys_refcount_exit--; |
435 | clear_bit(num, enabled_exit_syscalls); | 443 | clear_bit(num, tr->enabled_exit_syscalls); |
436 | if (!sys_refcount_exit) | 444 | if (!tr->sys_refcount_exit) |
437 | unregister_trace_sys_exit(ftrace_syscall_exit, NULL); | 445 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); |
438 | mutex_unlock(&syscall_trace_lock); | 446 | mutex_unlock(&syscall_trace_lock); |
439 | } | 447 | } |
440 | 448 | ||
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = { | |||
471 | .trace = print_syscall_exit, | 479 | .trace = print_syscall_exit, |
472 | }; | 480 | }; |
473 | 481 | ||
474 | struct ftrace_event_class event_class_syscall_enter = { | 482 | struct ftrace_event_class __refdata event_class_syscall_enter = { |
475 | .system = "syscalls", | 483 | .system = "syscalls", |
476 | .reg = syscall_enter_register, | 484 | .reg = syscall_enter_register, |
477 | .define_fields = syscall_enter_define_fields, | 485 | .define_fields = syscall_enter_define_fields, |
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = { | |||
479 | .raw_init = init_syscall_trace, | 487 | .raw_init = init_syscall_trace, |
480 | }; | 488 | }; |
481 | 489 | ||
482 | struct ftrace_event_class event_class_syscall_exit = { | 490 | struct ftrace_event_class __refdata event_class_syscall_exit = { |
483 | .system = "syscalls", | 491 | .system = "syscalls", |
484 | .reg = syscall_exit_register, | 492 | .reg = syscall_exit_register, |
485 | .define_fields = syscall_exit_define_fields, | 493 | .define_fields = syscall_exit_define_fields, |
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call) | |||
685 | static int syscall_enter_register(struct ftrace_event_call *event, | 693 | static int syscall_enter_register(struct ftrace_event_call *event, |
686 | enum trace_reg type, void *data) | 694 | enum trace_reg type, void *data) |
687 | { | 695 | { |
696 | struct ftrace_event_file *file = data; | ||
697 | |||
688 | switch (type) { | 698 | switch (type) { |
689 | case TRACE_REG_REGISTER: | 699 | case TRACE_REG_REGISTER: |
690 | return reg_event_syscall_enter(event); | 700 | return reg_event_syscall_enter(file, event); |
691 | case TRACE_REG_UNREGISTER: | 701 | case TRACE_REG_UNREGISTER: |
692 | unreg_event_syscall_enter(event); | 702 | unreg_event_syscall_enter(file, event); |
693 | return 0; | 703 | return 0; |
694 | 704 | ||
695 | #ifdef CONFIG_PERF_EVENTS | 705 | #ifdef CONFIG_PERF_EVENTS |
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event, | |||
711 | static int syscall_exit_register(struct ftrace_event_call *event, | 721 | static int syscall_exit_register(struct ftrace_event_call *event, |
712 | enum trace_reg type, void *data) | 722 | enum trace_reg type, void *data) |
713 | { | 723 | { |
724 | struct ftrace_event_file *file = data; | ||
725 | |||
714 | switch (type) { | 726 | switch (type) { |
715 | case TRACE_REG_REGISTER: | 727 | case TRACE_REG_REGISTER: |
716 | return reg_event_syscall_exit(event); | 728 | return reg_event_syscall_exit(file, event); |
717 | case TRACE_REG_UNREGISTER: | 729 | case TRACE_REG_UNREGISTER: |
718 | unreg_event_syscall_exit(event); | 730 | unreg_event_syscall_exit(file, event); |
719 | return 0; | 731 | return 0; |
720 | 732 | ||
721 | #ifdef CONFIG_PERF_EVENTS | 733 | #ifdef CONFIG_PERF_EVENTS |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8dad2a92dee9..32494fb0ee64 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -28,6 +28,18 @@ | |||
28 | 28 | ||
29 | #define UPROBE_EVENT_SYSTEM "uprobes" | 29 | #define UPROBE_EVENT_SYSTEM "uprobes" |
30 | 30 | ||
31 | struct uprobe_trace_entry_head { | ||
32 | struct trace_entry ent; | ||
33 | unsigned long vaddr[]; | ||
34 | }; | ||
35 | |||
36 | #define SIZEOF_TRACE_ENTRY(is_return) \ | ||
37 | (sizeof(struct uprobe_trace_entry_head) + \ | ||
38 | sizeof(unsigned long) * (is_return ? 2 : 1)) | ||
39 | |||
40 | #define DATAOF_TRACE_ENTRY(entry, is_return) \ | ||
41 | ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) | ||
42 | |||
31 | struct trace_uprobe_filter { | 43 | struct trace_uprobe_filter { |
32 | rwlock_t rwlock; | 44 | rwlock_t rwlock; |
33 | int nr_systemwide; | 45 | int nr_systemwide; |
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock); | |||
64 | static LIST_HEAD(uprobe_list); | 76 | static LIST_HEAD(uprobe_list); |
65 | 77 | ||
66 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); | 78 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); |
79 | static int uretprobe_dispatcher(struct uprobe_consumer *con, | ||
80 | unsigned long func, struct pt_regs *regs); | ||
67 | 81 | ||
68 | static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) | 82 | static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) |
69 | { | 83 | { |
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) | |||
77 | return !filter->nr_systemwide && list_empty(&filter->perf_events); | 91 | return !filter->nr_systemwide && list_empty(&filter->perf_events); |
78 | } | 92 | } |
79 | 93 | ||
94 | static inline bool is_ret_probe(struct trace_uprobe *tu) | ||
95 | { | ||
96 | return tu->consumer.ret_handler != NULL; | ||
97 | } | ||
98 | |||
80 | /* | 99 | /* |
81 | * Allocate new trace_uprobe and initialize it (including uprobes). | 100 | * Allocate new trace_uprobe and initialize it (including uprobes). |
82 | */ | 101 | */ |
83 | static struct trace_uprobe * | 102 | static struct trace_uprobe * |
84 | alloc_trace_uprobe(const char *group, const char *event, int nargs) | 103 | alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) |
85 | { | 104 | { |
86 | struct trace_uprobe *tu; | 105 | struct trace_uprobe *tu; |
87 | 106 | ||
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) | |||
106 | 125 | ||
107 | INIT_LIST_HEAD(&tu->list); | 126 | INIT_LIST_HEAD(&tu->list); |
108 | tu->consumer.handler = uprobe_dispatcher; | 127 | tu->consumer.handler = uprobe_dispatcher; |
128 | if (is_ret) | ||
129 | tu->consumer.ret_handler = uretprobe_dispatcher; | ||
109 | init_trace_uprobe_filter(&tu->filter); | 130 | init_trace_uprobe_filter(&tu->filter); |
110 | return tu; | 131 | return tu; |
111 | 132 | ||
@@ -180,7 +201,7 @@ end: | |||
180 | 201 | ||
181 | /* | 202 | /* |
182 | * Argument syntax: | 203 | * Argument syntax: |
183 | * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] | 204 | * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] |
184 | * | 205 | * |
185 | * - Remove uprobe: -:[GRP/]EVENT | 206 | * - Remove uprobe: -:[GRP/]EVENT |
186 | */ | 207 | */ |
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv) | |||
192 | char buf[MAX_EVENT_NAME_LEN]; | 213 | char buf[MAX_EVENT_NAME_LEN]; |
193 | struct path path; | 214 | struct path path; |
194 | unsigned long offset; | 215 | unsigned long offset; |
195 | bool is_delete; | 216 | bool is_delete, is_return; |
196 | int i, ret; | 217 | int i, ret; |
197 | 218 | ||
198 | inode = NULL; | 219 | inode = NULL; |
199 | ret = 0; | 220 | ret = 0; |
200 | is_delete = false; | 221 | is_delete = false; |
222 | is_return = false; | ||
201 | event = NULL; | 223 | event = NULL; |
202 | group = NULL; | 224 | group = NULL; |
203 | 225 | ||
204 | /* argc must be >= 1 */ | 226 | /* argc must be >= 1 */ |
205 | if (argv[0][0] == '-') | 227 | if (argv[0][0] == '-') |
206 | is_delete = true; | 228 | is_delete = true; |
229 | else if (argv[0][0] == 'r') | ||
230 | is_return = true; | ||
207 | else if (argv[0][0] != 'p') { | 231 | else if (argv[0][0] != 'p') { |
208 | pr_info("Probe definition must be started with 'p' or '-'.\n"); | 232 | pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); |
209 | return -EINVAL; | 233 | return -EINVAL; |
210 | } | 234 | } |
211 | 235 | ||
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv) | |||
303 | kfree(tail); | 327 | kfree(tail); |
304 | } | 328 | } |
305 | 329 | ||
306 | tu = alloc_trace_uprobe(group, event, argc); | 330 | tu = alloc_trace_uprobe(group, event, argc, is_return); |
307 | if (IS_ERR(tu)) { | 331 | if (IS_ERR(tu)) { |
308 | pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); | 332 | pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); |
309 | ret = PTR_ERR(tu); | 333 | ret = PTR_ERR(tu); |
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v) | |||
414 | static int probes_seq_show(struct seq_file *m, void *v) | 438 | static int probes_seq_show(struct seq_file *m, void *v) |
415 | { | 439 | { |
416 | struct trace_uprobe *tu = v; | 440 | struct trace_uprobe *tu = v; |
441 | char c = is_ret_probe(tu) ? 'r' : 'p'; | ||
417 | int i; | 442 | int i; |
418 | 443 | ||
419 | seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); | 444 | seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); |
420 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); | 445 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); |
421 | 446 | ||
422 | for (i = 0; i < tu->nr_args; i++) | 447 | for (i = 0; i < tu->nr_args; i++) |
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = { | |||
485 | .release = seq_release, | 510 | .release = seq_release, |
486 | }; | 511 | }; |
487 | 512 | ||
488 | /* uprobe handler */ | 513 | static void uprobe_trace_print(struct trace_uprobe *tu, |
489 | static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | 514 | unsigned long func, struct pt_regs *regs) |
490 | { | 515 | { |
491 | struct uprobe_trace_entry_head *entry; | 516 | struct uprobe_trace_entry_head *entry; |
492 | struct ring_buffer_event *event; | 517 | struct ring_buffer_event *event; |
493 | struct ring_buffer *buffer; | 518 | struct ring_buffer *buffer; |
494 | u8 *data; | 519 | void *data; |
495 | int size, i, pc; | 520 | int size, i; |
496 | unsigned long irq_flags; | ||
497 | struct ftrace_event_call *call = &tu->call; | 521 | struct ftrace_event_call *call = &tu->call; |
498 | 522 | ||
499 | local_save_flags(irq_flags); | 523 | size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); |
500 | pc = preempt_count(); | ||
501 | |||
502 | size = sizeof(*entry) + tu->size; | ||
503 | |||
504 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 524 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
505 | size, irq_flags, pc); | 525 | size + tu->size, 0, 0); |
506 | if (!event) | 526 | if (!event) |
507 | return 0; | 527 | return; |
508 | 528 | ||
509 | entry = ring_buffer_event_data(event); | 529 | entry = ring_buffer_event_data(event); |
510 | entry->ip = instruction_pointer(task_pt_regs(current)); | 530 | if (is_ret_probe(tu)) { |
511 | data = (u8 *)&entry[1]; | 531 | entry->vaddr[0] = func; |
532 | entry->vaddr[1] = instruction_pointer(regs); | ||
533 | data = DATAOF_TRACE_ENTRY(entry, true); | ||
534 | } else { | ||
535 | entry->vaddr[0] = instruction_pointer(regs); | ||
536 | data = DATAOF_TRACE_ENTRY(entry, false); | ||
537 | } | ||
538 | |||
512 | for (i = 0; i < tu->nr_args; i++) | 539 | for (i = 0; i < tu->nr_args; i++) |
513 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 540 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); |
514 | 541 | ||
515 | if (!filter_current_check_discard(buffer, call, entry, event)) | 542 | if (!filter_current_check_discard(buffer, call, entry, event)) |
516 | trace_buffer_unlock_commit(buffer, event, irq_flags, pc); | 543 | trace_buffer_unlock_commit(buffer, event, 0, 0); |
544 | } | ||
517 | 545 | ||
546 | /* uprobe handler */ | ||
547 | static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
548 | { | ||
549 | if (!is_ret_probe(tu)) | ||
550 | uprobe_trace_print(tu, 0, regs); | ||
518 | return 0; | 551 | return 0; |
519 | } | 552 | } |
520 | 553 | ||
554 | static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, | ||
555 | struct pt_regs *regs) | ||
556 | { | ||
557 | uprobe_trace_print(tu, func, regs); | ||
558 | } | ||
559 | |||
521 | /* Event entry printers */ | 560 | /* Event entry printers */ |
522 | static enum print_line_t | 561 | static enum print_line_t |
523 | print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) | 562 | print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) |
524 | { | 563 | { |
525 | struct uprobe_trace_entry_head *field; | 564 | struct uprobe_trace_entry_head *entry; |
526 | struct trace_seq *s = &iter->seq; | 565 | struct trace_seq *s = &iter->seq; |
527 | struct trace_uprobe *tu; | 566 | struct trace_uprobe *tu; |
528 | u8 *data; | 567 | u8 *data; |
529 | int i; | 568 | int i; |
530 | 569 | ||
531 | field = (struct uprobe_trace_entry_head *)iter->ent; | 570 | entry = (struct uprobe_trace_entry_head *)iter->ent; |
532 | tu = container_of(event, struct trace_uprobe, call.event); | 571 | tu = container_of(event, struct trace_uprobe, call.event); |
533 | 572 | ||
534 | if (!trace_seq_printf(s, "%s: (", tu->call.name)) | 573 | if (is_ret_probe(tu)) { |
535 | goto partial; | 574 | if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, |
536 | 575 | entry->vaddr[1], entry->vaddr[0])) | |
537 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) | 576 | goto partial; |
538 | goto partial; | 577 | data = DATAOF_TRACE_ENTRY(entry, true); |
539 | 578 | } else { | |
540 | if (!trace_seq_puts(s, ")")) | 579 | if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, |
541 | goto partial; | 580 | entry->vaddr[0])) |
581 | goto partial; | ||
582 | data = DATAOF_TRACE_ENTRY(entry, false); | ||
583 | } | ||
542 | 584 | ||
543 | data = (u8 *)&field[1]; | ||
544 | for (i = 0; i < tu->nr_args; i++) { | 585 | for (i = 0; i < tu->nr_args; i++) { |
545 | if (!tu->args[i].type->print(s, tu->args[i].name, | 586 | if (!tu->args[i].type->print(s, tu->args[i].name, |
546 | data + tu->args[i].offset, field)) | 587 | data + tu->args[i].offset, entry)) |
547 | goto partial; | 588 | goto partial; |
548 | } | 589 | } |
549 | 590 | ||
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) | |||
595 | 636 | ||
596 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | 637 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) |
597 | { | 638 | { |
598 | int ret, i; | 639 | int ret, i, size; |
599 | struct uprobe_trace_entry_head field; | 640 | struct uprobe_trace_entry_head field; |
600 | struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; | 641 | struct trace_uprobe *tu = event_call->data; |
601 | 642 | ||
602 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | 643 | if (is_ret_probe(tu)) { |
644 | DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); | ||
645 | DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); | ||
646 | size = SIZEOF_TRACE_ENTRY(true); | ||
647 | } else { | ||
648 | DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); | ||
649 | size = SIZEOF_TRACE_ENTRY(false); | ||
650 | } | ||
603 | /* Set argument names as fields */ | 651 | /* Set argument names as fields */ |
604 | for (i = 0; i < tu->nr_args; i++) { | 652 | for (i = 0; i < tu->nr_args; i++) { |
605 | ret = trace_define_field(event_call, tu->args[i].type->fmttype, | 653 | ret = trace_define_field(event_call, tu->args[i].type->fmttype, |
606 | tu->args[i].name, | 654 | tu->args[i].name, |
607 | sizeof(field) + tu->args[i].offset, | 655 | size + tu->args[i].offset, |
608 | tu->args[i].type->size, | 656 | tu->args[i].type->size, |
609 | tu->args[i].type->is_signed, | 657 | tu->args[i].type->is_signed, |
610 | FILTER_OTHER); | 658 | FILTER_OTHER); |
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) | |||
622 | int i; | 670 | int i; |
623 | int pos = 0; | 671 | int pos = 0; |
624 | 672 | ||
625 | fmt = "(%lx)"; | 673 | if (is_ret_probe(tu)) { |
626 | arg = "REC->" FIELD_STRING_IP; | 674 | fmt = "(%lx <- %lx)"; |
675 | arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; | ||
676 | } else { | ||
677 | fmt = "(%lx)"; | ||
678 | arg = "REC->" FIELD_STRING_IP; | ||
679 | } | ||
627 | 680 | ||
628 | /* When len=0, we just calculate the needed length */ | 681 | /* When len=0, we just calculate the needed length */ |
629 | 682 | ||
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, | |||
752 | return ret; | 805 | return ret; |
753 | } | 806 | } |
754 | 807 | ||
755 | /* uprobe profile handler */ | 808 | static void uprobe_perf_print(struct trace_uprobe *tu, |
756 | static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | 809 | unsigned long func, struct pt_regs *regs) |
757 | { | 810 | { |
758 | struct ftrace_event_call *call = &tu->call; | 811 | struct ftrace_event_call *call = &tu->call; |
759 | struct uprobe_trace_entry_head *entry; | 812 | struct uprobe_trace_entry_head *entry; |
760 | struct hlist_head *head; | 813 | struct hlist_head *head; |
761 | u8 *data; | 814 | void *data; |
762 | int size, __size, i; | 815 | int size, rctx, i; |
763 | int rctx; | ||
764 | 816 | ||
765 | if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) | 817 | size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); |
766 | return UPROBE_HANDLER_REMOVE; | 818 | size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); |
767 | |||
768 | __size = sizeof(*entry) + tu->size; | ||
769 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | ||
770 | size -= sizeof(u32); | ||
771 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) | 819 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) |
772 | return 0; | 820 | return; |
773 | 821 | ||
774 | preempt_disable(); | 822 | preempt_disable(); |
823 | head = this_cpu_ptr(call->perf_events); | ||
824 | if (hlist_empty(head)) | ||
825 | goto out; | ||
775 | 826 | ||
776 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 827 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); |
777 | if (!entry) | 828 | if (!entry) |
778 | goto out; | 829 | goto out; |
779 | 830 | ||
780 | entry->ip = instruction_pointer(task_pt_regs(current)); | 831 | if (is_ret_probe(tu)) { |
781 | data = (u8 *)&entry[1]; | 832 | entry->vaddr[0] = func; |
833 | entry->vaddr[1] = instruction_pointer(regs); | ||
834 | data = DATAOF_TRACE_ENTRY(entry, true); | ||
835 | } else { | ||
836 | entry->vaddr[0] = instruction_pointer(regs); | ||
837 | data = DATAOF_TRACE_ENTRY(entry, false); | ||
838 | } | ||
839 | |||
782 | for (i = 0; i < tu->nr_args; i++) | 840 | for (i = 0; i < tu->nr_args; i++) |
783 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 841 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); |
784 | 842 | ||
785 | head = this_cpu_ptr(call->perf_events); | 843 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); |
786 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); | ||
787 | |||
788 | out: | 844 | out: |
789 | preempt_enable(); | 845 | preempt_enable(); |
846 | } | ||
847 | |||
848 | /* uprobe profile handler */ | ||
849 | static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
850 | { | ||
851 | if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) | ||
852 | return UPROBE_HANDLER_REMOVE; | ||
853 | |||
854 | if (!is_ret_probe(tu)) | ||
855 | uprobe_perf_print(tu, 0, regs); | ||
790 | return 0; | 856 | return 0; |
791 | } | 857 | } |
858 | |||
859 | static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, | ||
860 | struct pt_regs *regs) | ||
861 | { | ||
862 | uprobe_perf_print(tu, func, regs); | ||
863 | } | ||
792 | #endif /* CONFIG_PERF_EVENTS */ | 864 | #endif /* CONFIG_PERF_EVENTS */ |
793 | 865 | ||
794 | static | 866 | static |
795 | int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) | 867 | int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) |
796 | { | 868 | { |
797 | struct trace_uprobe *tu = (struct trace_uprobe *)event->data; | 869 | struct trace_uprobe *tu = event->data; |
798 | 870 | ||
799 | switch (type) { | 871 | switch (type) { |
800 | case TRACE_REG_REGISTER: | 872 | case TRACE_REG_REGISTER: |
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) | |||
843 | return ret; | 915 | return ret; |
844 | } | 916 | } |
845 | 917 | ||
918 | static int uretprobe_dispatcher(struct uprobe_consumer *con, | ||
919 | unsigned long func, struct pt_regs *regs) | ||
920 | { | ||
921 | struct trace_uprobe *tu; | ||
922 | |||
923 | tu = container_of(con, struct trace_uprobe, consumer); | ||
924 | |||
925 | if (tu->flags & TP_FLAG_TRACE) | ||
926 | uretprobe_trace_func(tu, func, regs); | ||
927 | |||
928 | #ifdef CONFIG_PERF_EVENTS | ||
929 | if (tu->flags & TP_FLAG_PROFILE) | ||
930 | uretprobe_perf_func(tu, func, regs); | ||
931 | #endif | ||
932 | return 0; | ||
933 | } | ||
934 | |||
846 | static struct trace_event_functions uprobe_funcs = { | 935 | static struct trace_event_functions uprobe_funcs = { |
847 | .trace = print_uprobe_event | 936 | .trace = print_uprobe_event |
848 | }; | 937 | }; |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0c05a4592047..29f26540e9c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, | |||
112 | int nr_probes = 0; | 112 | int nr_probes = 0; |
113 | struct tracepoint_func *old, *new; | 113 | struct tracepoint_func *old, *new; |
114 | 114 | ||
115 | WARN_ON(!probe); | 115 | if (WARN_ON(!probe)) |
116 | return ERR_PTR(-EINVAL); | ||
116 | 117 | ||
117 | debug_print_probes(entry); | 118 | debug_print_probes(entry); |
118 | old = entry->funcs; | 119 | old = entry->funcs; |
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, | |||
152 | 153 | ||
153 | debug_print_probes(entry); | 154 | debug_print_probes(entry); |
154 | /* (N -> M), (N > 1, M >= 0) probes */ | 155 | /* (N -> M), (N > 1, M >= 0) probes */ |
155 | for (nr_probes = 0; old[nr_probes].func; nr_probes++) { | 156 | if (probe) { |
156 | if (!probe || | 157 | for (nr_probes = 0; old[nr_probes].func; nr_probes++) { |
157 | (old[nr_probes].func == probe && | 158 | if (old[nr_probes].func == probe && |
158 | old[nr_probes].data == data)) | 159 | old[nr_probes].data == data) |
159 | nr_del++; | 160 | nr_del++; |
161 | } | ||
160 | } | 162 | } |
161 | 163 | ||
164 | /* | ||
165 | * If probe is NULL, then nr_probes = nr_del = 0, and then the | ||
166 | * entire entry will be removed. | ||
167 | */ | ||
162 | if (nr_probes - nr_del == 0) { | 168 | if (nr_probes - nr_del == 0) { |
163 | /* N -> 0, (N > 1) */ | 169 | /* N -> 0, (N > 1) */ |
164 | entry->funcs = NULL; | 170 | entry->funcs = NULL; |
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, | |||
173 | if (new == NULL) | 179 | if (new == NULL) |
174 | return ERR_PTR(-ENOMEM); | 180 | return ERR_PTR(-ENOMEM); |
175 | for (i = 0; old[i].func; i++) | 181 | for (i = 0; old[i].func; i++) |
176 | if (probe && | 182 | if (old[i].func != probe || old[i].data != data) |
177 | (old[i].func != probe || old[i].data != data)) | ||
178 | new[j++] = old[i]; | 183 | new[j++] = old[i]; |
179 | new[nr_probes - nr_del].func = NULL; | 184 | new[nr_probes - nr_del].func = NULL; |
180 | entry->refcount = nr_probes - nr_del; | 185 | entry->refcount = nr_probes - nr_del; |
diff --git a/kernel/uid16.c b/kernel/uid16.c index d7948eb10225..f6c83d7ef000 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -18,67 +18,43 @@ | |||
18 | 18 | ||
19 | SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) | 19 | SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) |
20 | { | 20 | { |
21 | long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); | 21 | return sys_chown(filename, low2highuid(user), low2highgid(group)); |
22 | /* avoid REGPARM breakage on x86: */ | ||
23 | asmlinkage_protect(3, ret, filename, user, group); | ||
24 | return ret; | ||
25 | } | 22 | } |
26 | 23 | ||
27 | SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) | 24 | SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) |
28 | { | 25 | { |
29 | long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); | 26 | return sys_lchown(filename, low2highuid(user), low2highgid(group)); |
30 | /* avoid REGPARM breakage on x86: */ | ||
31 | asmlinkage_protect(3, ret, filename, user, group); | ||
32 | return ret; | ||
33 | } | 27 | } |
34 | 28 | ||
35 | SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) | 29 | SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) |
36 | { | 30 | { |
37 | long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); | 31 | return sys_fchown(fd, low2highuid(user), low2highgid(group)); |
38 | /* avoid REGPARM breakage on x86: */ | ||
39 | asmlinkage_protect(3, ret, fd, user, group); | ||
40 | return ret; | ||
41 | } | 32 | } |
42 | 33 | ||
43 | SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) | 34 | SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) |
44 | { | 35 | { |
45 | long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); | 36 | return sys_setregid(low2highgid(rgid), low2highgid(egid)); |
46 | /* avoid REGPARM breakage on x86: */ | ||
47 | asmlinkage_protect(2, ret, rgid, egid); | ||
48 | return ret; | ||
49 | } | 37 | } |
50 | 38 | ||
51 | SYSCALL_DEFINE1(setgid16, old_gid_t, gid) | 39 | SYSCALL_DEFINE1(setgid16, old_gid_t, gid) |
52 | { | 40 | { |
53 | long ret = sys_setgid(low2highgid(gid)); | 41 | return sys_setgid(low2highgid(gid)); |
54 | /* avoid REGPARM breakage on x86: */ | ||
55 | asmlinkage_protect(1, ret, gid); | ||
56 | return ret; | ||
57 | } | 42 | } |
58 | 43 | ||
59 | SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) | 44 | SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) |
60 | { | 45 | { |
61 | long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); | 46 | return sys_setreuid(low2highuid(ruid), low2highuid(euid)); |
62 | /* avoid REGPARM breakage on x86: */ | ||
63 | asmlinkage_protect(2, ret, ruid, euid); | ||
64 | return ret; | ||
65 | } | 47 | } |
66 | 48 | ||
67 | SYSCALL_DEFINE1(setuid16, old_uid_t, uid) | 49 | SYSCALL_DEFINE1(setuid16, old_uid_t, uid) |
68 | { | 50 | { |
69 | long ret = sys_setuid(low2highuid(uid)); | 51 | return sys_setuid(low2highuid(uid)); |
70 | /* avoid REGPARM breakage on x86: */ | ||
71 | asmlinkage_protect(1, ret, uid); | ||
72 | return ret; | ||
73 | } | 52 | } |
74 | 53 | ||
75 | SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) | 54 | SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) |
76 | { | 55 | { |
77 | long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), | 56 | return sys_setresuid(low2highuid(ruid), low2highuid(euid), |
78 | low2highuid(suid)); | 57 | low2highuid(suid)); |
79 | /* avoid REGPARM breakage on x86: */ | ||
80 | asmlinkage_protect(3, ret, ruid, euid, suid); | ||
81 | return ret; | ||
82 | } | 58 | } |
83 | 59 | ||
84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) | 60 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) |
@@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid | |||
100 | 76 | ||
101 | SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) | 77 | SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) |
102 | { | 78 | { |
103 | long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), | 79 | return sys_setresgid(low2highgid(rgid), low2highgid(egid), |
104 | low2highgid(sgid)); | 80 | low2highgid(sgid)); |
105 | /* avoid REGPARM breakage on x86: */ | ||
106 | asmlinkage_protect(3, ret, rgid, egid, sgid); | ||
107 | return ret; | ||
108 | } | 81 | } |
109 | 82 | ||
110 | 83 | ||
@@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid | |||
127 | 100 | ||
128 | SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) | 101 | SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) |
129 | { | 102 | { |
130 | long ret = sys_setfsuid(low2highuid(uid)); | 103 | return sys_setfsuid(low2highuid(uid)); |
131 | /* avoid REGPARM breakage on x86: */ | ||
132 | asmlinkage_protect(1, ret, uid); | ||
133 | return ret; | ||
134 | } | 104 | } |
135 | 105 | ||
136 | SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) | 106 | SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) |
137 | { | 107 | { |
138 | long ret = sys_setfsgid(low2highgid(gid)); | 108 | return sys_setfsgid(low2highgid(gid)); |
139 | /* avoid REGPARM breakage on x86: */ | ||
140 | asmlinkage_protect(1, ret, gid); | ||
141 | return ret; | ||
142 | } | 109 | } |
143 | 110 | ||
144 | static int groups16_to_user(old_gid_t __user *grouplist, | 111 | static int groups16_to_user(old_gid_t __user *grouplist, |
diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03b..69b4c3d48cde 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -16,7 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/export.h> | 17 | #include <linux/export.h> |
18 | #include <linux/user_namespace.h> | 18 | #include <linux/user_namespace.h> |
19 | #include <linux/proc_fs.h> | 19 | #include <linux/proc_ns.h> |
20 | 20 | ||
21 | /* | 21 | /* |
22 | * userns count is 1 for root user, 1 for init_uts_ns, | 22 | * userns count is 1 for root user, 1 for init_uts_ns, |
@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { | |||
51 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
52 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
53 | .proc_inum = PROC_USER_INIT_INO, | 53 | .proc_inum = PROC_USER_INIT_INO, |
54 | .may_mount_sysfs = true, | ||
55 | .may_mount_proc = true, | ||
54 | }; | 56 | }; |
55 | EXPORT_SYMBOL_GPL(init_user_ns); | 57 | EXPORT_SYMBOL_GPL(init_user_ns); |
56 | 58 | ||
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d342043..d8c30db06c5b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -9,7 +9,7 @@ | |||
9 | #include <linux/nsproxy.h> | 9 | #include <linux/nsproxy.h> |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/proc_fs.h> | 12 | #include <linux/proc_ns.h> |
13 | #include <linux/highuid.h> | 13 | #include <linux/highuid.h> |
14 | #include <linux/cred.h> | 14 | #include <linux/cred.h> |
15 | #include <linux/securebits.h> | 15 | #include <linux/securebits.h> |
@@ -25,7 +25,8 @@ | |||
25 | 25 | ||
26 | static struct kmem_cache *user_ns_cachep __read_mostly; | 26 | static struct kmem_cache *user_ns_cachep __read_mostly; |
27 | 27 | ||
28 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | 28 | static bool new_idmap_permitted(const struct file *file, |
29 | struct user_namespace *ns, int cap_setid, | ||
29 | struct uid_gid_map *map); | 30 | struct uid_gid_map *map); |
30 | 31 | ||
31 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | 32 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) |
@@ -61,6 +62,15 @@ int create_user_ns(struct cred *new) | |||
61 | kgid_t group = new->egid; | 62 | kgid_t group = new->egid; |
62 | int ret; | 63 | int ret; |
63 | 64 | ||
65 | /* | ||
66 | * Verify that we can not violate the policy of which files | ||
67 | * may be accessed that is specified by the root directory, | ||
68 | * by verifying that the root directory is at the root of the | ||
69 | * mount namespace which allows all files to be accessed. | ||
70 | */ | ||
71 | if (current_chrooted()) | ||
72 | return -EPERM; | ||
73 | |||
64 | /* The creator needs a mapping in the parent user namespace | 74 | /* The creator needs a mapping in the parent user namespace |
65 | * or else we won't be able to reasonably tell userspace who | 75 | * or else we won't be able to reasonably tell userspace who |
66 | * created a user_namespace. | 76 | * created a user_namespace. |
@@ -87,6 +97,8 @@ int create_user_ns(struct cred *new) | |||
87 | 97 | ||
88 | set_cred_user_ns(new, ns); | 98 | set_cred_user_ns(new, ns); |
89 | 99 | ||
100 | update_mnt_policy(ns); | ||
101 | |||
90 | return 0; | 102 | return 0; |
91 | } | 103 | } |
92 | 104 | ||
@@ -601,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
601 | if (map->nr_extents != 0) | 613 | if (map->nr_extents != 0) |
602 | goto out; | 614 | goto out; |
603 | 615 | ||
604 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID | 616 | /* |
605 | * over the user namespace in order to set the id mapping. | 617 | * Adjusting namespace settings requires capabilities on the target. |
606 | */ | 618 | */ |
607 | if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) | 619 | if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) |
608 | goto out; | 620 | goto out; |
609 | 621 | ||
610 | /* Get a buffer */ | 622 | /* Get a buffer */ |
@@ -689,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
689 | 701 | ||
690 | ret = -EPERM; | 702 | ret = -EPERM; |
691 | /* Validate the user is allowed to use user id's mapped to. */ | 703 | /* Validate the user is allowed to use user id's mapped to. */ |
692 | if (!new_idmap_permitted(ns, cap_setid, &new_map)) | 704 | if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) |
693 | goto out; | 705 | goto out; |
694 | 706 | ||
695 | /* Map the lower ids from the parent user namespace to the | 707 | /* Map the lower ids from the parent user namespace to the |
@@ -776,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t | |||
776 | &ns->projid_map, &ns->parent->projid_map); | 788 | &ns->projid_map, &ns->parent->projid_map); |
777 | } | 789 | } |
778 | 790 | ||
779 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | 791 | static bool new_idmap_permitted(const struct file *file, |
792 | struct user_namespace *ns, int cap_setid, | ||
780 | struct uid_gid_map *new_map) | 793 | struct uid_gid_map *new_map) |
781 | { | 794 | { |
782 | /* Allow mapping to your own filesystem ids */ | 795 | /* Allow mapping to your own filesystem ids */ |
@@ -784,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | |||
784 | u32 id = new_map->extent[0].lower_first; | 797 | u32 id = new_map->extent[0].lower_first; |
785 | if (cap_setid == CAP_SETUID) { | 798 | if (cap_setid == CAP_SETUID) { |
786 | kuid_t uid = make_kuid(ns->parent, id); | 799 | kuid_t uid = make_kuid(ns->parent, id); |
787 | if (uid_eq(uid, current_fsuid())) | 800 | if (uid_eq(uid, file->f_cred->fsuid)) |
788 | return true; | 801 | return true; |
789 | } | 802 | } |
790 | else if (cap_setid == CAP_SETGID) { | 803 | else if (cap_setid == CAP_SETGID) { |
791 | kgid_t gid = make_kgid(ns->parent, id); | 804 | kgid_t gid = make_kgid(ns->parent, id); |
792 | if (gid_eq(gid, current_fsgid())) | 805 | if (gid_eq(gid, file->f_cred->fsgid)) |
793 | return true; | 806 | return true; |
794 | } | 807 | } |
795 | } | 808 | } |
@@ -800,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | |||
800 | 813 | ||
801 | /* Allow the specified ids if we have the appropriate capability | 814 | /* Allow the specified ids if we have the appropriate capability |
802 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. | 815 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. |
816 | * And the opener of the id file also had the appropriate capability. | ||
803 | */ | 817 | */ |
804 | if (ns_capable(ns->parent, cap_setid)) | 818 | if (ns_capable(ns->parent, cap_setid) && |
819 | file_ns_capable(file, ns->parent, cap_setid)) | ||
805 | return true; | 820 | return true; |
806 | 821 | ||
807 | return false; | 822 | return false; |
diff --git a/kernel/utsname.c b/kernel/utsname.c index a47fc5de3113..2fc8576efaa8 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -15,7 +15,7 @@ | |||
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/user_namespace.h> | 17 | #include <linux/user_namespace.h> |
18 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_ns.h> |
19 | 19 | ||
20 | static struct uts_namespace *create_uts_ns(void) | 20 | static struct uts_namespace *create_uts_ns(void) |
21 | { | 21 | { |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4a944676358e..05039e348f07 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
517 | return ret; | 517 | return ret; |
518 | 518 | ||
519 | set_sample_period(); | 519 | set_sample_period(); |
520 | /* | ||
521 | * Watchdog threads shouldn't be enabled if they are | ||
522 | * disabled. The 'watchdog_disabled' variable check in | ||
523 | * watchdog_*_all_cpus() function takes care of this. | ||
524 | */ | ||
520 | if (watchdog_enabled && watchdog_thresh) | 525 | if (watchdog_enabled && watchdog_thresh) |
521 | watchdog_enable_all_cpus(); | 526 | watchdog_enable_all_cpus(); |
522 | else | 527 | else |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 55fac5b991b7..ee8e29a2320c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -41,7 +41,12 @@ | |||
41 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
42 | #include <linux/lockdep.h> | 42 | #include <linux/lockdep.h> |
43 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
44 | #include <linux/jhash.h> | ||
44 | #include <linux/hashtable.h> | 45 | #include <linux/hashtable.h> |
46 | #include <linux/rculist.h> | ||
47 | #include <linux/nodemask.h> | ||
48 | #include <linux/moduleparam.h> | ||
49 | #include <linux/uaccess.h> | ||
45 | 50 | ||
46 | #include "workqueue_internal.h" | 51 | #include "workqueue_internal.h" |
47 | 52 | ||
@@ -58,12 +63,11 @@ enum { | |||
58 | * %WORKER_UNBOUND set and concurrency management disabled, and may | 63 | * %WORKER_UNBOUND set and concurrency management disabled, and may |
59 | * be executing on any CPU. The pool behaves as an unbound one. | 64 | * be executing on any CPU. The pool behaves as an unbound one. |
60 | * | 65 | * |
61 | * Note that DISASSOCIATED can be flipped only while holding | 66 | * Note that DISASSOCIATED should be flipped only while holding |
62 | * assoc_mutex to avoid changing binding state while | 67 | * manager_mutex to avoid changing binding state while |
63 | * create_worker() is in progress. | 68 | * create_worker() is in progress. |
64 | */ | 69 | */ |
65 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | 70 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ |
66 | POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ | ||
67 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | 71 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ |
68 | POOL_FREEZING = 1 << 3, /* freeze in progress */ | 72 | POOL_FREEZING = 1 << 3, /* freeze in progress */ |
69 | 73 | ||
@@ -74,12 +78,14 @@ enum { | |||
74 | WORKER_PREP = 1 << 3, /* preparing to run works */ | 78 | WORKER_PREP = 1 << 3, /* preparing to run works */ |
75 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | 79 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ |
76 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | 80 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ |
81 | WORKER_REBOUND = 1 << 8, /* worker was rebound */ | ||
77 | 82 | ||
78 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | | 83 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | |
79 | WORKER_CPU_INTENSIVE, | 84 | WORKER_UNBOUND | WORKER_REBOUND, |
80 | 85 | ||
81 | NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ | 86 | NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ |
82 | 87 | ||
88 | UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ | ||
83 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ | 89 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ |
84 | 90 | ||
85 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | 91 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ |
@@ -97,6 +103,8 @@ enum { | |||
97 | */ | 103 | */ |
98 | RESCUER_NICE_LEVEL = -20, | 104 | RESCUER_NICE_LEVEL = -20, |
99 | HIGHPRI_NICE_LEVEL = -20, | 105 | HIGHPRI_NICE_LEVEL = -20, |
106 | |||
107 | WQ_NAME_LEN = 24, | ||
100 | }; | 108 | }; |
101 | 109 | ||
102 | /* | 110 | /* |
@@ -115,16 +123,26 @@ enum { | |||
115 | * cpu or grabbing pool->lock is enough for read access. If | 123 | * cpu or grabbing pool->lock is enough for read access. If |
116 | * POOL_DISASSOCIATED is set, it's identical to L. | 124 | * POOL_DISASSOCIATED is set, it's identical to L. |
117 | * | 125 | * |
118 | * F: wq->flush_mutex protected. | 126 | * MG: pool->manager_mutex and pool->lock protected. Writes require both |
127 | * locks. Reads can happen under either lock. | ||
128 | * | ||
129 | * PL: wq_pool_mutex protected. | ||
130 | * | ||
131 | * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. | ||
132 | * | ||
133 | * WQ: wq->mutex protected. | ||
119 | * | 134 | * |
120 | * W: workqueue_lock protected. | 135 | * WR: wq->mutex protected for writes. Sched-RCU protected for reads. |
136 | * | ||
137 | * MD: wq_mayday_lock protected. | ||
121 | */ | 138 | */ |
122 | 139 | ||
123 | /* struct worker is defined in workqueue_internal.h */ | 140 | /* struct worker is defined in workqueue_internal.h */ |
124 | 141 | ||
125 | struct worker_pool { | 142 | struct worker_pool { |
126 | spinlock_t lock; /* the pool lock */ | 143 | spinlock_t lock; /* the pool lock */ |
127 | unsigned int cpu; /* I: the associated cpu */ | 144 | int cpu; /* I: the associated cpu */ |
145 | int node; /* I: the associated node ID */ | ||
128 | int id; /* I: pool ID */ | 146 | int id; /* I: pool ID */ |
129 | unsigned int flags; /* X: flags */ | 147 | unsigned int flags; /* X: flags */ |
130 | 148 | ||
@@ -138,12 +156,18 @@ struct worker_pool { | |||
138 | struct timer_list idle_timer; /* L: worker idle timeout */ | 156 | struct timer_list idle_timer; /* L: worker idle timeout */ |
139 | struct timer_list mayday_timer; /* L: SOS timer for workers */ | 157 | struct timer_list mayday_timer; /* L: SOS timer for workers */ |
140 | 158 | ||
141 | /* workers are chained either in busy_hash or idle_list */ | 159 | /* a worker is either on busy_hash or idle_list, or the manager */ |
142 | DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); | 160 | DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); |
143 | /* L: hash of busy workers */ | 161 | /* L: hash of busy workers */ |
144 | 162 | ||
145 | struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ | 163 | /* see manage_workers() for details on the two manager mutexes */ |
146 | struct ida worker_ida; /* L: for worker IDs */ | 164 | struct mutex manager_arb; /* manager arbitration */ |
165 | struct mutex manager_mutex; /* manager exclusion */ | ||
166 | struct idr worker_idr; /* MG: worker IDs and iteration */ | ||
167 | |||
168 | struct workqueue_attrs *attrs; /* I: worker attributes */ | ||
169 | struct hlist_node hash_node; /* PL: unbound_pool_hash node */ | ||
170 | int refcnt; /* PL: refcnt for unbound pools */ | ||
147 | 171 | ||
148 | /* | 172 | /* |
149 | * The current concurrency level. As it's likely to be accessed | 173 | * The current concurrency level. As it's likely to be accessed |
@@ -151,6 +175,12 @@ struct worker_pool { | |||
151 | * cacheline. | 175 | * cacheline. |
152 | */ | 176 | */ |
153 | atomic_t nr_running ____cacheline_aligned_in_smp; | 177 | atomic_t nr_running ____cacheline_aligned_in_smp; |
178 | |||
179 | /* | ||
180 | * Destruction of pool is sched-RCU protected to allow dereferences | ||
181 | * from get_work_pool(). | ||
182 | */ | ||
183 | struct rcu_head rcu; | ||
154 | } ____cacheline_aligned_in_smp; | 184 | } ____cacheline_aligned_in_smp; |
155 | 185 | ||
156 | /* | 186 | /* |
@@ -164,77 +194,109 @@ struct pool_workqueue { | |||
164 | struct workqueue_struct *wq; /* I: the owning workqueue */ | 194 | struct workqueue_struct *wq; /* I: the owning workqueue */ |
165 | int work_color; /* L: current color */ | 195 | int work_color; /* L: current color */ |
166 | int flush_color; /* L: flushing color */ | 196 | int flush_color; /* L: flushing color */ |
197 | int refcnt; /* L: reference count */ | ||
167 | int nr_in_flight[WORK_NR_COLORS]; | 198 | int nr_in_flight[WORK_NR_COLORS]; |
168 | /* L: nr of in_flight works */ | 199 | /* L: nr of in_flight works */ |
169 | int nr_active; /* L: nr of active works */ | 200 | int nr_active; /* L: nr of active works */ |
170 | int max_active; /* L: max active works */ | 201 | int max_active; /* L: max active works */ |
171 | struct list_head delayed_works; /* L: delayed works */ | 202 | struct list_head delayed_works; /* L: delayed works */ |
172 | }; | 203 | struct list_head pwqs_node; /* WR: node on wq->pwqs */ |
204 | struct list_head mayday_node; /* MD: node on wq->maydays */ | ||
205 | |||
206 | /* | ||
207 | * Release of unbound pwq is punted to system_wq. See put_pwq() | ||
208 | * and pwq_unbound_release_workfn() for details. pool_workqueue | ||
209 | * itself is also sched-RCU protected so that the first pwq can be | ||
210 | * determined without grabbing wq->mutex. | ||
211 | */ | ||
212 | struct work_struct unbound_release_work; | ||
213 | struct rcu_head rcu; | ||
214 | } __aligned(1 << WORK_STRUCT_FLAG_BITS); | ||
173 | 215 | ||
174 | /* | 216 | /* |
175 | * Structure used to wait for workqueue flush. | 217 | * Structure used to wait for workqueue flush. |
176 | */ | 218 | */ |
177 | struct wq_flusher { | 219 | struct wq_flusher { |
178 | struct list_head list; /* F: list of flushers */ | 220 | struct list_head list; /* WQ: list of flushers */ |
179 | int flush_color; /* F: flush color waiting for */ | 221 | int flush_color; /* WQ: flush color waiting for */ |
180 | struct completion done; /* flush completion */ | 222 | struct completion done; /* flush completion */ |
181 | }; | 223 | }; |
182 | 224 | ||
183 | /* | 225 | struct wq_device; |
184 | * All cpumasks are assumed to be always set on UP and thus can't be | ||
185 | * used to determine whether there's something to be done. | ||
186 | */ | ||
187 | #ifdef CONFIG_SMP | ||
188 | typedef cpumask_var_t mayday_mask_t; | ||
189 | #define mayday_test_and_set_cpu(cpu, mask) \ | ||
190 | cpumask_test_and_set_cpu((cpu), (mask)) | ||
191 | #define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) | ||
192 | #define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) | ||
193 | #define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) | ||
194 | #define free_mayday_mask(mask) free_cpumask_var((mask)) | ||
195 | #else | ||
196 | typedef unsigned long mayday_mask_t; | ||
197 | #define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) | ||
198 | #define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) | ||
199 | #define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) | ||
200 | #define alloc_mayday_mask(maskp, gfp) true | ||
201 | #define free_mayday_mask(mask) do { } while (0) | ||
202 | #endif | ||
203 | 226 | ||
204 | /* | 227 | /* |
205 | * The externally visible workqueue abstraction is an array of | 228 | * The externally visible workqueue. It relays the issued work items to |
206 | * per-CPU workqueues: | 229 | * the appropriate worker_pool through its pool_workqueues. |
207 | */ | 230 | */ |
208 | struct workqueue_struct { | 231 | struct workqueue_struct { |
209 | unsigned int flags; /* W: WQ_* flags */ | 232 | struct list_head pwqs; /* WR: all pwqs of this wq */ |
210 | union { | 233 | struct list_head list; /* PL: list of all workqueues */ |
211 | struct pool_workqueue __percpu *pcpu; | 234 | |
212 | struct pool_workqueue *single; | 235 | struct mutex mutex; /* protects this wq */ |
213 | unsigned long v; | 236 | int work_color; /* WQ: current work color */ |
214 | } pool_wq; /* I: pwq's */ | 237 | int flush_color; /* WQ: current flush color */ |
215 | struct list_head list; /* W: list of all workqueues */ | ||
216 | |||
217 | struct mutex flush_mutex; /* protects wq flushing */ | ||
218 | int work_color; /* F: current work color */ | ||
219 | int flush_color; /* F: current flush color */ | ||
220 | atomic_t nr_pwqs_to_flush; /* flush in progress */ | 238 | atomic_t nr_pwqs_to_flush; /* flush in progress */ |
221 | struct wq_flusher *first_flusher; /* F: first flusher */ | 239 | struct wq_flusher *first_flusher; /* WQ: first flusher */ |
222 | struct list_head flusher_queue; /* F: flush waiters */ | 240 | struct list_head flusher_queue; /* WQ: flush waiters */ |
223 | struct list_head flusher_overflow; /* F: flush overflow list */ | 241 | struct list_head flusher_overflow; /* WQ: flush overflow list */ |
224 | 242 | ||
225 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ | 243 | struct list_head maydays; /* MD: pwqs requesting rescue */ |
226 | struct worker *rescuer; /* I: rescue worker */ | 244 | struct worker *rescuer; /* I: rescue worker */ |
227 | 245 | ||
228 | int nr_drainers; /* W: drain in progress */ | 246 | int nr_drainers; /* WQ: drain in progress */ |
229 | int saved_max_active; /* W: saved pwq max_active */ | 247 | int saved_max_active; /* WQ: saved pwq max_active */ |
248 | |||
249 | struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ | ||
250 | struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */ | ||
251 | |||
252 | #ifdef CONFIG_SYSFS | ||
253 | struct wq_device *wq_dev; /* I: for sysfs interface */ | ||
254 | #endif | ||
230 | #ifdef CONFIG_LOCKDEP | 255 | #ifdef CONFIG_LOCKDEP |
231 | struct lockdep_map lockdep_map; | 256 | struct lockdep_map lockdep_map; |
232 | #endif | 257 | #endif |
233 | char name[]; /* I: workqueue name */ | 258 | char name[WQ_NAME_LEN]; /* I: workqueue name */ |
259 | |||
260 | /* hot fields used during command issue, aligned to cacheline */ | ||
261 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | ||
262 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | ||
263 | struct pool_workqueue __rcu *numa_pwq_tbl[]; /* WR: unbound pwqs indexed by node */ | ||
234 | }; | 264 | }; |
235 | 265 | ||
266 | static struct kmem_cache *pwq_cache; | ||
267 | |||
268 | static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ | ||
269 | static cpumask_var_t *wq_numa_possible_cpumask; | ||
270 | /* possible CPUs of each node */ | ||
271 | |||
272 | static bool wq_disable_numa; | ||
273 | module_param_named(disable_numa, wq_disable_numa, bool, 0444); | ||
274 | |||
275 | static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ | ||
276 | |||
277 | /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ | ||
278 | static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; | ||
279 | |||
280 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | ||
281 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | ||
282 | |||
283 | static LIST_HEAD(workqueues); /* PL: list of all workqueues */ | ||
284 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | ||
285 | |||
286 | /* the per-cpu worker pools */ | ||
287 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], | ||
288 | cpu_worker_pools); | ||
289 | |||
290 | static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ | ||
291 | |||
292 | /* PL: hash of all unbound pools keyed by pool->attrs */ | ||
293 | static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); | ||
294 | |||
295 | /* I: attributes used when instantiating standard unbound pools on demand */ | ||
296 | static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; | ||
297 | |||
236 | struct workqueue_struct *system_wq __read_mostly; | 298 | struct workqueue_struct *system_wq __read_mostly; |
237 | EXPORT_SYMBOL_GPL(system_wq); | 299 | EXPORT_SYMBOL(system_wq); |
238 | struct workqueue_struct *system_highpri_wq __read_mostly; | 300 | struct workqueue_struct *system_highpri_wq __read_mostly; |
239 | EXPORT_SYMBOL_GPL(system_highpri_wq); | 301 | EXPORT_SYMBOL_GPL(system_highpri_wq); |
240 | struct workqueue_struct *system_long_wq __read_mostly; | 302 | struct workqueue_struct *system_long_wq __read_mostly; |
@@ -244,64 +306,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq); | |||
244 | struct workqueue_struct *system_freezable_wq __read_mostly; | 306 | struct workqueue_struct *system_freezable_wq __read_mostly; |
245 | EXPORT_SYMBOL_GPL(system_freezable_wq); | 307 | EXPORT_SYMBOL_GPL(system_freezable_wq); |
246 | 308 | ||
309 | static int worker_thread(void *__worker); | ||
310 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | ||
311 | const struct workqueue_attrs *from); | ||
312 | |||
247 | #define CREATE_TRACE_POINTS | 313 | #define CREATE_TRACE_POINTS |
248 | #include <trace/events/workqueue.h> | 314 | #include <trace/events/workqueue.h> |
249 | 315 | ||
250 | #define for_each_std_worker_pool(pool, cpu) \ | 316 | #define assert_rcu_or_pool_mutex() \ |
251 | for ((pool) = &std_worker_pools(cpu)[0]; \ | 317 | rcu_lockdep_assert(rcu_read_lock_sched_held() || \ |
252 | (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) | 318 | lockdep_is_held(&wq_pool_mutex), \ |
319 | "sched RCU or wq_pool_mutex should be held") | ||
253 | 320 | ||
254 | #define for_each_busy_worker(worker, i, pool) \ | 321 | #define assert_rcu_or_wq_mutex(wq) \ |
255 | hash_for_each(pool->busy_hash, i, worker, hentry) | 322 | rcu_lockdep_assert(rcu_read_lock_sched_held() || \ |
323 | lockdep_is_held(&wq->mutex), \ | ||
324 | "sched RCU or wq->mutex should be held") | ||
256 | 325 | ||
257 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | 326 | #ifdef CONFIG_LOCKDEP |
258 | unsigned int sw) | 327 | #define assert_manager_or_pool_lock(pool) \ |
259 | { | 328 | WARN_ONCE(debug_locks && \ |
260 | if (cpu < nr_cpu_ids) { | 329 | !lockdep_is_held(&(pool)->manager_mutex) && \ |
261 | if (sw & 1) { | 330 | !lockdep_is_held(&(pool)->lock), \ |
262 | cpu = cpumask_next(cpu, mask); | 331 | "pool->manager_mutex or ->lock should be held") |
263 | if (cpu < nr_cpu_ids) | 332 | #else |
264 | return cpu; | 333 | #define assert_manager_or_pool_lock(pool) do { } while (0) |
265 | } | 334 | #endif |
266 | if (sw & 2) | ||
267 | return WORK_CPU_UNBOUND; | ||
268 | } | ||
269 | return WORK_CPU_END; | ||
270 | } | ||
271 | 335 | ||
272 | static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, | 336 | #define for_each_cpu_worker_pool(pool, cpu) \ |
273 | struct workqueue_struct *wq) | 337 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ |
274 | { | 338 | (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ |
275 | return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); | 339 | (pool)++) |
276 | } | ||
277 | 340 | ||
278 | /* | 341 | /** |
279 | * CPU iterators | 342 | * for_each_pool - iterate through all worker_pools in the system |
343 | * @pool: iteration cursor | ||
344 | * @pi: integer used for iteration | ||
280 | * | 345 | * |
281 | * An extra cpu number is defined using an invalid cpu number | 346 | * This must be called either with wq_pool_mutex held or sched RCU read |
282 | * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any | 347 | * locked. If the pool needs to be used beyond the locking in effect, the |
283 | * specific CPU. The following iterators are similar to for_each_*_cpu() | 348 | * caller is responsible for guaranteeing that the pool stays online. |
284 | * iterators but also considers the unbound CPU. | ||
285 | * | 349 | * |
286 | * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND | 350 | * The if/else clause exists only for the lockdep assertion and can be |
287 | * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND | 351 | * ignored. |
288 | * for_each_pwq_cpu() : possible CPUs for bound workqueues, | ||
289 | * WORK_CPU_UNBOUND for unbound workqueues | ||
290 | */ | 352 | */ |
291 | #define for_each_wq_cpu(cpu) \ | 353 | #define for_each_pool(pool, pi) \ |
292 | for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ | 354 | idr_for_each_entry(&worker_pool_idr, pool, pi) \ |
293 | (cpu) < WORK_CPU_END; \ | 355 | if (({ assert_rcu_or_pool_mutex(); false; })) { } \ |
294 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) | 356 | else |
295 | 357 | ||
296 | #define for_each_online_wq_cpu(cpu) \ | 358 | /** |
297 | for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ | 359 | * for_each_pool_worker - iterate through all workers of a worker_pool |
298 | (cpu) < WORK_CPU_END; \ | 360 | * @worker: iteration cursor |
299 | (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) | 361 | * @wi: integer used for iteration |
362 | * @pool: worker_pool to iterate workers of | ||
363 | * | ||
364 | * This must be called with either @pool->manager_mutex or ->lock held. | ||
365 | * | ||
366 | * The if/else clause exists only for the lockdep assertion and can be | ||
367 | * ignored. | ||
368 | */ | ||
369 | #define for_each_pool_worker(worker, wi, pool) \ | ||
370 | idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ | ||
371 | if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ | ||
372 | else | ||
300 | 373 | ||
301 | #define for_each_pwq_cpu(cpu, wq) \ | 374 | /** |
302 | for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ | 375 | * for_each_pwq - iterate through all pool_workqueues of the specified workqueue |
303 | (cpu) < WORK_CPU_END; \ | 376 | * @pwq: iteration cursor |
304 | (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) | 377 | * @wq: the target workqueue |
378 | * | ||
379 | * This must be called either with wq->mutex held or sched RCU read locked. | ||
380 | * If the pwq needs to be used beyond the locking in effect, the caller is | ||
381 | * responsible for guaranteeing that the pwq stays online. | ||
382 | * | ||
383 | * The if/else clause exists only for the lockdep assertion and can be | ||
384 | * ignored. | ||
385 | */ | ||
386 | #define for_each_pwq(pwq, wq) \ | ||
387 | list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ | ||
388 | if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ | ||
389 | else | ||
305 | 390 | ||
306 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | 391 | #ifdef CONFIG_DEBUG_OBJECTS_WORK |
307 | 392 | ||
@@ -419,77 +504,35 @@ static inline void debug_work_activate(struct work_struct *work) { } | |||
419 | static inline void debug_work_deactivate(struct work_struct *work) { } | 504 | static inline void debug_work_deactivate(struct work_struct *work) { } |
420 | #endif | 505 | #endif |
421 | 506 | ||
422 | /* Serializes the accesses to the list of workqueues. */ | ||
423 | static DEFINE_SPINLOCK(workqueue_lock); | ||
424 | static LIST_HEAD(workqueues); | ||
425 | static bool workqueue_freezing; /* W: have wqs started freezing? */ | ||
426 | |||
427 | /* | ||
428 | * The CPU and unbound standard worker pools. The unbound ones have | ||
429 | * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. | ||
430 | */ | ||
431 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], | ||
432 | cpu_std_worker_pools); | ||
433 | static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; | ||
434 | |||
435 | /* idr of all pools */ | ||
436 | static DEFINE_MUTEX(worker_pool_idr_mutex); | ||
437 | static DEFINE_IDR(worker_pool_idr); | ||
438 | |||
439 | static int worker_thread(void *__worker); | ||
440 | |||
441 | static struct worker_pool *std_worker_pools(int cpu) | ||
442 | { | ||
443 | if (cpu != WORK_CPU_UNBOUND) | ||
444 | return per_cpu(cpu_std_worker_pools, cpu); | ||
445 | else | ||
446 | return unbound_std_worker_pools; | ||
447 | } | ||
448 | |||
449 | static int std_worker_pool_pri(struct worker_pool *pool) | ||
450 | { | ||
451 | return pool - std_worker_pools(pool->cpu); | ||
452 | } | ||
453 | |||
454 | /* allocate ID and assign it to @pool */ | 507 | /* allocate ID and assign it to @pool */ |
455 | static int worker_pool_assign_id(struct worker_pool *pool) | 508 | static int worker_pool_assign_id(struct worker_pool *pool) |
456 | { | 509 | { |
457 | int ret; | 510 | int ret; |
458 | 511 | ||
459 | mutex_lock(&worker_pool_idr_mutex); | 512 | lockdep_assert_held(&wq_pool_mutex); |
513 | |||
460 | ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); | 514 | ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); |
461 | if (ret >= 0) | 515 | if (ret >= 0) { |
462 | pool->id = ret; | 516 | pool->id = ret; |
463 | mutex_unlock(&worker_pool_idr_mutex); | 517 | return 0; |
464 | 518 | } | |
465 | return ret < 0 ? ret : 0; | 519 | return ret; |
466 | } | 520 | } |
467 | 521 | ||
468 | /* | 522 | /** |
469 | * Lookup worker_pool by id. The idr currently is built during boot and | 523 | * unbound_pwq_by_node - return the unbound pool_workqueue for the given node |
470 | * never modified. Don't worry about locking for now. | 524 | * @wq: the target workqueue |
525 | * @node: the node ID | ||
526 | * | ||
527 | * This must be called either with wq->mutex held or sched RCU read locked. | ||
528 | * If the pwq needs to be used beyond the locking in effect, the caller is | ||
529 | * responsible for guaranteeing that the pwq stays online. | ||
471 | */ | 530 | */ |
472 | static struct worker_pool *worker_pool_by_id(int pool_id) | 531 | static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, |
532 | int node) | ||
473 | { | 533 | { |
474 | return idr_find(&worker_pool_idr, pool_id); | 534 | assert_rcu_or_wq_mutex(wq); |
475 | } | 535 | return rcu_dereference_raw(wq->numa_pwq_tbl[node]); |
476 | |||
477 | static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) | ||
478 | { | ||
479 | struct worker_pool *pools = std_worker_pools(cpu); | ||
480 | |||
481 | return &pools[highpri]; | ||
482 | } | ||
483 | |||
484 | static struct pool_workqueue *get_pwq(unsigned int cpu, | ||
485 | struct workqueue_struct *wq) | ||
486 | { | ||
487 | if (!(wq->flags & WQ_UNBOUND)) { | ||
488 | if (likely(cpu < nr_cpu_ids)) | ||
489 | return per_cpu_ptr(wq->pool_wq.pcpu, cpu); | ||
490 | } else if (likely(cpu == WORK_CPU_UNBOUND)) | ||
491 | return wq->pool_wq.single; | ||
492 | return NULL; | ||
493 | } | 536 | } |
494 | 537 | ||
495 | static unsigned int work_color_to_flags(int color) | 538 | static unsigned int work_color_to_flags(int color) |
@@ -531,7 +574,7 @@ static int work_next_color(int color) | |||
531 | static inline void set_work_data(struct work_struct *work, unsigned long data, | 574 | static inline void set_work_data(struct work_struct *work, unsigned long data, |
532 | unsigned long flags) | 575 | unsigned long flags) |
533 | { | 576 | { |
534 | BUG_ON(!work_pending(work)); | 577 | WARN_ON_ONCE(!work_pending(work)); |
535 | atomic_long_set(&work->data, data | flags | work_static(work)); | 578 | atomic_long_set(&work->data, data | flags | work_static(work)); |
536 | } | 579 | } |
537 | 580 | ||
@@ -583,13 +626,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) | |||
583 | * @work: the work item of interest | 626 | * @work: the work item of interest |
584 | * | 627 | * |
585 | * Return the worker_pool @work was last associated with. %NULL if none. | 628 | * Return the worker_pool @work was last associated with. %NULL if none. |
629 | * | ||
630 | * Pools are created and destroyed under wq_pool_mutex, and allow read | ||
631 | * access under sched-RCU read lock. As such, this function should be | ||
632 | * called under wq_pool_mutex or with preemption disabled. | ||
633 | * | ||
634 | * All fields of the returned pool are accessible as long as the above | ||
635 | * mentioned locking is in effect. If the returned pool needs to be used | ||
636 | * beyond the critical section, the caller is responsible for ensuring the | ||
637 | * returned pool is and stays online. | ||
586 | */ | 638 | */ |
587 | static struct worker_pool *get_work_pool(struct work_struct *work) | 639 | static struct worker_pool *get_work_pool(struct work_struct *work) |
588 | { | 640 | { |
589 | unsigned long data = atomic_long_read(&work->data); | 641 | unsigned long data = atomic_long_read(&work->data); |
590 | struct worker_pool *pool; | ||
591 | int pool_id; | 642 | int pool_id; |
592 | 643 | ||
644 | assert_rcu_or_pool_mutex(); | ||
645 | |||
593 | if (data & WORK_STRUCT_PWQ) | 646 | if (data & WORK_STRUCT_PWQ) |
594 | return ((struct pool_workqueue *) | 647 | return ((struct pool_workqueue *) |
595 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool; | 648 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool; |
@@ -598,9 +651,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) | |||
598 | if (pool_id == WORK_OFFQ_POOL_NONE) | 651 | if (pool_id == WORK_OFFQ_POOL_NONE) |
599 | return NULL; | 652 | return NULL; |
600 | 653 | ||
601 | pool = worker_pool_by_id(pool_id); | 654 | return idr_find(&worker_pool_idr, pool_id); |
602 | WARN_ON_ONCE(!pool); | ||
603 | return pool; | ||
604 | } | 655 | } |
605 | 656 | ||
606 | /** | 657 | /** |
@@ -689,7 +740,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) | |||
689 | /* Do we have too many workers and should some go away? */ | 740 | /* Do we have too many workers and should some go away? */ |
690 | static bool too_many_workers(struct worker_pool *pool) | 741 | static bool too_many_workers(struct worker_pool *pool) |
691 | { | 742 | { |
692 | bool managing = pool->flags & POOL_MANAGING_WORKERS; | 743 | bool managing = mutex_is_locked(&pool->manager_arb); |
693 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ | 744 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
694 | int nr_busy = pool->nr_workers - nr_idle; | 745 | int nr_busy = pool->nr_workers - nr_idle; |
695 | 746 | ||
@@ -744,7 +795,7 @@ static void wake_up_worker(struct worker_pool *pool) | |||
744 | * CONTEXT: | 795 | * CONTEXT: |
745 | * spin_lock_irq(rq->lock) | 796 | * spin_lock_irq(rq->lock) |
746 | */ | 797 | */ |
747 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | 798 | void wq_worker_waking_up(struct task_struct *task, int cpu) |
748 | { | 799 | { |
749 | struct worker *worker = kthread_data(task); | 800 | struct worker *worker = kthread_data(task); |
750 | 801 | ||
@@ -769,8 +820,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
769 | * RETURNS: | 820 | * RETURNS: |
770 | * Worker task on @cpu to wake up, %NULL if none. | 821 | * Worker task on @cpu to wake up, %NULL if none. |
771 | */ | 822 | */ |
772 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | 823 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) |
773 | unsigned int cpu) | ||
774 | { | 824 | { |
775 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | 825 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; |
776 | struct worker_pool *pool; | 826 | struct worker_pool *pool; |
@@ -786,7 +836,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
786 | pool = worker->pool; | 836 | pool = worker->pool; |
787 | 837 | ||
788 | /* this can only happen on the local cpu */ | 838 | /* this can only happen on the local cpu */ |
789 | BUG_ON(cpu != raw_smp_processor_id()); | 839 | if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) |
840 | return NULL; | ||
790 | 841 | ||
791 | /* | 842 | /* |
792 | * The counterpart of the following dec_and_test, implied mb, | 843 | * The counterpart of the following dec_and_test, implied mb, |
@@ -891,13 +942,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
891 | * recycled work item as currently executing and make it wait until the | 942 | * recycled work item as currently executing and make it wait until the |
892 | * current execution finishes, introducing an unwanted dependency. | 943 | * current execution finishes, introducing an unwanted dependency. |
893 | * | 944 | * |
894 | * This function checks the work item address, work function and workqueue | 945 | * This function checks the work item address and work function to avoid |
895 | * to avoid false positives. Note that this isn't complete as one may | 946 | * false positives. Note that this isn't complete as one may construct a |
896 | * construct a work function which can introduce dependency onto itself | 947 | * work function which can introduce dependency onto itself through a |
897 | * through a recycled work item. Well, if somebody wants to shoot oneself | 948 | * recycled work item. Well, if somebody wants to shoot oneself in the |
898 | * in the foot that badly, there's only so much we can do, and if such | 949 | * foot that badly, there's only so much we can do, and if such deadlock |
899 | * deadlock actually occurs, it should be easy to locate the culprit work | 950 | * actually occurs, it should be easy to locate the culprit work function. |
900 | * function. | ||
901 | * | 951 | * |
902 | * CONTEXT: | 952 | * CONTEXT: |
903 | * spin_lock_irq(pool->lock). | 953 | * spin_lock_irq(pool->lock). |
@@ -961,6 +1011,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, | |||
961 | *nextp = n; | 1011 | *nextp = n; |
962 | } | 1012 | } |
963 | 1013 | ||
1014 | /** | ||
1015 | * get_pwq - get an extra reference on the specified pool_workqueue | ||
1016 | * @pwq: pool_workqueue to get | ||
1017 | * | ||
1018 | * Obtain an extra reference on @pwq. The caller should guarantee that | ||
1019 | * @pwq has positive refcnt and be holding the matching pool->lock. | ||
1020 | */ | ||
1021 | static void get_pwq(struct pool_workqueue *pwq) | ||
1022 | { | ||
1023 | lockdep_assert_held(&pwq->pool->lock); | ||
1024 | WARN_ON_ONCE(pwq->refcnt <= 0); | ||
1025 | pwq->refcnt++; | ||
1026 | } | ||
1027 | |||
1028 | /** | ||
1029 | * put_pwq - put a pool_workqueue reference | ||
1030 | * @pwq: pool_workqueue to put | ||
1031 | * | ||
1032 | * Drop a reference of @pwq. If its refcnt reaches zero, schedule its | ||
1033 | * destruction. The caller should be holding the matching pool->lock. | ||
1034 | */ | ||
1035 | static void put_pwq(struct pool_workqueue *pwq) | ||
1036 | { | ||
1037 | lockdep_assert_held(&pwq->pool->lock); | ||
1038 | if (likely(--pwq->refcnt)) | ||
1039 | return; | ||
1040 | if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) | ||
1041 | return; | ||
1042 | /* | ||
1043 | * @pwq can't be released under pool->lock, bounce to | ||
1044 | * pwq_unbound_release_workfn(). This never recurses on the same | ||
1045 | * pool->lock as this path is taken only for unbound workqueues and | ||
1046 | * the release work item is scheduled on a per-cpu workqueue. To | ||
1047 | * avoid lockdep warning, unbound pool->locks are given lockdep | ||
1048 | * subclass of 1 in get_unbound_pool(). | ||
1049 | */ | ||
1050 | schedule_work(&pwq->unbound_release_work); | ||
1051 | } | ||
1052 | |||
1053 | /** | ||
1054 | * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock | ||
1055 | * @pwq: pool_workqueue to put (can be %NULL) | ||
1056 | * | ||
1057 | * put_pwq() with locking. This function also allows %NULL @pwq. | ||
1058 | */ | ||
1059 | static void put_pwq_unlocked(struct pool_workqueue *pwq) | ||
1060 | { | ||
1061 | if (pwq) { | ||
1062 | /* | ||
1063 | * As both pwqs and pools are sched-RCU protected, the | ||
1064 | * following lock operations are safe. | ||
1065 | */ | ||
1066 | spin_lock_irq(&pwq->pool->lock); | ||
1067 | put_pwq(pwq); | ||
1068 | spin_unlock_irq(&pwq->pool->lock); | ||
1069 | } | ||
1070 | } | ||
1071 | |||
964 | static void pwq_activate_delayed_work(struct work_struct *work) | 1072 | static void pwq_activate_delayed_work(struct work_struct *work) |
965 | { | 1073 | { |
966 | struct pool_workqueue *pwq = get_work_pwq(work); | 1074 | struct pool_workqueue *pwq = get_work_pwq(work); |
@@ -992,9 +1100,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) | |||
992 | */ | 1100 | */ |
993 | static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) | 1101 | static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) |
994 | { | 1102 | { |
995 | /* ignore uncolored works */ | 1103 | /* uncolored work items don't participate in flushing or nr_active */ |
996 | if (color == WORK_NO_COLOR) | 1104 | if (color == WORK_NO_COLOR) |
997 | return; | 1105 | goto out_put; |
998 | 1106 | ||
999 | pwq->nr_in_flight[color]--; | 1107 | pwq->nr_in_flight[color]--; |
1000 | 1108 | ||
@@ -1007,11 +1115,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) | |||
1007 | 1115 | ||
1008 | /* is flush in progress and are we at the flushing tip? */ | 1116 | /* is flush in progress and are we at the flushing tip? */ |
1009 | if (likely(pwq->flush_color != color)) | 1117 | if (likely(pwq->flush_color != color)) |
1010 | return; | 1118 | goto out_put; |
1011 | 1119 | ||
1012 | /* are there still in-flight works? */ | 1120 | /* are there still in-flight works? */ |
1013 | if (pwq->nr_in_flight[color]) | 1121 | if (pwq->nr_in_flight[color]) |
1014 | return; | 1122 | goto out_put; |
1015 | 1123 | ||
1016 | /* this pwq is done, clear flush_color */ | 1124 | /* this pwq is done, clear flush_color */ |
1017 | pwq->flush_color = -1; | 1125 | pwq->flush_color = -1; |
@@ -1022,6 +1130,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) | |||
1022 | */ | 1130 | */ |
1023 | if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) | 1131 | if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) |
1024 | complete(&pwq->wq->first_flusher->done); | 1132 | complete(&pwq->wq->first_flusher->done); |
1133 | out_put: | ||
1134 | put_pwq(pwq); | ||
1025 | } | 1135 | } |
1026 | 1136 | ||
1027 | /** | 1137 | /** |
@@ -1144,11 +1254,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, | |||
1144 | /* we own @work, set data and link */ | 1254 | /* we own @work, set data and link */ |
1145 | set_work_pwq(work, pwq, extra_flags); | 1255 | set_work_pwq(work, pwq, extra_flags); |
1146 | list_add_tail(&work->entry, head); | 1256 | list_add_tail(&work->entry, head); |
1257 | get_pwq(pwq); | ||
1147 | 1258 | ||
1148 | /* | 1259 | /* |
1149 | * Ensure either worker_sched_deactivated() sees the above | 1260 | * Ensure either wq_worker_sleeping() sees the above |
1150 | * list_add_tail() or we see zero nr_running to avoid workers | 1261 | * list_add_tail() or we see zero nr_running to avoid workers lying |
1151 | * lying around lazily while there are works to be processed. | 1262 | * around lazily while there are works to be processed. |
1152 | */ | 1263 | */ |
1153 | smp_mb(); | 1264 | smp_mb(); |
1154 | 1265 | ||
@@ -1172,10 +1283,11 @@ static bool is_chained_work(struct workqueue_struct *wq) | |||
1172 | return worker && worker->current_pwq->wq == wq; | 1283 | return worker && worker->current_pwq->wq == wq; |
1173 | } | 1284 | } |
1174 | 1285 | ||
1175 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | 1286 | static void __queue_work(int cpu, struct workqueue_struct *wq, |
1176 | struct work_struct *work) | 1287 | struct work_struct *work) |
1177 | { | 1288 | { |
1178 | struct pool_workqueue *pwq; | 1289 | struct pool_workqueue *pwq; |
1290 | struct worker_pool *last_pool; | ||
1179 | struct list_head *worklist; | 1291 | struct list_head *worklist; |
1180 | unsigned int work_flags; | 1292 | unsigned int work_flags; |
1181 | unsigned int req_cpu = cpu; | 1293 | unsigned int req_cpu = cpu; |
@@ -1191,48 +1303,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1191 | debug_work_activate(work); | 1303 | debug_work_activate(work); |
1192 | 1304 | ||
1193 | /* if dying, only works from the same workqueue are allowed */ | 1305 | /* if dying, only works from the same workqueue are allowed */ |
1194 | if (unlikely(wq->flags & WQ_DRAINING) && | 1306 | if (unlikely(wq->flags & __WQ_DRAINING) && |
1195 | WARN_ON_ONCE(!is_chained_work(wq))) | 1307 | WARN_ON_ONCE(!is_chained_work(wq))) |
1196 | return; | 1308 | return; |
1309 | retry: | ||
1310 | if (req_cpu == WORK_CPU_UNBOUND) | ||
1311 | cpu = raw_smp_processor_id(); | ||
1197 | 1312 | ||
1198 | /* determine the pwq to use */ | 1313 | /* pwq which will be used unless @work is executing elsewhere */ |
1199 | if (!(wq->flags & WQ_UNBOUND)) { | 1314 | if (!(wq->flags & WQ_UNBOUND)) |
1200 | struct worker_pool *last_pool; | 1315 | pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); |
1201 | 1316 | else | |
1202 | if (cpu == WORK_CPU_UNBOUND) | 1317 | pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); |
1203 | cpu = raw_smp_processor_id(); | ||
1204 | |||
1205 | /* | ||
1206 | * It's multi cpu. If @work was previously on a different | ||
1207 | * cpu, it might still be running there, in which case the | ||
1208 | * work needs to be queued on that cpu to guarantee | ||
1209 | * non-reentrancy. | ||
1210 | */ | ||
1211 | pwq = get_pwq(cpu, wq); | ||
1212 | last_pool = get_work_pool(work); | ||
1213 | 1318 | ||
1214 | if (last_pool && last_pool != pwq->pool) { | 1319 | /* |
1215 | struct worker *worker; | 1320 | * If @work was previously on a different pool, it might still be |
1321 | * running there, in which case the work needs to be queued on that | ||
1322 | * pool to guarantee non-reentrancy. | ||
1323 | */ | ||
1324 | last_pool = get_work_pool(work); | ||
1325 | if (last_pool && last_pool != pwq->pool) { | ||
1326 | struct worker *worker; | ||
1216 | 1327 | ||
1217 | spin_lock(&last_pool->lock); | 1328 | spin_lock(&last_pool->lock); |
1218 | 1329 | ||
1219 | worker = find_worker_executing_work(last_pool, work); | 1330 | worker = find_worker_executing_work(last_pool, work); |
1220 | 1331 | ||
1221 | if (worker && worker->current_pwq->wq == wq) { | 1332 | if (worker && worker->current_pwq->wq == wq) { |
1222 | pwq = get_pwq(last_pool->cpu, wq); | 1333 | pwq = worker->current_pwq; |
1223 | } else { | ||
1224 | /* meh... not running there, queue here */ | ||
1225 | spin_unlock(&last_pool->lock); | ||
1226 | spin_lock(&pwq->pool->lock); | ||
1227 | } | ||
1228 | } else { | 1334 | } else { |
1335 | /* meh... not running there, queue here */ | ||
1336 | spin_unlock(&last_pool->lock); | ||
1229 | spin_lock(&pwq->pool->lock); | 1337 | spin_lock(&pwq->pool->lock); |
1230 | } | 1338 | } |
1231 | } else { | 1339 | } else { |
1232 | pwq = get_pwq(WORK_CPU_UNBOUND, wq); | ||
1233 | spin_lock(&pwq->pool->lock); | 1340 | spin_lock(&pwq->pool->lock); |
1234 | } | 1341 | } |
1235 | 1342 | ||
1343 | /* | ||
1344 | * pwq is determined and locked. For unbound pools, we could have | ||
1345 | * raced with pwq release and it could already be dead. If its | ||
1346 | * refcnt is zero, repeat pwq selection. Note that pwqs never die | ||
1347 | * without another pwq replacing it in the numa_pwq_tbl or while | ||
1348 | * work items are executing on it, so the retrying is guaranteed to | ||
1349 | * make forward-progress. | ||
1350 | */ | ||
1351 | if (unlikely(!pwq->refcnt)) { | ||
1352 | if (wq->flags & WQ_UNBOUND) { | ||
1353 | spin_unlock(&pwq->pool->lock); | ||
1354 | cpu_relax(); | ||
1355 | goto retry; | ||
1356 | } | ||
1357 | /* oops */ | ||
1358 | WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", | ||
1359 | wq->name, cpu); | ||
1360 | } | ||
1361 | |||
1236 | /* pwq determined, queue */ | 1362 | /* pwq determined, queue */ |
1237 | trace_workqueue_queue_work(req_cpu, pwq, work); | 1363 | trace_workqueue_queue_work(req_cpu, pwq, work); |
1238 | 1364 | ||
@@ -1285,23 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, | |||
1285 | local_irq_restore(flags); | 1411 | local_irq_restore(flags); |
1286 | return ret; | 1412 | return ret; |
1287 | } | 1413 | } |
1288 | EXPORT_SYMBOL_GPL(queue_work_on); | 1414 | EXPORT_SYMBOL(queue_work_on); |
1289 | |||
1290 | /** | ||
1291 | * queue_work - queue work on a workqueue | ||
1292 | * @wq: workqueue to use | ||
1293 | * @work: work to queue | ||
1294 | * | ||
1295 | * Returns %false if @work was already on a queue, %true otherwise. | ||
1296 | * | ||
1297 | * We queue the work to the CPU on which it was submitted, but if the CPU dies | ||
1298 | * it can be processed by another CPU. | ||
1299 | */ | ||
1300 | bool queue_work(struct workqueue_struct *wq, struct work_struct *work) | ||
1301 | { | ||
1302 | return queue_work_on(WORK_CPU_UNBOUND, wq, work); | ||
1303 | } | ||
1304 | EXPORT_SYMBOL_GPL(queue_work); | ||
1305 | 1415 | ||
1306 | void delayed_work_timer_fn(unsigned long __data) | 1416 | void delayed_work_timer_fn(unsigned long __data) |
1307 | { | 1417 | { |
@@ -1375,22 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
1375 | local_irq_restore(flags); | 1485 | local_irq_restore(flags); |
1376 | return ret; | 1486 | return ret; |
1377 | } | 1487 | } |
1378 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | 1488 | EXPORT_SYMBOL(queue_delayed_work_on); |
1379 | |||
1380 | /** | ||
1381 | * queue_delayed_work - queue work on a workqueue after delay | ||
1382 | * @wq: workqueue to use | ||
1383 | * @dwork: delayable work to queue | ||
1384 | * @delay: number of jiffies to wait before queueing | ||
1385 | * | ||
1386 | * Equivalent to queue_delayed_work_on() but tries to use the local CPU. | ||
1387 | */ | ||
1388 | bool queue_delayed_work(struct workqueue_struct *wq, | ||
1389 | struct delayed_work *dwork, unsigned long delay) | ||
1390 | { | ||
1391 | return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); | ||
1392 | } | ||
1393 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
1394 | 1489 | ||
1395 | /** | 1490 | /** |
1396 | * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU | 1491 | * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU |
@@ -1431,21 +1526,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
1431 | EXPORT_SYMBOL_GPL(mod_delayed_work_on); | 1526 | EXPORT_SYMBOL_GPL(mod_delayed_work_on); |
1432 | 1527 | ||
1433 | /** | 1528 | /** |
1434 | * mod_delayed_work - modify delay of or queue a delayed work | ||
1435 | * @wq: workqueue to use | ||
1436 | * @dwork: work to queue | ||
1437 | * @delay: number of jiffies to wait before queueing | ||
1438 | * | ||
1439 | * mod_delayed_work_on() on local CPU. | ||
1440 | */ | ||
1441 | bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, | ||
1442 | unsigned long delay) | ||
1443 | { | ||
1444 | return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); | ||
1445 | } | ||
1446 | EXPORT_SYMBOL_GPL(mod_delayed_work); | ||
1447 | |||
1448 | /** | ||
1449 | * worker_enter_idle - enter idle state | 1529 | * worker_enter_idle - enter idle state |
1450 | * @worker: worker which is entering idle state | 1530 | * @worker: worker which is entering idle state |
1451 | * | 1531 | * |
@@ -1459,9 +1539,10 @@ static void worker_enter_idle(struct worker *worker) | |||
1459 | { | 1539 | { |
1460 | struct worker_pool *pool = worker->pool; | 1540 | struct worker_pool *pool = worker->pool; |
1461 | 1541 | ||
1462 | BUG_ON(worker->flags & WORKER_IDLE); | 1542 | if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || |
1463 | BUG_ON(!list_empty(&worker->entry) && | 1543 | WARN_ON_ONCE(!list_empty(&worker->entry) && |
1464 | (worker->hentry.next || worker->hentry.pprev)); | 1544 | (worker->hentry.next || worker->hentry.pprev))) |
1545 | return; | ||
1465 | 1546 | ||
1466 | /* can't use worker_set_flags(), also called from start_worker() */ | 1547 | /* can't use worker_set_flags(), also called from start_worker() */ |
1467 | worker->flags |= WORKER_IDLE; | 1548 | worker->flags |= WORKER_IDLE; |
@@ -1498,22 +1579,25 @@ static void worker_leave_idle(struct worker *worker) | |||
1498 | { | 1579 | { |
1499 | struct worker_pool *pool = worker->pool; | 1580 | struct worker_pool *pool = worker->pool; |
1500 | 1581 | ||
1501 | BUG_ON(!(worker->flags & WORKER_IDLE)); | 1582 | if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) |
1583 | return; | ||
1502 | worker_clr_flags(worker, WORKER_IDLE); | 1584 | worker_clr_flags(worker, WORKER_IDLE); |
1503 | pool->nr_idle--; | 1585 | pool->nr_idle--; |
1504 | list_del_init(&worker->entry); | 1586 | list_del_init(&worker->entry); |
1505 | } | 1587 | } |
1506 | 1588 | ||
1507 | /** | 1589 | /** |
1508 | * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool | 1590 | * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it |
1509 | * @worker: self | 1591 | * @pool: target worker_pool |
1592 | * | ||
1593 | * Bind %current to the cpu of @pool if it is associated and lock @pool. | ||
1510 | * | 1594 | * |
1511 | * Works which are scheduled while the cpu is online must at least be | 1595 | * Works which are scheduled while the cpu is online must at least be |
1512 | * scheduled to a worker which is bound to the cpu so that if they are | 1596 | * scheduled to a worker which is bound to the cpu so that if they are |
1513 | * flushed from cpu callbacks while cpu is going down, they are | 1597 | * flushed from cpu callbacks while cpu is going down, they are |
1514 | * guaranteed to execute on the cpu. | 1598 | * guaranteed to execute on the cpu. |
1515 | * | 1599 | * |
1516 | * This function is to be used by rogue workers and rescuers to bind | 1600 | * This function is to be used by unbound workers and rescuers to bind |
1517 | * themselves to the target cpu and may race with cpu going down or | 1601 | * themselves to the target cpu and may race with cpu going down or |
1518 | * coming online. kthread_bind() can't be used because it may put the | 1602 | * coming online. kthread_bind() can't be used because it may put the |
1519 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used | 1603 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used |
@@ -1534,12 +1618,9 @@ static void worker_leave_idle(struct worker *worker) | |||
1534 | * %true if the associated pool is online (@worker is successfully | 1618 | * %true if the associated pool is online (@worker is successfully |
1535 | * bound), %false if offline. | 1619 | * bound), %false if offline. |
1536 | */ | 1620 | */ |
1537 | static bool worker_maybe_bind_and_lock(struct worker *worker) | 1621 | static bool worker_maybe_bind_and_lock(struct worker_pool *pool) |
1538 | __acquires(&pool->lock) | 1622 | __acquires(&pool->lock) |
1539 | { | 1623 | { |
1540 | struct worker_pool *pool = worker->pool; | ||
1541 | struct task_struct *task = worker->task; | ||
1542 | |||
1543 | while (true) { | 1624 | while (true) { |
1544 | /* | 1625 | /* |
1545 | * The following call may fail, succeed or succeed | 1626 | * The following call may fail, succeed or succeed |
@@ -1548,14 +1629,13 @@ __acquires(&pool->lock) | |||
1548 | * against POOL_DISASSOCIATED. | 1629 | * against POOL_DISASSOCIATED. |
1549 | */ | 1630 | */ |
1550 | if (!(pool->flags & POOL_DISASSOCIATED)) | 1631 | if (!(pool->flags & POOL_DISASSOCIATED)) |
1551 | set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); | 1632 | set_cpus_allowed_ptr(current, pool->attrs->cpumask); |
1552 | 1633 | ||
1553 | spin_lock_irq(&pool->lock); | 1634 | spin_lock_irq(&pool->lock); |
1554 | if (pool->flags & POOL_DISASSOCIATED) | 1635 | if (pool->flags & POOL_DISASSOCIATED) |
1555 | return false; | 1636 | return false; |
1556 | if (task_cpu(task) == pool->cpu && | 1637 | if (task_cpu(current) == pool->cpu && |
1557 | cpumask_equal(&current->cpus_allowed, | 1638 | cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask)) |
1558 | get_cpu_mask(pool->cpu))) | ||
1559 | return true; | 1639 | return true; |
1560 | spin_unlock_irq(&pool->lock); | 1640 | spin_unlock_irq(&pool->lock); |
1561 | 1641 | ||
@@ -1570,108 +1650,6 @@ __acquires(&pool->lock) | |||
1570 | } | 1650 | } |
1571 | } | 1651 | } |
1572 | 1652 | ||
1573 | /* | ||
1574 | * Rebind an idle @worker to its CPU. worker_thread() will test | ||
1575 | * list_empty(@worker->entry) before leaving idle and call this function. | ||
1576 | */ | ||
1577 | static void idle_worker_rebind(struct worker *worker) | ||
1578 | { | ||
1579 | /* CPU may go down again inbetween, clear UNBOUND only on success */ | ||
1580 | if (worker_maybe_bind_and_lock(worker)) | ||
1581 | worker_clr_flags(worker, WORKER_UNBOUND); | ||
1582 | |||
1583 | /* rebind complete, become available again */ | ||
1584 | list_add(&worker->entry, &worker->pool->idle_list); | ||
1585 | spin_unlock_irq(&worker->pool->lock); | ||
1586 | } | ||
1587 | |||
1588 | /* | ||
1589 | * Function for @worker->rebind.work used to rebind unbound busy workers to | ||
1590 | * the associated cpu which is coming back online. This is scheduled by | ||
1591 | * cpu up but can race with other cpu hotplug operations and may be | ||
1592 | * executed twice without intervening cpu down. | ||
1593 | */ | ||
1594 | static void busy_worker_rebind_fn(struct work_struct *work) | ||
1595 | { | ||
1596 | struct worker *worker = container_of(work, struct worker, rebind_work); | ||
1597 | |||
1598 | if (worker_maybe_bind_and_lock(worker)) | ||
1599 | worker_clr_flags(worker, WORKER_UNBOUND); | ||
1600 | |||
1601 | spin_unlock_irq(&worker->pool->lock); | ||
1602 | } | ||
1603 | |||
1604 | /** | ||
1605 | * rebind_workers - rebind all workers of a pool to the associated CPU | ||
1606 | * @pool: pool of interest | ||
1607 | * | ||
1608 | * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding | ||
1609 | * is different for idle and busy ones. | ||
1610 | * | ||
1611 | * Idle ones will be removed from the idle_list and woken up. They will | ||
1612 | * add themselves back after completing rebind. This ensures that the | ||
1613 | * idle_list doesn't contain any unbound workers when re-bound busy workers | ||
1614 | * try to perform local wake-ups for concurrency management. | ||
1615 | * | ||
1616 | * Busy workers can rebind after they finish their current work items. | ||
1617 | * Queueing the rebind work item at the head of the scheduled list is | ||
1618 | * enough. Note that nr_running will be properly bumped as busy workers | ||
1619 | * rebind. | ||
1620 | * | ||
1621 | * On return, all non-manager workers are scheduled for rebind - see | ||
1622 | * manage_workers() for the manager special case. Any idle worker | ||
1623 | * including the manager will not appear on @idle_list until rebind is | ||
1624 | * complete, making local wake-ups safe. | ||
1625 | */ | ||
1626 | static void rebind_workers(struct worker_pool *pool) | ||
1627 | { | ||
1628 | struct worker *worker, *n; | ||
1629 | int i; | ||
1630 | |||
1631 | lockdep_assert_held(&pool->assoc_mutex); | ||
1632 | lockdep_assert_held(&pool->lock); | ||
1633 | |||
1634 | /* dequeue and kick idle ones */ | ||
1635 | list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { | ||
1636 | /* | ||
1637 | * idle workers should be off @pool->idle_list until rebind | ||
1638 | * is complete to avoid receiving premature local wake-ups. | ||
1639 | */ | ||
1640 | list_del_init(&worker->entry); | ||
1641 | |||
1642 | /* | ||
1643 | * worker_thread() will see the above dequeuing and call | ||
1644 | * idle_worker_rebind(). | ||
1645 | */ | ||
1646 | wake_up_process(worker->task); | ||
1647 | } | ||
1648 | |||
1649 | /* rebind busy workers */ | ||
1650 | for_each_busy_worker(worker, i, pool) { | ||
1651 | struct work_struct *rebind_work = &worker->rebind_work; | ||
1652 | struct workqueue_struct *wq; | ||
1653 | |||
1654 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
1655 | work_data_bits(rebind_work))) | ||
1656 | continue; | ||
1657 | |||
1658 | debug_work_activate(rebind_work); | ||
1659 | |||
1660 | /* | ||
1661 | * wq doesn't really matter but let's keep @worker->pool | ||
1662 | * and @pwq->pool consistent for sanity. | ||
1663 | */ | ||
1664 | if (std_worker_pool_pri(worker->pool)) | ||
1665 | wq = system_highpri_wq; | ||
1666 | else | ||
1667 | wq = system_wq; | ||
1668 | |||
1669 | insert_work(get_pwq(pool->cpu, wq), rebind_work, | ||
1670 | worker->scheduled.next, | ||
1671 | work_color_to_flags(WORK_NO_COLOR)); | ||
1672 | } | ||
1673 | } | ||
1674 | |||
1675 | static struct worker *alloc_worker(void) | 1653 | static struct worker *alloc_worker(void) |
1676 | { | 1654 | { |
1677 | struct worker *worker; | 1655 | struct worker *worker; |
@@ -1680,7 +1658,6 @@ static struct worker *alloc_worker(void) | |||
1680 | if (worker) { | 1658 | if (worker) { |
1681 | INIT_LIST_HEAD(&worker->entry); | 1659 | INIT_LIST_HEAD(&worker->entry); |
1682 | INIT_LIST_HEAD(&worker->scheduled); | 1660 | INIT_LIST_HEAD(&worker->scheduled); |
1683 | INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); | ||
1684 | /* on creation a worker is in !idle && prep state */ | 1661 | /* on creation a worker is in !idle && prep state */ |
1685 | worker->flags = WORKER_PREP; | 1662 | worker->flags = WORKER_PREP; |
1686 | } | 1663 | } |
@@ -1703,18 +1680,25 @@ static struct worker *alloc_worker(void) | |||
1703 | */ | 1680 | */ |
1704 | static struct worker *create_worker(struct worker_pool *pool) | 1681 | static struct worker *create_worker(struct worker_pool *pool) |
1705 | { | 1682 | { |
1706 | const char *pri = std_worker_pool_pri(pool) ? "H" : ""; | ||
1707 | struct worker *worker = NULL; | 1683 | struct worker *worker = NULL; |
1708 | int id = -1; | 1684 | int id = -1; |
1685 | char id_buf[16]; | ||
1686 | |||
1687 | lockdep_assert_held(&pool->manager_mutex); | ||
1709 | 1688 | ||
1689 | /* | ||
1690 | * ID is needed to determine kthread name. Allocate ID first | ||
1691 | * without installing the pointer. | ||
1692 | */ | ||
1693 | idr_preload(GFP_KERNEL); | ||
1710 | spin_lock_irq(&pool->lock); | 1694 | spin_lock_irq(&pool->lock); |
1711 | while (ida_get_new(&pool->worker_ida, &id)) { | 1695 | |
1712 | spin_unlock_irq(&pool->lock); | 1696 | id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); |
1713 | if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) | 1697 | |
1714 | goto fail; | ||
1715 | spin_lock_irq(&pool->lock); | ||
1716 | } | ||
1717 | spin_unlock_irq(&pool->lock); | 1698 | spin_unlock_irq(&pool->lock); |
1699 | idr_preload_end(); | ||
1700 | if (id < 0) | ||
1701 | goto fail; | ||
1718 | 1702 | ||
1719 | worker = alloc_worker(); | 1703 | worker = alloc_worker(); |
1720 | if (!worker) | 1704 | if (!worker) |
@@ -1723,40 +1707,46 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1723 | worker->pool = pool; | 1707 | worker->pool = pool; |
1724 | worker->id = id; | 1708 | worker->id = id; |
1725 | 1709 | ||
1726 | if (pool->cpu != WORK_CPU_UNBOUND) | 1710 | if (pool->cpu >= 0) |
1727 | worker->task = kthread_create_on_node(worker_thread, | 1711 | snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, |
1728 | worker, cpu_to_node(pool->cpu), | 1712 | pool->attrs->nice < 0 ? "H" : ""); |
1729 | "kworker/%u:%d%s", pool->cpu, id, pri); | ||
1730 | else | 1713 | else |
1731 | worker->task = kthread_create(worker_thread, worker, | 1714 | snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); |
1732 | "kworker/u:%d%s", id, pri); | 1715 | |
1716 | worker->task = kthread_create_on_node(worker_thread, worker, pool->node, | ||
1717 | "kworker/%s", id_buf); | ||
1733 | if (IS_ERR(worker->task)) | 1718 | if (IS_ERR(worker->task)) |
1734 | goto fail; | 1719 | goto fail; |
1735 | 1720 | ||
1736 | if (std_worker_pool_pri(pool)) | 1721 | /* |
1737 | set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); | 1722 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any |
1723 | * online CPUs. It'll be re-applied when any of the CPUs come up. | ||
1724 | */ | ||
1725 | set_user_nice(worker->task, pool->attrs->nice); | ||
1726 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); | ||
1727 | |||
1728 | /* prevent userland from meddling with cpumask of workqueue workers */ | ||
1729 | worker->task->flags |= PF_NO_SETAFFINITY; | ||
1738 | 1730 | ||
1739 | /* | 1731 | /* |
1740 | * Determine CPU binding of the new worker depending on | 1732 | * The caller is responsible for ensuring %POOL_DISASSOCIATED |
1741 | * %POOL_DISASSOCIATED. The caller is responsible for ensuring the | 1733 | * remains stable across this function. See the comments above the |
1742 | * flag remains stable across this function. See the comments | 1734 | * flag definition for details. |
1743 | * above the flag definition for details. | ||
1744 | * | ||
1745 | * As an unbound worker may later become a regular one if CPU comes | ||
1746 | * online, make sure every worker has %PF_THREAD_BOUND set. | ||
1747 | */ | 1735 | */ |
1748 | if (!(pool->flags & POOL_DISASSOCIATED)) { | 1736 | if (pool->flags & POOL_DISASSOCIATED) |
1749 | kthread_bind(worker->task, pool->cpu); | ||
1750 | } else { | ||
1751 | worker->task->flags |= PF_THREAD_BOUND; | ||
1752 | worker->flags |= WORKER_UNBOUND; | 1737 | worker->flags |= WORKER_UNBOUND; |
1753 | } | 1738 | |
1739 | /* successful, commit the pointer to idr */ | ||
1740 | spin_lock_irq(&pool->lock); | ||
1741 | idr_replace(&pool->worker_idr, worker, worker->id); | ||
1742 | spin_unlock_irq(&pool->lock); | ||
1754 | 1743 | ||
1755 | return worker; | 1744 | return worker; |
1745 | |||
1756 | fail: | 1746 | fail: |
1757 | if (id >= 0) { | 1747 | if (id >= 0) { |
1758 | spin_lock_irq(&pool->lock); | 1748 | spin_lock_irq(&pool->lock); |
1759 | ida_remove(&pool->worker_ida, id); | 1749 | idr_remove(&pool->worker_idr, id); |
1760 | spin_unlock_irq(&pool->lock); | 1750 | spin_unlock_irq(&pool->lock); |
1761 | } | 1751 | } |
1762 | kfree(worker); | 1752 | kfree(worker); |
@@ -1781,6 +1771,30 @@ static void start_worker(struct worker *worker) | |||
1781 | } | 1771 | } |
1782 | 1772 | ||
1783 | /** | 1773 | /** |
1774 | * create_and_start_worker - create and start a worker for a pool | ||
1775 | * @pool: the target pool | ||
1776 | * | ||
1777 | * Grab the managership of @pool and create and start a new worker for it. | ||
1778 | */ | ||
1779 | static int create_and_start_worker(struct worker_pool *pool) | ||
1780 | { | ||
1781 | struct worker *worker; | ||
1782 | |||
1783 | mutex_lock(&pool->manager_mutex); | ||
1784 | |||
1785 | worker = create_worker(pool); | ||
1786 | if (worker) { | ||
1787 | spin_lock_irq(&pool->lock); | ||
1788 | start_worker(worker); | ||
1789 | spin_unlock_irq(&pool->lock); | ||
1790 | } | ||
1791 | |||
1792 | mutex_unlock(&pool->manager_mutex); | ||
1793 | |||
1794 | return worker ? 0 : -ENOMEM; | ||
1795 | } | ||
1796 | |||
1797 | /** | ||
1784 | * destroy_worker - destroy a workqueue worker | 1798 | * destroy_worker - destroy a workqueue worker |
1785 | * @worker: worker to be destroyed | 1799 | * @worker: worker to be destroyed |
1786 | * | 1800 | * |
@@ -1792,11 +1806,14 @@ static void start_worker(struct worker *worker) | |||
1792 | static void destroy_worker(struct worker *worker) | 1806 | static void destroy_worker(struct worker *worker) |
1793 | { | 1807 | { |
1794 | struct worker_pool *pool = worker->pool; | 1808 | struct worker_pool *pool = worker->pool; |
1795 | int id = worker->id; | 1809 | |
1810 | lockdep_assert_held(&pool->manager_mutex); | ||
1811 | lockdep_assert_held(&pool->lock); | ||
1796 | 1812 | ||
1797 | /* sanity check frenzy */ | 1813 | /* sanity check frenzy */ |
1798 | BUG_ON(worker->current_work); | 1814 | if (WARN_ON(worker->current_work) || |
1799 | BUG_ON(!list_empty(&worker->scheduled)); | 1815 | WARN_ON(!list_empty(&worker->scheduled))) |
1816 | return; | ||
1800 | 1817 | ||
1801 | if (worker->flags & WORKER_STARTED) | 1818 | if (worker->flags & WORKER_STARTED) |
1802 | pool->nr_workers--; | 1819 | pool->nr_workers--; |
@@ -1806,13 +1823,14 @@ static void destroy_worker(struct worker *worker) | |||
1806 | list_del_init(&worker->entry); | 1823 | list_del_init(&worker->entry); |
1807 | worker->flags |= WORKER_DIE; | 1824 | worker->flags |= WORKER_DIE; |
1808 | 1825 | ||
1826 | idr_remove(&pool->worker_idr, worker->id); | ||
1827 | |||
1809 | spin_unlock_irq(&pool->lock); | 1828 | spin_unlock_irq(&pool->lock); |
1810 | 1829 | ||
1811 | kthread_stop(worker->task); | 1830 | kthread_stop(worker->task); |
1812 | kfree(worker); | 1831 | kfree(worker); |
1813 | 1832 | ||
1814 | spin_lock_irq(&pool->lock); | 1833 | spin_lock_irq(&pool->lock); |
1815 | ida_remove(&pool->worker_ida, id); | ||
1816 | } | 1834 | } |
1817 | 1835 | ||
1818 | static void idle_worker_timeout(unsigned long __pool) | 1836 | static void idle_worker_timeout(unsigned long __pool) |
@@ -1841,23 +1859,21 @@ static void idle_worker_timeout(unsigned long __pool) | |||
1841 | spin_unlock_irq(&pool->lock); | 1859 | spin_unlock_irq(&pool->lock); |
1842 | } | 1860 | } |
1843 | 1861 | ||
1844 | static bool send_mayday(struct work_struct *work) | 1862 | static void send_mayday(struct work_struct *work) |
1845 | { | 1863 | { |
1846 | struct pool_workqueue *pwq = get_work_pwq(work); | 1864 | struct pool_workqueue *pwq = get_work_pwq(work); |
1847 | struct workqueue_struct *wq = pwq->wq; | 1865 | struct workqueue_struct *wq = pwq->wq; |
1848 | unsigned int cpu; | ||
1849 | 1866 | ||
1850 | if (!(wq->flags & WQ_RESCUER)) | 1867 | lockdep_assert_held(&wq_mayday_lock); |
1851 | return false; | 1868 | |
1869 | if (!wq->rescuer) | ||
1870 | return; | ||
1852 | 1871 | ||
1853 | /* mayday mayday mayday */ | 1872 | /* mayday mayday mayday */ |
1854 | cpu = pwq->pool->cpu; | 1873 | if (list_empty(&pwq->mayday_node)) { |
1855 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ | 1874 | list_add_tail(&pwq->mayday_node, &wq->maydays); |
1856 | if (cpu == WORK_CPU_UNBOUND) | ||
1857 | cpu = 0; | ||
1858 | if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) | ||
1859 | wake_up_process(wq->rescuer->task); | 1875 | wake_up_process(wq->rescuer->task); |
1860 | return true; | 1876 | } |
1861 | } | 1877 | } |
1862 | 1878 | ||
1863 | static void pool_mayday_timeout(unsigned long __pool) | 1879 | static void pool_mayday_timeout(unsigned long __pool) |
@@ -1865,7 +1881,8 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
1865 | struct worker_pool *pool = (void *)__pool; | 1881 | struct worker_pool *pool = (void *)__pool; |
1866 | struct work_struct *work; | 1882 | struct work_struct *work; |
1867 | 1883 | ||
1868 | spin_lock_irq(&pool->lock); | 1884 | spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ |
1885 | spin_lock(&pool->lock); | ||
1869 | 1886 | ||
1870 | if (need_to_create_worker(pool)) { | 1887 | if (need_to_create_worker(pool)) { |
1871 | /* | 1888 | /* |
@@ -1878,7 +1895,8 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
1878 | send_mayday(work); | 1895 | send_mayday(work); |
1879 | } | 1896 | } |
1880 | 1897 | ||
1881 | spin_unlock_irq(&pool->lock); | 1898 | spin_unlock(&pool->lock); |
1899 | spin_unlock_irq(&wq_mayday_lock); | ||
1882 | 1900 | ||
1883 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); | 1901 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); |
1884 | } | 1902 | } |
@@ -1893,8 +1911,8 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
1893 | * sent to all rescuers with works scheduled on @pool to resolve | 1911 | * sent to all rescuers with works scheduled on @pool to resolve |
1894 | * possible allocation deadlock. | 1912 | * possible allocation deadlock. |
1895 | * | 1913 | * |
1896 | * On return, need_to_create_worker() is guaranteed to be false and | 1914 | * On return, need_to_create_worker() is guaranteed to be %false and |
1897 | * may_start_working() true. | 1915 | * may_start_working() %true. |
1898 | * | 1916 | * |
1899 | * LOCKING: | 1917 | * LOCKING: |
1900 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 1918 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
@@ -1902,7 +1920,7 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
1902 | * manager. | 1920 | * manager. |
1903 | * | 1921 | * |
1904 | * RETURNS: | 1922 | * RETURNS: |
1905 | * false if no action was taken and pool->lock stayed locked, true | 1923 | * %false if no action was taken and pool->lock stayed locked, %true |
1906 | * otherwise. | 1924 | * otherwise. |
1907 | */ | 1925 | */ |
1908 | static bool maybe_create_worker(struct worker_pool *pool) | 1926 | static bool maybe_create_worker(struct worker_pool *pool) |
@@ -1925,7 +1943,8 @@ restart: | |||
1925 | del_timer_sync(&pool->mayday_timer); | 1943 | del_timer_sync(&pool->mayday_timer); |
1926 | spin_lock_irq(&pool->lock); | 1944 | spin_lock_irq(&pool->lock); |
1927 | start_worker(worker); | 1945 | start_worker(worker); |
1928 | BUG_ON(need_to_create_worker(pool)); | 1946 | if (WARN_ON_ONCE(need_to_create_worker(pool))) |
1947 | goto restart; | ||
1929 | return true; | 1948 | return true; |
1930 | } | 1949 | } |
1931 | 1950 | ||
@@ -1958,7 +1977,7 @@ restart: | |||
1958 | * multiple times. Called only from manager. | 1977 | * multiple times. Called only from manager. |
1959 | * | 1978 | * |
1960 | * RETURNS: | 1979 | * RETURNS: |
1961 | * false if no action was taken and pool->lock stayed locked, true | 1980 | * %false if no action was taken and pool->lock stayed locked, %true |
1962 | * otherwise. | 1981 | * otherwise. |
1963 | */ | 1982 | */ |
1964 | static bool maybe_destroy_workers(struct worker_pool *pool) | 1983 | static bool maybe_destroy_workers(struct worker_pool *pool) |
@@ -2009,42 +2028,38 @@ static bool manage_workers(struct worker *worker) | |||
2009 | struct worker_pool *pool = worker->pool; | 2028 | struct worker_pool *pool = worker->pool; |
2010 | bool ret = false; | 2029 | bool ret = false; |
2011 | 2030 | ||
2012 | if (pool->flags & POOL_MANAGING_WORKERS) | 2031 | /* |
2032 | * Managership is governed by two mutexes - manager_arb and | ||
2033 | * manager_mutex. manager_arb handles arbitration of manager role. | ||
2034 | * Anyone who successfully grabs manager_arb wins the arbitration | ||
2035 | * and becomes the manager. mutex_trylock() on pool->manager_arb | ||
2036 | * failure while holding pool->lock reliably indicates that someone | ||
2037 | * else is managing the pool and the worker which failed trylock | ||
2038 | * can proceed to executing work items. This means that anyone | ||
2039 | * grabbing manager_arb is responsible for actually performing | ||
2040 | * manager duties. If manager_arb is grabbed and released without | ||
2041 | * actual management, the pool may stall indefinitely. | ||
2042 | * | ||
2043 | * manager_mutex is used for exclusion of actual management | ||
2044 | * operations. The holder of manager_mutex can be sure that none | ||
2045 | * of management operations, including creation and destruction of | ||
2046 | * workers, won't take place until the mutex is released. Because | ||
2047 | * manager_mutex doesn't interfere with manager role arbitration, | ||
2048 | * it is guaranteed that the pool's management, while may be | ||
2049 | * delayed, won't be disturbed by someone else grabbing | ||
2050 | * manager_mutex. | ||
2051 | */ | ||
2052 | if (!mutex_trylock(&pool->manager_arb)) | ||
2013 | return ret; | 2053 | return ret; |
2014 | 2054 | ||
2015 | pool->flags |= POOL_MANAGING_WORKERS; | ||
2016 | |||
2017 | /* | 2055 | /* |
2018 | * To simplify both worker management and CPU hotplug, hold off | 2056 | * With manager arbitration won, manager_mutex would be free in |
2019 | * management while hotplug is in progress. CPU hotplug path can't | 2057 | * most cases. trylock first without dropping @pool->lock. |
2020 | * grab %POOL_MANAGING_WORKERS to achieve this because that can | ||
2021 | * lead to idle worker depletion (all become busy thinking someone | ||
2022 | * else is managing) which in turn can result in deadlock under | ||
2023 | * extreme circumstances. Use @pool->assoc_mutex to synchronize | ||
2024 | * manager against CPU hotplug. | ||
2025 | * | ||
2026 | * assoc_mutex would always be free unless CPU hotplug is in | ||
2027 | * progress. trylock first without dropping @pool->lock. | ||
2028 | */ | 2058 | */ |
2029 | if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { | 2059 | if (unlikely(!mutex_trylock(&pool->manager_mutex))) { |
2030 | spin_unlock_irq(&pool->lock); | 2060 | spin_unlock_irq(&pool->lock); |
2031 | mutex_lock(&pool->assoc_mutex); | 2061 | mutex_lock(&pool->manager_mutex); |
2032 | /* | 2062 | spin_lock_irq(&pool->lock); |
2033 | * CPU hotplug could have happened while we were waiting | ||
2034 | * for assoc_mutex. Hotplug itself can't handle us | ||
2035 | * because manager isn't either on idle or busy list, and | ||
2036 | * @pool's state and ours could have deviated. | ||
2037 | * | ||
2038 | * As hotplug is now excluded via assoc_mutex, we can | ||
2039 | * simply try to bind. It will succeed or fail depending | ||
2040 | * on @pool's current state. Try it and adjust | ||
2041 | * %WORKER_UNBOUND accordingly. | ||
2042 | */ | ||
2043 | if (worker_maybe_bind_and_lock(worker)) | ||
2044 | worker->flags &= ~WORKER_UNBOUND; | ||
2045 | else | ||
2046 | worker->flags |= WORKER_UNBOUND; | ||
2047 | |||
2048 | ret = true; | 2063 | ret = true; |
2049 | } | 2064 | } |
2050 | 2065 | ||
@@ -2057,8 +2072,8 @@ static bool manage_workers(struct worker *worker) | |||
2057 | ret |= maybe_destroy_workers(pool); | 2072 | ret |= maybe_destroy_workers(pool); |
2058 | ret |= maybe_create_worker(pool); | 2073 | ret |= maybe_create_worker(pool); |
2059 | 2074 | ||
2060 | pool->flags &= ~POOL_MANAGING_WORKERS; | 2075 | mutex_unlock(&pool->manager_mutex); |
2061 | mutex_unlock(&pool->assoc_mutex); | 2076 | mutex_unlock(&pool->manager_arb); |
2062 | return ret; | 2077 | return ret; |
2063 | } | 2078 | } |
2064 | 2079 | ||
@@ -2184,6 +2199,7 @@ __acquires(&pool->lock) | |||
2184 | worker->current_work = NULL; | 2199 | worker->current_work = NULL; |
2185 | worker->current_func = NULL; | 2200 | worker->current_func = NULL; |
2186 | worker->current_pwq = NULL; | 2201 | worker->current_pwq = NULL; |
2202 | worker->desc_valid = false; | ||
2187 | pwq_dec_nr_in_flight(pwq, work_color); | 2203 | pwq_dec_nr_in_flight(pwq, work_color); |
2188 | } | 2204 | } |
2189 | 2205 | ||
@@ -2212,11 +2228,11 @@ static void process_scheduled_works(struct worker *worker) | |||
2212 | * worker_thread - the worker thread function | 2228 | * worker_thread - the worker thread function |
2213 | * @__worker: self | 2229 | * @__worker: self |
2214 | * | 2230 | * |
2215 | * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools | 2231 | * The worker thread function. All workers belong to a worker_pool - |
2216 | * of these per each cpu. These workers process all works regardless of | 2232 | * either a per-cpu one or dynamic unbound one. These workers process all |
2217 | * their specific target workqueue. The only exception is works which | 2233 | * work items regardless of their specific target workqueue. The only |
2218 | * belong to workqueues with a rescuer which will be explained in | 2234 | * exception is work items which belong to workqueues with a rescuer which |
2219 | * rescuer_thread(). | 2235 | * will be explained in rescuer_thread(). |
2220 | */ | 2236 | */ |
2221 | static int worker_thread(void *__worker) | 2237 | static int worker_thread(void *__worker) |
2222 | { | 2238 | { |
@@ -2228,19 +2244,12 @@ static int worker_thread(void *__worker) | |||
2228 | woke_up: | 2244 | woke_up: |
2229 | spin_lock_irq(&pool->lock); | 2245 | spin_lock_irq(&pool->lock); |
2230 | 2246 | ||
2231 | /* we are off idle list if destruction or rebind is requested */ | 2247 | /* am I supposed to die? */ |
2232 | if (unlikely(list_empty(&worker->entry))) { | 2248 | if (unlikely(worker->flags & WORKER_DIE)) { |
2233 | spin_unlock_irq(&pool->lock); | 2249 | spin_unlock_irq(&pool->lock); |
2234 | 2250 | WARN_ON_ONCE(!list_empty(&worker->entry)); | |
2235 | /* if DIE is set, destruction is requested */ | 2251 | worker->task->flags &= ~PF_WQ_WORKER; |
2236 | if (worker->flags & WORKER_DIE) { | 2252 | return 0; |
2237 | worker->task->flags &= ~PF_WQ_WORKER; | ||
2238 | return 0; | ||
2239 | } | ||
2240 | |||
2241 | /* otherwise, rebind */ | ||
2242 | idle_worker_rebind(worker); | ||
2243 | goto woke_up; | ||
2244 | } | 2253 | } |
2245 | 2254 | ||
2246 | worker_leave_idle(worker); | 2255 | worker_leave_idle(worker); |
@@ -2258,14 +2267,16 @@ recheck: | |||
2258 | * preparing to process a work or actually processing it. | 2267 | * preparing to process a work or actually processing it. |
2259 | * Make sure nobody diddled with it while I was sleeping. | 2268 | * Make sure nobody diddled with it while I was sleeping. |
2260 | */ | 2269 | */ |
2261 | BUG_ON(!list_empty(&worker->scheduled)); | 2270 | WARN_ON_ONCE(!list_empty(&worker->scheduled)); |
2262 | 2271 | ||
2263 | /* | 2272 | /* |
2264 | * When control reaches this point, we're guaranteed to have | 2273 | * Finish PREP stage. We're guaranteed to have at least one idle |
2265 | * at least one idle worker or that someone else has already | 2274 | * worker or that someone else has already assumed the manager |
2266 | * assumed the manager role. | 2275 | * role. This is where @worker starts participating in concurrency |
2276 | * management if applicable and concurrency management is restored | ||
2277 | * after being rebound. See rebind_workers() for details. | ||
2267 | */ | 2278 | */ |
2268 | worker_clr_flags(worker, WORKER_PREP); | 2279 | worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); |
2269 | 2280 | ||
2270 | do { | 2281 | do { |
2271 | struct work_struct *work = | 2282 | struct work_struct *work = |
@@ -2307,7 +2318,7 @@ sleep: | |||
2307 | * @__rescuer: self | 2318 | * @__rescuer: self |
2308 | * | 2319 | * |
2309 | * Workqueue rescuer thread function. There's one rescuer for each | 2320 | * Workqueue rescuer thread function. There's one rescuer for each |
2310 | * workqueue which has WQ_RESCUER set. | 2321 | * workqueue which has WQ_MEM_RECLAIM set. |
2311 | * | 2322 | * |
2312 | * Regular work processing on a pool may block trying to create a new | 2323 | * Regular work processing on a pool may block trying to create a new |
2313 | * worker which uses GFP_KERNEL allocation which has slight chance of | 2324 | * worker which uses GFP_KERNEL allocation which has slight chance of |
@@ -2326,8 +2337,6 @@ static int rescuer_thread(void *__rescuer) | |||
2326 | struct worker *rescuer = __rescuer; | 2337 | struct worker *rescuer = __rescuer; |
2327 | struct workqueue_struct *wq = rescuer->rescue_wq; | 2338 | struct workqueue_struct *wq = rescuer->rescue_wq; |
2328 | struct list_head *scheduled = &rescuer->scheduled; | 2339 | struct list_head *scheduled = &rescuer->scheduled; |
2329 | bool is_unbound = wq->flags & WQ_UNBOUND; | ||
2330 | unsigned int cpu; | ||
2331 | 2340 | ||
2332 | set_user_nice(current, RESCUER_NICE_LEVEL); | 2341 | set_user_nice(current, RESCUER_NICE_LEVEL); |
2333 | 2342 | ||
@@ -2345,28 +2354,29 @@ repeat: | |||
2345 | return 0; | 2354 | return 0; |
2346 | } | 2355 | } |
2347 | 2356 | ||
2348 | /* | 2357 | /* see whether any pwq is asking for help */ |
2349 | * See whether any cpu is asking for help. Unbounded | 2358 | spin_lock_irq(&wq_mayday_lock); |
2350 | * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. | 2359 | |
2351 | */ | 2360 | while (!list_empty(&wq->maydays)) { |
2352 | for_each_mayday_cpu(cpu, wq->mayday_mask) { | 2361 | struct pool_workqueue *pwq = list_first_entry(&wq->maydays, |
2353 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; | 2362 | struct pool_workqueue, mayday_node); |
2354 | struct pool_workqueue *pwq = get_pwq(tcpu, wq); | ||
2355 | struct worker_pool *pool = pwq->pool; | 2363 | struct worker_pool *pool = pwq->pool; |
2356 | struct work_struct *work, *n; | 2364 | struct work_struct *work, *n; |
2357 | 2365 | ||
2358 | __set_current_state(TASK_RUNNING); | 2366 | __set_current_state(TASK_RUNNING); |
2359 | mayday_clear_cpu(cpu, wq->mayday_mask); | 2367 | list_del_init(&pwq->mayday_node); |
2368 | |||
2369 | spin_unlock_irq(&wq_mayday_lock); | ||
2360 | 2370 | ||
2361 | /* migrate to the target cpu if possible */ | 2371 | /* migrate to the target cpu if possible */ |
2372 | worker_maybe_bind_and_lock(pool); | ||
2362 | rescuer->pool = pool; | 2373 | rescuer->pool = pool; |
2363 | worker_maybe_bind_and_lock(rescuer); | ||
2364 | 2374 | ||
2365 | /* | 2375 | /* |
2366 | * Slurp in all works issued via this workqueue and | 2376 | * Slurp in all works issued via this workqueue and |
2367 | * process'em. | 2377 | * process'em. |
2368 | */ | 2378 | */ |
2369 | BUG_ON(!list_empty(&rescuer->scheduled)); | 2379 | WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); |
2370 | list_for_each_entry_safe(work, n, &pool->worklist, entry) | 2380 | list_for_each_entry_safe(work, n, &pool->worklist, entry) |
2371 | if (get_work_pwq(work) == pwq) | 2381 | if (get_work_pwq(work) == pwq) |
2372 | move_linked_works(work, scheduled, &n); | 2382 | move_linked_works(work, scheduled, &n); |
@@ -2381,9 +2391,13 @@ repeat: | |||
2381 | if (keep_working(pool)) | 2391 | if (keep_working(pool)) |
2382 | wake_up_worker(pool); | 2392 | wake_up_worker(pool); |
2383 | 2393 | ||
2384 | spin_unlock_irq(&pool->lock); | 2394 | rescuer->pool = NULL; |
2395 | spin_unlock(&pool->lock); | ||
2396 | spin_lock(&wq_mayday_lock); | ||
2385 | } | 2397 | } |
2386 | 2398 | ||
2399 | spin_unlock_irq(&wq_mayday_lock); | ||
2400 | |||
2387 | /* rescuers should never participate in concurrency management */ | 2401 | /* rescuers should never participate in concurrency management */ |
2388 | WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); | 2402 | WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); |
2389 | schedule(); | 2403 | schedule(); |
@@ -2487,7 +2501,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
2487 | * advanced to @work_color. | 2501 | * advanced to @work_color. |
2488 | * | 2502 | * |
2489 | * CONTEXT: | 2503 | * CONTEXT: |
2490 | * mutex_lock(wq->flush_mutex). | 2504 | * mutex_lock(wq->mutex). |
2491 | * | 2505 | * |
2492 | * RETURNS: | 2506 | * RETURNS: |
2493 | * %true if @flush_color >= 0 and there's something to flush. %false | 2507 | * %true if @flush_color >= 0 and there's something to flush. %false |
@@ -2497,21 +2511,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, | |||
2497 | int flush_color, int work_color) | 2511 | int flush_color, int work_color) |
2498 | { | 2512 | { |
2499 | bool wait = false; | 2513 | bool wait = false; |
2500 | unsigned int cpu; | 2514 | struct pool_workqueue *pwq; |
2501 | 2515 | ||
2502 | if (flush_color >= 0) { | 2516 | if (flush_color >= 0) { |
2503 | BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); | 2517 | WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); |
2504 | atomic_set(&wq->nr_pwqs_to_flush, 1); | 2518 | atomic_set(&wq->nr_pwqs_to_flush, 1); |
2505 | } | 2519 | } |
2506 | 2520 | ||
2507 | for_each_pwq_cpu(cpu, wq) { | 2521 | for_each_pwq(pwq, wq) { |
2508 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | ||
2509 | struct worker_pool *pool = pwq->pool; | 2522 | struct worker_pool *pool = pwq->pool; |
2510 | 2523 | ||
2511 | spin_lock_irq(&pool->lock); | 2524 | spin_lock_irq(&pool->lock); |
2512 | 2525 | ||
2513 | if (flush_color >= 0) { | 2526 | if (flush_color >= 0) { |
2514 | BUG_ON(pwq->flush_color != -1); | 2527 | WARN_ON_ONCE(pwq->flush_color != -1); |
2515 | 2528 | ||
2516 | if (pwq->nr_in_flight[flush_color]) { | 2529 | if (pwq->nr_in_flight[flush_color]) { |
2517 | pwq->flush_color = flush_color; | 2530 | pwq->flush_color = flush_color; |
@@ -2521,7 +2534,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, | |||
2521 | } | 2534 | } |
2522 | 2535 | ||
2523 | if (work_color >= 0) { | 2536 | if (work_color >= 0) { |
2524 | BUG_ON(work_color != work_next_color(pwq->work_color)); | 2537 | WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); |
2525 | pwq->work_color = work_color; | 2538 | pwq->work_color = work_color; |
2526 | } | 2539 | } |
2527 | 2540 | ||
@@ -2538,11 +2551,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, | |||
2538 | * flush_workqueue - ensure that any scheduled work has run to completion. | 2551 | * flush_workqueue - ensure that any scheduled work has run to completion. |
2539 | * @wq: workqueue to flush | 2552 | * @wq: workqueue to flush |
2540 | * | 2553 | * |
2541 | * Forces execution of the workqueue and blocks until its completion. | 2554 | * This function sleeps until all work items which were queued on entry |
2542 | * This is typically used in driver shutdown handlers. | 2555 | * have finished execution, but it is not livelocked by new incoming ones. |
2543 | * | ||
2544 | * We sleep until all works which were queued on entry have been handled, | ||
2545 | * but we are not livelocked by new incoming ones. | ||
2546 | */ | 2556 | */ |
2547 | void flush_workqueue(struct workqueue_struct *wq) | 2557 | void flush_workqueue(struct workqueue_struct *wq) |
2548 | { | 2558 | { |
@@ -2556,7 +2566,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2556 | lock_map_acquire(&wq->lockdep_map); | 2566 | lock_map_acquire(&wq->lockdep_map); |
2557 | lock_map_release(&wq->lockdep_map); | 2567 | lock_map_release(&wq->lockdep_map); |
2558 | 2568 | ||
2559 | mutex_lock(&wq->flush_mutex); | 2569 | mutex_lock(&wq->mutex); |
2560 | 2570 | ||
2561 | /* | 2571 | /* |
2562 | * Start-to-wait phase | 2572 | * Start-to-wait phase |
@@ -2569,13 +2579,13 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2569 | * becomes our flush_color and work_color is advanced | 2579 | * becomes our flush_color and work_color is advanced |
2570 | * by one. | 2580 | * by one. |
2571 | */ | 2581 | */ |
2572 | BUG_ON(!list_empty(&wq->flusher_overflow)); | 2582 | WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); |
2573 | this_flusher.flush_color = wq->work_color; | 2583 | this_flusher.flush_color = wq->work_color; |
2574 | wq->work_color = next_color; | 2584 | wq->work_color = next_color; |
2575 | 2585 | ||
2576 | if (!wq->first_flusher) { | 2586 | if (!wq->first_flusher) { |
2577 | /* no flush in progress, become the first flusher */ | 2587 | /* no flush in progress, become the first flusher */ |
2578 | BUG_ON(wq->flush_color != this_flusher.flush_color); | 2588 | WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); |
2579 | 2589 | ||
2580 | wq->first_flusher = &this_flusher; | 2590 | wq->first_flusher = &this_flusher; |
2581 | 2591 | ||
@@ -2588,7 +2598,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2588 | } | 2598 | } |
2589 | } else { | 2599 | } else { |
2590 | /* wait in queue */ | 2600 | /* wait in queue */ |
2591 | BUG_ON(wq->flush_color == this_flusher.flush_color); | 2601 | WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); |
2592 | list_add_tail(&this_flusher.list, &wq->flusher_queue); | 2602 | list_add_tail(&this_flusher.list, &wq->flusher_queue); |
2593 | flush_workqueue_prep_pwqs(wq, -1, wq->work_color); | 2603 | flush_workqueue_prep_pwqs(wq, -1, wq->work_color); |
2594 | } | 2604 | } |
@@ -2601,7 +2611,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2601 | list_add_tail(&this_flusher.list, &wq->flusher_overflow); | 2611 | list_add_tail(&this_flusher.list, &wq->flusher_overflow); |
2602 | } | 2612 | } |
2603 | 2613 | ||
2604 | mutex_unlock(&wq->flush_mutex); | 2614 | mutex_unlock(&wq->mutex); |
2605 | 2615 | ||
2606 | wait_for_completion(&this_flusher.done); | 2616 | wait_for_completion(&this_flusher.done); |
2607 | 2617 | ||
@@ -2614,7 +2624,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2614 | if (wq->first_flusher != &this_flusher) | 2624 | if (wq->first_flusher != &this_flusher) |
2615 | return; | 2625 | return; |
2616 | 2626 | ||
2617 | mutex_lock(&wq->flush_mutex); | 2627 | mutex_lock(&wq->mutex); |
2618 | 2628 | ||
2619 | /* we might have raced, check again with mutex held */ | 2629 | /* we might have raced, check again with mutex held */ |
2620 | if (wq->first_flusher != &this_flusher) | 2630 | if (wq->first_flusher != &this_flusher) |
@@ -2622,8 +2632,8 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2622 | 2632 | ||
2623 | wq->first_flusher = NULL; | 2633 | wq->first_flusher = NULL; |
2624 | 2634 | ||
2625 | BUG_ON(!list_empty(&this_flusher.list)); | 2635 | WARN_ON_ONCE(!list_empty(&this_flusher.list)); |
2626 | BUG_ON(wq->flush_color != this_flusher.flush_color); | 2636 | WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); |
2627 | 2637 | ||
2628 | while (true) { | 2638 | while (true) { |
2629 | struct wq_flusher *next, *tmp; | 2639 | struct wq_flusher *next, *tmp; |
@@ -2636,8 +2646,8 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2636 | complete(&next->done); | 2646 | complete(&next->done); |
2637 | } | 2647 | } |
2638 | 2648 | ||
2639 | BUG_ON(!list_empty(&wq->flusher_overflow) && | 2649 | WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && |
2640 | wq->flush_color != work_next_color(wq->work_color)); | 2650 | wq->flush_color != work_next_color(wq->work_color)); |
2641 | 2651 | ||
2642 | /* this flush_color is finished, advance by one */ | 2652 | /* this flush_color is finished, advance by one */ |
2643 | wq->flush_color = work_next_color(wq->flush_color); | 2653 | wq->flush_color = work_next_color(wq->flush_color); |
@@ -2661,7 +2671,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2661 | } | 2671 | } |
2662 | 2672 | ||
2663 | if (list_empty(&wq->flusher_queue)) { | 2673 | if (list_empty(&wq->flusher_queue)) { |
2664 | BUG_ON(wq->flush_color != wq->work_color); | 2674 | WARN_ON_ONCE(wq->flush_color != wq->work_color); |
2665 | break; | 2675 | break; |
2666 | } | 2676 | } |
2667 | 2677 | ||
@@ -2669,8 +2679,8 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2669 | * Need to flush more colors. Make the next flusher | 2679 | * Need to flush more colors. Make the next flusher |
2670 | * the new first flusher and arm pwqs. | 2680 | * the new first flusher and arm pwqs. |
2671 | */ | 2681 | */ |
2672 | BUG_ON(wq->flush_color == wq->work_color); | 2682 | WARN_ON_ONCE(wq->flush_color == wq->work_color); |
2673 | BUG_ON(wq->flush_color != next->flush_color); | 2683 | WARN_ON_ONCE(wq->flush_color != next->flush_color); |
2674 | 2684 | ||
2675 | list_del_init(&next->list); | 2685 | list_del_init(&next->list); |
2676 | wq->first_flusher = next; | 2686 | wq->first_flusher = next; |
@@ -2686,7 +2696,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2686 | } | 2696 | } |
2687 | 2697 | ||
2688 | out_unlock: | 2698 | out_unlock: |
2689 | mutex_unlock(&wq->flush_mutex); | 2699 | mutex_unlock(&wq->mutex); |
2690 | } | 2700 | } |
2691 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2701 | EXPORT_SYMBOL_GPL(flush_workqueue); |
2692 | 2702 | ||
@@ -2704,22 +2714,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue); | |||
2704 | void drain_workqueue(struct workqueue_struct *wq) | 2714 | void drain_workqueue(struct workqueue_struct *wq) |
2705 | { | 2715 | { |
2706 | unsigned int flush_cnt = 0; | 2716 | unsigned int flush_cnt = 0; |
2707 | unsigned int cpu; | 2717 | struct pool_workqueue *pwq; |
2708 | 2718 | ||
2709 | /* | 2719 | /* |
2710 | * __queue_work() needs to test whether there are drainers, is much | 2720 | * __queue_work() needs to test whether there are drainers, is much |
2711 | * hotter than drain_workqueue() and already looks at @wq->flags. | 2721 | * hotter than drain_workqueue() and already looks at @wq->flags. |
2712 | * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. | 2722 | * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. |
2713 | */ | 2723 | */ |
2714 | spin_lock(&workqueue_lock); | 2724 | mutex_lock(&wq->mutex); |
2715 | if (!wq->nr_drainers++) | 2725 | if (!wq->nr_drainers++) |
2716 | wq->flags |= WQ_DRAINING; | 2726 | wq->flags |= __WQ_DRAINING; |
2717 | spin_unlock(&workqueue_lock); | 2727 | mutex_unlock(&wq->mutex); |
2718 | reflush: | 2728 | reflush: |
2719 | flush_workqueue(wq); | 2729 | flush_workqueue(wq); |
2720 | 2730 | ||
2721 | for_each_pwq_cpu(cpu, wq) { | 2731 | mutex_lock(&wq->mutex); |
2722 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | 2732 | |
2733 | for_each_pwq(pwq, wq) { | ||
2723 | bool drained; | 2734 | bool drained; |
2724 | 2735 | ||
2725 | spin_lock_irq(&pwq->pool->lock); | 2736 | spin_lock_irq(&pwq->pool->lock); |
@@ -2731,15 +2742,16 @@ reflush: | |||
2731 | 2742 | ||
2732 | if (++flush_cnt == 10 || | 2743 | if (++flush_cnt == 10 || |
2733 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | 2744 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) |
2734 | pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", | 2745 | pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", |
2735 | wq->name, flush_cnt); | 2746 | wq->name, flush_cnt); |
2747 | |||
2748 | mutex_unlock(&wq->mutex); | ||
2736 | goto reflush; | 2749 | goto reflush; |
2737 | } | 2750 | } |
2738 | 2751 | ||
2739 | spin_lock(&workqueue_lock); | ||
2740 | if (!--wq->nr_drainers) | 2752 | if (!--wq->nr_drainers) |
2741 | wq->flags &= ~WQ_DRAINING; | 2753 | wq->flags &= ~__WQ_DRAINING; |
2742 | spin_unlock(&workqueue_lock); | 2754 | mutex_unlock(&wq->mutex); |
2743 | } | 2755 | } |
2744 | EXPORT_SYMBOL_GPL(drain_workqueue); | 2756 | EXPORT_SYMBOL_GPL(drain_workqueue); |
2745 | 2757 | ||
@@ -2750,11 +2762,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) | |||
2750 | struct pool_workqueue *pwq; | 2762 | struct pool_workqueue *pwq; |
2751 | 2763 | ||
2752 | might_sleep(); | 2764 | might_sleep(); |
2765 | |||
2766 | local_irq_disable(); | ||
2753 | pool = get_work_pool(work); | 2767 | pool = get_work_pool(work); |
2754 | if (!pool) | 2768 | if (!pool) { |
2769 | local_irq_enable(); | ||
2755 | return false; | 2770 | return false; |
2771 | } | ||
2756 | 2772 | ||
2757 | spin_lock_irq(&pool->lock); | 2773 | spin_lock(&pool->lock); |
2758 | /* see the comment in try_to_grab_pending() with the same code */ | 2774 | /* see the comment in try_to_grab_pending() with the same code */ |
2759 | pwq = get_work_pwq(work); | 2775 | pwq = get_work_pwq(work); |
2760 | if (pwq) { | 2776 | if (pwq) { |
@@ -2776,7 +2792,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) | |||
2776 | * flusher is not running on the same workqueue by verifying write | 2792 | * flusher is not running on the same workqueue by verifying write |
2777 | * access. | 2793 | * access. |
2778 | */ | 2794 | */ |
2779 | if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) | 2795 | if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) |
2780 | lock_map_acquire(&pwq->wq->lockdep_map); | 2796 | lock_map_acquire(&pwq->wq->lockdep_map); |
2781 | else | 2797 | else |
2782 | lock_map_acquire_read(&pwq->wq->lockdep_map); | 2798 | lock_map_acquire_read(&pwq->wq->lockdep_map); |
@@ -2933,66 +2949,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) | |||
2933 | EXPORT_SYMBOL(cancel_delayed_work_sync); | 2949 | EXPORT_SYMBOL(cancel_delayed_work_sync); |
2934 | 2950 | ||
2935 | /** | 2951 | /** |
2936 | * schedule_work_on - put work task on a specific cpu | ||
2937 | * @cpu: cpu to put the work task on | ||
2938 | * @work: job to be done | ||
2939 | * | ||
2940 | * This puts a job on a specific cpu | ||
2941 | */ | ||
2942 | bool schedule_work_on(int cpu, struct work_struct *work) | ||
2943 | { | ||
2944 | return queue_work_on(cpu, system_wq, work); | ||
2945 | } | ||
2946 | EXPORT_SYMBOL(schedule_work_on); | ||
2947 | |||
2948 | /** | ||
2949 | * schedule_work - put work task in global workqueue | ||
2950 | * @work: job to be done | ||
2951 | * | ||
2952 | * Returns %false if @work was already on the kernel-global workqueue and | ||
2953 | * %true otherwise. | ||
2954 | * | ||
2955 | * This puts a job in the kernel-global workqueue if it was not already | ||
2956 | * queued and leaves it in the same position on the kernel-global | ||
2957 | * workqueue otherwise. | ||
2958 | */ | ||
2959 | bool schedule_work(struct work_struct *work) | ||
2960 | { | ||
2961 | return queue_work(system_wq, work); | ||
2962 | } | ||
2963 | EXPORT_SYMBOL(schedule_work); | ||
2964 | |||
2965 | /** | ||
2966 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | ||
2967 | * @cpu: cpu to use | ||
2968 | * @dwork: job to be done | ||
2969 | * @delay: number of jiffies to wait | ||
2970 | * | ||
2971 | * After waiting for a given time this puts a job in the kernel-global | ||
2972 | * workqueue on the specified CPU. | ||
2973 | */ | ||
2974 | bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, | ||
2975 | unsigned long delay) | ||
2976 | { | ||
2977 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); | ||
2978 | } | ||
2979 | EXPORT_SYMBOL(schedule_delayed_work_on); | ||
2980 | |||
2981 | /** | ||
2982 | * schedule_delayed_work - put work task in global workqueue after delay | ||
2983 | * @dwork: job to be done | ||
2984 | * @delay: number of jiffies to wait or 0 for immediate execution | ||
2985 | * | ||
2986 | * After waiting for a given time this puts a job in the kernel-global | ||
2987 | * workqueue. | ||
2988 | */ | ||
2989 | bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) | ||
2990 | { | ||
2991 | return queue_delayed_work(system_wq, dwork, delay); | ||
2992 | } | ||
2993 | EXPORT_SYMBOL(schedule_delayed_work); | ||
2994 | |||
2995 | /** | ||
2996 | * schedule_on_each_cpu - execute a function synchronously on each online CPU | 2952 | * schedule_on_each_cpu - execute a function synchronously on each online CPU |
2997 | * @func: the function to call | 2953 | * @func: the function to call |
2998 | * | 2954 | * |
@@ -3085,51 +3041,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) | |||
3085 | } | 3041 | } |
3086 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 3042 | EXPORT_SYMBOL_GPL(execute_in_process_context); |
3087 | 3043 | ||
3088 | int keventd_up(void) | 3044 | #ifdef CONFIG_SYSFS |
3045 | /* | ||
3046 | * Workqueues with WQ_SYSFS flag set are visible to userland via | |
3047 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
3048 | * following attributes. | ||
3049 | * | ||
3050 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
3051 | * max_active RW int : maximum number of in-flight work items | ||
3052 | * | ||
3053 | * Unbound workqueues have the following extra attributes. | ||
3054 | * | ||
3055 | * pool_ids RO int : the associated pool IDs for each node | |
3056 | * nice RW int : nice value of the workers | ||
3057 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
3058 | */ | ||
3059 | struct wq_device { | ||
3060 | struct workqueue_struct *wq; | ||
3061 | struct device dev; | ||
3062 | }; | ||
3063 | |||
3064 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
3065 | { | ||
3066 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3067 | |||
3068 | return wq_dev->wq; | ||
3069 | } | ||
3070 | |||
3071 | static ssize_t wq_per_cpu_show(struct device *dev, | ||
3072 | struct device_attribute *attr, char *buf) | ||
3073 | { | ||
3074 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3075 | |||
3076 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
3077 | } | ||
3078 | |||
3079 | static ssize_t wq_max_active_show(struct device *dev, | ||
3080 | struct device_attribute *attr, char *buf) | ||
3089 | { | 3081 | { |
3090 | return system_wq != NULL; | 3082 | struct workqueue_struct *wq = dev_to_wq(dev); |
3083 | |||
3084 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
3091 | } | 3085 | } |
3092 | 3086 | ||
3093 | static int alloc_pwqs(struct workqueue_struct *wq) | 3087 | static ssize_t wq_max_active_store(struct device *dev, |
3088 | struct device_attribute *attr, | ||
3089 | const char *buf, size_t count) | ||
3094 | { | 3090 | { |
3091 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3092 | int val; | ||
3093 | |||
3094 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
3095 | return -EINVAL; | ||
3096 | |||
3097 | workqueue_set_max_active(wq, val); | ||
3098 | return count; | ||
3099 | } | ||
3100 | |||
3101 | static struct device_attribute wq_sysfs_attrs[] = { | ||
3102 | __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), | ||
3103 | __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), | ||
3104 | __ATTR_NULL, | ||
3105 | }; | ||
3106 | |||
3107 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
3108 | struct device_attribute *attr, char *buf) | ||
3109 | { | ||
3110 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3111 | const char *delim = ""; | ||
3112 | int node, written = 0; | ||
3113 | |||
3114 | rcu_read_lock_sched(); | ||
3115 | for_each_node(node) { | ||
3116 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
3117 | "%s%d:%d", delim, node, | ||
3118 | unbound_pwq_by_node(wq, node)->pool->id); | ||
3119 | delim = " "; | ||
3120 | } | ||
3121 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
3122 | rcu_read_unlock_sched(); | ||
3123 | |||
3124 | return written; | ||
3125 | } | ||
3126 | |||
3127 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
3128 | char *buf) | ||
3129 | { | ||
3130 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3131 | int written; | ||
3132 | |||
3133 | mutex_lock(&wq->mutex); | ||
3134 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
3135 | mutex_unlock(&wq->mutex); | ||
3136 | |||
3137 | return written; | ||
3138 | } | ||
3139 | |||
3140 | /* prepare workqueue_attrs for sysfs store operations */ | ||
3141 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
3142 | { | ||
3143 | struct workqueue_attrs *attrs; | ||
3144 | |||
3145 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
3146 | if (!attrs) | ||
3147 | return NULL; | ||
3148 | |||
3149 | mutex_lock(&wq->mutex); | ||
3150 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
3151 | mutex_unlock(&wq->mutex); | ||
3152 | return attrs; | ||
3153 | } | ||
3154 | |||
3155 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
3156 | const char *buf, size_t count) | ||
3157 | { | ||
3158 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3159 | struct workqueue_attrs *attrs; | ||
3160 | int ret; | ||
3161 | |||
3162 | attrs = wq_sysfs_prep_attrs(wq); | ||
3163 | if (!attrs) | ||
3164 | return -ENOMEM; | ||
3165 | |||
3166 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
3167 | attrs->nice >= -20 && attrs->nice <= 19) | ||
3168 | ret = apply_workqueue_attrs(wq, attrs); | ||
3169 | else | ||
3170 | ret = -EINVAL; | ||
3171 | |||
3172 | free_workqueue_attrs(attrs); | ||
3173 | return ret ?: count; | ||
3174 | } | ||
3175 | |||
3176 | static ssize_t wq_cpumask_show(struct device *dev, | ||
3177 | struct device_attribute *attr, char *buf) | ||
3178 | { | ||
3179 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3180 | int written; | ||
3181 | |||
3182 | mutex_lock(&wq->mutex); | ||
3183 | written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); | ||
3184 | mutex_unlock(&wq->mutex); | ||
3185 | |||
3186 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
3187 | return written; | ||
3188 | } | ||
3189 | |||
3190 | static ssize_t wq_cpumask_store(struct device *dev, | ||
3191 | struct device_attribute *attr, | ||
3192 | const char *buf, size_t count) | ||
3193 | { | ||
3194 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3195 | struct workqueue_attrs *attrs; | ||
3196 | int ret; | ||
3197 | |||
3198 | attrs = wq_sysfs_prep_attrs(wq); | ||
3199 | if (!attrs) | ||
3200 | return -ENOMEM; | ||
3201 | |||
3202 | ret = cpumask_parse(buf, attrs->cpumask); | ||
3203 | if (!ret) | ||
3204 | ret = apply_workqueue_attrs(wq, attrs); | ||
3205 | |||
3206 | free_workqueue_attrs(attrs); | ||
3207 | return ret ?: count; | ||
3208 | } | ||
3209 | |||
3210 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
3211 | char *buf) | ||
3212 | { | ||
3213 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3214 | int written; | ||
3215 | |||
3216 | mutex_lock(&wq->mutex); | ||
3217 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
3218 | !wq->unbound_attrs->no_numa); | ||
3219 | mutex_unlock(&wq->mutex); | ||
3220 | |||
3221 | return written; | ||
3222 | } | ||
3223 | |||
3224 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
3225 | const char *buf, size_t count) | ||
3226 | { | ||
3227 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3228 | struct workqueue_attrs *attrs; | ||
3229 | int v, ret; | ||
3230 | |||
3231 | attrs = wq_sysfs_prep_attrs(wq); | ||
3232 | if (!attrs) | ||
3233 | return -ENOMEM; | ||
3234 | |||
3235 | ret = -EINVAL; | ||
3236 | if (sscanf(buf, "%d", &v) == 1) { | ||
3237 | attrs->no_numa = !v; | ||
3238 | ret = apply_workqueue_attrs(wq, attrs); | ||
3239 | } | ||
3240 | |||
3241 | free_workqueue_attrs(attrs); | ||
3242 | return ret ?: count; | ||
3243 | } | ||
3244 | |||
3245 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
3246 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
3247 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
3248 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
3249 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
3250 | __ATTR_NULL, | ||
3251 | }; | ||
3252 | |||
3253 | static struct bus_type wq_subsys = { | ||
3254 | .name = "workqueue", | ||
3255 | .dev_attrs = wq_sysfs_attrs, | ||
3256 | }; | ||
3257 | |||
3258 | static int __init wq_sysfs_init(void) | ||
3259 | { | ||
3260 | return subsys_virtual_register(&wq_subsys, NULL); | ||
3261 | } | ||
3262 | core_initcall(wq_sysfs_init); | ||
3263 | |||
3264 | static void wq_device_release(struct device *dev) | ||
3265 | { | ||
3266 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3267 | |||
3268 | kfree(wq_dev); | ||
3269 | } | ||
3270 | |||
3271 | /** | ||
3272 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
3273 | * @wq: the workqueue to register | ||
3274 | * | ||
3275 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
3276 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
3277 | * which is the preferred method. | ||
3278 | * | ||
3279 | * Workqueue user should use this function directly iff it wants to apply | ||
3280 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
3281 | * apply_workqueue_attrs() may race against userland updating the | ||
3282 | * attributes. | ||
3283 | * | ||
3284 | * Returns 0 on success, -errno on failure. | ||
3285 | */ | ||
3286 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
3287 | { | ||
3288 | struct wq_device *wq_dev; | ||
3289 | int ret; | ||
3290 | |||
3095 | /* | 3291 | /* |
3096 | * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. | 3292 | * Adjusting max_active or creating new pwqs by applying |
3097 | * Make sure that the alignment isn't lower than that of | 3293 | * attributes breaks ordering guarantee. Disallow exposing ordered |
3098 | * unsigned long long. | 3294 | * workqueues. |
3099 | */ | 3295 | */ |
3100 | const size_t size = sizeof(struct pool_workqueue); | 3296 | if (WARN_ON(wq->flags & __WQ_ORDERED)) |
3101 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, | 3297 | return -EINVAL; |
3102 | __alignof__(unsigned long long)); | ||
3103 | 3298 | ||
3104 | if (!(wq->flags & WQ_UNBOUND)) | 3299 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); |
3105 | wq->pool_wq.pcpu = __alloc_percpu(size, align); | 3300 | if (!wq_dev) |
3106 | else { | 3301 | return -ENOMEM; |
3107 | void *ptr; | 3302 | |
3303 | wq_dev->wq = wq; | ||
3304 | wq_dev->dev.bus = &wq_subsys; | ||
3305 | wq_dev->dev.init_name = wq->name; | ||
3306 | wq_dev->dev.release = wq_device_release; | ||
3307 | |||
3308 | /* | ||
3309 | * unbound_attrs are created separately. Suppress uevent until | ||
3310 | * everything is ready. | ||
3311 | */ | ||
3312 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
3313 | |||
3314 | ret = device_register(&wq_dev->dev); | ||
3315 | if (ret) { | ||
3316 | kfree(wq_dev); | ||
3317 | wq->wq_dev = NULL; | ||
3318 | return ret; | ||
3319 | } | ||
3320 | |||
3321 | if (wq->flags & WQ_UNBOUND) { | ||
3322 | struct device_attribute *attr; | ||
3323 | |||
3324 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
3325 | ret = device_create_file(&wq_dev->dev, attr); | ||
3326 | if (ret) { | ||
3327 | device_unregister(&wq_dev->dev); | ||
3328 | wq->wq_dev = NULL; | ||
3329 | return ret; | ||
3330 | } | ||
3331 | } | ||
3332 | } | ||
3333 | |||
3334 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
3335 | return 0; | ||
3336 | } | ||
3337 | |||
3338 | /** | ||
3339 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
3340 | * @wq: the workqueue to unregister | ||
3341 | * | ||
3342 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
3343 | */ | ||
3344 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
3345 | { | ||
3346 | struct wq_device *wq_dev = wq->wq_dev; | ||
3347 | |||
3348 | if (!wq->wq_dev) | ||
3349 | return; | ||
3350 | |||
3351 | wq->wq_dev = NULL; | ||
3352 | device_unregister(&wq_dev->dev); | ||
3353 | } | ||
3354 | #else /* CONFIG_SYSFS */ | ||
3355 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
3356 | #endif /* CONFIG_SYSFS */ | ||
3357 | |||
3358 | /** | ||
3359 | * free_workqueue_attrs - free a workqueue_attrs | ||
3360 | * @attrs: workqueue_attrs to free | ||
3361 | * | ||
3362 | * Undo alloc_workqueue_attrs(). | ||
3363 | */ | ||
3364 | void free_workqueue_attrs(struct workqueue_attrs *attrs) | ||
3365 | { | ||
3366 | if (attrs) { | ||
3367 | free_cpumask_var(attrs->cpumask); | ||
3368 | kfree(attrs); | ||
3369 | } | ||
3370 | } | ||
3371 | |||
3372 | /** | ||
3373 | * alloc_workqueue_attrs - allocate a workqueue_attrs | ||
3374 | * @gfp_mask: allocation mask to use | ||
3375 | * | ||
3376 | * Allocate a new workqueue_attrs, initialize with default settings and | ||
3377 | * return it. Returns NULL on failure. | ||
3378 | */ | ||
3379 | struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) | ||
3380 | { | ||
3381 | struct workqueue_attrs *attrs; | ||
3382 | |||
3383 | attrs = kzalloc(sizeof(*attrs), gfp_mask); | ||
3384 | if (!attrs) | ||
3385 | goto fail; | ||
3386 | if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) | ||
3387 | goto fail; | ||
3388 | |||
3389 | cpumask_copy(attrs->cpumask, cpu_possible_mask); | ||
3390 | return attrs; | ||
3391 | fail: | ||
3392 | free_workqueue_attrs(attrs); | ||
3393 | return NULL; | ||
3394 | } | ||
3395 | |||
3396 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | ||
3397 | const struct workqueue_attrs *from) | ||
3398 | { | ||
3399 | to->nice = from->nice; | ||
3400 | cpumask_copy(to->cpumask, from->cpumask); | ||
3401 | } | ||
3402 | |||
3403 | /* hash value of the content of @attrs */ | |
3404 | static u32 wqattrs_hash(const struct workqueue_attrs *attrs) | ||
3405 | { | ||
3406 | u32 hash = 0; | ||
3407 | |||
3408 | hash = jhash_1word(attrs->nice, hash); | ||
3409 | hash = jhash(cpumask_bits(attrs->cpumask), | ||
3410 | BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); | ||
3411 | return hash; | ||
3412 | } | ||
3413 | |||
3414 | /* content equality test */ | ||
3415 | static bool wqattrs_equal(const struct workqueue_attrs *a, | ||
3416 | const struct workqueue_attrs *b) | ||
3417 | { | ||
3418 | if (a->nice != b->nice) | ||
3419 | return false; | ||
3420 | if (!cpumask_equal(a->cpumask, b->cpumask)) | ||
3421 | return false; | ||
3422 | return true; | ||
3423 | } | ||
3424 | |||
3425 | /** | ||
3426 | * init_worker_pool - initialize a newly zalloc'd worker_pool | ||
3427 | * @pool: worker_pool to initialize | ||
3428 | * | ||
3429 | * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. | |
3430 | * Returns 0 on success, -errno on failure. Even on failure, all fields | ||
3431 | * inside @pool proper are initialized and put_unbound_pool() can be called | ||
3432 | * on @pool safely to release it. | ||
3433 | */ | ||
3434 | static int init_worker_pool(struct worker_pool *pool) | ||
3435 | { | ||
3436 | spin_lock_init(&pool->lock); | ||
3437 | pool->id = -1; | ||
3438 | pool->cpu = -1; | ||
3439 | pool->node = NUMA_NO_NODE; | ||
3440 | pool->flags |= POOL_DISASSOCIATED; | ||
3441 | INIT_LIST_HEAD(&pool->worklist); | ||
3442 | INIT_LIST_HEAD(&pool->idle_list); | ||
3443 | hash_init(pool->busy_hash); | ||
3444 | |||
3445 | init_timer_deferrable(&pool->idle_timer); | ||
3446 | pool->idle_timer.function = idle_worker_timeout; | ||
3447 | pool->idle_timer.data = (unsigned long)pool; | ||
3448 | |||
3449 | setup_timer(&pool->mayday_timer, pool_mayday_timeout, | ||
3450 | (unsigned long)pool); | ||
3451 | |||
3452 | mutex_init(&pool->manager_arb); | ||
3453 | mutex_init(&pool->manager_mutex); | ||
3454 | idr_init(&pool->worker_idr); | ||
3455 | |||
3456 | INIT_HLIST_NODE(&pool->hash_node); | ||
3457 | pool->refcnt = 1; | ||
3458 | |||
3459 | /* shouldn't fail above this point */ | ||
3460 | pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
3461 | if (!pool->attrs) | ||
3462 | return -ENOMEM; | ||
3463 | return 0; | ||
3464 | } | ||
3465 | |||
3466 | static void rcu_free_pool(struct rcu_head *rcu) | ||
3467 | { | ||
3468 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | ||
3469 | |||
3470 | idr_destroy(&pool->worker_idr); | ||
3471 | free_workqueue_attrs(pool->attrs); | ||
3472 | kfree(pool); | ||
3473 | } | ||
3474 | |||
3475 | /** | ||
3476 | * put_unbound_pool - put a worker_pool | ||
3477 | * @pool: worker_pool to put | ||
3478 | * | ||
3479 | * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU | ||
3480 | * safe manner. get_unbound_pool() calls this function on its failure path | ||
3481 | * and this function should be able to release pools which went through, | ||
3482 | * successfully or not, init_worker_pool(). | ||
3483 | * | ||
3484 | * Should be called with wq_pool_mutex held. | ||
3485 | */ | ||
3486 | static void put_unbound_pool(struct worker_pool *pool) | ||
3487 | { | ||
3488 | struct worker *worker; | ||
3489 | |||
3490 | lockdep_assert_held(&wq_pool_mutex); | ||
3491 | |||
3492 | if (--pool->refcnt) | ||
3493 | return; | ||
3494 | |||
3495 | /* sanity checks */ | ||
3496 | if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || | ||
3497 | WARN_ON(!list_empty(&pool->worklist))) | ||
3498 | return; | ||
3499 | |||
3500 | /* release id and unhash */ | ||
3501 | if (pool->id >= 0) | ||
3502 | idr_remove(&worker_pool_idr, pool->id); | ||
3503 | hash_del(&pool->hash_node); | ||
3504 | |||
3505 | /* | ||
3506 | * Become the manager and destroy all workers. Grabbing | ||
3507 | * manager_arb prevents @pool's workers from blocking on | ||
3508 | * manager_mutex. | ||
3509 | */ | ||
3510 | mutex_lock(&pool->manager_arb); | ||
3511 | mutex_lock(&pool->manager_mutex); | ||
3512 | spin_lock_irq(&pool->lock); | ||
3513 | |||
3514 | while ((worker = first_worker(pool))) | ||
3515 | destroy_worker(worker); | ||
3516 | WARN_ON(pool->nr_workers || pool->nr_idle); | ||
3517 | |||
3518 | spin_unlock_irq(&pool->lock); | ||
3519 | mutex_unlock(&pool->manager_mutex); | ||
3520 | mutex_unlock(&pool->manager_arb); | ||
3521 | |||
3522 | /* shut down the timers */ | ||
3523 | del_timer_sync(&pool->idle_timer); | ||
3524 | del_timer_sync(&pool->mayday_timer); | ||
3525 | |||
3526 | /* sched-RCU protected to allow dereferences from get_work_pool() */ | ||
3527 | call_rcu_sched(&pool->rcu, rcu_free_pool); | ||
3528 | } | ||
3529 | |||
3530 | /** | ||
3531 | * get_unbound_pool - get a worker_pool with the specified attributes | ||
3532 | * @attrs: the attributes of the worker_pool to get | ||
3533 | * | ||
3534 | * Obtain a worker_pool which has the same attributes as @attrs, bump the | ||
3535 | * reference count and return it. If there already is a matching | ||
3536 | * worker_pool, it will be used; otherwise, this function attempts to | ||
3537 | * create a new one. On failure, returns NULL. | ||
3538 | * | ||
3539 | * Should be called with wq_pool_mutex held. | ||
3540 | */ | ||
3541 | static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | ||
3542 | { | ||
3543 | u32 hash = wqattrs_hash(attrs); | ||
3544 | struct worker_pool *pool; | ||
3545 | int node; | ||
3546 | |||
3547 | lockdep_assert_held(&wq_pool_mutex); | ||
3548 | |||
3549 | /* do we already have a matching pool? */ | ||
3550 | hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { | ||
3551 | if (wqattrs_equal(pool->attrs, attrs)) { | ||
3552 | pool->refcnt++; | ||
3553 | goto out_unlock; | ||
3554 | } | ||
3555 | } | ||
3556 | |||
3557 | /* nope, create a new one */ | ||
3558 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); | ||
3559 | if (!pool || init_worker_pool(pool) < 0) | ||
3560 | goto fail; | ||
3561 | |||
3562 | if (workqueue_freezing) | ||
3563 | pool->flags |= POOL_FREEZING; | ||
3564 | |||
3565 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ | ||
3566 | copy_workqueue_attrs(pool->attrs, attrs); | ||
3567 | |||
3568 | /* if cpumask is contained inside a NUMA node, we belong to that node */ | ||
3569 | if (wq_numa_enabled) { | ||
3570 | for_each_node(node) { | ||
3571 | if (cpumask_subset(pool->attrs->cpumask, | ||
3572 | wq_numa_possible_cpumask[node])) { | ||
3573 | pool->node = node; | ||
3574 | break; | ||
3575 | } | ||
3576 | } | ||
3577 | } | ||
3578 | |||
3579 | if (worker_pool_assign_id(pool) < 0) | ||
3580 | goto fail; | ||
3581 | |||
3582 | /* create and start the initial worker */ | ||
3583 | if (create_and_start_worker(pool) < 0) | ||
3584 | goto fail; | ||
3585 | |||
3586 | /* install */ | ||
3587 | hash_add(unbound_pool_hash, &pool->hash_node, hash); | ||
3588 | out_unlock: | ||
3589 | return pool; | ||
3590 | fail: | ||
3591 | if (pool) | ||
3592 | put_unbound_pool(pool); | ||
3593 | return NULL; | ||
3594 | } | ||
3595 | |||
3596 | static void rcu_free_pwq(struct rcu_head *rcu) | ||
3597 | { | ||
3598 | kmem_cache_free(pwq_cache, | ||
3599 | container_of(rcu, struct pool_workqueue, rcu)); | ||
3600 | } | ||
3601 | |||
3602 | /* | ||
3603 | * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt | ||
3604 | * and needs to be destroyed. | ||
3605 | */ | ||
3606 | static void pwq_unbound_release_workfn(struct work_struct *work) | ||
3607 | { | ||
3608 | struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, | ||
3609 | unbound_release_work); | ||
3610 | struct workqueue_struct *wq = pwq->wq; | ||
3611 | struct worker_pool *pool = pwq->pool; | ||
3612 | bool is_last; | ||
3613 | |||
3614 | if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) | ||
3615 | return; | ||
3616 | |||
3617 | /* | ||
3618 | * Unlink @pwq. Synchronization against wq->mutex isn't strictly | ||
3619 | * necessary on release but do it anyway. It's easier to verify | ||
3620 | * and consistent with the linking path. | ||
3621 | */ | ||
3622 | mutex_lock(&wq->mutex); | ||
3623 | list_del_rcu(&pwq->pwqs_node); | ||
3624 | is_last = list_empty(&wq->pwqs); | ||
3625 | mutex_unlock(&wq->mutex); | ||
3626 | |||
3627 | mutex_lock(&wq_pool_mutex); | ||
3628 | put_unbound_pool(pool); | ||
3629 | mutex_unlock(&wq_pool_mutex); | ||
3630 | |||
3631 | call_rcu_sched(&pwq->rcu, rcu_free_pwq); | ||
3632 | |||
3633 | /* | ||
3634 | * If we're the last pwq going away, @wq is already dead and no one | ||
3635 | * is gonna access it anymore. Free it. | ||
3636 | */ | ||
3637 | if (is_last) { | ||
3638 | free_workqueue_attrs(wq->unbound_attrs); | ||
3639 | kfree(wq); | ||
3640 | } | ||
3641 | } | ||
3642 | |||
3643 | /** | ||
3644 | * pwq_adjust_max_active - update a pwq's max_active to the current setting | ||
3645 | * @pwq: target pool_workqueue | ||
3646 | * | ||
3647 | * If @pwq isn't freezing, set @pwq->max_active to the associated | ||
3648 | * workqueue's saved_max_active and activate delayed work items | ||
3649 | * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. | ||
3650 | */ | ||
3651 | static void pwq_adjust_max_active(struct pool_workqueue *pwq) | ||
3652 | { | ||
3653 | struct workqueue_struct *wq = pwq->wq; | ||
3654 | bool freezable = wq->flags & WQ_FREEZABLE; | ||
3655 | |||
3656 | /* for @wq->saved_max_active */ | ||
3657 | lockdep_assert_held(&wq->mutex); | ||
3658 | |||
3659 | /* fast exit for non-freezable wqs */ | ||
3660 | if (!freezable && pwq->max_active == wq->saved_max_active) | ||
3661 | return; | ||
3662 | |||
3663 | spin_lock_irq(&pwq->pool->lock); | ||
3664 | |||
3665 | if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { | ||
3666 | pwq->max_active = wq->saved_max_active; | ||
3667 | |||
3668 | while (!list_empty(&pwq->delayed_works) && | ||
3669 | pwq->nr_active < pwq->max_active) | ||
3670 | pwq_activate_first_delayed(pwq); | ||
3108 | 3671 | ||
3109 | /* | 3672 | /* |
3110 | * Allocate enough room to align pwq and put an extra | 3673 | * Need to kick a worker after thawed or an unbound wq's |
3111 | * pointer at the end pointing back to the originally | 3674 | * max_active is bumped. It's a slow path. Do it always. |
3112 | * allocated pointer which will be used for free. | ||
3113 | */ | 3675 | */ |
3114 | ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); | 3676 | wake_up_worker(pwq->pool); |
3115 | if (ptr) { | 3677 | } else { |
3116 | wq->pool_wq.single = PTR_ALIGN(ptr, align); | 3678 | pwq->max_active = 0; |
3117 | *(void **)(wq->pool_wq.single + 1) = ptr; | 3679 | } |
3680 | |||
3681 | spin_unlock_irq(&pwq->pool->lock); | ||
3682 | } | ||
3683 | |||
3684 | /* initialize newly alloced @pwq which is associated with @wq and @pool */ | ||
3685 | static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, | ||
3686 | struct worker_pool *pool) | ||
3687 | { | ||
3688 | BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); | ||
3689 | |||
3690 | memset(pwq, 0, sizeof(*pwq)); | ||
3691 | |||
3692 | pwq->pool = pool; | ||
3693 | pwq->wq = wq; | ||
3694 | pwq->flush_color = -1; | ||
3695 | pwq->refcnt = 1; | ||
3696 | INIT_LIST_HEAD(&pwq->delayed_works); | ||
3697 | INIT_LIST_HEAD(&pwq->pwqs_node); | ||
3698 | INIT_LIST_HEAD(&pwq->mayday_node); | ||
3699 | INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); | ||
3700 | } | ||
3701 | |||
3702 | /* sync @pwq with the current state of its associated wq and link it */ | ||
3703 | static void link_pwq(struct pool_workqueue *pwq) | ||
3704 | { | ||
3705 | struct workqueue_struct *wq = pwq->wq; | ||
3706 | |||
3707 | lockdep_assert_held(&wq->mutex); | ||
3708 | |||
3709 | /* may be called multiple times, ignore if already linked */ | ||
3710 | if (!list_empty(&pwq->pwqs_node)) | ||
3711 | return; | ||
3712 | |||
3713 | /* | ||
3714 | * Set the matching work_color. This is synchronized with | ||
3715 | * wq->mutex to avoid confusing flush_workqueue(). | ||
3716 | */ | ||
3717 | pwq->work_color = wq->work_color; | ||
3718 | |||
3719 | /* sync max_active to the current setting */ | ||
3720 | pwq_adjust_max_active(pwq); | ||
3721 | |||
3722 | /* link in @pwq */ | ||
3723 | list_add_rcu(&pwq->pwqs_node, &wq->pwqs); | ||
3724 | } | ||
3725 | |||
3726 | /* obtain a pool matching @attrs and create a pwq associating the pool and @wq */ | |
3727 | static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, | ||
3728 | const struct workqueue_attrs *attrs) | ||
3729 | { | ||
3730 | struct worker_pool *pool; | ||
3731 | struct pool_workqueue *pwq; | ||
3732 | |||
3733 | lockdep_assert_held(&wq_pool_mutex); | ||
3734 | |||
3735 | pool = get_unbound_pool(attrs); | ||
3736 | if (!pool) | ||
3737 | return NULL; | ||
3738 | |||
3739 | pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); | ||
3740 | if (!pwq) { | ||
3741 | put_unbound_pool(pool); | ||
3742 | return NULL; | ||
3743 | } | ||
3744 | |||
3745 | init_pwq(pwq, wq, pool); | ||
3746 | return pwq; | ||
3747 | } | ||
3748 | |||
3749 | /* undo alloc_unbound_pwq(), used only in the error path */ | ||
3750 | static void free_unbound_pwq(struct pool_workqueue *pwq) | ||
3751 | { | ||
3752 | lockdep_assert_held(&wq_pool_mutex); | ||
3753 | |||
3754 | if (pwq) { | ||
3755 | put_unbound_pool(pwq->pool); | ||
3756 | kmem_cache_free(pwq_cache, pwq); | ||
3757 | } | ||
3758 | } | ||
3759 | |||
3760 | /** | ||
3761 | * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node | |
3762 | * @attrs: the wq_attrs of interest | ||
3763 | * @node: the target NUMA node | ||
3764 | * @cpu_going_down: if >= 0, the CPU to consider as offline | ||
3765 | * @cpumask: outarg, the resulting cpumask | ||
3766 | * | ||
3767 | * Calculate the cpumask a workqueue with @attrs should use on @node. If | ||
3768 | * @cpu_going_down is >= 0, that cpu is considered offline during | ||
3769 | * calculation. The result is stored in @cpumask. This function returns | ||
3770 | * %true if the resulting @cpumask is different from @attrs->cpumask, | ||
3771 | * %false if equal. | ||
3772 | * | ||
3773 | * If NUMA affinity is not enabled, @attrs->cpumask is always used. If | ||
3774 | * enabled and @node has online CPUs requested by @attrs, the returned | ||
3775 | * cpumask is the intersection of the possible CPUs of @node and | ||
3776 | * @attrs->cpumask. | ||
3777 | * | ||
3778 | * The caller is responsible for ensuring that the cpumask of @node stays | ||
3779 | * stable. | ||
3780 | */ | ||
3781 | static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, | ||
3782 | int cpu_going_down, cpumask_t *cpumask) | ||
3783 | { | ||
3784 | if (!wq_numa_enabled || attrs->no_numa) | ||
3785 | goto use_dfl; | ||
3786 | |||
3787 | /* does @node have any online CPUs @attrs wants? */ | ||
3788 | cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); | ||
3789 | if (cpu_going_down >= 0) | ||
3790 | cpumask_clear_cpu(cpu_going_down, cpumask); | ||
3791 | |||
3792 | if (cpumask_empty(cpumask)) | ||
3793 | goto use_dfl; | ||
3794 | |||
3795 | /* yeap, return possible CPUs in @node that @attrs wants */ | ||
3796 | cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); | ||
3797 | return !cpumask_equal(cpumask, attrs->cpumask); | ||
3798 | |||
3799 | use_dfl: | ||
3800 | cpumask_copy(cpumask, attrs->cpumask); | ||
3801 | return false; | ||
3802 | } | ||
3803 | |||
3804 | /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ | ||
3805 | static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, | ||
3806 | int node, | ||
3807 | struct pool_workqueue *pwq) | ||
3808 | { | ||
3809 | struct pool_workqueue *old_pwq; | ||
3810 | |||
3811 | lockdep_assert_held(&wq->mutex); | ||
3812 | |||
3813 | /* link_pwq() can handle duplicate calls */ | ||
3814 | link_pwq(pwq); | ||
3815 | |||
3816 | old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); | ||
3817 | rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); | ||
3818 | return old_pwq; | ||
3819 | } | ||
3820 | |||
3821 | /** | ||
3822 | * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue | ||
3823 | * @wq: the target workqueue | ||
3824 | * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() | ||
3825 | * | ||
3826 | * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA | ||
3827 | * machines, this function maps a separate pwq to each NUMA node with | ||
3828 | * possible CPUs in @attrs->cpumask so that work items are affine to the | |
3829 | * NUMA node it was issued on. Older pwqs are released as in-flight work | ||
3830 | * items finish. Note that a work item which repeatedly requeues itself | ||
3831 | * back-to-back will stay on its current pwq. | ||
3832 | * | ||
3833 | * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on | ||
3834 | * failure. | ||
3835 | */ | ||
3836 | int apply_workqueue_attrs(struct workqueue_struct *wq, | ||
3837 | const struct workqueue_attrs *attrs) | ||
3838 | { | ||
3839 | struct workqueue_attrs *new_attrs, *tmp_attrs; | ||
3840 | struct pool_workqueue **pwq_tbl, *dfl_pwq; | ||
3841 | int node, ret; | ||
3842 | |||
3843 | /* only unbound workqueues can change attributes */ | ||
3844 | if (WARN_ON(!(wq->flags & WQ_UNBOUND))) | ||
3845 | return -EINVAL; | ||
3846 | |||
3847 | /* creating multiple pwqs breaks ordering guarantee */ | ||
3848 | if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) | ||
3849 | return -EINVAL; | ||
3850 | |||
3851 | pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); | ||
3852 | new_attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
3853 | tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
3854 | if (!pwq_tbl || !new_attrs || !tmp_attrs) | ||
3855 | goto enomem; | ||
3856 | |||
3857 | /* make a copy of @attrs and sanitize it */ | ||
3858 | copy_workqueue_attrs(new_attrs, attrs); | ||
3859 | cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); | ||
3860 | |||
3861 | /* | ||
3862 | * We may create multiple pwqs with differing cpumasks. Make a | ||
3863 | * copy of @new_attrs which will be modified and used to obtain | ||
3864 | * pools. | ||
3865 | */ | ||
3866 | copy_workqueue_attrs(tmp_attrs, new_attrs); | ||
3867 | |||
3868 | /* | ||
3869 | * CPUs should stay stable across pwq creations and installations. | ||
3870 | * Pin CPUs, determine the target cpumask for each node and create | ||
3871 | * pwqs accordingly. | ||
3872 | */ | ||
3873 | get_online_cpus(); | ||
3874 | |||
3875 | mutex_lock(&wq_pool_mutex); | ||
3876 | |||
3877 | /* | ||
3878 | * If something goes wrong during CPU up/down, we'll fall back to | ||
3879 | * the default pwq covering whole @attrs->cpumask. Always create | ||
3880 | * it even if we don't use it immediately. | ||
3881 | */ | ||
3882 | dfl_pwq = alloc_unbound_pwq(wq, new_attrs); | ||
3883 | if (!dfl_pwq) | ||
3884 | goto enomem_pwq; | ||
3885 | |||
3886 | for_each_node(node) { | ||
3887 | if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { | ||
3888 | pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); | ||
3889 | if (!pwq_tbl[node]) | ||
3890 | goto enomem_pwq; | ||
3891 | } else { | ||
3892 | dfl_pwq->refcnt++; | ||
3893 | pwq_tbl[node] = dfl_pwq; | ||
3118 | } | 3894 | } |
3119 | } | 3895 | } |
3120 | 3896 | ||
3121 | /* just in case, make sure it's actually aligned */ | 3897 | mutex_unlock(&wq_pool_mutex); |
3122 | BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); | 3898 | |
3123 | return wq->pool_wq.v ? 0 : -ENOMEM; | 3899 | /* all pwqs have been created successfully, let's install'em */ |
3900 | mutex_lock(&wq->mutex); | ||
3901 | |||
3902 | copy_workqueue_attrs(wq->unbound_attrs, new_attrs); | ||
3903 | |||
3904 | /* save the previous pwq and install the new one */ | ||
3905 | for_each_node(node) | ||
3906 | pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); | ||
3907 | |||
3908 | /* @dfl_pwq might not have been used, ensure it's linked */ | ||
3909 | link_pwq(dfl_pwq); | ||
3910 | swap(wq->dfl_pwq, dfl_pwq); | ||
3911 | |||
3912 | mutex_unlock(&wq->mutex); | ||
3913 | |||
3914 | /* put the old pwqs */ | ||
3915 | for_each_node(node) | ||
3916 | put_pwq_unlocked(pwq_tbl[node]); | ||
3917 | put_pwq_unlocked(dfl_pwq); | ||
3918 | |||
3919 | put_online_cpus(); | ||
3920 | ret = 0; | ||
3921 | /* fall through */ | ||
3922 | out_free: | ||
3923 | free_workqueue_attrs(tmp_attrs); | ||
3924 | free_workqueue_attrs(new_attrs); | ||
3925 | kfree(pwq_tbl); | ||
3926 | return ret; | ||
3927 | |||
3928 | enomem_pwq: | ||
3929 | free_unbound_pwq(dfl_pwq); | ||
3930 | for_each_node(node) | ||
3931 | if (pwq_tbl && pwq_tbl[node] != dfl_pwq) | ||
3932 | free_unbound_pwq(pwq_tbl[node]); | ||
3933 | mutex_unlock(&wq_pool_mutex); | ||
3934 | put_online_cpus(); | ||
3935 | enomem: | ||
3936 | ret = -ENOMEM; | ||
3937 | goto out_free; | ||
3124 | } | 3938 | } |
3125 | 3939 | ||
3126 | static void free_pwqs(struct workqueue_struct *wq) | 3940 | /** |
3941 | * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug | ||
3942 | * @wq: the target workqueue | ||
3943 | * @cpu: the CPU coming up or going down | ||
3944 | * @online: whether @cpu is coming up or going down | ||
3945 | * | ||
3946 | * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and | ||
3947 | * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of | ||
3948 | * @wq accordingly. | ||
3949 | * | ||
3950 | * If NUMA affinity can't be adjusted due to memory allocation failure, it | ||
3951 | * falls back to @wq->dfl_pwq which may not be optimal but is always | ||
3952 | * correct. | ||
3953 | * | ||
3954 | * Note that when the last allowed CPU of a NUMA node goes offline for a | ||
3955 | * workqueue with a cpumask spanning multiple nodes, the workers which were | ||
3956 | * already executing the work items for the workqueue will lose their CPU | ||
3957 | * affinity and may execute on any CPU. This is similar to how per-cpu | ||
3958 | * workqueues behave on CPU_DOWN. If a workqueue user wants strict | ||
3959 | * affinity, it's the user's responsibility to flush the work item from | ||
3960 | * CPU_DOWN_PREPARE. | ||
3961 | */ | ||
3962 | static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, | ||
3963 | bool online) | ||
3127 | { | 3964 | { |
3128 | if (!(wq->flags & WQ_UNBOUND)) | 3965 | int node = cpu_to_node(cpu); |
3129 | free_percpu(wq->pool_wq.pcpu); | 3966 | int cpu_off = online ? -1 : cpu; |
3130 | else if (wq->pool_wq.single) { | 3967 | struct pool_workqueue *old_pwq = NULL, *pwq; |
3131 | /* the pointer to free is stored right after the pwq */ | 3968 | struct workqueue_attrs *target_attrs; |
3132 | kfree(*(void **)(wq->pool_wq.single + 1)); | 3969 | cpumask_t *cpumask; |
3970 | |||
3971 | lockdep_assert_held(&wq_pool_mutex); | ||
3972 | |||
3973 | if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) | ||
3974 | return; | ||
3975 | |||
3976 | /* | ||
3977 | * We don't wanna alloc/free wq_attrs for each wq for each CPU. | ||
3978 | * Let's use a preallocated one. The following buf is protected by | ||
3979 | * CPU hotplug exclusion. | ||
3980 | */ | ||
3981 | target_attrs = wq_update_unbound_numa_attrs_buf; | ||
3982 | cpumask = target_attrs->cpumask; | ||
3983 | |||
3984 | mutex_lock(&wq->mutex); | ||
3985 | if (wq->unbound_attrs->no_numa) | ||
3986 | goto out_unlock; | ||
3987 | |||
3988 | copy_workqueue_attrs(target_attrs, wq->unbound_attrs); | ||
3989 | pwq = unbound_pwq_by_node(wq, node); | ||
3990 | |||
3991 | /* | ||
3992 | * Let's determine what needs to be done. If the target cpumask is | ||
3993 | * different from wq's, we need to compare it to @pwq's and create | ||
3994 | * a new one if they don't match. If the target cpumask equals | ||
3995 | * wq's, the default pwq should be used. If @pwq is already the | ||
3996 | * default one, nothing to do; otherwise, install the default one. | ||
3997 | */ | ||
3998 | if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { | ||
3999 | if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) | ||
4000 | goto out_unlock; | ||
4001 | } else { | ||
4002 | if (pwq == wq->dfl_pwq) | ||
4003 | goto out_unlock; | ||
4004 | else | ||
4005 | goto use_dfl_pwq; | ||
4006 | } | ||
4007 | |||
4008 | mutex_unlock(&wq->mutex); | ||
4009 | |||
4010 | /* create a new pwq */ | ||
4011 | pwq = alloc_unbound_pwq(wq, target_attrs); | ||
4012 | if (!pwq) { | ||
4013 | pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", | ||
4014 | wq->name); | ||
4015 | goto out_unlock; | ||
4016 | } | ||
4017 | |||
4018 | /* | ||
4019 | * Install the new pwq. As this function is called only from CPU | ||
4020 | * hotplug callbacks and applying a new attrs is wrapped with | ||
4021 | * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed | ||
4022 | * inbetween. | ||
4023 | */ | ||
4024 | mutex_lock(&wq->mutex); | ||
4025 | old_pwq = numa_pwq_tbl_install(wq, node, pwq); | ||
4026 | goto out_unlock; | ||
4027 | |||
4028 | use_dfl_pwq: | ||
4029 | spin_lock_irq(&wq->dfl_pwq->pool->lock); | ||
4030 | get_pwq(wq->dfl_pwq); | ||
4031 | spin_unlock_irq(&wq->dfl_pwq->pool->lock); | ||
4032 | old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); | ||
4033 | out_unlock: | ||
4034 | mutex_unlock(&wq->mutex); | ||
4035 | put_pwq_unlocked(old_pwq); | ||
4036 | } | ||
4037 | |||
4038 | static int alloc_and_link_pwqs(struct workqueue_struct *wq) | ||
4039 | { | ||
4040 | bool highpri = wq->flags & WQ_HIGHPRI; | ||
4041 | int cpu; | ||
4042 | |||
4043 | if (!(wq->flags & WQ_UNBOUND)) { | ||
4044 | wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); | ||
4045 | if (!wq->cpu_pwqs) | ||
4046 | return -ENOMEM; | ||
4047 | |||
4048 | for_each_possible_cpu(cpu) { | ||
4049 | struct pool_workqueue *pwq = | ||
4050 | per_cpu_ptr(wq->cpu_pwqs, cpu); | ||
4051 | struct worker_pool *cpu_pools = | ||
4052 | per_cpu(cpu_worker_pools, cpu); | ||
4053 | |||
4054 | init_pwq(pwq, wq, &cpu_pools[highpri]); | ||
4055 | |||
4056 | mutex_lock(&wq->mutex); | ||
4057 | link_pwq(pwq); | ||
4058 | mutex_unlock(&wq->mutex); | ||
4059 | } | ||
4060 | return 0; | ||
4061 | } else { | ||
4062 | return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); | ||
3133 | } | 4063 | } |
3134 | } | 4064 | } |
3135 | 4065 | ||
@@ -3151,30 +4081,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
3151 | struct lock_class_key *key, | 4081 | struct lock_class_key *key, |
3152 | const char *lock_name, ...) | 4082 | const char *lock_name, ...) |
3153 | { | 4083 | { |
3154 | va_list args, args1; | 4084 | size_t tbl_size = 0; |
4085 | va_list args; | ||
3155 | struct workqueue_struct *wq; | 4086 | struct workqueue_struct *wq; |
3156 | unsigned int cpu; | 4087 | struct pool_workqueue *pwq; |
3157 | size_t namelen; | ||
3158 | 4088 | ||
3159 | /* determine namelen, allocate wq and format name */ | 4089 | /* allocate wq and format name */ |
3160 | va_start(args, lock_name); | 4090 | if (flags & WQ_UNBOUND) |
3161 | va_copy(args1, args); | 4091 | tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); |
3162 | namelen = vsnprintf(NULL, 0, fmt, args) + 1; | ||
3163 | 4092 | ||
3164 | wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); | 4093 | wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); |
3165 | if (!wq) | 4094 | if (!wq) |
3166 | goto err; | 4095 | return NULL; |
3167 | 4096 | ||
3168 | vsnprintf(wq->name, namelen, fmt, args1); | 4097 | if (flags & WQ_UNBOUND) { |
3169 | va_end(args); | 4098 | wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); |
3170 | va_end(args1); | 4099 | if (!wq->unbound_attrs) |
4100 | goto err_free_wq; | ||
4101 | } | ||
3171 | 4102 | ||
3172 | /* | 4103 | va_start(args, lock_name); |
3173 | * Workqueues which may be used during memory reclaim should | 4104 | vsnprintf(wq->name, sizeof(wq->name), fmt, args); |
3174 | * have a rescuer to guarantee forward progress. | 4105 | va_end(args); |
3175 | */ | ||
3176 | if (flags & WQ_MEM_RECLAIM) | ||
3177 | flags |= WQ_RESCUER; | ||
3178 | 4106 | ||
3179 | max_active = max_active ?: WQ_DFL_ACTIVE; | 4107 | max_active = max_active ?: WQ_DFL_ACTIVE; |
3180 | max_active = wq_clamp_max_active(max_active, flags, wq->name); | 4108 | max_active = wq_clamp_max_active(max_active, flags, wq->name); |
@@ -3182,71 +4110,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
3182 | /* init wq */ | 4110 | /* init wq */ |
3183 | wq->flags = flags; | 4111 | wq->flags = flags; |
3184 | wq->saved_max_active = max_active; | 4112 | wq->saved_max_active = max_active; |
3185 | mutex_init(&wq->flush_mutex); | 4113 | mutex_init(&wq->mutex); |
3186 | atomic_set(&wq->nr_pwqs_to_flush, 0); | 4114 | atomic_set(&wq->nr_pwqs_to_flush, 0); |
4115 | INIT_LIST_HEAD(&wq->pwqs); | ||
3187 | INIT_LIST_HEAD(&wq->flusher_queue); | 4116 | INIT_LIST_HEAD(&wq->flusher_queue); |
3188 | INIT_LIST_HEAD(&wq->flusher_overflow); | 4117 | INIT_LIST_HEAD(&wq->flusher_overflow); |
4118 | INIT_LIST_HEAD(&wq->maydays); | ||
3189 | 4119 | ||
3190 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); | 4120 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
3191 | INIT_LIST_HEAD(&wq->list); | 4121 | INIT_LIST_HEAD(&wq->list); |
3192 | 4122 | ||
3193 | if (alloc_pwqs(wq) < 0) | 4123 | if (alloc_and_link_pwqs(wq) < 0) |
3194 | goto err; | 4124 | goto err_free_wq; |
3195 | |||
3196 | for_each_pwq_cpu(cpu, wq) { | ||
3197 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | ||
3198 | |||
3199 | BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); | ||
3200 | pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); | ||
3201 | pwq->wq = wq; | ||
3202 | pwq->flush_color = -1; | ||
3203 | pwq->max_active = max_active; | ||
3204 | INIT_LIST_HEAD(&pwq->delayed_works); | ||
3205 | } | ||
3206 | 4125 | ||
3207 | if (flags & WQ_RESCUER) { | 4126 | /* |
4127 | * Workqueues which may be used during memory reclaim should | ||
4128 | * have a rescuer to guarantee forward progress. | ||
4129 | */ | ||
4130 | if (flags & WQ_MEM_RECLAIM) { | ||
3208 | struct worker *rescuer; | 4131 | struct worker *rescuer; |
3209 | 4132 | ||
3210 | if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) | 4133 | rescuer = alloc_worker(); |
3211 | goto err; | ||
3212 | |||
3213 | wq->rescuer = rescuer = alloc_worker(); | ||
3214 | if (!rescuer) | 4134 | if (!rescuer) |
3215 | goto err; | 4135 | goto err_destroy; |
3216 | 4136 | ||
3217 | rescuer->rescue_wq = wq; | 4137 | rescuer->rescue_wq = wq; |
3218 | rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", | 4138 | rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", |
3219 | wq->name); | 4139 | wq->name); |
3220 | if (IS_ERR(rescuer->task)) | 4140 | if (IS_ERR(rescuer->task)) { |
3221 | goto err; | 4141 | kfree(rescuer); |
4142 | goto err_destroy; | ||
4143 | } | ||
3222 | 4144 | ||
3223 | rescuer->task->flags |= PF_THREAD_BOUND; | 4145 | wq->rescuer = rescuer; |
4146 | rescuer->task->flags |= PF_NO_SETAFFINITY; | ||
3224 | wake_up_process(rescuer->task); | 4147 | wake_up_process(rescuer->task); |
3225 | } | 4148 | } |
3226 | 4149 | ||
4150 | if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) | ||
4151 | goto err_destroy; | ||
4152 | |||
3227 | /* | 4153 | /* |
3228 | * workqueue_lock protects global freeze state and workqueues | 4154 | * wq_pool_mutex protects global freeze state and workqueues list. |
3229 | * list. Grab it, set max_active accordingly and add the new | 4155 | * Grab it, adjust max_active and add the new @wq to workqueues |
3230 | * workqueue to workqueues list. | 4156 | * list. |
3231 | */ | 4157 | */ |
3232 | spin_lock(&workqueue_lock); | 4158 | mutex_lock(&wq_pool_mutex); |
3233 | 4159 | ||
3234 | if (workqueue_freezing && wq->flags & WQ_FREEZABLE) | 4160 | mutex_lock(&wq->mutex); |
3235 | for_each_pwq_cpu(cpu, wq) | 4161 | for_each_pwq(pwq, wq) |
3236 | get_pwq(cpu, wq)->max_active = 0; | 4162 | pwq_adjust_max_active(pwq); |
4163 | mutex_unlock(&wq->mutex); | ||
3237 | 4164 | ||
3238 | list_add(&wq->list, &workqueues); | 4165 | list_add(&wq->list, &workqueues); |
3239 | 4166 | ||
3240 | spin_unlock(&workqueue_lock); | 4167 | mutex_unlock(&wq_pool_mutex); |
3241 | 4168 | ||
3242 | return wq; | 4169 | return wq; |
3243 | err: | 4170 | |
3244 | if (wq) { | 4171 | err_free_wq: |
3245 | free_pwqs(wq); | 4172 | free_workqueue_attrs(wq->unbound_attrs); |
3246 | free_mayday_mask(wq->mayday_mask); | 4173 | kfree(wq); |
3247 | kfree(wq->rescuer); | 4174 | return NULL; |
3248 | kfree(wq); | 4175 | err_destroy: |
3249 | } | 4176 | destroy_workqueue(wq); |
3250 | return NULL; | 4177 | return NULL; |
3251 | } | 4178 | } |
3252 | EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | 4179 | EXPORT_SYMBOL_GPL(__alloc_workqueue_key); |
@@ -3259,60 +4186,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | |||
3259 | */ | 4186 | */ |
3260 | void destroy_workqueue(struct workqueue_struct *wq) | 4187 | void destroy_workqueue(struct workqueue_struct *wq) |
3261 | { | 4188 | { |
3262 | unsigned int cpu; | 4189 | struct pool_workqueue *pwq; |
4190 | int node; | ||
3263 | 4191 | ||
3264 | /* drain it before proceeding with destruction */ | 4192 | /* drain it before proceeding with destruction */ |
3265 | drain_workqueue(wq); | 4193 | drain_workqueue(wq); |
3266 | 4194 | ||
4195 | /* sanity checks */ | ||
4196 | mutex_lock(&wq->mutex); | ||
4197 | for_each_pwq(pwq, wq) { | ||
4198 | int i; | ||
4199 | |||
4200 | for (i = 0; i < WORK_NR_COLORS; i++) { | ||
4201 | if (WARN_ON(pwq->nr_in_flight[i])) { | ||
4202 | mutex_unlock(&wq->mutex); | ||
4203 | return; | ||
4204 | } | ||
4205 | } | ||
4206 | |||
4207 | if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || | ||
4208 | WARN_ON(pwq->nr_active) || | ||
4209 | WARN_ON(!list_empty(&pwq->delayed_works))) { | ||
4210 | mutex_unlock(&wq->mutex); | ||
4211 | return; | ||
4212 | } | ||
4213 | } | ||
4214 | mutex_unlock(&wq->mutex); | ||
4215 | |||
3267 | /* | 4216 | /* |
3268 | * wq list is used to freeze wq, remove from list after | 4217 | * wq list is used to freeze wq, remove from list after |
3269 | * flushing is complete in case freeze races us. | 4218 | * flushing is complete in case freeze races us. |
3270 | */ | 4219 | */ |
3271 | spin_lock(&workqueue_lock); | 4220 | mutex_lock(&wq_pool_mutex); |
3272 | list_del(&wq->list); | 4221 | list_del_init(&wq->list); |
3273 | spin_unlock(&workqueue_lock); | 4222 | mutex_unlock(&wq_pool_mutex); |
3274 | 4223 | ||
3275 | /* sanity check */ | 4224 | workqueue_sysfs_unregister(wq); |
3276 | for_each_pwq_cpu(cpu, wq) { | ||
3277 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | ||
3278 | int i; | ||
3279 | 4225 | ||
3280 | for (i = 0; i < WORK_NR_COLORS; i++) | 4226 | if (wq->rescuer) { |
3281 | BUG_ON(pwq->nr_in_flight[i]); | ||
3282 | BUG_ON(pwq->nr_active); | ||
3283 | BUG_ON(!list_empty(&pwq->delayed_works)); | ||
3284 | } | ||
3285 | |||
3286 | if (wq->flags & WQ_RESCUER) { | ||
3287 | kthread_stop(wq->rescuer->task); | 4227 | kthread_stop(wq->rescuer->task); |
3288 | free_mayday_mask(wq->mayday_mask); | ||
3289 | kfree(wq->rescuer); | 4228 | kfree(wq->rescuer); |
4229 | wq->rescuer = NULL; | ||
3290 | } | 4230 | } |
3291 | 4231 | ||
3292 | free_pwqs(wq); | 4232 | if (!(wq->flags & WQ_UNBOUND)) { |
3293 | kfree(wq); | 4233 | /* |
3294 | } | 4234 | * The base ref is never dropped on per-cpu pwqs. Directly |
3295 | EXPORT_SYMBOL_GPL(destroy_workqueue); | 4235 | * free the pwqs and wq. |
3296 | 4236 | */ | |
3297 | /** | 4237 | free_percpu(wq->cpu_pwqs); |
3298 | * pwq_set_max_active - adjust max_active of a pwq | 4238 | kfree(wq); |
3299 | * @pwq: target pool_workqueue | 4239 | } else { |
3300 | * @max_active: new max_active value. | 4240 | /* |
3301 | * | 4241 | * We're the sole accessor of @wq at this point. Directly |
3302 | * Set @pwq->max_active to @max_active and activate delayed works if | 4242 | * access numa_pwq_tbl[] and dfl_pwq to put the base refs. |
3303 | * increased. | 4243 | * @wq will be freed when the last pwq is released. |
3304 | * | 4244 | */ |
3305 | * CONTEXT: | 4245 | for_each_node(node) { |
3306 | * spin_lock_irq(pool->lock). | 4246 | pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); |
3307 | */ | 4247 | RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); |
3308 | static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) | 4248 | put_pwq_unlocked(pwq); |
3309 | { | 4249 | } |
3310 | pwq->max_active = max_active; | ||
3311 | 4250 | ||
3312 | while (!list_empty(&pwq->delayed_works) && | 4251 | /* |
3313 | pwq->nr_active < pwq->max_active) | 4252 | * Put dfl_pwq. @wq may be freed any time after dfl_pwq is |
3314 | pwq_activate_first_delayed(pwq); | 4253 | * put. Don't access it afterwards. |
4254 | */ | ||
4255 | pwq = wq->dfl_pwq; | ||
4256 | wq->dfl_pwq = NULL; | ||
4257 | put_pwq_unlocked(pwq); | ||
4258 | } | ||
3315 | } | 4259 | } |
4260 | EXPORT_SYMBOL_GPL(destroy_workqueue); | ||
3316 | 4261 | ||
3317 | /** | 4262 | /** |
3318 | * workqueue_set_max_active - adjust max_active of a workqueue | 4263 | * workqueue_set_max_active - adjust max_active of a workqueue |
@@ -3326,30 +4271,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) | |||
3326 | */ | 4271 | */ |
3327 | void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | 4272 | void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) |
3328 | { | 4273 | { |
3329 | unsigned int cpu; | 4274 | struct pool_workqueue *pwq; |
4275 | |||
4276 | /* disallow meddling with max_active for ordered workqueues */ | ||
4277 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
4278 | return; | ||
3330 | 4279 | ||
3331 | max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); | 4280 | max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); |
3332 | 4281 | ||
3333 | spin_lock(&workqueue_lock); | 4282 | mutex_lock(&wq->mutex); |
3334 | 4283 | ||
3335 | wq->saved_max_active = max_active; | 4284 | wq->saved_max_active = max_active; |
3336 | 4285 | ||
3337 | for_each_pwq_cpu(cpu, wq) { | 4286 | for_each_pwq(pwq, wq) |
3338 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | 4287 | pwq_adjust_max_active(pwq); |
3339 | struct worker_pool *pool = pwq->pool; | ||
3340 | |||
3341 | spin_lock_irq(&pool->lock); | ||
3342 | 4288 | ||
3343 | if (!(wq->flags & WQ_FREEZABLE) || | 4289 | mutex_unlock(&wq->mutex); |
3344 | !(pool->flags & POOL_FREEZING)) | 4290 | } |
3345 | pwq_set_max_active(pwq, max_active); | 4291 | EXPORT_SYMBOL_GPL(workqueue_set_max_active); |
3346 | 4292 | ||
3347 | spin_unlock_irq(&pool->lock); | 4293 | /** |
3348 | } | 4294 | * current_is_workqueue_rescuer - is %current workqueue rescuer? |
4295 | * | ||
4296 | * Determine whether %current is a workqueue rescuer. Can be used from | ||
4297 | * work functions to determine whether it's being run off the rescuer task. | ||
4298 | */ | ||
4299 | bool current_is_workqueue_rescuer(void) | ||
4300 | { | ||
4301 | struct worker *worker = current_wq_worker(); | ||
3349 | 4302 | ||
3350 | spin_unlock(&workqueue_lock); | 4303 | return worker && worker->rescue_wq; |
3351 | } | 4304 | } |
3352 | EXPORT_SYMBOL_GPL(workqueue_set_max_active); | ||
3353 | 4305 | ||
3354 | /** | 4306 | /** |
3355 | * workqueue_congested - test whether a workqueue is congested | 4307 | * workqueue_congested - test whether a workqueue is congested |
@@ -3360,14 +4312,34 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); | |||
3360 | * no synchronization around this function and the test result is | 4312 | * no synchronization around this function and the test result is |
3361 | * unreliable and only useful as advisory hints or for debugging. | 4313 | * unreliable and only useful as advisory hints or for debugging. |
3362 | * | 4314 | * |
4315 | * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. | ||
4316 | * Note that both per-cpu and unbound workqueues may be associated with | ||
4317 | * multiple pool_workqueues which have separate congested states. A | ||
4318 | * workqueue being congested on one CPU doesn't mean the workqueue is also | ||
4319 | * congested on other CPUs / NUMA nodes. | ||
4320 | * | ||
3363 | * RETURNS: | 4321 | * RETURNS: |
3364 | * %true if congested, %false otherwise. | 4322 | * %true if congested, %false otherwise. |
3365 | */ | 4323 | */ |
3366 | bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) | 4324 | bool workqueue_congested(int cpu, struct workqueue_struct *wq) |
3367 | { | 4325 | { |
3368 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | 4326 | struct pool_workqueue *pwq; |
4327 | bool ret; | ||
4328 | |||
4329 | rcu_read_lock_sched(); | ||
4330 | |||
4331 | if (cpu == WORK_CPU_UNBOUND) | ||
4332 | cpu = smp_processor_id(); | ||
4333 | |||
4334 | if (!(wq->flags & WQ_UNBOUND)) | ||
4335 | pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); | ||
4336 | else | ||
4337 | pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); | ||
3369 | 4338 | ||
3370 | return !list_empty(&pwq->delayed_works); | 4339 | ret = !list_empty(&pwq->delayed_works); |
4340 | rcu_read_unlock_sched(); | ||
4341 | |||
4342 | return ret; | ||
3371 | } | 4343 | } |
3372 | EXPORT_SYMBOL_GPL(workqueue_congested); | 4344 | EXPORT_SYMBOL_GPL(workqueue_congested); |
3373 | 4345 | ||
@@ -3384,24 +4356,104 @@ EXPORT_SYMBOL_GPL(workqueue_congested); | |||
3384 | */ | 4356 | */ |
3385 | unsigned int work_busy(struct work_struct *work) | 4357 | unsigned int work_busy(struct work_struct *work) |
3386 | { | 4358 | { |
3387 | struct worker_pool *pool = get_work_pool(work); | 4359 | struct worker_pool *pool; |
3388 | unsigned long flags; | 4360 | unsigned long flags; |
3389 | unsigned int ret = 0; | 4361 | unsigned int ret = 0; |
3390 | 4362 | ||
3391 | if (work_pending(work)) | 4363 | if (work_pending(work)) |
3392 | ret |= WORK_BUSY_PENDING; | 4364 | ret |= WORK_BUSY_PENDING; |
3393 | 4365 | ||
4366 | local_irq_save(flags); | ||
4367 | pool = get_work_pool(work); | ||
3394 | if (pool) { | 4368 | if (pool) { |
3395 | spin_lock_irqsave(&pool->lock, flags); | 4369 | spin_lock(&pool->lock); |
3396 | if (find_worker_executing_work(pool, work)) | 4370 | if (find_worker_executing_work(pool, work)) |
3397 | ret |= WORK_BUSY_RUNNING; | 4371 | ret |= WORK_BUSY_RUNNING; |
3398 | spin_unlock_irqrestore(&pool->lock, flags); | 4372 | spin_unlock(&pool->lock); |
3399 | } | 4373 | } |
4374 | local_irq_restore(flags); | ||
3400 | 4375 | ||
3401 | return ret; | 4376 | return ret; |
3402 | } | 4377 | } |
3403 | EXPORT_SYMBOL_GPL(work_busy); | 4378 | EXPORT_SYMBOL_GPL(work_busy); |
3404 | 4379 | ||
4380 | /** | ||
4381 | * set_worker_desc - set description for the current work item | ||
4382 | * @fmt: printf-style format string | ||
4383 | * @...: arguments for the format string | ||
4384 | * | ||
4385 | * This function can be called by a running work function to describe what | ||
4386 | * the work item is about. If the worker task gets dumped, this | ||
4387 | * information will be printed out together to help debugging. The | ||
4388 | * description can be at most WORKER_DESC_LEN including the trailing '\0'. | ||
4389 | */ | ||
4390 | void set_worker_desc(const char *fmt, ...) | ||
4391 | { | ||
4392 | struct worker *worker = current_wq_worker(); | ||
4393 | va_list args; | ||
4394 | |||
4395 | if (worker) { | ||
4396 | va_start(args, fmt); | ||
4397 | vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); | ||
4398 | va_end(args); | ||
4399 | worker->desc_valid = true; | ||
4400 | } | ||
4401 | } | ||
4402 | |||
4403 | /** | ||
4404 | * print_worker_info - print out worker information and description | ||
4405 | * @log_lvl: the log level to use when printing | ||
4406 | * @task: target task | ||
4407 | * | ||
4408 | * If @task is a worker and currently executing a work item, print out the | ||
4409 | * name of the workqueue being serviced and worker description set with | ||
4410 | * set_worker_desc() by the currently executing work item. | ||
4411 | * | ||
4412 | * This function can be safely called on any task as long as the | ||
4413 | * task_struct itself is accessible. While safe, this function isn't | ||
4414 | * synchronized and may print out mixups or garbage of limited length. | ||
4415 | */ | ||
4416 | void print_worker_info(const char *log_lvl, struct task_struct *task) | ||
4417 | { | ||
4418 | work_func_t *fn = NULL; | ||
4419 | char name[WQ_NAME_LEN] = { }; | ||
4420 | char desc[WORKER_DESC_LEN] = { }; | ||
4421 | struct pool_workqueue *pwq = NULL; | ||
4422 | struct workqueue_struct *wq = NULL; | ||
4423 | bool desc_valid = false; | ||
4424 | struct worker *worker; | ||
4425 | |||
4426 | if (!(task->flags & PF_WQ_WORKER)) | ||
4427 | return; | ||
4428 | |||
4429 | /* | ||
4430 | * This function is called without any synchronization and @task | ||
4431 | * could be in any state. Be careful with dereferences. | ||
4432 | */ | ||
4433 | worker = probe_kthread_data(task); | ||
4434 | |||
4435 | /* | ||
4436 | * Carefully copy the associated workqueue's workfn and name. Keep | ||
4437 | * the original last '\0' in case the original contains garbage. | ||
4438 | */ | ||
4439 | probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); | ||
4440 | probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); | ||
4441 | probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); | ||
4442 | probe_kernel_read(name, wq->name, sizeof(name) - 1); | ||
4443 | |||
4444 | /* copy worker description */ | ||
4445 | probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid)); | ||
4446 | if (desc_valid) | ||
4447 | probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); | ||
4448 | |||
4449 | if (fn || name[0] || desc[0]) { | ||
4450 | printk("%sWorkqueue: %s %pf", log_lvl, name, fn); | ||
4451 | if (desc[0]) | ||
4452 | pr_cont(" (%s)", desc); | ||
4453 | pr_cont("\n"); | ||
4454 | } | ||
4455 | } | ||
4456 | |||
3405 | /* | 4457 | /* |
3406 | * CPU hotplug. | 4458 | * CPU hotplug. |
3407 | * | 4459 | * |
@@ -3422,53 +4474,153 @@ static void wq_unbind_fn(struct work_struct *work) | |||
3422 | int cpu = smp_processor_id(); | 4474 | int cpu = smp_processor_id(); |
3423 | struct worker_pool *pool; | 4475 | struct worker_pool *pool; |
3424 | struct worker *worker; | 4476 | struct worker *worker; |
3425 | int i; | 4477 | int wi; |
3426 | 4478 | ||
3427 | for_each_std_worker_pool(pool, cpu) { | 4479 | for_each_cpu_worker_pool(pool, cpu) { |
3428 | BUG_ON(cpu != smp_processor_id()); | 4480 | WARN_ON_ONCE(cpu != smp_processor_id()); |
3429 | 4481 | ||
3430 | mutex_lock(&pool->assoc_mutex); | 4482 | mutex_lock(&pool->manager_mutex); |
3431 | spin_lock_irq(&pool->lock); | 4483 | spin_lock_irq(&pool->lock); |
3432 | 4484 | ||
3433 | /* | 4485 | /* |
3434 | * We've claimed all manager positions. Make all workers | 4486 | * We've blocked all manager operations. Make all workers |
3435 | * unbound and set DISASSOCIATED. Before this, all workers | 4487 | * unbound and set DISASSOCIATED. Before this, all workers |
3436 | * except for the ones which are still executing works from | 4488 | * except for the ones which are still executing works from |
3437 | * before the last CPU down must be on the cpu. After | 4489 | * before the last CPU down must be on the cpu. After |
3438 | * this, they may become diasporas. | 4490 | * this, they may become diasporas. |
3439 | */ | 4491 | */ |
3440 | list_for_each_entry(worker, &pool->idle_list, entry) | 4492 | for_each_pool_worker(worker, wi, pool) |
3441 | worker->flags |= WORKER_UNBOUND; | ||
3442 | |||
3443 | for_each_busy_worker(worker, i, pool) | ||
3444 | worker->flags |= WORKER_UNBOUND; | 4493 | worker->flags |= WORKER_UNBOUND; |
3445 | 4494 | ||
3446 | pool->flags |= POOL_DISASSOCIATED; | 4495 | pool->flags |= POOL_DISASSOCIATED; |
3447 | 4496 | ||
3448 | spin_unlock_irq(&pool->lock); | 4497 | spin_unlock_irq(&pool->lock); |
3449 | mutex_unlock(&pool->assoc_mutex); | 4498 | mutex_unlock(&pool->manager_mutex); |
4499 | |||
4500 | /* | ||
4501 | * Call schedule() so that we cross rq->lock and thus can | ||
4502 | * guarantee sched callbacks see the %WORKER_UNBOUND flag. | ||
4503 | * This is necessary as scheduler callbacks may be invoked | ||
4504 | * from other cpus. | ||
4505 | */ | ||
4506 | schedule(); | ||
4507 | |||
4508 | /* | ||
4509 | * Sched callbacks are disabled now. Zap nr_running. | ||
4510 | * After this, nr_running stays zero and need_more_worker() | ||
4511 | * and keep_working() are always true as long as the | ||
4512 | * worklist is not empty. This pool now behaves as an | ||
4513 | * unbound (in terms of concurrency management) pool which | ||
4515 | * is served by workers tied to the pool. | ||
4515 | */ | ||
4516 | atomic_set(&pool->nr_running, 0); | ||
4517 | |||
4518 | /* | ||
4519 | * With concurrency management just turned off, a busy | ||
4520 | * worker blocking could lead to lengthy stalls. Kick off | ||
4521 | * unbound chain execution of currently pending work items. | ||
4522 | */ | ||
4523 | spin_lock_irq(&pool->lock); | ||
4524 | wake_up_worker(pool); | ||
4525 | spin_unlock_irq(&pool->lock); | ||
3450 | } | 4526 | } |
4527 | } | ||
3451 | 4528 | ||
3452 | /* | 4529 | /** |
3453 | * Call schedule() so that we cross rq->lock and thus can guarantee | 4530 | * rebind_workers - rebind all workers of a pool to the associated CPU |
3454 | * sched callbacks see the %WORKER_UNBOUND flag. This is necessary | 4531 | * @pool: pool of interest |
3455 | * as scheduler callbacks may be invoked from other cpus. | 4532 | * |
3456 | */ | 4533 | * @pool->cpu is coming online. Rebind all workers to the CPU. |
3457 | schedule(); | 4534 | */ |
4535 | static void rebind_workers(struct worker_pool *pool) | ||
4536 | { | ||
4537 | struct worker *worker; | ||
4538 | int wi; | ||
4539 | |||
4540 | lockdep_assert_held(&pool->manager_mutex); | ||
3458 | 4541 | ||
3459 | /* | 4542 | /* |
3460 | * Sched callbacks are disabled now. Zap nr_running. After this, | 4543 | * Restore CPU affinity of all workers. As all idle workers should |
3461 | * nr_running stays zero and need_more_worker() and keep_working() | 4544 | * be on the run-queue of the associated CPU before any local |
3462 | * are always true as long as the worklist is not empty. Pools on | 4545 | * wake-ups for concurrency management happen, restore CPU affinity |
3463 | * @cpu now behave as unbound (in terms of concurrency management) | 4546 | * of all workers first and then clear UNBOUND. As we're called |
3464 | * pools which are served by workers tied to the CPU. | 4547 | * from CPU_ONLINE, the following shouldn't fail. |
3465 | * | ||
3466 | * On return from this function, the current worker would trigger | ||
3467 | * unbound chain execution of pending work items if other workers | ||
3468 | * didn't already. | ||
3469 | */ | 4548 | */ |
3470 | for_each_std_worker_pool(pool, cpu) | 4549 | for_each_pool_worker(worker, wi, pool) |
3471 | atomic_set(&pool->nr_running, 0); | 4550 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, |
4551 | pool->attrs->cpumask) < 0); | ||
4552 | |||
4553 | spin_lock_irq(&pool->lock); | ||
4554 | |||
4555 | for_each_pool_worker(worker, wi, pool) { | ||
4556 | unsigned int worker_flags = worker->flags; | ||
4557 | |||
4558 | /* | ||
4559 | * A bound idle worker should actually be on the runqueue | ||
4560 | * of the associated CPU for local wake-ups targeting it to | ||
4561 | * work. Kick all idle workers so that they migrate to the | ||
4562 | * associated CPU. Doing this in the same loop as | ||
4563 | * replacing UNBOUND with REBOUND is safe as no worker will | ||
4564 | * be bound before @pool->lock is released. | ||
4565 | */ | ||
4566 | if (worker_flags & WORKER_IDLE) | ||
4567 | wake_up_process(worker->task); | ||
4568 | |||
4569 | /* | ||
4570 | * We want to clear UNBOUND but can't directly call | ||
4571 | * worker_clr_flags() or adjust nr_running. Atomically | ||
4572 | * replace UNBOUND with another NOT_RUNNING flag REBOUND. | ||
4573 | * @worker will clear REBOUND using worker_clr_flags() when | ||
4574 | * it initiates the next execution cycle thus restoring | ||
4575 | * concurrency management. Note that when or whether | ||
4576 | * @worker clears REBOUND doesn't affect correctness. | ||
4577 | * | ||
4578 | * ACCESS_ONCE() is necessary because @worker->flags may be | ||
4579 | * tested without holding any lock in | ||
4580 | * wq_worker_waking_up(). Without it, NOT_RUNNING test may | ||
4581 | * fail incorrectly leading to premature concurrency | ||
4582 | * management operations. | ||
4583 | */ | ||
4584 | WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); | ||
4585 | worker_flags |= WORKER_REBOUND; | ||
4586 | worker_flags &= ~WORKER_UNBOUND; | ||
4587 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
4588 | } | ||
4589 | |||
4590 | spin_unlock_irq(&pool->lock); | ||
4591 | } | ||
4592 | |||
4593 | /** | ||
4594 | * restore_unbound_workers_cpumask - restore cpumask of unbound workers | ||
4595 | * @pool: unbound pool of interest | ||
4596 | * @cpu: the CPU which is coming up | ||
4597 | * | ||
4598 | * An unbound pool may end up with a cpumask which doesn't have any online | ||
4599 | * CPUs. When a worker of such a pool gets scheduled, the scheduler resets | ||
4600 | * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any | ||
4601 | * online CPU before, cpus_allowed of all its workers should be restored. | ||
4602 | */ | ||
4603 | static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | ||
4604 | { | ||
4605 | static cpumask_t cpumask; | ||
4606 | struct worker *worker; | ||
4607 | int wi; | ||
4608 | |||
4609 | lockdep_assert_held(&pool->manager_mutex); | ||
4610 | |||
4611 | /* is @cpu allowed for @pool? */ | ||
4612 | if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) | ||
4613 | return; | ||
4614 | |||
4615 | /* is @cpu the only online CPU? */ | ||
4616 | cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); | ||
4617 | if (cpumask_weight(&cpumask) != 1) | ||
4618 | return; | ||
4619 | |||
4620 | /* as we're called from CPU_ONLINE, the following shouldn't fail */ | ||
4621 | for_each_pool_worker(worker, wi, pool) | ||
4622 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, | ||
4623 | pool->attrs->cpumask) < 0); | ||
3472 | } | 4624 | } |
3473 | 4625 | ||
3474 | /* | 4626 | /* |
@@ -3479,39 +4631,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
3479 | unsigned long action, | 4631 | unsigned long action, |
3480 | void *hcpu) | 4632 | void *hcpu) |
3481 | { | 4633 | { |
3482 | unsigned int cpu = (unsigned long)hcpu; | 4634 | int cpu = (unsigned long)hcpu; |
3483 | struct worker_pool *pool; | 4635 | struct worker_pool *pool; |
4636 | struct workqueue_struct *wq; | ||
4637 | int pi; | ||
3484 | 4638 | ||
3485 | switch (action & ~CPU_TASKS_FROZEN) { | 4639 | switch (action & ~CPU_TASKS_FROZEN) { |
3486 | case CPU_UP_PREPARE: | 4640 | case CPU_UP_PREPARE: |
3487 | for_each_std_worker_pool(pool, cpu) { | 4641 | for_each_cpu_worker_pool(pool, cpu) { |
3488 | struct worker *worker; | ||
3489 | |||
3490 | if (pool->nr_workers) | 4642 | if (pool->nr_workers) |
3491 | continue; | 4643 | continue; |
3492 | 4644 | if (create_and_start_worker(pool) < 0) | |
3493 | worker = create_worker(pool); | ||
3494 | if (!worker) | ||
3495 | return NOTIFY_BAD; | 4645 | return NOTIFY_BAD; |
3496 | |||
3497 | spin_lock_irq(&pool->lock); | ||
3498 | start_worker(worker); | ||
3499 | spin_unlock_irq(&pool->lock); | ||
3500 | } | 4646 | } |
3501 | break; | 4647 | break; |
3502 | 4648 | ||
3503 | case CPU_DOWN_FAILED: | 4649 | case CPU_DOWN_FAILED: |
3504 | case CPU_ONLINE: | 4650 | case CPU_ONLINE: |
3505 | for_each_std_worker_pool(pool, cpu) { | 4651 | mutex_lock(&wq_pool_mutex); |
3506 | mutex_lock(&pool->assoc_mutex); | ||
3507 | spin_lock_irq(&pool->lock); | ||
3508 | 4652 | ||
3509 | pool->flags &= ~POOL_DISASSOCIATED; | 4653 | for_each_pool(pool, pi) { |
3510 | rebind_workers(pool); | 4654 | mutex_lock(&pool->manager_mutex); |
3511 | 4655 | ||
3512 | spin_unlock_irq(&pool->lock); | 4656 | if (pool->cpu == cpu) { |
3513 | mutex_unlock(&pool->assoc_mutex); | 4657 | spin_lock_irq(&pool->lock); |
4658 | pool->flags &= ~POOL_DISASSOCIATED; | ||
4659 | spin_unlock_irq(&pool->lock); | ||
4660 | |||
4661 | rebind_workers(pool); | ||
4662 | } else if (pool->cpu < 0) { | ||
4663 | restore_unbound_workers_cpumask(pool, cpu); | ||
4664 | } | ||
4665 | |||
4666 | mutex_unlock(&pool->manager_mutex); | ||
3514 | } | 4667 | } |
4668 | |||
4669 | /* update NUMA affinity of unbound workqueues */ | ||
4670 | list_for_each_entry(wq, &workqueues, list) | ||
4671 | wq_update_unbound_numa(wq, cpu, true); | ||
4672 | |||
4673 | mutex_unlock(&wq_pool_mutex); | ||
3515 | break; | 4674 | break; |
3516 | } | 4675 | } |
3517 | return NOTIFY_OK; | 4676 | return NOTIFY_OK; |
@@ -3525,14 +4684,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, | |||
3525 | unsigned long action, | 4684 | unsigned long action, |
3526 | void *hcpu) | 4685 | void *hcpu) |
3527 | { | 4686 | { |
3528 | unsigned int cpu = (unsigned long)hcpu; | 4687 | int cpu = (unsigned long)hcpu; |
3529 | struct work_struct unbind_work; | 4688 | struct work_struct unbind_work; |
4689 | struct workqueue_struct *wq; | ||
3530 | 4690 | ||
3531 | switch (action & ~CPU_TASKS_FROZEN) { | 4691 | switch (action & ~CPU_TASKS_FROZEN) { |
3532 | case CPU_DOWN_PREPARE: | 4692 | case CPU_DOWN_PREPARE: |
3533 | /* unbinding should happen on the local CPU */ | 4693 | /* unbinding per-cpu workers should happen on the local CPU */ |
3534 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); | 4694 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); |
3535 | queue_work_on(cpu, system_highpri_wq, &unbind_work); | 4695 | queue_work_on(cpu, system_highpri_wq, &unbind_work); |
4696 | |||
4697 | /* update NUMA affinity of unbound workqueues */ | ||
4698 | mutex_lock(&wq_pool_mutex); | ||
4699 | list_for_each_entry(wq, &workqueues, list) | ||
4700 | wq_update_unbound_numa(wq, cpu, false); | ||
4701 | mutex_unlock(&wq_pool_mutex); | ||
4702 | |||
4703 | /* wait for per-cpu unbinding to finish */ | ||
3536 | flush_work(&unbind_work); | 4704 | flush_work(&unbind_work); |
3537 | break; | 4705 | break; |
3538 | } | 4706 | } |
@@ -3565,7 +4733,7 @@ static void work_for_cpu_fn(struct work_struct *work) | |||
3565 | * It is up to the caller to ensure that the cpu doesn't go offline. | 4733 | * It is up to the caller to ensure that the cpu doesn't go offline. |
3566 | * The caller must not hold any locks which would prevent @fn from completing. | 4734 | * The caller must not hold any locks which would prevent @fn from completing. |
3567 | */ | 4735 | */ |
3568 | long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) | 4736 | long work_on_cpu(int cpu, long (*fn)(void *), void *arg) |
3569 | { | 4737 | { |
3570 | struct work_for_cpu wfc = { .fn = fn, .arg = arg }; | 4738 | struct work_for_cpu wfc = { .fn = fn, .arg = arg }; |
3571 | 4739 | ||
@@ -3583,44 +4751,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
3583 | * freeze_workqueues_begin - begin freezing workqueues | 4751 | * freeze_workqueues_begin - begin freezing workqueues |
3584 | * | 4752 | * |
3585 | * Start freezing workqueues. After this function returns, all freezable | 4753 | * Start freezing workqueues. After this function returns, all freezable |
3586 | * workqueues will queue new works to their frozen_works list instead of | 4754 | * workqueues will queue new works to their delayed_works list instead of |
3587 | * pool->worklist. | 4755 | * pool->worklist. |
3588 | * | 4756 | * |
3589 | * CONTEXT: | 4757 | * CONTEXT: |
3590 | * Grabs and releases workqueue_lock and pool->lock's. | 4758 | * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. |
3591 | */ | 4759 | */ |
3592 | void freeze_workqueues_begin(void) | 4760 | void freeze_workqueues_begin(void) |
3593 | { | 4761 | { |
3594 | unsigned int cpu; | 4762 | struct worker_pool *pool; |
4763 | struct workqueue_struct *wq; | ||
4764 | struct pool_workqueue *pwq; | ||
4765 | int pi; | ||
3595 | 4766 | ||
3596 | spin_lock(&workqueue_lock); | 4767 | mutex_lock(&wq_pool_mutex); |
3597 | 4768 | ||
3598 | BUG_ON(workqueue_freezing); | 4769 | WARN_ON_ONCE(workqueue_freezing); |
3599 | workqueue_freezing = true; | 4770 | workqueue_freezing = true; |
3600 | 4771 | ||
3601 | for_each_wq_cpu(cpu) { | 4772 | /* set FREEZING */ |
3602 | struct worker_pool *pool; | 4773 | for_each_pool(pool, pi) { |
3603 | struct workqueue_struct *wq; | 4774 | spin_lock_irq(&pool->lock); |
3604 | 4775 | WARN_ON_ONCE(pool->flags & POOL_FREEZING); | |
3605 | for_each_std_worker_pool(pool, cpu) { | 4776 | pool->flags |= POOL_FREEZING; |
3606 | spin_lock_irq(&pool->lock); | 4777 | spin_unlock_irq(&pool->lock); |
3607 | 4778 | } | |
3608 | WARN_ON_ONCE(pool->flags & POOL_FREEZING); | ||
3609 | pool->flags |= POOL_FREEZING; | ||
3610 | |||
3611 | list_for_each_entry(wq, &workqueues, list) { | ||
3612 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | ||
3613 | |||
3614 | if (pwq && pwq->pool == pool && | ||
3615 | (wq->flags & WQ_FREEZABLE)) | ||
3616 | pwq->max_active = 0; | ||
3617 | } | ||
3618 | 4779 | ||
3619 | spin_unlock_irq(&pool->lock); | 4780 | list_for_each_entry(wq, &workqueues, list) { |
3620 | } | 4781 | mutex_lock(&wq->mutex); |
4782 | for_each_pwq(pwq, wq) | ||
4783 | pwq_adjust_max_active(pwq); | ||
4784 | mutex_unlock(&wq->mutex); | ||
3621 | } | 4785 | } |
3622 | 4786 | ||
3623 | spin_unlock(&workqueue_lock); | 4787 | mutex_unlock(&wq_pool_mutex); |
3624 | } | 4788 | } |
3625 | 4789 | ||
3626 | /** | 4790 | /** |
@@ -3630,7 +4794,7 @@ void freeze_workqueues_begin(void) | |||
3630 | * between freeze_workqueues_begin() and thaw_workqueues(). | 4794 | * between freeze_workqueues_begin() and thaw_workqueues(). |
3631 | * | 4795 | * |
3632 | * CONTEXT: | 4796 | * CONTEXT: |
3633 | * Grabs and releases workqueue_lock. | 4797 | * Grabs and releases wq_pool_mutex. |
3634 | * | 4798 | * |
3635 | * RETURNS: | 4799 | * RETURNS: |
3636 | * %true if some freezable workqueues are still busy. %false if freezing | 4800 | * %true if some freezable workqueues are still busy. %false if freezing |
@@ -3638,34 +4802,34 @@ void freeze_workqueues_begin(void) | |||
3638 | */ | 4802 | */ |
3639 | bool freeze_workqueues_busy(void) | 4803 | bool freeze_workqueues_busy(void) |
3640 | { | 4804 | { |
3641 | unsigned int cpu; | ||
3642 | bool busy = false; | 4805 | bool busy = false; |
4806 | struct workqueue_struct *wq; | ||
4807 | struct pool_workqueue *pwq; | ||
3643 | 4808 | ||
3644 | spin_lock(&workqueue_lock); | 4809 | mutex_lock(&wq_pool_mutex); |
3645 | 4810 | ||
3646 | BUG_ON(!workqueue_freezing); | 4811 | WARN_ON_ONCE(!workqueue_freezing); |
3647 | 4812 | ||
3648 | for_each_wq_cpu(cpu) { | 4813 | list_for_each_entry(wq, &workqueues, list) { |
3649 | struct workqueue_struct *wq; | 4814 | if (!(wq->flags & WQ_FREEZABLE)) |
4815 | continue; | ||
3650 | /* | 4816 | /* |
3651 | * nr_active is monotonically decreasing. It's safe | 4817 | * nr_active is monotonically decreasing. It's safe |
3652 | * to peek without lock. | 4818 | * to peek without lock. |
3653 | */ | 4819 | */ |
3654 | list_for_each_entry(wq, &workqueues, list) { | 4820 | rcu_read_lock_sched(); |
3655 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | 4821 | for_each_pwq(pwq, wq) { |
3656 | 4822 | WARN_ON_ONCE(pwq->nr_active < 0); | |
3657 | if (!pwq || !(wq->flags & WQ_FREEZABLE)) | ||
3658 | continue; | ||
3659 | |||
3660 | BUG_ON(pwq->nr_active < 0); | ||
3661 | if (pwq->nr_active) { | 4823 | if (pwq->nr_active) { |
3662 | busy = true; | 4824 | busy = true; |
4825 | rcu_read_unlock_sched(); | ||
3663 | goto out_unlock; | 4826 | goto out_unlock; |
3664 | } | 4827 | } |
3665 | } | 4828 | } |
4829 | rcu_read_unlock_sched(); | ||
3666 | } | 4830 | } |
3667 | out_unlock: | 4831 | out_unlock: |
3668 | spin_unlock(&workqueue_lock); | 4832 | mutex_unlock(&wq_pool_mutex); |
3669 | return busy; | 4833 | return busy; |
3670 | } | 4834 | } |
3671 | 4835 | ||
@@ -3676,104 +4840,142 @@ out_unlock: | |||
3676 | * frozen works are transferred to their respective pool worklists. | 4840 | * frozen works are transferred to their respective pool worklists. |
3677 | * | 4841 | * |
3678 | * CONTEXT: | 4842 | * CONTEXT: |
3679 | * Grabs and releases workqueue_lock and pool->lock's. | 4843 | * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. |
3680 | */ | 4844 | */ |
3681 | void thaw_workqueues(void) | 4845 | void thaw_workqueues(void) |
3682 | { | 4846 | { |
3683 | unsigned int cpu; | 4847 | struct workqueue_struct *wq; |
4848 | struct pool_workqueue *pwq; | ||
4849 | struct worker_pool *pool; | ||
4850 | int pi; | ||
3684 | 4851 | ||
3685 | spin_lock(&workqueue_lock); | 4852 | mutex_lock(&wq_pool_mutex); |
3686 | 4853 | ||
3687 | if (!workqueue_freezing) | 4854 | if (!workqueue_freezing) |
3688 | goto out_unlock; | 4855 | goto out_unlock; |
3689 | 4856 | ||
3690 | for_each_wq_cpu(cpu) { | 4857 | /* clear FREEZING */ |
3691 | struct worker_pool *pool; | 4858 | for_each_pool(pool, pi) { |
3692 | struct workqueue_struct *wq; | 4859 | spin_lock_irq(&pool->lock); |
4860 | WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); | ||
4861 | pool->flags &= ~POOL_FREEZING; | ||
4862 | spin_unlock_irq(&pool->lock); | ||
4863 | } | ||
3693 | 4864 | ||
3694 | for_each_std_worker_pool(pool, cpu) { | 4865 | /* restore max_active and repopulate worklist */ |
3695 | spin_lock_irq(&pool->lock); | 4866 | list_for_each_entry(wq, &workqueues, list) { |
4867 | mutex_lock(&wq->mutex); | ||
4868 | for_each_pwq(pwq, wq) | ||
4869 | pwq_adjust_max_active(pwq); | ||
4870 | mutex_unlock(&wq->mutex); | ||
4871 | } | ||
3696 | 4872 | ||
3697 | WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); | 4873 | workqueue_freezing = false; |
3698 | pool->flags &= ~POOL_FREEZING; | 4874 | out_unlock: |
4875 | mutex_unlock(&wq_pool_mutex); | ||
4876 | } | ||
4877 | #endif /* CONFIG_FREEZER */ | ||
3699 | 4878 | ||
3700 | list_for_each_entry(wq, &workqueues, list) { | 4879 | static void __init wq_numa_init(void) |
3701 | struct pool_workqueue *pwq = get_pwq(cpu, wq); | 4880 | { |
4881 | cpumask_var_t *tbl; | ||
4882 | int node, cpu; | ||
3702 | 4883 | ||
3703 | if (!pwq || pwq->pool != pool || | 4884 | /* determine NUMA pwq table len - highest node id + 1 */ |
3704 | !(wq->flags & WQ_FREEZABLE)) | 4885 | for_each_node(node) |
3705 | continue; | 4886 | wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); |
3706 | 4887 | ||
3707 | /* restore max_active and repopulate worklist */ | 4888 | if (num_possible_nodes() <= 1) |
3708 | pwq_set_max_active(pwq, wq->saved_max_active); | 4889 | return; |
3709 | } | ||
3710 | 4890 | ||
3711 | wake_up_worker(pool); | 4891 | if (wq_disable_numa) { |
4892 | pr_info("workqueue: NUMA affinity support disabled\n"); | ||
4893 | return; | ||
4894 | } | ||
3712 | 4895 | ||
3713 | spin_unlock_irq(&pool->lock); | 4896 | wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); |
4897 | BUG_ON(!wq_update_unbound_numa_attrs_buf); | ||
4898 | |||
4899 | /* | ||
4900 | * We want masks of possible CPUs of each node which isn't readily | ||
4901 | * available. Build one from cpu_to_node() which should have been | ||
4902 | * fully initialized by now. | ||
4903 | */ | ||
4904 | tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); | ||
4905 | BUG_ON(!tbl); | ||
4906 | |||
4907 | for_each_node(node) | ||
4908 | BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, | ||
4909 | node_online(node) ? node : NUMA_NO_NODE)); | ||
4910 | |||
4911 | for_each_possible_cpu(cpu) { | ||
4912 | node = cpu_to_node(cpu); | ||
4913 | if (WARN_ON(node == NUMA_NO_NODE)) { | ||
4914 | pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); | ||
4915 | /* happens iff arch is bonkers, let's just proceed */ | ||
4916 | return; | ||
3714 | } | 4917 | } |
4918 | cpumask_set_cpu(cpu, tbl[node]); | ||
3715 | } | 4919 | } |
3716 | 4920 | ||
3717 | workqueue_freezing = false; | 4921 | wq_numa_possible_cpumask = tbl; |
3718 | out_unlock: | 4922 | wq_numa_enabled = true; |
3719 | spin_unlock(&workqueue_lock); | ||
3720 | } | 4923 | } |
3721 | #endif /* CONFIG_FREEZER */ | ||
3722 | 4924 | ||
3723 | static int __init init_workqueues(void) | 4925 | static int __init init_workqueues(void) |
3724 | { | 4926 | { |
3725 | unsigned int cpu; | 4927 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; |
4928 | int i, cpu; | ||
3726 | 4929 | ||
3727 | /* make sure we have enough bits for OFFQ pool ID */ | 4930 | /* make sure we have enough bits for OFFQ pool ID */ |
3728 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < | 4931 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < |
3729 | WORK_CPU_END * NR_STD_WORKER_POOLS); | 4932 | WORK_CPU_END * NR_STD_WORKER_POOLS); |
3730 | 4933 | ||
4934 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | ||
4935 | |||
4936 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | ||
4937 | |||
3731 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); | 4938 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |
3732 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | 4939 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); |
3733 | 4940 | ||
4941 | wq_numa_init(); | ||
4942 | |||
3734 | /* initialize CPU pools */ | 4943 | /* initialize CPU pools */ |
3735 | for_each_wq_cpu(cpu) { | 4944 | for_each_possible_cpu(cpu) { |
3736 | struct worker_pool *pool; | 4945 | struct worker_pool *pool; |
3737 | 4946 | ||
3738 | for_each_std_worker_pool(pool, cpu) { | 4947 | i = 0; |
3739 | spin_lock_init(&pool->lock); | 4948 | for_each_cpu_worker_pool(pool, cpu) { |
4949 | BUG_ON(init_worker_pool(pool)); | ||
3740 | pool->cpu = cpu; | 4950 | pool->cpu = cpu; |
3741 | pool->flags |= POOL_DISASSOCIATED; | 4951 | cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); |
3742 | INIT_LIST_HEAD(&pool->worklist); | 4952 | pool->attrs->nice = std_nice[i++]; |
3743 | INIT_LIST_HEAD(&pool->idle_list); | 4953 | pool->node = cpu_to_node(cpu); |
3744 | hash_init(pool->busy_hash); | ||
3745 | |||
3746 | init_timer_deferrable(&pool->idle_timer); | ||
3747 | pool->idle_timer.function = idle_worker_timeout; | ||
3748 | pool->idle_timer.data = (unsigned long)pool; | ||
3749 | |||
3750 | setup_timer(&pool->mayday_timer, pool_mayday_timeout, | ||
3751 | (unsigned long)pool); | ||
3752 | |||
3753 | mutex_init(&pool->assoc_mutex); | ||
3754 | ida_init(&pool->worker_ida); | ||
3755 | 4954 | ||
3756 | /* alloc pool ID */ | 4955 | /* alloc pool ID */ |
4956 | mutex_lock(&wq_pool_mutex); | ||
3757 | BUG_ON(worker_pool_assign_id(pool)); | 4957 | BUG_ON(worker_pool_assign_id(pool)); |
4958 | mutex_unlock(&wq_pool_mutex); | ||
3758 | } | 4959 | } |
3759 | } | 4960 | } |
3760 | 4961 | ||
3761 | /* create the initial worker */ | 4962 | /* create the initial worker */ |
3762 | for_each_online_wq_cpu(cpu) { | 4963 | for_each_online_cpu(cpu) { |
3763 | struct worker_pool *pool; | 4964 | struct worker_pool *pool; |
3764 | 4965 | ||
3765 | for_each_std_worker_pool(pool, cpu) { | 4966 | for_each_cpu_worker_pool(pool, cpu) { |
3766 | struct worker *worker; | 4967 | pool->flags &= ~POOL_DISASSOCIATED; |
4968 | BUG_ON(create_and_start_worker(pool) < 0); | ||
4969 | } | ||
4970 | } | ||
3767 | 4971 | ||
3768 | if (cpu != WORK_CPU_UNBOUND) | 4972 | /* create default unbound wq attrs */ |
3769 | pool->flags &= ~POOL_DISASSOCIATED; | 4973 | for (i = 0; i < NR_STD_WORKER_POOLS; i++) { |
4974 | struct workqueue_attrs *attrs; | ||
3770 | 4975 | ||
3771 | worker = create_worker(pool); | 4976 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); |
3772 | BUG_ON(!worker); | 4977 | attrs->nice = std_nice[i]; |
3773 | spin_lock_irq(&pool->lock); | 4978 | unbound_std_wq_attrs[i] = attrs; |
3774 | start_worker(worker); | ||
3775 | spin_unlock_irq(&pool->lock); | ||
3776 | } | ||
3777 | } | 4979 | } |
3778 | 4980 | ||
3779 | system_wq = alloc_workqueue("events", 0, 0); | 4981 | system_wq = alloc_workqueue("events", 0, 0); |
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 07650264ec15..ad83c96b2ece 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h | |||
@@ -29,16 +29,24 @@ struct worker { | |||
29 | struct work_struct *current_work; /* L: work being processed */ | 29 | struct work_struct *current_work; /* L: work being processed */ |
30 | work_func_t current_func; /* L: current_work's fn */ | 30 | work_func_t current_func; /* L: current_work's fn */ |
31 | struct pool_workqueue *current_pwq; /* L: current_work's pwq */ | 31 | struct pool_workqueue *current_pwq; /* L: current_work's pwq */ |
32 | bool desc_valid; /* ->desc is valid */ | ||
32 | struct list_head scheduled; /* L: scheduled works */ | 33 | struct list_head scheduled; /* L: scheduled works */ |
34 | |||
35 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | ||
36 | |||
33 | struct task_struct *task; /* I: worker task */ | 37 | struct task_struct *task; /* I: worker task */ |
34 | struct worker_pool *pool; /* I: the associated pool */ | 38 | struct worker_pool *pool; /* I: the associated pool */ |
35 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | 39 | /* L: for rescuers */ |
40 | |||
36 | unsigned long last_active; /* L: last active timestamp */ | 41 | unsigned long last_active; /* L: last active timestamp */ |
37 | unsigned int flags; /* X: flags */ | 42 | unsigned int flags; /* X: flags */ |
38 | int id; /* I: worker id */ | 43 | int id; /* I: worker id */ |
39 | 44 | ||
40 | /* for rebinding worker to CPU */ | 45 | /* |
41 | struct work_struct rebind_work; /* L: for busy worker */ | 46 | * Opaque string set with work_set_desc(). Printed out with task |
47 | * dump for debugging - WARN, BUG, panic or sysrq. | ||
48 | */ | ||
49 | char desc[WORKER_DESC_LEN]; | ||
42 | 50 | ||
43 | /* used only by rescuers to point to the target workqueue */ | 51 | /* used only by rescuers to point to the target workqueue */ |
44 | struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ | 52 | struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ |
@@ -58,8 +66,7 @@ static inline struct worker *current_wq_worker(void) | |||
58 | * Scheduler hooks for concurrency managed workqueue. Only to be used from | 66 | * Scheduler hooks for concurrency managed workqueue. Only to be used from |
59 | * sched.c and workqueue.c. | 67 | * sched.c and workqueue.c. |
60 | */ | 68 | */ |
61 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); | 69 | void wq_worker_waking_up(struct task_struct *task, int cpu); |
62 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | 70 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); |
63 | unsigned int cpu); | ||
64 | 71 | ||
65 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ | 72 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |